|
发表于 2023-5-20 16:05:30
|
显示全部楼层
给你改了一下:
- import requests
- from lxml import etree
def geturl(url="https://www.zbj.com/search/service/?kw=saas&r=1", timeout=10):
    """Fetch the search-results page and return the HTTP response.

    Args:
        url: Address of the page to download. Defaults to the "saas"
            service search used by this script.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely on a stalled
            connection.

    Returns:
        The ``requests`` response object, with its text encoding forced
        to UTF-8.

    Raises:
        requests.RequestException: If the request fails, times out, or
            the server answers with a 4xx/5xx status.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37"
    }
    html = requests.get(url=url, headers=head, timeout=timeout)
    # Fail loudly on an HTTP error instead of handing an error page to the
    # parser, which would just print nothing.
    html.raise_for_status()
    html.encoding = 'utf-8'
    return html
def get_data(html):
    """Print the scraped fields of every listing found in the page.

    For each listing ``div`` that has a title, the title is printed first,
    followed by the score, price, sales and review results. Each value is
    printed as the raw list returned by the XPath query.

    Args:
        html: Response-like object whose ``text`` attribute holds the
            page's HTML markup.

    Returns:
        None. Results are written to standard output.
    """
    document = etree.HTML(html.text)

    # Absolute XPaths as noted by the original author, kept for reference:
    #   root div: //*[@id="__layout"]/div/div[3]/div/div[4]/div/div[2]/div[1]
    #   title:    //*[@id="__layout"]/div/div[3]/div/div[4]/div/div[2]/div[1]/div[2]/div/div[3]/div[2]/a/text()
    #   price:    //*[@id="__layout"]/div/div[3]/div/div[4]/div/div[2]/div[1]/div[1]/div/div[3]/div[1]/span/text()
    #
    # Question 1 (original author): the two XPaths below print no data.
    #   sales:    //*[@id="__layout"]/div/div[3]/div/div[4]/div/div[2]/div[1]/div[1]/div/div[3]/div[4]/div[2]/div/span[2]/text()
    #   reviews:  //*[@id="__layout"]/div/div[3]/div/div[4]/div/div[2]/div[1]/div[1]/div/div[3]/div[4]/div[3]/div/span[2]/text()
    listings = document.xpath(r'//*[@id="__layout"]/div/div[3]/div/div[4]/div/div[2]/div[1]/div')

    # Question 2 (original author): the idea is to loop over every listing
    # div and scrape each one with XPaths relative to it. Changing the title
    # path to ./div/div[3]/div[2]/a/text() returned no data for some divs,
    # which is why listings without a title are skipped below.
    title_path = r'./div/div[3]/div[2]/a/text()'

    # Relative XPaths queried after the title, in print order.
    follow_up_paths = (
        r'./div/div[3]/div[4]/div[1]/span[1]/span/text()',  # score
        r'./div/div[3]/div[1]/span/text()',  # price
        './div/div[3]/div[4]/div[2]/span/div/span[2]/text()',  # sales; author notes: empty list
        r'./div/div[3]/div[4]/div[3]/span/div/span[2]/text()',  # reviews; author notes: empty list
    )

    for listing in listings:
        title = listing.xpath(title_path)
        if title:
            print(title)
            for path in follow_up_paths:
                print(listing.xpath(path))
if __name__ == "__main__":
    # Script entry point: download the search page, then print its listings.
    get_data(geturl())
复制代码 |
|