|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
爬取汽车之家的,数据也正常拿到手了
但因为有些车子是新车,没有二手价的,导致在保存的时候报错
IndexError: list index out of range
不知道有没有大佬可以帮忙解决下
代码如下:
- import parsel
- import requests
- import openpyxl
- def get_url(url):
-
- r = requests.get(url, headers=headers).text
- html = parsel.Selector(r)
-
- # 车辆名称
- name = html.xpath('//div[@class="cards-bottom"]/h4/text()').getall()
- # 里程数
- milage = html.xpath('//div[@class="cards-bottom"]/p/text()').getall()
- print(len(milage))
- # 现价
- price = html.xpath('//div[@class="cards-price-box"]/span[1]/text()').getall()
- print(len(price))
- # 原价
- new_price = html.xpath('//div[@class="cards-price-box"]/s/text()').getall()
- print(len(new_price))
-
- datalist = []
- for i,_ in enumerate(name):
- data = []
- data.append(name[i])
- data.append(milage[i].split("/")[0])
- data.append(milage[i].split("/")[1])
- data.append(milage[i].split("/")[2])
- data.append(price[i])
- if price == None:
- continue
-
- data.append(new_price[i])
- if new_price == None:
- continue
- datalist.append(data)
-
-
- return datalist
-
- def toexcel(datalist):
- wb = openpyxl.Workbook()
- wb.guess_type = True
- ws = wb.active
- ws.append(['车辆名称','里程数','上牌时间',"地点","现价","原价"])
-
- for each in datalist:
-
- ws.append(each)
- wb.save("汽车之家.xlsx")
- if __name__ == "__main__":
-
- headers = {
- 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3760.400 QQBrowser/10.5.4083.400'
- }
- page = int(input("请输入你要下载的页数:"))
-
- for i in range(1,page+1):
- url = f"https://www.che168.com/china/a0_0msdgscncgpi1ltocsp{i}exx0/?pvareaid=102179#currengpostion"
- print(url)
- datalist = get_url(url)
- toexcel(datalist)
- print("采集完成^_^")
复制代码
我用的lxml,把car单独取出来,然后逐个遍历,对于ori_price为空的你可以做一个判断。
然后组成data,写到xlsx里。
- import requests
- from lxml import etree
- def main():
- url = 'https://www.che168.com/china/a0_0msdgscncgpi1ltocsp1exx0/?pvareaid=102179#currengpostion'
- headers = {'user-agent': 'firefox'}
- r = requests.get(url, headers=headers)
- html = etree.HTML(r.text)
- result = html.xpath('//a[@class="carinfo"]')
- for item in result:
- name = item.xpath('./div[2]/h4/text()')
- milage = item.xpath('./div[2]/p/text()')
- ori_price = item.xpath('./div[2]/div/s/text()')
- cur_price = item.xpath('./div[2]/div/span[1]/em/text()')
- print(name, milage, ori_price, cur_price)
- if __name__ == '__main__':
- main()
复制代码
|
|