python爬取汽车之家汽车数据,Python交流,编程语言专区,鱼C论坛

huyw 发表于 2018-8-17 16:30:36

python爬取汽车之家汽车数据

# -*- coding: utf-8 -*-

'''
Created on 2018年08月07日

@author: huyw
'''

from bs4 import BeautifulSoup
import datetime
import urllib.request, urllib.parse, http.cookiejar
import xlsxwriter

# 定义解析网页函数
def getHtml(url):
    """Fetch ``url`` and return the raw, undecoded response body.

    A new cookie-aware opener is built on every call. It sends a
    browser-like ``User-Agent`` header and a fixed placeholder ``Cookie``
    header with the request.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as ``bytes``; decoding is left to the caller.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
    """
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    # Pretend to be a desktop browser.
    opener.addheaders = [
        ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'),
        ('Cookie', '4564564564564564565646540'),
    ]
    # Kept so that plain urllib.request.urlopen() calls elsewhere keep
    # using these headers, exactly as before.
    urllib.request.install_opener(opener)
    # The context manager closes the connection even if read() fails.
    with opener.open(url) as response:
        return response.read()

# Entry page for the crawl; the brand index is read from this page.
url0 = 'https://car.autohome.com.cn/price/brand-117.html'
html_doc0 = getHtml(url0)
# The page is parsed as gb18030 rather than relying on auto-detection.
soup0 = BeautifulSoup(html_doc0, 'lxml', from_encoding='gb18030')

# Brand names and the matching brand page URLs, kept as parallel lists
# (listname[k] belongs to listurl[k]).
listname = []
listurl = []
for letter_code in range(ord("A"), ord("Z") + 1):
    # Brand links are looked up per initial letter, in elements whose id is
    # "brandA" .. "brandZ".
    container = soup0.find(class_='brandcont-open fn-hide',
                           id='brand' + chr(letter_code))
    if container is None:
        # No brand container for this letter.
        continue
    for link in container.find_all('a'):
        href = link.get('href')
        if href is None:
            # An anchor without a target cannot be crawled; skipping it for
            # both lists keeps them aligned.
            continue
        listname.append(link.string)
        listurl.append('https://car.autohome.com.cn' + href)

# Result columns, kept as parallel lists with one entry per scraped listing:
# brand, car name, car class, price text and sale status.
listcar = []
listcarname = []
listcarmodel = []
listcarprice = []
listsale = []


def _collect_rows(soup, brand, sale_status):
    """Append one row to the result lists per ``list-cont-bg`` element."""
    for card in soup.find_all(class_='list-cont-bg'):
        listcarname.append(card.find(class_='font-bold').text)
        listcarmodel.append(card.find(class_='info-gray').text)
        listcarprice.append(card.find(class_='font-arial').text)
        listcar.append(brand)
        listsale.append(sale_status)


# Crawl every brand page collected in listurl; sub-menus are ignored.
for brand, brand_url in zip(listname, listurl):
    # Fetch this brand's page (one URL, not the whole URL list).
    html_doc1 = getHtml(brand_url)
    soup1 = BeautifulSoup(html_doc1, 'lxml', from_encoding='gb18030')
    str00 = soup1.find(class_='tab-nav border-t-no')
    if str00 is None:
        # No status tabs on this brand page. Skipping avoids both the
        # NameError on the first brand and silently re-crawling the
        # previous brand's tabs on later ones.
        continue

    # Each tab link is one sale status (on sale / upcoming / discontinued,
    # per the original comments).
    for a in str00.find_all('a'):
        if a.string is None:
            continue
        url01 = 'https://car.autohome.com.cn' + a.get('href')
        html_doc01 = getHtml(url01)
        soup01 = BeautifulSoup(html_doc01, 'lxml', from_encoding='gb18030')

        # Progress log.
        print(datetime.datetime.now(), '正在爬取车型', brand, a.string, url01)
        _collect_rows(soup01, brand, a.string)

        # Follow pagination, if the page has any.
        pager = soup01.find(class_='page')
        if pager is None:
            print('当前车型', brand, a.string, '没有分页')
            continue

        listpage = pager.find_all('a')
        # Drop the first two anchors and the last one, which the original
        # comment describes as previous-page, current-page and next-page.
        # NOTE(review): the subscript was lost in the original paste; the
        # offset of 2 is inferred from that comment and from the logged page
        # number -- confirm against the live markup.
        for page_no, page_link in enumerate(listpage[2:-1], start=2):
            url2 = 'https://car.autohome.com.cn' + page_link.get('href')

            # Progress log.
            print(datetime.datetime.now(), '正在爬取车型', brand, a.string, page_no, '页的数据', url2)
            html_doc2 = getHtml(url2)
            soup2 = BeautifulSoup(html_doc2, 'lxml', from_encoding='gb18030')
            _collect_rows(soup2, brand, a.string)
# Create the output workbook with a single sheet.
workbook = xlsxwriter.Workbook('car_data0806_1.xlsx')
worksheet = workbook.add_worksheet('sheet1')

# Write each result list into its own column, starting at row 1:
# A = brand, B = car name, C = car class, D = price, E = sale status.
column_layout = (
    ('A1', listcar),
    ('B1', listcarname),
    ('C1', listcarmodel),
    ('D1', listcarprice),
    ('E1', listsale),
)
for first_cell, column_values in column_layout:
    worksheet.write_column(first_cell, column_values)

# Closing the workbook finishes the .xlsx file.
workbook.close()

wanghongguang 发表于 2018-8-21 11:50:26

真厉害呀，前几天我也尝试爬取汽车之家的数据，老是出错，今天看到高手了！

焰虎发表于 2018-8-21 18:24:27

大佬创建的excel表不知道在哪里啊，求解

huyw 发表于 2018-8-23 15:15:05

wanghongguang 发表于 2018-8-21 11:50
真厉害呀，前几天我也尝试爬取汽车之家的数据，老是出错，今天看到高手了！

新手入门啦

huyw 发表于 2018-8-23 15:22:19

焰虎发表于 2018-8-21 18:24
大佬创建的excel表不知道在哪里啊，求解

excel上传不了{:10_266:}

shihongji 发表于 2019-3-29 19:00:37

汽车之家代码改了吧，代码执行尝试，listurl, listname爬不到任何内容

woniuchen 发表于 2020-6-10 16:23:36

你好我拿了你的汽车之家的数据去跑了一下提示了
for a in str0:
NameError: name 'str0' is not defined
请问怎么解决

页: [1]

鱼C论坛's Archiver

python爬取汽车之家汽车数据