|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
我想获取知乎热榜的排行榜并保存到 Excel,可是写入 Excel 那里出了问题,请大神们帮帮忙。
- from lxml import etree
- import requests
- import openpyxl
def get_url(url):
    """Download a Zhihu hot-list page and return its ``HotItem`` sections.

    Args:
        url: Address of the page to download.

    Returns:
        The list of ``<section class="HotItem">`` elements found in the page
        (empty when the page contains none).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within 10 seconds.
    """
    # SECURITY NOTE(review): the Cookie below appears to embed a logged-in
    # session token (z_c0). It should come from configuration or the
    # environment instead of being pasted into source; it is left unchanged
    # here so the request keeps behaving as before.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36",
        'Cookie': '_zap=99431b2c-8767-4685-b98b-cd9d0d3d88cd; _xsrf=5p5XGCN215CjXjnbPuj1OmbQmvR2pQdQ; d_c0="AKDg7kP8dxCPTsNRshHIPT2mH69mOZb8xuE=|1575709991"; z_c0="2|1:0|10:1575709998|4:z_c0|92:Mi4xa2poTkNBQUFBQUFBb09EdVFfeDNFQ1lBQUFCZ0FsVk5MYnZZWGdEZ0xuLWJoVWFzV2ZtYjk4T2M3dHNYa1lRd1dn|ab32d6e8ac6c6c86e72b218ad426d24b26d9adea21fdbd28afe64ada5ac53e77"; _ga=GA1.2.2089582079.1583647145; q_c1=5919a9b1025f437f8105f1a16a9f0d36|1583647148000|1575710038000; _gid=GA1.2.633099727.1584344528; tst=h; tshl=; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1584357660,1584362899,1584406978,1584411187; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1584420694; KLBRSID=53650870f91603bc3193342a80cf198c|1584420984|1584420275',
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    ret = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an error page instead of silently parsing it into no rows.
    ret.raise_for_status()
    # Decode the raw bytes as UTF-8 explicitly. The former
    # ``ret.encoding = 'GBK'`` only affected ``ret.text``, which is never read,
    # so it was dead code and has been dropped.
    html_str = ret.content.decode("utf-8")
    html = etree.HTML(html_str)
    return html.xpath("//section[@class='HotItem']")
# Extract rank, title, heat and link for every entry of the Zhihu hot list.
def get_data(res):
    """Turn hot-list section elements into one record per entry.

    The previous version built a fresh dict on every iteration but returned
    only a single one after the loop, so all other entries were lost and an
    empty ``res`` raised ``UnboundLocalError``. It also stored the raw XPath
    result lists, which cannot be written to a spreadsheet cell.

    Args:
        res: Iterable of elements exposing ``xpath()``; each one is queried
            with the four relative XPath expressions below.

    Returns:
        A list with one dict per element, keyed by '排名', '标题', '热度' and
        '地址'. Every value is a plain string: the matched text nodes joined
        together and stripped, or ``""`` when nothing matched.
    """
    field_queries = {
        '排名': ".//div[@class='HotItem-index']/div/text()",
        '标题': ".//div[@class='HotItem-content']/a/h2/text()",
        '热度': ".//div[@class='HotItem-metrics HotItem-metrics--bottom']/text()",
        '地址': ".//div[@class='HotItem-content']/a/@href",
    }
    rows = []
    for table in res:
        # xpath() returns a list of strings; collapse it into one cell value.
        rows.append({
            field: "".join(table.xpath(query)).strip()
            for field, query in field_queries.items()
        })
    return rows
-
def _cell_value(value):
    """Collapse a list/tuple of text fragments into one writable string."""
    if isinstance(value, (list, tuple)):
        return "".join(str(part) for part in value).strip()
    return value


def to_excel(data):
    """Write hot-list records to ``知乎热榜排行榜.xlsx`` in the working directory.

    The previous version iterated ``data`` directly; when ``data`` was a dict
    that yielded its keys, and appending a bare string to a worksheet raises
    ``TypeError``.

    Args:
        data: Either a single record (a dict keyed by the header names) or an
            iterable of rows, where each row is such a dict or a plain
            sequence of cell values in header order.
    """
    headers = ['排名', '标题', '热度', '地址']
    # Iterating a dict yields its keys, so a lone record must be wrapped.
    rows = [data] if isinstance(data, dict) else data

    wb = openpyxl.Workbook()
    # NOTE(review): believed to be honoured only by older openpyxl releases;
    # on newer ones this just sets an unused attribute. Kept so behaviour on
    # old installations is unchanged -- confirm against the installed version.
    wb.guess_types = True
    ws = wb.active
    ws.append(headers)
    for each in rows:
        if isinstance(each, dict):
            # Order the values by header so columns cannot be misaligned.
            cells = [_cell_value(each.get(name, "")) for name in headers]
        else:
            cells = [_cell_value(value) for value in each]
        ws.append(cells)
    wb.save("知乎热榜排行榜.xlsx")
def main():
    """Scrape the Zhihu hot list and export it to an Excel workbook."""
    hot_sections = get_url("https://www.zhihu.com/hot")
    to_excel(get_data(hot_sections))


# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
复制代码
我给你改完了代码,完整爬出来了 50 条,应该只是第一页。本来想先让你自己写一下的,算了。
- from lxml import etree
- import requests
- import openpyxl
def get_url(url):
    """Download a Zhihu hot-list page and return its ``HotItem`` sections.

    Args:
        url: Address of the page to download.

    Returns:
        The list of ``<section class="HotItem">`` elements found in the page
        (empty when the page contains none).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within 10 seconds.
    """
    # SECURITY NOTE(review): the Cookie below appears to embed a logged-in
    # session token (z_c0). It should come from configuration or the
    # environment instead of being pasted into source; it is left unchanged
    # here so the request keeps behaving as before.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36",
        'Cookie': '_zap=99431b2c-8767-4685-b98b-cd9d0d3d88cd; _xsrf=5p5XGCN215CjXjnbPuj1OmbQmvR2pQdQ; d_c0="AKDg7kP8dxCPTsNRshHIPT2mH69mOZb8xuE=|1575709991"; z_c0="2|1:0|10:1575709998|4:z_c0|92:Mi4xa2poTkNBQUFBQUFBb09EdVFfeDNFQ1lBQUFCZ0FsVk5MYnZZWGdEZ0xuLWJoVWFzV2ZtYjk4T2M3dHNYa1lRd1dn|ab32d6e8ac6c6c86e72b218ad426d24b26d9adea21fdbd28afe64ada5ac53e77"; _ga=GA1.2.2089582079.1583647145; q_c1=5919a9b1025f437f8105f1a16a9f0d36|1583647148000|1575710038000; _gid=GA1.2.633099727.1584344528; tst=h; tshl=; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1584357660,1584362899,1584406978,1584411187; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1584420694; KLBRSID=53650870f91603bc3193342a80cf198c|1584420984|1584420275',
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    ret = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an error page instead of silently parsing it into no rows.
    ret.raise_for_status()
    # Decode the raw bytes as UTF-8 explicitly. The former
    # ``ret.encoding = 'GBK'`` only affected ``ret.text``, which is never read,
    # so it was dead code and has been dropped.
    html_str = ret.content.decode("utf-8")
    html = etree.HTML(html_str)
    return html.xpath("//section[@class='HotItem']")
# Extract rank, title, heat and link of a single Zhihu hot-list entry.
def get_data(table):
    """Build the record for one hot-list section element.

    The previous version wrapped each XPath result in ``str()``, which stores
    the Python list representation (for example ``"['1']"``) rather than the
    text itself. The matched fragments are now joined into a clean string.

    Args:
        table: An element exposing ``xpath()``; it is queried with the four
            relative XPath expressions below.

    Returns:
        A dict keyed by '排名', '标题', '热度' and '地址'. Every value is a
        string: the matched text nodes joined and stripped, or ``""`` when
        nothing matched.
    """
    field_queries = {
        '排名': ".//div[@class='HotItem-index']/div/text()",
        '标题': ".//div[@class='HotItem-content']/a/h2/text()",
        '热度': ".//div[@class='HotItem-metrics HotItem-metrics--bottom']/text()",
        '地址': ".//div[@class='HotItem-content']/a/@href",
    }
    return {
        field: "".join(table.xpath(query)).strip()
        for field, query in field_queries.items()
    }
-
def to_excel(res, path="C:\\Users\\Chysial\\Desktop\\知乎热榜排行榜.xlsx"):
    """Write one spreadsheet row per hot-list section and save the workbook.

    Args:
        res: Iterable of section elements; each is converted to a record with
            ``get_data`` before being written.
        path: Destination of the ``.xlsx`` file. Defaults to the location the
            original script hard-coded, so existing calls behave as before;
            pass another path to run on a different machine or account.
    """
    headers = ['排名', '标题', '热度', '地址']
    wb = openpyxl.Workbook()
    # NOTE(review): believed to be honoured only by older openpyxl releases;
    # on newer ones this just sets an unused attribute. Kept so behaviour on
    # old installations is unchanged -- confirm against the installed version.
    wb.guess_types = True
    ws = wb.active
    ws.append(headers)
    for table in res:
        data = get_data(table)
        # Pick values by header name so columns never depend on dict order;
        # append() places each record on the next free row below the header.
        ws.append([data[name] for name in headers])
    wb.save(path)
# Script body: download the hot-list page and export its entries to Excel.
to_excel(get_url("https://www.zhihu.com/hot"))
复制代码
|
|