|
|
发表于 2019-6-3 21:50:11
|
显示全部楼层
1. 建议把每个函数单独调试一遍,这样一般就不会出错了。
2. 先把网页源码格式化一下,再去分析。
3. 可以参考我爬取猫眼电影的代码,如下:
import requests
import time
import pandas as pd
from lxml import etree
def get_page(url, timeout=10):
    """Fetch one board page and parse its movie entries into a DataFrame.

    Args:
        url: Address of the board page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        A ``pandas.DataFrame`` with one row per movie and five positional
        columns (rank, title, stars, release date, score), or ``None`` when
        the page could not be downloaded or parsed.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
        # Treat HTTP error statuses as failures instead of parsing an error page.
        res.raise_for_status()
        res.encoding = 'utf-8'
        html = etree.HTML(res.text)
    except (requests.RequestException, etree.LxmlError, ValueError) as e:
        # Best-effort: report the failure and let the caller skip this page.
        print('页面获取失败: ' + url + ' (' + repr(e) + ')')
        return None
    if html is None:
        print('页面解析失败: ' + url)
        return None

    entry = '//dl[@class="board-wrapper"]/dd'
    index = html.xpath(entry + '/i/text()')  # rank
    title = html.xpath(entry + '/a/@title')  # movie title
    # Drop the first 3 characters of each "star" text
    # (presumably a "主演:"-style label -- confirm against the page).
    star = [s.strip()[3:] for s in html.xpath(entry + '/div//p[@class="star"]/text()')]
    # Keep characters 5..14 of each "releasetime" text
    # (presumably skips a label and keeps a YYYY-MM-DD date -- confirm).
    release_time = [t.strip()[5:15] for t in html.xpath(entry + '/div//p[@class="releasetime"]/text()')]
    # The score is split across two <i> elements; they are concatenated below.
    integer = html.xpath(entry + '/div//p[@class="score"]/i[1]/text()')
    fraction = html.xpath(entry + '/div//p[@class="score"]/i[2]/text()')

    columns = (index, title, star, release_time, integer, fraction)
    # A column shorter than the rank column means the page did not have the
    # expected layout; report it rather than failing on an index lookup.
    if any(len(col) < len(index) for col in columns):
        print('页面结构异常: ' + url)
        return None

    rows = [
        (rank, name, actors, date, whole + frac)
        for rank, name, actors, date, whole, frac in zip(*columns)
    ]
    return pd.DataFrame(rows)
def write_to_file(data, path=r'C:\Users\Danta\Desktop\maoyandianying.csv', encoding='ANSI'):
    """Append the scraped rows to a CSV file.

    Args:
        data: DataFrame with five columns (rank, title, stars, release
            date, score). ``None`` or an empty frame is skipped.
        path: Destination CSV file; defaults to the original hard-coded
            location.
        encoding: Text encoding for the file. The default ``'ANSI'`` is an
            alias Python only provides on Windows; pass e.g. ``'utf-8-sig'``
            on other platforms.
    """
    import os

    # An empty frame has no columns, so the 5-name header would not match.
    if data is None or data.empty:
        print('没有数据可写入')
        return

    header = ['排名', '电影名', '主演', '上映时间', '评分']
    # The file is opened in append mode, so only emit the header when the
    # file is new or empty; otherwise every run would add another header row.
    is_new_file = not os.path.exists(path) or os.path.getsize(path) == 0
    data.to_csv(
        path,
        header=header if is_new_file else False,
        index=False,
        mode="a",
        encoding=encoding,
    )
def main(page):
    """Crawl ``page`` board pages and write the combined result to CSV.

    Args:
        page: Number of pages to fetch; page ``i`` is requested with
            ``offset = i * 10``.
    """
    frames = []
    for i in range(page):
        url = 'https://maoyan.com/board/4?offset=' + str(i * 10)
        page_data = get_page(url)
        # get_page returns None on failure; only report success for real data.
        if page_data is None or page_data.empty:
            print('第' + str(i + 1) + '页爬取失败' + '\n')
        else:
            frames.append(page_data)
            print('第' + str(i + 1) + '页爬取成功' + '\n')
        # Pause between requests to avoid hammering the site.
        time.sleep(2)

    if not frames:
        print('\n' + '没有爬取到数据,未写入文件')
        return

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    data = pd.concat(frames, ignore_index=True)
    write_to_file(data)
    print('\n' + '写入完成')
# Script entry point: crawl 10 pages when run directly (not on import).
if __name__ == "__main__":
    main(10)
|
|