import urllib.request
from bs4 import BeautifulSoup as bs
import re
import openpyxl
def urlopen(url, timeout=30):
    """Fetch *url* with browser-like request headers and return the raw body.

    Args:
        url: Address to request.
        timeout: Seconds a blocking socket operation may take before the
            request is abandoned. Without a timeout a stalled connection
            would hang the caller indefinitely.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request cannot be completed
            (``urllib.error.HTTPError`` for HTTP error statuses).
    """
    head = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        # NOTE(review): this is a captured browser session cookie (it includes
        # login tokens such as ``dbcl2`` and ``ck``). Credentials should not
        # live in source control -- consider loading it from configuration.
        'Cookie': 'bid=Wv1u2my5GJI; gr_user_id=ec943490-8875-40fe-b5b9-538d784cbf84; _vwo_uuid_v2=D6C966AC33758154BD3FC61FB43687FE2|718456dd9cd6126870e9d38c3a11a25e; douban-fav-remind=1; viewed="26820803_1200840_30209224"; ps=y; dbcl2="186505260:SSyljm2guj8"; push_noty_num=0; push_doumail_num=0; ck=giUI; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1541515329%2C%22https%3A%2F%2Ffishc.com.cn%2Fthread-94979-1-1.html%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.1606666109.1528767205.1540645031.1541515329.9; __utmb=30149280.0.10.1541515329; __utmc=30149280; __utmz=30149280.1541515329.9.4.utmcsr=fishc.com.cn|utmccn=(referral)|utmcmd=referral|utmcct=/thread-94979-1-1.html; __utma=223695111.832413198.1541515329.1541515329.1541515329.1; __utmb=223695111.0.10.1541515329; __utmc=223695111; __utmz=223695111.1541515329.1.1.utmcsr=fishc.com.cn|utmccn=(referral)|utmcmd=referral|utmcct=/thread-94979-1-1.html; __yadk_uid=VJnac125pQedgBMAqBXBVd9hGRBckHeH; _pk_id.100001.4cf6=607e87647801a894.1541515329.1.1541515370.1541515329.',
        'Host': 'movie.douban.com',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    req = urllib.request.Request(url, headers=head)
    # The context manager closes the response (and its socket) even when
    # read() raises; the original left the connection open.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read()
def _parse_listing(listing_html):
    """Return ``(title, rating, link, director_line)`` tuples found in *listing_html*."""
    # Capture only the URL itself; the previous pattern kept the literal
    # 'href="' prefix inside the group, so every stored link was malformed.
    links_and_titles = re.findall(
        r'href="(.*?)">\n<span class="title">(.*?)<', listing_html)
    director_lines = re.findall(r'\n (.*?)<br', listing_html)
    ratings = re.findall(r'property="v:average">(.*?)<', listing_html)
    # zip() stops at the shortest list, so a page with fewer matches than
    # expected yields fewer rows instead of raising IndexError.
    return [
        (title, rating, link, director_line)
        for (link, title), director_line, rating
        in zip(links_and_titles, director_lines, ratings)
    ]


def xia():
    """Scrape the Douban Top 250 listing pages and save them to ``电影.xlsx``.

    Requests the ten listing pages (``start`` = 0, 25, ..., 225). For every
    movie found it prints the title, link, director line and rating, and
    appends one worksheet row ``[title, rating, link, director line]``.
    The workbook is written once, after all pages have been processed.

    Pages whose markup contains no ``<ol>`` element are skipped.
    """
    wb = openpyxl.Workbook()
    ws = wb.active
    for start in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
        html = urlopen(url).decode('utf-8')
        listing = bs(html, 'lxml').ol
        if listing is None:
            # No ordered list on this page, so there is nothing to extract.
            continue
        for title, rating, link, director_line in _parse_listing(str(listing)):
            print('电影名:' + title)
            print('链接:' + link)
            print('导演:' + director_line)
            print('电影评分' + rating + '\n\n')
            ws.append([title, rating, link, director_line])
    wb.save('电影.xlsx')
# Script entry point: the scraper runs whenever this module is executed
# (this call is unguarded, so it also runs on import).
xia()
|