|
发表于 2018-11-7 22:04:39
|
显示全部楼层
- import urllib.request
- from bs4 import BeautifulSoup as bs
- import re
- import openpyxl
def urlopen(url):
    """Fetch *url* with browser-like request headers and return the raw body.

    Args:
        url: Address to request. The ``Host`` header below is fixed to
            ``movie.douban.com``, so the headers only match URLs on that host.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated from ``urllib.request.urlopen``
            when the request fails.
    """
    # NOTE(review): the Cookie value is a hard-coded browser session string.
    # It looks like it was captured from a logged-in browser, so it will
    # eventually expire and probably should not live in source control --
    # consider loading it from configuration instead.
    head = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'bid=Wv1u2my5GJI; gr_user_id=ec943490-8875-40fe-b5b9-538d784cbf84; _vwo_uuid_v2=D6C966AC33758154BD3FC61FB43687FE2|718456dd9cd6126870e9d38c3a11a25e; douban-fav-remind=1; viewed="26820803_1200840_30209224"; ps=y; dbcl2="186505260:SSyljm2guj8"; push_noty_num=0; push_doumail_num=0; ck=giUI; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1541515329%2C%22https%3A%2F%2Ffishc.com.cn%2Fthread-94979-1-1.html%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.1606666109.1528767205.1540645031.1541515329.9; __utmb=30149280.0.10.1541515329; __utmc=30149280; __utmz=30149280.1541515329.9.4.utmcsr=fishc.com.cn|utmccn=(referral)|utmcmd=referral|utmcct=/thread-94979-1-1.html; __utma=223695111.832413198.1541515329.1541515329.1541515329.1; __utmb=223695111.0.10.1541515329; __utmc=223695111; __utmz=223695111.1541515329.1.1.utmcsr=fishc.com.cn|utmccn=(referral)|utmcmd=referral|utmcct=/thread-94979-1-1.html; __yadk_uid=VJnac125pQedgBMAqBXBVd9hGRBckHeH; _pk_id.100001.4cf6=607e87647801a894.1541515329.1.1541515370.1541515329.',
        'Host': 'movie.douban.com',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    req = urllib.request.Request(url, headers=head)

    # Read inside a context manager so the response (and its underlying
    # connection) is closed even if read() raises.
    with urllib.request.urlopen(req) as response:
        return response.read()
def xia():
    """Scrape the Douban Top 250 movie list and save it to '电影.xlsx'.

    Requests the ten listing pages (``start=0, 25, ..., 225``), pulls each
    movie's link, title, director line and rating out of the page's first
    ``<ol>`` element with regular expressions, prints them, and appends one
    row per movie (title, rating, link, director line) to the active sheet
    of a new workbook. The workbook is written once, after all pages have
    been processed.

    Returns:
        None.
    """
    wb = openpyxl.Workbook()
    ws = wb.active

    # Compile the patterns once instead of on every page.
    # NOTE(review): the first group deliberately keeps the original capture,
    # which includes the leading 'href="' text -- confirm whether the saved
    # link is meant to carry that prefix.
    link_title_re = re.compile(r'(href=".*?)">\n<span class="title">(.*?)<')
    director_re = re.compile(r'\n (.*?)<br')
    rating_re = re.compile(r'property="v:average">(.*?)<')

    # Each listing page holds 25 entries; `start` is the offset of the first.
    for start in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
        page = urlopen(url).decode('utf-8')
        listing = str(bs(page, 'lxml').ol)

        links_titles = link_title_re.findall(listing)
        directors = director_re.findall(listing)
        ratings = rating_re.findall(listing)

        # zip() stops at the shortest list, so a page that yields fewer than
        # 25 matches (or has no <ol> at all) no longer raises IndexError and
        # throws away the rows already collected.
        for (link, title), director, rating in zip(links_titles, directors, ratings):
            print('电影名:'+title)
            print('链接:'+link)
            print('导演:'+director)
            print('电影评分'+rating+'\n\n')
            ws.append([title, rating, link, director])

    wb.save('电影.xlsx')
-
- xia()
复制代码 |
|