|

楼主 |
发表于 2020-8-7 14:06:37
|
显示全部楼层
import re
import urllib
import urllib.request
import random
# Base URL of the Maoyan board; the page offset is appended as a query parameter.
url = "https://maoyan.com/board/4?"

# Largest page number accepted by the prompts.
MAX_PAGE = 34
# The "offset" query parameter advances by this amount for each page.
PAGE_STEP = 10

# Accumulated results across all fetched pages.
listrank = []
listmes = []
listactor = []

# Pool of User-Agent strings; one is picked at random for every request.
header1 = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
header2 = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
# NOTE(review): this string has no closing parenthesis; kept exactly as it was.
header3 = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0"
header4 = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"
list1 = [header1, header2, header3, header4]

# Rank markup, e.g. <i class="board-index board-index-1">1</i>
RANK_PATTERN = re.compile(r'<i class="board-index board-index-\d*">(.*?)</i>')
# Movie title, taken from the poster image's alt text.
TITLE_PATTERN = re.compile(r'alt="(.*?)" class="board-img" />')
# Cast line, e.g.
#   <p class="star">
#       主演:梅尔·吉布森,苏菲·玛索,帕特里克·麦高汉
#   </p>
# The previous pattern used "(*?)", which is not a valid regular expression
# (re.error: nothing to repeat).  Surrounding whitespace is consumed with \s*
# so the group captures exactly the cast text, and both the ASCII and the
# full-width colon are accepted after the label.
ACTOR_PATTERN = re.compile(r'<p class="star">\s*主演[::](.*?)\s*</p>')


def prompt_page_number(prompt_text, error_text, read=input):
    """Ask repeatedly until a page number in 1..MAX_PAGE is entered.

    prompt_text is printed before every attempt; error_text is printed after
    every rejected attempt (out of range or not an integer).  read is the
    callable used to obtain one line of input and defaults to input().
    Returns the accepted page number as an int.
    """
    while True:
        print(prompt_text)
        try:
            page = int(read())
        except ValueError:
            # Non-numeric input used to crash the script; ask again instead.
            print(error_text)
            continue
        if 1 <= page <= MAX_PAGE:
            return page
        print(error_text)


def build_page_url(page: int) -> str:
    """Return the board URL for the given 1-based page number."""
    return url + "offset=" + str((page - 1) * PAGE_STEP)


def fetch_board_page(page: int) -> str:
    """Download one board page and return its decoded HTML text.

    A User-Agent is chosen at random from list1 for each request.
    """
    request = urllib.request.Request(
        build_page_url(page),
        headers={"User-Agent": random.choice(list1)},
    )
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(request) as response:
        return response.read().decode()


def parse_board_page(html: str):
    """Extract (ranks, titles, actors) from one page of board HTML.

    Each element of the returned tuple is a list of strings in document order.
    """
    return (
        RANK_PATTERN.findall(html),
        TITLE_PATTERN.findall(html),
        ACTOR_PATTERN.findall(html),
    )


def main():
    """Prompt for a page range, scrape every page in it and print the casts."""
    start = prompt_page_number("输入你想爬取电影的开始页数", "输入的开始页数有误请重新输入")
    end = prompt_page_number("输入你想爬取电影的结束页数", "输入的结束页数有误请重新输入")
    # If end < start the range is empty and nothing is fetched.
    for page in range(start, end + 1):
        ranks, titles, actors = parse_board_page(fetch_board_page(page))
        listrank.extend(ranks)
        listmes.extend(titles)
        listactor.extend(actors)
    print(listactor)


if __name__ == "__main__":
    main()
|