|
|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import urllib.request
import urllib.error
import re
import sys
class MovieComment():
def __init__(self):
self.start=0
self.param='&filter=&type='
self.headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'}
self.commentist=[]
self.filepath='E:/Projects/Python-Spiders/Python网络爬虫-罗攀/comment.txt'
def getPage(self):
try:
for i in range(0,100,20):
url = 'https://movie.douban.com/subject/25823277/comments?start=60&limit={i}&sort=new_score&status=P'
request = urllib.request.Request(url,headers=self.headers)
reponse = urllib.request.urlopen(request)
page = reponse.read().decode('utf-8')
print('正在抓取第' + str(i) + '页数据')
return page
except urllib.error.URLError as e:
if hasattr(e,"reason"):
print('抓取失败',e.reason)
def getMovie(self):
pattern=re.compile(u'<div.*?class="avatar">.*?'
+ u'<a.*?title="(.*?)".*?href=".*?">.*?</a>.*?'
+ u'<p.*?class="">(.*?)</p>',re.S)
page=self.getPage()
comments=re.findall(pattern,page)
for comment in comments:
self.commentist.append([comment[0],comment[1].strip()])
with open('comment.txt','wb') as file:
file.write(self.commentist.append([comment[0],comment[1].strip()]))
file.close()
def main(self):
print('正在从《三生三世十里桃花》电影短评中抓取数据...')
self.getMovie()
print('抓取完毕...')
DouBanSpider = MovieComment()
DouBanSpider.main()
代码很多问题,帮楼主修改了。
- import urllib.request
- import urllib.error
- import re
- class MovieComment:
- def __init__(self):
- self.start = 0
- self.param = '&filter=&type='
- self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, '
- 'like Gecko) Chrome/60.0.3112.90 Safari/537.36'}
- self.commentist = []
- self.filepath = 'E:/Projects/Python-Spiders/Python网络爬虫-罗攀/comment.txt'
- def getPage(self):
- try:
- for i in range(0, 100, 20):
- url = 'https://movie.douban.com/subject/25823277/comments?start=60&limit={i}&sort=new_score&status=P'
- request = urllib.request.Request(url, headers=self.headers)
- reponse = urllib.request.urlopen(request)
- page = reponse.read().decode('utf-8')
- print('正在抓取第' + str(i) + '页数据')
- return page
- except urllib.error.URLError as e:
- if hasattr(e, "reason"):
- print('抓取失败', e.reason)
- def getMovie(self):
- pattern = re.compile(u'<div.*?class="avatar">.*?'
- + u'<a.*?title="(.*?)".*?href=".*?">.*?</a>.*?'
- + u'<p.*?class="">(.*?)</p>', re.S)
- page = self.getPage()
- comments = re.findall(pattern, page)
- for comment in comments:
- self.commentist.append([comment[0], comment[1].strip()])
- with open('comment.txt', 'wb') as file:
- print(comment[0])
- self.commentist.append([comment[0], comment[1].strip()])
- file.write(" ".join([comment[0], comment[1].strip()]).encode())
- def main(self):
- print('正在从《三生三世十里桃花》电影短评中抓取数据...')
- self.getMovie()
- print('抓取完毕...')
- DouBanSpider = MovieComment()
- DouBanSpider.main()
复制代码
|
|