[已解决]请教一下，这段爬虫为啥爬不出东西？？？

slhlde · 发表于 2019-12-25 20:28:06

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

import urllib.request
import urllib.error
import re
import sys

class MovieComment():
def __init__(self):

      self.start=0
      self.param='&filter=&type='
      self.headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'}
      self.commentist=[]
      self.filepath='E:/Projects/Python-Spiders/Python网络爬虫-罗攀/comment.txt'

def getPage(self):
      try:
         for i in range(0,100,20):
            url = 'https://movie.douban.com/subject/25823277/comments?start=60&limit={i}&sort=new_score&status=P'
            request = urllib.request.Request(url,headers=self.headers)
            reponse = urllib.request.urlopen(request)
            page = reponse.read().decode('utf-8')
            print('正在抓取第' + str(i) + '页数据')
            return page
      except urllib.error.URLError as e:
         if hasattr(e,"reason"):
            print('抓取失败',e.reason)
def getMovie(self):
      pattern=re.compile(u'<div.*?class="avatar">.*?'
                           + u'<a.*?title="(.*?)".*?href=".*?">.*?</a>.*?'
                           + u'<p.*?class="">(.*?)</p>',re.S)
      page=self.getPage()
      comments=re.findall(pattern,page)
      for comment in comments:
         self.commentist.append([comment[0],comment[1].strip()])
         with open('comment.txt','wb') as file:
            file.write(self.commentist.append([comment[0],comment[1].strip()]))

            file.close()
def main(self):
      print('正在从《三生三世十里桃花》电影短评中抓取数据...')
      self.getMovie()
      print('抓取完毕...')

DouBanSpider = MovieComment()
DouBanSpider.main()

最佳答案

月排行榜 / 总排行榜

zltzlt

2019-12-25 20:32:07

代码很多问题，帮楼主修改了。

import urllib.request
import urllib.error
import re
class MovieComment:
def __init__(self):
self.start = 0
self.param = '&filter=&type='
self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, '
'like Gecko) Chrome/60.0.3112.90 Safari/537.36'}
self.commentist = []
self.filepath = 'E:/Projects/Python-Spiders/Python网络爬虫-罗攀/comment.txt'
def getPage(self):
try:
for i in range(0, 100, 20):
url = 'https://movie.douban.com/subject/25823277/comments?start=60&limit={i}&sort=new_score&status=P'
request = urllib.request.Request(url, headers=self.headers)
reponse = urllib.request.urlopen(request)
page = reponse.read().decode('utf-8')
print('正在抓取第' + str(i) + '页数据')
return page
except urllib.error.URLError as e:
if hasattr(e, "reason"):
print('抓取失败', e.reason)
def getMovie(self):
pattern = re.compile(u'<div.*?class="avatar">.*?'
+ u'<a.*?title="(.*?)".*?href=".*?">.*?</a>.*?'
+ u'<p.*?class="">(.*?)</p>', re.S)
page = self.getPage()
comments = re.findall(pattern, page)
for comment in comments:
self.commentist.append([comment[0], comment[1].strip()])
with open('comment.txt', 'wb') as file:
print(comment[0])
self.commentist.append([comment[0], comment[1].strip()])
file.write(" ".join([comment[0], comment[1].strip()]).encode())
def main(self):
print('正在从《三生三世十里桃花》电影短评中抓取数据...')
self.getMovie()
print('抓取完毕...')
DouBanSpider = MovieComment()
DouBanSpider.main()

复制代码

跳转到最佳答案楼层

Stubborn · 发表于 2019-12-25 20:30:54

url = 'https://movie.douban.com/subject/25823277/comments?start=60&limit={i}&sort=new_score&status=P'

复制代码

这是什么语法，其他的暂时没看

zltzlt · 发表于 2019-12-25 20:32:07

这个最佳答案由 zltzlt 给出，感谢 zltzlt 的回答。

单击隐藏图章

代码很多问题，帮楼主修改了。

import urllib.request
import urllib.error
import re
class MovieComment:
def __init__(self):
self.start = 0
self.param = '&filter=&type='
self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, '
'like Gecko) Chrome/60.0.3112.90 Safari/537.36'}
self.commentist = []
self.filepath = 'E:/Projects/Python-Spiders/Python网络爬虫-罗攀/comment.txt'
def getPage(self):
try:
for i in range(0, 100, 20):
url = 'https://movie.douban.com/subject/25823277/comments?start=60&limit={i}&sort=new_score&status=P'
request = urllib.request.Request(url, headers=self.headers)
reponse = urllib.request.urlopen(request)
page = reponse.read().decode('utf-8')
print('正在抓取第' + str(i) + '页数据')
return page
except urllib.error.URLError as e:
if hasattr(e, "reason"):
print('抓取失败', e.reason)
def getMovie(self):
pattern = re.compile(u'<div.*?class="avatar">.*?'
+ u'<a.*?title="(.*?)".*?href=".*?">.*?</a>.*?'
+ u'<p.*?class="">(.*?)</p>', re.S)
page = self.getPage()
comments = re.findall(pattern, page)
for comment in comments:
self.commentist.append([comment[0], comment[1].strip()])
with open('comment.txt', 'wb') as file:
print(comment[0])
self.commentist.append([comment[0], comment[1].strip()])
file.write(" ".join([comment[0], comment[1].strip()]).encode())
def main(self):
print('正在从《三生三世十里桃花》电影短评中抓取数据...')
self.getMovie()
print('抓取完毕...')
DouBanSpider = MovieComment()
DouBanSpider.main()

复制代码

slhlde · 发表于 2019-12-25 20:37:41

zltzlt 发表于 2019-12-25 20:32
代码很多问题，帮楼主修改了。

首先感谢您的回复
您的代码我试了，可以跑但是只能抓一页？？我中间写了一个循环为啥没有起到作用？？

zltzlt · 发表于 2019-12-25 20:42:28

用这段代码：

import urllib.request
import urllib.error
import re
class MovieComment:
def __init__(self):
self.start = 0
self.param = '&filter=&type='
self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, '
'like Gecko) Chrome/60.0.3112.90 Safari/537.36'}
self.commentist = []
self.filepath = 'E:/Projects/Python-Spiders/Python网络爬虫-罗攀/comment.txt'
def getPage(self):
try:
for i in range(0, 100, 20):
url = f'https://movie.douban.com/subject/25823277/comments?start=60&limit=' + \
f'{i}&sort=new_score&status=P'
request = urllib.request.Request(url, headers=self.headers)
reponse = urllib.request.urlopen(request)
page = reponse.read().decode('utf-8')
print('正在抓取第' + str(i) + '页数据')
yield page
except urllib.error.URLError as e:
if hasattr(e, "reason"):
print('抓取失败', e.reason)
def getMovie(self):
pattern = re.compile(u'<div.*?class="avatar">.*?'
+ u'<a.*?title="(.*?)".*?href=".*?">.*?</a>.*?'
+ u'<p.*?class="">(.*?)</p>', re.S)
page = "".join(list(self.getPage()))
comments = re.findall(pattern, page)
for comment in comments:
self.commentist.append([comment[0], comment[1].strip()])
with open('comment.txt', 'wb') as file:
print(comment[0])
self.commentist.append([comment[0], comment[1].strip()])
file.write(" ".join([comment[0], comment[1].strip()]).encode())
def main(self):
print('正在从《三生三世十里桃花》电影短评中抓取数据...')
self.getMovie()
print('抓取完毕...')
DouBanSpider = MovieComment()
DouBanSpider.main()

复制代码

slhlde · 发表于 2019-12-25 21:15:59

zltzlt 发表于 2019-12-25 20:42
用这段代码：

多谢回复~~~

账号		自动登录	找回密码
密码			立即注册

[已解决]请教一下 ，这段爬虫为啥爬不出东西？？？

马上注册，结交更多好友，享用更多功能^_^

浏览过的版块

[已解决]请教一下，这段爬虫为啥爬不出东西？？？