|
|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
# -*- coding:utf-8 -*-
2 __author__ = 'Jz'
import io
import re
import sys
import urllib2
6
class MovieTop250:
    """Scrape the Douban movie Top 250 list and save it to a text file.

    The list is fetched page by page from
    ``http://movie.douban.com/top250?start=N`` (N = 0, 25, ..., 225), each
    page is parsed with one regular expression, and the collected records
    are written to ``self.filePath``.

    This class targets Python 2: it uses ``urllib2`` and the
    ``reload(sys)`` / ``sys.setdefaultencoding`` pair.
    """

    # Movies per result page; also the step of the ``start`` query parameter.
    _PAGE_SIZE = 25
    # ``start`` offset of the last result page requested by getMovie().
    _LAST_START = 225

    # One match per movie entry, twelve capture groups per match.
    #
    # How to read it:
    #   * ``.*?`` is a lazy "skip anything" -- it consumes as little as
    #     possible before the next literal piece can match.
    #   * ``re.S`` makes ``.`` match newlines too, so the skips can cross
    #     line breaks inside the HTML.
    #   * ``<tag.*?class="x">`` matches an opening tag whose class is "x",
    #     whatever other attributes come before the class.
    #   * ``(.*?)`` captures the text up to the next literal piece.
    #
    # NOTE(review): an entry that lacks one of the pieces (for example the
    # second title span or the quote paragraph) cannot match by itself;
    # because the skips are unbounded, the match may then run on into the
    # following entry and mix fields of two movies.
    # NOTE(review): the literal spaces after the director and around "/" are
    # reproduced as they appear in this source; the live markup may use
    # "&nbsp;" entities there -- verify against a real page.
    _MOVIE_PATTERN = re.compile(
        # Start of one movie entry, then its picture column.
        u'<div.*?class="item">.*?<div.*?class="pic">.*?'
        # Group 1: rank, the text of <em class="">.
        u'<em.*?class="">(.*?)</em>.*?'
        # Group 2: first <span class="title"> (movie title).
        u'<div.*?class="info">.*?<span.*?class="title">(.*?)'
        # Group 3: second <span class="title"> (foreign-language title).
        u'</span>.*?<span.*?class="title">(.*?)</span>.*?'
        # Group 4: <span class="other"> (aliases), then the end of the link.
        u'<span.*?class="other">(.*?)</span>.*?</a>.*?'
        # Body of the entry and its first, class-less paragraph.
        u'<div.*?class="bd">.*?<p.*?class="">.*?'
        # Group 5: director, the text between "导演: " and the next space
        # that is followed by "主演: ".
        u'导演: (.*?) '
        # Group 6: leading cast, up to the <br>.
        u'主演: (.*?)<br>'
        # Groups 7 and 8: year and country/region, separated by " / ".
        u'(.*?) / (.*?) / '
        # Group 9: genres, up to the end of the paragraph.
        u'(.*?)</p>'
        # Group 10: rating, the <em> inside <div class="star">.
        u'.*?<div.*?class="star">.*?<em>(.*?)</em>'
        # Group 11: number of raters, the text before "人评价".
        u'.*?<span>(.*?)人评价</span>.*?<p.*?class="quote">.*?'
        # Group 12: one-line quote, <span class="inq">.
        u'<span.*?class="inq">(.*?)</span>.*?</p>', re.S)

    # Labels written in front of the twelve fields of a record, in order.
    _FIELD_LABELS = (
        u'电影排名:',
        u'电影名称:',
        u'外文名称:',
        u'电影别名:',
        u'导演姓名:',
        u'参与主演:',
        u'上映年份:',
        u'制作国家/地区:',
        u'电影类别:',
        u'电影评分:',
        u'参评人数:',
        u'简短影评:',
    )

    def __init__(self):
        """Set the default encoding and initialise the scraper state."""
        # Set the default encoding to utf-8 (Python 2 only, process-wide).
        # Kept so that code relying on implicit str/unicode conversion keeps
        # working; writeTxt() itself encodes explicitly.
        reload(sys)
        sys.setdefaultencoding('utf-8')
        # Offset of the next page to fetch (value of the ``start`` parameter).
        self.start = 0
        self.param = '&filter=&type='
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64)'}
        # Parsed records, one 12-item list per movie.
        self.movieList = []
        self.filePath = 'D:/coding_file/python_file/File/DoubanTop250.txt'

    def getPage(self):
        """Download the result page at the current offset.

        Returns:
            The page decoded from UTF-8. On success ``self.start`` is
            advanced by one page. On ``urllib2.URLError`` the failure is
            printed, ``self.start`` is left unchanged and None is returned.
        """
        url = 'http://movie.douban.com/top250?start=' + str(self.start)
        request = urllib2.Request(url=url, headers=self.headers)
        try:
            response = urllib2.urlopen(request)
            try:
                page = response.read().decode('utf-8')
            finally:
                # Release the connection even if reading/decoding fails.
                response.close()
        except urllib2.URLError as e:
            # Fall back to the exception itself when it has no ``reason``
            # so that no failure goes unreported.
            print('抓取失败,具体原因: %s' % (getattr(e, 'reason', e),))
            return None
        pageNum = (self.start + self._PAGE_SIZE) // self._PAGE_SIZE
        print('正在抓取第' + str(pageNum) + '页数据...')
        self.start += self._PAGE_SIZE
        return page

    def getMovie(self):
        """Fetch every remaining page and append its movies to movieList.

        Each appended record is a list of the twelve captured groups, with
        the leading " / " characters stripped from the foreign title and the
        aliases, leading whitespace stripped from the year and trailing
        whitespace stripped from the genres.
        """
        while self.start <= self._LAST_START:
            page = self.getPage()
            if page is None:
                # getPage() reported the error and did not advance. Skip the
                # page: parsing None would raise TypeError, and retrying the
                # same offset unconditionally could loop forever.
                self.start += self._PAGE_SIZE
                continue
            for movie in self._MOVIE_PATTERN.findall(page):
                self.movieList.append([
                    movie[0], movie[1], movie[2].lstrip(' / '),
                    movie[3].lstrip(' / '), movie[4],
                    movie[5], movie[6].lstrip(), movie[7], movie[8].rstrip(),
                    movie[9], movie[10], movie[11],
                ])

    def writeTxt(self):
        """Write all collected records to ``self.filePath`` as UTF-8 text.

        Every field goes on its own CRLF-terminated line, prefixed with its
        label; records are separated by one empty line.
        """
        # newline='' disables newline translation, so the explicit '\r\n'
        # is written as-is instead of becoming '\r\r\n' on Windows.
        with io.open(self.filePath, 'w', encoding='utf-8',
                     newline='') as fileTop250:
            for movie in self.movieList:
                record = u''.join(
                    label + value + u'\r\n'
                    for label, value in zip(self._FIELD_LABELS, movie))
                fileTop250.write(record + u'\r\n')
            print('文件写入成功...')

    def main(self):
        """Run the whole job: scrape all pages, then write the text file."""
        print('正在从豆瓣电影Top250抓取数据...')
        self.getMovie()
        self.writeTxt()
        print('抓取完毕...')
80
# Script entry: build the spider and run the full scrape-and-save job.
DouBanSpider = MovieTop250()
DouBanSpider.main()
这是一段爬取豆瓣电影 Top250 信息的代码,
其中有一段正则表达式,希望有大神能帮忙解释一下。 |
|