| 
 | 
 
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册  
 
x
 
import requests
from lxml import etree

# Request headers sent with every page fetch.
# NOTE(review): the Cookie is a captured browser session pasted verbatim;
# swap in your own (or confirm it is still needed) before running.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
    'Cookie': 'll="108314"; bid=_LAWkGgi1Js; ct=y; __yadk_uid=89YZlsdVstEBGt8rNf4innJ79Hi3mosn; __gads=ID=92c5f47901f551a0-22f530c6f2c40037:T=1606531304:RT=1606531304:S=ALNI_Mae6Hq1eo6RkR_9HRim4ml_HoEYRg; __utmc=30149280; __utmz=30149280.1606727878.7.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmc=223695111; __utmz=223695111.1606727878.7.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _vwo_uuid_v2=D30C7E4BAE9F96458B12515507FA532DC|b7a3a014f000afb51be010698c310b59; __utma=30149280.1401849356.1606469592.1606797311.1606801152.10; __utmb=30149280.0.10.1606801152; __utma=223695111.352245982.1606469592.1606797311.1606801152.10; __utmb=223695111.0.10.1606801152; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1606801152%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D9C40AoCTrSNIfGCVB3nwTgy1k-fMWS3e2H0qSiDrPdvEzWRxS1RD8d937yYKIajE%26wd%3D%26eqid%3D8b4f698a0001959a000000035fc4b8c1%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; _pk_id.100001.4cf6=3e8dbc93b7d06862.1606469591.10.1606801589.1606798873.',
}

# Listing URLs for start offsets 0, 25, ..., 225.
urls = [
    'https://movie.douban.com/top250?start={}'.format(page * 25)
    for page in range(10)
]


def _first(nodes, default='暂无'):
    """Return the first XPath result, or *default* when nothing matched."""
    return nodes[0] if nodes else default


# Scraped records, one dict per movie, in page order.
movies = []
# Alias: the original script accumulated its results under the name `m`.
m = movies
# Running count of records collected so far (drives the progress message).
x = 0

for url in urls:
    # A timeout stops a stalled connection from hanging the script forever.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly on an error/anti-bot response instead of parsing it as an
    # empty listing.
    response.raise_for_status()
    content = response.content.decode('utf8')
    html = etree.HTML(content)

    # Extract every field relative to its own <li>. Scraping four page-wide
    # lists and zip()-ing them truncates to the shortest list, so one movie
    # without a quote silently drops a row and misaligns the rest.
    for item in html.xpath('//ol/li'):
        movie = {
            'mingcheng': _first(item.xpath('.//div[@class="hd"]/a/span[1]/text()')),
            'daoyan': _first(item.xpath('.//div[@class="bd"]/p[1]/text()[1]')),
            'pingfen': _first(item.xpath('.//span[@class="rating_num"]/text()')),
            'jianjie': _first(item.xpath('.//p[@class="quote"]/span/text()')),
        }
        movies.append(movie)
        x += 1
        print('正在加载第%d个' % x)
 
 
  复制代码 
 
 
 
为什么爬到第243个就停止了,不应该把这10个网页都爬完吗? 
求大神解答
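有些电影没有 quote(简介),所以四个列表的长度不一致;而 zip() 只会按最短的那个列表配对,多出来的数据会被直接丢掉,后面的简介还会和电影错位,这就是只爬到 243 个的原因。更稳妥的做法是先取出每部电影的 li,再对每个 li 单独用 xpath 提取各个字段,缺少的字段用默认值补上: 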
 
import json

import requests
from lxml import etree


def _parse_movie(li):
    """Build one movie record from a single ``<li>`` element of the listing.

    Args:
        li: An lxml element for one list entry.

    Returns:
        A dict with the keys ``rank``, ``name``, ``director``, ``score`` and
        ``quote``. ``quote`` falls back to ``'暂无'`` when the entry has none.

    Raises:
        IndexError: If the rank, name or score node is missing from the entry.
    """
    quote = li.xpath('./div/div[2]/div[2]/p[2]/span/text()')
    return {
        'rank': li.xpath('./div/div[1]/em/text()')[0],
        'name': li.xpath('./div/div[2]/div[1]/a/span[1]/text()')[0],
        # normalize-space() returns a plain string with whitespace collapsed.
        'director': li.xpath('normalize-space(./div/div[2]/div[2]/p/text()[1])'),
        'score': li.xpath('./div/div[2]/div[2]/div/span[2]/text()')[0],
        # Not every entry carries a quote; substitute a placeholder so every
        # record has the same keys.
        'quote': quote[0] if quote else '暂无',
    }


def main():
    """Scrape the ten Douban Top 250 listing pages into ``movie.json``.

    Fetches each page, parses every ``<li>`` under an ``<ol>`` into a record,
    prints a progress line per movie, and finally writes all records to
    ``movie.json`` as UTF-8 JSON.

    Raises:
        requests.RequestException: If a page cannot be fetched, times out, or
            returns an HTTP error status.
    """
    # NOTE(review): the Cookie is a captured browser session pasted verbatim;
    # swap in your own (or confirm it is still needed) before running.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'Cookie': 'll="108314"; bid=_LAWkGgi1Js; ct=y; __yadk_uid=89YZlsdVstEBGt8rNf4innJ79Hi3mosn; __gads=ID=92c5f47901f551a0-22f530c6f2c40037:T=1606531304:RT=1606531304:S=ALNI_Mae6Hq1eo6RkR_9HRim4ml_HoEYRg; __utmc=30149280; __utmz=30149280.1606727878.7.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmc=223695111; __utmz=223695111.1606727878.7.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _vwo_uuid_v2=D30C7E4BAE9F96458B12515507FA532DC|b7a3a014f000afb51be010698c310b59; __utma=30149280.1401849356.1606469592.1606797311.1606801152.10; __utmb=30149280.0.10.1606801152; __utma=223695111.352245982.1606469592.1606797311.1606801152.10; __utmb=223695111.0.10.1606801152; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1606801152%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D9C40AoCTrSNIfGCVB3nwTgy1k-fMWS3e2H0qSiDrPdvEzWRxS1RD8d937yYKIajE%26wd%3D%26eqid%3D8b4f698a0001959a000000035fc4b8c1%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; _pk_id.100001.4cf6=3e8dbc93b7d06862.1606469591.10.1606801589.1606798873.',
    }
    movies = []
    for page in range(10):
        url = f'https://movie.douban.com/top250?start={page * 25}'
        # A timeout stops a stalled connection from hanging the script forever.
        r = requests.get(url=url, headers=headers, timeout=10)
        # Fail loudly on an error/anti-bot response instead of silently
        # writing a short or empty movie.json.
        r.raise_for_status()
        html = etree.HTML(r.text)
        for li in html.xpath('//ol/li'):
            movie = _parse_movie(li)
            movies.append(movie)
            # len(movies) is the running total, so no separate counter is needed.
            print(f'已添加{movie["name"]}, 共添加{len(movies)}部。')
    with open('movie.json', 'w', encoding='utf-8') as f:
        json.dump(movies, f, indent=2, ensure_ascii=False)


if __name__ == '__main__':
    main()
 
  复制代码 
 
 
 |   
 
 
 
 |