|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
有没有大神,帮我看看这哪错了,这是爬取头条美图- import requests
- import json
- import re
- import os
- from urllib.parse import urlencode
- from requests.exceptions import RequestException
- from bs4 import BeautifulSoup
- from hashlib import md5
- def get_page_index(offset,keyword):
- data={
- 'offset':offset,
- 'format': 'json',
- 'keyword': keyword,
- 'autoload': 'true',
- 'count': 20,
- 'cur_tab': 1,
- 'from':'search_tab'
- }#字典的形式
- url='https://www.toutiao.com/search_content/?'+urlencode(data)#把字典对象转化为URL请求参数
- headers = {
- 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
- }
- try:
- response=requests.get(url,headers=headers)
- if response.status_code==200:
- return response.text
- return None
- except RequestException:
- print('请求索引页错误')
- return None
- def parse_page_index(html):
- data=json.loads(html)
- if data and 'data' in data.keys():#键值名
- for item in data.get('data'):
- yield item.get('article_url')#提取article_url的信息
- def get_page_detail(url):
- headers = {
- 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
- }
- try:
- response=requests.get(url,headers=headers)
- if response.status_code==200:
- return response.text
- return None
- except RequestException:
- print('请求索引页错误')
- return None
- def parse_page_detail(html,url):
- soup=BeautifulSoup(html,'lxml')
- title=soup.select('title')[0].get_text()
- images_pattern=re.compile('gallery: JSON.parse\("(.*?)"\)',re.S)
- result=re.search(images_pattern,html)
- if result:
- data=json.loads(result.group(1).replace('\\',''))
- if data and 'sub_images' in data.keys():
- sub_images=data.get('sub_images')
- images=[item.get('url') for item in sub_images]
- return{
- 'title':title,
- 'url':url,
- 'images':images
- }
- def main():
- html=get_page_index(0,'街拍')
- for url in parse_page_index(html):
- html=get_page_detail(url)
- if html:
- result=parse_page_detail(html,url)
- print(result)
- if __name__=='__main__':
- main()
复制代码
错误是 IndexError: list index out of range |
|