|
|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
- import requests
- from urllib.parse import urlencode
- from requests.exceptions import RequestException
- import json
- from bs4 import BeautifulSoup
- import re
- from hashlib import md5
- import os
def get_page_index(offset, keyword):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search term sent as the ``keyword`` query parameter.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any ``RequestException``).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(params)
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
        ),
    }
    try:
        # Without a timeout a stalled server blocks the crawl forever.
        # A timeout surfaces as requests.exceptions.Timeout, which is a
        # RequestException and is therefore handled below.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        print('请求索引出错')
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_index(html):
    """Yield the article URLs found in a search-index JSON response.

    Args:
        html: JSON text of the index response, or ``None`` / empty when the
            index request failed.

    Yields:
        Each non-empty ``article_url`` value from the objects listed under
        the top-level ``"data"`` key. Nothing is yielded when the input is
        missing, is not valid JSON, or has no usable ``"data"`` list.
    """
    if not html:
        # The index fetch signals failure with None; json.loads(None)
        # would raise TypeError.
        return
    try:
        payload = json.loads(html)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass, so this also works
        # on interpreters that predate JSONDecodeError.
        print('解析索引出错')
        return
    if not isinstance(payload, dict):
        return
    # "data" may be present but null, hence the `or []`.
    for item in payload.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without an article URL cannot be fetched; skip them
        # instead of handing None to the caller.
        if article_url:
            yield article_url
-
def get_page_detail(url):
    """Fetch the HTML of a single article page.

    Args:
        url: Address of the article page.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any ``RequestException``).
    """
    try:
        # The timeout stops one unresponsive page from stalling the whole
        # crawl; a Timeout is a RequestException and is reported below.
        response = requests.get(url, timeout=10)
    except RequestException:
        print('请求详情页出错', url)
        return None
    if response.status_code == 200:
        return response.text
    return None
-
def parse_page_detail(html, url):
    """Extract the title and gallery images of an article and download them.

    Args:
        html: HTML text of the article page.
        url: Address the page was fetched from; echoed back in the result.

    Returns:
        A dict with the keys ``'title'``, ``'url'`` and ``'images'`` when the
        page embeds a gallery object that has a ``sub_images`` entry,
        otherwise ``None``.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # A page without a <title> must not abort the crawl with IndexError.
    title = title_tags[0].get_text() if title_tags else ''
    print(title)

    images_pattern = re.compile('gallery: (.*?),\n siblingList', re.S)
    match = images_pattern.search(html)
    if not match:
        return None
    try:
        data = json.loads(match.group(1))
    except ValueError:
        # The captured text is not always valid JSON; skip this page.
        print('解析详情页出错', url)
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    # sub_images may be null, and individual entries may lack a URL.
    images = [
        item.get('url')
        for item in data.get('sub_images') or []
        if isinstance(item, dict) and item.get('url')
    ]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }
def download_image(url):
    """Download one image and pass its bytes to ``save_image``.

    Args:
        url: Image address. Protocol-relative addresses (``//host/path``)
            are accepted and fetched over ``http``.

    Returns:
        Always ``None``. Request failures are reported on stdout.
    """
    if not url:
        return None
    if url.startswith('//'):
        # Gallery entries are protocol-relative; requests refuses a URL
        # without a scheme (MissingSchema), so supply one explicitly.
        url = 'http:' + url
    try:
        # The timeout keeps a single slow image from blocking the crawl.
        response = requests.get(url, timeout=10)
    except RequestException:
        print('请求图片出错', url)
        return None
    if response.status_code == 200:
        save_image(response.content)
    return None
-
def save_image(content, directory=r'C:\Python34'):
    """Write image bytes to ``<directory>/<md5 of content>.jpg``.

    The file name is the MD5 hex digest of the content, so identical images
    map to the same path and are written only once.

    Args:
        content: Raw image bytes.
        directory: Folder to save into; created if it does not exist.
            Defaults to the location the script originally hard-coded.
    """
    # The raw-string default avoids relying on the invalid escape "\P".
    os.makedirs(directory, exist_ok=True)
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(directory, file_name)
    if not os.path.exists(file_path):
        # The with-block closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)
-
-
def main():
    """Crawl the first page of search results for '街拍' and save its images."""
    index_html = get_page_index(0, '街拍')
    if not index_html:
        # The index request failed (already reported); nothing to parse.
        return
    for url in parse_page_index(index_html):
        print(url)
        # A separate name keeps the index text from being overwritten
        # while the generator is still being consumed.
        detail_html = get_page_detail(url)
        if detail_html:
            parse_page_detail(detail_html, url)


if __name__ == '__main__':
    main()
复制代码
为什么一到下载图片这一步就报错?哪位大佬能帮我指点一下?
都说了让单个的测试你的函数了
这样测试一下不就知道哪个函数出错了么
def download_image(url):
    """Probe an image URL and print 'get' when it answers with HTTP 200.

    Diagnostic version of the downloader: nothing is saved. A request that
    raises ``RequestException`` is reported together with the URL.
    """
    try:
        reply = requests.get(url)
    except RequestException:
        print('请求图片出错', url)
        return None
    if reply.status_code == 200:
        print('get')


if __name__ == '__main__':
    # Same host and path, first without and then with an explicit scheme.
    for candidate in (
        '//p3.pstatp.com/origin/3c7c0001ed84f784045c',
        'http://p3.pstatp.com/origin/3c7c0001ed84f784045c',
    ):
        download_image(candidate)
复制代码
测试结果
- 请求图片出错 //p3.pstatp.com/origin/3c7c0001ed84f784045c
- get
复制代码
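能看出你错误的原因么?图片地址以 // 开头,缺少协议(http: 或 https:),requests 会抛出 MissingSchema 异常(它是 RequestException 的子类),所以在请求之前要先把协议补全,例如 'http:' + url。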
|
|