|
|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
这是我的代码:import requests
from urllib.parse import urlencode
from requests.exceptions import RequestException
import json
import re
from hashlib import md5
import os
cur_directory = os.getcwd()
def get_one_page(offset):
data = {
'offset': '0',
'format': 'json',
'keyword': '桥本环奈',
'autoload': 'true',
'count': '20',
'cur_tab': '1',
'from':'search_tab'
}
url = 'https://www.toutiao.com/search_content/?'+ urlencode(data)
try:
response = requests.get(url)
if response.status_code == 200:
return response.text
except RequestException:
print('请求索引列表失败')
def jiexi(html):
try:
html = json.loads(html)
if html and 'data' in html.keys():
for url in html.get('data'):
yield url.get('article_url')
except:
pass
def get_url(item):
try:
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.4858.400 QQBrowser/10.0.775.400'}
response = requests.get(item,headers=headers)
if response.status_code == 200:
return response.text
except RequestException:
print('请求详情页失败')
def get_html(html):
# soup = BeautifulSoup(html,'lxml')
# title = soup.select('title')[0].get_text()
pattern = re.compile('gallery: JSON.parse\((.*?)\)', re.S)
result = re.search(pattern, html)
if result:
data = json.loads(json.loads(result.group(1)))
if data and 'sub_images' in data.keys():
sub_images = data.get('sub_images')
images = [item.get('url') for item in sub_images ]
for qq in images:
open(qq)
def open(qq):
print('正在下载:',qq)
try:
response = requests.get(qq)
if response.status_code == 200:
return dow(response.content)
except RequestException:
print('请求索引列表失败')
def dow(content):
file_path = '{0}/{1}/{2}.{3}'.format(os.getcwd(),'jpg', md5(content).hexdigest(), 'jpg')
if not os.path.exists(file_path):
with open(file_path,'wb') as f:
f.write(content)
f.close()
else:
print('image already download: ',url)
def main():
html = get_one_page('0')
for item in jiexi(html):
html = get_url(item)
if html:
images = get_html(html)
这是网上复制的别人的代码:
import requests
from urllib.parse import urlencode
from requests.exceptions import RequestException
import json
from bs4 import BeautifulSoup
import time
import re
from pprint import pprint
from config import *
import pymongo
import os
from hashlib import md5
import os
from multiprocessing import Pool
cur_directory = os.getcwd()
if os.path.exists(cur_directory + '/jpg/'):
pass
else:
os.mkdir(cur_directory + '/jpg/')
def get_page_index(offset):
data = {
'offset': '0',
'format': 'json',
'keyword': '桥本环奈',
'autoload': 'true',
'count': '20',
'cur_tab': '3'
}
# 请求的url后面是需要把data编码,然后构建成一个完整的url
url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
try:
response = requests.get(url)
if response.status_code == 200:
return response.text
else:
return None
except RequestException:
print('get index error url is: ',url)
return None
# 解析1个网页
# 请求网页后返回的response是一串字符串格式,通过json.loads方法转换成JSON变量
# 定义一个生成器,放入url
def parse_one_page_index(html):
try:
data = json.loads(html)
# 查询是否有data和data的键名存在,并取出其中article_url
if data and 'data' in data.keys():
for item in data.get('data'):
yield item.get('article_url')
except:
pass
# 获取详情页的内容
def get_detail_page(url):
try:
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.4858.400 QQBrowser/10.0.775.400'}
response = requests.get(url,headers=headers)
if response.status_code == 200:
return response.text
else:
return None
except RequestException:
print('请求详情页错误',url)
# 解析详情页
def parse_detail_page(html):
# soup = BeautifulSoup(html,'lxml')
# title = soup.select('title')[0].get_text()
# print(title)
pattern = re.compile('gallery: JSON.parse\((.*?)\)', re.S)
result = re.search(pattern, html)
print(result)
if result:
data = json.loads(json.loads(result.group(1)))
if data and 'sub_images' in data.keys():
sub_images = data.get('sub_images')
images = [item.get('url') for item in sub_images]
# print(type(images))
for image in images:
download_img(image)
# print(type(data))
# print(data)
# print('*'*100)
def download_img(url):
print('downloading image',url)
try:
response = requests.get(url)
if response.status_code == 200:
# return response.content
save_image_local(response.content,url)
else:
return None
except RequestException:
print('图片请求出错',url)
return None
def save_image_local(content,url):
# 文件路径,文件名,文件名后缀
file_path = '{0}/{1}/{2}.{3}'.format(os.getcwd(),'jpg', md5(content).hexdigest(), 'jpg')
if not os.path.exists(file_path):
with open(file_path,'wb') as f:
f.write(content)
f.close()
else:
print('image already download: ',url)
def main(offset):
html = get_page_index('0')
for url in parse_one_page_index(html):
if url:
cc = get_detail_page(url)
image = parse_detail_page(cc)
if __name__ == '__main__':
main('0')
不知道为什么我的代码下载不了图片,报错如下open() takes 1 positional argument but 2 were given,求求大神帮助 |
|