求求大神帮忙找不同。

二百斤的浪子 · 发表于 2018-3-5 22:50:09

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

这是我的代码：import requests
from urllib.parse import urlencode
from requests.exceptions import RequestException
import json
import re
from  hashlib import md5
import os

cur_directory = os.getcwd()
def get_one_page(offset):
data = {
      'offset': '0',
      'format': 'json',
      'keyword': '桥本环奈',
      'autoload': 'true',
      'count': '20',
      'cur_tab': '1',
      'from':'search_tab'
}
url = 'https://www.toutiao.com/search_content/?'+ urlencode(data)
try:
      response = requests.get(url)
      if response.status_code == 200:
         return response.text
except RequestException:
      print('请求索引列表失败')
def jiexi(html):
try:
      html = json.loads(html)
      if html and 'data' in html.keys():
         for url in html.get('data'):
            yield url.get('article_url')
except:
      pass
def get_url(item):
try:
      headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.4858.400 QQBrowser/10.0.775.400'}
      response = requests.get(item,headers=headers)
      if response.status_code == 200:
         return response.text

except RequestException:
   print('请求详情页失败')
def get_html(html):
# soup = BeautifulSoup(html,'lxml')
# title = soup.select('title')[0].get_text()
pattern = re.compile('gallery: JSON.parse\((.*?)\)', re.S)
result = re.search(pattern, html)
if result:
      data = json.loads(json.loads(result.group(1)))
      if data and 'sub_images' in data.keys():
         sub_images = data.get('sub_images')
         images = [item.get('url') for item in sub_images ]
         for qq in images:
            open(qq)
def open(qq):
print('正在下载：',qq)
try:
      response = requests.get(qq)
      if response.status_code == 200:
         return dow(response.content)
except RequestException:
      print('请求索引列表失败')
def dow(content):
file_path = '{0}/{1}/{2}.{3}'.format(os.getcwd(),'jpg', md5(content).hexdigest(), 'jpg')

if not os.path.exists(file_path):
      with open(file_path,'wb') as f:
         f.write(content)
         f.close()
else:
      print('image already download: ',url)

def main():
html = get_one_page('0')
for item in jiexi(html):
      html = get_url(item)
      if html:
         images = get_html(html)

这是网上复制的别人的代码：
import requests
from urllib.parse import urlencode
from requests.exceptions import RequestException
import json
from bs4 import BeautifulSoup
import time
import re
from pprint import pprint
from config import *
import pymongo
import os
from  hashlib import md5
import os
from multiprocessing import Pool

cur_directory = os.getcwd()
if os.path.exists(cur_directory + '/jpg/'):
pass
else:
os.mkdir(cur_directory + '/jpg/')

def get_page_index(offset):
data = {
      'offset': '0',
      'format': 'json',
      'keyword': '桥本环奈',
      'autoload': 'true',
      'count': '20',
      'cur_tab': '3'
}
# 请求的url后面是需要把data编码，然后构建成一个完整的url
url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
try:
      response = requests.get(url)
      if response.status_code == 200:
         return response.text
      else:
         return None
except RequestException:
      print('get index error url is: ',url)
      return None

# 解析1个网页
# 请求网页后返回的response是一串字符串格式，通过json.loads方法转换成JSON变量
# 定义一个生成器，放入url
def parse_one_page_index(html):
try:
      data = json.loads(html)
# 查询是否有data和data的键名存在，并取出其中article_url
      if data and 'data' in data.keys():
         for item in data.get('data'):
            yield item.get('article_url')
except:
      pass
# 获取详情页的内容
def get_detail_page(url):
try:
      headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.4858.400 QQBrowser/10.0.775.400'}
      response = requests.get(url,headers=headers)
      if response.status_code == 200:
         return response.text
      else:
         return None
except RequestException:
      print('请求详情页错误',url)

# 解析详情页
def parse_detail_page(html):
# soup = BeautifulSoup(html,'lxml')
# title = soup.select('title')[0].get_text()
# print(title)
pattern = re.compile('gallery: JSON.parse\((.*?)\)', re.S)
result = re.search(pattern, html)
print(result)

if result:
      data = json.loads(json.loads(result.group(1)))
      if data and 'sub_images' in data.keys():
         sub_images = data.get('sub_images')
         images = [item.get('url') for item in sub_images]
         # print(type(images))

         for image in images:
            download_img(image)

      # print(type(data))
      # print(data)
      # print('*'*100)

def download_img(url):
print('downloading image',url)
try:
      response = requests.get(url)
      if response.status_code == 200:
         # return response.content
         save_image_local(response.content,url)
      else:
         return None
except RequestException:
      print('图片请求出错',url)
      return None

def save_image_local(content,url):
# 文件路径，文件名，文件名后缀
file_path = '{0}/{1}/{2}.{3}'.format(os.getcwd(),'jpg', md5(content).hexdigest(), 'jpg')

if not os.path.exists(file_path):
      with open(file_path,'wb') as f:
         f.write(content)
         f.close()
else:
      print('image already download: ',url)

def main(offset):
html = get_page_index('0')
for url in parse_one_page_index(html):
      if url:
         cc = get_detail_page(url)
         image = parse_detail_page(cc)

if __name__ == '__main__':
main('0')

不知道为什么我的代码下载不了图片，报错如下open() takes 1 positional argument but 2 were given，求求大神帮助

°蓝鲤歌蓝 · 发表于 2018-3-5 23:00:13

贴报错图片吧。

SixPy · 发表于 2018-3-6 00:55:55

@小甲鱼
手机能不能@好友。？

SixPy · 发表于 2018-3-6 00:56:52

好像并不能

小甲鱼 · 发表于 2018-3-7 11:40:12

SixPy 发表于 2018-3-6 00:56
好像并不能

可以哈~

塔利班 · 发表于 2018-3-7 12:28:48

小甲鱼发表于 2018-3-7 11:40
可以哈~

惊现野生小甲鱼，撒网

SixPy · 发表于 2018-3-7 18:03:31

小甲鱼发表于 2018-3-7 11:40
可以哈~

怎么用？
最好有个象表情那样的按钮

账号		自动登录	找回密码
密码			立即注册