马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
#导包
import requests
import time
import os
import threading
import parsel
# Make sure the output directory exists. exist_ok=True avoids the
# check-then-create race of testing os.path.exists() before os.mkdir().
os.makedirs('image', exist_ok=True)

# Listing page that is scraped for images.
base_url = 'https://anime-pictures.net/pictures/view_posts/0?lang=en'

# Request headers sent with every HTTP call; only a User-Agent is set.
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
def get(url, headers, cookie=None):
    """Fetch a page and return its body as text.

    Args:
        url: Address to request.
        headers: Mapping of HTTP request headers.
        cookie: Optional cookies (dict or CookieJar) forwarded as the
            ``cookies`` argument of ``requests.get``. Defaults to None so
            callers that have no cookies can omit it.

    Returns:
        The decoded response body (``response.text``).
    """
    request_kwargs = {'headers': headers}
    if cookie is not None:
        request_kwargs['cookies'] = cookie
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is ``params``, which would turn the headers into a query
    # string and leave the User-Agent unset.
    response = requests.get(url, **request_kwargs)
    return response.text
def parsel_data(html_data):
    """Extract and download the first usable image from a listing page.

    Args:
        html_data: HTML text of the listing page.

    Returns:
        A tuple ``(all_title, image_id, img_data)`` holding the image URL,
        the ``id`` attribute of the ``<img>`` element and the downloaded
        bytes, or None when no entry with a ``src`` attribute is found.
    """
    selector = parsel.Selector(html_data)
    result_list = selector.xpath('//span[@class="img_block_big"]')
    for result in result_list:
        image_url = result.xpath('./a/picture/source/img/@src').extract_first()
        image_id = result.xpath('./a/picture/source/img/@id').extract_first()
        if image_url is None:
            # extract_first() yields None when the attribute is missing;
            # concatenating it below would raise TypeError.
            continue
        # The scheme is prepended by hand; presumably the page uses
        # protocol-relative URLs ("//host/path") - confirm against the site.
        all_title = 'https:' + image_url
        img_data = requests.get(url=all_title, headers=headers).content
        # Returning here ends the loop, so only the first usable entry is
        # ever reported to the caller.
        return all_title, image_id, img_data
    return None
def save(all_title, image_id, img_data):
    """Write image bytes to a file inside the ``image`` directory.

    Args:
        all_title: Image URL or file name. Only the last path component
            (without any query string) is used as the file name, because a
            full URL is not a valid file name.
        image_id: Identifier shown in the status message; also used as the
            file name when ``all_title`` has no usable last component.
        img_data: Raw bytes to write.

    Returns:
        None. Failures are reported with a printed message, not raised.
    """
    try:
        # Drop the query string, then keep what follows the last '/'.
        file_name = all_title.split('?', 1)[0].rstrip('/').rsplit('/', 1)[-1]
        if not file_name:
            file_name = str(image_id)
        os.makedirs('image', exist_ok=True)
        # os.path.join picks the right separator on every platform.
        with open(os.path.join('image', file_name), mode='wb') as f:
            f.write(img_data)
    except (OSError, TypeError, AttributeError):
        # Best-effort save: report the failure and carry on.
        print('保存失败')
    else:
        # Announce success only after the data has actually been written.
        print('保存成功:', image_id)
def sleep(time):
    """Pause execution for ``time`` seconds.

    Args:
        time: Number of seconds to wait.
    """
    # The parameter is named ``time`` and therefore hides the ``time`` module
    # inside this function; import the module under another name so that the
    # call below reaches the module rather than the number.
    import time as time_module
    time_module.sleep(time)
# Fetch the listing page and print what was extracted from it, 100 times.
# NOTE(review): every iteration requests the same base_url, so each pass
# presumably reports the same image - confirm whether paging was intended.
for _ in range(0, 100):
    # get() takes a cookie argument; no cookies are needed here, so pass None
    # explicitly instead of omitting the argument.
    html_data = get(url=base_url, headers=headers, cookie=None)
    image_data = parsel_data(html_data=html_data)
    if image_data is None:
        # Nothing was extracted from the page; indexing None would raise.
        continue
    all_title = image_data[0]  # image URL
    img_id = image_data[1]  # id attribute of the <img> element
    img_data = image_data[2]  # downloaded bytes
    print(all_title, img_id, img_data)
调试时会发现,它一直在重复获取同一张图片的数据。
本帖最后由 yjsx86 于 2020-7-21 10:50 编辑
把 parsel_data 函数改成生成器就行:将循环里的 return 改为 yield,这样每张图片都会依次返回。
# 导包
import requests
import time
import os
import threading
import parsel
# Make sure the output directory exists. exist_ok=True avoids the
# check-then-create race of testing os.path.exists() before os.mkdir().
os.makedirs('image', exist_ok=True)

# Listing page that is scraped for images.
base_url = 'https://anime-pictures.net/pictures/view_posts/0?lang=en'

# Request headers sent with every HTTP call; only a User-Agent is set.
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
def get(url, headers):
    """Fetch a page and return its body as text.

    Args:
        url: Address to request.
        headers: Mapping of HTTP request headers.

    Returns:
        The decoded response body (``response.text``).
    """
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is ``params``, which would turn the headers into a query
    # string and leave the User-Agent unset.
    response = requests.get(url, headers=headers)
    return response.text
def parsel_data(html_data):
    """Yield every downloadable image found on a listing page.

    Args:
        html_data: HTML text of the listing page.

    Yields:
        Tuples ``(all_title, image_id, img_data)`` holding the image URL,
        the ``id`` attribute of the ``<img>`` element and the downloaded
        bytes. Entries without a ``src`` attribute are skipped.
    """
    selector = parsel.Selector(html_data)
    result_list = selector.xpath('//span[@class="img_block_big"]')
    for result in result_list:
        image_url = result.xpath('./a/picture/source/img/@src').extract_first()
        image_id = result.xpath('./a/picture/source/img/@id').extract_first()
        if image_url is None:
            # extract_first() yields None when the attribute is missing;
            # concatenating it below would raise TypeError.
            continue
        # The scheme is prepended by hand; presumably the page uses
        # protocol-relative URLs ("//host/path") - confirm against the site.
        all_title = 'https:' + image_url
        img_data = requests.get(url=all_title, headers=headers).content
        # yield (rather than return) keeps the loop going, so the caller
        # receives one tuple per image instead of only the first.
        yield all_title, image_id, img_data
def save(all_title, image_id, img_data):
    """Write image bytes to a file inside the ``image`` directory.

    Args:
        all_title: Image URL or file name. Only the last path component
            (without any query string) is used as the file name, because a
            full URL is not a valid file name.
        image_id: Identifier shown in the status message; also used as the
            file name when ``all_title`` has no usable last component.
        img_data: Raw bytes to write.

    Returns:
        None. Failures are reported with a printed message, not raised.
    """
    try:
        # Drop the query string, then keep what follows the last '/'.
        file_name = all_title.split('?', 1)[0].rstrip('/').rsplit('/', 1)[-1]
        if not file_name:
            file_name = str(image_id)
        os.makedirs('image', exist_ok=True)
        # os.path.join picks the right separator on every platform.
        with open(os.path.join('image', file_name), mode='wb') as f:
            f.write(img_data)
    except (OSError, TypeError, AttributeError):
        # Best-effort save: report the failure and carry on.
        print('保存失败')
    else:
        # Announce success only after the data has actually been written.
        print('保存成功:', image_id)
def sleep(time):
    """Pause execution for ``time`` seconds.

    Args:
        time: Number of seconds to wait.
    """
    # The parameter is named ``time`` and therefore hides the ``time`` module
    # inside this function; import the module under another name so that the
    # call below reaches the module rather than the number.
    import time as time_module
    time_module.sleep(time)
if __name__ == '__main__':
    # Download the listing page once, then print one line per image that
    # parsel_data() produces: its URL, its id and its raw bytes.
    html_data = get(url=base_url, headers=headers)
    for all_title, img_id, img_data in parsel_data(html_data):
        print(all_title, img_id, img_data)
|