|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
- #导包
- import requests
- import time
- import os
- import threading
- import parsel
# Make sure the download directory exists. exist_ok=True avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs('image', exist_ok=True)

# Listing page that the script downloads and parses.
base_url = 'https://anime-pictures.net/pictures/view_posts/0?lang=en'

# Browser-like request headers used for the HTTP requests below.
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
def get(url, headers, cookie=None):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to request.
        headers: HTTP request headers to send (e.g. the User-Agent).
        cookie: Optional cookies for the request, as a dict or CookieJar.
            Defaults to ``None`` so callers may leave it out.

    Returns:
        The response body as ``str``.
    """
    # Pass headers by keyword: the second positional parameter of
    # requests.get is ``params``, which would turn the headers into
    # query-string arguments instead of sending them as headers.
    response = requests.get(url, headers=headers, cookies=cookie)
    return response.text
def parsel_data(html_data):
    """Find the first image on a listing page and download it.

    Only the first usable entry is returned; later entries on the page are
    not visited. The image request uses the module-level ``headers``.

    Args:
        html_data: HTML text of the listing page.

    Returns:
        A ``(image_url, image_id, image_bytes)`` tuple for the first
        ``img_block_big`` entry that has a ``src`` attribute, or ``None``
        when the page contains no such entry.
    """
    selector = parsel.Selector(html_data)
    for block in selector.xpath('//span[@class="img_block_big"]'):
        image_src = block.xpath('./a/picture/source/img/@src').extract_first()
        if image_src is None:
            # extract_first() yields None when the attribute is absent;
            # concatenating it below would raise TypeError.
            continue
        image_id = block.xpath('./a/picture/source/img/@id').extract_first()
        # Build the URL by hand; presumably src is scheme-less
        # ("//host/path") -- TODO confirm against the live page.
        img_url = 'https:' + image_src
        img_data = requests.get(url=img_url, headers=headers).content
        return img_url, image_id, img_data
    return None
def save(all_title, image_id, img_data):
    """Write one downloaded image into the local ``image`` directory.

    Args:
        all_title: URL the image was downloaded from. Only its last path
            segment (without any query string) is used as the file name.
        image_id: Identifier of the image; printed in the success message
            and used as the file name when the URL has no usable segment.
        img_data: Raw image bytes to write.

    Returns:
        None. Success or failure is reported on stdout.
    """
    # A full URL contains ':' and '/', so it cannot be used as a file name
    # directly; keep just the final path component.
    file_name = all_title.split('?', 1)[0].rstrip('/').rsplit('/', 1)[-1]
    if not file_name:
        file_name = str(image_id)
    # os.path.join picks the right separator instead of a hard-coded '\\'.
    target = os.path.join('image', file_name)
    try:
        os.makedirs('image', exist_ok=True)
        with open(target, mode='wb') as f:
            f.write(img_data)
    except (OSError, TypeError) as error:
        # Stay best-effort (one bad image must not stop the run), but say
        # why it failed instead of swallowing every exception.
        print('保存失败', error)
    else:
        # Report success only after the bytes were actually written.
        print('保存成功:', image_id)
def sleep(time):
    """Pause execution for the given number of seconds.

    Args:
        time: Duration of the pause in seconds.
    """
    # The parameter is named ``time`` and hides the module of the same name
    # inside this function, so import the module under a different local
    # name; otherwise ``time.sleep`` is looked up on the number itself.
    import time as time_module
    time_module.sleep(time)
if __name__ == '__main__':
    # base_url never changes and parsel_data returns a single image, so
    # repeating the request in a loop only re-downloads the same picture;
    # fetch and parse the page once.
    # cookie is passed explicitly because get() declares it as a parameter.
    html_data = get(url=base_url, headers=headers, cookie=None)
    image_data = parsel_data(html_data=html_data)
    if image_data is None:
        # Nothing matched on the page; indexing None would raise TypeError.
        print('未找到图片')
    else:
        # (image URL, image id, raw image bytes)
        all_title, img_id, img_data = image_data
        print(all_title, img_id, img_data)
复制代码
如果调试的话,会发现它一直在重复同一张图片的数据。
本帖最后由 yjsx86 于 2020-7-21 10:50 编辑
把 parsel_data 函数改成生成器就行:原来的 return 在 for 循环的第一次迭代就结束了函数,所以每次调用只能拿到第一张图片;把 return 换成 yield 后,就能逐个取出页面上的所有图片。
- # 导包
- import requests
- import time
- import os
- import threading
- import parsel
# Make sure the download directory exists. exist_ok=True avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs('image', exist_ok=True)

# Listing page that the script downloads and parses.
base_url = 'https://anime-pictures.net/pictures/view_posts/0?lang=en'

# Browser-like request headers used for the HTTP requests below.
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
def get(url, headers):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to request.
        headers: HTTP request headers to send (e.g. the User-Agent).

    Returns:
        The response body as ``str``.
    """
    # Pass headers by keyword: the second positional parameter of
    # requests.get is ``params``, which would turn the headers into
    # query-string arguments instead of sending them as headers.
    response = requests.get(url, headers=headers)
    return response.text
def parsel_data(html_data):
    """Yield every image found on a listing page, downloading each one.

    This is a generator: each image is requested only when the caller
    advances the iteration. Entries without a ``src`` attribute are
    skipped. The image requests use the module-level ``headers``.

    Args:
        html_data: HTML text of the listing page.

    Yields:
        ``(image_url, image_id, image_bytes)`` tuples, one per
        ``img_block_big`` entry that has a ``src`` attribute.
    """
    selector = parsel.Selector(html_data)
    for block in selector.xpath('//span[@class="img_block_big"]'):
        image_src = block.xpath('./a/picture/source/img/@src').extract_first()
        if image_src is None:
            # extract_first() yields None when the attribute is absent;
            # concatenating it below would raise TypeError.
            continue
        image_id = block.xpath('./a/picture/source/img/@id').extract_first()
        # Build the URL by hand; presumably src is scheme-less
        # ("//host/path") -- TODO confirm against the live page.
        img_url = 'https:' + image_src
        img_data = requests.get(url=img_url, headers=headers).content
        # yield (not return) so the loop continues with the next image.
        yield img_url, image_id, img_data
def save(all_title, image_id, img_data):
    """Write one downloaded image into the local ``image`` directory.

    Args:
        all_title: URL the image was downloaded from. Only its last path
            segment (without any query string) is used as the file name.
        image_id: Identifier of the image; printed in the success message
            and used as the file name when the URL has no usable segment.
        img_data: Raw image bytes to write.

    Returns:
        None. Success or failure is reported on stdout.
    """
    # A full URL contains ':' and '/', so it cannot be used as a file name
    # directly; keep just the final path component.
    file_name = all_title.split('?', 1)[0].rstrip('/').rsplit('/', 1)[-1]
    if not file_name:
        file_name = str(image_id)
    # os.path.join picks the right separator instead of a hard-coded '\\'.
    target = os.path.join('image', file_name)
    try:
        os.makedirs('image', exist_ok=True)
        with open(target, mode='wb') as f:
            f.write(img_data)
    except (OSError, TypeError) as error:
        # Stay best-effort (one bad image must not stop the run), but say
        # why it failed instead of swallowing every exception.
        print('保存失败', error)
    else:
        # Report success only after the bytes were actually written.
        print('保存成功:', image_id)
def sleep(time):
    """Pause execution for the given number of seconds.

    Args:
        time: Duration of the pause in seconds.
    """
    # The parameter is named ``time`` and hides the module of the same name
    # inside this function, so import the module under a different local
    # name; otherwise ``time.sleep`` is looked up on the number itself.
    import time as time_module
    time_module.sleep(time)
if __name__ == '__main__':
    # Download the listing page once, then walk through every image the
    # parser produces from it.
    page_html = get(url=base_url, headers=headers)
    for picture_url, picture_id, picture_bytes in parsel_data(page_html):
        # picture_url: image address, picture_id: id attribute,
        # picture_bytes: raw image content
        print(picture_url, picture_id, picture_bytes)
复制代码
|
|