|

楼主 |
发表于 2020-5-20 14:13:31
|
显示全部楼层
下面是代码和图片,因为爬的是一个不太正经的网站(没有反爬),所以把网址隐藏了,不知道会不会影响你研究
图片:
- import requests
- from fake_useragent import UserAgent
- import os
- import threading
- from lxml import etree
- from queue import Queue
- import time
- import re
class Producer(threading.Thread):
    """Scraper thread: takes listing-page URLs from page_queue, visits each
    linked detail page, and puts every image URL it finds onto img_queue."""

    headers = {
        "Referer": "https://www.隐藏.com",
        "User-Agent": UserAgent().random
    }

    # Memoized site root shared by all Producer instances. The original code
    # called get_zhuye() once per listing page, i.e. one redundant network
    # round-trip per page, even though the value never changes.
    _zhu_url = None

    def __init__(self, page_queue, img_queue):
        threading.Thread.__init__(self)
        self.page_queue = page_queue  # Queue[str] of listing-page URLs
        self.img_queue = img_queue    # Queue[str] of image URLs (output)

    @classmethod
    def _site_root(cls):
        # Resolve the redirected site root once and cache it.
        if cls._zhu_url is None:
            cls._zhu_url = get_zhuye()
        return cls._zhu_url

    def get_url(self, page_url):
        """Scrape one listing page: follow every title link on it and queue
        all image URLs found on the detail pages."""
        re1 = requests.get(page_url, headers=self.headers)
        e1 = etree.HTML(re1.text)
        # Relative hrefs of each title on the listing page.
        title_urls_half = e1.xpath("//div[@class='box list channel']/ul/li/a/@href")
        zhu_url = self._site_root()
        for half in title_urls_half:
            title_url = zhu_url + half
            re2 = requests.get(title_url, headers=self.headers)
            e2 = etree.HTML(re2.text)
            img_urls = e2.xpath("//div[@class='content']/p/img/@src")
            for img_url in img_urls:
                # img_url is the direct address of one image.
                self.img_queue.put(img_url)

    def run(self):
        # Drain the page queue. NOTE(review): empty()-then-get() is racy
        # across threads; harmless here only because the queue is fully
        # populated before any Producer starts.
        while not self.page_queue.empty():
            page_url = self.page_queue.get()
            self.get_url(page_url)
class Consumer(threading.Thread):
    """Downloader thread: takes image URLs from img_queue and writes each
    image to a file (named after the last URL path segment) in the current
    working directory."""

    headers = {
        "Referer": "https://www.隐藏.com",
        "User-Agent": UserAgent().random
    }

    def __init__(self, page_queue, img_queue):
        threading.Thread.__init__(self)
        self.page_queue = page_queue  # kept for interface symmetry with Producer
        self.img_queue = img_queue    # Queue[str] of image URLs (input)

    def run(self):
        while True:
            # Crude back-off so producers get a chance to (re)fill the queue
            # before we decide it is drained for good.
            time.sleep(2)
            if self.img_queue.empty():
                break
            img_url = self.img_queue.get()
            filename = img_url.split("/")[-1]
            try:
                response = requests.get(img_url, headers=self.headers, timeout=5)
                # Don't save an HTML error page as an image file.
                response.raise_for_status()
                print(img_url)
                with open(filename, "wb") as f:
                    f.write(response.content)
            except requests.RequestException:
                # Narrowed from a bare `except:` which swallowed everything
                # (even KeyboardInterrupt) and mislabeled it all as a timeout.
                print("超时")
def get_zhuye():
    """Fetch the site's entry page and return the real (redirected) site root.

    The entry page redirects via a JS snippet of the form
    ``window.location.href="..."``; the last character of the captured URL
    (presumably a trailing "/") is stripped so callers can append paths.

    Returns:
        str: the site root URL without its trailing character.

    Raises:
        ValueError: if no redirect URL can be found on the entry page.
    """
    headers = {
        "Referer": "https://www.隐藏.com",
        "User-Agent": UserAgent().random
    }
    base_url = "https://www.隐藏.com"
    response = requests.get(base_url, headers=headers)
    match = re.search(r'window\.location\.href="(.+?)"', response.text)
    if match is None:
        # The original did findall(...)[0], which raises a bare IndexError;
        # fail with a message that says what actually went wrong.
        raise ValueError("redirect URL not found on entry page")
    return match.group(1)[:-1]
def main():
    """Read a page range from stdin, enqueue the listing-page URLs, then
    launch 5 producer threads (scrapers) and 5 consumer threads
    (downloaders)."""
    zhu_url = get_zhuye()
    page_queue = Queue(50)
    img_queue = Queue(1000)
    start_page = int(input("输入下载开始的页码(≥2):"))
    end_page = int(input("输入下载结束的页码(≥2):"))
    for page in range(start_page, end_page + 1):
        page_queue.put(f"{zhu_url}/pic/2/index_{page}.html")
    for _ in range(5):
        Producer(page_queue, img_queue).start()
    for _ in range(5):
        Consumer(page_queue, img_queue).start()
if __name__ == '__main__':
    # Save everything into an "images" folder next to this script; chdir so
    # the Consumer threads can write files by bare filename. abspath guards
    # against dirname(__file__) being "" when run from the script's own dir.
    images_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "images")
    # makedirs(exist_ok=True) replaces the check-then-mkdir pair, which was
    # racy and would crash if the directory appeared between the two calls.
    os.makedirs(images_path, exist_ok=True)
    os.chdir(images_path)
    main()
复制代码 |
|