|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 千公子 于 2020-4-16 11:13 编辑
- import json
- import os
- import random
- import sys
- from concurrent.futures.thread import ThreadPoolExecutor
- import pymysql
- from lxml import etree
- from requests import *
- from selenium import webdriver
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.chrome.options import Options
- class Spider:
- def __init__(self, cookieFile=None):
- if cookieFile is None:
- self.writeCookie()
- self.cookies = self.readCookies("cookie.txt")
- else:
- self.cookies = self.readCookies(cookieFile)
-
- self.headers = {
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
- }
- self.indexUrl = "http://pic.netbian.com"
- self.catelogue = self.getCatalogue()
- # 每天限量只能下载200张
- self.downCount = 0
- self.ddir = 'D:\\Data\\照片\\彼岸图网\\'
- def writeCookie(self):
- try:
- chrome_options = Options()
- chrome_options.add_argument('--headless')
- chrome = webdriver.Chrome(options=chrome_options)
- chrome.get("http://pic.netbian.com/e/memberconnect/?apptype=qq")
- ptlogin_iframe = WebDriverWait(chrome, 15).until(EC.presence_of_element_located((By.ID,"ptlogin_iframe")))
- chrome.switch_to.frame(ptlogin_iframe)
- chrome.find_element_by_id("switcher_plogin").click()
- # 设置qq号
- chrome.find_element_by_id("u").send_keys("输入您的QQ号")
- # 设置qq密码
- chrome.find_element_by_id("p").send_keys("输入您的密码")
- # 确定登录
- chrome.find_element_by_id("login_button").click()
- WebDriverWait(chrome, 15).until(EC.url_to_be("http://pic.netbian.com/"))
- with open("cookie.txt", "wt") as f:
- for cookieMap in chrome.get_cookies():
- k = cookieMap["name"]
- v = cookieMap["value"]
- f.write(k+"="+v+"\n")
- finally:
- chrome.quit()
- def readCookies(self, cookieFile):
- cookies = {}
- with open(cookieFile, "r", encoding="utf-8") as f:
- while True:
- c = f.readline()
- if c is not None:
- c = c.strip()
- if len(c) == 0:
- break
- else:
- c = c.split("=")
- cookies[c[0]] = c[1]
- else:
- break
- return cookies
- def reqGet(self, url):
- html = get(url, headers=self.headers, cookies=self.cookies).content.decode("gbk")
- return html
- def getImg(self, url):
- return get(url, headers=self.headers, cookies=self.cookies)
- def getCatalogue(self):
- index = self.reqGet(self.indexUrl)
- h = etree.HTML(index)
- href = h.xpath('//div[@class="classify clearfix"]/a/@href')
- title = h.xpath('//div[@class="classify clearfix"]/a/text()')
- return zip(title, href)
- def getRealUrl(self, href):
- """
- ('阿尔卑斯山风景4k高清壁纸3840x2160', 'http://pic.netbian.com/downpic.php?id=21953&classid=53')
- """
- dh = self.reqGet(self.indexUrl + href)
- h = etree.HTML(dh)
- dataId = h.xpath('//div[@class="downpic"]/a/@data-id')[0]
- title = h.xpath('//div[@class="photo-hd"]/h1/text()')[0]
- url = "{0}/e/extend/downpic.php?id={1}&t={2}".format(self.indexUrl, dataId, random.random())
- msg = self.reqGet(url)
- return title, self.indexUrl + json.loads(msg)['pic']
- def getPicUrls(self, url=None, html=None):
- if html is None:
- html = self.reqGet(url)
- h = etree.HTML(html)
- hrefs = h.xpath('//ul[@class="clearfix"]/li/a/@href')
- realHrefs = []
- for href in hrefs:
- realHrefs.append(self.getRealUrl(href))
- return realHrefs
- def getMaxPage(self, html):
- h = etree.HTML(html)
- pages = h.xpath('//div[@class="page"]/a/text()')
- return int(pages[-2].strip())
- def saveToDB(self, category, v, i):
- url = "%s%sindex_%d.html" % (self.indexUrl, v, i)
- if i == 1:
- url = "%s%sindex.html" % (self.indexUrl, v)
- nus = self.getPicUrls(url=url)
- for nu in nus:
- self.add(category, nu[0], nu[1])
- def savePicInfoToDB(self):
- executor = ThreadPoolExecutor(max_workers=64)
- for c, v in self.catelogue:
- html = self.reqGet(self.indexUrl + v)
- if not os.path.exists("%s%s" % (self.ddir, c)):
- os.mkdir("%s%s" % (self.ddir, c))
- print("%s%s" % (self.ddir, c))
- maxPage = self.getMaxPage(html)
- for i in range(1, maxPage + 1):
- executor.submit(self.saveToDB, c, v, i)
- executor.shutdown(wait=True)
- def getConn(self):
- conn = pymysql.Connect(
- host="127.0.0.1",
- port=3306,
- charset='utf8',
- user='root',
- password='toor',
- db='photos'
- )
- return conn
- def add(self, category, filename, url):
- try:
- conn = self.getConn()
- cursor = conn.cursor()
- sql = "INSERT INTO purl VALUES ('{0}', '{1}', '{2}')".format(category, filename, url)
- cursor.execute(sql)
- conn.commit()
- print(filename + " was added to database successfully")
- except:
- sys.stderr.write(filename + " was existed!\n")
- finally:
- cursor.close()
- conn.close()
- def downPic(self):
- executor = ThreadPoolExecutor(max_workers=32)
- sql = "select * from purl"
- conn = self.getConn()
- cursor = conn.cursor()
- cursor.execute(sql)
- result = cursor.fetchall()
- for index in range(0, len(result)):
- if self.downCount > 200:
- print("finished today, welcome come back tomorrow!")
- break
- executor.submit(self.download, result[index])
- executor.shutdown(wait=True)
- cursor.close()
- conn.close()
- def download(self, cnu):
- path = "{0}{1}\{2}.jpg".format(self.ddir, cnu[0], cnu[1])
- if os.path.exists(path) and os.path.getsize(path) > 10000:
- return
- print("download... " + path)
- rimg = self.getImg(cnu[2])
- if (rimg.status_code != 200 or len(rimg.content) <= 1024):
- print("invalid img!")
- return
- with open(path, "wb") as f:
- f.write(rimg.content)
- self.downCount += 1
- print(str(self.downCount) + ": finished!!! " + path)
- def start(self, hasUrlData=None):
- if hasUrlData is None:
- self.savePicInfoToDB()
- self.downPic()
- if __name__ == '__main__':
- spider = Spider() # 如果没有cookie
- # spider = Spider(cookieFile="D:") # 如果有cookie
- spider.start(hasUrlData=True) # 已有数据库文件,直接下载,没有数据库文件则不填
复制代码
- SET NAMES utf8mb4;
- SET FOREIGN_KEY_CHECKS = 0;
- -- ----------------------------
- -- Table structure for purl
- -- ----------------------------
- DROP TABLE IF EXISTS `purl`;
- CREATE TABLE `purl` (
- `category` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
- `name` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
- `url` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
- PRIMARY KEY (`url`) USING BTREE
- ) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
- SET FOREIGN_KEY_CHECKS = 1;
复制代码
记得替换自己的qq号和qq密码,以及sql账号密码
https://blog.csdn.net/qq_38203808/article/details/105483673
csdn上介绍的详细些,哪儿没看懂请回复,我也是初学者,大家一起学习,一起进步!
|
-
|