爬取彼岸图网4K原图,Python交流,编程语言专区,鱼C论坛

千公子 发表于 2020-4-16 11:11:14

爬取彼岸图网4K原图

本帖最后由千公子于 2020-4-16 11:13 编辑

import json
import os
import random
import sys
import threading
from concurrent.futures.thread import ThreadPoolExecutor

import pymysql
from lxml import etree
from requests import *
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

class Spider:
    """Crawler that downloads 4K originals from pic.netbian.com.

    Workflow: obtain login cookies (through a headless-Chrome QQ login, or
    from a previously saved cookie file), store every picture's category,
    title and download URL in the MySQL table ``purl``, then download the
    pictures into one folder per category below ``self.ddir``.
    """

    # The site only allows 200 downloads per day.
    DAILY_LIMIT = 200
    # Seconds to wait on the server before a request is abandoned; without a
    # timeout a stalled connection would block a worker thread forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, cookieFile=None):
        """Load the login cookies and fetch the category list.

        Args:
            cookieFile: Path of a file with one ``name=value`` cookie per
                line. When ``None``, a browser login is performed first and
                its cookies are written to (and read back from)
                ``cookie.txt``.
        """
        if cookieFile is None:
            self.writeCookie()
            cookieFile = "cookie.txt"
        self.cookies = self.readCookies(cookieFile)

        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
        }
        self.indexUrl = "http://pic.netbian.com"
        self.catelogue = self.getCatalogue()
        # Number of pictures downloaded by this instance (see DAILY_LIMIT).
        self.downCount = 0
        # download() runs on pool threads, so the counter needs a lock.
        self._countLock = threading.Lock()
        self.ddir = 'D:\\Data\\照片\\彼岸图网\\'

    def writeCookie(self, account="输入您的QQ号", password="输入您的密码"):
        """Log in with QQ through headless Chrome and save the cookies.

        The cookies of the logged-in session are written to ``cookie.txt``
        as one ``name=value`` pair per line.

        Args:
            account: QQ number used for the login. The default is only a
                placeholder and must be replaced with a real account.
            password: Password of that QQ account (placeholder by default).
        """
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        # Created before the try block: if the driver fails to start there is
        # nothing to quit, and ``finally`` must not hit an unbound name.
        chrome = webdriver.Chrome(options=chrome_options)
        try:
            chrome.get("http://pic.netbian.com/e/memberconnect/?apptype=qq")
            ptlogin_iframe = WebDriverWait(chrome, 15).until(
                EC.presence_of_element_located((By.ID, "ptlogin_iframe")))
            chrome.switch_to.frame(ptlogin_iframe)
            # Switch the login form to account/password mode.
            chrome.find_element(By.ID, "switcher_plogin").click()
            # Fill in the QQ number.
            chrome.find_element(By.ID, "u").send_keys(account)
            # Fill in the QQ password.
            chrome.find_element(By.ID, "p").send_keys(password)
            # Confirm the login.
            chrome.find_element(By.ID, "login_button").click()
            WebDriverWait(chrome, 15).until(EC.url_to_be("http://pic.netbian.com/"))
            # Written as UTF-8 because readCookies() reads the file as UTF-8.
            with open("cookie.txt", "wt", encoding="utf-8") as f:
                for cookieMap in chrome.get_cookies():
                    f.write(cookieMap["name"] + "=" + cookieMap["value"] + "\n")
        finally:
            chrome.quit()

    def readCookies(self, cookieFile):
        """Parse a cookie file into a dict.

        Args:
            cookieFile: Path of a UTF-8 text file with one ``name=value``
                pair per line.

        Returns:
            dict mapping cookie names to values. Blank lines and lines
            without ``=`` are skipped.
        """
        cookies = {}
        with open(cookieFile, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                # Split on the first "=" only: cookie values may contain "=".
                name, separator, value = line.partition("=")
                if separator:
                    cookies[name] = value
        return cookies

    def reqGet(self, url):
        """Fetch ``url`` with the session cookies and return the GBK-decoded body."""
        response = get(url, headers=self.headers, cookies=self.cookies,
                       timeout=self.REQUEST_TIMEOUT)
        return response.content.decode("gbk")

    def getImg(self, url):
        """Fetch ``url`` with the session cookies and return the raw response."""
        return get(url, headers=self.headers, cookies=self.cookies,
                   timeout=self.REQUEST_TIMEOUT)

    def getCatalogue(self):
        """Read the category links from the index page.

        Returns:
            list of ``(title, href)`` tuples. A list rather than a bare
            ``zip`` iterator, so the catalogue can be iterated repeatedly.
        """
        index = self.reqGet(self.indexUrl)
        h = etree.HTML(index)
        href = h.xpath('//div[@class="classify clearfix"]/a/@href')
        title = h.xpath('//div[@class="classify clearfix"]/a/text()')
        return list(zip(title, href))

    def getRealUrl(self, href):
        """Resolve a picture detail page to its title and download URL.

        Example result:
        ('阿尔卑斯山风景4k高清壁纸3840x2160', 'http://pic.netbian.com/downpic.php?id=21953&classid=53')

        Args:
            href: Site-relative link of the picture's detail page.

        Returns:
            ``(title, url)`` tuple of strings.

        Raises:
            ValueError: If the page lacks the download id or the title.
        """
        dh = self.reqGet(self.indexUrl + href)
        h = etree.HTML(dh)
        dataId = h.xpath('//div[@class="downpic"]/a/@data-id')
        title = h.xpath('//div[@class="photo-hd"]/h1/text()')
        if not dataId or not title:
            raise ValueError("unexpected detail page layout: " + href)
        # xpath() returns lists; the first match is the value wanted here.
        url = "{0}/e/extend/downpic.php?id={1}&t={2}".format(
            self.indexUrl, dataId[0], random.random())
        msg = self.reqGet(url)
        return title[0], self.indexUrl + json.loads(msg)['pic']

    def getPicUrls(self, url=None, html=None):
        """Collect ``(title, url)`` for every picture on a listing page.

        Args:
            url: Address of the listing page; fetched when ``html`` is None.
            html: Already downloaded markup of the listing page.

        Returns:
            list of ``(title, url)`` tuples as produced by getRealUrl().
        """
        if html is None:
            html = self.reqGet(url)
        h = etree.HTML(html)
        hrefs = h.xpath('//ul[@class="clearfix"]/li/a/@href')
        return [self.getRealUrl(href) for href in hrefs]

    def getMaxPage(self, html):
        """Return the number of listing pages advertised by ``html``.

        The second-to-last pagination link is read as the last page number
        (presumably the final link is a "next page" control -- confirm
        against the site). A page with fewer than two pagination links is
        treated as a single-page category.
        """
        h = etree.HTML(html)
        pages = h.xpath('//div[@class="page"]/a/text()')
        if len(pages) < 2:
            return 1
        return int(pages[-2].strip())

    def saveToDB(self, category, v, i):
        """Store every picture of one listing page in the database.

        Args:
            category: Category title saved with each picture.
            v: Site-relative path of the category.
            i: 1-based page number inside the category.
        """
        if i == 1:
            # The first page has no numeric suffix.
            url = "%s%sindex.html" % (self.indexUrl, v)
        else:
            url = "%s%sindex_%d.html" % (self.indexUrl, v, i)
        for title, picUrl in self.getPicUrls(url=url):
            self.add(category, title, picUrl)

    def savePicInfoToDB(self):
        """Crawl every category page and store all picture URLs.

        A folder per category is created below ``self.ddir``; each listing
        page is then processed by saveToDB() on a thread pool. Failures of
        individual pages are reported on stderr instead of being lost.
        """
        futures = []
        with ThreadPoolExecutor(max_workers=64) as executor:
            for c, v in self.catelogue:
                html = self.reqGet(self.indexUrl + v)
                categoryDir = "%s%s" % (self.ddir, c)
                # makedirs also creates missing parents and tolerates reruns.
                os.makedirs(categoryDir, exist_ok=True)
                print(categoryDir)
                maxPage = self.getMaxPage(html)
                for i in range(1, maxPage + 1):
                    futures.append(executor.submit(self.saveToDB, c, v, i))
        # Leaving the ``with`` block waits for all tasks; exceptions raised
        # inside them only surface when the futures are inspected.
        for future in futures:
            error = future.exception()
            if error is not None:
                sys.stderr.write("page task failed: %r\n" % (error,))

    def getConn(self):
        """Open a new connection to the local ``photos`` MySQL database."""
        conn = pymysql.Connect(
            host="127.0.0.1",
            port=3306,
            charset='utf8',
            user='root',
            password='toor',
            db='photos'
        )
        return conn

    def add(self, category, filename, url):
        """Insert one picture record into ``purl``.

        Database errors are reported on stderr and never propagate, so one
        bad row does not abort a crawl.

        Args:
            category: Category title.
            filename: Picture title, later used as the file name.
            url: Download URL (primary key of the table).
        """
        conn = None
        try:
            conn = self.getConn()
            with conn.cursor() as cursor:
                # Parameterized: titles may contain quotes, and string-built
                # SQL would be open to injection.
                cursor.execute("INSERT INTO purl VALUES (%s, %s, %s)",
                               (category, filename, url))
            conn.commit()
            print(filename + " was added to database successfully")
        except pymysql.IntegrityError:
            # Duplicate primary key: the URL is already stored.
            sys.stderr.write(filename + " was existed!\n")
        except pymysql.MySQLError as error:
            sys.stderr.write("%s was not added: %s\n" % (filename, error))
        finally:
            if conn is not None:
                conn.close()

    def downPic(self):
        """Download every picture recorded in ``purl`` until the daily limit."""
        conn = self.getConn()
        try:
            with conn.cursor() as cursor:
                cursor.execute("select * from purl")
                result = cursor.fetchall()
        finally:
            conn.close()

        with ThreadPoolExecutor(max_workers=32) as executor:
            for row in result:
                if self.downCount >= self.DAILY_LIMIT:
                    print("finished today, welcome come back tomorrow!")
                    break
                # Each task gets one (category, name, url) row.
                executor.submit(self.download, row)

    def download(self, cnu):
        """Download one picture unless it already exists on disk.

        Args:
            cnu: ``(category, name, url)`` row from the ``purl`` table.
        """
        category, name, url = cnu[0], cnu[1], cnu[2]
        # The folder is spelled exactly as savePicInfoToDB() creates it.
        path = os.path.join(self.ddir + category, name + ".jpg")
        if os.path.exists(path) and os.path.getsize(path) > 10000:
            return
        # Tasks are queued long before earlier downloads finish, so the
        # limit has to be re-checked here. Downloads already in flight may
        # still complete, overshooting by at most the worker count.
        if self.downCount >= self.DAILY_LIMIT:
            return
        print("download... " + path)
        rimg = self.getImg(url)
        if rimg.status_code != 200 or len(rimg.content) <= 1024:
            print("invalid img!")
            return
        with open(path, "wb") as f:
            f.write(rimg.content)
        with self._countLock:
            self.downCount += 1
            count = self.downCount
        print(str(count) + ": finished!!! " + path)

    def start(self, hasUrlData=None):
        """Run the crawler.

        Args:
            hasUrlData: Leave as ``None`` to crawl the picture URLs into the
                database first; pass any other value when the database is
                already populated and only the download step is needed.
        """
        if hasUrlData is None:
            self.savePicInfoToDB()
        self.downPic()

if __name__ == '__main__':
    # Script entry point: build the crawler and start downloading.
    spider = Spider()  # use this when there is no saved cookie file yet
    # spider = Spider(cookieFile="D:")  # use this when a cookie file already exists
    # hasUrlData=True: the database is already populated, so download directly;
    # leave the argument out when the database has not been filled yet.
    spider.start(hasUrlData=True)

-- Schema script for the crawler's `purl` table.
-- Use utf8mb4 as the character set of this client session.
SET NAMES utf8mb4;
-- Foreign-key checks are switched off while the table is dropped and recreated.
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for purl
-- One row per picture: category, name and url, in that column order.
-- NOTE: the DROP below discards an existing purl table together with its rows.
-- ----------------------------
DROP TABLE IF EXISTS `purl`;
CREATE TABLE `purl`(
`category` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
`name` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
`url` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
-- url is the primary key, so inserting the same url twice is rejected.
PRIMARY KEY (`url`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;

-- Restore foreign-key checks.
SET FOREIGN_KEY_CHECKS = 1;

static/image/hrline/line7.png
记得把代码中的 QQ 号和 QQ 密码，以及 MySQL 的账号和密码，替换成自己的。

https://blog.csdn.net/qq_38203808/article/details/105483673
CSDN 上介绍得更详细些，哪儿没看懂请回复。我也是初学者，大家一起学习，一起进步！

六小鸭 发表于 2020-4-16 11:13:12

我去

隔壁繁星吖 发表于 2020-4-16 11:13:31

高产啊
学的很不错{:10_275:}

六小鸭 发表于 2020-4-16 11:13:42

厉害

隔壁繁星吖 发表于 2020-4-16 11:15:37

已收录入【作品分享与欣赏】！

liuzhengyuan 发表于 2020-4-16 11:30:58

那个口算和这个都不错！！！

编程鱼C 发表于 2020-4-16 12:35:40

不错

Charon彡 发表于 2020-4-16 19:04:04

不错哦 nice

永恒的蓝色梦想 发表于 2020-4-16 19:07:52

不错{:10_256:}

永恒的蓝色梦想 发表于 2020-4-16 19:08:24

不过鱼币还是要拿的{:10_256:}

永恒的蓝色梦想 发表于 2020-4-16 19:09:16

老非酋了{:10_266:}

王富帅 发表于 2020-4-16 19:18:47

好

wuhao4221961 发表于 2020-4-16 19:34:54

慢慢来最快{:10_256:}

乔珂珂 发表于 2020-4-16 19:55:12

这么长。。。。。。。。。

winsome8538 发表于 2020-4-16 19:55:36

我去牛批代码&鱼币都牛批

依可儿 发表于 2020-4-16 19:59:07

头晕眼花。。。。。

Mike_python小 发表于 2020-4-16 19:59:28

看看

珂乔乔 发表于 2020-4-16 20:01:39

真心看不懂，太长了都没有看到底就头晕了。。。。厉害啊大佬们

atlength 发表于 2020-4-16 21:39:57

太长了都没有看到底就头晕了

枫枫duck 发表于 2020-4-16 21:46:21

学习

页: [1] 2 3

鱼C论坛's Archiver

爬取彼岸图网4K原图