鱼C论坛

 找回密码
 立即注册

https://www.mzitu.com Python爬取美女图片

已有 1793 次阅读2019-8-16 14:46 |个人分类:Python

结合容错和多线程爬取
前提环境
pip install requests
pip install lxml

代码如下:
import requests
from lxml import etree
from threading import Thread
import os
import hashlib
import time
import random

def downloadFile(session, url, headers, dirPath, fileName):
    """Download *url* into dirPath/fileName, skipping files that already exist.

    session  -- requests.Session to use for the GET (connection reuse).
    url      -- direct URL of the file; its extension is reused when the
                file name has to be derived.
    headers  -- HTTP headers (user-agent / referer) sent with the request.
    dirPath  -- target directory; defaults to <this file's dir>/file/ and
                is created when missing.
    fileName -- target file name; when None it is derived as the MD5 hex
                digest of the URL plus the URL's extension.

    Returns None in every case; prints a one-line status message.
    """
    if fileName is None:
        # Stable, collision-resistant file name derived from the URL itself.
        fileName = hashlib.md5(url.encode(encoding='UTF-8')).hexdigest() + url[url.rindex("."):]

    if dirPath is None:
        dirPath = os.path.dirname(__file__) + "/file/"
    # BUG FIX: the original repeated this exists/makedirs pair twice verbatim.
    if not os.path.exists(dirPath):
        os.makedirs(dirPath)

    filePath = os.path.join(dirPath, fileName)

    # Already downloaded on a previous run -- skip the network round trip.
    if os.path.exists(filePath):
        print("-" * 5 + filePath + " exists" + "-" * 5)
        return

    # verify=False because the target site's certificate chain is unreliable;
    # silence the resulting InsecureRequestWarning spam.
    requests.packages.urllib3.disable_warnings()

    # BUG FIX: the original ignored the session it was handed and called
    # requests.get directly; use the session for pooled connections/cookies.
    response = session.get(url, headers=headers, verify=False)

    with open(filePath, 'wb') as fp:
        fp.write(response.content)

    print("-" * 5 + filePath + " write finish" + "-" * 5)



def parseHtmlByUrl(session, url, headers, xpath):
    """Fetch *url* with *session* and return the nodes matching *xpath*.

    Returns the (possibly empty) list produced by lxml's xpath(); any
    network or parse exception propagates to the caller, which treats it
    as "skip this page".
    """
    # verify=False because of the site's broken cert chain; mute the warning.
    requests.packages.urllib3.disable_warnings()

    # BUG FIX: use the session passed in (the original called requests.get,
    # discarding the session's cookies and pooled connections).
    response = session.get(url, headers=headers, verify=False)

    html = etree.HTML(response.content)
    return html.xpath(xpath)







if __name__ == "__main__":
    basicPath = os.path.join(os.path.dirname(__file__), 'file')
    session = requests.Session()
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'referer': 'https://m.mzitu.com/',
    }

    # Walk the "hot" listing pages from 6 down to 1.
    for i in range(6, 0, -1):
        indexUrl = "https://www.mzitu.com/hot/page/{}/".format(i)
        try:
            figures = parseHtmlByUrl(session, indexUrl, headers, "//li")
        except Exception:
            continue

        if not figures:
            continue
        for figure in figures:
            try:
                picBaicUrl = figure.xpath(".//a/@href")[0]
                picBaicTitle = figure.xpath(".//span//a/text()")[0]
            except Exception:
                # <li> without the expected album link/title -- skip it.
                continue

            # Follow the album's "next" links until the pagination runs out.
            while picBaicUrl is not None:
                print("indexUrl:" + indexUrl)
                print("picBaicUrl:" + picBaicUrl)
                try:
                    PicNextUrl = parseHtmlByUrl(session, picBaicUrl, headers,
                                                "//div[@class='pagenavi']//a[last()]/@href")
                except Exception:
                    # BUG FIX: the original did `continue` here, retrying the
                    # same URL forever on a persistent error.
                    print("PicNextUrl error!")
                    break

                if len(PicNextUrl) == 0:
                    print("PicNextUrl len")
                    break

                PicNextUrl = "".join(PicNextUrl[0])

                # A last path segment longer than 3 chars means the "next"
                # link left this album's numeric pagination -- stop here.
                if len(PicNextUrl[PicNextUrl.rindex("/"):]) > 3:
                    break

                try:
                    picDownloadsUrl = parseHtmlByUrl(session, picBaicUrl, headers,
                                                     "//div[@class='main-image']//img/@src")[0]
                except Exception:
                    # BUG FIX: advance before continuing (the original
                    # `continue` retried the same page in an endless loop).
                    picBaicUrl = PicNextUrl
                    continue

                print("picDownloadsUrl:" + picDownloadsUrl)
                picDownloadsDir = os.path.join(basicPath, picBaicTitle)
                # The image host checks the referer; point it at the album page.
                headers = {
                    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
                    'referer': picBaicUrl,
                }

                # BUG FIX: the original wrote Thread(target=downloadFile(...)),
                # which CALLS downloadFile synchronously and never starts a
                # thread. Pass the callable plus args and start the thread.
                Thread(target=downloadFile,
                       args=(session, picDownloadsUrl, headers, picDownloadsDir, None)).start()

                # Random pause to stay under the site's rate limiting.
                time.sleep(random.randint(0, 4))

                picBaicUrl = PicNextUrl

        print("-" * 10)










路过

鸡蛋

鲜花

握手

雷人

全部作者的其他最新日志

评论 (0 个评论)

facelist

您需要登录后才可以评论 登录 | 立即注册

小黑屋|手机版|Archiver|鱼C工作室 ( 粤ICP备18085999号-1 | 粤公网安备 44051102000585号)

GMT+8, 2024-3-29 16:00

Powered by Discuz! X3.4

© 2001-2023 Discuz! Team.

返回顶部