Combining fault tolerance and multi-threaded crawling: prerequisite environment
pip install requests
pip install lxml
The code is as follows:
import requests
from lxml import etree
from threading import Thread
import os
import hashlib
import time
import random
def downloadFile(session, url, headers, dirPath, fileName):
    # Name the file after the MD5 of its URL, keeping the original extension
    if fileName is None:
        fileName = hashlib.md5(url.encode(encoding='UTF-8')).hexdigest() + url[url.rindex("."):]
    # Default to a "file" directory next to this script
    if dirPath is None:
        dirPath = os.path.join(os.path.dirname(__file__), "file")
    if not os.path.exists(dirPath):
        os.makedirs(dirPath)
    filePath = os.path.join(dirPath, fileName)
    # Skip files that are already on disk
    if os.path.exists(filePath):
        print("-" * 5 + filePath + " exists" + "-" * 5)
        return
    requests.packages.urllib3.disable_warnings()
    response = session.get(url, headers=headers, verify=False)
    with open(filePath, 'wb') as fp:
        fp.write(response.content)
    print("-" * 5 + filePath + " write finish" + "-" * 5)
def parseHtmlByUrl(session, url, headers, xpath):
    # Fetch the page through the shared session and evaluate the XPath expression
    requests.packages.urllib3.disable_warnings()
    response = session.get(url, headers=headers, verify=False)
    html = etree.HTML(response.content)
    return html.xpath(xpath)
if __name__=="__main__":
basicPath = os.path.join(os.path.dirname(__file__),'file')
session= requests.Session()
headers={
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'referer': 'https://m.mzitu.com/',
}
listNums = [i for i in range(1,7)]
for i in listNums[::-1]:
indexUrl = "https://www.mzitu.com/hot/page/{}/".format(i)
indexUrl = "".join(indexUrl)
#print(indexUrl)
try:
figures = parseHtmlByUrl(session,indexUrl,headers,"//li")
except:
continue
if figures is None or len(figures) == 0:
continue
for figure in figures:
try:
picBaicUrl = figure.xpath(".//a/@href")[0]
picBaicTitle = figure.xpath(".//span//a/text()")[0]
except:
continue
# print("picBaicTitle:"+picBaicTitle)
# print("picBaicUrl"+picBaicUrl)
while not picBaicUrl is None:
print("indexUrl:"+indexUrl)
print("picBaicUrl:" + picBaicUrl)
try:
PicNextUrl = parseHtmlByUrl(session,picBaicUrl,headers,"//div[@class='pagenavi']//a[last()]/@href")
except:
print("PicNextUrl error!")
continue
if len(PicNextUrl) == 0:
print("PicNextUrl len")
break
PicNextUrl = "".join(PicNextUrl[0])
if len(PicNextUrl[PicNextUrl.rindex("/"):]) > 3:
break
# print("PicNextUrl:" + PicNextUrl)
# print(etree.tostring(p, encoding="utf-8").decode("utf-8") )
try:
picDownloadsUrl = parseHtmlByUrl(session,picBaicUrl,headers,"//div[@class='main-image']//img/@src")[0]
except:
continue
print("picDownloadsUrl:"+picDownloadsUrl)
#
# print("indexUrl:"+indexUrl+"<-->PicNextUrl:"+PicNextUrl)
picDownloadsDir = os.path.join(basicPath,picBaicTitle)
headers={
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'referer': picBaicUrl,
}
#
try:
Thread(target=downloadFile(session,picDownloadsUrl,headers,picDownloadsDir,None))
except:
continue
time.sleep(random.randint(0,4))
#
picBaicUrl = PicNextUrl
print("-"*10)