|
|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import re
import urllib.request
import os
# Running counter used to build download file names (1.jpg, 2.png, ...);
# storePic reads and increments it through ``global Num``.
Num=1
# Change the working directory so that the relative file names passed to
# urlretrieve are written into this folder.
os.chdir('C:\\Users\\zhangje\\Desktop\\Newpic')
# Page URL that the script passes to storePic at the bottom of the file.
url='http://tu.duowan.com/tu'
def gethtml(url):  # fetch a page and return its text
    """Download *url* and return the response body decoded as UTF-8.

    The request carries a browser-like ``User-Agent`` header.  The header
    has to be attached to a ``Request`` object *before* the URL is opened;
    assigning an attribute on the response afterwards sends nothing.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The page content as a ``str``.

    Raises:
        ValueError: If *url* is not a valid absolute URL, or the body is not
            valid UTF-8 (``UnicodeDecodeError`` is a ``ValueError``).
        urllib.error.URLError: If the page cannot be retrieved.
    """
    request = urllib.request.Request(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
        },
    )
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
def gethref(url):  # follow the links contained in a page
    """Download images from *url* and from the pages two link hops below it.

    ``storePic`` is first called on *url* itself.  Every distinct ``.html``
    link found on *url* is then fetched, and ``storePic`` is called once for
    every distinct ``.html`` link found on those pages.  The first-level
    pages are only scanned for links; ``storePic`` is not called on them
    directly.

    Relative links are resolved against the page they appear on, so they no
    longer reach ``gethtml`` as non-absolute URLs.  A linked page that cannot
    be fetched is reported and skipped instead of aborting the whole crawl.

    Args:
        url: Absolute URL of the start page.

    Raises:
        OSError, ValueError: Propagated from ``gethtml`` when the start page
            itself cannot be fetched or decoded.
    """
    # Local import: only the link resolution in this function needs it.
    from urllib.parse import urljoin

    link_pattern = re.compile(r'<a href="([^"]+\.html)')

    storePic(url)

    # Sets give O(1) duplicate checks; the two levels are tracked separately.
    first_level_seen = set()
    second_level_seen = set()

    for raw_link in link_pattern.findall(gethtml(url)):
        page = urljoin(url, raw_link)
        if page in first_level_seen:
            continue
        first_level_seen.add(page)

        try:
            page_html = gethtml(page)
        except (OSError, ValueError) as exc:
            # URLError/HTTPError are OSError subclasses; decode errors and
            # malformed URLs are ValueError.  One bad link must not stop
            # the remaining pages from being processed.
            print('skipping page %s: %s' % (page, exc))
            continue

        for raw_target in link_pattern.findall(page_html):
            target = urljoin(page, raw_target)
            if target in second_level_seen:
                continue
            second_level_seen.add(target)
            storePic(target)
def storePic(url):  # download the images referenced by a page
    """Download the jpg/png/jpeg/gif images referenced by ``img src`` on *url*.

    The page is fetched once.  Images are then processed one extension at a
    time (jpg, png, jpeg, gif), in page order within each extension.  Each
    file is written to the current working directory and named with the
    module-level counter ``Num`` followed by the extension; ``Num`` advances
    only after a successful download, so the numbering has no gaps.

    The ``src`` patterns stop at the attribute's closing quote, so a match
    can no longer run across several tags.  Relative image paths are
    resolved against *url*.  An image that fails to download is reported
    and skipped; the remaining images are still attempted.

    Args:
        url: Absolute URL of the page to scan.
    """
    # Local import: only the src resolution in this function needs it.
    from urllib.parse import urljoin

    global Num

    try:
        html = gethtml(url)
    except (ValueError, TypeError) as exc:
        # Best effort, as before: a page with an unusable URL or an
        # undecodable body is skipped, but now says so instead of silently
        # passing.  A tuple is required here; ``ValueError or TypeError``
        # evaluates to ``ValueError`` alone.
        print('skipping page %s: %s' % (url, exc))
        return

    for extension in ('.jpg', '.png', '.jpeg', '.gif'):
        # [^"] keeps the match inside a single src="..." attribute.
        pattern = r'img src="([^"]+?' + re.escape(extension) + ')'
        for src in re.findall(pattern, html):
            image_url = urljoin(url, src)
            filename = str(Num) + extension
            print(image_url)
            try:
                urllib.request.urlretrieve(image_url, filename)
            except (ValueError, TypeError, OSError) as exc:
                # OSError covers URLError/HTTPError and local write errors.
                print('skipping image %s: %s' % (image_url, exc))
                continue
            Num += 1
# Script entry point: download the images referenced by the start page.
# NOTE(review): gethref is defined above but never called in the visible
# code — confirm whether the crawl was meant to start with gethref(url).
storePic(url)
问题就是gethref(url)函数的正则表达式匹配了一堆地址,其中出现两个地址很有问题:
http://tu.duowan.com/gallery/136914.html
http://tu.duowan.com/scroll/136914.html
http://tu.duowan.com/gallery/136913.html
http://tu.duowan.com/scroll/136913.html
http://tu.duowan.com/gallery/136912.html
http://tu.duowan.com/scroll/136912.html
http://tu.duowan.com/tag/12605.html" target="_blank" style="text-decoration-line: none; color: rgb(75, 178, 251);">今日囧图</a> <a target="_blank" style="text-decoration-line: none; color: rgb(75, 178, 251);">吐槽囧图</a> <a target="_blank" style="text-decoration-line: none; color: rgb(75, 178, 251);">爆笑gif </a><a target="_blank" style="text-decoration-line: none; color: rgb(75, 178, 251);">日式冷笑话</a> <a href="http://tu.duowan.com/tag/23515.html
http://tu.duowan.com/gallery/136911.html
http://tu.duowan.com/scroll/136911.html
http://tu.duowan.com/gallery/136922.html
http://tu.duowan.com/gallery/136909.html
http://tu.duowan.com/scroll/136909.html
http://tu.duowan.com/gallery/136908.html
http://tu.duowan.com/gallery/136881.html
http://tu.duowan.com/gallery/136869.html
http://tu.duowan.com/gallery/136922.html
http://tu.duowan.com/scroll/136922.html
http://tu.duowan.com/tag/12605.html" target="_blank" style="color: rgb(75, 178, 251); text-decoration-line: none;">今日囧图</a> <a target="_blank" style="color: rgb(75, 178, 251); text-decoration-line: none;">吐槽囧图</a> <a target="_blank" style="color: rgb(75, 178, 251); text-decoration-line: none;">爆笑gif </a><a target="_blank" style="color: rgb(75, 178, 251); text-decoration-line: none;">日式冷笑话</a> <a href="http://tu.duowan.com/tag/23515.html
http://tu.duowan.com/editors/36.html
http://tu.duowan.com/gallery/136911.html
http://tu.duowan.com/gallery/136903.html
http://tu.duowan.com/gallery/136902.html
出现错误的就是上面两条以 http://tu.duowan.com/tag/12605.html 开头、后面还带着一大段 HTML 标签的地址(原帖中标红)。匹配了几百个 URL,只有这两个有问题。想了两天了,仍不得其解,求助大神分析并给出解决方法。虽然用书上的 ([^"]*\.html) 不会出现上面的错误,但是会匹配到非 HTTP 的 HTML 链接。
|
|