爬虫--煎蛋网随手拍
# 写于学习 Python 教程 P57 之后,未使用正则表达式
# 网址:http://jandan.net/ooxx
##代码未考虑非jpg格式图片,已知网站还有gif格式图片,有待改进
import urllib.request
import os
def open_url(url):
    """Fetch *url* and return the raw response body as bytes.

    A desktop-browser User-Agent header is attached to the request.

    Raises:
        urllib.error.URLError: if the request fails (HTTPError, e.g. a
            403 response, is a subclass).
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36')
    # Pass the Request object, not the bare URL: opening the plain URL
    # would silently drop the User-Agent header added above.
    with urllib.request.urlopen(req) as response:
        return response.read()
def find_imgs(url):
    """Return the '.jpg' image URLs found on the page at *url*.

    The page is scanned for every occurrence of the link text '查看原图';
    for each one, the image address is cut out of the text that precedes
    the marker and prefixed with 'https://'. Only addresses ending in
    '.jpg' are recognised; markers without one are skipped.

    Returns:
        list[str]: the collected image URLs, in page order.
    """
    marker_text = '查看原图'
    # NOTE(review): 127 is a fixed distance from the marker back to the
    # start of the image address; it depends on the page markup - confirm
    # against the live page.
    back_offset = 127

    html = open_url(url).decode('utf-8')
    pic_list = []
    marker = html.find(marker_text)
    while marker != -1:
        # Clamp at 0: a negative start would be interpreted relative to the
        # end of the string by str.find and by slicing.
        start = max(marker - back_offset, 0)
        # Search only up to the marker so a missing '.jpg' (for example a
        # different image format) cannot pick up an unrelated '.jpg' later on.
        ext = html.find('.jpg', start, marker)
        if ext != -1:
            # Append just the address slice, not the entire page text.
            pic_list.append('https://' + html[start:ext + 4])
        marker = html.find(marker_text, marker + len(marker_text))
    return pic_list
def get_next_page(url):
    """Return the URL of the page linked from the '下一页' (next page) link.

    The address is located by searching for '//' shortly before the link
    text and reading up to the following 'comments'; the result is
    'https:' + that address + 'comments'.
    """
    html = open_url(url).decode('utf-8')
    # NOTE(review): the 80-character look-back depends on the page markup.
    # Clamp at 0 so a negative value is not treated as an offset from the
    # end of the string.
    window_start = max(html.find('下一页') - 80, 0)
    start = html.find('//', window_start)
    end = html.find('comments', start)
    # Use only the slice between '//' and 'comments', not the whole page.
    return 'https:' + html[start:end] + 'comments'
def sava_imgs(folder, pic_list):
    """Download every URL in *pic_list* into the current working directory.

    Each file is named after the last '/'-separated component of its URL.
    *folder* is accepted for the caller's convenience but is not used here.
    """
    for pic_url in pic_list:
        target_name = pic_url.rsplit('/', 1)[-1]
        with open(target_name, 'wb') as out_file:
            out_file.write(open_url(pic_url))
def g_pic_dl(folder='妹子图',pages=10):
    """Download the images from *pages* consecutive pages into *folder*.

    Starts at http://jandan.net/ooxx and follows the next-page link after
    each page. The process's working directory is changed to *folder*.

    Args:
        folder: directory to save into; created if it does not exist.
        pages: number of pages to process.
    """
    url = "http://jandan.net/ooxx"
    # exist_ok lets the script be re-run without failing on an existing folder.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    while pages > 0:
        pic_list = find_imgs(url)
        sava_imgs(folder, pic_list)
        url = get_next_page(url)
        # Count the page down; without this the loop never terminates.
        pages -= 1
# Run the downloader with its default arguments when executed as a script.
if __name__ == "__main__":
    g_pic_dl()
#写于学习完Python教程P61之后
小甲鱼给出的 IP 地址正则表达式有个小 bug,
自己学习理解后,写出如下匹配 IPv4 地址的正则:
r'((25[0-5]|2[0-4]\d|1\d\d|\d\d|\d)\.){3}(25[0-5]|2[0-4]\d|1\d\d|\d\d|\d)'
单独一段三位数测试如下:
import re  # imported here because this snippet runs before any other import of re

# Try the single-octet pattern against every value from 1 to 255.
# The alternatives cover 250-255, 200-249, 100-199, 10-99 and 0-9 in turn.
for i in range(1, 256):
    re.search(r'(25[0-5]|2[0-4]\d|1\d\d|\d\d|\d)', str(i))
#加入正则表达式后对随手拍的爬虫代码(正常下载jpg及gif):
import urllib.request
import os
import re
def open_url(url):
    """Return the body of the response for *url* as bytes.

    The request is sent with a desktop-browser User-Agent header.

    Raises:
        urllib.error.URLError: if the request fails (HTTPError, e.g. a
            403 response, is a subclass).
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36')
    # Open the prepared Request rather than the raw URL so the header
    # set above is actually transmitted; the context manager closes the
    # response once it has been read.
    with urllib.request.urlopen(req) as response:
        return response.read()
def find_imgs(url):
    """Return the href values of all '.jpg' / '.gif' links on the page at *url*.

    The values are returned exactly as they appear in the page (no scheme
    is added here).

    Returns:
        list[str]: the matched href values, in page order.
    """
    html = open_url(url).decode('utf-8')
    # The dot is escaped so it matches only a literal '.' before the
    # extension; an unescaped '.' would accept any character there.
    pattern = r'<a href="([^"]+\.(?:jpg|gif))"'
    return re.findall(pattern, html)
def get_next_page(url):
    """Return 'https:' plus the href of the "Older Comments" link on *url*.

    Raises:
        AttributeError: if the page contains no such link (the search
            result is None).
    """
    page_text = open_url(url).decode('utf-8')
    older_link = re.search(r'<a title="Older Comments" href="([^"]+)"', page_text)
    return 'https:' + older_link.group(1)
def sava_imgs(pic_list):
    """Download each entry of *pic_list* into the current working directory.

    Every entry is prefixed with 'https:' before downloading, and the file
    is named after the last '/'-separated component of the entry.
    """
    for link in pic_list:
        local_name = link.rsplit('/', 1)[-1]
        urllib.request.urlretrieve('https:' + link, local_name)
def g_pic_dl(folder='妹子图',pages=5):
    """Download the images from *pages* consecutive pages into *folder*.

    Starts at https://jandan.net/ooxx and moves to the next page after each
    one. *folder* is created when it is not already a directory, and the
    process's working directory is changed to it.
    """
    if not os.path.isdir(folder):
        os.mkdir(folder)
    os.chdir(folder)

    current_url = "https://jandan.net/ooxx"
    remaining = pages
    while remaining > 0:
        sava_imgs(find_imgs(current_url))
        current_url = get_next_page(current_url)
        remaining -= 1
# Run the downloader with its default arguments when executed as a script.
if __name__ == "__main__":
    g_pic_dl()
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
这个没有加入正则表达式的代码不行,报 403 了,希望可以解决一下。
页:
[1]