|
|
发表于 2019-10-14 10:57:12
|
显示全部楼层
修改了一下代码,现在可以爬取了,但是服务器有反爬措施,部分请求可能会失败:
- import urllib.request
- import urllib.error
- import re
def read_url(url, timeout=30):
    """Fetch *url* and return the response body as text.

    The request is sent with a desktop-browser ``User-Agent`` and a
    ``Referer`` header set to the site root (presumably so the server treats
    it like ordinary browser traffic).

    Args:
        url: Address of the resource to download.
        timeout: Seconds to wait on blocking network operations before
            giving up, so that a stalled server cannot hang the caller.

    Returns:
        The response body decoded as UTF-8; bytes that are not valid UTF-8
        are silently dropped.

    Raises:
        OSError: If the request fails or times out (this includes
            ``urllib.error.URLError`` and ``urllib.error.HTTPError``).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        "Referer": "https://www.ilemiss.net",
    }
    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the connection even if read/decode raises.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode("utf-8", "ignore")
def download_picture(url):
    """Download every image of the gallery whose first page is *url*.

    The page must contain a "尾页" (last page) link of the form
    ``NNNN_MM.html``. ``NNNN`` is used as the gallery name and ``MM`` as the
    number of images to fetch. Images are taken from the directory of the
    first ``.../1.jpg`` image URL found on the page and are written to the
    current working directory as ``<NNNN>-<i>.jpg``.

    Args:
        url: Address of the gallery's first page.

    Raises:
        ValueError: If the page has no last-page link or no first-image URL,
            i.e. the page layout is not the one this scraper expects.
        OSError: If fetching the page or any image fails.
    """
    page = read_url(url)

    # A single match yields both the gallery id and the image count.
    tail_link = re.search(r'<a href="(\d{4})_(\d{2}).html">尾页', page)
    if tail_link is None:
        raise ValueError("no last-page link found on %r" % (url,))
    name, last = tail_link.groups()

    first_image = re.search(r'http://img.qincns.com/\d{4}/1\.jpg', page)
    if first_image is None:
        raise ValueError("no first-image URL found on %r" % (url,))
    # Everything before "1.jpg" is the directory shared by all the images.
    base_url = first_image.group().split("1.jpg")[0]

    for index in range(1, int(last) + 1):
        pic_url = base_url + str(index) + ".jpg"
        pic_name = name + "-" + str(index) + ".jpg"
        # NOTE(review): urlretrieve sends urllib's default headers, not the
        # browser-like ones used by read_url; confirm the image host accepts
        # that.
        urllib.request.urlretrieve(pic_url, pic_name)
# Walk listing pages 2 through 9 and download every gallery each one links to.
for page_no in range(2, 10):
    listing_url = f"https://www.ilemiss.net/sexy/index_{page_no}.html"
    listing_html = read_url(listing_url)
    for gallery_url in re.findall(
        r'class="imbtxt"><p><a href="(.*?\.html)" ', listing_html
    ):
        print(gallery_url)
        try:
            download_picture(gallery_url)
        except Exception as err:
            # Best effort: report the failure and continue with the next
            # gallery instead of aborting the whole crawl.
            print(err)
复制代码
|
|