|
楼主 |
发表于 2020-3-18 18:38:56
|
显示全部楼层
import requests
import re
def open_url(url, timeout=10):
    """Send a GET request to *url* with a desktop-browser User-Agent.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, so a finite default is used.

    Returns:
        The response object returned by ``requests.get``. The HTTP status is
        not checked here; callers read ``.text`` or ``.content`` themselves.

    Raises:
        requests.exceptions.RequestException: propagated unchanged from
            ``requests.get`` (including a timeout).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'}
    return requests.get(url=url, headers=headers, timeout=timeout)
# Crawl the site's home page, follow every short ".html" link to a gallery,
# collect the .jpg URLs from each page of that gallery and save the images
# into the current working directory.
#
# NOTE(review): the pasted original had lost all indentation. The nesting
# below is reconstructed from the data flow (each gallery is crawled and its
# images downloaded before the next link is visited) -- confirm against the
# author's intent.

BASE_URL = 'http://www.rentifq.com'
REQUEST_TIMEOUT = 10  # seconds; without it requests.get may block forever
# Links longer than this are printed but not crawled.
# NOTE(review): presumably this filters out links that are not gallery
# pages -- confirm.
MAX_LINK_LENGTH = 30
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'}


def _fetch_html(url):
    """Return the body of *url* as text, sending the browser-like headers."""
    return requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT).text


def _extract_image_urls(html):
    """Return absolute URLs of the site-relative .jpg images found in *html*."""
    image_urls = []
    for fragment in re.findall(r'img alt=([\s\S]*?.jpg)', html):
        sources = re.findall(r'src="(/[\s\S]*?.jpg)', fragment)
        # An <img> fragment without a site-relative src used to raise
        # IndexError and abort the whole crawl; skip it instead.
        if sources:
            image_urls.append(BASE_URL + sources[0])
    return image_urls


def _count_pages(html):
    """Return how many pages a gallery has, judging by its pager links.

    Returns 1 when no pager is found, so that a page count left over from a
    previously crawled gallery is never reused.
    """
    page_count = 1
    pager_blocks = re.findall(
        r'<FONT COLOR=red><B>1</B></FONT> <A HREF="([\s\S]*)</a> <A', html)
    for pager in pager_blocks:
        # The "+ 2" is kept from the original script.
        # NOTE(review): presumably it accounts for page 1 plus the link whose
        # HREF=" prefix is consumed by the outer pattern -- confirm against
        # the live markup.
        page_count = len(re.findall(r'HREF="([\s\S]*?)">', pager)) + 2
    return page_count


home_html = _fetch_html(BASE_URL + '/')
# The regex values are unchanged from the original (only made raw strings),
# including the unescaped "." before "html".
for anchor in re.findall(r'<a href=[\s\S]*?.html', home_html):
    # Collect the link targets on the home page (".html" suffix stripped).
    for link in re.findall(r'<a href="([\s\S]*?).html', anchor):
        # Open and crawl the linked pages one at a time.
        print(link)
        if len(link) > MAX_LINK_LENGTH:
            continue
        gallery_url = BASE_URL + link
        print(gallery_url)

        first_page = _fetch_html(gallery_url + '.html')
        page_count = _count_pages(first_page)
        print('共%d页' % page_count)

        jpg_all = _extract_image_urls(first_page)
        # Page 1 is "<gallery>.html"; later pages are "<gallery>$<n>.html".
        for page_no in range(2, page_count + 1):
            page_html = _fetch_html(gallery_url + '$' + str(page_no) + '.html')
            jpg_all.extend(_extract_image_urls(page_html))
        print(jpg_all)

        for image_url in jpg_all:
            print(image_url)
            # The last path segment of the URL is used as the local file name.
            filename = image_url.rsplit('/', 1)[-1]
            print(filename)
            # Download before opening the file so that a failed request does
            # not leave an empty file behind.
            img = open_url(image_url).content
            with open(filename, 'wb') as f:
                f.write(img)
更新版.............. |
|