|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 一个账号 于 2020-3-30 09:57 编辑
看老污龟旧版视频学习的 Python,研究了不少时间,终于鼓捣出了我的第一个爬虫程序。程序里加了随机延迟(请求太快会被封)。话不多说,源码贴上:
- import requests
- import re
- import random
- import time
- import os
# Candidate HTTPS proxy endpoints, one single-key dict per endpoint.
# Nothing in the visible script references this list.
proxy = [
    {'https': address}
    for address in (
        '47.99.236.251:3128',  # not enabled
        '1.197.16.250:9999',
        '123.163.96.235:9999',
        '202.183.32.182:80',
        '113.194.31.21:9999',
    )
]
# Pool of request-header dicts, each holding a single 'user-agent' entry.
# The download helpers pick one of these at random for every request.
header = [
    {'user-agent': agent}
    for agent in (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.17 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1',
        'mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/58.0.3029.110 safari/537.36 edge/16.16299',
    )
]
def ym(http, referer):
    """Fetch a page and return its source as text.

    Args:
        http: URL of the page to request.
        referer: Value sent in the ``Referer`` request header; it is
            converted to a string with ``str.format``.

    Returns:
        The response body decoded to text (``Response.text``).
    """
    # Work on a copy so that adding Referer never mutates the shared
    # ``header`` pool that other requests draw from.
    headers = dict(random.choice(header))
    headers['Referer'] = '{}'.format(referer)
    # Pass the headers built above so the Referer is actually sent. The
    # local is named ``response`` so it does not shadow the ``re`` module.
    response = requests.get(url=http, headers=headers)
    return response.text
def bcwj(wname, http, referer):
    """Download a file and save it inside the directory ``wname``.

    The file name is the last ``/``-separated segment of ``http``. A random
    delay of 1-8 seconds is inserted before the request.

    Args:
        wname: Directory the file is written into.
        http: URL of the file to download.
        referer: Value sent in the ``Referer`` request header.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never stored as if it were the image.
        OSError: If the target file cannot be written.
    """
    # Copy the chosen entry so the shared ``header`` pool is not mutated.
    headers = dict(random.choice(header))
    headers['Referer'] = '{}'.format(referer)
    name = http.split('/')[-1]
    time.sleep(random.randint(1, 8))
    r = requests.get(http, headers=headers)
    r.raise_for_status()
    # os.path.join uses the platform's separator instead of a hard-coded
    # backslash, which only formed a directory path on Windows.
    filename = os.path.join(wname, name)
    with open(filename, 'wb') as f:
        f.write(r.content)
    print(headers, name, '下载成功')
def ppzpz(ym):
    """Extract photo-set URLs from a listing page's HTML source.

    Args:
        ym: HTML source text to scan.

    Returns:
        A list of URLs of the form ``https://www.mzitu.com/<4-7 digits>``,
        in order of appearance. Only links that directly follow
        ``<li><a href="`` are matched.
    """
    # Dots in the host name are escaped so they match a literal '.' only.
    return re.findall(r'<li><a href="(https://www\.mzitu\.com/\d{4,7})', ym)
def ppjpg(ym2):
    """Find jpg download links hosted on ``i3.mmzztt.com``.

    Args:
        ym2: HTML source text to scan.

    Returns:
        A list of matching URLs shaped like
        ``https://i3.mmzztt.com/<digits>/<digits>/<1-6 chars>.jpg``, in
        order of appearance.
    """
    # Literal dots (host name and file extension) are escaped; the
    # ``.{1,6}`` file-stem wildcard is kept as is.
    return re.findall(r'https://i3\.mmzztt\.com/\d+/\d+/.{1,6}\.jpg', ym2)
def ppjpg2(ym2):
    """Find jpg download links on any ``i<optional digit>.mmzztt.com`` host.

    Args:
        ym2: HTML source text to scan.

    Returns:
        A list of matching URLs shaped like
        ``https://i<0-1 digit>.mmzztt.com/<digits>/<digits>/<1-6 chars>.jpg``,
        in order of appearance.
    """
    # Literal dots (host name and file extension) are escaped; the
    # ``.{1,6}`` file-stem wildcard is kept as is.
    return re.findall(r'https://i\d?\.mmzztt\.com/\d+/\d+/.{1,6}\.jpg', ym2)
def fhys(aa):
    """Collect every one- or two-digit number wrapped in a bare ``<span>``.

    Args:
        aa: HTML source text to scan.

    Returns:
        A list of the digit strings found, in order of appearance. The
        caller decides which entry to use (the script takes the last one
        as the page count).
    """
    span_number = re.compile(r'<span>(\d{1,2})</span>')
    return span_number.findall(aa)
def ppwjj(httpjpg):
    """Fetch a page and extract the title text used as the folder name.

    The page is requested through ``ym`` with the URL itself as Referer,
    and every ``content="..."`` attribute value is captured up to the
    marker ``- 第``.

    Args:
        httpjpg: URL of the page to fetch.

    Returns:
        A list of captured title strings (possibly empty) -- note that it
        is a list, not a single string.
    """
    page_source = ym(httpjpg, httpjpg)
    title_pattern = r'content="(.*?)- 第'
    return re.findall(title_pattern, page_source)
# Script body: walk listing pages -> photo sets -> individual image pages,
# saving each image into a folder named after its photo set.
ys = input('请输入爬取的页数(建议10):')
# NOTE(review): the range starts at 2 and its upper bound is exclusive, so
# entering N visits listing pages 2..N-1 -- confirm this is intended.
for i in range(2, int(ys)):
    time.sleep(random.randint(8, 15))
    print(i)
    list_url = 'https://www.mzitu.com/page/%s/' % i
    zpz = ppzpz(ym(list_url, list_url))  # photo-set URLs on this listing page
    print(zpz)
    for n in zpz:  # one iteration per photo set
        # Guard against a page with no recognisable title instead of
        # crashing the whole crawl with an IndexError.
        titles = ppwjj(n)
        if not titles:
            print('未获取到照片组标题,跳过:', n)
            continue
        wname = titles[0].rstrip()
        try:
            os.mkdir(wname)
        except FileExistsError:
            print('文件夹已存在')
        except OSError as exc:
            # Any other failure (e.g. a title that is not a valid directory
            # name) means nothing can be saved for this set.
            print('创建文件夹失败,跳过:', wname, exc)
            continue
        try:
            # The last <span> number on the set page is taken as the
            # number of image pages in the set.
            yss = int(fhys(ym(n, n))[-1])
        except Exception:
            print('第一次循环未获取到页数')
            continue
        time.sleep(random.randint(5, 11))
        # NOTE(review): this requests sub-pages 0..yss-1; if the site
        # numbers them 1..yss the last image is never fetched -- confirm.
        for ii in range(0, yss):
            time.sleep(3)
            page_url = '{}/{}'.format(n, ii)
            y = ''  # pre-bound so the error message below can always print it
            try:
                # Referer is the photo-set URL, not the listing-page index.
                y = ym(page_url, n)
                jpg = ppjpg2(y)[0]
            except Exception:
                print('获取jpg异常,跳过本次循环(jpg,源码):\n', y)
                continue
            print(jpg)
            try:
                # The image request carries the page that embeds it as
                # Referer.
                bcwj(wname, jpg, page_url)
            except Exception:
                print(',保存文件出现异常')
复制代码
|
-
-
|