|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
"""
* 完成时间:2020-03-04 *
* 作者:惜言 *
* 功能:自动爬取xicidaili.com的高匿代理 *
"""
from urllib import request
import re
import pickle
import easygui
# Proxy information extraction module.
# Each rule is (label prepended to the value, pattern searched on the line,
# pattern of characters stripped from the match, or None to keep it as is).
# Rules are tried in this order and the first one that matches wins.
_FIELD_RULES = (
    # IP address: four dot-separated digit runs, used verbatim.
    ("ip address:", re.compile(r"[\d]+\.[\d]+\.[\d]+\.[\d]+"), None),
    # Port: 1-5 digits sitting directly between '>' and '<'.
    (" port:", re.compile(r">[\d]{1,5}<"), re.compile(r"[<>]")),
    # Location: text of a link whose href starts with a /YYYY-MM-DD/ segment.
    (
        " 所在地:",
        re.compile(r'href="/[\d]{4,4}-[\d][\d]-[\d][\d]/[\w]*">[\w]*</a>'),
        re.compile(r'[a-zA-Z/0-9="<>]|-'),
    ),
    # Anonymity level: a two-character "country" cell. The strip set has no
    # space, so the blank inside `td class` survives and the value keeps a
    # leading space; this is preserved for output compatibility.
    (
        " 匿名度:",
        re.compile(r'td class="country">[\w]{2,2}</td>'),
        re.compile(r'[a-zA-Z/<>="]'),
    ),
    # Protocol: a cell made of 4-5 characters drawn from H, T, P, S.
    (" 协议:", re.compile(r"<td>[HTPS]{4,5}</td>"), re.compile(r"[<>td/]")),
    # Lifetime: a cell whose text ends in one of 天/时/分/秒.
    # NOTE(review): cells ending in any other character are not matched by
    # this pattern -- confirm that is intended.
    (" 存活时间:", re.compile(r"<td>[\w]*[天时分秒]</td>"), re.compile(r"[td<>/]")),
)

# Last-checked timestamp "YY-MM-DD HH:MM"; finding it completes one record.
_TEST_TIME_RE = re.compile(r"[\d]{2,2}-[\d]{2,2}-[\d]{2,2} [\d]{2,2}:[\d]{2,2}")


def getmessage(path="./proxy.html"):
    """Extract proxy descriptions from a saved listing page.

    The file is scanned line by line. Each line contributes at most one
    field (IP, port, location, anonymity, protocol or lifetime) to the record
    being built. A line containing a last-checked timestamp finishes the
    record, which is appended to the result, and a new record is started.

    Args:
        path: HTML file to parse, read as UTF-8. Defaults to "./proxy.html".

    Returns:
        A list of strings, one per completed record, in file order.

    Raises:
        OSError: if the file cannot be opened.
    """
    records = []
    pieces = []
    with open(path, "r", encoding="UTF-8") as fp:
        for line in fp:
            for label, pattern, strip in _FIELD_RULES:
                found = pattern.search(line)
                if found is None:
                    continue
                value = found.group()
                if strip is not None:
                    value = strip.sub("", value)
                pieces.append(label + value)
                break
            else:
                # No field rule matched: check for the record terminator.
                stamp = _TEST_TIME_RE.search(line)
                if stamp is not None:
                    pieces.append(" 最后测试时间:" + stamp.group())
                    records.append("".join(pieces))
                    pieces = []
    return records
# Page request module.
def myrequest(timeout=30):
    """Download the proxy listing page and save it as ./proxy.html.

    The body is read and decoded in full before the output file is opened,
    so a failed download does not truncate an existing ./proxy.html.

    Args:
        timeout: Seconds to wait on the network before giving up.

    Raises:
        urllib.error.URLError: if the request fails.
        UnicodeDecodeError: if the response body is not valid UTF-8.
        OSError: on a timeout or if the output file cannot be written.
    """
    proxy_url = "https://www.xicidaili.com/nn/"
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
        "Referer":"https://www.xicidaili.com/wn/"
    }
    req = request.Request(proxy_url, headers=headers, method="GET")
    # The context manager closes the HTTP response even if read/decode fails.
    with request.urlopen(req, timeout=timeout) as resp:
        page = resp.read().decode("UTF-8")
    with open("./proxy.html", "w", encoding="UTF-8") as fp:
        fp.write(page)
# Write the collected proxy information to a file.
def writeproxy(proxies=None, path="./proxy"):
    """Pickle a list of proxy records to disk.

    Args:
        proxies: Object to store. When omitted (None), the module-level
            ``proxy_list`` is used, which keeps the original zero-argument
            call working.
        path: Destination file, opened in binary write mode.

    Raises:
        NameError: if ``proxies`` is omitted and no module-level
            ``proxy_list`` exists.
        OSError: if the file cannot be written.
    """
    if proxies is None:
        # Backward-compatible fallback to the global set by the script entry.
        proxies = proxy_list
    with open(path, "wb") as px:
        pickle.dump(proxies, px)
# Read the proxy information back from a file.
def getproxy(path="./proxy"):
    """Load and return the pickled proxy records.

    Args:
        path: Pickle file to read, opened in binary mode.

    Returns:
        Whatever object was pickled into the file.

    Raises:
        OSError: if the file cannot be opened.
        pickle.UnpicklingError: if the file content is not a valid pickle.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # this program (or another trusted source) produced.
    with open(path, "rb") as px:
        return pickle.load(px)
if __name__ == '__main__':
    # Fetch the listing page and save it to ./proxy.html.
    myrequest()
    # Parse the saved page. The result must stay in the module-level name
    # `proxy_list`, because writeproxy() reads it from there.
    proxy_list = getmessage()
    # Persist the parsed proxy records.
    writeproxy()
|