# Revised proxy-IP scraper; intended to be usable as-is without raising errors.
import urllib.request
import re
import os
import random
import urllib.error
def url_open(url):
    """Fetch *url* and return its body decoded as UTF-8 text.

    A browser-like ``user-agent`` header is attached to the request. One
    proxy is picked at random from a hard-coded list and an opener built
    around it is installed as the process-wide default via
    ``urllib.request.install_opener``.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` is a
            subclass of it).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    req = urllib.request.Request(url)
    req.add_header(
        "user-agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36",
    )
    proxies = [
        "125.108.123.66:9000",
        "183.166.103.45:9999",
        "115.195.84.31:8118",
        "61.178.149.237:59042",
    ]
    proxy = random.choice(proxies)
    # NOTE(review): only the "http" scheme is mapped here, so requests to
    # https:// URLs are not routed through the chosen proxy.
    proxy_support = urllib.request.ProxyHandler({'http': proxy})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    # Use the response as a context manager so it is closed even when
    # reading or decoding fails.
    with urllib.request.urlopen(req) as response:
        return response.read().decode("utf-8")
def split_addrs_port(temp_str):
    """Return the text between the first ``>`` and the ``<`` that follows it.

    Args:
        temp_str: A fragment such as ``'"IP">1.2.3.4<'``.

    Returns:
        The substring enclosed by the first ``>`` and the next ``<``
        (possibly empty).

    Raises:
        ValueError: If *temp_str* has no ``>``, or no ``<`` after it.
    """
    # partition() splits on the first delimiter only, so additional
    # ">" or "<" characters later in the string cannot break the unpacking.
    _, opener, tail = temp_str.partition(">")
    if not opener:
        raise ValueError("missing '>' in %r" % (temp_str,))
    value, closer, _ = tail.partition("<")
    if not closer:
        raise ValueError("missing '<' after '>' in %r" % (temp_str,))
    return value
def get_addrs_port(url):
    """Download *url* and extract the ``ip:port`` pairs found in its HTML.

    IPv4 addresses are taken from text of the form ``"IP">a.b.c.d<`` and
    ports from text of the form ``"PORT">ddddd<``. The n-th address is
    paired with the n-th port.

    Args:
        url: Page to fetch via ``url_open``.

    Returns:
        A list of ``"address:port"`` strings in page order. If the page
        yields different numbers of addresses and ports, the surplus
        entries are ignored.
    """
    html = url_open(url)
    # A single capture group around the value makes findall() return the
    # bare address / port strings directly.
    compile_ip = re.compile(
        r'"IP">((?:(?:[0-1]?\d?\d|2[0-4]\d|25[0-5])\.){3}'
        r'(?:[0-1]?\d?\d|2[0-4]\d|25[0-5]))<'
    )
    compile_port = re.compile(r'"PORT">(\d{0,5})<')
    addresses = compile_ip.findall(html)
    ports = compile_port.findall(html)
    # zip() stops at the shorter list, so a count mismatch cannot raise
    # IndexError.
    return [address + ":" + port for address, port in zip(addresses, ports)]
def save_addrs_port(addrs_port):
    """Append every entry of *addrs_port* to ``ip_port.txt``, one per line.

    The file is opened in append mode relative to the current working
    directory, so repeated calls accumulate lines.

    Args:
        addrs_port: Iterable of entries; each is converted with ``str()``.
    """
    with open("ip_port.txt", "a") as f:
        # One batched write instead of a write() call per entry.
        f.writelines(str(each) + "\n" for each in addrs_port)
def download_url(folder="proxy_support3", pages="10",
                 base_dir="C:\\Users\\Chysial\\Desktop"):
    """Scrape the first *pages* listing pages and save the proxies found.

    The working directory is changed to ``base_dir/folder`` (created if
    missing). Each page's ``ip:port`` list is appended to ``ip_port.txt``
    there via ``save_addrs_port`` and printed.

    Args:
        folder: Name of the output directory inside *base_dir*.
        pages: Number of listing pages to fetch, starting at page 1;
            converted with ``int()``.
        base_dir: Directory in which *folder* is created. Defaults to the
            path that was previously hard-coded.

    Note:
        The process working directory stays changed after the call.
    """
    os.chdir(base_dir)
    # exist_ok lets the script be re-run without failing on the folder
    # left behind by a previous run.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    url = "https://www.kuaidaili.com/free/inha/"
    for page_num in range(1, int(pages) + 1):
        pages_url = url + str(page_num)
        try:
            url_addrs_port = get_addrs_port(pages_url)
        except urllib.error.HTTPError as e:
            print(e.code)
            continue
        except urllib.error.URLError as e:
            # Connection-level failures (e.g. an unreachable proxy) carry no
            # HTTP status code; report the reason and move to the next page.
            print(e.reason)
            continue
        save_addrs_port(url_addrs_port)
        print(url_addrs_port)
if __name__ == "__main__":
    # Run the scraper with its default arguments when executed as a script.
    download_url()