|
发表于 2018-10-14 09:25:13
|
显示全部楼层
参考代码:
def get_proxy(html):
    """Extract the HTTP/HTTPS proxies listed in a proxy-table HTML page.

    The page is scanned independently for IPv4 addresses, ``<td>`` cells
    holding a port number, and ``<td>`` cells holding a protocol name
    (``HTTP``, ``HTTPS`` or ``socks4/5``).  The i-th match of each kind is
    treated as belonging to the same table row.

    Args:
        html: Text of the proxy-list page.

    Returns:
        A list of single-entry dicts ``{protocol: 'ip:port'}`` in page
        order, with the protocol lower-cased (``'http'`` or ``'https'``).
        ``socks4/5`` rows are skipped.

    Raises:
        AssertionError: If the numbers of addresses, ports and protocols
            found differ, so rows cannot be paired up reliably.
    """
    # Longest alternatives first, plus digit look-arounds, so the final octet
    # is never cut short ("1.2.3.200" must not be captured as "1.2.3.20").
    octet = r'(?:25[0-5]|2[0-4]\d|[01]?\d?\d)'
    ip_pattern = r'(?<!\d)(?:' + octet + r'\.){3}' + octet + r'(?!\d)'
    # Ports 0-65535; the last alternative stops at 65535, not 65536.
    port_pattern = (
        r'<td>([0-5]?\d?\d?\d?\d|6[0-4]\d\d\d|65[0-4]\d\d'
        r'|655[0-2]\d|6553[0-5])</td>'
    )
    protocol_pattern = r'<td>(HTTP|HTTPS|socks4/5)</td>'

    ip_list = re.findall(ip_pattern, html)
    port_list = re.findall(port_pattern, html)
    protocol_list = [
        name.lower() for name in re.findall(protocol_pattern, html)
    ]

    # The three lists are only paired by position, so they must be the same
    # length.  Raised explicitly (not via ``assert``) so the check still runs
    # under ``python -O``.
    if not len(ip_list) == len(port_list) == len(protocol_list):
        raise AssertionError(
            'proxy table mismatch: {} addresses, {} ports, {} protocols'.format(
                len(ip_list), len(port_list), len(protocol_list)
            )
        )

    # Keep only HTTP/HTTPS entries: the result is meant for a urllib
    # ProxyHandler, which has no use for a 'socks4/5' key.
    return [
        {protocol: '{}:{}'.format(ip, port)}
        for ip, port, protocol in zip(ip_list, port_list, protocol_list)
        if protocol != 'socks4/5'
    ]
# Build a urllib opener that sends requests through one randomly chosen proxy.
# NOTE(review): open_url and IP_url are defined outside this snippet; presumably
# open_url(IP_url) returns the proxy-list page's HTML as text -- confirm.
- proxy_list = get_proxy(open_url(IP_url))
# random.choice raises IndexError when proxy_list is empty.
- proxy = random.choice(proxy_list)
# ProxyHandler takes a dict mapping a protocol name to a proxy address, which is
# the shape of each get_proxy() entry ({protocol: 'ip:port'}).
- proxy_support = urllib.request.ProxyHandler(proxy)
- opener = urllib.request.build_opener(proxy_support)
复制代码 |
|