|
楼主 |
发表于 2017-7-17 00:22:07
|
显示全部楼层
经过指点,发现问题出在 for 循环与循环体内的调用重复,导致一直无法跑完全部数据。现将代码修改如下。由于代理IP质量不够好,速度很慢,但能满足三位数(几百个)IP的抓取。使用前需要将待查询的IP放在 ipsearch.txt 文档里面。
- import socket
- import urllib.request
- import random
# Read the whole input up front. The handle is not needed once the text is in
# memory, so it is released immediately; `f` stays bound, which keeps any
# later f.close() a harmless no-op.
with open("ipsearch.txt", "r") as f:
    # Named distinctly from the `ipsearch` function so the name is not
    # silently rebound from a string to a function further down the script.
    raw_ip_text = f.read()
# Results file: stays open for the rest of the script because the lookup code
# writes to it; it must be closed once the run finishes.
w = open("查询结果.txt", "w")
# One entry per whitespace-separated token of the input file.
ip_list = raw_ip_text.split()
def is_valid_ip(ip: str) -> bool:
    """Return True if the given string is a well-formed IP address.

    Supports IPv4 and IPv6. The check is delegated to ``socket.getaddrinfo``
    with ``AI_NUMERICHOST``, so only numeric addresses are accepted and no
    name resolution is attempted.

    Args:
        ip: Candidate address string.

    Returns:
        True when ``getaddrinfo`` accepts the string as a numeric address,
        False for empty input, input containing NUL bytes, or anything that
        is not a numeric address.

    Raises:
        socket.gaierror: For any resolver failure other than ``EAI_NONAME``.
    """
    if not ip or '\x00' in ip:
        # getaddrinfo resolves empty strings to localhost, and truncates
        # on zero bytes.
        return False
    try:
        res = socket.getaddrinfo(ip, 0, socket.AF_UNSPEC,
                                 socket.SOCK_STREAM,
                                 0, socket.AI_NUMERICHOST)
    except socket.gaierror as e:
        if e.args[0] == socket.EAI_NONAME:
            return False
        raise
    except UnicodeError:
        # The host string is IDNA-encoded before lookup; an over-long or
        # otherwise unencodable token is simply not an IP address.
        return False
    return bool(res)
def ipsearch(ip1):
    """Look up one address on ip138.com and append the result to the output file.

    Relies on two module-level names: ``is_valid_ip`` for validation and the
    open output file ``w``. For an invalid address the line "错误值" is
    written instead of a lookup result. The return value of each ``w.write``
    call is echoed with ``print``.

    Args:
        ip1: Address string to query.

    Raises:
        ValueError: If either page marker is not found at or after character
            offset 6000 of the decoded page (raised by ``str.index``).
    """
    if not is_valid_ip(ip1):
        print(w.write("错误值\n"))
        return

    # Pick one proxy at random for this request.
    proxy_ip = ('111.11.83.243', '183.222.102.100', '119.36.92.41', '122.72.32.72')
    proxy_handler = urllib.request.ProxyHandler({'http': random.choice(proxy_ip)})
    opener = urllib.request.build_opener(proxy_handler)
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0')]
    # Installed as the global opener so the urlopen() call below uses it.
    urllib.request.install_opener(opener)

    url = "http://www.ip138.com/ips1388.asp?ip=" + ip1 + "&action=2"
    # Close the HTTP response as soon as the body is read so that a long run
    # does not accumulate open connections.
    with urllib.request.urlopen(url) as response:
        page = response.read().decode('gbk')

    # The result text sits between these two markers; both searches start at
    # offset 6000 of the page.
    x = page.index("本站数据", 6000)
    y = page.index("参考数据1", 6000)
    File3 = ip1 + page[x+4:y-9] + "\n"
    print(w.write(File3))
def ipsearch_while():
    """Drain the module-level ``ip_list``, querying each address via ``ipsearch``.

    Addresses are removed from the end of the list before they are queried,
    so the list shrinks by one per lookup and is empty when this returns.
    """
    while ip_list:
        ipsearch(ip_list.pop())
# Script driver: process every address, then always close both files.
try:
    # Keep going until the list is drained rather than retrying only once.
    # Each attempt removes an address from ip_list before it can fail, so the
    # list strictly shrinks and this loop always terminates.
    while ip_list:
        try:
            ipsearch_while()
        except TimeoutError:
            # The address that timed out has already been removed from the
            # list, so it is skipped and the run resumes with the next one.
            continue
finally:
    w.close()
    f.close()
复制代码 |
|