|
发表于 2017-5-12 20:05:40
|
显示全部楼层
本帖最后由 MSK 于 2017-5-12 20:07 编辑
- from urllib.request import *
- import re
# Matches attribute-less <td>...</td> cells; DOTALL lets a cell span lines and
# `.*?` lets an empty cell count as its own (empty) field instead of being
# merged with the following cell.
_CELL_RE = re.compile(r'<td>(.*?)</td>', re.DOTALL)
# Matches any markup nested inside a cell (e.g. <a href="...">).
_TAG_RE = re.compile(r'<[^>]+>')
# Each proxy row is spread over this many consecutive plain <td> cells.
_FIELDS_PER_ROW = 6


def _parse_proxy_rows(page):
    """Group the plain <td> cells of *page* into six-field rows keyed by IP."""
    # Drop nested tags and surrounding whitespace rather than deleting the
    # characters '<', '>', '/' and the substring 'td', which mangled real data
    # such as "outdoor" or "2/3".
    cells = [_TAG_RE.sub('', raw).strip() for raw in _CELL_RE.findall(page)]

    ip_dict = {}
    # A trailing group with fewer than six cells is ignored.
    usable = len(cells) - len(cells) % _FIELDS_PER_ROW
    for start in range(0, usable, _FIELDS_PER_ROW):
        ip, *details = cells[start:start + _FIELDS_PER_ROW]
        ip_dict[ip] = details
    return ip_dict


def get(url='http://www.xicidaili.com'):
    """Fetch a proxy-list page and return its rows keyed by IP address.

    The page is requested with a browser-like User-Agent header and decoded
    as UTF-8. Every attribute-less ``<td>...</td>`` cell is collected in
    document order and consumed six at a time as
    ``(ip, port, place, form, live_time, check_time)``.

    Args:
        url: Address of the page to scrape.

    Returns:
        A dict in the form
        ``ip_dict[ip] = [port, place, form, live_time, check_time]``,
        where every value is the cell's text with nested markup and
        surrounding whitespace removed. If the same IP appears twice, the
        later row wins.

    Raises:
        Whatever ``urlopen`` raises when the page cannot be fetched (for
        example ``urllib.error.URLError``), and ``UnicodeDecodeError`` when
        the body is not valid UTF-8.
    """
    request = Request(url)
    request.add_header(
        "User-Agent",
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    )
    # Use the response as a context manager so the connection is always closed.
    with urlopen(request) as response:
        page = response.read().decode('utf-8')
    return _parse_proxy_rows(page)
复制代码
因为经常写爬虫,所以觉得这一段特别有效。
|
评分
-
查看全部评分
|