|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import urllib.request
import re
def open_url(url):
req = urllib.request.Request(url) # urllib.request.Request讲解见14.2.1、14.3.1,举例见p14_4.py、p14_5.py
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36') # add_header讲解见14.3.1,举例见爬取猪八戒词条
response = urllib.request.urlopen(req)
#response
#print(response)
html = response.read().decode('utf-8')
return html
def get_img(html):
p = r'(?:(?:[0,1]?\d?\d|2[0-4]\d|25[0-5])\.){3}(?:[0,1]?\d?\d|2[0-4]\d|25[0-5])' # ip由四段数字组成:XXX.XXX.XXX.XXX
iplist = re.findall(p, html)
for each in iplist:
print(each)
if __name__ == '__main__': # 见13.4(P147-149)
url = 'https://proxy.seofangfa.com'
get_img(open_url(url)) # 看不懂???!!!
运行后获得:
36.134.91.82
117.157.197.18
103.59.151.99
111.23.16.25
203.34.48.10
45.189.254.70
43.255.113.23
43.255.113.23
43.255.113.23
43.255.113.23
与实际网页对比如下:
红框处不同,是不是正则表达式有错造成的呢?完全按小甲鱼教学视频上敲的哦
>>> import re
>>> p = r'(?:(?:2[0-4]\d|25[0-5]|[0,1]?\d?\d)\.){3}(?:2[0-4]\d|25[0-5]|[0,1]?\d?\d)'
>>> re.match(p, "103.133.177.,41")
<re.Match object; span=(0, 15), match='103.133.177.,41'>
>>> p = r'(?:(?:2[0-4]\d|25[0-5]|[01]?\d?\d)\.){3}(?:2[0-4]\d|25[0-5]|[01]?\d?\d)'
>>> re.match(p, "103.133.177.,41")
>>>
|
|