|

楼主 |
发表于 2017-6-27 23:34:06
|
显示全部楼层
- import re
- import urllib.request
def get_html(url='https://www.pengfu.com/'):
    """Download a page and return its body as text.

    Args:
        url: Address to fetch. Defaults to the pengfu.com homepage, which is
            what the original zero-argument call retrieved.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    req = urllib.request.Request(url)
    # Send a desktop-browser User-Agent instead of urllib's default one.
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36')
    # The context manager closes the connection even if read/decode raises.
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf-8')
# Matches a headline anchor on the homepage and captures the numeric content id.
# Sample of the markup being matched:
#   <h1 class="dp-b"><a href="https://www.pengfu.com/content_1706980_1.html" target="_blank">...</a>
# Dots are escaped so they match only a literal '.', and the id may have any
# number of digits rather than exactly seven.
_CONTENT_LINK_RE = re.compile(
    r'<h1 class="dp-b"><a href="https://www\.pengfu\.com/content_(\d+)_1\.html"'
)


def get_page(html):
    """Extract the content-page URLs linked from headline anchors in *html*.

    Args:
        html: Page markup as a string.

    Returns:
        A list of ``https://www.pengfu.com/content_<id>_1.html`` URLs, in the
        order they appear in *html* (duplicates are kept). Empty if none match.

    Side effects:
        Prints the resulting list to stdout.
    """
    list1 = [
        "https://www.pengfu.com/content_" + content_id + "_1.html"
        for content_id in _CONTENT_LINK_RE.findall(html)
    ]
    print(list1)

    return list1
# Script driver: download the homepage once, then print the content links found on it.
html = get_html()
get_page(html)
复制代码
可以了尴尬 |
|