import re
import urllib.request
def get_html(url='https://www.pengfu.com/', timeout=10):
    """Download a page and return its body decoded as UTF-8 text.

    Args:
        url: Address to fetch. Defaults to the pengfu.com front page.
        timeout: Seconds to wait on blocking network operations, so a
            stalled server cannot hang the caller indefinitely.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Send a browser-like User-Agent instead of urllib's default one.
    user_agent = ('Mozilla/5.0 (Windows NT 6.3; WOW64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/59.0.3071.104 Safari/537.36')
    req = urllib.request.Request(url, headers={'User-Agent': user_agent})
    # The context manager closes the response even if read()/decode() raises.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read().decode('utf-8')
# Matches headline anchors such as:
#   <h1 class="dp-b"><a href="https://www.pengfu.com/content_1706980_1.html" target="_blank">...</a>
# Dots are escaped so they match only a literal ".", and the id may have any
# number of digits rather than exactly seven.
_CONTENT_LINK_RE = re.compile(
    r'<h1 class="dp-b"><a href="https://www\.pengfu\.com/content_(\d+)_1\.html"'
)


def get_page(html):
    """Extract the content-page URLs linked from headline anchors in *html*.

    Args:
        html: Page markup to scan, as a ``str``.

    Returns:
        A list of ``https://www.pengfu.com/content_<id>_1.html`` URLs in the
        order they appear in *html*; empty when nothing matches.

    Side effects:
        Prints the resulting list to stdout.
    """
    content_ids = _CONTENT_LINK_RE.findall(html)
    links = [
        "https://www.pengfu.com/content_" + content_id + "_1.html"
        for content_id in content_ids
    ]
    print(links)
    return links
if __name__ == "__main__":
    # Run the scrape only when executed as a script, so that importing this
    # module does not trigger a network request.
    html = get_html()
    get_page(html)  # prints the extracted links
# Note: it works now.