|
5鱼币
import urllib.request
def download_html(url: str) -> str:
    """Fetch *url* and return the response body decoded as UTF-8.

    The request carries a desktop-Chrome ``User-Agent`` header rather than
    urllib's default one.

    Args:
        url: Address to fetch, passed to ``urllib.request.Request``.

    Returns:
        The complete response body as text.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = {
        # Adjacent literals are concatenated verbatim, so each fragment must
        # end with the space that separates the User-Agent product tokens.
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/90.0.4430.85 Safari/537.36"
        ),
    }
    req = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the response even if read/decode raises.
    with urllib.request.urlopen(req) as response:
        return response.read().decode("utf-8")
import re

# Fetch the Douban Top 250 page (the call previously misspelled the
# function name, which raised NameError).
html = download_html("https://movie.douban.com/top250")

# Movie detail links have the form https://movie.douban.com/subject/<digits>/.
# Raw string with escaped dots so "." matches only a literal dot.
pattern = r'https://movie\.douban\.com/subject/[0-9]+/'

# A set collapses repeated matches of the same link.
urls = set(re.findall(pattern, html))

print("urls count=%d" % len(urls))
# Sorted so the listing is stable between runs; set order is arbitrary.
for url in sorted(urls):
    print(url)
改好之后我这运行正常
|
|