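# Forum page banner captured with the paste (not part of the script):
# "Register now to make more friends and unlock more features. You need to
# log in to download or view this; no account yet? Register now."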
import re
import urllib.request
from urllib.parse import urljoin

from bs4 import BeautifulSoup
def get_html(url, timeout=30):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address to request; passed straight to
            ``urllib.request.Request``.
        timeout: Seconds to wait on blocking network operations before
            giving up. Without a timeout a stalled server would hang the
            caller indefinitely.

    Returns:
        The complete response body as ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    request = urllib.request.Request(url)
    # If the server rejects urllib's default User-Agent, set a browser-like
    # one here, e.g. request.add_header('User-Agent', 'Mozilla/5.0 ...').
    # The context manager closes the response even if read()/decode() raises.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8')
# Start page: the Baidu Baike entry whose percent-encoded name decodes to
# "网络爬虫".
url = 'https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB'
html = get_html(url)
s = BeautifulSoup(html, 'lxml')
# Every <a> whose href contains '/item' (the regex is searched, not anchored).
link = s.find_all('a', href=re.compile('/item'))
# Save the first three linked pages as 0.txt, 1.txt and 2.txt.
for count, anchor in enumerate(link[:3]):
    # urljoin resolves relative hrefs against the start page and leaves
    # absolute or protocol-relative hrefs intact, where plain string
    # concatenation with the host would yield a malformed URL.
    page = get_html(urljoin(url, anchor['href']))
    with open('%d.txt' % count, 'w', encoding='utf-8') as f:
        f.write(page)
# (stray "|" left over from the forum paste; not part of the script)