|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
- from bs4 import BeautifulSoup
- import urllib.request
- import re
def get_html(url, *, timeout=10.0, headers=None):
    """Fetch *url* and return the response body decoded to text.

    Args:
        url: Address to request; any scheme ``urllib.request`` can open.
        timeout: Seconds to wait on blocking network operations, so a
            stalled server cannot hang the caller indefinitely.
        headers: Optional mapping of extra request headers, e.g. a browser
            ``User-Agent`` for sites that reject urllib's default one.

    Returns:
        The body as ``str``, decoded with the charset declared in the
        response's Content-Type header, or UTF-8 when none is declared.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for
            HTTP error statuses). A timeout while reading the body may
            surface as a socket-level timeout error instead.
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    request = urllib.request.Request(url, headers=dict(headers or {}))
    # The context manager closes the connection even if read/decode raises.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        # Honour the server-declared charset; fall back to the UTF-8 that
        # this function has always assumed.
        charset = response.headers.get_content_charset() or 'utf-8'
        return response.read().decode(charset)
# Download the Baidu Baike entry whose URL-encoded title is "网络爬虫"
# (web crawler), then save the first three entries it links to as
# 0.txt, 1.txt and 2.txt in the current working directory.
url = 'https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB'
html = get_html(url)
s = BeautifulSoup(html, 'lxml')
# Anchor the pattern so only site-relative "/item/..." hrefs are selected.
# An unanchored '/item' would also match absolute or protocol-relative
# links, and prefixing those with the site origin below yields a bad URL.
link = s.find_all('a', href=re.compile(r'^/item/'))
for count, anchor in enumerate(link[:3]):
    page = get_html('https://baike.baidu.com' + anchor['href'])
    # Write explicitly as UTF-8 so the dump does not depend on the
    # platform's default encoding.
    with open('%d.txt' % count, 'w', encoding='utf-8') as f:
        f.write(page)
复制代码
|
|