|
发表于 2020-12-2 10:21:09
|
显示全部楼层
本楼为最佳答案
 - import urllib.request as ur
- import urllib.parse as upa
- from bs4 import BeautifulSoup as bso
- import re
_BAIKE_ROOT = 'http://baike.baidu.com'

# Headers sent with every request so the site sees a regular browser.
_REQUEST_HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'),
}

# Characters left untouched when percent-encoding an href: the URL delimiters,
# plus '%' so that a link that is already encoded is not encoded a second time.
_HREF_SAFE_CHARS = '/:?=&#%'


def _fetch_html(url):
    """Download *url* with the browser User-Agent and return the body as UTF-8 text."""
    # Headers must be handed to Request itself; assigning ``addheaders`` on a
    # Request object is silently ignored (that attribute belongs to openers).
    request = ur.Request(url, headers=_REQUEST_HEADERS)
    # The context manager closes the HTTP response once the body has been read.
    with ur.urlopen(request) as response:
        return response.read().decode('utf-8')


def main():
    """Prompt for a keyword and list the entry links found on its Baike page.

    The keyword page is fetched, every tag whose ``href`` contains ``item`` is
    followed, and one line per link is printed in the form
    ``<link text><first h2 text of the linked page>-><linked url>``.
    The h2 part is omitted when the linked page has no ``<h2>`` element.

    Raises:
        urllib.error.URLError: if any page cannot be fetched.
        UnicodeDecodeError: if a fetched page is not valid UTF-8.
    """
    keyword = input('请输入关键词:')
    entry_url = '%s/item/%s' % (_BAIKE_ROOT, upa.quote(keyword))
    soup = bso(_fetch_html(entry_url), 'html.parser')

    for link in soup.find_all(href=re.compile('item')):
        # Encode non-ASCII characters but keep URL delimiters intact, then
        # resolve against the site root so relative and absolute hrefs both work.
        encoded_href = upa.quote(link['href'], safe=_HREF_SAFE_CHARS)
        target_url = upa.urljoin(_BAIKE_ROOT, encoded_href)

        linked_page = bso(_fetch_html(target_url), 'html.parser')

        parts = [link.text]
        heading = linked_page.h2
        if heading:
            # Use the heading of the *linked* page; ``.text`` is always a str,
            # whereas ``.string`` is None when the h2 contains nested tags.
            parts.append(heading.text)
        parts.extend(['->', target_url])
        print(''.join(parts))
# Script entry point: start the crawler only when this file is run directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
复制代码 |
|