|
发表于 2021-6-18 23:49:40
|
显示全部楼层
本楼为最佳答案
- from urllib import request
- import re
- from bs4 import BeautifulSoup
def main(url="http://baike.baidu.com/view/284853.htm"):
    """Print every link on a Baidu Baike page whose href contains "view".

    Fetches ``url``, parses the HTML with BeautifulSoup's ``html.parser``
    backend, and prints one line per matching tag in the form
    ``<link text> -> <absolute URL>``.

    Args:
        url: Page to fetch. Defaults to the entry used by the original
            snippet, so ``main()`` can still be called with no arguments.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Function-scope import keeps this block self-contained; the file's
    # top-level imports do not bring urljoin into scope.
    from urllib.parse import urljoin

    # Override the default User-Agent to get past the site's anti-scraping
    # check.
    headers = {'User-Agent': 'Firefox'}
    req = request.Request(url, headers=headers)

    # The context manager closes the connection even if read/decode fails;
    # the timeout stops a stalled server from hanging the script forever.
    with request.urlopen(req, timeout=10) as response:
        html = response.read().decode('utf-8')

    soup = BeautifulSoup(html, "html.parser")
    for each in soup.find_all(href=re.compile("view")):
        # urljoin leaves absolute hrefs untouched and resolves relative ones
        # against the page URL, instead of blindly prefixing the host.
        print(each.text, "->", urljoin(url, each["href"]))


if __name__ == "__main__":
    main()
复制代码 |
|