|
发表于 2019-5-17 16:24:23
|
显示全部楼层
- from bs4 import BeautifulSoup
- import requests
- import codecs
- import os,re
- headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
def get_url_list(url):
    """Fetch ``url`` and print the text of its ``<div id="content">`` blocks.

    The page is downloaded with the module-level ``headers``, parsed with
    BeautifulSoup's ``lxml`` parser, and every run of ASCII letters, digits
    and the markup characters ``< > / ( ) ; = [ ] "`` is removed from the
    string form of the matched elements before printing.

    Args:
        url: Address of the page to download.

    Returns:
        None; the cleaned text is only written to standard output.
    """
    # ``headers`` must be passed by keyword: the second positional parameter
    # of ``requests.get`` is ``params``, which would append the header
    # entries to the query string instead of sending them as HTTP headers.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'lxml')
    content_blocks = soup.find_all('div', attrs={'id': 'content'})
    # Single-quoted raw string so the literal double quote inside the
    # character class no longer terminates the pattern early.
    cleaned = re.sub(r'[a-zA-Z0-9<>/();=\[\]"]+', '', str(content_blocks))
    print(cleaned)
-
-
# Script entry point: run the scraper against one hard-coded page.
if __name__ == '__main__':
    url = 'http://www.xinshubao.net/22/22070/3179772_2.html'
    # get_url_list prints its result; the value bound here is whatever it
    # returns.
    url_list = get_url_list(url)
复制代码
试一下看 |
|