新浪新闻首页新闻标题和链接
本帖最后由 wcq15759797758 于 2021-8-27 17:12 编辑。很简单的爬虫:
import requests
import cchardet
import traceback
from lxml import etree
def downloader(url, timeout=10, headers=None, debug=False, binary=False):
    """Fetch ``url`` and pass the downloaded page to ``title``.

    Args:
        url: Address of the page to request.
        timeout: Value forwarded to ``requests.get`` as its timeout.
        headers: Optional request headers. When given, they replace the
            built-in browser ``User-Agent`` header entirely.
        debug: When true, print the traceback of a failed download.
        binary: When true, hand the raw response bytes to ``title``
            instead of decoded text.

    Returns:
        Whatever ``title`` returns for the downloaded page, or ``None``
        when the download failed or the response body was empty.
    """
    _headers = headers if headers else {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/92.0.4515.159 Safari/537.36'
        )
    }
    try:
        r = requests.get(url, headers=_headers, timeout=timeout)
        if binary:
            html = r.content
        else:
            # The detector may report no encoding at all; fall back to
            # UTF-8 and replace undecodable bytes so that a few bad bytes
            # do not discard the whole page.
            encoding = cchardet.detect(r.content)['encoding']
            html = r.content.decode(encoding or 'utf-8', errors='replace')
    except Exception:
        # Best-effort download: report the failure instead of raising, but
        # let KeyboardInterrupt / SystemExit propagate (a bare ``except``
        # would have swallowed them).
        if debug:
            traceback.print_exc()
        msg = 'failed download: {}'.format(url)
        print(msg)
        return None
    if not html:
        # Nothing to parse; an empty document cannot yield any links.
        return None
    return title(html)
def title(html):
    """Print the text and target of every ``<a target="_blank">`` in ``html``.

    Each matching link is turned into a dict with the keys ``'title'``
    (the link's direct text) and ``'url'`` (its ``href``), both cleaned by
    ``processing``. Only links whose cleaned text is longer than four
    characters are printed.

    Args:
        html: HTML document as ``str`` or ``bytes``. Empty or
            whitespace-only input is ignored.

    Returns:
        None. Matching items are written to standard output.
    """
    if not html or not html.strip():
        # An empty document has no tree to query.
        return
    tree = etree.HTML(html)
    if tree is None:
        # The parser produced no root element; nothing to search.
        return
    # ``link`` rather than ``title`` so the loop variable does not shadow
    # this function's own name.
    for link in tree.xpath('//a[@target="_blank"]'):
        item = {
            'title': str(processing(link.xpath('./text()'))),
            'url': str(processing(link.xpath('./@href'))),
        }
        # Presumably filters out short navigation labels -- TODO confirm.
        if len(item['title']) > 4:
            print(item)
def processing(strs):
    """Concatenate text fragments with all whitespace removed.

    Args:
        strs: Iterable of strings (for example the list returned by an
            XPath ``text()`` query). ``None`` or an empty iterable yields
            an empty string.

    Returns:
        A single ``str`` made of every fragment, in order, with every
        whitespace character (spaces, tabs, newlines, ...) stripped out.
    """
    if not strs:
        return ''
    # ``fragment.split()`` with no argument splits on any whitespace run,
    # so re-joining the pieces drops all whitespace inside the fragment.
    return ''.join(''.join(fragment.split()) for fragment in strs)
if __name__ == '__main__':
    # Script entry point: download the Sina News front page and let
    # ``downloader`` hand it on for link extraction.
    url = 'https://news.sina.com.cn/'
    downloader(url)
    # NOTE: the original post had forum replies ("learned something",
    # "thanks", "great", ...) fused onto the call line above. They were
    # page residue, not code, and made the file unparseable.
页:
[1]