|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 wcq15759797758 于 2021-8-27 17:12 编辑
很简单的爬虫
- import requests
- import cchardet
- import traceback
- from lxml import etree
def downloader(url, timeout=10, headers=None, debug=False, binary=False):
    """Download *url* and hand the page to ``title`` for link extraction.

    Args:
        url: Address to request with HTTP GET.
        timeout: Timeout in seconds passed to ``requests.get``.
        headers: Optional request headers. When given they replace the
            default browser-like ``User-Agent`` header entirely.
        debug: When true, print the traceback of a failed download.
        binary: When true, pass the raw response bytes on; otherwise decode
            them with the encoding detected by ``cchardet``.

    Returns:
        Whatever ``title`` returns for the downloaded page, or ``None`` when
        the download or decoding failed.
    """
    default_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}
    request_headers = headers if headers else default_headers
    try:
        r = requests.get(url, headers=request_headers, timeout=timeout)
        if binary:
            html = r.content
        else:
            # Guard against the detector reporting no encoding; decoding
            # with ``None`` would raise a TypeError.
            encoding = cchardet.detect(r.content)['encoding'] or 'utf-8'
            html = r.content.decode(encoding)
    except Exception:
        # Best-effort download: report the failure and carry on, but let
        # KeyboardInterrupt / SystemExit propagate (a bare ``except:`` would
        # swallow them too).
        if debug:
            traceback.print_exc()
        print('failed download: {}'.format(url))
        # There is nothing to parse; handing an empty document to the
        # parser would only fail a second time.
        return None
    return title(html)
def title(html):
    """Print the text and target of each ``<a target="_blank">`` link.

    For every matching anchor a dict ``{'title': ..., 'url': ...}`` is built
    from its direct text nodes and its ``href`` attribute, both with all
    whitespace removed by ``processing``. Only items whose title is longer
    than four characters are printed.

    Args:
        html: Page markup as ``str`` or ``bytes``. Empty input is ignored.

    Returns:
        None. Results are written to standard output.
    """
    if not html:
        # Nothing to parse (e.g. the download failed upstream).
        return
    tree = etree.HTML(html)
    if tree is None:
        # The parser produced no document for this input.
        return
    # ``link`` rather than ``title`` so the loop variable does not shadow
    # this function's own name.
    for link in tree.xpath('//a[@target="_blank"]'):
        item = {
            'title': processing(link.xpath('./text()')),
            'url': processing(link.xpath('./@href')),
        }
        if len(item['title']) > 4:
            print(item)
def processing(strs):
    """Concatenate *strs* into one string with all whitespace removed.

    Args:
        strs: Iterable of strings (for example the list returned by an
            XPath ``text()`` or attribute query).

    Returns:
        A single ``str`` made of every fragment, in order, with spaces,
        tabs, newlines and other whitespace stripped out. An empty iterable
        yields ``''``.
    """
    # ``fragment.split()`` splits on any whitespace run and drops empty
    # pieces, so joining the pieces removes whitespace inside the fragment
    # as well as around it. A single join avoids repeated concatenation.
    return ''.join(''.join(fragment.split()) for fragment in strs)
-
if __name__ == '__main__':
    # Script entry point: scrape the link titles from the Sina news front page.
    downloader('https://news.sina.com.cn/')
复制代码 |
|