Scraping a site's article titles, author info, summaries, and article content
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup  # parse the page and extract data
import re  # regular expressions for text matching
import random
url = 'https://cybernews.com/news/'
headers = {'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
ret = Request(url)
html = urlopen(ret)
bs = BeautifulSoup(html, 'html.parser')
names = bs.find_all("h3",{"class":"jeg_post_title"})
for name in names:
    name = name.get_text()
    print(name)
neirongs = bs.find_all("div",{"class":"jeg_post_meta"})
for neirong in neirongs:
    neirong = neirong.get_text()
    print(neirong)
contents = bs.find_all("div",{"class":"jeg_post_excerpt"})
for content in contents:
    content = content.get_text()
    print(content)
I've just started learning web scraping. Could someone take a look and point out where the problems are? I'm also not sure how to go about scraping the article content itself.

Reply by YunGuo (last edited 2021-1-28 17:52):
You define headers and then never pass it in? Your headers is a set, not a dict? And you never read the response? I'd suggest spending more time on the basics before diving in.

Change it to this:
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
ret = Request(url, headers=headers)
html = urlopen(ret)
bs = BeautifulSoup(html.read(), 'html.parser')

Also, you don't need three separate loops for those three items; that's needlessly cumbersome.
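A minimal sketch of that single-loop idea, continuing from the bs object in the corrected code above and assuming the same jeg_* class names (zip pairs the results positionally, so it relies on the three lists lining up one-to-one):

titles = bs.find_all("h3", {"class": "jeg_post_title"})
metas = bs.find_all("div", {"class": "jeg_post_meta"})
excerpts = bs.find_all("div", {"class": "jeg_post_excerpt"})

# One pass over all three result lists; zip stops at the shortest list.
for title, meta, excerpt in zip(titles, metas, excerpts):
    print(title.get_text(strip=True))
    print(meta.get_text(strip=True))
    print(excerpt.get_text(strip=True))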
Reply by 笨鸟学飞 (last edited 2021-2-1 20:04):

import requests
from lxml import etree
if __name__ == '__main__':
    url = 'https://cybernews.com/news/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
    res = requests.get(url, headers=headers)
    res.encoding = res.apparent_encoding
    tree = etree.HTML(res.text)
    # titles
    titles = tree.xpath('//div[@class="jeg_posts jeg_load_more_flag"]/article/h3/a/text()')
    # authors
    authors = tree.xpath('//div[@class="jeg_meta_author"]/a/text()')
    # excerpts
    excerpts = tree.xpath('//div[@class="jeg_post_excerpt"]/p/text()')
    # detail-page URLs
    text_urls = tree.xpath('//div[@class="jeg_posts jeg_load_more_flag"]/article/h3/a/@href')
    with open('./1.txt', 'w', encoding='utf-8') as fp:
        for title, author, excerpt, text_url in zip(titles, authors, excerpts, text_urls):
            fp.write(title + '\nby:' + author + '\n' + excerpt + '\n' + text_url + '\n\n')
==============
Your headers is written incorrectly: the key/value colon is missing, so Python treats it as a set instead of a dict.
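To make that concrete (the UA string is shortened here just for readability):

print(type({'user-agent: Mozilla/5.0'}))    # colon is inside the string -> <class 'set'>
print(type({'user-agent': 'Mozilla/5.0'}))  # colon separates key and value -> <class 'dict'>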
As for scraping the detail pages, write that part yourself; having got this far, I'm sure it won't stump you.
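If you do get stuck, here is a rough sketch of that detail-page step, reusing the listing-page code above; the XPath for the article body is an assumption, not something taken from this thread, so check it against a real article page:

import requests
from lxml import etree

url = 'https://cybernews.com/news/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}

# Collect the detail-page URLs the same way the code above does.
listing = requests.get(url, headers=headers)
tree = etree.HTML(listing.text)
text_urls = tree.xpath('//div[@class="jeg_posts jeg_load_more_flag"]/article/h3/a/@href')

for text_url in text_urls:
    detail = requests.get(text_url, headers=headers)
    detail.encoding = detail.apparent_encoding
    detail_tree = etree.HTML(detail.text)
    # The "content" class here is a guess -- open one article, inspect the element
    # that wraps the body text, and adjust the XPath to match.
    paragraphs = detail_tree.xpath('//div[contains(@class, "content")]//p/text()')
    print('\n'.join(p.strip() for p in paragraphs if p.strip()))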