I wrote a crawler and it throws an error; how can I fix it?
# Crawl some questions and answers from Zhihu's explore page
import re
from pyquery import PyQuery as pq
import requests
from bs4 import BeautifulSoup
import time

url = 'https://www.zhihu.com/explore'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
}
html = requests.get(url, headers=headers).text
soup = BeautifulSoup(html, 'lxml')
doc = pq(html)
items = doc('.ExploreCollectionCard-contentList').items()
items1 = doc('.ExploreSpecialCard-contentItem').items()

# Titles and answer excerpts for the first kind of card
for item in items:
    print(item('.ExploreCollectionCard-contentTitle').text())
    print(item('.ExploreCollectionCard-contentExcerpt').text())
    print('--------------------------------- manual divider -------------------------')

# Titles and answers for the other kind of card
for item in items1:
    print(item('.ExploreSpecialCard-contentItem').text())
    # The answer is not in this page's source, so follow the given link and crawl the answer there
    link = 'https://www.zhihu.com' + item('.ExploreSpecialCard-contentTitle').attr('href')
    html1 = requests.get(url=link, headers=headers).text
    doc = pq(html1)
    items2 = doc('#js-initialData').items()
    for item3 in items2:
        print(re.findall(r'"content":"(.*?)"', item3.text()))
    print('--------------------------------- manual divider -------------------------')

What is the problem?

I took a look: the URL that errors and the URL that does not error follow different patterns,
but you are parsing both of them with a single rule.
The URL that errors:
The URL that does not error:
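Since the two addresses themselves are not quoted above, here is a small debugging sketch (my own addition, reusing the url and headers names from the code) that prints every link the loop would build before requesting it, so the address with the odd pattern stands out:

import requests
from pyquery import PyQuery as pq

# Debugging sketch: list the href of every special card and the link that would be built from it
page = pq(requests.get(url, headers=headers).text)
for card in page('.ExploreSpecialCard-contentItem').items():
    href = card('.ExploreSpecialCard-contentTitle').attr('href')
    print('href =', repr(href), '-> built link =', 'https://www.zhihu.com' + (href or ''))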
The URL parsing goes wrong, which is why the code reports an error.
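The thread does not show the failing address, but a common cause that matches this diagnosis is an href that is already a full URL (or missing entirely), so plain string concatenation produces a malformed link. A minimal sketch of a guard, assuming that is the case, using urllib.parse.urljoin:

from urllib.parse import urljoin

BASE = 'https://www.zhihu.com'

def build_link(href):
    """Return a full Zhihu URL, or None if the card has no usable href."""
    if not href:              # attr('href') can return None
        return None
    # urljoin leaves absolute hrefs untouched and resolves relative paths against BASE
    return urljoin(BASE, href)

# In the second loop, replace the plain concatenation with something like:
#     link = build_link(item('.ExploreSpecialCard-contentTitle').attr('href'))
#     if link is None:
#         continue
#     html1 = requests.get(link, headers=headers).text

If some links point to a different Zhihu subdomain, that page's layout (and its #js-initialData content) may also differ, in which case the regex would simply return an empty list rather than crash; printing the link next to the matches makes it easier to see which address actually fails.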