第55讲动动手题1
import re
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup


def main():
    """Search Baidu Baike for a keyword and print each linked entry.

    Prompts for a keyword, fetches the search-result page, then for every
    anchor whose href matches the pattern fetches the entry page and prints
    "<link text> <h2 title> -> <url>".

    Network I/O: one HTTP request for the search page plus one per match.
    """
    word = input('请输入搜索关键字: ')
    # urlencode() already returns a percent-encoded ASCII str; calling
    # .encode('utf-8') here would turn it into bytes, and '%s' % bytes
    # embeds the literal "b'...'" repr into the URL.
    key = urllib.parse.urlencode({'word': word})
    response = urllib.request.urlopen('http://baike.baidu.com/search/word?%s' % key)
    html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')

    # NOTE(review): Baidu Baike entry links are now '/item/...'; on the
    # current site the pattern may need to be re.compile('item').
    for each in soup.find_all(href=re.compile('view')):
        content = ''.join([each.text])
        # quote() percent-encodes the Chinese characters in the href;
        # urlopen() would otherwise raise
        # UnicodeEncodeError: 'ascii' codec can't encode characters ...
        url2 = ''.join(['http://baike.baidu.com/', urllib.parse.quote(each['href'])])
        response2 = urllib.request.urlopen(url2)
        html2 = response2.read()
        # Name the parser explicitly: bs4 warns otherwise and may pick a
        # different parser per machine.
        soup2 = BeautifulSoup(html2, 'html.parser')
        if soup2.h2:
            content = ''.join([content, soup2.h2.text])
        content = ' '.join([content, '->', url2])
        print(content)


if __name__ == '__main__':
    main()
UnicodeEncodeError: 'ascii' codec can't encode characters in position 37-40: ordinal not in range(128)
为什么运行结果会出错?怎样避免这类错误?
本帖最后由 Twilight6 于 2020-8-7 18:39 编辑
按照这里来改就行,soup.find_all(href=re.compile("view")) 改成 soup.find_all(href=re.compile("item"))
055课 爬百度百科“网络爬虫”的词条 问题
https://fishc.com.cn/thread-169631-1-1.html
(出处: 鱼C论坛)
参考代码:
import re
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup


def main():
    """Search Baidu Baike for a keyword and print each linked entry.

    Prompts for a keyword, fetches the search-result page, then for every
    anchor whose href contains 'item' fetches the entry page and prints
    "<link text> <h2 title> -> <url>".

    Network I/O: one HTTP request for the search page plus one per match.
    """
    word = input('请输入搜索关键字: ')
    # urlencode() already returns a percent-encoded ASCII str; the extra
    # .encode('utf-8') in the original embedded "b'...'" into the URL.
    key = urllib.parse.urlencode({'word': word})
    response = urllib.request.urlopen('http://baike.baidu.com/search/word?%s' % key)
    html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')

    for each in soup.find_all(href=re.compile("item")):
        content = ''.join([each.text])
        # Percent-encode the href: '/item/...' links contain Chinese
        # characters, and urlopen() on a raw one raises
        # UnicodeEncodeError: 'ascii' codec can't encode characters ...
        # (exactly the error reported later in this thread).
        url2 = ''.join(['http://baike.baidu.com/', urllib.parse.quote(each['href'])])
        response2 = urllib.request.urlopen(url2)
        html2 = response2.read()
        # Explicit parser avoids bs4's "no parser specified" warning.
        soup2 = BeautifulSoup(html2, 'html.parser')
        if soup2.h2:
            content = ''.join([content, soup2.h2.text])
        content = ' '.join([content, '->', url2])
        print(content)


if __name__ == '__main__':
    main()
import re
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup


def main():
    """Search Baidu Baike for a keyword and print each linked entry.

    Prompts for a keyword, fetches the search-result page, then for every
    anchor whose href matches the pattern fetches the entry page and prints
    "<link text> <h2 title> -> <url>".

    Network I/O: one HTTP request for the search page plus one per match.
    """
    word = input('请输入搜索关键字: ')
    # urlencode() already returns a percent-encoded ASCII str; calling
    # .encode('utf-8') and %-formatting the bytes would embed "b'...'"
    # into the request URL.
    key = urllib.parse.urlencode({'word': word})
    response = urllib.request.urlopen('http://baike.baidu.com/search/word?%s' % key)
    html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')

    for each in soup.find_all(href=re.compile('view')):
        content = ''.join([each.text])
        # quote() percent-encodes Chinese characters in the href so
        # urlopen() does not raise UnicodeEncodeError.
        url2 = ''.join(['https://baike.baidu.com/', urllib.parse.quote(each['href'])])
        response2 = urllib.request.urlopen(url2)
        html2 = response2.read()
        soup2 = BeautifulSoup(html2, 'html.parser')
        if soup2.h2:
            content = ''.join([content, soup2.h2.text])
        content = ' '.join([content, '->', url2])
        print(content)


if __name__ == '__main__':
    main()
# zltzlt 发表于 2020-8-7 18:39
确实可以运行了!感谢!
但是还想问一下,为啥这里要用 urllib.parse.quote,而不是直接用 each['href']?
Twilight6 发表于 2020-8-7 18:38
按照这里来改就行,
055课 爬百度百科“网络爬虫”的词条 问题
可以运行,但为啥还有错误?
UnicodeEncodeError: 'ascii' codec can't encode characters in position 34-42: ordinal not in range(128)
sharank 发表于 2020-8-7 20:43
确实可以运行了!感谢!
但是还想问一下,为啥这里要用urllib.parse.quote, 而不是each['href']?
因为 url 中带有中文字符,而 urlopen 只接受 ASCII 字符,所以需要先用 urllib.parse.quote 对其进行百分号编码。
zltzlt 发表于 2020-8-7 20:45
因为 url 中带有中文字符
好的,感谢 Twilight6 发表于 2020-8-7 18:38
按照这里来改就行,
055课 爬百度百科“网络爬虫”的词条 问题
为啥这里是item呀?而且还是 解决了!!!
其实把re.compile('view')改成re.compile('\#view')就可以了
不过还是得看需求
页:
[1]