|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import urllib.request
import urllib.parse
import re
from bs4 import BeautifulSoup
def test_url(soup):
result = soup.find(text=re.compile("百度百科尚未收录词条"))
if result:
print(result[0:-1]) # 百度这个碧池在最后加了个“符号,给它去掉
return False
else:
return True
def summary(soup):
    """Print the entry's title (plus subtitle, if any) and its summary text.

    Args:
        soup: BeautifulSoup tree of a Baike entry page.
    """
    word = soup.h1.text
    # Append the subtitle when the page has one.
    if soup.h2:
        word += soup.h2.text
    print(word)
    # Look the summary node up once instead of twice.
    intro = soup.find(class_="lemma-summary")
    if intro:
        print(intro.text)
def get_urls(soup):
    """Yield "<link text>[<subtitle>] -> <absolute url>" for each related
    entry link on the page.

    Each candidate page is fetched to pick up its subtitle (h2), so this
    generator performs one HTTP request per yielded item.
    """
    # NOTE(review): hrefs matching "view" may be obsolete on current Baike
    # pages, which use "item" in entry links — confirm against live markup.
    for each in soup.find_all(href=re.compile("view")):
        content = each.text
        # Percent-encode the href: Baike paths may contain non-ASCII
        # characters, and urlopen encodes the request line as ASCII,
        # raising UnicodeEncodeError on raw Chinese text.
        url2 = "http://baike.baidu.com" + urllib.parse.quote(each["href"], safe="/%:?=&")
        response2 = urllib.request.urlopen(url2)
        soup2 = BeautifulSoup(response2.read(), "html.parser")
        if soup2.h2:
            content += soup2.h2.text
        yield content + " -> " + url2
def main():
    """Prompt for a keyword, print its Baidu Baike summary, then page
    through related links ten at a time until the user quits."""
    word = input("请输入关键词:")
    query = urllib.parse.urlencode({"word": word})
    page = urllib.request.urlopen("http://baike.baidu.com/search/word?%s" % query)
    soup = BeautifulSoup(page.read(), "html.parser")
    if not test_url(soup):
        return
    summary(soup)
    print("下边打印相关链接:")
    links = get_urls(soup)
    while True:
        try:
            for _ in range(10):
                print(next(links))
        except StopIteration:
            # No more related links — we are done.
            break
        if input("输入任意字符将继续打印,q退出程序:") == 'q':
            break


if __name__ == "__main__":
    main()
"D:\python 练习\venv\Scripts\python.exe" "D:/python 练习/98.py"
请输入关键词:西宁
西宁(青海省省会)
西宁,古称青唐城、西平郡、鄯州,是青海省省会,国务院批复确定的中国西北地区重要的中心城市
[1] 
。截至2019年,全市下辖5个区、2个县,总面积7660平方千米,建成区面积129平方千米,常住人口238.71万人,城镇人口173.90万人,城镇化率72.85%。
[2-3] 
西宁地处中国西北地区、青海省东部、湟水中游河谷盆地,是青藏高原的东方门户,古“丝绸之路”南路和“唐蕃古道”的必经之地,自古就是西北交通要道和军事重地,素有”西海锁钥“、海藏咽喉之称,是世界高海拔城市之一,青海省的政治、经济、科教、文化、交通和通讯中心,也是国务院确定的内陆开放城市,中央军委西宁联勤保障中心驻地。
[4] 
西宁历史文化渊源流长,有着得天独厚的自然资源,绚丽多彩的民俗风情,是青藏高原一颗璀璨的明珠,取”西陲安宁“之意。先后荣获全国卫生城市、中国特色魅力城市200强、中国优秀旅游城市、中国园林绿化先进城市、国家森林城市、全国文明城市
[5] 
等荣誉称号,是”无废城市”建设试点城市。
[6] 
2020年6月,经中央依法治国委入选为第一批全国法治政府建设示范地区和项目名单。
[7] 
下边打印相关链接:
Traceback (most recent call last):
File "D:/python 练习/98.py", line 66, in <module>
main()
File "D:/python 练习/98.py", line 56, in main
print(next(each))
File "D:/python 练习/98.py", line 32, in get_urls
response2 = urllib.request.urlopen(url2)
File "D:\python38\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "D:\python38\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "D:\python38\lib\urllib\request.py", line 542, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "D:\python38\lib\urllib\request.py", line 502, in _call_chain
result = func(*args)
File "D:\python38\lib\urllib\request.py", line 1348, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "D:\python38\lib\urllib\request.py", line 1319, in do_open
h.request(req.get_method(), req.selector, req.data, headers,
File "D:\python38\lib\http\client.py", line 1230, in request
self._send_request(method, url, body, headers, encode_chunked)
File "D:\python38\lib\http\client.py", line 1241, in _send_request
self.putrequest(method, url, **skips)
File "D:\python38\lib\http\client.py", line 1096, in putrequest
self._output(self._encode_request(request))
File "D:\python38\lib\http\client.py", line 1176, in _encode_request
return request.encode('ascii')
UnicodeEncodeError: 'ascii' codec can't encode characters in position 36-39: ordinal not in range(128)
进程已结束,退出代码 1
和示例运行结果不一样
这样即可,甲鱼哥的标签 view 已经过时了,现在百科的链接的关键字都是 item
import urllib.request
import urllib.parse
import re
from bs4 import BeautifulSoup
def test_url(soup):
result = soup.find(text=re.compile("百度百科尚未收录词条"))
if result:
print(result[0:-1]) # 百度这个碧池在最后加了个“符号,给它去掉
return False
else:
return True
def summary(soup):
    """Print the entry's title (plus subtitle, if any) and its summary text.

    Args:
        soup: BeautifulSoup tree of a Baike entry page.
    """
    word = soup.h1.text
    # Append the subtitle when the page has one.
    if soup.h2:
        word += soup.h2.text
    print(word)
    # Look the summary node up once instead of twice.
    intro = soup.find(class_="lemma-summary")
    if intro:
        print(intro.text)
def get_urls(soup):
    """Yield "<link text>[<subtitle>] -> <absolute url>" for each related
    entry link on the page.

    Each candidate page is fetched to pick up its subtitle (h2), so this
    generator performs one HTTP request per yielded item.
    """
    # Baike entry links use "item" in the href ("view" is obsolete); the
    # first 7 matches are site navigation links, so skip them.
    for each in soup.find_all(href=re.compile("item"))[7:]:
        content = each.text
        # Percent-encode the href: entry paths contain Chinese characters,
        # and urlopen encodes the request line as ASCII, so a raw href
        # raises UnicodeEncodeError (see the pasted traceback).
        url2 = "http://baike.baidu.com" + urllib.parse.quote(each["href"], safe="/%:?=&")
        response2 = urllib.request.urlopen(url2)
        soup2 = BeautifulSoup(response2.read(), "html.parser")
        if soup2.h2:
            content += soup2.h2.text
        yield content + " -> " + url2
def main():
    """Search Baidu Baike for a user-supplied keyword, print the entry's
    summary, then page through related links ten at a time."""
    keyword = input("请输入关键词:")
    query = urllib.parse.urlencode({"word": keyword})
    html = urllib.request.urlopen(
        "http://baike.baidu.com/search/word?%s" % query).read()
    soup = BeautifulSoup(html, "html.parser")
    if not test_url(soup):
        return
    summary(soup)
    print("下边打印相关链接:")
    link_gen = get_urls(soup)
    while True:
        try:
            for _ in range(10):
                print(next(link_gen))
        except StopIteration:
            # Generator exhausted — nothing more to show.
            break
        else:
            if input("输入任意字符将继续打印,q退出程序:") == 'q':
                break


if __name__ == "__main__":
    main()
|
|