# -*- coding: utf-8 -*-
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import re
import random


def main():
    # Read the search keyword and build the Baidu Baike search URL
    search = input('Enter a keyword: ')
    search = urllib.parse.urlencode({'word': search})
    url = 'https://baike.baidu.com/search/word?%s' % search
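    # Note: the /search/word endpoint typically redirects straight to the
    # best-matching entry page, and urlopen follows that redirect itself.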
    # Set up a proxy pool
    ip1 = '117.191.11.111:8080'
    ip2 = '222.223.115.30:41303'
    ip3 = '121.61.0.86:9999'
    ip4 = '101.251.216.103:8080'
    iplist = [ip1, ip2, ip3, ip4]
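    # These are sample free-proxy addresses and are almost certainly stale;
    # replace them with live proxies, or drop the ProxyHandler to go direct.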
    proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})
    opener = urllib.request.build_opener(proxy_support)
    # The attribute is `addheaders` (plural); `addheader` is silently ignored
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36')]
    urllib.request.install_opener(opener)
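    # install_opener makes this opener the global default, so every
    # urllib.request.urlopen call below goes through the chosen proxy.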
    # Fetch the entry page
    response = urllib.request.urlopen(url)
    html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    # Walk every link on the page that points to another entry
    for each in soup.find_all(href=re.compile('item')):
        url2 = ''.join(['https://baike.baidu.com', each['href']])
        # Handle Chinese characters in links such as the 秒懂星课堂 pages
        result = re.search(u'[\u4e00-\u9fa5]+', url2)
        if result:
            context = result.group()
            url2 = ''.join([url2[0:result.start()], urllib.parse.quote(context)])
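        # urlopen raises UnicodeEncodeError on non-ASCII URLs, so any Chinese
        # run in the path is percent-encoded before the request is made.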
        response2 = urllib.request.urlopen(url2)
        html2 = response2.read().decode('utf-8')
        soup2 = BeautifulSoup(html2, 'html.parser')
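        # Assumption: the first <h2> on an entry page is its first section
        # heading; pages without one fall through to the bare link below.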
        if soup2.h2:
            print(each.text, soup2.h2.text, '->', url2)
        else:
            print(each.text, '->', url2)

if __name__ == '__main__':
    main()