Wrote this back when I was learning Python and haven't changed it since. Take a look.
import re

import bs4
import requests


def openurl(url):
    """Download the page and return it as a parsed BeautifulSoup tree."""
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4209.400'}
    # Session cookie copied out of a logged-in browser; it expires, so replace it with your own.
    cookie = {'cookie': 'BAIKE_SHITONG=%7B%22data%22%3A%2262e53b847649f191f0341408dc015051c3815b8d95a146376b6ddf7ab1f30629149107f49d8554ff26579436ccfb408ae1df5539ef6fd355b388cdebab039e69b4bee62df650bc9a1935a1a9fb08e754db95811b3f85703579e6e0a28ce9a56dba1eb497ee18b073aee2abbc54a4612da2d7cc5ed1c119b03a3e5744c4d8b54a%22%2C%22key_id%22%3A%2210%22%2C%22sign%22%3A%22fdd414b4%22%7D; PSTM=1583827166; BAIDUID=0090B882C7DE33774FA3F765E6F31BE5:FG=1; BIDUPSID=4E3FF7727E5D7436F281D44BD0E01463; BDUSS=JIOEU3UTVFTGF6flBZOG5jRUhFaE9yT1lmby1oUDEydlJybVg2V2RuRUpUZHRlSVFBQUFBJCQAAAAAAAAAAAEAAAC4a2VizOzfsdi8tcIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAnAs14JwLNeOH; BDUSS_BFESS=JIOEU3UTVFTGF6flBZOG5jRUhFaE9yT1lmby1oUDEydlJybVg2V2RuRUpUZHRlSVFBQUFBJCQAAAAAAAAAAAEAAAC4a2VizOzfsdi8tcIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAnAs14JwLNeOH; delPer=0; ZD_ENTRY=baidu; BDRCVFR[S_ukKV6dOkf]=PfwjKziWnFsTgD8mvqV; H_BDCLCKID_SF_BFESS=tbCjoIDMJC-3f-cPbKTMMt_e2x7t54CXKKOLVhQRBp7keq8CD6O0Kl-Ijp5BtlRiWeAqhT6FLtoKSnc2y5jHhpFSK-6UQpOKWNkOLDOCWq3psIJM0p_WbT8ULf5H5xCqaKviaKOEBMb1M56DBT5h2M4qMxtOLR3pWDTm_q5TtUJMeCnTD-Dhe6j-jGLOt6Dff5vfL5rD5RT0j-5YKKTjhPrMKnQTWMT-0bFHVxOGtKjKqRcEM-TkLTKRjfJiQ6byKHn7_JLbytLbJbcR0tJcDbLv5pJ4XUQxtNRy0DnjtpvhHxcGMxbobUPUDUc9LUkJHmcdot5yBbc8eIna5hjkbfJBQttjQn3hfIkj0DKLK-oj-DLxjjAB3e; BCLID=7417333749751258671; BDSFRCVID=D9_OJeC624aGiU7rjMDIJC6X5gy_SzbTH6f38_KYEq7a3p-jhueHEG0Pof8g0KubZujCogKK3mOTH4-F_2uxOjjg8UtVJeC6EG0Ptf8g0f5; H_BDCLCKID_SF=tbCjoIDMJC-3f-cPbKTMMt_e2x7t54CXKKOLVhQRBp7keq8CD6O0Kl-Ijp5BtlRiWeAqhT6FLtoKSnc2y5jHhpFSK-6UQpOKWNkOLDOCWq3psIJM0p_WbT8ULf5H5xCqaKviaKOEBMb1M56DBT5h2M4qMxtOLR3pWDTm_q5TtUJMeCnTD-Dhe6j-jGLOt6Dff5vfL5rD5RT0j-5YKKTjhPrMKnQTWMT-0bFHVxOGtKjKqRcEM-TkLTKRjfJiQ6byKHn7_JLbytLbJbcR0tJcDbLv5pJ4XUQxtNRy0DnjtpvhHxcGMxbobUPUDUc9LUkJHmcdot5yBbc8eIna5hjkbfJBQttjQn3hfIkj0DKLtDI-hDPGD6t35n-Wqxj-aRT3bC5jsJOOaCvUbf7Oy4oTj6j3-tJ0L6bH2ejRalcnJfPhePT30n6D3MvBbG-OyMnk36Cf3COMHRcsfJ5SQft20-4EeMtjBbQayjTB5b7jWhk5Dq72y58VQlRX5q79atTMfNTJ-qcH0KQpsIJM5-DWbT8EjH62btt_tJ-eoD3P; Hm_lvt_55b574651fcae74b0a9f1cf9c8d7c93a=1599823721,1599823735,1600168241,1600421681; Hm_lpvt_55b574651fcae74b0a9f1cf9c8d7c93a=1600423997; PSINO=7; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=32617_1430_32744_31254_32723_7632_32115_7564_22158'}
    res = requests.get(url, headers=header, cookies=cookie)
    res.encoding = res.apparent_encoding  # guess the real charset to avoid mojibake
    return bs4.BeautifulSoup(res.text, 'html.parser')

def introduce_crawl(soup):
    """Print the entry's title and its summary paragraphs."""
    head = soup.find('dd', class_='lemmaWgt-lemmaTitle-title')
    temp = soup.find('div', class_='lemma-summary')
    content = temp.find_all('div', class_='para')
    cp = re.compile('<[^>]+?>')  # strips any leftover HTML tags
    # Single-meaning entries have no <h2> sub-title, so guard against None.
    subtitle = head.h2.text if head.h2 else ''
    print(head.h1.text + subtitle)
    for each in content:
        key_content = cp.sub('', str(each))
        print('\t' + key_content)

def depth_crawl(soup):
    """Print related links ten at a time, pausing for input between batches."""
    ploy = soup.find('ul', class_='polysemantList-wrapper cmn-clearfix')
    main = soup.find('div', class_='main-content')
    a = await_ten(ploy, main)
    print('Related links below:')
    while True:
        exhausted = False
        for i in range(10):
            try:
                next(a)
            except StopIteration:
                exhausted = True  # generator is done; remember so the outer loop exits too
                break
        if exhausted:
            break
        temp = input('Enter anything to keep printing, or q to quit: ')
        if temp == 'q':
            break

def await_ten(ploy, main):
    """Generator: prints one related link per next() call so the caller can batch them."""
    # Single-meaning entries have no polysemant list, so ploy can be None.
    if ploy is not None:
        for each in ploy.find_all(href=re.compile('/item')):
            print(each.text + '(' + each.get('title', '') + ')' + '->' + 'https://baike.baidu.com' + each['href'])
            yield
    for each in main.find_all(href=re.compile('/item')):
        print(each.text.replace('\n', '') + '->' + 'https://baike.baidu.com' + each['href'])
        yield
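
# Design note: despite the name, await_ten is an ordinary generator, not a
# coroutine. Each yield pauses right after printing one link, which is what
# lets depth_crawl dole the links out ten at a time between input() prompts.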

def main():
    # name = input('Enter a keyword: ')
    name = '猪八戒'
    url = 'https://baike.baidu.com/item/' + name
    soup = openurl(url)
    # Baidu Baike serves a "the page you visited does not exist" notice for unknown entries.
    if soup.find(text=re.compile('您所访问的页面不存在')):
        print(soup.find('p', class_='sorryCont').text)
    else:
        introduce_crawl(soup)
        depth_crawl(soup)


if __name__ == '__main__':
    main()
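
One hardening step worth making: the bare requests.get call has no timeout and never checks the status code, so a hung connection blocks forever and an error page would get parsed as if it were the article. A minimal sketch of a safer fetch (openurl_safe and the 10-second timeout are illustrative, not part of the original script):

def openurl_safe(url, header, cookie):
    # Fail after 10 seconds instead of hanging forever (timeout value is an example).
    res = requests.get(url, headers=header, cookies=cookie, timeout=10)
    res.raise_for_status()  # turn 4xx/5xx responses into an exception
    res.encoding = res.apparent_encoding
    return bs4.BeautifulSoup(res.text, 'html.parser')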