Scraping Baidu Baike with the requests library
I'm using Python to scrape the Baidu Baike entry for '猪八戒' (Zhu Bajie). Here is the code:
import requests
from bs4 import BeautifulSoup as bs
r=requests.get('https://baike.baidu.com/item.猪八戒',headers={'user-agent':'Mozilla-5.0'})
soup=bs(r.text,'html.parser')
tag=soup.find(name='meta',attrs={'name':'description'})  # find, not find_all: .get() works on a single Tag, not a ResultSet
print(tag.get('content'))
This is just a test for now; I haven't wrapped it into functions or added any exception handling yet.
But the output is:
猪八戒又名猪刚鬣,是明代神妖小说《西游记》中的四大主要角色之一,法号悟能(观音取),浑名八戒(唐僧取),是唐僧的二徒弟,孙悟空的二师弟,沙僧的二师兄。在原著中,自投为猪胎之后,生的猪头人身,体型肥胖硕大,皮肤黑灰,獠牙长出嘴外,外形丑陋吓人。武功高强,力大无穷,又会天罡数的三十六般变化术,所持的兵器为太上老君所造、玉皇大帝亲赐的上宝沁金钯(俗称九齿钉耙),重达五千零四十八斤,为一藏之数。猪八戒前世为执掌天河八万水兵的“天蓬元帅”,一直倾慕容貌过人的霓裳仙子(属于嫦娥中的一员,《西游记》中各路神仙基本借鉴了正统道教神仙录,由高老庄一集猪八戒提及可见,猪八戒的前世天蓬元帅即是水神天河宪节)。因调戏嫦娥并且惹来纠察灵官后,又拱倒斗牛宫被贬下凡尘,却错投猪胎,在福陵山云栈洞落草。后受观音菩萨点化,入赘高老庄务农,等待取经人。后成为唐僧的弟子之一,与孙悟空、沙悟净一同保护唐僧去西天取经,几经九九八十...
Everything after 九九八十 is simply gone. Why is that? (Also, I typed the URL wrong in the post: it should be 'item/猪八戒', not 'item.猪八戒'.)
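(A quick way to double-check which page a request actually landed on is to print r.status_code and r.url before parsing. The sketch below, with the corrected URL, a timeout, and minimal error handling, is only illustrative, not the final code:)

import requests
from bs4 import BeautifulSoup as bs

url = 'https://baike.baidu.com/item/猪八戒'   # corrected: item/ rather than item.
try:
    r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    r.raise_for_status()
except requests.RequestException as e:
    print('request failed:', e)
else:
    print(r.status_code, r.url)               # confirm we landed on the right entry
    soup = bs(r.text, 'html.parser')
    tag = soup.find('meta', attrs={'name': 'description'})
    if tag is not None:
        print(tag.get('content'))             # this is only the short description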
Here is something I wrote back when I was learning Python; I haven't changed it since. Feel free to take a look:
import requests, re, bs4

def openurl(url):
    # fetch the page with a browser User-Agent and the poster's Baidu cookie, return a parsed soup
    header = {}
    header['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4209.400'
    cookie = {}
    cookie['cookie'] = 'BAIKE_SHITONG=%7B%22data%22%3A%2262e53b847649f191f0341408dc015051c3815b8d95a146376b6ddf7ab1f30629149107f49d8554ff26579436ccfb408ae1df5539ef6fd355b388cdebab039e69b4bee62df650bc9a1935a1a9fb08e754db95811b3f85703579e6e0a28ce9a56dba1eb497ee18b073aee2abbc54a4612da2d7cc5ed1c119b03a3e5744c4d8b54a%22%2C%22key_id%22%3A%2210%22%2C%22sign%22%3A%22fdd414b4%22%7D; PSTM=1583827166; BAIDUID=0090B882C7DE33774FA3F765E6F31BE5:FG=1; BIDUPSID=4E3FF7727E5D7436F281D44BD0E01463; BDUSS=JIOEU3UTVFTGF6flBZOG5jRUhFaE9yT1lmby1oUDEydlJybVg2V2RuRUpUZHRlSVFBQUFBJCQAAAAAAAAAAAEAAAC4a2VizOzfsdi8tcIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAnAs14JwLNeOH; BDUSS_BFESS=JIOEU3UTVFTGF6flBZOG5jRUhFaE9yT1lmby1oUDEydlJybVg2V2RuRUpUZHRlSVFBQUFBJCQAAAAAAAAAAAEAAAC4a2VizOzfsdi8tcIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAnAs14JwLNeOH; delPer=0; ZD_ENTRY=baidu; BDRCVFR=PfwjKziWnFsTgD8mvqV; H_BDCLCKID_SF_BFESS=tbCjoIDMJC-3f-cPbKTMMt_e2x7t54CXKKOLVhQRBp7keq8CD6O0Kl-Ijp5BtlRiWeAqhT6FLtoKSnc2y5jHhpFSK-6UQpOKWNkOLDOCWq3psIJM0p_WbT8ULf5H5xCqaKviaKOEBMb1M56DBT5h2M4qMxtOLR3pWDTm_q5TtUJMeCnTD-Dhe6j-jGLOt6Dff5vfL5rD5RT0j-5YKKTjhPrMKnQTWMT-0bFHVxOGtKjKqRcEM-TkLTKRjfJiQ6byKHn7_JLbytLbJbcR0tJcDbLv5pJ4XUQxtNRy0DnjtpvhHxcGMxbobUPUDUc9LUkJHmcdot5yBbc8eIna5hjkbfJBQttjQn3hfIkj0DKLK-oj-DLxjjAB3e; BCLID=7417333749751258671; BDSFRCVID=D9_OJeC624aGiU7rjMDIJC6X5gy_SzbTH6f38_KYEq7a3p-jhueHEG0Pof8g0KubZujCogKK3mOTH4-F_2uxOjjg8UtVJeC6EG0Ptf8g0f5; H_BDCLCKID_SF=tbCjoIDMJC-3f-cPbKTMMt_e2x7t54CXKKOLVhQRBp7keq8CD6O0Kl-Ijp5BtlRiWeAqhT6FLtoKSnc2y5jHhpFSK-6UQpOKWNkOLDOCWq3psIJM0p_WbT8ULf5H5xCqaKviaKOEBMb1M56DBT5h2M4qMxtOLR3pWDTm_q5TtUJMeCnTD-Dhe6j-jGLOt6Dff5vfL5rD5RT0j-5YKKTjhPrMKnQTWMT-0bFHVxOGtKjKqRcEM-TkLTKRjfJiQ6byKHn7_JLbytLbJbcR0tJcDbLv5pJ4XUQxtNRy0DnjtpvhHxcGMxbobUPUDUc9LUkJHmcdot5yBbc8eIna5hjkbfJBQttjQn3hfIkj0DKLtDI-hDPGD6t35n-Wqxj-aRT3bC5jsJOOaCvUbf7Oy4oTj6j3-tJ0L6bH2ejRalcnJfPhePT30n6D3MvBbG-OyMnk36Cf3COMHRcsfJ5SQft20-4EeMtjBbQayjTB5b7jWhk5Dq72y58VQlRX5q79atTMfNTJ-qcH0KQpsIJM5-DWbT8EjH62btt_tJ-eoD3P; Hm_lvt_55b574651fcae74b0a9f1cf9c8d7c93a=1599823721,1599823735,1600168241,1600421681; Hm_lpvt_55b574651fcae74b0a9f1cf9c8d7c93a=1600423997; PSINO=7; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=32617_1430_32744_31254_32723_7632_32115_7564_22158'
    res = requests.get(url, headers=header, cookies=cookie)
    res.encoding = res.apparent_encoding
    return bs4.BeautifulSoup(res.text, 'html.parser')

def introduce_crawl(soup):
    # print the lemma title plus the summary paragraphs, stripping HTML tags with a regex
    head = soup.find('dd', class_='lemmaWgt-lemmaTitle-title')
    temp = soup.find('div', class_='lemma-summary')
    content = temp.find_all('div', class_='para')
    cp = re.compile('<[^>]+?>')
    print(head.h1.text + head.h2.text)  # assumes the title block contains both an h1 and an h2
    for each in content:
        key_content = cp.sub('', str(each))
        print('\t' + key_content)

def depth_crawl(soup):
    # print related /item links ten at a time, pausing for keyboard input between batches
    ploy = soup.find('ul', class_='polysemantList-wrapper cmn-clearfix')
    main = soup.find('div', class_='main-content')
    a = await_ten(ploy, main)
    print('下边打印相关链接:')
    while True:
        for i in range(10):
            try:
                next(a)
            except StopIteration:
                break
        temp = input('输入任意字符将继续打印。q退出程序:')
        if temp == 'q':
            break

def await_ten(ploy, main):
    # generator: print one link per next() call so the caller controls the pace
    ploy_content = ploy.find_all(href=re.compile('/item'))
    for each in ploy_content:
        print(each.text + '(' + ''.join(each['title']) + ')' + '->' + ''.join(['https://baike.baidu.com', each['href']]))
        yield
    main_content = main.find_all(href=re.compile('/item'))
    for each in main_content:
        print(each.text.replace('\n', '') + '->' + ''.join(['https://baike.baidu.com', each['href']]))
        yield

def main():
    # name = input('请输入关键字:')
    name = '猪八戒'
    url = 'https://baike.baidu.com/item/' + name
    soup = openurl(url)
    if soup.find(text=re.compile('您所访问的页面不存在')):
        # Baidu's "the page you visited does not exist" notice
        print(soup.find('p', class_='sorryCont').text)
    else:
        introduce_crawl(soup)
        depth_crawl(soup)

if __name__ == '__main__':
    main()
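The part of this script worth pausing on is how depth_crawl and await_ten cooperate: await_ten is a generator that yields once per printed link, and depth_crawl pulls ten items per keypress. A stripped-down sketch of the same pattern, using a plain list instead of BeautifulSoup results (the names here are made up for illustration):

def print_in_batches(items):
    # generator: print one item, then pause until the caller asks for the next
    for item in items:
        print(item)
        yield

def paged_output(items, batch=10):
    gen = print_in_batches(items)
    while True:
        for _ in range(batch):
            try:
                next(gen)              # prints one item per call
            except StopIteration:
                return                 # generator exhausted, stop prompting
        if input('Press Enter for more, q to quit: ') == 'q':
            break

paged_output(['link-%d' % i for i in range(25)])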
Why doesn't it print everything? Because that meta description tag only carries the first part of the text. If you want the rest, you need to use re and pull the full text out of the page itself.
Only the second screenshot below shows the complete information.
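In other words, the meta description is a shortened blurb by design; the full opening text is in the page's own summary block. A minimal sketch of pulling it out with re, assuming the summary still sits in a div with class lemma-summary (Baidu's markup may have changed since):

import requests, re
from bs4 import BeautifulSoup as bs

r = requests.get('https://baike.baidu.com/item/猪八戒',
                 headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(r.text, 'html.parser')
summary = soup.find('div', class_='lemma-summary')    # the full intro block, not the <meta> tag
if summary is not None:
    text = re.sub(r'<[^>]+?>', '', str(summary))      # strip HTML tags, same regex idea as above
    print(re.sub(r'\s+', ' ', text).strip())
else:
    print('lemma-summary not found; the page structure may have changed')

summary.get_text(strip=True) would do the same job without the regex, if you prefer.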