|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import requests
from bs4 import BeautifulSoup

# Scrape every chapter of "Romance of the Three Kingdoms" from shicimingju.com
# and write each chapter's title and text into a local UTF-8 file.
url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
headers = {'User-Agent':
           'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 '
           'Safari/537.36'}
page_test = requests.get(url, headers=headers)
page_test.encoding = 'utf-8'  # force UTF-8 so Chinese text decodes correctly
soup = BeautifulSoup(page_test.text, 'lxml')
# BUG FIX: '.tabli' selects the <a class="tabli"> anchors themselves, so each
# element of li_list is ALREADY the <a> tag. The original `li.a` looked for a
# nested <a> inside it, got None, and crashed with
# "AttributeError: 'NoneType' object has no attribute 'string'".
li_list = soup.select('.tabli')
print('li_list:', li_list)
# `with` guarantees the file is closed even if an unhandled error occurs
# (the original left fp open on any crash inside the loop).
with open('./sanguoyanyi.txt', 'w', encoding='utf-8') as fp:
    for li in li_list:
        print(type(li))
        print(li)
        title = li.string               # chapter title is the anchor's text
        detail_url = 'http://www.shicimingju.com' + li['href']
        try:
            detail_page_text = requests.get(url=detail_url, headers=headers).content
            detail_soup = BeautifulSoup(detail_page_text, 'html.parser')
            div_tag = detail_soup.find('div', class_='chapter_content')
            if div_tag:
                content = div_tag.text
                print(content)
                fp.write(title + ':' + content + '\n')
                print(title, '爬虫成功')
            else:
                # chapter page had no content div -- report and keep going
                print(title, '爬取失败')
        except requests.exceptions.RequestException as e:
            # network failure for one chapter should not abort the whole run
            print(f'请求失败:{e}')
报错如下:
Traceback (most recent call last):
File "E:\Projects\Python-Spiders\Examples\11_all_spider_examples\spider_novels\spider_sanguoyanyi.py", line 28, in <module>
title = li.a.string
AttributeError: 'NoneType' object has no attribute 'string'
但是 li 标签是有东西的啊——其实原因是 `.tabli` 选中的就是 `<a class="tabli">` 标签本身,`li.a` 是在它里面再找一层嵌套的 `<a>`,自然找不到、返回 None,于是 `.string` 就报了 AttributeError。
给你稍微改了一下:
import requests
from bs4 import BeautifulSoup

# Corrected spider: '.tabli' already yields the <a> anchors, so read
# `li.string` / `li['href']` directly instead of the broken `li.a`.
# (Reconstructed with proper indentation -- the forum quote flattened it.)
url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
headers = {'User-Agent':
           'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 '
           'Safari/537.36'}
page_test = requests.get(url, headers=headers)
page_test.encoding = 'utf-8'
soup = BeautifulSoup(page_test.text, 'lxml')
li_list = soup.select('.tabli')
print('li_list:', li_list)
# `with` closes the file even on an unhandled error mid-loop.
with open('./sanguoyanyi.txt', 'w', encoding='utf-8') as fp:
    for li in li_list:
        print(type(li))
        print(li)
        title = li.string               # the anchor's own text is the title
        detail_url = 'http://www.shicimingju.com' + li['href']
        try:
            detail_page_text = requests.get(url=detail_url, headers=headers).content
            detail_soup = BeautifulSoup(detail_page_text, 'html.parser')
            # find_all is the modern name; findAll is a deprecated alias
            div_tags = detail_soup.find_all('p')
            if div_tags:
                fp.write(title + '\n\n')
                for div_tag in div_tags:
                    content = div_tag.text
                    print(content)
                    fp.write(content + '\n')
                else:
                    # for/else: runs after the loop finishes normally --
                    # blank separator line between chapters
                    fp.write('\n')
                print(title, '爬虫成功')
            else:
                print(title, '爬取失败')
        except requests.exceptions.RequestException as e:
            print(f'请求失败:{e}')
复制代码
|
|