After reading the book for a while, I tried my hand at scraping a web novel.
The novel is at: https://www.zhuishubang.com/131036/54103552.html
I can scrape every chapter title from the index page, but none of the chapter content. Is the bug somewhere in the "fetch each chapter" part?

from bs4 import BeautifulSoup
import requests, sys


class download(object):
    def __init__(self):
        self.server_url = 'http://www.zhuishubang.com'
        self.target_url = 'http://www.zhuishubang.com/131036/'
        self.names = []
        self.urls = []
        self.nums = 0

    """
    Fetch the chapter links (the table of contents)
    """
    def download_url(self):
        req = requests.get(url=self.target_url)
        # the site serves GBK but requests guesses latin1, so re-decode
        html = req.text.encode("latin1").decode("gbk")
        bf = BeautifulSoup(html, 'lxml')
        texts = bf.find_all('div', 'chapterCon')
        bf_a = BeautifulSoup(str(texts), 'lxml')
        a = bf_a.find_all('a')
        self.nums = len(a)
        for i in a:
            self.names.append(i.string)
            self.urls.append(self.server_url + i.get('href'))

    """
    Fetch the content of each chapter
    """
    def download_content(self, target_url):
        req = requests.get(url=target_url)
        html = req.text.encode("latin1").decode("gbk")
        bf = BeautifulSoup(html, 'lxml')
        texts = bf.find_all('div', class_='articleCon')
        bf_div = BeautifulSoup(str(texts), 'lxml')
        div = bf_div.find_all('div')
        txt = ''
        for i in div:
            if i.string is not None:
                txt = txt + i.string + '\n\n'
        return txt

    def writer(self, name, path, text):
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n\n')


if __name__ == '__main__':
    dl = download()
    dl.download_url()
    print("Starting download")
    for i in range(dl.nums):
        dl.writer(dl.names[i], '全球制造.txt', dl.download_content(dl.urls[i]))
        sys.stdout.write("Downloaded: %.3f%%" % (i / dl.nums * 100) + '\r')
        sys.stdout.flush()
    print('Download finished')

I think I've found the cause:

    for i in div:
        if i.string is not None:   # this line -- i.string is always None
            txt = txt + i.string + '\n\n'
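
That matches how BeautifulSoup defines .string: it only has a value when a tag contains exactly one child. The chapter container most likely holds several text nodes mixed with other tags, so .string comes back None on every iteration. A minimal demo (using the stdlib html.parser so it runs without lxml):

    from bs4 import BeautifulSoup

    # .string only has a value when a tag contains exactly one child;
    # with mixed children it is None, which is why the check never passes.
    html = '<div>line one<br/>line two</div>'
    div = BeautifulSoup(html, 'html.parser').div
    print(div.string)      # None -- the div has three children, not one
    print(div.get_text())  # 'line oneline two' -- collects every nested string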
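
So a minimal fix is to stop iterating child divs and let get_text() gather all the nested strings at once. A sketch of download_content rewritten that way, assuming the chapter body really does sit inside div.articleCon as the original find_all suggests:

    def download_content(self, target_url):
        req = requests.get(url=target_url)
        html = req.text.encode("latin1").decode("gbk")
        bf = BeautifulSoup(html, 'lxml')
        # Grab the first matching container and let get_text() walk
        # every nested string, instead of testing .string per child div.
        article = bf.find('div', class_='articleCon')
        if article is None:  # layout guess may be wrong; fail soft
            return ''
        return article.get_text(separator='\n\n', strip=True)

get_text(separator='\n\n', strip=True) inserts the separator between text fragments and trims surrounding whitespace, so paragraphs stay separated the way the old '\n\n' concatenation intended.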