爬取小说

大马强 · 发表于 2021-8-2 09:30:40

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

本帖最后由大马强于 2021-8-2 20:09 编辑

写了一个爬取小说的爬虫作为练手，想分享给各位鱼油，也欢迎一起交流
网站：铅笔小说网
代码涉及模块

from concurrent.futures import ThreadPoolExecutor #多线程
import requests #网页请求
from lxml import etree #网页处理，数据获取
import re #获取数据
import os #保存内容

复制代码

最近在看这部小说《灰与幻想的格林姆加尔》，所以就拿它来当本次的目标了，选择它的目录页（https://www.23qb.net/book/1883/）作为起始 main_url
一、网页分析
【1】观察主页面，发现这个小说的所有章节都在这上面，拉到底部时没有动态刷新，可以猜测这是一个静态页面

登录/注册后可看大图

【2】点击右键，查看网页源代码，发现每一章的url和章节名字都在上面，不过要注意的是，url要经过一些处理才能使用，每一章的url要再加上前缀 https://www.23qb.net

登录/注册后可看大图

【3】点击任意一章查看，按 F12 打开浏览器的调试工具，找到文本内容以及下一页的url，再重复【2】的操作（查看网页源代码），可以确定二者都在网页源代码中

登录/注册后可看大图

二、代码实现

from concurrent.futures import ThreadPoolExecutor
import requests
from lxml import etree
import re
import os
# Table-of-contents page of the target novel. Chapter and next-page URLs are
# built by appending a page file name to this prefix.
main_url = "https://www.23qb.net/book/1883/"
# HTTP headers sent with the chapter page requests in get_page.
headers = {
'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}
# 获取每一章的url
def get_chapter_url(html):
    """Collect every chapter's URL and title from the parsed index page.

    Side effects: creates a directory named after the book title (if it does
    not exist yet) and makes it the current working directory, so that files
    opened later with relative names land inside it.

    Args:
        html: parsed index page; only its ``xpath`` method is used.

    Returns:
        A ``zip`` iterator of ``(chapter_url, chapter_name)`` pairs.

    Raises:
        IndexError: if the book title node is not found on the page.
        OSError: if the directory cannot be created for a reason other than
            already existing, or cannot be entered.
    """
    # The book title doubles as the output directory name.
    dir_name = html.xpath('//*[@id="bookinfo"]/div[2]/div[1]/h1/text()')[0]
    try:
        os.mkdir(dir_name)
    except FileExistsError:
        # Only "already exists" is expected here; any other OSError
        # (permissions, invalid name, ...) must not be reported as "exists".
        print(f"{dir_name}已经存在！")
    else:
        print(f"{dir_name}创建成功！")
    os.chdir(dir_name)

    chapter_url_list = []   # address of each chapter
    chapter_name_list = []  # title of each chapter
    for each in html.xpath('//*[@id="chapterList"]/li'):
        hrefs = each.xpath("./a/@href")
        names = each.xpath("./a/text()")
        if not hrefs or not names:
            # Skip list items that carry no usable link instead of crashing.
            continue
        # The href cannot be requested as-is: keep only its file name and
        # append it to the index URL.
        chapter_url_list.append(main_url + hrefs[0].split("/")[-1])
        chapter_name_list.append(names[0])
    return zip(chapter_url_list, chapter_name_list)
# 将数据下载保存
def save_content(p_list, fp):
    """Write each text fragment of ``p_list`` to ``fp`` on its own line.

    Args:
        p_list: iterable of strings (paragraph texts).
        fp: writable text file object.
    """
    # One batched call instead of one write() per paragraph.
    fp.writelines(paragraph + '\n' for paragraph in p_list)
# 获取数据
def get_content(text, fp):
    """Extract the chapter text from one page of HTML and append it to ``fp``.

    The page is parsed with lxml, the text of every ``<p>`` below the element
    with id ``TextContent`` is collected, and the result is handed to
    ``save_content`` for writing.
    """
    paragraphs = etree.HTML(text).xpath('//*[@id="TextContent"]//p/text()')
    save_content(paragraphs, fp)
# 读取每一章的信息
# Safety cap on the number of pages fetched per chapter; the loop normally
# stops earlier, as soon as the "next page" link leaves the chapter.
_MAX_PAGES_PER_CHAPTER = 20
# Pulls the nextpage="..." value out of the page's inline <script>.
# Compiled once instead of on every loop iteration.
_NEXT_PAGE_RE = re.compile(
    '<script>.*?nextpage="(?P<next_url>.*?)".*?', re.S)
# Seconds to wait for the server before giving up on a request, so a stalled
# connection cannot block a worker thread forever.
_REQUEST_TIMEOUT = 10


def get_page(*params):
    """Download every page of one chapter into ``<chapter name>.txt``.

    Args:
        *params: ``params[0]`` is the chapter URL (ending in ``.html``) and
            ``params[1]`` is the chapter name used as the output file name.

    A chapter is split over several pages named ``xxx_1.html``,
    ``xxx_2.html``, ...  Pages are fetched in order until the page's
    "next page" link points outside the chapter, no such link can be found,
    or the safety cap is reached.
    """
    chapter_url = params[0]
    chapter_file_name = params[1]
    with open(f"{chapter_file_name}.txt", mode="w", encoding="utf-8") as fp:
        # Drop the ".html" suffix so the "_<page>.html" part can be appended.
        target = chapter_url.split(".html")[0]
        for page in range(1, _MAX_PAGES_PER_CHAPTER + 1):
            target_url = target + f"_{page}.html"
            with requests.get(target_url, headers=headers,
                              timeout=_REQUEST_TIMEOUT) as req:
                page_text = req.text
            # Every fetched page is saved, including the chapter's last one.
            get_content(page_text, fp)
            res = _NEXT_PAGE_RE.search(page_text)
            if res is None:
                # No next-page marker (unexpected layout or error page):
                # treat this as the end of the chapter instead of crashing.
                break
            next_url = main_url + res.group('next_url').split("/")[-1]
            if next_url.split("_")[0] != target_url.split("_")[0]:
                # The next link belongs to another chapter: this was the
                # last page of the current one.
                break
    print(f"{chapter_file_name}下载完毕！")
# Fetch the index page with the same headers used for the chapter pages, and
# with a timeout so a stalled connection cannot hang the script.
main_html = requests.get(url=main_url, headers=headers, timeout=10).text
html = etree.HTML(main_html)
chapter_url_list = get_chapter_url(html)
# Download chapters concurrently; num sets the number of worker threads.
num = 5
with ThreadPoolExecutor(num) as t:
    futures = [
        (chapter_name, t.submit(get_page, chapter_url, chapter_name))
        for chapter_url, chapter_name in chapter_url_list
    ]
# Leaving the "with" block waits for all workers. An exception raised inside a
# worker is stored on its future, so report failures instead of losing them.
for chapter_name, future in futures:
    error = future.exception()
    if error is not None:
        print(f"{chapter_name}下载失败：{error!r}")
# 下面的是测试代码
# test_list = ("https://www.23qb.net/book/1883/270977.html", "第一卷 1.净是些令人费解的事")
# get_page(*test_list)

复制代码

登录/注册后可看大图

速度还可以

最后
太难了，第一次发这样的长帖，早上没点到保存为草稿，审核了两次

大马强 · 发表于 2021-8-2 20:06:32

啊这，我都不敢改了，三次了，编辑一次重新审核一次，这是为啥

大马强 · 发表于 2021-8-2 20:08:31

希望想拿这个网站练手的鱼油下手轻点，感觉它的服务器不是很好的样子

江湖散人 · 发表于 2021-8-2 21:30:58

大马强 · 发表于 2021-8-2 21:32:17

江湖散人发表于 2021-8-2 21:30
牛

江湖散人 · 发表于 2021-8-2 21:42:44

大马强发表于 2021-8-2 21:32

我认为会爬虫都是很牛逼的人！

懒狗李 · 发表于 2021-8-3 10:00:59

一条|咸鱼 · 发表于 2021-8-3 14:09:02

Kayko · 发表于 2021-8-3 15:50:02

不大不小甲鱼 · 发表于 2021-8-3 17:27:36

hornwong · 发表于 2021-8-3 20:44:46

感谢分享！

hornwong · 发表于 2021-8-3 20:45:31

fxj2002 · 发表于 2021-8-4 10:17:42

fxj2002 · 发表于 2021-8-4 10:18:47

感谢

良木 · 发表于 2021-8-6 10:11:30

import requests
import re
import bs4

def get_input():
    """Ask the user for the number of pages to crawl and return it as an int.

    The prompt is repeated until the answer is an integer from 1 to 50.
    Non-numeric answers are treated like out-of-range ones and trigger a
    re-prompt instead of raising ValueError.
    """
    prompt = "请输入要爬取得页数（1~50）："
    while True:
        try:
            pages = int(input(prompt))
        except ValueError:
            pages = None
        if pages is not None and 1 <= pages <= 50:
            return pages
        # Every retry uses the "please enter a valid number" prompt.
        prompt = "请输入正确的页数："

def get_datas(res):
    """Parse an HTML document and return its paragraph and heading tags.

    Args:
        res: HTML text of a page.

    Returns:
        A ``(target, biaoti)`` tuple: all ``<p>`` tags found in the document,
        followed by all ``<h1>`` tags.
    """
    parsed = bs4.BeautifulSoup(res, "html.parser")
    return parsed.find_all("p"), parsed.find_all("h1")


def get_html(url, timeout=10):
    """Fetch ``url`` with a browser-like User-Agent and return the body text.

    Args:
        url: address of the page to download.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The response body decoded as text.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}
    res = requests.get(url, headers=headers, timeout=timeout)
    return res.text

def find_depth(res):
    """Return the number of chapter entries listed on an index page.

    The original body passed the list returned by ``find_all`` straight to
    ``int()``, which always raises TypeError. The count of ``<li>`` entries
    inside every ``<ul class="chaw_c">`` is returned instead.
    NOTE(review): this reading of "depth" (number of chapters to crawl) is an
    interpretation based on how main() walks the same list - confirm.

    Args:
        res: HTML text of the index page.

    Returns:
        Total number of ``<li>`` elements under the matching lists (0 when
        the page contains no such list).
    """
    soup = bs4.BeautifulSoup(res, 'html.parser')
    chapter_lists = soup.find_all('ul', class_="chaw_c")
    return sum(len(chapter_list.find_all("li")) for chapter_list in chapter_lists)

def main():
    """Download every chapter linked from the index page into text files.

    For each ``<li>`` of the ``<ul class="chaw_c">`` chapter list, the linked
    page is fetched, its ``<h1>`` headings are printed, and the text of all
    its ``<p>`` tags is written to ``<heading>.txt`` in the current directory.

    Raises:
        RuntimeError: if the index page contains no chapter list.
    """
    host = "https://www.23qb.net/book/1883/"
    res = get_html(host)
    soup = bs4.BeautifulSoup(res, 'html.parser')
    content = soup.find('ul', class_="chaw_c")
    if content is None:
        # Fail with a clear message rather than an AttributeError on None.
        raise RuntimeError('chapter list <ul class="chaw_c"> not found on ' + host)

    for video in content.find_all("li"):
        link = video.a
        if link is None or not link.get('href'):
            # List item without a usable link: nothing to download.
            continue
        href = link['href']
        # Chapter hrefs are site-relative, so prepend the site root.
        url = "https://www.23qb.net" + href
        res = get_html(url)
        target, biaoti = get_datas(res)
        print(biaoti)

        if biaoti:
            # Use the heading text itself as the file name (the last <h1>,
            # as the original loop did) rather than the repr of a list.
            title = biaoti[-1].text
        else:
            # No heading on the page: fall back to the page's file stem.
            title = href.rstrip("/").split("/")[-1].split(".")[0]

        with open('{}.txt'.format(title), 'w', encoding="utf-8") as file:
            for each in target:
                file.write("       " + each.text + '\n\n')

# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
# That was a struggle, but it finally works. Questions are welcome via QQ: 704377095.

大马强 · 发表于 2021-8-6 10:14:15

良木发表于 2021-8-6 10:11
import requests
import re
import bs4

账号		自动登录	找回密码
密码			立即注册

[技术交流] 爬取小说

马上注册，结交更多好友，享用更多功能^_^

回帖奖励 +2 鱼币

回帖奖励 +2 鱼币

回帖奖励 +2 鱼币

回帖奖励 +2 鱼币

回帖奖励 +2 鱼币

回帖奖励 +2 鱼币

回帖奖励 +2 鱼币

回帖奖励 +2 鱼币

回帖奖励 +2 鱼币

回帖奖励 +2 鱼币

浏览过的版块