|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
纵横小说网址:http://book.zongheng.com/showchapter/1130229.html —— 爬取小说并生成 txt 文件。
import os
import random
import re
import time
from urllib.parse import urljoin

import pymysql
import requests
from lxml import etree
def get_html(url, timeout=600):
    """Download *url* and return the response body as decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``.

    Returns:
        The response text, decoded with the encoding ``requests`` detects
        from the body bytes (``apparent_encoding``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For other request failures raised by
            ``requests.get`` (for example a timeout).
    """
    # Send a desktop-browser User-Agent header with the request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status so that an error page is never handed
    # back to the caller as if it were the requested document.
    res.raise_for_status()
    # Decode using the encoding sniffed from the body instead of the one
    # declared in the HTTP headers.
    res.encoding = res.apparent_encoding
    return res.text
def _safe_filename(name, fallback="book"):
    """Return *name* with characters that are unsafe in file names replaced."""
    cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', name).strip()
    return cleaned or fallback


def _format_chapter(title_nodes, content_nodes):
    """Build the text written to the book file for one chapter."""
    title = ' '.join(title_nodes).strip()
    # Each non-blank text node becomes its own paragraph; joining per node
    # avoids splitting a paragraph on spaces that occur inside a sentence.
    paragraphs = [node.strip() for node in content_nodes if node.strip()]
    body = "\n\n  ".join(paragraphs)
    return f'\n{title}\n\n\t{body}\n'


def get_book(url):
    """Scrape every chapter linked from a catalogue page into one text file.

    The catalogue page at *url* is downloaded with ``get_html``; the book
    title and the chapter links are read from it with XPath. Each chapter
    page is then downloaded in turn and appended to
    ``./txt/<book title>.txt`` (UTF-8), with a random pause of 1-6 seconds
    after each chapter.

    Args:
        url: Address of the chapter catalogue page.

    Returns:
        None. Progress messages are printed and the text file is written as
        a side effect.
    """
    catalogue = etree.HTML(get_html(url))
    # Positional paths into the catalogue page: the book title heading and
    # the chapter link list.
    name_nodes = catalogue.xpath('//html/body/div[3]/div[1]/h1/text()')
    chapter_links = catalogue.xpath('//html/body/div[3]/div[2]/div[2]/div/ul//a/@href')
    nov_name = ' '.join(name_nodes)
    print("匹配到共%d章节" % len(chapter_links))
    if not chapter_links:
        # Nothing to download; do not create an empty output file.
        return

    out_dir = "./txt/"
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, _safe_filename(nov_name) + ".txt")

    # Append mode keeps whatever an earlier run already wrote to the file.
    with open(out_path, 'a', encoding='utf-8') as book:
        for index, href in enumerate(chapter_links, start=1):
            # Resolve relative or protocol-relative links against the
            # catalogue URL; absolute links pass through unchanged.
            chapter = etree.HTML(get_html(urljoin(url, href)))
            title_nodes = chapter.xpath('//*[@class="title_txtbox"]//text()')  # chapter title
            content_nodes = chapter.xpath('//*[@class="content"]//text()')  # chapter body
            print("正在写入第%d章" % index)
            book.write(_format_chapter(title_nodes, content_nodes))
            # Push each chapter to disk before pausing so progress survives
            # an interruption.
            book.flush()
            # Random pause between requests.
            time.sleep(random.randint(1, 6))
if __name__ == '__main__':
    # Script entry point: download the novel whose chapter catalogue lives at
    # this address. Replace the literal with input() to prompt for a
    # catalogue URL instead.
    url = "http://book.zongheng.com/showchapter/1130229.html"
    # get_book fetches the catalogue page itself, so no separate download of
    # the same page is needed beforehand.
    get_book(url)
复制代码 |
评分
-
参与人数 1 | 荣誉 +5 |
鱼币 +3 |
贡献 +2 |
收起
理由
|
梦鸩鸩
| + 5 |
+ 3 |
+ 2 |
感谢楼主无私奉献! |
查看全部评分
|