How can I save the downloaded chapters as separate files, one per chapter? I'm using Python 3.5. Code below:
import re
import requests
from bs4 import BeautifulSoup

url = 'http://www.quanshu.net/book/1/1018/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}

def geturl(url):
    # Fetch the chapter index page; the site serves GBK, not UTF-8
    openurl = requests.get(url, headers=headers)
    text = openurl.content.decode('gbk', 'ignore')
    return text

def openurl(text):
    reg = r'<a href="(.*html)" title='  # match the chapter links
    rew = re.findall(reg, text)
    for i in rew:
        urlz = 'http://www.quanshu.net/book/1/1018/' + i
        print(urlz)  # full URL of this chapter
        openev = requests.get(urlz, headers=headers)
        textev = openev.content.decode('gbk', 'ignore')
        soup = BeautifulSoup(textev, 'html.parser')
        every = soup.findAll('div', {'class': 'mainContenr'})  # chapter body
        for t in every:
            # Every chapter gets appended to one big txt file
            with open(r'D:\新建文件夹\小说.txt', 'a', encoding='utf-8') as f:
                f.write(t.get_text() + '\n')

openurl(geturl(url))
# How can I save each downloaded chapter to its own file?
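
The quickest change to the code above: instead of appending every chapter to one big file, pull a title out of each chapter page and open a fresh file per chapter. A minimal sketch of that idea, assuming the page <title> is usable as a file name (save_chapter and the output folder are my illustrative names, not from the original post):

import os
import re

def save_chapter(soup, out_dir=r'D:\新建文件夹'):
    # Hypothetical helper: name the file after the page <title> and
    # write this one chapter into it
    title = soup.find('title').get_text()
    safe = re.sub(r'[\\/:*?"<>|]', '_', title)  # strip characters Windows forbids in file names
    with open(os.path.join(out_dir, safe + '.txt'), 'w', encoding='utf-8') as f:
        for t in soup.findAll('div', {'class': 'mainContenr'}):
            f.write(t.get_text() + '\n')

Then, inside the loop in openurl(), replace the append-to-one-file block with a call to save_chapter(soup).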
import os
import urllib.request
from urllib.error import HTTPError
from bs4 import BeautifulSoup

def url_open(url):
    # Fetch a page and decode it from GBK; return None on an HTTP error
    try:
        req = urllib.request.Request(url)
        req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                                     "(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36")
        response = urllib.request.urlopen(req)
        html = response.read().decode("GBK")
    except HTTPError:
        html = None
    return html

def get_neirong(url):
    # Collect the href of every chapter link on the index page
    p = []
    html = url_open(url)
    soup = BeautifulSoup(html, "html.parser")
    for i in soup.findAll("div", {"class": "novel_list"}):
        for x in i.findAll("a"):
            p.append(x.attrs["href"])
    return p

def save(plist, name, ur):
    # Make a folder named after the novel, then write one file per
    # chapter, using the page <title> as the file name
    os.mkdir(name)
    os.chdir(name)
    for i in plist:
        url = ur.replace("index.html", i)
        html = url_open(url)
        soup = BeautifulSoup(html, "html.parser")
        d = soup.findAll("div", {"class": "novel_content"})
        title = soup.find("title")
        for x in d:
            with open(title.get_text(), "a", encoding="utf-8") as f:
                f.write(x.get_text().replace("\xa0", " "))

def down_sanwen():
    name = input("Name: ")
    ur = input("URL: ")
    p = get_neirong(ur)
    save(p, name, ur)

if __name__ == "__main__":
    down_sanwen()
This is a novel-scraping script I wrote a long time ago for one particular site, and it does save chapter by chapter. Take a look; I've forgotten the details myself...
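
One caveat if anyone reuses this: when a fetch fails, url_open() returns None, and passing None to BeautifulSoup raises a TypeError. A small guard inside save()'s loop (my addition, not in the original script) keeps one bad chapter from killing the whole run:

        html = url_open(url)
        if html is None:  # fetch failed; skip this chapter
            print("skipping, fetch failed:", url)
            continue
        soup = BeautifulSoup(html, "html.parser")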