|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
代码如下:
import html
import os
import re
import urllib
import urllib.request

import requests as req
def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    A browser User-Agent header is sent so the site is less likely to
    block the crawler.
    """
    request = urllib.request.Request(url)
    request.add_header('User-Agent', "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36")
    # Bug fix: the original called urlopen(url) with the bare URL, so the
    # Request object (and its User-Agent header) was never actually used.
    # Local also renamed req -> request: it shadowed the `requests as req`
    # module alias from the imports.
    response = urllib.request.urlopen(request)
    return response.read()
def get_txt_url(url):
    """Return the per-book URL suffixes scraped from the site's index page."""
    page = url_open(url).decode('gbk')
    pattern = r'<li><a href="/(.*?)" title=".*?" target="_top">.*?</a></li>'
    return re.findall(pattern, page, re.S)
def get_txt_book(url_1):
    """Return the chapter URL suffixes listed on a book's index page."""
    page = url_open(url_1).decode('gbk')
    pattern = r'<li><a href="/.*?/(.*?)">.*?</a></li>'
    return re.findall(pattern, page, re.S)
def get_txt_nr(url_z):
    """Return the chapter text at *url_z* with <p>/</p> tags stripped
    and HTML entities unescaped.

    Returns '' when the page has no recognizable content block instead
    of raising IndexError.
    """
    ptn_nr = re.compile(br'<div class="nr_con">(.*?)</div>', re.DOTALL)  # content div
    ptn_p = re.compile(br'</?p>')  # paragraph tags to strip
    # Bug fix: the original fetched the page twice (url_open() result was
    # decoded and thrown away, then req.get() fetched it again); one
    # fetch of the raw bytes is enough.
    page = url_open(url_z)
    matches = ptn_nr.findall(page)
    if not matches:  # bug fix: nr[0] raised IndexError on pages without the div
        return ''
    return html.unescape(ptn_p.sub(b'', matches[0]).decode('gbk'))
def get_name(url_z):
    """Return every book title linked from the chapter page at *url_z*."""
    page = url_open(url_z).decode('gbk')
    return re.findall(r'<a href="/.*?/">(.*?)</a>', page)
def save_book(folder, txt_nr, txt_name):
    """Append chapter text *txt_nr* to '<txt_name>.txt' in the current
    directory, so a whole book accumulates into a single file.

    *folder* is unused here (the caller chdirs into it); kept only for
    backward compatibility with existing call sites.
    """
    # Bug fix: use a context manager and an explicit encoding so the file
    # is always closed and the write does not depend on the platform's
    # default locale encoding.
    with open(txt_name + '.txt', 'a', encoding='utf-8') as file:
        file.write(txt_nr)
def txt_down(folder='gulongtxt'):
    """Crawl every book on gulongwang.com and append each book's chapters
    into a single '<title>.txt' file inside *folder* (the folder must
    already exist).
    """
    os.chdir(folder)  # all output files go into this folder
    url = 'http://www.gulongwang.com/'
    for suffix in get_txt_url(url):          # one suffix per book
        url_1 = url + suffix                 # book index page
        for chapter in get_txt_book(url_1):  # one suffix per chapter
            url_z = url_1 + chapter          # chapter page
            txt_nr = get_txt_nr(url_z)
            # Bug fix: the original passed the whole title *list* to
            # save_book() (TypeError on list + '.txt') and looped so the
            # same chapter was saved once per title match; save exactly
            # once, under the first title found.
            titles = get_name(url_z)
            if titles:
                save_book(folder, txt_nr, titles[0])


if __name__ == "__main__":
    txt_down()
复制代码
这是爬取整个网站小说的爬虫,目的是把这个网站全部小说下载,并按照书名保存成一个txt文件(不是分章节保存).
问题如下:
0. 如何让小说内容按照书名保存成一个txt文件?
1. 自我感觉是因为书名获取函数get_name(url_z)写的太后面了,大神们说是不是?
2. 我的代码是不是很丑?运行界面很难看?求轻拍.
最后感谢@BngThea @SixPy @Teagle @wyp02033 (排名不分先后)的指导让我写出的第一条爬虫,虽然每次只能爬取一部小说,而且里面还有html的标签但我很高兴能写出来.
你这里用到了requests类,所以建议全都用requests类来操作即可。
修改后的代码:
- import re, os
- import requests
def url_open(url):
    """Fetch *url* with a browser User-Agent and return the page text
    decoded as GBK."""
    ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36"
    response = requests.get(url, headers={'User-Agent': ua})
    response.encoding = 'gbk'  # the site serves GBK-encoded pages
    return response.text
def get_txt_url_name(url):
    """Return (url_suffix, title) tuples for every book on the index page."""
    page = url_open(url)
    return re.findall(
        r'<li><a href="/(.*?)" title="(.*?)" target="_top">.*?</a></li>',
        page,
        re.S,
    )
def get_txt_book(url_1):
    """Return the chapter URL suffixes listed on a book's index page."""
    page = url_open(url_1)
    return re.findall(r'<li><a href="/.*?/(.*?)">.*?</a></li>', page, re.S)
def get_txt_nr(url_z):
    """Return the chapter body text at *url_z* with <p>/</p> tags removed.

    Returns '' when the page has no 'nr_con' content block instead of
    raising IndexError.
    """
    ret = url_open(url_z)
    ptn_nr = re.compile(r'<div class="nr_con">(.*?)</div>', re.DOTALL)  # content div
    ptn_p = re.compile(r'</?p>')  # paragraph tags to strip
    matches = ptn_nr.findall(ret)
    if not matches:  # bug fix: original did nr[0] and crashed on pages without the div
        return ''
    return ptn_p.sub('', matches[0])
def save_book(txt_name, txt_nr):
    """Append chapter text *txt_nr* to '<txt_name>.txt' so all chapters of
    one book accumulate in a single file (not one file per chapter).
    """
    # Explicit encoding: the platform's default locale encoding may not be
    # able to round-trip the GBK-decoded text on every system.
    with open(txt_name + '.txt', 'a', encoding='utf-8') as f:
        f.write(txt_nr)
def txt_down(folder='gulongtxt'):
    """Crawl every book on gulongwang.com and append each book's chapters
    into one '<title>.txt' file per book inside *folder* (the folder must
    already exist).
    """
    os.chdir(folder)  # all output files go into this folder
    url = 'http://www.gulongwang.com/'
    # Tuple-unpack (suffix, title) instead of indexing i[0]/i[1]; also
    # dropped the unused `txt_save =` assignment (save_book returns None).
    for suffix, title in get_txt_url_name(url):
        url_1 = url + suffix                 # book index page
        for chapter in get_txt_book(url_1):  # one suffix per chapter
            url_z = url_1 + chapter          # chapter page
            txt_nr = get_txt_nr(url_z)
            save_book(title, txt_nr)         # append under the book's title


if __name__ == "__main__":
    txt_down()
复制代码
|
|