A scraper for the novel 完美世界 (Perfect World)
I got hooked on the anime, and since I just learned web scraping I tried writing a simple scraper for the novel. I couldn't figure out the built-in urllib.request library, so I wrote it with a third-party library (requests) instead. The code is fairly rough, and I don't know how to improve the output formatting; if anyone knows, please reply. Other suggestions are welcome too, so we can all learn together.
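On the urllib.request point: for anyone curious, here is a minimal sketch of how the same GET request could be done with the standard library. The Accept-Encoding header is deliberately left out, because unlike requests, urllib.request does not decompress gzip responses for you, which is one of the things that makes it fiddlier to use.

import urllib.request

def get_html(url, timeout=60):
    # urllib.request needs an explicit Request object to send custom headers,
    # and returns raw bytes that must be decoded manually
    req = urllib.request.Request(url, headers={
        'User-Agent': 'Mozilla/6.1 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
    })
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return resp.read().decode('utf-8')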
import re
import requests


class Spider:
    # HTTP headers sent with each request
    headers = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent': 'Mozilla/6.1 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
    }

    def __init__(self, url, timeout=60):
        self.url = url
        self.timeout = timeout
        self.html = None
        self.title = ""

    def getHtml(self):  # fetch the page
        response = requests.get(self.url, timeout=self.timeout, headers=self.headers)
        response.encoding = "utf-8"
        self.html = response.text
        # print(self.html)

    def getContent(self):  # extract the chapter title and body text
        match = re.search(r'<div class="read_title"><h1>(.*?)</h1>', self.html)
        self.title = match.group(1) if match else "(untitled)"
        # NOTE: this pattern is very loose (any text between "non-space + space"
        # and the next whitespace) and may pick up stray fragments
        content_list = re.findall(r'\S (.*?)\s', self.html)
        return content_list

    def save_info(self, content):  # append the chapter to a text file
        with open("wangmeishijie.txt", "a+", encoding="utf-8") as f:
            f.write(f"{self.title}\n")
            for each in content:
                f.write(f"    {each}\n")


def getPage(page):
    print("Fetching chapter {}".format(page - 643602))
    url = "https://www.soshuw.com/WanMeiShiJie/" + str(page) + ".html"
    web = Spider(url)
    web.getHtml()
    content = web.getContent()
    web.save_info(content)


if __name__ == "__main__":
    page = 643866  # page id for chapter 264
    n = int(input("Enter the number of pages to crawl (starting from chapter 264): "))
    for i in range(n):
        page += 1
        getPage(page)
    print("Crawling finished")
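On the output-formatting question the post raises: one option is to give each chapter a visible header block and put a blank line between paragraphs. Below is a sketch of a drop-in replacement for save_info; the separator style and indentation width are just suggestions, not anything the original code requires.

    def save_info(self, content):  # append the chapter with a header and spaced paragraphs
        with open("wangmeishijie.txt", "a+", encoding="utf-8") as f:
            f.write(f"\n{'=' * 30}\n{self.title}\n{'=' * 30}\n\n")
            for each in content:
                f.write(f"    {each}\n\n")  # indent each paragraph, blank line between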
Fish coins, please!
Nice one, impressive.