import re
import requests

class Spider:
    headers = {  # request headers sent with each HTTP request
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
    }

    def __init__(self, url, timeout=60):
        self.url = url
        self.timeout = timeout
        self.title = []
        self.html = ""

    def getHtml(self):  # download the page and store its decoded text
        response = requests.get(self.url, timeout=self.timeout, headers=self.headers)
        response.encoding = "utf-8"
        self.html = response.text

    def getContent(self):  # extract the chapter title and body text
        self.title = re.findall(r'<div class="read_title"><h1>(.*?)</h1>', self.html)
        # Crude body extraction: grabs text between a non-space character plus a space
        # and the next whitespace; fragile, and may need tuning to the site's markup.
        content_list = re.findall(r'\S (.*?)\s', self.html)
        return content_list

    def save_info(self, content):  # append the chapter to a text file
        with open("wangmeishijie.txt", "a+", encoding="utf-8") as f:
            title = self.title[0] if self.title else "(title not found)"
            f.write(f"{title}\n")
            for each in content:
                f.write(f" {each}\n")
def getPage(page):
    print("Fetching chapter {}".format(page - 643602))  # page id minus 643602 gives the chapter number
    url = "https://www.soshuw.com/WanMeiShiJie/" + str(page) + ".html"
    web = Spider(url)
    web.getHtml()
    content = web.getContent()
    web.save_info(content)

if __name__ == "__main__":
    page = 643866  # page id for chapter 264
    n = int(input("Enter the number of chapters to crawl (starting from chapter 264): "))
    for i in range(n):
        getPage(page)  # fetch first, then advance, so the run really starts at chapter 264
        page += 1
    print("Crawl finished")