import requests
import re
from lxml import etree
class Zw:
    """Three-level crawler rooted at ``index_url``.

    The pipeline implemented by :meth:`run` is:
    index page -> list pages -> article pages -> (title, body) text.
    """

    # Seconds to wait for a single HTTP response; without a timeout
    # ``requests`` would block forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set the entry URL and the HTTP headers sent with every request."""
        self.index_url = "http://www.zuowen.com/gaozhong/"
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/76.0.3809.71 Safari/537.36"
            )
        }

    def _fetch_tree(self, url):
        """Download *url* and return it parsed by ``etree.HTML``.

        Raises:
            requests.RequestException: on connection errors, timeouts, or
                a 4xx/5xx status (via ``raise_for_status``).
        """
        reply = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        # Fail loudly instead of scraping an HTTP error page as content.
        reply.raise_for_status()
        return etree.HTML(reply.text)

    def list_href(self, response):
        """Collect the list-page links found on the parsed index page.

        Args:
            response: parsed index page (any object exposing ``xpath``),
                or ``None`` when the page could not be parsed.

        Returns:
            A list with the ``href`` of every matching link from every
            ``<li>`` of the tag list; empty when nothing matches.
        """
        list_url = []
        if response is None:
            return list_url
        for li in response.xpath("//div[@class='taglist']/ul/li"):
            # extend, not assign: keep the links of *every* <li>.
            list_url.extend(li.xpath("./a[@target='_blank']/@href"))
        return list_url

    def arc_next(self, list_url):
        """Visit every list page and collect the article links on it.

        Pages that fail to download are reported and skipped so that one
        bad page does not abort the whole crawl.

        Args:
            list_url: iterable of list-page URLs.

        Returns:
            A list of article ``href`` values gathered from all list pages;
            empty when *list_url* is empty or nothing matches.
        """
        arc_href = []
        for page_url in list_url:
            try:
                page = self._fetch_tree(page_url)
            except requests.RequestException as err:
                print("skip list page", page_url, err)
                continue
            if page is None:
                # Parser produced no document for this page.
                continue
            containers = page.xpath(
                "//div[@class='artlist']/div[@class='artlist_l']"
            )
            for arc in containers:
                arc_href.extend(
                    arc.xpath("./div/div[@class='artbox_l_t']/a/@href")
                )
        return arc_href

    def arc_body(self, arc_href):
        """Download every article and extract its title and body text.

        Pages that fail to download are reported and skipped.

        Args:
            arc_href: iterable of article URLs.

        Returns:
            A ``(titles, bodies)`` tuple of two parallel lists of strings:
            ``titles[i]`` and ``bodies[i]`` belong to the same article.
            Both lists are empty when *arc_href* is empty.
        """
        # TODO: the publication date is not extracted yet (an earlier
        # regex-based attempt was left disabled and has been removed).
        titles = []
        bodies = []
        for article_url in arc_href:
            try:
                page = self._fetch_tree(article_url)
            except requests.RequestException as err:
                print("skip article", article_url, err)
                continue
            if page is None:
                continue
            title_parts = page.xpath("//h1[@class='h_title']/text()")
            body_parts = page.xpath("//div[@class='con_content']/text()")
            # Join the text nodes so each article yields exactly one title
            # string and one body string, keeping the two lists aligned.
            titles.append("".join(title_parts).strip())
            bodies.append(
                "\n".join(part.strip() for part in body_parts if part.strip())
            )
        return titles, bodies

    def run(self):
        """Run the full crawl and print the results."""
        # 1. Fetch the index (cover) page.
        response = self._fetch_tree(self.index_url)
        # 2. Collect the list-page URLs.
        list_url = self.list_href(response)
        # 3. Collect the article URLs.
        arc_href = self.arc_next(list_url)
        print(arc_href)
        # 4. Download the article data.
        title, body = self.arc_body(arc_href)
        # 5. "Save" the data -- for now it is only printed.
        print(title, body)
if __name__ == "__main__":
    # Script entry point: build the crawler and run the whole pipeline.
    Zw().run()
|