import re
from sys import path
from json import dump, load
from multiprocessing import Process
from time import sleep
from requests import get
from bs4 import BeautifulSoup
from semi import checkandWrite
from logmod import logger
# Search pattern: a simple pipe-separated (alternation) regex like this is enough
meth = r"东方财富|医疗|储能|电池"
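# For example (hypothetical headline), re.search(meth, "某公司储能电池项目落地", re.I)
# matches on "储能"; re.I only matters if Latin-letter keywords are added later.
# Extend the watch list by appending "|<new keyword>" to the pattern.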
# Dict of URLs. Each key is the overall name of the target being probed.
# Each key maps to a list:
#   [0]: base URL of the site;
#   [1]: the <title/> text shown once we have probed past the newest page (page does not exist);
#   [2]: attrs of the soup element that holds the article body text.
url_dict = {
"office369" : ["这里填url1", "你访问的页面不存在", {'class':'art-content'}],
"topye" : ["这里填url2", "-上野财新网", {'id':'newscontent'}],
}
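# The crawler below assumes every target site serves its articles at sequentially
# numbered addresses, so a page URL is just base URL + counter + ".html".
# Hypothetical example with counter 190904 (real base URLs go in the dict above):
#   f"{url_dict['office369'][0]}{190904}.html"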
def rGet(url):
    # Fetch the raw page; a desktop-browser User-Agent avoids trivial bot blocking
    page = get(url,
               headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                                      'Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
                        'Accept': 'text/html,application/xhtml+xml,*/*'},
               timeout=30)  # a timeout keeps one hung request from stalling the whole run
return page
def search4result(regex, string:str):
    # Case-insensitive keyword search; returns the re match object (or None if no hit)
return re.search(regex, string, flags=re.I)
def get_url_soup(url:str):
    # Fetch the page and return it as a BeautifulSoup object
return BeautifulSoup(rGet(url).content, 'html.parser')
def get_fuckads(name:str, this_url:str, detect_title:str, detect_attr:dict):
"""
向上寻找到网站最新的更新页面,一个一个的检索关键词。
attrs:
name: 当前探测对象的总命名。
this_url: 网站地址;
detect_title: 探测至最新页面(不存在)时的指示标题<title/>;
detect_attr: 正文的text对应的soup元素。
return:
None.
"""
neo_dict = dict()
with open(f"{path[0]}\\detect\\status_{name}.json", "r+", encoding="utf-8") as f:
        # Simple structure of the dict stored in the json: {"name": "office369", "now": 190904}
stat = load(f)
start_counter = stat['now']
logger.info(f"<<!FUCKADS!>>: {name}: running from {start_counter}...") #日志打印开始信号
try:
temp_soup = get_url_soup(f"{this_url}{stat['now']}.html")
            # Step through consecutively numbered pages until the "page not found" title shows up
            while temp_soup.title.text != detect_title:
                now_title = temp_soup.title.text                           # title text
                now_url = f"{this_url}{stat['now']}.html"                  # current URL
                now_text = temp_soup.find("div", attrs=detect_attr).text   # article body text
                #now_text = now_text.replace(" ", "").replace("\n", "")    # optional whitespace cleanup
                title_result = search4result(meth, now_title)
                text_result = search4result(meth, now_text)
                # keyword hit in the title or in the body
                is_exist = bool(title_result) or bool(text_result)
                if is_exist:
                    # log the hit
                    logger.critical(f"<<!FUCKADS!>>:\n{now_title[0:10]} : {now_url}"
                                    f"\n{title_result}, {text_result}")
                    neo_dict[now_title] = now_url   # record the hit in the dict
                stat['now'] += 1
                temp_soup = get_url_soup(f"{this_url}{stat['now']}.html")
        except Exception as e:
            # AttributeError/TypeError here usually mean the fetched page had no <title>
            # or no matching body <div> (e.g. a deleted article), so the exit condition
            # can't be evaluated. Log it and fall through to save progress.
            logger.warning(f"<<!FUCKADS!>>: {name}: stopped at id {stat['now']}: {e!r}")
logger.info(f"<<!FUCKADS!>>: {name}:DONE from {start_counter} tu {stat['now']}")
        # Truncate the json file and write the updated counter back
f.seek(0, 0)
f.truncate(0)
dump(stat, f)
        if neo_dict:   # write out the hits one by one
            for url in neo_dict.values():
                checkandWrite(url, name)
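# A minimal bootstrap sketch (not called anywhere in this script): create the per-site
# status file that get_fuckads() expects, using the structure documented above.
# The default start_id of 1 is an assumption; use the site's real first article id.
def init_status(name: str, start_id: int = 1):
    from os import makedirs
    makedirs(f"{path[0]}\\detect", exist_ok=True)                  # make sure the detect folder exists
    with open(f"{path[0]}\\detect\\status_{name}.json", "w", encoding="utf-8") as f:
        dump({"name": name, "now": start_id}, f)                   # {"name": ..., "now": ...}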
if __name__ =="__main__":
for name in url_dict.keys():
        # With memory usage this high I don't really dare to play with threading,
        # so run one process per site instead
        #get_fuckads(name, url_dict[name][0], url_dict[name][1], url_dict[name][2])
        Process(target=get_fuckads,
                args=(name, url_dict[name][0], url_dict[name][1], url_dict[name][2])).start()
sleep(1)