|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本来可以用dict的,但是只有两个模块,不想折腾了。
- #香港商报。
- #by yh6788
- #coding:utf-8
- import yh_fanyi,yh_dl
- import time,datetime,os,requests,re
- from bs4 import BeautifulSoup
- import re
- import hqbsh_post
- import chardet
- import html
- requests.packages.urllib3.disable_warnings()
def save(html, name='f:\\1.txt'):
    """Persist *html* (stringified) to *name* and read it back.

    Bug fix: the original wrote the file as utf8 but re-read it as
    gb18030, which garbles (or fails on) any non-ASCII content; both
    sides now use utf8.  The dead ``name="1.html"`` assignment was
    removed and the hard-coded path became a keyword parameter with the
    original value as its default (backward compatible).

    Parameters:
        html -- any object; its str() form is written to disk.
        name -- target file path.

    Returns:
        The file content read back as a str.
    """
    text = str(html)
    with open(name, 'w', encoding='utf8') as f:
        f.write(text)
    # Read back with the same encoding the file was written with.
    with open(name, 'r', encoding='utf8') as f:
        return f.read()
def save_txts(txts, file='f:\\1.txt'):
    """Write each string in *txts* to *file*, one per line, gb18030-encoded.

    Fixes: the original truncated the file with the Windows-only shell
    command ``os.system('copy NUL ...')`` (a shell-injection risk with an
    attacker-controlled path) and reopened the file in append mode once
    per line.  Opening once in 'w' mode truncates and writes in a single
    pass with identical resulting file content.

    Parameters:
        txts -- iterable of lines, each written with a trailing newline.
        file -- target path; kept as-is for the actual open() call.

    Returns:
        The path with doubled backslashes collapsed to single ones,
        matching the original return value.
    """
    file1 = file.replace('\\\\', '\\')
    # 'w' mode truncates the file, so no shell command is needed.
    with open(file, 'w', encoding='gb18030') as f:
        for line in txts:
            f.write(line + '\n')
    return file1
def post(top_txt, txt, post_0='54'):
    """Forward one article (title + BBCode body) to the forum.

    Thin wrapper around the project module ``hqbsh_post``: *post_0* is
    passed through as the target board id (fid); type id and file dir
    are left empty.
    """
    # Hard-coded forum credentials.  NOTE(review): consider moving these
    # out of the source file.
    username = '2'
    password = '2'
    hqbsh_post.main(username, password, post_0, '', '', top_txt, txt)
def get_html_2(url, headers, url_0='', post_0='54'):
    """Fetch one article page, convert it to BBCode text, and post it.

    Parameters:
        url     -- full URL of the article page.
        headers -- HTTP headers dict (User-Agent) passed to requests.
        url_0   -- site root, prepended to relative image paths.
        post_0  -- forum board id forwarded to post().

    Returns '' (and skips uploading) when the article contains a banned
    keyword; otherwise returns None after calling post().

    Fixes: removed the stray ``pass`` and dead commented-out code, and
    guarded against <img> tags without a ``src`` attribute (the original
    would raise ``TypeError`` on ``'http' in None``).
    """
    print('\n准备进入:', url)
    resp = requests.get(url, headers=headers, verify=False)
    resp.encoding = 'utf8'
    soup = BeautifulSoup(resp.text, 'lxml')
    box = soup.find(class_='con_box')
    # The first <h2> inside the article container holds the headline.
    title = box.find_all('h2')[0].text
    print('抬头:', title)
    body = box.find(class_='content_main')
    txt = ''
    # Split the raw HTML on closing </p> so each paragraph (text or
    # image) can be handled independently.
    for chunk in str(body).split('</p>'):
        if chunk == '':
            continue
        chunk = chunk + '</p>'
        para = BeautifulSoup(chunk, 'lxml')
        if '<img' in chunk:
            imgs = para.find_all('img')
            if imgs != []:
                for img in imgs:
                    src = img.get('src')
                    if src is None:
                        # <img> with no src attribute: nothing to embed.
                        continue
                    if 'http' in src:
                        txt = txt + '[img]' + src + '[/img]' + '\n'
                        print(src)
                    else:
                        # Relative path: prefix the site root.
                        txt = txt + '[img]' + url_0 + src + '[/img]' + '\n'
                        print(url_0 + src)
            continue
        para_txt = para.text
        # Strip the paper's own byline/boilerplate from the paragraph.
        if '香港商報' in para_txt:
            lead = re.findall(r'(【香港.*報道:)', para_txt)
            if lead != []:
                para_txt = para_txt.replace(lead[0], '')
        para_txt = para_txt.replace('【香港商報網訊】', '')
        para_txt = para_txt.replace('【香港商报网讯】', '')
        para_txt = para_txt.replace('香港商报', '')
        txt = txt + para_txt + '\n'
    print('\n\n' + txt)
    # Keyword filter: skip the upload entirely when any banned term
    # appears in the title or body.
    kill_txt = ['暴亂', '黑暴', '港獨', '五大訴求', '國安法', '香港自治法案']
    for word in kill_txt:
        if word in title + txt:
            print('%s在txt里面,该条目不上传。' % word)
            print('\a')
            time.sleep(5)
            return ''
    txt = txt.replace('http://', 'https://')
    post(title, txt, post_0)
def get_html_1(url, headers, post_0='54'):
    """Fetch a section index page and process every article dated yesterday.

    Parameters:
        url     -- section index URL (e.g. .../node_61.html).
        headers -- HTTP headers dict passed to requests.
        post_0  -- forum board id forwarded to get_html_2().

    Fixes: the article link is now read via ``anchors[0].get('href')``
    instead of a greedy regex over the serialized tag (which depended on
    attribute order and raised IndexError when it failed to match); the
    unused ``title`` local was removed.
    """
    resp = requests.get(url, headers=headers, verify=False)
    resp.encoding = 'utf8'
    soup = BeautifulSoup(resp.text, 'lxml')
    listing = soup.find(class_="imps_new").find(class_="cat_new_zs cat_newlist")
    uls = listing.find_all('ul')
    print(len(uls[0]))
    # Articles are selected by yesterday's date stamp (YYYY-MM-DD).
    yesterday = (datetime.datetime.now() + datetime.timedelta(days=-1)).date().strftime('%Y-%m-%d')
    print(yesterday)
    url_0 = 'http://www.hkcd.com/'
    urls = []
    n = 0
    for item in uls[0]:
        if yesterday not in str(item):
            continue
        anchors = item.find_all('a')
        if anchors == []:
            continue
        href = anchors[0].get('href')
        if not href:
            # Anchor without an href target: nothing to fetch.
            continue
        full_url = url_0 + href
        urls.append(full_url)
        get_html_2(full_url, headers, url_0, post_0)
        n += 1
    print(n, '个数据已经处理。')
def main():
    """Crawl both configured sections of hkcd.com and forward articles."""
    user_agent="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
    headers = {"User-Agent": user_agent}
    # (section index URL, forum board id) pairs: 人在香港, 民用科技.
    sections = [
        ('http://www.hkcd.com/node_61.html', '54'),
        ('http://www.hkcd.com/node_24.html', '46'),
    ]
    os.system('cls')
    for section_url, board_id in sections:
        get_html_1(section_url, headers, board_id)
# Script entry point.  NOTE(review): kill_system_dl() is defined in the
# project module yh_dl; its exact effect is not visible from this file.
if __name__=='__main__':
    yh_dl.kill_system_dl()
    main()
复制代码 |
|