抓取香港商报数据，并上传网站更新数据

yh6788 · 发表于 2020-9-18 16:22:23

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

本来可以用dict的，但是只有两个模块，不想折腾了。

#香港商报。
#by yh6788
#coding:utf-8
import yh_fanyi,yh_dl
import time,datetime,os,requests,re
from bs4 import BeautifulSoup
import re
import hqbsh_post
import chardet
import html
# Turn off urllib3 warnings for the whole process. Presumably this hides the
# warning raised for every request made with verify=False further down in
# this script -- confirm before removing.
requests.packages.urllib3.disable_warnings()
def save(html, file='f:\\1.txt'):
    """Dump ``str(html)`` to a scratch file and return the text read back.

    Args:
        html: Any object; it is converted with ``str()`` before writing.
            (Inside this function the name hides the imported ``html``
            module, which is not used here.)
        file: Path of the scratch file. Defaults to the location the
            script has always used.

    Returns:
        The file content read back after writing.
    """
    text = str(html)
    with open(file, 'w', encoding='utf8') as f:
        f.write(text)
    # Read back with the same codec that was used for writing. Decoding the
    # UTF-8 bytes as GB18030 returned garbled text for any non-ASCII input.
    with open(file, 'r', encoding='utf8') as f:
        return f.read()
def save_txts(txts, file='f:\\1.txt'):
    """Overwrite *file* with the given lines, one per row, GB18030-encoded.

    Args:
        txts: Sequence of strings; each is written followed by a newline.
        file: Destination path. Any previous content is discarded.

    Returns:
        *file* with doubled backslashes collapsed to single ones.
    """
    file1 = file.replace('\\\\', '\\')
    # Opening in 'w' mode creates or truncates the file, so no shell command
    # is needed to empty it first, and the file is opened once rather than
    # once per line.
    with open(file, 'w', encoding='gb18030') as f:
        f.writelines(line + '\n' for line in txts)
    return file1
def post(top_txt, txt, post_0='54'):
    """Hand one article to ``hqbsh_post.main`` for upload.

    Args:
        top_txt: Passed as the sixth argument (the caller supplies the
            article title).
        txt: Passed as the last argument (the caller supplies the body).
        post_0: Passed as the third argument, the forum/board id.
    """
    # Placeholder login pair; replace with real credentials before running.
    username, password = '2', '2'
    # The two empty strings are the type id and the attachment directory.
    hqbsh_post.main(username, password, post_0, '', '', top_txt, txt)
def get_html_2(url, headers, url_0='', post_0='54', timeout=30):
    """Fetch one article page, convert it to forum text and post it.

    The article body is split on ``</p>``. Paragraphs containing images
    become ``[img]...[/img]`` lines; all other paragraphs contribute their
    text with the newspaper's byline markers removed.

    Args:
        url: Address of the article page.
        headers: HTTP headers passed to ``requests.get``.
        url_0: Base URL used to resolve relative image addresses.
        post_0: Forum/board id forwarded to ``post``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``''`` when the article is skipped (unexpected page layout or a
        blocked keyword); ``None`` after the article has been posted.
    """
    from urllib.parse import urljoin

    print('\n准备进入：', url)
    # NOTE(review): verify=False turns off TLS certificate checking.
    response = requests.get(url, headers=headers, verify=False, timeout=timeout)
    response.encoding = 'utf8'
    page = BeautifulSoup(response.text, 'lxml')

    # Bail out when the expected containers are missing. Previously a missing
    # body was stringified to "None" and posted, and a missing title crashed.
    container = page.find(class_='con_box')
    heading = container.find('h2') if container is not None else None
    body = container.find(class_='content_main') if container is not None else None
    if heading is None or body is None:
        print('页面结构不符，跳过：', url)
        return ''

    title = heading.text
    print('抬头：', title)

    lines = []
    for chunk in str(body).split('</p>'):
        if chunk == '':
            continue
        chunk += '</p>'
        paragraph = BeautifulSoup(chunk, 'lxml')

        images = paragraph.find_all('img') if '<img' in chunk else []
        if images:
            for image in images:
                src = image.get('src')
                if not src:
                    # An <img> without a src has nothing to link to.
                    continue
                if not src.startswith(('http://', 'https://')):
                    # urljoin returns src unchanged when url_0 is empty.
                    src = urljoin(url_0, src)
                print(src)
                lines.append('[img]' + src + '[/img]')
            # NOTE(review): the pasted source lost its indentation; this
            # `continue` is reconstructed as "image paragraphs contribute no
            # text" -- confirm against the original script.
            continue

        text = paragraph.text
        if '香港商報' in text:
            bylines = re.findall(r'(【香港.*報道：)', text)
            if bylines:
                text = text.replace(bylines[0], '')
        for marker in ('【香港商報網訊】', '【香港商报网讯】', '香港商报'):
            text = text.replace(marker, '')
        lines.append(text)

    txt = ''.join(line + '\n' for line in lines)
    print('\n\n' + txt)

    # Articles whose title or body contains any of these terms are not uploaded.
    kill_txt = ['暴亂', '黑暴', '港獨', '五大訴求', '國安法', '香港自治法案']
    searched = title + txt
    for word in kill_txt:
        if word in searched:
            print('%s在txt里面，该条目不上传。' % word)
            print('\a')  # terminal bell
            time.sleep(5)
            return ''

    txt = txt.replace('http://', 'https://')
    post(title, txt, post_0)
def get_html_1(url, headers, post_0='54', timeout=30):
    """Scan one listing page and post every entry dated yesterday.

    The first ``<ul>`` inside the ``imps_new`` / ``cat_new_zs cat_newlist``
    containers is walked. Each direct child whose markup contains
    yesterday's date (``YYYY-MM-DD``, local time) has its first link handed
    to ``get_html_2``.

    Args:
        url: Address of the listing page.
        headers: HTTP headers passed to ``requests.get``.
        post_0: Forum/board id forwarded to ``get_html_2``.
        timeout: Seconds to wait for the server before giving up.
    """
    from urllib.parse import urljoin

    # NOTE(review): verify=False turns off TLS certificate checking.
    response = requests.get(url, headers=headers, verify=False, timeout=timeout)
    response.encoding = 'utf8'
    page = BeautifulSoup(response.text, 'lxml')

    # Each lookup may return None when the site layout changes; report that
    # instead of crashing with AttributeError/IndexError.
    panel = page.find(class_="imps_new")
    listing = panel.find(class_="cat_new_zs cat_newlist") if panel is not None else None
    lists = listing.find_all('ul') if listing is not None else []
    if not lists:
        print('页面结构不符，跳过：', url)
        print(0, '个数据已经处理。')
        return
    entries = lists[0]
    print(len(entries))

    yesterday = (datetime.datetime.now() + datetime.timedelta(days=-1)).date().strftime('%Y-%m-%d')
    print(yesterday)

    url_0 = 'http://www.hkcd.com/'
    n = 0
    # find_all(True, recursive=False) yields only the direct child tags, so
    # whitespace text nodes between entries are never inspected.
    for entry in entries.find_all(True, recursive=False):
        if yesterday not in str(entry):
            continue
        links = entry.find_all('a')
        if not links:
            continue
        # Read the attribute from the parsed tag rather than regex-matching
        # the serialized HTML, which depended on `target=` following `href`.
        href = links[0].get('href')
        if not href:
            continue
        get_html_2(urljoin(url_0, href), headers, url_0, post_0)
        n += 1
    print(n, '个数据已经处理。')
def main():
    """Clear the console, then process each configured listing page in order."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
    }
    # (listing page, target forum id) pairs, processed top to bottom.
    sections = (
        ('http://www.hkcd.com/node_61.html', '54'),  # "People in Hong Kong"
        ('http://www.hkcd.com/node_24.html', '46'),  # "Civilian technology"
    )
    os.system('cls')
    for listing_url, forum_id in sections:
        get_html_1(listing_url, headers, forum_id)
if __name__ == '__main__':
    # Script entry point. kill_system_dl() is a helper from the project's
    # yh_dl module; what it does is not visible from this file.
    yh_dl.kill_system_dl()
    main()

复制代码

yue2677678183 · 发表于 2020-12-23 15:03:26

你那个妹子图代码出问题了，你能更新更新吗

账号		自动登录	找回密码
密码			立即注册

[作品展示] 抓取香港商报数据，并上传网站更新数据

马上注册，结交更多好友，享用更多功能^_^

浏览过的版块