|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
功能:爬取微博热搜前50,显示实时时间,并将结果导出到csv。但是有个问题迟迟无法解决:导出的csv里面,每个单元格的内容都带有方括号[]和单引号''。
如图:
当前时间 b'2022-02-28 15:28:06'
热搜排名 内容 热度
['1'] ['吴倩张雨剑新剧的名字'] ['剧集 4825754']
['2'] ['俄乌局势对中国有何启示'] [' 2248885']
['3'] ['运动员是冬残奥会开幕式主角'] [' 2028425']
['4'] ['香港新增确诊病例约34466例'] [' 1955103']
['5'] ['郑州一小学统一女生头绳颜色'] [' 1763060']
['6'] ['俄乌谈判会场准备完毕'] [' 1297432']
['•'] ['工作三年你有哪些变化'] [' ']
['7'] ['乌代表团希望抵达谈判地点后休息'] [' 1221655']
['8'] ['中国一年吃掉近50亿只白羽肉鸡'] [' 1128963']
['9'] ['蔡文姬花朝如约'] [' 1048941']
['10'] ['香港幼儿园及中小学暑假提前至3月7日开始'] [' 995869']
['11'] ['三亚已暂停所有前往北京的航班'] [' 994974']
import csv
import requests
from lxml import etree
import time
# Fetch the Weibo real-time hot-search page, echo every table row to the
# console, and append the scraped rows to a CSV file.
#
# NOTE: each xpath() call below returns a *list* of text nodes, and that list
# is stored as the cell value unchanged.  csv.writer therefore writes the
# list's repr, which is why exported cells look like ['1'] or ['some title'].
url = 'https://s.weibo.com/top/summary?cate=realtimehot'
request_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36',
    'Cookie': 'SUB=_2AkMVX_Daf8NxqwJRmP8dzWzrboh0zA3EieKjAwEBJRMxHRl-yT9jqnAatRB6Pt_eNXUD4Q6s4uR7shXrYHP6N5s0DWjy; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9W5zapBNnh4B9Tkqsf9MdbS-; SINAGLOBAL=9740631714924.42.1644396619729; _s_tentry=-; Apache=7548092887889.8545.1644754490235; ULV=1644754490284:2:2:1:7548092887889.8545.1644754490235:1644396619996',
}

response = requests.get(url, headers=request_headers)
print("页响应状态码:", response.status_code)
page = etree.HTML(response.text)

# XPath templates for the three columns; "{}" is the 1-based <tr> position.
title_xpath = '// *[ @ id = "pl_top_realtimehot"] / table / tbody / tr[{}] / td[2] / a/text()'
rank_xpath = '//*[@id="pl_top_realtimehot"]/table/tbody/tr[{}]/td[1]/text()'
heat_xpath = '//*[@id="pl_top_realtimehot"]/table/tbody/tr[{}]/td[2]/span/text()'

ranks = []
titles = []
heats = []

print("当前时间是:", time.strftime("%Y-%m-%d %H:%M:%S"))
print("微博热搜榜第1~50名:")

# Table rows tr[2] .. tr[52] are queried (51 rows in total).
for tr_position in range(2, 53):
    titles.append(page.xpath(title_xpath.format(tr_position)))
    ranks.append(page.xpath(rank_xpath.format(tr_position)))
    heats.append(page.xpath(heat_xpath.format(tr_position)))
    # Unpacking the lists prints the bare text nodes separated by spaces.
    print(*ranks[-1], *titles[-1], *heats[-1])

header_row = ['热搜排名', '内容', '热度']

# Transpose the three column lists into one [rank, title, heat] list per row.
rows = [list(cells) for cells in zip(ranks, titles, heats)]

# The timestamp is stored as bytes, so the CSV cell reads b'YYYY-mm-dd HH:MM:SS'.
timestamp_row = ['当前时间', time.strftime("%Y-%m-%d %H:%M:%S").encode('utf-8')]

with open(r'C:\Users\Public\Desktop\weibo20.csv', 'a+', encoding='UTF-8', newline='') as out_file:
    writer = csv.writer(out_file)
    writer.writerow(timestamp_row)
    writer.writerow(header_row)
    writer.writerows(rows)
import csv
import requests
from lxml import etree
import time
# Fetch the Weibo real-time hot-search page, print every row that was found,
# and append a timestamped snapshot to a CSV file.
#
# Each cell is stored as a plain string (the first text node matched by the
# XPath), so the exported CSV no longer contains list reprs such as ['1'].
link = 'https://s.weibo.com/top/summary?cate=realtimehot'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36',
    'Cookie': 'SUB=_2AkMVX_Daf8NxqwJRmP8dzWzrboh0zA3EieKjAwEBJRMxHRl-yT9jqnAatRB6Pt_eNXUD4Q6s4uR7shXrYHP6N5s0DWjy; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9W5zapBNnh4B9Tkqsf9MdbS-; SINAGLOBAL=9740631714924.42.1644396619729; _s_tentry=-; Apache=7548092887889.8545.1644754490235; ULV=1644754490284:2:2:1:7548092887889.8545.1644754490235:1644396619996',
}

# XPath of one table row; "{}" is the 1-based <tr> position.
row_xpath = '//*[@id="pl_top_realtimehot"]/table/tbody/tr[{}]'


def _first_text(nodes):
    """Return the first item of an xpath() result, or '' when nothing matched."""
    return nodes[0] if nodes else ''


# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(link, headers=headers, timeout=10)
print("页响应状态码:", r.status_code)
html = etree.HTML(r.text)

# Take the timestamp once so the console and the CSV show the same value.
now = time.strftime("%Y-%m-%d %H:%M:%S")
print("当前时间是:", now)
print("微博热搜榜第1~50名:")

m = []  # one [rank, title, heat] row per hot-search entry
for i in range(51):
    row = row_xpath.format(i + 2)
    title_nodes = html.xpath(row + '/td[2]/a/text()')
    if not title_nodes:
        # The row does not exist (or has no title link).  Indexing [0] on the
        # empty result would raise IndexError, so skip the row instead.
        continue
    rank = _first_text(html.xpath(row + '/td[1]/text()'))
    title = title_nodes[0]
    # Some rows have no heat <span>; fall back to an empty cell.
    heat = _first_text(html.xpath(row + '/td[2]/span/text()'))
    m.append([rank, title, heat])
    # Print the strings directly; star-unpacking a str would print it one
    # character at a time.
    print(rank, title, heat)

outlist = ['热搜排名', '内容', '热度']

# NOTE(review): the timestamp is deliberately kept as bytes, so the CSV cell
# reads b'YYYY-mm-dd HH:MM:SS'.  The original author's comment says this suffix
# gets rid of the '#' in the exported date -- presumably the '#####' a
# spreadsheet shows for a narrow date column; confirm before changing it.
s = ['当前时间', now.encode('utf-8')]

with open(r'C:\Users\Public\Desktop\weibo20.csv', 'a+', encoding='UTF-8', newline='') as csvfile:
    w = csv.writer(csvfile)
    w.writerow(s)
    w.writerow(outlist)
    w.writerows(m)
|
|