|
楼主 |
发表于 2020-10-28 21:47:51
|
显示全部楼层
import json
import pandas
import requests
from bs4 import BeautifulSoup
import time
import random
import csv
cookies={
'Accept':'application/json, text/plain, */*',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Connection':'keep-alive',
'Cookie':'XSRF-TOKEN=a8637b; WEIBOCN_FROM=1110006030; MLOGIN=1; M_WEIBOCN_PARAMS=from%3Dpage_1001061925694254_profile%26oid%3D4341943448410727%26luicode%3D20000061%26lfid%3D4558599567448599%26uicode%3D20000061%26fid%3D4341943448410727; loginScene=102003; SUB=_2A25ynWtbDeRhGeVP7FAV8y3OyDiIHXVufnUTrDV6PUJbkdAKLVXckW1NTOPEyyCKN8fWKgTZSR87tLPlVkVV9m0K; SUHB=0sqBPJ7_yIjGMS; _T_WM=66965420006',
'Host':'m.weibo.cn',
'MWeibo-Pwa':'1',
'Referer':'https://m.weibo.cn/detail/4558599567448599',
'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
'X-Requested-With':'XMLHttpRequest',
'X-XSRF-TOKEN':'deea1b'
}
#抓取单页面内的标题、时间、正文
def getNewsDetail(data):
result = {}
try:
result['内容'] = data['text'].strip()
result['点赞数'] = data['like_count']
result['评论数'] = data['total_number']
result['评论人'] = data['user']['screen_name']
result['评论时间'] = data['created_at']
except:
print('shibai')
return result
def parseListLinks(data):
newsdetails = []
for ent in data['data']['data']:
try:
newsdetails.append(getNewsDetail(ent))
except:
print('shibai')
return newsdetails
url = 'https://m.weibo.cn/comments/hotflow?id=4341943448410727&mid=4341943448410727&max_id_type=0'
b=0#page
for i in range(1,5):
news_total = []
res = requests.get(url,cookies=cookies)
res.encoding = 'utf-8'
jd = json.loads(res.text)
max_id = str(jd['data']['max_id'])
newsary = parseListLinks(jd)
news_total.extend(newsary)
df = pandas.DataFrame(news_total)
df.to_csv('20101028test.csv',mode='a',encoding='utf-8')
b+=1
print('page=',b)
url = 'https://m.weibo.cn/comments/hotflow?id=4341943448410727&mid=4341943448410727&max_id='+max_id+'&max_id_type=0'
print(url)
time.sleep(random.randint(3,8))
代码差不多就这些,我是在jupyter notebook里运行的,所以没怎么规整,一块是一块,后面的还没用到,因为抓取第一页不会出问题,从url第一次变更就会有问题。代码多的不用看,光运行下面这一部分就会有问题。
import json
import requests
cookies={
'Accept':'application/json, text/plain, */*',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Connection':'keep-alive',
'Cookie':'XSRF-TOKEN=a8637b; WEIBOCN_FROM=1110006030; MLOGIN=1; M_WEIBOCN_PARAMS=from%3Dpage_1001061925694254_profile%26oid%3D4341943448410727%26luicode%3D20000061%26lfid%3D4558599567448599%26uicode%3D20000061%26fid%3D4341943448410727; loginScene=102003; SUB=_2A25ynWtbDeRhGeVP7FAV8y3OyDiIHXVufnUTrDV6PUJbkdAKLVXckW1NTOPEyyCKN8fWKgTZSR87tLPlVkVV9m0K; SUHB=0sqBPJ7_yIjGMS; _T_WM=66965420006',
'Host':'m.weibo.cn',
'MWeibo-Pwa':'1',
'Referer':'https://m.weibo.cn/detail/4558599567448599',
'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
'X-Requested-With':'XMLHttpRequest',
'X-XSRF-TOKEN':'deea1b'
}
url='https://m.weibo.cn/comments/hotflow?id=4558599567448599&mid=4558599567448599&max_id=167296692120734&max_id_type=0'
res = requests.get(url,cookies=cookies)
print(res)
print(res.text)
|
|