Crawler problem, running in PyCharm
import time
import json
import pandas as pd
import requests
import re
header = {'content-type': 'application/json; charset=utf-8', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'}
Cookie = {'Cookie': '_T_WM: 16302431710; WEIBOCN_FROM=1110006030; ALF: 1595143722; SUB: _2A25z6Bl7DeRhGeBN7FcS8CbMyD-IHXVREqczrDV6PUNbktANLVH9kW1NRCJy5XgYpfllO6KntJ0z9VlG3WS8Dk5B;SUBP: 0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh-s9cYGJn3yMhQAD89z7Ve5JpX5KzhUgL.Foq0S0-0ehn7e0e2dJLoIpjLxK-LBo5LBo.LxKqL1-eL1h.LxKqLBozLB.zt; SUHB: 0auQfAd6YqJgiP; SSOLoginState=1592205572; MLOGIN=1; M_WEIBOCN_PARAMS: luicode%3D10000011%26lfid%3D102803%26fid%3D231522type%253D1%2526t%253D10%2526q%253D%2523%25E5%25A4%2596%25E4%25BA%25A4%25E9%2583%25A8%25E5%259B%259E%25E5%25BA%2594%25E4%25B8%25AD%25E5%258D%25B0%25E8%25BE%25B9%25E5%25A2%2583%25E5%2586%25B2%25E7%25AA%2581%2523%26uicode%3D10000011; XSRF-TOKEN: 49dc89'}
for ii in range(100):
    url_base = 'https://m.weibo.cn/api/container/getIndex?containerid=231522type%3D1%26t%3D10%26q%3D%23%E5%A4%96%E4%BA%A4%E9%83%A8%E5%9B%9E%E5%BA%94%E4%B8%AD%E5%8D%B0%E8%BE%B9%E5%A2%83%E5%86%B2%E7%AA%81%23&extparam=%23%E5%A4%96%E4%BA%A4%E9%83%A8%E5%9B%9E%E5%BA%94%E4%B8%AD%E5%8D%B0%E8%BE%B9%E5%A2%83%E5%86%B2%E7%AA%81%23&luicode=10000011&lfid=102803&page_type=searchall'
    url = url_base + str(ii + 1)
    # print(url)
    html = requests.get(url, headers=header, cookies=Cookie)
    print(html.text)
    try:
        for jj in range(len(html.json()['data']['cards'])):
            print(html.json()['data']['cards']['mblog']['isLongText'])
            if not html.json()['data']['cards']['mblog']['isLongText']:
                data1 = [(html.json()['data']['cards']['mblog']['user']['id'],
                          html.json()['data']['cards']['mblog']['user']['screen_name'],
                          html.json()['data']['cards']['mblog']['reposts_count'],
                          html.json()['data']['cards']['mblog']['comments_count'],
                          html.json()['data']['cards']['mblog']['attitudes_count'],
                          html.json()['data']['cards']['mblog']['text'],
                          html.json()['data']['cards']['mblog']['created_at'],
                          html.json()['data']['cards']['mblog']['source'])]
            else:
                data1 = [(html.json()['data']['cards']['mblog']['user']['id'],
                          html.json()['data']['cards']['mblog']['user']['screen_name'],
                          html.json()['data']['cards']['mblog']['reposts_count'],
                          html.json()['data']['cards']['mblog']['comments_count'],
                          html.json()['data']['cards']['mblog']['attitudes_count'],
                          html.json()['data']['cards']['mblog']['longText']['longTextContent'],
                          html.json()['data']['cards']['mblog']['created_at'],
                          html.json()['data']['cards']['mblog']['source'])]
            data2 = pd.DataFrame(data1)
            data2.to_csv('微博评论.csv', header=False, index=False, mode='a+')
    except:
        print("抓取失败")
    print('page ' + str(ii + 1) + ' has done')
    time.sleep(3)
1. PyCharm reports import json as unused (no references).
2. The try/except always lands in the except branch and just prints 抓取失败 every time.
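On problem 2: the bare except: swallows whatever exception is actually raised inside the loop, so all that ever shows up is 抓取失败. A minimal sketch of how the real error could be surfaced (the traceback usage below is only an illustration, not part of the original code):

import traceback

try:
    data = html.json()['data']          # html is the response object from requests.get above
    for jj in range(len(data['cards'])):
        ...                             # same parsing as in the original loop
except Exception as e:
    traceback.print_exc()               # shows the real exception type and the line it came from
    print('抓取失败:', e)

Once the actual exception is visible, it is much easier to tell which key or index is wrong.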
Are we supposed to come up with the question ourselves?

?? Where's the question?

suchocolate posted on 2020-6-23 19:17:
Are we supposed to come up with the question ourselves?
I guess the code's the problem. I don't know either, and I can't really ask.

Put the Cookie directly into headers:
header = {'content-type': 'application/json; charset=utf-8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Cookie': '_T_WM: 16302431710; WEIBOCN_FROM=1110006030; ALF: 1595143722; SUB: _2A25z6Bl7DeRhGeBN7FcS8CbMyD-IHXVREqczrDV6PUNbktANLVH9kW1NRCJy5XgYpfllO6KntJ0z9VlG3WS8Dk5B;SUBP: 0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh-s9cYGJn3yMhQAD89z7Ve5JpX5KzhUgL.Foq0S0-0ehn7e0e2dJLoIpjLxK-LBo5LBo.LxKqL1-eL1h.LxKqLBozLB.zt; SUHB: 0auQfAd6YqJgiP; SSOLoginState=1592205572; MLOGIN=1; M_WEIBOCN_PARAMS: luicode%3D10000011%26lfid%3D102803%26fid%3D231522type%253D1%2526t%253D10%2526q%253D%2523%25E5%25A4%2596%25E4%25BA%25A4%25E9%2583%25A8%25E5%259B%259E%25E5%25BA%2594%25E4%25B8%25AD%25E5%258D%25B0%25E8%25BE%25B9%25E5%25A2%2583%25E5%2586%25B2%25E7%25AA%2581%2523%26uicode%3D10000011; XSRF-TOKEN: 49dc89'
}
html = requests.get(url, headers=header)
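If requests still seems to misbehave after merging the Cookie into headers, it may help to look at what actually came back before indexing into it. A small sketch, reusing the url and header variables from above:

resp = requests.get(url, headers=header)
print(resp.status_code)                  # 200 means the request itself went through
body = resp.json()
print(list(body.keys()))                 # inspect the top-level keys before indexing into them
if 'data' in body and 'cards' in body['data']:
    print(len(body['data']['cards']), 'cards on this page')
else:
    print(resp.text[:300])               # dump the start of the raw response to see what was returned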
suchocolate posted on 2020-6-23 19:17:
Are we supposed to come up with the question ourselves?
Sorry, that was careless of me.

Python初学者8号 posted on 2020-6-23 20:13:
?? Where's the question?
Added it.

suchocolate posted on 2020-6-23 22:24:
Put the Cookie directly into headers:
That doesn't work. After I used it, requests ran into problems too.
黑夜之惑 posted on 2020-6-24 10:28:
That doesn't work. After I used it, requests ran into problems too.
It works fine on my side.
What exactly is the problem? Post the error output and the code.

Before this line:
if not html.json()['data']['cards']['mblog']['isLongText']:
add a check:
if not html.json()['data']['cards'].get('mblog'):
    continue  # not every entry in cards fits the data filtering that follows
This script never uses the json module, so that import isn't needed; html.json() is a method from the requests library.
Also, this site's cookie doesn't seem to be required; you can try without it.
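For reference, a sketch of how that check could sit inside the inner loop. One assumption here: the original code loops over jj but never uses it to index cards, so this sketch indexes cards[jj]; if that was the intent, it would also explain why every page ended in the except branch:

cards = html.json()['data']['cards']        # parse the response once per page
for jj in range(len(cards)):
    card = cards[jj]                        # index with the loop variable
    if not card.get('mblog'):
        continue                            # skip cards that carry no mblog entry
    mblog = card['mblog']
    if mblog.get('isLongText') and mblog.get('longText'):
        text = mblog['longText']['longTextContent']
    else:
        text = mblog['text']
    data1 = [(mblog['user']['id'], mblog['user']['screen_name'],
              mblog['reposts_count'], mblog['comments_count'],
              mblog['attitudes_count'], text,
              mblog['created_at'], mblog['source'])]
    pd.DataFrame(data1).to_csv('微博评论.csv', header=False, index=False, mode='a+')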
suchocolate posted on 2020-6-24 12:13:
It works fine on my side.
What exactly is the problem? Post the error output and the code.
C:\Users\yinyo\Desktop\图片\1
Did the image come through? I haven't done this before.
The data extraction after try is erroring out, and now I don't know where to change it.
The code is in the first post at the top.

yifenyu posted on 2020-6-24 14:01:
if not html.json()['data']['cards']['mblog']['isLongText']:
Before this line, add a check
i ...
Still the same, it keeps printing 抓取失败.

黑夜之惑 posted on 2020-6-24 18:48:
Did the image come through? I haven't done this before.
The data extraction after try is erroring out, and now I don't know where to change it.
The code is in the first post at the top ...
The code in the top post still has the cookie outside headers; move the cookie into headers and then try again.
Also, if the error screenshot won't upload, you can paste the error output as text.
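Pulling the thread's suggestions together, the outer page loop might end up roughly like this. Two assumptions: the API takes the page number as a page= query parameter (in the original code, str(ii + 1) is concatenated directly onto page_type=searchall), and the Cookie now lives inside header as suggested above:

for ii in range(100):
    url = url_base + '&page=' + str(ii + 1)   # assumption: pagination via a page= parameter
    resp = requests.get(url, headers=header)  # header carries the Cookie, as suggested above
    try:
        cards = resp.json()['data']['cards']
        # ... parse each cards[jj] as in the sketch further up ...
    except Exception as e:
        print('page', ii + 1, 'failed:', e)   # paste this text output when reporting the error
    time.sleep(3)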