I recently went through some more crawler tutorial videos and learned how to scrape data returned in JSON format, so I redid my earlier script that fetches real-time Sina Weibo posts for the keyword '松江'.
It seems this no longer needs a cookie at all and the data can be fetched directly. Will my IP get banned? What kind of anti-scraping measures should I expect?
import requests, json, time


class XinlangWeibo:
    def __init__(self, name):
        self.url = f'https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D61%26q={name}&page_type=searchall&page='
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'}
        self.data = []

    def Get_Data(self, page):  # main routine: fetch pages 1..page of the search results
        for num in range(1, page + 1):
            url = self.url + str(num)
            print(url)
            response = requests.get(url=url, headers=self.headers)
            data_all = json.loads(response.content.decode('utf-8'))
            data_all = data_all['data']['cards']
            for i in range(len(data_all)):
                if data_all[i]['card_type'] == 9:  # card_type 9 is an ordinary post card
                    data_temp = {}
                    data_temp['screen_name'] = data_all[i]['mblog']['user']['screen_name'] or None  # poster's nickname
                    data_temp['profile_url'] = data_all[i]['mblog']['user']['profile_url'] or None  # poster's profile page
                    data_temp['created_time'] = data_all[i]['mblog']['created_at'] or None  # how long ago the post was made
                    data_temp['raw_text'] = data_all[i]['mblog']['raw_text'] or None  # post content
                    data_temp['scheme_url'] = data_all[i]['scheme'] or None  # link to the post
                    self.data.append(data_temp)
            time.sleep(3)  # pause between pages
        return data_all  # raw cards of the last page fetched, for the optional JSON dump

    def Save_Json(self, data_all):
        with open('新浪微博-dataall.json', 'w', encoding='utf-8') as files:  # save the raw JSON
            json.dump(data_all, files, ensure_ascii=False, indent=4)

    def Save_Data(self):
        with open('新浪微博-data.txt', 'w', encoding='utf-8') as file:  # save the parsed posts
            for s in range(0, len(self.data)):
                file.write(f"第{s + 1}个帖子为:"
                           + "\n" + f"微博发帖人为:{self.data[s]['screen_name']}"
                           + "\n" + f"微博发帖人主页为:{self.data[s]['profile_url']}"
                           + "\n" + f"发帖时间为:{self.data[s]['created_time']}"
                           + "\n" + f"帖子内容为:{self.data[s]['raw_text']}"
                           + "\n" + f"帖子链接为:{self.data[s]['scheme_url']}"
                           + "\n" + "*" * 100 + "\n"
                           + "\n")
        print('已保存')


if __name__ == '__main__':
    xinlangweibo = XinlangWeibo(name='松江')  # search keyword
    data_all = xinlangweibo.Get_Data(page=3)  # number of pages to fetch
    # xinlangweibo.Save_Json(data_all)
    xinlangweibo.Save_Data()
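For reference, the json.dump output from Save_Json is a list of cards, and each card the loop keeps (card_type == 9) looks roughly like the illustrative snippet below. Only the keys this script actually reads are shown, and the values are placeholders rather than real data:

# Illustrative shape of one card in data['data']['cards'] (placeholder values);
# only the keys that Get_Data reads are listed here.
card = {
    'card_type': 9,                            # 9 marks an ordinary post card
    'scheme': 'https://m.weibo.cn/status/…',   # link to the post
    'mblog': {
        'created_at': '2小时前',                # relative timestamp string
        'raw_text': '帖子正文……',               # post text
        'user': {
            'screen_name': '某用户',            # poster's nickname
            'profile_url': 'https://m.weibo.cn/u/…'  # poster's profile page
        }
    }
}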
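On the cookie/IP question: the m.weibo.cn search endpoint used above does respond without a cookie (as observed), but rapid requests from one IP can still get throttled. Below is a minimal, hedged sketch of a more defensive fetch helper; fetch_page is a hypothetical name, and the 'ok' check and the delay/backoff numbers are assumptions rather than documented behaviour of the API:

import random
import time

import requests


def fetch_page(url, headers, retries=3):
    # Hypothetical helper: fetch one search page with a random delay, an HTTP status
    # check, and a check on the JSON 'ok' field (assumed to stop being 1 when the
    # endpoint starts throttling). Returns the cards list, or [] after `retries` failures.
    for attempt in range(1, retries + 1):
        time.sleep(random.uniform(2, 5))        # irregular pauses look less bot-like than a fixed sleep(3)
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            body = response.json()
            if body.get('ok') == 1:             # assumption: ok == 1 marks a normal payload
                return body['data']['cards']
        print(f'attempt {attempt} failed, backing off before retrying')
        time.sleep(10 * attempt)                # simple linear backoff
    return []                                   # give up on this page instead of crashing

Inside Get_Data, the three lines from requests.get(...) down to data_all['data']['cards'] could be swapped for data_all = fetch_page(url, self.headers) with everything else unchanged. If throttling still shows up, rotating the User-Agent string or routing through proxies is the usual next step.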