|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
如题所述,爬取的是知乎的一个页面,为什么会打印出乱码?
完整代码如下:
- import json
- import time
- import requests
- import os
- from bs4 import BeautifulSoup
- # -*- coding:utf-8 -*-
- # 获取cookies
- ckdesk = os.path.join(os.path.expanduser("~"), 'Desktop') + '\\知乎数据更新\\zhihucookie.txt'
- with open(ckdesk,'r') as f :
- cookie = f.read()
- f.close()
- cookie = json.loads(cookie)
- print('cookie读取成功')
- desk = os.path.join(os.path.expanduser("~"), 'Desktop') + '\\知乎数据更新\\知乎问答数据.xlsx'
- # url设置
- url = 'https://www.zhihu.com/question/395871463'
- # 请求头
- headers = {
- 'authority': 'www.zhihu.com',
- 'method': 'GET',
- 'path': url.replace("https://www.zhihu.com/", ""),
- 'scheme': 'https',
- 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
- 'accept-encoding': 'gzip, deflate, br',
- 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
- 'cache-control': 'max-age=0',
- 'referer': url,
- 'sec-fetch-dest': 'document',
- 'sec-fetch-mode': 'navigate',
- 'sec-fetch-site': 'same-origin',
- 'sec-fetch-user': '?1',
- 'upgrade-insecure-requests': '1',
- 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
- }
- # 请求页面,获取数据
- res = requests.get(url=url, cookies=cookie, headers=headers)
- session = requests.session()
- login_response = session.post(url, headers=headers)
- # res.encoding = 'utf-8'
- # print(res.encoding)
- soup = BeautifulSoup(res.text, 'html.parser',)
- print(soup)
复制代码
把headers中'accept-encoding': 'gzip, deflate, br'的br去掉
|
|