| 
 | 
 
 
发表于 2020-10-12 17:17:50
|
显示全部楼层
 
 
 
这样可以了,用正则。BeautifulSoup 不知道怎么用,正则有时更方便。
"""Fetch one page of a Zhihu user's posts and dump the post bodies to test.txt.

The page source is scanned with a regular expression for every span that sits
between ``"content":"`` and ``","commentCount"``; each captured span has a few
escaped paragraph tags turned into newlines and is written to ``test.txt``,
followed by a blank line.
"""
import re

import requests

url = 'https://www.zhihu.com/people/meng-wa-83-63/posts?page=3'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36 Edg/86.0.622.38'}

# Without a timeout requests waits indefinitely on a stalled connection.
request_timeout = 10  # seconds

response = requests.get(url=url, headers=headers, timeout=request_timeout)
# Fail loudly on 4xx/5xx instead of regex-scanning an error page and
# overwriting test.txt with nothing useful.
response.raise_for_status()
html_str = response.content.decode()

# Non-greedy capture of everything between the "content" key and the
# "commentCount" key that follows it in the page source.
pattern = re.compile(r'"content":"(.*?)","commentCount"')
data = pattern.findall(html_str)

# (escaped markup, replacement) pairs, applied in this order.  The escapes
# are matched literally (raw strings): \u003C is "<", \u003E is ">",
# \u002F is "/".
# NOTE(review): if the page source backslash-escapes the attribute quotes
# (class=\"...\"), the third pattern never matches - confirm against a real
# response.
replacements = (
    (r'\u003Cp\u003E', '\n'),
    (r'\u003C\u002Fp\u003E', '\n'),
    (r'\u003Cp class="ztext-empty-paragraph"\u003E\u003Cbr\u002F\u003E', '\n'),
)

with open('test.txt', 'w', encoding='utf-8') as f:
    for raw in data:
        text = raw
        for escaped, plain in replacements:
            text = text.replace(escaped, plain)
        f.write(text)
        # Two newlines: end the post and leave one blank line before the next.
        f.write('\n\n')
 
 
  复制代码 |   
 
 
 
 |