from bs4 import BeautifulSoup as bs
import requests
import re

# Results collected from Jianshu's recommended-users page
writer = []    # author names
own_page = []  # profile page URLs
recent = []    # most recent article title per author
prize = []     # per-author stats: following, followers, articles, words, likes

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}
url = 'https://www.jianshu.com/recommendations/users'
# Annoyingly, some users' counts carry a 'w' (ten-thousand) suffix, e.g. '1.2w'
pat = re.compile(r'<p>(\d+(?:\.\dw)?)</p>')

page = requests.get(url, headers=headers).content
soup = bs(page, 'lxml')  # the 'lxml' parser requires the lxml package to be installed
names = soup.find_all('h4', class_='name')
titles = soup.find_all('div', class_='recent-update')

# Author names
for name in names:
    writer.append(name.get_text())

# Most recent update
for title in titles:
    recent.append([title.a.get_text()])

# Personal profile pages
for name in names:
    own_page.append('https://www.jianshu.com' + name.parent.attrs['href'])

# Following / followers / articles / word count / likes
for each_url in own_page:
    wrap = []
    page = requests.get(each_url, headers=headers).content
    soup = bs(page, 'lxml')
    for data in soup.find_all('div', class_='meta-block'):
        info = re.findall(pat, str(list(data.children)[1]))
        wrap.append(info[0])
    prize.append(wrap)
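As a follow-up sketch (not part of the original script), the collected lists can be zipped into rows and written out with the standard csv module. The column order follows the comment above (following, followers, articles, words, likes); the output filename jianshu_users.csv is just a placeholder.

# Sketch only: assumes writer, own_page, recent and prize were populated by the script above
import csv

with open('jianshu_users.csv', 'w', newline='', encoding='utf-8-sig') as f:
    w = csv.writer(f)
    w.writerow(['author', 'profile_url', 'recent_update',
                'following', 'followers', 'articles', 'words', 'likes'])
    for name, page_url, title, stats in zip(writer, own_page, recent, prize):
        # recent holds one-element lists, so take the first item;
        # stats may be shorter than 5 entries if a meta-block did not match the regex
        w.writerow([name, page_url, title[0]] + stats)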