|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
- # -*- coding: utf-8 -*-
- 有没有大神给我讲讲 yield 的具体递归过程,我已经绕晕了
- import scrapy
- from scrapy import Spider,Request
- import json
- from zhihuuser.items import UserItem
class ZhihuSpider(Spider):
    """Crawl Zhihu user profiles by walking each user's followee list.

    Starting from ``start_user``, the spider requests that user's profile
    and followee list.  Every followee found is requested as a profile
    (``parse_user``), and every parsed profile in turn triggers a request
    for its own followee list (``parse_follows``).  The callbacks are
    generators: each ``yield`` hands one item or one ``Request`` back to
    the caller; the callbacks never call each other directly.
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com']
    start_user = 'excited-vczh'

    # Profile endpoint for a single user.
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    # NOTE: the attribute name is misspelled ("uesr"); it is kept as-is so
    # that any external code referring to it keeps working.
    uesr_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'

    # Followee-list endpoint.  The "include" parameter needs "=" between
    # its name and value; without it the query string is malformed.
    follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        """Yield the initial requests: the start user's profile and followees."""
        yield Request(
            self.user_url.format(user=self.start_user, include=self.uesr_query),
            callback=self.parse_user,
        )
        yield Request(
            self.follows_url.format(
                user=self.start_user,
                include=self.follows_query,
                offset=0,
                limit=20,
            ),
            callback=self.parse_follows,
        )

    def parse_user(self, response):
        """Parse one user's profile JSON.

        Yields a ``UserItem`` filled with every declared field present in
        the response, followed by a request for that user's followee list.
        """
        result = json.loads(response.text)

        item = UserItem()
        # item.fields holds every field name declared on UserItem.
        for field in item.fields:
            if field in result:
                item[field] = result[field]
        yield item

        # Each user then asks for their own followee list.  Skip it when the
        # response carries no url_token, otherwise the literal string "None"
        # would be formatted into the URL.
        url_token = result.get('url_token')
        if url_token:
            yield Request(
                self.follows_url.format(
                    user=url_token,
                    include=self.follows_query,
                    limit=20,
                    offset=0,
                ),
                callback=self.parse_follows,
            )

    def parse_follows(self, response):
        """Parse one page of a followee list.

        Yields a profile request for every user listed under ``data`` and,
        when ``paging.is_end`` is false, a request for the next page.
        """
        results = json.loads(response.text)

        # "data" is a list with one entry per followed user.
        for entry in results.get('data') or []:
            url_token = entry.get('url_token')
            if url_token:
                yield Request(
                    self.user_url.format(user=url_token, include=self.uesr_query),
                    callback=self.parse_user,
                )

        # Pagination: follow "next" only when the API explicitly reports
        # that this is not the last page.
        paging = results.get('paging')
        if paging and paging.get('is_end') is False:
            next_page = paging.get('next')
            if next_page:
                yield Request(url=next_page, callback=self.parse_follows)
复制代码
没有多层嵌套~~
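yield 和 return 类似,都会把一个值交给调用者;区别在于,含有 yield 的函数被调用时返回的是一个生成器对象,每迭代一次就执行到下一个 yield、产出一个值然后暂停。这个生成器对象可以当作参数传递,或者被逐个迭代(Scrapy 引擎就是这样逐个取出回调产出的 Request 和 item 的)。
函数用 return 就做不到这一点:return 只能返回一次,返回后函数就结束了。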
|
|