|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 kikyy 于 2020-6-14 10:52 编辑
这是我创建的爬虫,正则表达式r'.*\?query=python&page=\d+.*',r'.*/job_detail/[a-z|0-9|A-Z]{27}~\.html.*',也可以找到 示例: #/c100010000/?query=python&page=3
#/job_detail/fcf44698faf5ad540X183Nm0E1E~.html ,为什么还是爬取不到数据呢?求大神帮帮小白
是不是因为爬取到的链接不完整?可是我在爬wxapp社区时,为什么没有这个问题呢?
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from bosszhiping.items import BosszhipingItem
class ZhipingSpiderSpider(CrawlSpider):
    """Crawl the python job search results on zhipin.com.

    Starts from the first search-result page, follows pagination links and
    job-detail links via the ``rules`` below, and yields one
    ``BosszhipingItem`` per job-detail page.
    """

    name = 'zhiping_spider'
    allowed_domains = ['zhipin.com']
    start_urls = ['https://www.zhipin.com/c100010000/?query=python&page=1']

    # Example links the two rules are meant to match:
    #   /c100010000/?query=python&page=3
    #   /job_detail/fcf44698faf5ad540X183Nm0E1E~.html
    rules = (
        # Pagination links of the search results.
        Rule(
            LinkExtractor(allow=r'.*\?query=python&page=\d+.*'),
            callback='parse_ture',
            follow=True,
        ),
        # Job-detail pages. Inside a character class `|` is a literal pipe,
        # not alternation, so the class is written as plain ranges.
        Rule(
            LinkExtractor(allow=r'.*/job_detail/[a-zA-Z0-9]{27}~\.html.*'),
            # The callback name must match the method name exactly; a
            # misspelled name means the rule has no usable callback.
            callback='parse_detail',
            follow=True,
        ),
    )
    # NOTE(review): if neither callback fires after these fixes, inspect the
    # raw responses -- the site may be serving a block/redirect page instead
    # of the listing. This cannot be confirmed from this file.

    def parse_ture(self, response=None):
        """Print a progress banner for each search-result page.

        Args:
            response: The page response passed in by Scrapy. It is unused,
                but the parameter must exist because Scrapy always calls a
                rule callback with the response as an argument.
        """
        print('='*60)
        print('正在爬取1')
        print('=' * 60)

    def parse_detail(self, response):
        """Extract the job fields from a job-detail page.

        Args:
            response: The job-detail page response.

        Yields:
            BosszhipingItem: with ``name``, ``work``, ``salary`` and ``city``;
            any field whose XPath matches nothing is ``None``.
        """
        print('='*60)
        print('正在爬取2')
        print('=' * 60)
        name = response.xpath("//h2[@class='name']/text()").get()
        title_box = response.xpath(
            "//div[@class='info-primary']//div[@class='name']"
        )
        work = title_box.xpath("./h1/text()").get()
        salary = title_box.xpath("./span/text()").get()
        # `text()` selects text nodes; a bare `text` would look for <text>
        # elements. `.get()` returns None on no match instead of raising
        # IndexError the way `.extract()[0]` does.
        city = response.xpath(
            "//div[@class='info-primary']//p//text()"
        ).get()
        yield BosszhipingItem(name=name, work=work, salary=salary, city=city)

    # Backward-compatible alias for the original (misspelled) method name.
    parse_detal = parse_detail
|
|