马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 Stubborn 于 2019-4-18 02:30 编辑
这只是部分代码。鱼油们写到这里不禁感叹:我擦,真特么牛逼。什么时候自己也能写出这样的框架,你就牛逼了。
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from downloadlie.items import DownloadlieItem
import re
class LiepSpider(CrawlSpider):
    """Crawl job postings on Liepin (liepin.com).

    Starts from a search-result page for the keyword '爬虫' (URL-encoded in
    ``start_urls``), follows pagination and job-detail links, and extracts
    one ``DownloadlieItem`` per job-detail page via :meth:`parse_item`.
    """

    name = 'liep'
    allowed_domains = ['liepin.com']
    # key=%E7%88%AC%E8%99%AB is the URL-encoded search keyword '爬虫'.
    start_urls = ['https://www.liepin.com/zhaopin/?key=%E7%88%AC%E8%99%AB']

    # BUG FIX: LinkExtractor(allow=...) matches *URLs*, never raw HTML, so the
    # original pattern r'</span><a href="(.*?)">下一页' could not match anything
    # and pagination was silently skipped. Select the "下一页" (next page)
    # anchor with restrict_xpaths instead. Also escape the literal dots in the
    # job-detail URL pattern.
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//a[contains(., "下一页")]'),
             follow=True),
        Rule(LinkExtractor(allow=r'https://www\.liepin\.com/job/\d+\.shtml'),
             callback='parse_item', follow=True),
    )

    @staticmethod
    def _strip_crlf(text):
        """Remove CR/LF and trim surrounding whitespace; pass None through.

        The original code called ``re.sub`` directly on ``extract_first()``
        results, which raises TypeError when the XPath matched nothing
        (``extract_first()`` returns None in that case).
        """
        if text is None:
            return None
        return re.sub(r'\r\n', '', text).strip()

    def parse_item(self, response):
        """Extract job and company fields from one job-detail page.

        :param response: response for a ``liepin.com/job/<id>.shtml`` page
        :return: a populated ``DownloadlieItem``
        """
        item = DownloadlieItem()
        item['profession'] = response.xpath(
            "//div[@class='title-info']/h1/@title").extract_first()
        item['profession_pay'] = self._strip_crlf(response.xpath(
            "//p[@class='job-item-title']/text()").extract_first())
        item['profession_region'] = response.xpath(
            "//p[@class='basic-infor']/span/a/text()").extract_first()
        item['profession_require'] = response.xpath(
            "//div[@class='job-qualifications']/span/text()").extract()
        item['profession_welfare'] = response.xpath(
            "//ul[@class='comp-tag-list clearfix']/li/span/text()").extract()
        # Job description is split across several text nodes; clean each part.
        item['profession_describe'] = [
            re.sub(r'\r\n\t', '', part).strip()
            for part in response.xpath(
                "//div[@class='job-item main-message job-description']"
                "/div/text()").extract()
        ]
        item['company_name'] = response.xpath(
            "//div[@class='company-logo']/p/a/text()").extract_first()
        item['company_industry'] = self._strip_crlf(response.xpath(
            "//ul[@class='new-compintro']/li[1]/text()").extract_first())
        item['company_scale'] = response.xpath(
            "//ul[@class='new-compintro']/li[2]/text()").extract_first()
        item['company_adress'] = response.xpath(
            "//ul[@class='new-compintro']/li[3]/text()").extract_first()
        # BUG FIX: the original never returned the item, so item pipelines
        # received nothing from this spider.
        return item