After learning Scrapy, I wrote a Huaban crawler modelled on an online Scrapy tutorial for scraping Jandan, but it never manages to scrape anything. I've been looking for the cause for a long time without finding it. Could someone take a look for me?
huaban_spider.py
import scrapy
from huaban.items import HuabanItem


class HuabanSpider(scrapy.spiders.Spider):
    name = "huaban"
    # the attribute Scrapy looks for is "allowed_domains"
    allowed_domains = ["huaban.com"]
    start_urls = ["http://huaban.com/pins/872365966"]

    def parse(self, response):
        item = HuabanItem()
        # collect the pin image URL(s) from the main image block
        item['image_urls'] = response.xpath('//div[@class="main-image"]//div/a/img/@src').extract()
        print('image_urls', item['image_urls'])
        yield item
        # follow the next pin in the waterfall, if there is one
        new_url = response.xpath('//div[@id="board_pins_waterfall"]/a/@href').extract_first()
        if new_url:
            # pin links are relative, so join them against the current URL
            yield scrapy.Request(response.urljoin(new_url), callback=self.parse)
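For reference, the two XPath expressions can be tried against the live page with Scrapy's interactive shell before running the whole crawl (a quick sketch, using the same start URL as above):

scrapy shell "http://huaban.com/pins/872365966"
# inside the shell prompt:
>>> response.xpath('//div[@class="main-image"]//div/a/img/@src').extract()
>>> response.xpath('//div[@id="board_pins_waterfall"]/a/@href').extract_first()
# If both come back empty here as well, the elements are not present in the HTML
# that Scrapy downloads (for example because the page fills them in with JavaScript).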
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
import urllib.request

from huaban import settings


class HuabanPipeline(object):
    def process_item(self, item, spider):
        # save images under <IMAGES_STORE>/<spider name>/
        dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
        print("dir_path", dir_path)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        for image_url in item['image_urls']:
            # use the last segment of the URL as the file name
            file_name = image_url.split('/')[-1]
            file_path = '%s/%s' % (dir_path, file_name)
            if os.path.exists(file_path):  # skip files that already exist
                continue
            with open(file_path, 'wb') as file_writer:
                conn = urllib.request.urlopen(image_url)
                file_writer.write(conn.read())
        return item
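As a point of comparison, Scrapy ships a built-in images pipeline that downloads everything listed in an item's image_urls field and stores it under IMAGES_STORE, so a hand-written download loop is not strictly necessary (a minimal settings sketch; it requires the Pillow library to be installed):

# settings.py (sketch) -- use the stock pipeline instead of HuabanPipeline
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = 'D:/huaban'   # downloaded images end up under <IMAGES_STORE>/full/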
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class HuabanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_urls = scrapy.Field()
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for huaban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'huaban'

SPIDER_MODULES = ['huaban.spiders']
NEWSPIDER_MODULE = 'huaban.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'huaban (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'huaban.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'huaban.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'huaban.pipelines.HuabanPipeline': 1,
}
IMAGES_STORE = 'D:'
DOWNLOAD_DELAY = 0.25

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
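A small side note on how the pipeline reads IMAGES_STORE: rather than importing the settings module directly (from huaban import settings), a pipeline can receive the merged project settings from the crawler, which is the more usual pattern (a sketch only, independent of the code above):

class HuabanPipeline(object):
    def __init__(self, images_store):
        self.images_store = images_store

    @classmethod
    def from_crawler(cls, crawler):
        # crawler.settings holds the project settings, including IMAGES_STORE
        return cls(images_store=crawler.settings.get('IMAGES_STORE'))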
The crawl output looks like this:
C:\Users\Administrator\huaban>scrapy crawl huaban
2016-09-30 22:24:52 [scrapy] INFO: Scrapy 1.1.3 started (bot: huaban)
2016-09-30 22:24:52 [scrapy] INFO: Overridden settings: {'DOWNLOAD_DELAY': 0.25, 'SPIDER_MODULES': ['huaban.spiders'], 'BOT_NAME': 'huaban', 'ROBOTSTXT_OBEY': True, 'NEWSPIDER_MODULE': 'huaban.spiders'}
2016-09-30 22:24:52 [scrapy] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.logstats.LogStats',
'scrapy.extensions.telnet.TelnetConsole']
2016-09-30 22:24:53 [scrapy] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2016-09-30 22:24:53 [scrapy] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2016-09-30 22:24:53 [scrapy] INFO: Enabled item pipelines:
['huaban.pipelines.HuabanPipeline']
2016-09-30 22:24:53 [scrapy] INFO: Spider opened
2016-09-30 22:24:53 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-09-30 22:24:53 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-09-30 22:24:53 [scrapy] DEBUG: Crawled (200) <GET http://huaban.com/robots.txt> (referer: None)
2016-09-30 22:24:54 [scrapy] DEBUG: Crawled (200) <GET http://huaban.com/pins/872365966> (referer: None)
image_urls []
dir_path D:/huaban
2016-09-30 22:24:54 [scrapy] DEBUG: Scraped from <200 http://huaban.com/pins/872365966>
{'image_urls': []}
2016-09-30 22:24:54 [scrapy] INFO: Closing spider (finished)
2016-09-30 22:24:54 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 440,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 16658,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 9, 30, 14, 24, 54, 540733),
'item_scraped_count': 1,
'log_count/DEBUG': 4,
'log_count/INFO': 7,
'response_received_count': 2,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2016, 9, 30, 14, 24, 53, 674683)}
2016-09-30 22:24:54 [scrapy] INFO: Spider closed (finished)
Both requests came back 200 ('downloader/response_status_count/200': 2), so the pages were fetched successfully twice, yet image_urls is still empty.
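Since both responses are 200 but the XPath matches nothing, it may be worth checking whether the image markup is actually in the HTML Scrapy receives or only appears after the page's JavaScript runs (a throwaway sketch that could be dropped into parse temporarily; the output file name is arbitrary):

def parse(self, response):
    # dump the raw HTML exactly as Scrapy received it, for manual inspection;
    # if <div class="main-image"> is missing from this file, that part of the page
    # is rendered by JavaScript and the XPath cannot see it in the plain download
    with open('pin_response.html', 'wb') as f:
        f.write(response.body)
    # ... rest of parse unchanged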