scrapy持久化存储输出结果间隔一行None
import scrapy
from tutorial.items import QuoteItem
class QuotesSpider(scrapy.Spider):
name = 'quotes'
#allowed_domains = ['quoten.toscrape.com']
start_urls = ['http://quotes.toscrape.com/']
def parse(self, response):
tr_list = response.xpath('//div[@class="col-md-8"]//div')
for tr in tr_list:
item = QuoteItem()
item['text'] = tr.xpath('./span[@class="text"]/text()').extract_first()
item['author'] = tr.xpath('./span/small[@class="author"]/text()').extract_first()
item['tags'] = tr.xpath('./div/a/text()').extract()
yield item 这是Item的设置
import scrapy
class QuoteItem(scrapy.Item):
#定义三个字段
text = scrapy.Field()
author = scrapy.Field()
tags = scrapy.Field()
这是pipelines
class TutorialPipeline:
def process_item(self, item, spider):
print(item)
return item
输出结果
{'author': 'Albert Einstein',
'tags': ['change', 'deep-thoughts', 'thinking', 'world'],
'text': '“The world as we have created it is a process of our thinking. It '
'cannot be changed without changing our thinking.”'}
{'author': None, 'tags': [], 'text': None}
{'author': 'J.K. Rowling',
'tags': ['abilities', 'choices'],
'text': '“It is our choices, Harry, that show what we truly are, far more '
'than our abilities.”'}
{'author': None, 'tags': [], 'text': None}
{'author': 'Albert Einstein',
'tags': ['inspirational', 'life', 'live', 'miracle', 'miracles'],
'text': '“There are only two ways to live your life. One is as though nothing '
'is a miracle. The other is as though everything is a miracle.”'}
{'author': None, 'tags': [], 'text': None}
{'author': 'Jane Austen',
'tags': ['aliteracy', 'books', 'classic', 'humor'],
'text': '“The person, be it gentleman or lady, who has not pleasure in a good '
'novel, must be intolerably stupid.”'}
{'author': None, 'tags': [], 'text': None}
{'author': 'Marilyn Monroe',
'tags': ['be-yourself', 'inspirational'],
'text': "“Imperfection is beauty, madness is genius and it's better to be "
'absolutely ridiculous than absolutely boring.”'}
{'author': None, 'tags': [], 'text': None}
{'author': 'Albert Einstein',
'tags': ['adulthood', 'success', 'value'],
'text': '“Try not to become a man of success. Rather become a man of value.”'}
{'author': None, 'tags': [], 'text': None}
{'author': 'André Gide',
'tags': ['life', 'love'],
'text': '“It is better to be hated for what you are than to be loved for what '
'you are not.”'}
{'author': None, 'tags': [], 'text': None}
{'author': 'Thomas A. Edison',
'tags': ['edison', 'failure', 'inspirational', 'paraphrased'],
'text': "“I have not failed. I've just found 10,000 ways that won't work.”"}
{'author': None, 'tags': [], 'text': None}
{'author': 'Eleanor Roosevelt',
'tags': ['misattributed-eleanor-roosevelt'],
'text': '“A woman is like a tea bag; you never know how strong it is until '
"it's in hot water.”"}
{'author': None, 'tags': [], 'text': None}
{'author': 'Steve Martin',
'tags': ['humor', 'obvious', 'simile'],
'text': '“A day without sunshine is like, you know, night.”'}
{'author': None, 'tags': [], 'text': None}
wcq15759797758 发表于 2021-6-5 13:50
输出结果
{'author': 'Albert Einstein',
'tags': ['change', 'deep-thoughts', 'thinking', 'world'],
...
数据是爬取到了但是 爬取到数据之后下一行的数据全是空
页:
[1]