|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import scrapy
from C005.items import C005Item
class JklSpider(scrapy.Spider):
    """Crawl qiushibaike.com page by page and yield the recommended-content text."""

    name = 'jkl'
    # allowed_domains = ['www.123.com']
    start_urls = ['https://www.qiushibaike.com/']
    页码 = 1  # next page number to request (shared class-level counter)

    def parse(self, response):
        # Collect every recommended-content link text on the current page.
        texts = response.xpath('//a[@class="recmd-content"]/text()').extract()
        # Second-to-last pager span holds the page count; [1:-1] drops the
        # surrounding characters before converting to int.
        total_pages = int(response.xpath(r'//a/span/text()').extract()[-2][1:-1])

        item = C005Item()
        item['文字'] = texts
        yield item

        # Follow the numbered pages until the pager is exhausted.
        if self.页码 <= total_pages:
            next_url = f'https://www.qiushibaike.com/8hr/page/{self.页码}/'
            self.页码 += 1
            yield scrapy.Request(url=next_url, callback=self.parse)
我的结果是对的,所要的内容已经保存到 CSV 文件中了,但是运行结束返回时报了很多错误,不知道如何解决,报错信息如下:
Traceback (most recent call last):
File "d:\program files\python38\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "d:\program files\python38\lib\site-packages\scrapy\utils\defer.py", line 157, in f
return deferred_from_coro(coro_f(*coro_args, **coro_kwargs))
File "D:\code\abc\C005\C005\pipelines.py", line 15, in process_item
数据.to_csv('d:/百科.csv', index=False, header=0, mode='a', encoding='ANSI')
File "d:\program files\python38\lib\site-packages\pandas\core\generic.py", line 3204, in to_csv
formatter.save()
File "d:\program files\python38\lib\site-packages\pandas\io\formats\csvs.py", line 204, in save
self._save()
File "d:\program files\python38\lib\site-packages\pandas\io\formats\csvs.py", line 325, in _save
self._save_chunk(start_i, end_i)
File "d:\program files\python38\lib\site-packages\pandas\io\formats\csvs.py", line 356, in _save_chunk
libwriters.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)
File "pandas\_libs\writers.pyx", line 68, in pandas._libs.writers.write_csv_rows
File "d:\program files\python38\lib\encodings\mbcs.py", line 25, in encode
return mbcs_encode(input, self.errors)[0]
UnicodeEncodeError: 'mbcs' codec can't encode characters in position 0--1: invalid character
本帖最后由 Twilight6 于 2020-8-1 10:30 编辑
你电脑安装了 Excel 了吗? 如果用 Excel 打开 csv 文件的确会乱码,要改 Excel 的编码
我去看了下,这个网站本身就是 utf-8 编码的,所以你用 utf-8 编码保存就没问题;然后右击 csv 文件,打开方式选择记事本,就不会出现乱码情况。
import pandas as pd
class C005Pipeline:
    """Append each scraped item's text list to a local CSV file."""

    def process_item(self, item, spider):
        # Wrap the scraped strings in a single-column DataFrame.
        texts = item['文字']
        frame = pd.DataFrame({'a': texts})
        # Append without index or header; utf-8 avoids the mbcs ('ANSI')
        # UnicodeEncodeError raised when writing non-ANSI characters.
        frame.to_csv('d:\\abc.csv', index=False, header=0, mode='a', encoding='utf-8')
        return item
用上面代码,然后用记事本打开 csv 文件,应该就不会乱码
|
|