预测数据时报错是什么原因,Python交流,编程语言专区,鱼C论坛

Terence888 发表于 2024-11-10 00:06:33

预测数据时报错是什么原因

报错如下---------------------------------------------------------------------------
ValueError                            Traceback (most recent call last)
File /opt/conda/lib/python3.10/site-packages/pandas/core/indexes/range.py:413, in RangeIndex.get_loc(self, key)
412 try:
--> 413 return self._range.index(new_key)
414 except ValueError as err:

ValueError: 0 is not in range

The above exception was the direct cause of the following exception:

KeyError                               Traceback (most recent call last)
Cell In, line 1
----> 1 pre_valid = inference(model,valid_eval_dataloader)#测试集预测值

Cell In, line 6, in inference(model, test_dataloader)
   3 model.eval()
   5 with torch.no_grad():
----> 6 for batch in tqdm(test_dataloader):
   7       input_ids = batch['input_ids'].to(device)
   8       attention_mask = batch['attention_mask'].to(device)

File /opt/conda/lib/python3.10/site-packages/tqdm/notebook.py:250, in tqdm_notebook.__iter__(self)
248 try:
249 it = super().__iter__()
--> 250 for obj in it:
251       # return super(tqdm...) will not catch exception
252       yield obj
253 # NB: except ... [ as ...] breaks IPython async KeyboardInterrupt

File /opt/conda/lib/python3.10/site-packages/tqdm/std.py:1181, in tqdm.__iter__(self)
1178 time = self._time
1180 try:
-> 1181 for obj in iterable:
1182       yield obj
1183       # Update and possibly print the progressbar.
1184       # Note: does not call self.update(1) for speed optimisation.

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:630, in _BaseDataLoaderIter.__next__(self)
627 if self._sampler_iter is None:
628 # TODO(https://github.com/pytorch/pytorch/issues/76750)
629 self._reset()# type: ignore
--> 630 data = self._next_data()
631 self._num_yielded += 1
632 if self._dataset_kind == _DatasetKind.Iterable and \
633       self._IterableDataset_len_called is not None and \
634       self._num_yielded > self._IterableDataset_len_called:

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:673, in _SingleProcessDataLoaderIter._next_data(self)
671 def _next_data(self):
672 index = self._next_index()# may raise StopIteration
--> 673 data = self._dataset_fetcher.fetch(index)# may raise StopIteration
674 if self._pin_memory:
675       data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:52, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
50       data = self.dataset.__getitems__(possibly_batched_index)
51 else:
---> 52       data = for idx in possibly_batched_index]
53 else:
54 data = self.dataset

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:52, in <listcomp>(.0)
50       data = self.dataset.__getitems__(possibly_batched_index)
51 else:
---> 52       data = for idx in possibly_batched_index]
53 else:
54 data = self.dataset

Cell In, line 15, in LLMDataset.__getitem__(self, idx)
14 def __getitem__(self,idx):
---> 15 text = self.df.loc # extracting text from each row
17 encoded_dict = self.tokenizer.encode_plus(
18       text,
19       add_special_tokens=True,#自动在每个文本前后添加特殊标记(如CLS和SEP)
(...)
24       return_attention_mask=True, # We should put it into the model，计算注意力（attention）时忽略那些paddle值
25 )
27 if self.is_grad:#训练集

File /opt/conda/lib/python3.10/site-packages/pandas/core/indexing.py:1183, in _LocationIndexer.__getitem__(self, key)
1181 key = tuple(com.apply_if_callable(x, self.obj) for x in key)
1182 if self._is_scalar_access(key):
-> 1183       return self.obj._get_value(*key, takeable=self._takeable)
1184 return self._getitem_tuple(key)
1185 else:
1186 # we by definition only have the 0th axis

File /opt/conda/lib/python3.10/site-packages/pandas/core/frame.py:4221, in DataFrame._get_value(self, index, col, takeable)
4215 engine = self.index._engine
4217 if not isinstance(self.index, MultiIndex):
4218 # CategoricalIndex: Trying to use the engine fastpath may give incorrect
4219 #results if our categories are integers that dont match our codes
4220 # IntervalIndex: IntervalTree has no get_loc
-> 4221 row = self.index.get_loc(index)
4222 return series._values
4224 # For MultiIndex going through engine effectively restricts us to
4225 #same-length tuples; see test_get_set_value_no_partial_indexing

File /opt/conda/lib/python3.10/site-packages/pandas/core/indexes/range.py:415, in RangeIndex.get_loc(self, key)
413       return self._range.index(new_key)
414 except ValueError as err:
--> 415       raise KeyError(key) from err
416 if isinstance(key, Hashable):
417 raise KeyError(key)

KeyError: 0

相关代码如下

from torch.utils.data import Dataset
import torch

#定义数据集
class LLMDataset(Dataset):
def __init__(self,df,is_grad,tokenizer):
   self.df = df # Pandas.DataFrame
   self.is_grad = is_grad # True: train,valid / False: test
   self.tokenizer = tokenizer

def __len__(self):
   return len(self.df) # number of samples

def __getitem__(self,idx):
   text = self.df.loc # extracting text from each row

   encoded_dict = self.tokenizer.encode_plus(
         text,
         add_special_tokens=True,#自动在每个文本前后添加特殊标记(如CLS和SEP)
         padding='max_length',#补0
         truncation=True,#句子长度大于max_length时截断
         max_length=512, # given to the max_length of tokenized text
         return_tensors='pt', # PyTorch
         return_attention_mask=True, # We should put it into the model，计算注意力（attention）时忽略那些paddle值
   )

   if self.is_grad:#训练集
         labels = self.df.loc['label']
         # -> #使用squeeze降维
         return {'input_ids':encoded_dict['input_ids'].squeeze(),
               'attention_mask':encoded_dict['attention_mask'].squeeze(),
               'token_type_ids':encoded_dict['token_type_ids'].squeeze(),
               # Our loss_fn wants it to be a "float" type
               'labels':torch.tensor(labels,dtype=torch.float).unsqueeze(dim=0)}
   else:#测试集
         # ->
         return {'input_ids':encoded_dict['input_ids'].squeeze(),
               'attention_mask':encoded_dict['attention_mask'].squeeze(),
               'token_type_ids':encoded_dict['token_type_ids'].squeeze()}model = LLMModel(configs['model_name']).to(device)
def inference(model,test_dataloader):
all_preds = []
model.eval()

with torch.no_grad():
   for batch in tqdm(test_dataloader):
         input_ids = batch['input_ids'].to(device)
         attention_mask = batch['attention_mask'].to(device)

         logits = model(input_ids,attention_mask)
         logits = logits.detach().cpu().numpy()
         all_preds.append(logits)

return all_preds
valid_dataset = LLMDataset(train_df.drop('label',axis=1),False,tokenizer)
valid_eval_dataloader = DataLoader(valid_dataset,batch_size=8,shuffle=False,pin_memory=True)
pre_valid = inference(model,valid_eval_dataloader)#测试集预测值

FishC 发表于 2024-11-10 00:06:44

在分析错误信息和代码后，我发现您的问题可能在于：

1. **`.loc` 用法错误**：从错误信息来看，出现了 `KeyError: 0`。在 `__getitem__` 方法中，您尝试直接使用 `self.df.loc` 来提取文本，但 `loc` 通常需要指定键或索引。正确用法应该是 `self.df.loc`，根据数据框的索引来获取行数据。

2. **缺少列名称**：稍微修改一下获取列数据的方法。如果您的数据框包含某个列名（比如 "text"），则需要指定该列名，比如 `self.df.loc` 来提取文本。

3. **Tensor 类型问题**：`encoded_dict` 的键值在 `encode_plus` 返回的是字典，您要确保对齐数据类型，比如 `encoded_dict['input_ids'].squeeze()`。

以下是更新后的 `__getitem__` 方法：

def __getitem__(self, idx):
text = self.df.loc# 假设 'text' 是列名

encoded_dict = self.tokenizer.encode_plus(
   text,
   add_special_tokens=True,
   padding='max_length',
   truncation=True,
   max_length=512,
   return_tensors='pt',
   return_attention_mask=True
)

# 查找相对列名是重要的, 下方假设列名为 'label'
if self.is_grad:
   labels = self.df.loc
   return {
         'input_ids': encoded_dict['input_ids'].squeeze(),
         'attention_mask': encoded_dict['attention_mask'].squeeze(),
         'token_type_ids': encoded_dict['token_type_ids'].squeeze(),
         'labels': torch.tensor(labels, dtype=torch.float).unsqueeze(dim=0)
   }
else:
   return {
         'input_ids': encoded_dict['input_ids'].squeeze(),
         'attention_mask': encoded_dict['attention_mask'].squeeze(),
         'token_type_ids': encoded_dict['token_type_ids'].squeeze()
   }

通过这样的修改，基本上大多数涉及错误 `KeyError` 的原因就能解决。根据具体的列名稍微调整代码，并处理可能的Pandas DataFrame的索引问题。

以上回复来自 FishC AI 小助理，如未能正确解答您的问题，请继续追问。

页: [1]

鱼C论坛's Archiver

预测数据时报错是什么原因