预测数据时报错是什么原因

Terence888 · 发表于 2024-11-10 00:06:33

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

报错如下

---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
File /opt/conda/lib/python3.10/site-packages/pandas/core/indexes/range.py:413, in RangeIndex.get_loc(self, key)
412 try:
--> 413 return self._range.index(new_key)
414 except ValueError as err:
ValueError: 0 is not in range
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
Cell In[46], line 1
----> 1 pre_valid = inference(model,valid_eval_dataloader)#测试集预测值
Cell In[42], line 6, in inference(model, test_dataloader)
3 model.eval()
5 with torch.no_grad():
----> 6 for batch in tqdm(test_dataloader):
7 input_ids = batch['input_ids'].to(device)
8 attention_mask = batch['attention_mask'].to(device)
File /opt/conda/lib/python3.10/site-packages/tqdm/notebook.py:250, in tqdm_notebook.__iter__(self)
248 try:
249 it = super().__iter__()
--> 250 for obj in it:
251 # return super(tqdm...) will not catch exception
252 yield obj
253 # NB: except ... [ as ...] breaks IPython async KeyboardInterrupt
File /opt/conda/lib/python3.10/site-packages/tqdm/std.py:1181, in tqdm.__iter__(self)
1178 time = self._time
1180 try:
-> 1181 for obj in iterable:
1182 yield obj
1183 # Update and possibly print the progressbar.
1184 # Note: does not call self.update(1) for speed optimisation.
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:630, in _BaseDataLoaderIter.__next__(self)
627 if self._sampler_iter is None:
628 # TODO(https://github.com/pytorch/pytorch/issues/76750)
629 self._reset() # type: ignore[call-arg]
--> 630 data = self._next_data()
631 self._num_yielded += 1
632 if self._dataset_kind == _DatasetKind.Iterable and \
633 self._IterableDataset_len_called is not None and \
634 self._num_yielded > self._IterableDataset_len_called:
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:673, in _SingleProcessDataLoaderIter._next_data(self)
671 def _next_data(self):
672 index = self._next_index() # may raise StopIteration
--> 673 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
674 if self._pin_memory:
675 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:52, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
50 data = self.dataset.__getitems__(possibly_batched_index)
51 else:
---> 52 data = [self.dataset[idx] for idx in possibly_batched_index]
53 else:
54 data = self.dataset[possibly_batched_index]
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:52, in <listcomp>(.0)
50 data = self.dataset.__getitems__(possibly_batched_index)
51 else:
---> 52 data = [self.dataset[idx] for idx in possibly_batched_index]
53 else:
54 data = self.dataset[possibly_batched_index]
Cell In[26], line 15, in LLMDataset.__getitem__(self, idx)
14 def __getitem__(self,idx):
---> 15 text = self.df.loc[idx,'cleaned'] # extracting text from each row
17 encoded_dict = self.tokenizer.encode_plus(
18 text,
19 add_special_tokens=True,#自动在每个文本前后添加特殊标记(如CLS和SEP)
(...)
24 return_attention_mask=True, # We should put it into the model，计算注意力（attention）时忽略那些paddle值
25 )
27 if self.is_grad:#训练集
File /opt/conda/lib/python3.10/site-packages/pandas/core/indexing.py:1183, in _LocationIndexer.__getitem__(self, key)
1181 key = tuple(com.apply_if_callable(x, self.obj) for x in key)
1182 if self._is_scalar_access(key):
-> 1183 return self.obj._get_value(*key, takeable=self._takeable)
1184 return self._getitem_tuple(key)
1185 else:
1186 # we by definition only have the 0th axis
File /opt/conda/lib/python3.10/site-packages/pandas/core/frame.py:4221, in DataFrame._get_value(self, index, col, takeable)
4215 engine = self.index._engine
4217 if not isinstance(self.index, MultiIndex):
4218 # CategoricalIndex: Trying to use the engine fastpath may give incorrect
4219 # results if our categories are integers that dont match our codes
4220 # IntervalIndex: IntervalTree has no get_loc
-> 4221 row = self.index.get_loc(index)
4222 return series._values[row]
4224 # For MultiIndex going through engine effectively restricts us to
4225 # same-length tuples; see test_get_set_value_no_partial_indexing
File /opt/conda/lib/python3.10/site-packages/pandas/core/indexes/range.py:415, in RangeIndex.get_loc(self, key)
413 return self._range.index(new_key)
414 except ValueError as err:
--> 415 raise KeyError(key) from err
416 if isinstance(key, Hashable):
417 raise KeyError(key)
KeyError: 0

复制代码

相关代码如下

from torch.utils.data import Dataset
import torch
#定义数据集
# Dataset that tokenizes the 'cleaned' text column of a DataFrame and, when is_grad is true,
# also returns the row's 'label' as a float tensor.
# NOTE(review): indentation was lost when this snippet was pasted; the statements are otherwise as posted.
class LLMDataset(Dataset):
# Stores the DataFrame, the labelled/unlabelled flag and the tokenizer; no copy or re-indexing is done here.
def __init__(self,df,is_grad,tokenizer):
self.df = df # Pandas.DataFrame
self.is_grad = is_grad # True: train,valid / False: test
self.tokenizer = tokenizer
def __len__(self):
return len(self.df) # number of samples
# Returns a dict of tensors for the sample requested by the DataLoader.
def __getitem__(self,idx):
# NOTE(review): .loc looks rows up by index *label*. The traceback in this post shows this line
# raising "KeyError: 0" from RangeIndex.get_loc, i.e. the DataFrame passed in has no row labelled 0.
text = self.df.loc[idx,'cleaned'] # extracting text from each row
encoded_dict = self.tokenizer.encode_plus(
text,
add_special_tokens=True,# automatically add special tokens (e.g. CLS and SEP) around each text
padding='max_length',# pad up to max_length
truncation=True,# truncate when the sentence is longer than max_length
max_length=512, # given to the max_length of tokenized text
return_tensors='pt', # PyTorch
return_attention_mask=True, # We should put it into the model; padded positions are ignored when attention is computed
)
if self.is_grad:# training set
labels = self.df.loc[idx]['label']
# [batch,1,max_len(84)] -> [batch,max_len]; squeeze() is used to drop the extra dimension
return {'input_ids':encoded_dict['input_ids'].squeeze(),
'attention_mask':encoded_dict['attention_mask'].squeeze(),
'token_type_ids':encoded_dict['token_type_ids'].squeeze(),
# Our loss_fn wants it to be a "float" type
'labels':torch.tensor(labels,dtype=torch.float).unsqueeze(dim=0)}
else:# test set
# [batch,1,max_len(84)] -> [batch,max_len]
return {'input_ids':encoded_dict['input_ids'].squeeze(),
'attention_mask':encoded_dict['attention_mask'].squeeze(),
# NOTE(review): the paste fused the end of the class with the next statement (model = ...) on the line below.
'token_type_ids':encoded_dict['token_type_ids'].squeeze()}model = LLMModel(configs['model_name']).to(device)
def inference(model,test_dataloader):
    """Run ``model`` over every batch of ``test_dataloader`` without gradients.

    Each batch's 'input_ids' and 'attention_mask' are moved to ``device`` and
    passed to the model; the output of every batch is detached, moved to the
    CPU and converted to a NumPy array.

    Returns:
        A list with one NumPy array per batch, in iteration order.
    """
    model.eval()
    batch_outputs = []
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            batch_outputs.append(model(ids, mask).detach().cpu().numpy())
    return batch_outputs
# NOTE(review): train_df[train_size:] is handed to LLMDataset without resetting its index, while
# LLMDataset.__getitem__ looks rows up with .loc[idx, ...] using the positions 0..len-1 supplied by
# the DataLoader. This matches the "KeyError: 0" raised from RangeIndex.get_loc in the traceback above.
valid_dataset = LLMDataset(train_df[train_size:].drop('label',axis=1),False,tokenizer)
valid_eval_dataloader = DataLoader(valid_dataset,batch_size=8,shuffle=False,pin_memory=True)
pre_valid = inference(model,valid_eval_dataloader)# predicted values for the test set

复制代码

FishC · 发表于 2024-11-10 00:06:44

在分析错误信息和代码后，我发现您的问题可能在于：

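1. **`.loc` 按索引标签取值，而 DataLoader 传入的是位置**：报错 `KeyError: 0`（由 `RangeIndex.get_loc` 抛出，提示 `0 is not in range`）说明数据框里不存在标签为 0 的行。验证集是用 `train_df[train_size:]` 切片得到的，切片后索引仍然保留原来的标签（从 `train_size` 开始），而 DataLoader 传给 `__getitem__` 的 `idx` 是从 0 到 `len-1` 的位置，所以 `self.df.loc[0, 'cleaned']` 找不到对应的行。解决办法是按位置取行（`self.df['cleaned'].iloc[idx]`），或者在构造数据集之前重置索引。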

2. **列名本身没有问题**：原代码已经指定了列名 `'cleaned'`，无需改动；请始终使用数据框中实际存在的列名。如果不想修改 `__getitem__`，也可以在创建数据集时重置索引，例如 `train_df[train_size:].drop('label', axis=1).reset_index(drop=True)`。

3. **张量维度**：`encode_plus` 在 `return_tensors='pt'` 时返回的是一个字典，其中每个张量都带有一个长度为 1 的额外维度，因此需要像原代码那样用 `encoded_dict['input_ids'].squeeze()` 去掉它。这一部分与本次报错无关。

以下是更新后的 `__getitem__` 方法：

def __getitem__(self, idx):
    """Return the tokenized sample at integer position ``idx``.

    ``idx`` is supplied by the DataLoader as a position in
    ``range(len(self))``, not as an index label, so rows are selected with
    ``.iloc``. Label-based ``.loc[idx, ...]`` raises ``KeyError: 0`` when the
    DataFrame is a slice such as ``train_df[train_size:]``, because the slice
    keeps its original index labels and has no row labelled 0.

    Returns:
        A dict with 'input_ids', 'attention_mask' and 'token_type_ids', plus
        'labels' (a float tensor with one element) when ``self.is_grad`` is
        true.
    """
    # 'cleaned' is the text column used by the dataset in the question.
    text = self.df['cleaned'].iloc[idx]
    encoded_dict = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt',
        return_attention_mask=True,
    )
    # squeeze() drops the extra size-1 dimension so the DataLoader can stack
    # samples into [batch, max_len] tensors.
    item = {
        'input_ids': encoded_dict['input_ids'].squeeze(),
        'attention_mask': encoded_dict['attention_mask'].squeeze(),
        'token_type_ids': encoded_dict['token_type_ids'].squeeze(),
    }
    if self.is_grad:
        # Labels are looked up by position as well, for the same reason.
        label = self.df['label'].iloc[idx]
        item['labels'] = torch.tensor(label, dtype=torch.float).unsqueeze(dim=0)
    return item

复制代码

这类 `KeyError` 的根本原因是 DataFrame 的索引标签与 DataLoader 传入的位置不一致。只要保证按位置取行（`.iloc`），或者在切片之后用 `reset_index(drop=True)` 重置索引，问题即可解决；列名请以实际数据为准。

以上回复来自 FishC AI 小助理，如未能正确解答您的问题，请继续追问。

账号		自动登录	找回密码
密码			立即注册

预测数据时报错是什么原因

马上注册，结交更多好友，享用更多功能^_^

浏览过的版块