|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
报错如下---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
File /opt/conda/lib/python3.10/site-packages/pandas/core/indexes/range.py:413, in RangeIndex.get_loc(self, key)
412 try:
--> 413 return self._range.index(new_key)
414 except ValueError as err:
ValueError: 0 is not in range
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
Cell In[46], line 1
----> 1 pre_valid = inference(model,valid_eval_dataloader)#测试集预测值
Cell In[42], line 6, in inference(model, test_dataloader)
3 model.eval()
5 with torch.no_grad():
----> 6 for batch in tqdm(test_dataloader):
7 input_ids = batch['input_ids'].to(device)
8 attention_mask = batch['attention_mask'].to(device)
File /opt/conda/lib/python3.10/site-packages/tqdm/notebook.py:250, in tqdm_notebook.__iter__(self)
248 try:
249 it = super().__iter__()
--> 250 for obj in it:
251 # return super(tqdm...) will not catch exception
252 yield obj
253 # NB: except ... [ as ...] breaks IPython async KeyboardInterrupt
File /opt/conda/lib/python3.10/site-packages/tqdm/std.py:1181, in tqdm.__iter__(self)
1178 time = self._time
1180 try:
-> 1181 for obj in iterable:
1182 yield obj
1183 # Update and possibly print the progressbar.
1184 # Note: does not call self.update(1) for speed optimisation.
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:630, in _BaseDataLoaderIter.__next__(self)
627 if self._sampler_iter is None:
628 # TODO(https://github.com/pytorch/pytorch/issues/76750)
629 self._reset() # type: ignore[call-arg]
--> 630 data = self._next_data()
631 self._num_yielded += 1
632 if self._dataset_kind == _DatasetKind.Iterable and \
633 self._IterableDataset_len_called is not None and \
634 self._num_yielded > self._IterableDataset_len_called:
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:673, in _SingleProcessDataLoaderIter._next_data(self)
671 def _next_data(self):
672 index = self._next_index() # may raise StopIteration
--> 673 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
674 if self._pin_memory:
675 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:52, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
50 data = self.dataset.__getitems__(possibly_batched_index)
51 else:
---> 52 data = [self.dataset[idx] for idx in possibly_batched_index]
53 else:
54 data = self.dataset[possibly_batched_index]
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:52, in <listcomp>(.0)
50 data = self.dataset.__getitems__(possibly_batched_index)
51 else:
---> 52 data = [self.dataset[idx] for idx in possibly_batched_index]
53 else:
54 data = self.dataset[possibly_batched_index]
Cell In[26], line 15, in LLMDataset.__getitem__(self, idx)
14 def __getitem__(self,idx):
---> 15 text = self.df.loc[idx,'cleaned'] # extracting text from each row
17 encoded_dict = self.tokenizer.encode_plus(
18 text,
19 add_special_tokens=True,#自动在每个文本前后添加特殊标记(如CLS和SEP)
(...)
24 return_attention_mask=True, # We should put it into the model,计算注意力(attention)时忽略那些paddle值
25 )
27 if self.is_grad:#训练集
File /opt/conda/lib/python3.10/site-packages/pandas/core/indexing.py:1183, in _LocationIndexer.__getitem__(self, key)
1181 key = tuple(com.apply_if_callable(x, self.obj) for x in key)
1182 if self._is_scalar_access(key):
-> 1183 return self.obj._get_value(*key, takeable=self._takeable)
1184 return self._getitem_tuple(key)
1185 else:
1186 # we by definition only have the 0th axis
File /opt/conda/lib/python3.10/site-packages/pandas/core/frame.py:4221, in DataFrame._get_value(self, index, col, takeable)
4215 engine = self.index._engine
4217 if not isinstance(self.index, MultiIndex):
4218 # CategoricalIndex: Trying to use the engine fastpath may give incorrect
4219 # results if our categories are integers that dont match our codes
4220 # IntervalIndex: IntervalTree has no get_loc
-> 4221 row = self.index.get_loc(index)
4222 return series._values[row]
4224 # For MultiIndex going through engine effectively restricts us to
4225 # same-length tuples; see test_get_set_value_no_partial_indexing
File /opt/conda/lib/python3.10/site-packages/pandas/core/indexes/range.py:415, in RangeIndex.get_loc(self, key)
413 return self._range.index(new_key)
414 except ValueError as err:
--> 415 raise KeyError(key) from err
416 if isinstance(key, Hashable):
417 raise KeyError(key)
KeyError: 0
相关代码如下from torch.utils.data import Dataset
import torch
#定义数据集
class LLMDataset(Dataset):
def __init__(self,df,is_grad,tokenizer):
self.df = df # Pandas.DataFrame
self.is_grad = is_grad # True: train,valid / False: test
self.tokenizer = tokenizer
def __len__(self):
return len(self.df) # number of samples
def __getitem__(self,idx):
text = self.df.loc[idx,'cleaned'] # extracting text from each row
encoded_dict = self.tokenizer.encode_plus(
text,
add_special_tokens=True,#自动在每个文本前后添加特殊标记(如CLS和SEP)
padding='max_length',#补0
truncation=True,#句子长度大于max_length时截断
max_length=512, # given to the max_length of tokenized text
return_tensors='pt', # PyTorch
return_attention_mask=True, # We should put it into the model,计算注意力(attention)时忽略那些paddle值
)
if self.is_grad:#训练集
labels = self.df.loc[idx]['label']
# [batch,1,max_len(84)] -> [batch,max_len]#使用squeeze降维
return {'input_ids':encoded_dict['input_ids'].squeeze(),
'attention_mask':encoded_dict['attention_mask'].squeeze(),
'token_type_ids':encoded_dict['token_type_ids'].squeeze(),
# Our loss_fn wants it to be a "float" type
'labels':torch.tensor(labels,dtype=torch.float).unsqueeze(dim=0)}
else:#测试集
# [batch,1,max_len(84)] -> [batch,max_len]
return {'input_ids':encoded_dict['input_ids'].squeeze(),
'attention_mask':encoded_dict['attention_mask'].squeeze(),
'token_type_ids':encoded_dict['token_type_ids'].squeeze()}model = LLMModel(configs['model_name']).to(device)
def inference(model,test_dataloader):
all_preds = []
model.eval()
with torch.no_grad():
for batch in tqdm(test_dataloader):
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
logits = model(input_ids,attention_mask)
logits = logits.detach().cpu().numpy()
all_preds.append(logits)
return all_preds
valid_dataset = LLMDataset(train_df[train_size:].drop('label',axis=1),False,tokenizer)
valid_eval_dataloader = DataLoader(valid_dataset,batch_size=8,shuffle=False,pin_memory=True)
pre_valid = inference(model,valid_eval_dataloader)#测试集预测值
|
|