|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
报错如下- ---------------------------------------------------------------------------
- ValueError Traceback (most recent call last)
- File /opt/conda/lib/python3.10/site-packages/pandas/core/indexes/range.py:413, in RangeIndex.get_loc(self, key)
- 412 try:
- --> 413 return self._range.index(new_key)
- 414 except ValueError as err:
- ValueError: 0 is not in range
- The above exception was the direct cause of the following exception:
- KeyError Traceback (most recent call last)
- Cell In[46], line 1
- ----> 1 pre_valid = inference(model,valid_eval_dataloader)#测试集预测值
- Cell In[42], line 6, in inference(model, test_dataloader)
- 3 model.eval()
- 5 with torch.no_grad():
- ----> 6 for batch in tqdm(test_dataloader):
- 7 input_ids = batch['input_ids'].to(device)
- 8 attention_mask = batch['attention_mask'].to(device)
- File /opt/conda/lib/python3.10/site-packages/tqdm/notebook.py:250, in tqdm_notebook.__iter__(self)
- 248 try:
- 249 it = super().__iter__()
- --> 250 for obj in it:
- 251 # return super(tqdm...) will not catch exception
- 252 yield obj
- 253 # NB: except ... [ as ...] breaks IPython async KeyboardInterrupt
- File /opt/conda/lib/python3.10/site-packages/tqdm/std.py:1181, in tqdm.__iter__(self)
- 1178 time = self._time
- 1180 try:
- -> 1181 for obj in iterable:
- 1182 yield obj
- 1183 # Update and possibly print the progressbar.
- 1184 # Note: does not call self.update(1) for speed optimisation.
- File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:630, in _BaseDataLoaderIter.__next__(self)
- 627 if self._sampler_iter is None:
- 628 # TODO(https://github.com/pytorch/pytorch/issues/76750)
- 629 self._reset() # type: ignore[call-arg]
- --> 630 data = self._next_data()
- 631 self._num_yielded += 1
- 632 if self._dataset_kind == _DatasetKind.Iterable and \
- 633 self._IterableDataset_len_called is not None and \
- 634 self._num_yielded > self._IterableDataset_len_called:
- File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:673, in _SingleProcessDataLoaderIter._next_data(self)
- 671 def _next_data(self):
- 672 index = self._next_index() # may raise StopIteration
- --> 673 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
- 674 if self._pin_memory:
- 675 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
- File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:52, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
- 50 data = self.dataset.__getitems__(possibly_batched_index)
- 51 else:
- ---> 52 data = [self.dataset[idx] for idx in possibly_batched_index]
- 53 else:
- 54 data = self.dataset[possibly_batched_index]
- File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:52, in <listcomp>(.0)
- 50 data = self.dataset.__getitems__(possibly_batched_index)
- 51 else:
- ---> 52 data = [self.dataset[idx] for idx in possibly_batched_index]
- 53 else:
- 54 data = self.dataset[possibly_batched_index]
- Cell In[26], line 15, in LLMDataset.__getitem__(self, idx)
- 14 def __getitem__(self,idx):
- ---> 15 text = self.df.loc[idx,'cleaned'] # extracting text from each row
- 17 encoded_dict = self.tokenizer.encode_plus(
- 18 text,
- 19 add_special_tokens=True,#自动在每个文本前后添加特殊标记(如CLS和SEP)
- (...)
- 24 return_attention_mask=True, # We should put it into the model,计算注意力(attention)时忽略那些paddle值
- 25 )
- 27 if self.is_grad:#训练集
- File /opt/conda/lib/python3.10/site-packages/pandas/core/indexing.py:1183, in _LocationIndexer.__getitem__(self, key)
- 1181 key = tuple(com.apply_if_callable(x, self.obj) for x in key)
- 1182 if self._is_scalar_access(key):
- -> 1183 return self.obj._get_value(*key, takeable=self._takeable)
- 1184 return self._getitem_tuple(key)
- 1185 else:
- 1186 # we by definition only have the 0th axis
- File /opt/conda/lib/python3.10/site-packages/pandas/core/frame.py:4221, in DataFrame._get_value(self, index, col, takeable)
- 4215 engine = self.index._engine
- 4217 if not isinstance(self.index, MultiIndex):
- 4218 # CategoricalIndex: Trying to use the engine fastpath may give incorrect
- 4219 # results if our categories are integers that dont match our codes
- 4220 # IntervalIndex: IntervalTree has no get_loc
- -> 4221 row = self.index.get_loc(index)
- 4222 return series._values[row]
- 4224 # For MultiIndex going through engine effectively restricts us to
- 4225 # same-length tuples; see test_get_set_value_no_partial_indexing
- File /opt/conda/lib/python3.10/site-packages/pandas/core/indexes/range.py:415, in RangeIndex.get_loc(self, key)
- 413 return self._range.index(new_key)
- 414 except ValueError as err:
- --> 415 raise KeyError(key) from err
- 416 if isinstance(key, Hashable):
- 417 raise KeyError(key)
- KeyError: 0
复制代码
相关代码如下
- from torch.utils.data import Dataset
- import torch
- #定义数据集
- class LLMDataset(Dataset):
- def __init__(self,df,is_grad,tokenizer):
- self.df = df # Pandas.DataFrame
- self.is_grad = is_grad # True: train,valid / False: test
- self.tokenizer = tokenizer
- def __len__(self):
- return len(self.df) # number of samples
- def __getitem__(self,idx):
- text = self.df.loc[idx,'cleaned'] # extracting text from each row
-
- encoded_dict = self.tokenizer.encode_plus(
- text,
- add_special_tokens=True,#自动在每个文本前后添加特殊标记(如CLS和SEP)
- padding='max_length',#补0
- truncation=True,#句子长度大于max_length时截断
- max_length=512, # given to the max_length of tokenized text
- return_tensors='pt', # PyTorch
- return_attention_mask=True, # We should put it into the model,计算注意力(attention)时忽略那些paddle值
- )
- if self.is_grad:#训练集
- labels = self.df.loc[idx]['label']
- # [batch,1,max_len(84)] -> [batch,max_len]#使用squeeze降维
- return {'input_ids':encoded_dict['input_ids'].squeeze(),
- 'attention_mask':encoded_dict['attention_mask'].squeeze(),
- 'token_type_ids':encoded_dict['token_type_ids'].squeeze(),
- # Our loss_fn wants it to be a "float" type
- 'labels':torch.tensor(labels,dtype=torch.float).unsqueeze(dim=0)}
- else:#测试集
- # [batch,1,max_len(84)] -> [batch,max_len]
- return {'input_ids':encoded_dict['input_ids'].squeeze(),
- 'attention_mask':encoded_dict['attention_mask'].squeeze(),
- 'token_type_ids':encoded_dict['token_type_ids'].squeeze()}model = LLMModel(configs['model_name']).to(device)
- def inference(model,test_dataloader):
- all_preds = []
- model.eval()
- with torch.no_grad():
- for batch in tqdm(test_dataloader):
- input_ids = batch['input_ids'].to(device)
- attention_mask = batch['attention_mask'].to(device)
-
- logits = model(input_ids,attention_mask)
- logits = logits.detach().cpu().numpy()
- all_preds.append(logits)
-
- return all_preds
- valid_dataset = LLMDataset(train_df[train_size:].drop('label',axis=1),False,tokenizer)
- valid_eval_dataloader = DataLoader(valid_dataset,batch_size=8,shuffle=False,pin_memory=True)
- pre_valid = inference(model,valid_eval_dataloader)#测试集预测值
复制代码 |
|