|

楼主 |
发表于 2024-11-3 12:38:44
|
显示全部楼层
- ---------------------------------------------------------------------------
- KeyError Traceback (most recent call last)
- Cell In[118], line 4
- 1 print('Training Start!')
- 2 print('=' * 100)
- ----> 4 train(model,
- 5 device,
- 6 train_dataloader,
- 7 valid_dataloader,
- 8 CFG.epochs,
- 9 loss_fn,
- 10 optimizer,
- 11 metric)
- 13 del model,train_dataloader, valid_dataloader
- 14 gc.collect()
- Cell In[82], line 17, in train(model, device, train_dataloader, valid_dataloader, epochs, loss_fn, optimizer, metric)
- 14 train_step = 0
- 15 pbar = tqdm(train_dataloader)#tqdm参数是一个iterable
- ---> 17 for batch in pbar: # you can also write like "for batch in tqdm(train_dataloader"
- 18 optimizer.zero_grad() # initialize
- 19 train_step += 1
- File /opt/conda/lib/python3.10/site-packages/tqdm/notebook.py:250, in tqdm_notebook.__iter__(self)
- 248 try:
- 249 it = super().__iter__()
- --> 250 for obj in it:
- 251 # return super(tqdm...) will not catch exception
- 252 yield obj
- 253 # NB: except ... [ as ...] breaks IPython async KeyboardInterrupt
- File /opt/conda/lib/python3.10/site-packages/tqdm/std.py:1181, in tqdm.__iter__(self)
- 1178 time = self._time
- 1180 try:
- -> 1181 for obj in iterable:
- 1182 yield obj
- 1183 # Update and possibly print the progressbar.
- 1184 # Note: does not call self.update(1) for speed optimisation.
- File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:630, in _BaseDataLoaderIter.__next__(self)
- 627 if self._sampler_iter is None:
- 628 # TODO(https://github.com/pytorch/pytorch/issues/76750)
- 629 self._reset() # type: ignore[call-arg]
- --> 630 data = self._next_data()
- 631 self._num_yielded += 1
- 632 if self._dataset_kind == _DatasetKind.Iterable and \
- 633 self._IterableDataset_len_called is not None and \
- 634 self._num_yielded > self._IterableDataset_len_called:
- File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:1344, in _MultiProcessingDataLoaderIter._next_data(self)
- 1342 else:
- 1343 del self._task_info[idx]
- -> 1344 return self._process_data(data)
- File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:1370, in _MultiProcessingDataLoaderIter._process_data(self, data)
- 1368 self._try_put_index()
- 1369 if isinstance(data, ExceptionWrapper):
- -> 1370 data.reraise()
- 1371 return data
- File /opt/conda/lib/python3.10/site-packages/torch/_utils.py:706, in ExceptionWrapper.reraise(self)
- 702 except TypeError:
- 703 # If the exception takes multiple arguments, don't try to
- 704 # instantiate since we don't know how to
- 705 raise RuntimeError(msg) from None
- --> 706 raise exception
- KeyError: Caught KeyError in DataLoader worker process 0.
- Original Traceback (most recent call last):
- File "/opt/conda/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
- return self._engine.get_loc(casted_key)
- File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
- File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
- File "pandas/_libs/hashtable_class_helper.pxi", line 2606, in pandas._libs.hashtable.Int64HashTable.get_item
- File "pandas/_libs/hashtable_class_helper.pxi", line 2630, in pandas._libs.hashtable.Int64HashTable.get_item
- KeyError: 2183
- The above exception was the direct cause of the following exception:
- Traceback (most recent call last):
- File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 309, in _worker_loop
- data = fetcher.fetch(index) # type: ignore[possibly-undefined]
- File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
- data = [self.dataset[idx] for idx in possibly_batched_index]
- File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
- data = [self.dataset[idx] for idx in possibly_batched_index]
- File "/tmp/ipykernel_31/2920046666.py", line 34, in __getitem__
- labels = self.df.loc[idx]['label']
- File "/opt/conda/lib/python3.10/site-packages/pandas/core/indexing.py", line 1191, in __getitem__
- return self._getitem_axis(maybe_callable, axis=axis)
- File "/opt/conda/lib/python3.10/site-packages/pandas/core/indexing.py", line 1431, in _getitem_axis
- return self._get_label(key, axis=axis)
- File "/opt/conda/lib/python3.10/site-packages/pandas/core/indexing.py", line 1381, in _get_label
- return self.obj.xs(label, axis=axis)
- File "/opt/conda/lib/python3.10/site-packages/pandas/core/generic.py", line 4301, in xs
- loc = index.get_loc(key)
- File "/opt/conda/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3812, in get_loc
- raise KeyError(key) from err
- KeyError: 2183
复制代码
以上为报错信息,相关代码如下
- def prepare_input(cfg, text):
- inputs = cfg.tokenizer.encode_plus(
- text,
- return_tensors=None,
- add_special_tokens=True,#自动在每个文本前后添加特殊标记(如CLS和SEP)
- return_attention_mask=True # 计算注意力(attention)时忽略那些paddle值
- )
-
- if len(inputs['input_ids']) > CFG.max_len:#句子长度大于max_length时截断
- inputs['input_ids'] = inputs['input_ids'][:CFG.max_len]
- inputs['attention_mask'] = inputs['attention_mask'][:CFG.max_len]
- inputs['token_type_ids'] = inputs['token_type_ids'][:CFG.max_len]
- for k, v in inputs.items():
- inputs[k] = torch.tensor(v, dtype=torch.long)
- return inputs
- class LLMDataset(Dataset):
- def __init__(self, cfg, df,is_grad):
- self.cfg = cfg
- self.df = df
- self.texts = df["cleaned"].values
- self.is_grad = is_grad
- def __len__(self):
- return len(self.texts)
- def __getitem__(self,idx):
- inputs = prepare_input(self.cfg, self.texts[idx])#inputs是一个字典
-
-
- if self.is_grad:#训练集
- labels = self.df.loc[idx]['label']
- # [batch,1,max_len(84)] -> [batch,max_len]#使用squeeze降维
- return {'input_ids':inputs['input_ids'].squeeze(),
- 'attention_mask':inputs['attention_mask'].squeeze(),
- 'token_type_ids':inputs['token_type_ids'].squeeze(),
- # Our loss_fn wants it to be a "float" type
- 'labels':torch.tensor(labels,dtype=torch.float).unsqueeze(dim=0)}
- else:#测试集
- # [batch,1,max_len(84)] -> [batch,max_len]
- return {'input_ids':inputs['input_ids'].squeeze(),
- 'attention_mask':inputs['attention_mask'].squeeze(),
- 'token_type_ids':inputs['token_type_ids'].squeeze()}
- class CollateCls:
- def __init__(self, cfg):
- self.tokenizer = cfg.tokenizer
- self.cfg = cfg
-
- def __call__(self, batch):
- output = dict()
- output["input_ids"] = [sample["input_ids"] for sample in batch]
- output["attention_mask"] = [sample["attention_mask"] for sample in batch]
- output["token_type_ids"] = [sample["token_type_ids"] for sample in batch]
- # calculate max token length of this batch
- batch_max = max([len(ids) for ids in output["input_ids"]])
- # add padding
- if self.tokenizer.padding_side == "right":
- output["input_ids"] = [
- list(s) + (batch_max - len(s)) * [self.tokenizer.pad_token_id]
- for s in output["input_ids"]
- ]
- output["attention_mask"] = [
- list(s) + (batch_max - len(s)) * [0] for s in output["attention_mask"]
- ]
- output["token_type_ids"] = [list(s) + (batch_max - len(s)) * [0] for s in output["token_type_ids"]]
- else:
- output["input_ids"] = [
- (batch_max - len(s)) * [self.tokenizer.pad_token_id] + list(s)
- for s in output["input_ids"]
- ]
- output["attention_mask"] = [
- (batch_max - len(s)) * [0] + list(s) for s in output["attention_mask"]
- ]
- output["token_type_ids"] = [(batch_max - len(s)) * [0] + list(s) for s in output["token_type_ids"]]
-
-
- # convert to tensors
- output["input_ids"] = torch.tensor(output["input_ids"], dtype=torch.long)
- output["attention_mask"] = torch.tensor(output["attention_mask"], dtype=torch.long)
- output["token_type_ids"] = torch.tensor(output["token_type_ids"], dtype=torch.long)
-
- return output
复制代码
- #生成训练集
- train_dataset = LLMDataset(CFG,pd.concat([X_train,y_train],axis=1),True)
- valid_dataset = LLMDataset(CFG,pd.concat([X_valid,y_valid],axis=1),False)
复制代码
- from torch.utils.data import DataLoader
- train_dataloader = DataLoader(train_dataset,batch_size=32,collate_fn=CollateCls(CFG),shuffle=True,num_workers = 4,pin_memory=True)#锁页内存(pin_memory)能够保持与GPU进行高速传输,在训练时加快数据的读取
- valid_dataloader = DataLoader(valid_dataset,batch_size=32,collate_fn=CollateCls(CFG),shuffle=False,num_workers = 4,pin_memory=True)
- #collate_fn用于自定义数据加载和批处理的方式
复制代码
- import gc,os
- from tqdm.auto import tqdm # visualizing tool for progress
- # They will be used to pick the best model.pt given to the valid loss
- best_model_epoch, valid_loss_values = [],[]
- valid_loss_min = [1] # arbitrary loss I set here
- def train(model,device,train_dataloader,valid_dataloader,epochs,loss_fn,optimizer,metric):
- for epoch in range(epochs):
- gc.collect() # memory cleaning垃圾回收机制,减少占用内存
- model.train()
- train_loss = 0
- train_step = 0
- pbar = tqdm(train_dataloader)#tqdm参数是一个iterable
- for batch in pbar: # you can also write like "for batch in tqdm(train_dataloader"
- optimizer.zero_grad() # initialize
- train_step += 1
-
- train_input_ids = batch['input_ids'].to(device)
- train_attention_mask = batch['attention_mask'].to(device)
- train_token_type_ids = batch['token_type_ids'].to(device)
- train_labels = batch['labels'].squeeze().to(device).long()#label真实值long()转化成一维张量
-
- # You can refer to the class "TweetsModel" for understand
- # what would be logits
- logits = model(train_input_ids, train_attention_mask,train_token_type_ids).to(device)
- predictions = torch.argmax(logits, dim=1) # get an index from larger one
- detached_predictions = predictions.detach().cpu().numpy()
-
- loss = loss_fn(logits, train_labels)
- loss.backward()
- optimizer.step()
- model.zero_grad()
- train_loss += loss.detach().cpu().numpy().item()
- pbar.set_postfix({'train_loss':train_loss/train_step})#设置进度条显示信息
- pbar.close()
- with torch.no_grad():
- model.eval()
- valid_loss = 0
- valid_step = 0
- total_valid_score = 0
- y_pred = [] # for getting f1_score that is a metric of the competition
- y_true = []
- pbar = tqdm(valid_dataloader)
- for batch in pbar:
- valid_step += 1
- valid_input_ids = batch['input_ids'].to(device)
- valid_attention_mask = batch['attention_mask'].to(device)
- valid_token_type_ids = batch['token_type_ids'].to(device)
- valid_labels = batch['labels'].squeeze().to(device).long()
- logits = model(valid_input_ids, valid_attention_mask).to(device)
- predictions = torch.argmax(logits, dim=1)
- detached_predictions = predictions.detach().cpu().numpy()
-
- loss = loss_fn(logits, valid_labels)
- valid_loss += loss.detach().cpu().numpy().item()
- y_pred.extend(predictions.cpu().numpy())
- y_true.extend(valid_labels.cpu().numpy())
- valid_loss /= valid_step
- f1 = f1_score(y_true,y_pred)
- print(f'Epoch [{epoch+1}/{epochs}] Score: {f1}')
- print(f'Epoch [{epoch+1}/{epochs}] Valid_loss: {valid_loss}')
- if valid_loss < min(valid_loss_min):
- print('model improved!')
- else:
- print('model not improved')
-
- torch.save(model.state_dict(), f'epoch:{epoch+1}_model.pt')#state_dict 是一个字典对象,包含了模型的所有可学习参数(如权重和偏置)及其当前值
- print('save checkpoint!')
- valid_loss_min.append(valid_loss)
- print(f'valid_loss_min:{min(valid_loss_min)}')
- best_model_epoch.append(f'/kaggle/working/epoch:{epoch+1}_model.pt')
- valid_loss_values.append(valid_loss)
- print('='*100)
- select_best_model() # refer to below function
- print('Train/Valid Completed!!')
- del train_dataloader, valid_dataloader # memory cleaning
- gc.collect()
- def select_best_model():
- best_model = best_model_epoch[np.array(valid_loss_values).argmin()]
- os.rename(best_model, best_model.split('.pt')[0] + '_best.pt')#重命名文件
复制代码
- print('Training Start!')
- print('=' * 100)
- train(model,
- device,
- train_dataloader,
- valid_dataloader,
- CFG.epochs,
- loss_fn,
- optimizer,
- metric)
- del model,train_dataloader, valid_dataloader
- gc.collect()
复制代码 |
|