RuntimeError Traceback (most recent call last)
Cell In[42], line 4
1 print('Training Start!')
2 print('=' * 100)
----> 4 train(model,
5 device,
6 train_dataloader,
7 valid_dataloader,
8 CFG.epochs,
9 loss_fn,
10 optimizer,
11 metric)
13 del model,train_dataloader, valid_dataloader
14 gc.collect()
Cell In[41], line 29, in train(model, device, train_dataloader, valid_dataloader, epochs, loss_fn, optimizer, metric)
25 train_labels = batch['labels'].squeeze().to(device).long()#label真实值long()转化成一维张量
27 # You can refer to the class "TweetsModel" for understand
28 # what would be logits
---> 29 logits = model(train_input_ids, train_attention_mask,train_token_type_ids).to(device)
30 predictions = torch.argmax(logits, dim=1) # get an index from larger one
31 detached_predictions = predictions.detach().cpu().numpy()
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
Cell In[34], line 48, in LLMModel.forward(self, input_ids, attention_mask, token_type_ids)
47 def forward(self, input_ids, attention_mask, token_type_ids):#attention_mask避免用注意力机制的时候关注到填充符
---> 48 feature = self.feature( input_ids, attention_mask, token_type_ids)
49 output = self.fc(feature)
50 return output
Cell In[34], line 42, in LLMModel.feature(self, input_ids, attention_mask, token_type_ids)
41 def feature(self, input_ids, attention_mask, token_type_ids):
---> 42 outputs = self.model( input_ids, attention_mask, token_type_ids)
43 last_hidden_states = outputs[0] #encoder最后一个隐藏状态的输出传递给decoder做cross attention
44 feature = last_hidden_states[:, 0, :] ## CLS token
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File /opt/conda/lib/python3.10/site-packages/transformers/models/deberta/modeling_deberta.py:956, in DebertaModel.forward(self, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds, output_attentions, output_hidden_states, return_dict)
953 if token_type_ids is None:
954 token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
--> 956 embedding_output = self.embeddings(
957 input_ids=input_ids,
958 token_type_ids=token_type_ids,
959 position_ids=position_ids,
960 mask=attention_mask,
961 inputs_embeds=inputs_embeds,
962 )
964 encoder_outputs = self.encoder(
965 embedding_output,
966 attention_mask,
969 return_dict=return_dict,
970 )
971 encoded_layers = encoder_outputs[1]
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File /opt/conda/lib/python3.10/site-packages/transformers/models/deberta/modeling_deberta.py:770, in DebertaEmbeddings.forward(self, input_ids, token_type_ids, position_ids, mask, inputs_embeds)
767 token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
769 if inputs_embeds is None:
--> 770 inputs_embeds = self.word_embeddings(input_ids)
772 if self.position_embeddings is not None:
773 position_embeddings = self.position_embeddings(position_ids.long())
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/sparse.py:164, in Embedding.forward(self, input)
163 def forward(self, input: Tensor) -> Tensor:
--> 164 return F.embedding(
165 input, self.weight, self.padding_idx, self.max_norm,
166 self.norm_type, self.scale_grad_by_freq, self.sparse)
File /opt/conda/lib/python3.10/site-packages/torch/nn/functional.py:2267, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2261 # Note [embedding_renorm set_grad_enabled]
2262 # XXX: equivalent to
2263 # with torch.no_grad():
2264 # torch.embedding_renorm_
2265 # remove once script supports set_grad_enabled
2266 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2267 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)
报错如上,代码如下import gc,os
from tqdm.auto import tqdm # visualizing tool for progress
# They will be used to pick the best model.pt given to the valid loss
best_model_epoch, valid_loss_values = [],[]
valid_loss_min = [1] # arbitrary loss I set here
def train(model,device,train_dataloader,valid_dataloader,epochs,loss_fn,optimizer,metric):
for epoch in range(epochs):
gc.collect() # memory cleaning垃圾回收机制,减少占用内存
train_loss = 0
train_step = 0
pbar = tqdm(train_dataloader, total=len(train_dataloader))#tqdm参数是一个iterable
for batch in pbar: # you can also write like "for batch in tqdm(train_dataloader"
optimizer.zero_grad() # initialize
train_step += 1
train_input_ids = batch['input_ids'].to(device)#batch是一个字典
train_attention_mask = batch['attention_mask'].to(device)
train_token_type_ids = batch['token_type_ids'].to(device)
train_labels = batch['labels'].squeeze().to(device).long()#label真实值long()转化成一维张量
# You can refer to the class "TweetsModel" for understand
# what would be logits
logits = model(train_input_ids, train_attention_mask,train_token_type_ids).to(device)
predictions = torch.argmax(logits, dim=1) # get an index from larger one
detached_predictions = predictions.detach().cpu().numpy()
loss = loss_fn(logits, train_labels)
train_loss += loss.detach().cpu().numpy().item()
with torch.no_grad():
valid_loss = 0
valid_step = 0
total_valid_score = 0
y_pred = [] # for getting f1_score that is a metric of the competition
y_true = []
pbar = tqdm(valid_dataloader)
for batch,labels in pbar:
valid_step += 1
valid_input_ids = batch['input_ids'].to(device)
valid_attention_mask = batch['attention_mask'].to(device)
valid_token_type_ids = batch['token_type_ids'].to(device)
valid_labels = batch['labels'].squeeze().to(device).long()
logits = model(valid_input_ids, valid_attention_mask).to(device)
predictions = torch.argmax(logits, dim=1)
detached_predictions = predictions.detach().cpu().numpy()
loss = loss_fn(logits, valid_labels)
valid_loss += loss.detach().cpu().numpy().item()
valid_loss /= valid_step
f1 = f1_score(y_true,y_pred)
print(f'Epoch [{epoch+1}/{epochs}] Score: {f1}')
print(f'Epoch [{epoch+1}/{epochs}] Valid_loss: {valid_loss}')
if valid_loss < min(valid_loss_min):
print('model improved!')
print('model not improved')
torch.save(model.state_dict(), f'epoch:{epoch+1}_model.pt')#state_dict 是一个字典对象,包含了模型的所有可学习参数(如权重和偏置)及其当前值
print('save checkpoint!')
select_best_model() # refer to below function
print('Train/Valid Completed!!')
del train_dataloader, valid_dataloader # memory cleaning
def select_best_model():
best_model = best_model_epoch[np.array(valid_loss_values).argmin()]
os.rename(best_model, best_model.split('.pt')[0] + '_best.pt')#重命名文件
from torch.utils.data import DataLoader
train_dataloader = DataLoader(train_dataset,batch_size=32,collate_fn=CollateCls(CFG),shuffle=True,pin_memory=True)#锁页内存(pin_memory)能够保持与GPU进行高速传输,在训练时加快数据的读取
valid_dataloader = DataLoader(valid_dataset,batch_size=32,collate_fn=CollateCls(CFG),shuffle=False,pin_memory=True)