报错显示存在多个gpu

Terence888 · 发表于 2024-11-3 19:32:35

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[42], line 4
1 print('Training Start!')
2 print('=' * 100)
----> 4 train(model,
5 device,
6 train_dataloader,
7 valid_dataloader,
8 CFG.epochs,
9 loss_fn,
10 optimizer,
11 metric)
13 del model,train_dataloader, valid_dataloader
14 gc.collect()
Cell In[41], line 29, in train(model, device, train_dataloader, valid_dataloader, epochs, loss_fn, optimizer, metric)
25 train_labels = batch['labels'].squeeze().to(device).long()#label真实值long()转化成一维张量
27 # You can refer to the class "TweetsModel" for understand
28 # what would be logits
---> 29 logits = model(train_input_ids, train_attention_mask,train_token_type_ids).to(device)
30 predictions = torch.argmax(logits, dim=1) # get an index from larger one
31 detached_predictions = predictions.detach().cpu().numpy()
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
Cell In[34], line 48, in LLMModel.forward(self, input_ids, attention_mask, token_type_ids)
47 def forward(self, input_ids, attention_mask, token_type_ids):#attention_mask避免用注意力机制的时候关注到填充符
---> 48 feature = self.feature( input_ids, attention_mask, token_type_ids)
49 output = self.fc(feature)
50 return output
Cell In[34], line 42, in LLMModel.feature(self, input_ids, attention_mask, token_type_ids)
41 def feature(self, input_ids, attention_mask, token_type_ids):
---> 42 outputs = self.model( input_ids, attention_mask, token_type_ids)
43 last_hidden_states = outputs[0] #encoder最后一个隐藏状态的输出传递给decoder做cross attention
44 feature = last_hidden_states[:, 0, :] ## CLS token
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File /opt/conda/lib/python3.10/site-packages/transformers/models/deberta/modeling_deberta.py:956, in DebertaModel.forward(self, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds, output_attentions, output_hidden_states, return_dict)
953 if token_type_ids is None:
954 token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
--> 956 embedding_output = self.embeddings(
957 input_ids=input_ids,
958 token_type_ids=token_type_ids,
959 position_ids=position_ids,
960 mask=attention_mask,
961 inputs_embeds=inputs_embeds,
962 )
964 encoder_outputs = self.encoder(
965 embedding_output,
966 attention_mask,
(...)
969 return_dict=return_dict,
970 )
971 encoded_layers = encoder_outputs[1]
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File /opt/conda/lib/python3.10/site-packages/transformers/models/deberta/modeling_deberta.py:770, in DebertaEmbeddings.forward(self, input_ids, token_type_ids, position_ids, mask, inputs_embeds)
767 token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
769 if inputs_embeds is None:
--> 770 inputs_embeds = self.word_embeddings(input_ids)
772 if self.position_embeddings is not None:
773 position_embeddings = self.position_embeddings(position_ids.long())
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/sparse.py:164, in Embedding.forward(self, input)
163 def forward(self, input: Tensor) -> Tensor:
--> 164 return F.embedding(
165 input, self.weight, self.padding_idx, self.max_norm,
166 self.norm_type, self.scale_grad_by_freq, self.sparse)
File /opt/conda/lib/python3.10/site-packages/torch/nn/functional.py:2267, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2261 # Note [embedding_renorm set_grad_enabled]
2262 # XXX: equivalent to
2263 # with torch.no_grad():
2264 # torch.embedding_renorm_
2265 # remove once script supports set_grad_enabled
2266 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2267 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

复制代码

报错如上，代码如下

import gc,os
from tqdm.auto import tqdm # visualizing tool for progress
# They will be used to pick the best model.pt given to the valid loss
best_model_epoch, valid_loss_values = [],[]
valid_loss_min = [1] # arbitrary loss I set here
def train(model,device,train_dataloader,valid_dataloader,epochs,loss_fn,optimizer,metric):
for epoch in range(epochs):
gc.collect() # memory cleaning垃圾回收机制，减少占用内存
model.train()
train_loss = 0
train_step = 0
pbar = tqdm(train_dataloader, total=len(train_dataloader))#tqdm参数是一个iterable
for batch in pbar: # you can also write like "for batch in tqdm(train_dataloader"
optimizer.zero_grad() # initialize
train_step += 1
train_input_ids = batch['input_ids'].to(device)#batch是一个字典
train_attention_mask = batch['attention_mask'].to(device)
train_token_type_ids = batch['token_type_ids'].to(device)
train_labels = batch['labels'].squeeze().to(device).long()#label真实值long()转化成一维张量
# You can refer to the class "TweetsModel" for understand
# what would be logits
logits = model(train_input_ids, train_attention_mask,train_token_type_ids).to(device)
predictions = torch.argmax(logits, dim=1) # get an index from larger one
detached_predictions = predictions.detach().cpu().numpy()
loss = loss_fn(logits, train_labels)
loss.backward()
optimizer.step()
model.zero_grad()
train_loss += loss.detach().cpu().numpy().item()
pbar.set_postfix({'train_loss':train_loss/train_step})#设置进度条显示信息
pbar.close()
with torch.no_grad():
model.eval()
valid_loss = 0
valid_step = 0
total_valid_score = 0
y_pred = [] # for getting f1_score that is a metric of the competition
y_true = []
pbar = tqdm(valid_dataloader)
for batch,labels in pbar:
valid_step += 1
valid_input_ids = batch['input_ids'].to(device)
valid_attention_mask = batch['attention_mask'].to(device)
valid_token_type_ids = batch['token_type_ids'].to(device)
valid_labels = batch['labels'].squeeze().to(device).long()
logits = model(valid_input_ids, valid_attention_mask).to(device)
predictions = torch.argmax(logits, dim=1)
detached_predictions = predictions.detach().cpu().numpy()
loss = loss_fn(logits, valid_labels)
valid_loss += loss.detach().cpu().numpy().item()
y_pred.extend(predictions.cpu().numpy())
y_true.extend(valid_labels.cpu().numpy())
valid_loss /= valid_step
f1 = f1_score(y_true,y_pred)
print(f'Epoch [{epoch+1}/{epochs}] Score: {f1}')
print(f'Epoch [{epoch+1}/{epochs}] Valid_loss: {valid_loss}')
if valid_loss < min(valid_loss_min):
print('model improved!')
else:
print('model not improved')
torch.save(model.state_dict(), f'epoch:{epoch+1}_model.pt')#state_dict 是一个字典对象，包含了模型的所有可学习参数（如权重和偏置）及其当前值
print('save checkpoint!')
valid_loss_min.append(valid_loss)
print(f'valid_loss_min:{min(valid_loss_min)}')
best_model_epoch.append(f'/kaggle/working/epoch:{epoch+1}_model.pt')
valid_loss_values.append(valid_loss)
print('='*100)
select_best_model() # refer to below function
print('Train/Valid Completed!!')
del train_dataloader, valid_dataloader # memory cleaning
gc.collect()
def select_best_model():
best_model = best_model_epoch[np.array(valid_loss_values).argmin()]
os.rename(best_model, best_model.split('.pt')[0] + '_best.pt')#重命名文件

复制代码

from torch.utils.data import DataLoader
train_dataloader = DataLoader(train_dataset,batch_size=32,collate_fn=CollateCls(CFG),shuffle=True,pin_memory=True)#锁页内存(pin_memory)能够保持与GPU进行高速传输,在训练时加快数据的读取
valid_dataloader = DataLoader(valid_dataset,batch_size=32,collate_fn=CollateCls(CFG),shuffle=False,pin_memory=True)
#collate_fn用于自定义数据加载和批处理的方式

复制代码

FishC · 发表于 2024-11-3 19:32:46

根据报错信息，问题出在模型的输入张量和模型本身不在同一个设备上。具体来说，某些张量在CPU上，而模型在GPU上（cuda:0）。要解决这个问题，确保所有张量和模型都在同一个设备上。

以下是一些可能的解决方案：

1. **确保所有输入张量都在同一个设备上：**

在代码中，`batch.to(device)` 可能没有正确地将所有张量移动到GPU上。确保每个张量都被正确地移动到设备上。

train_input_ids = batch['input_ids'].to(device)
train_attention_mask = batch['attention_mask'].to(device)
train_token_type_ids = batch['token_type_ids'].to(device)
train_labels = batch['labels'].to(device).long()

复制代码

2. **检查模型是否在正确的设备上：**

确保模型在训练开始时被移动到正确的设备上。

model.to(device)

复制代码

3. **检查所有数据加载器的输出：**

确保数据加载器输出的所有张量都被正确地移动到设备上。

4. **避免在模型输出后再调用 `.to(device)`：**

在模型输出后调用 `.to(device)` 可能会导致不必要的设备转换。确保模型的输出已经在正确的设备上。

如果这些步骤没有解决问题，请检查数据加载器的实现，确保它们输出的所有数据都在正确的设备上。

以上回复来自 FishC AI 小助理，如未能正确解答您的问题，请继续追问。

Terence888 · 发表于 2024-11-3 19:49:14

FishC 发表于 2024-11-3 19:32
根据报错信息，问题出在模型的输入张量和模型本身不在同一个设备上。具体来说，某些张量在CPU上，而模型在G ...

CUDA out of memory. Tried to allocate 1024.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 883.12 MiB is free. Process 2822 has 15.02 GiB memory in use. Of the allocated memory 14.73 GiB is allocated by PyTorch, and 7.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management

复制代码

更正了model调用时的设备，现在出现了这个提示

FishC · 发表于 2024-11-3 19:49:22

Terence888 发表于 2024-11-03 19:49
CUDA out of memory. Tried to allocate 1024.00 MiB. ...

这个错误提示表明你的GPU内存不足，无法分配所需的内存。以下是一些可能的解决方案：

1. **减少批量大小（Batch Size）：**
尝试减小 `DataLoader` 中的 `batch_size` 参数，这通常是最直接有效的方法。

2. **清理缓存：**
在训练循环中，使用 `torch.cuda.empty_cache()` 来释放未使用的显存。

3. **检查模型大小：**
如果模型非常大，考虑使用更小的模型或减少模型的参数数量。

4. **使用半精度浮点数（Mixed Precision Training）：**
使用 `torch.cuda.amp` 来进行混合精度训练，这可以显著减少显存使用。

5. **分段内存分配：**
设置环境变量 `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`，这可能有助于减少内存碎片化。

6. **关闭不必要的进程：**
确保没有其他进程占用GPU内存，使用 `nvidia-smi` 查看并终止不必要的进程。

7. **使用更大的GPU：**
如果可能，切换到具有更大显存的GPU。

尝试这些方法，看看是否能解决你的问题。

以上回复来自 FishC AI 小助理，如未能正确解答您的问题，请继续追问。

Terence888 · 发表于 2024-11-3 22:26:39

FishC 发表于 2024-11-3 19:49
这个错误提示表明你的GPU内存不足，无法分配所需的内存。以下是一些可能的解决方案：

1. **减少批量 ...

在哪里添加代码`torch.cuda.empty_cache()` 来释放未使用的显存。

FishC · 发表于 2024-11-3 22:45:53

Terence888 发表于 2024-11-03 22:26
在哪里添加代码`torch.cuda.empty_cache()` 来释放未 ...

你可以在训练循环的每个epoch结束后，或者在每个batch处理完后，添加 `torch.cuda.empty_cache()` 来释放未使用的显存。以下是你可以添加这行代码的地方：

# 在每个epoch结束后
for epoch in range(epochs):
# 训练代码...
# 释放未使用的显存
torch.cuda.empty_cache()
# 验证代码...

复制代码

或者在每个batch处理完后：

# 在每个batch处理完后
for batch in train_dataloader:
# 训练代码...
# 释放未使用的显存
torch.cuda.empty_cache()

复制代码

这样可以帮助减少显存的占用。

以上回复来自 FishC AI 小助理，如未能正确解答您的问题，请继续追问。

Terence888 · 发表于 2024-11-5 11:12:26

FishC 发表于 2024-11-3 22:45
你可以在训练循环的每个epoch结束后，或者在每个batch处理完后，添加 `torch.cuda.empty_cache()` 来释 ...

Your notebook tried to allocate more memory than is available. It has restarted.

kaggle跳出这个提示怎么办

FishC · 发表于 2024-11-5 11:12:37

Terence888 发表于 2024-11-05 11:12
Your notebook tried to allocate more memory than i ...

在Kaggle上，出现内存不足导致的重启问题时，可以尝试以下方法来优化内存使用：

1. **减少批量大小（Batch Size）：**
- 在 `DataLoader` 中减小 `batch_size` 参数。较小的批量大小会减少每次训练步骤所需的内存。

2. **优化数据预处理：**
- 确保数据预处理和增强操作尽可能高效。使用内存映射文件或其他方法以减少内存需求。

3. **检查模型大小：**
- 确认模型的架构是否可以进行简化，例如减少层数或参数数量。

4. **使用混合精度训练：**
- 可利用 PyTorch 的 `torch.cuda.amp` 进行混合精度训练，从而减少显存使用。

5. **定期清理内存：**
- 使用 `torch.cuda.empty_cache()` 清理缓存，避免内存浪费。也可以使用 Python 的 `gc.collect()` 进行垃圾回收。

6. **简化代码逻辑：**
- 确保没有冗余或不必要的变量在占用内存。将不再需要的中间结果从内存中删除。

7. **降低数据集大小：**
- 若数据集很大，考虑使用数据的一个较小子集进行实验，确保代码逻辑正确后再对完整数据集进行完整训练。

8. **考虑训练超参数：**
- 检查其他超参数设定是否导致过多内存占用，例如过高的学习率可能会导致不必要的内存使用增长。

9. **在本地运行：**
- 如有可能，在本地机器上运行和调试，这样可以获得更高的灵活性来管理资源。

如果在经过这些调整之后仍然出现问题，可以考虑在 Kaggle 上咨询社区，寻找一些其他用户的解决方案或建议。

以上回复来自 FishC AI 小助理，如未能正确解答您的问题，请继续追问。

Terence888 · 发表于 2024-11-5 17:28:33

FishC 发表于 2024-11-5 11:12
在Kaggle上，出现内存不足导致的重启问题时，可以尝试以下方法来优化内存使用：

1. **减少批量大小 ...

predictions = np.argmax(predictions,axis=1) # 0 or 1
sample['generated'] = predictions
sample.head(10)

复制代码

提示报错

---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[48], line 2
1 predictions = np.argmax(predictions,axis=1) # 0 or 1
----> 2 sample['generated'] = predictions
3 sample.head(10)
File /opt/conda/lib/python3.10/site-packages/pandas/core/frame.py:4311, in DataFrame.__setitem__(self, key, value)
4308 self._setitem_array([key], value)
4309 else:
4310 # set column
-> 4311 self._set_item(key, value)
File /opt/conda/lib/python3.10/site-packages/pandas/core/frame.py:4524, in DataFrame._set_item(self, key, value)
4514 def _set_item(self, key, value) -> None:
4515 """
4516 Add series to DataFrame in specified column.
4517
(...)
4522 ensure homogeneity.
4523 """
-> 4524 value, refs = self._sanitize_column(value)
4526 if (
4527 key in self.columns
4528 and value.ndim == 1
4529 and not isinstance(value.dtype, ExtensionDtype)
4530 ):
4531 # broadcast across multiple columns if necessary
4532 if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
File /opt/conda/lib/python3.10/site-packages/pandas/core/frame.py:5266, in DataFrame._sanitize_column(self, value)
5263 return _reindex_for_setitem(value, self.index)
5265 if is_list_like(value):
-> 5266 com.require_length_match(value, self.index)
5267 arr = sanitize_array(value, self.index, copy=True, allow_2d=True)
5268 if (
5269 isinstance(value, Index)
5270 and value.dtype == "object"
(...)
5273 # TODO: Remove kludge in sanitize_array for string mode when enforcing
5274 # this deprecation
File /opt/conda/lib/python3.10/site-packages/pandas/core/common.py:573, in require_length_match(data, index)
569 """
570 Check the length of data matches the length of the index.
571 """
572 if len(data) != len(index):
--> 573 raise ValueError(
574 "Length of values "
575 f"({len(data)}) "
576 "does not match length of index "
577 f"({len(index)})"
578 )
ValueError: Length of values (1) does not match length of index (3)

复制代码

FishC · 发表于 2024-11-5 17:34:58

Terence888 发表于 2024-11-05 17:28
predictions = np.argmax(predictions,axis=1) # 0 or ...

报错信息表明，给定的 `predictions` 长度与 Panda 数据框架的索引长度不匹配。

从你的代码来看，你可能尝试将 `predictions` 的结果保存到一个 Panda 数据框中，而 `np.argmax(predictions, axis=1)` 返回一个一维数组，其长度与输入的第二个维度相同。如果 `predictions` 本身是一个一维数组，这样操作会返回单一值，从而导致这个错误。

以下是一些解决方法：

1. **检查 `predictions` 的维度：**
- 在调用 `np.argmax()` 之前打印 `predictions` 的形状，确保其维度是正确的。例如：

print(predictions.shape)

复制代码

- 如果 `predictions` 是一个一维数组，那么使用 `np.argmax(predictions)` 而不是 `axis=1`。

2. **同步索引长度：**
- 确保与 `sample` 数据框相比，`predictions` 的长度是相同的。例如：

predictions = np.argmax(predictions, axis=1)
sample = pd.DataFrame(predictions, columns=['predictions']) # 重新构建 DataFrame
print(sample.head(10))

复制代码

3. **验证输入数据形状：**
- 如果 `predictions` 是来自模型的输出，重新检查模型输出的形状和 `np.argmax` 使用的参数，确保它们符合预期。

通过检查和调整这些部分，你应该能够解决这个问题。

以上回复来自 FishC AI 小助理，如未能正确解答您的问题，请继续追问。

Terence888 · 发表于 2024-11-5 18:04:05

FishC 发表于 2024-11-5 17:34
报错信息表明，给定的 `predictions` 长度与 Panda 数据框架的索引长度不匹配。

从你的代码来看，你 ...

BertForSequenceClassification.from_pretrained模型输出的结果为什么是二维张量，用于情感分类任务时两个维度分别表示什么

FishC · 发表于 2024-11-5 18:04:19

Terence888 发表于 2024-11-05 18:04
BertForSequenceClassification.from_pretrained模型 ...

在情感分类任务中使用 `BertForSequenceClassification.from_pretrained` 模型时，模型的输出是一个二维张量，通常表示为 `(batch_size, num_labels)`。

这里的两个维度分别代表：

1. **batch_size**：
这表示输入到模型的一批数据样本的数量。对于每个输入样本，模型都会输出一个结果。也就是说，模型会对每个样本分别提供一个预测。

2. **num_labels**：
这个维度代表标签的数量。在一个二分类（例如正面或负面情感）任务中，`num_labels` 通常是2。每个元素表示该类别的未归一化的logits（模型的未激活输出）。这些logits可以通过softmax函数转换为概率。

在情感分类任务中，如果你有两个类（例如正面和负面），那么输出中的每个预测（二维张量的每一行）会包含两个值，这两个值可以通过softmax进行转换来表示属于每个类的概率。通常情况下，我们会选择概率最大的那个类作为最终的预测结果。

具体地，你可以通过如下方式将logits转换为预测的类别标签：

import torch.nn.functional as F
# 假设 'outputs' 是模型的输出 logits 张量
outputs = model(input_ids, attention_mask=attention_mask)
logits = outputs.logits
# 使用 softmax 转换为概率
probs = F.softmax(logits, dim=1)
# 找到概率最大的索引作为预测类别
predictions = torch.argmax(probs, dim=1)

复制代码

希望这可以帮助你更好地理解模型的输出！

以上回复来自 FishC AI 小助理，如未能正确解答您的问题，请继续追问。

Terence888 · 发表于 2024-11-6 23:49:52

FishC 发表于 2024-11-5 18:04
在情感分类任务中使用 `BertForSequenceClassification.from_pretrained` 模型时，模型的输出是一个二 ...

---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[40], line 4
1 print('Training Start!')
2 print('=' * 100)
----> 4 train(model,
5 device,
6 train_dataloader,
7 valid_dataloader,
8 configs['epochs'],
9 loss_fn,
10 optimizer,
11 metric)
13 del model,train_dataloader, valid_dataloader
14 gc.collect()
Cell In[38], line 33, in train(model, device, train_dataloader, valid_dataloader, epochs, loss_fn, optimizer, metric)
31 loss = loss_fn(logits, train_labels)
32 loss.backward()
---> 33 optimizer.step()
34 model.zero_grad()
36 train_loss += loss.detach().cpu().numpy().item()
File /opt/conda/lib/python3.10/site-packages/torch/optim/optimizer.py:484, in Optimizer.profile_hook_step.<locals>.wrapper(*args, **kwargs)
479 else:
480 raise RuntimeError(
481 f"{func} must return None or a tuple of (new_args, new_kwargs), but got {result}."
482 )
--> 484 out = func(*args, **kwargs)
485 self._optimizer_step_code()
487 # call optimizer step post hooks
File /opt/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/transformers/optimization.py:648, in AdamW.step(self, closure)
646 exp_avg.mul_(beta1).add_(grad, alpha=(1.0 - beta1))
647 exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
--> 648 denom = exp_avg_sq.sqrt().add_(group["eps"])
650 step_size = group["lr"]
651 if group["correct_bias"]: # No bias correction for Bert
RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

复制代码

出现了这个报错，代码要如何修改

import gc,os
from tqdm.auto import tqdm # visualizing tool for progress
# They will be used to pick the best model.pt given to the valid loss
best_model_epoch, valid_loss_values = [],[]
valid_loss_min = [1] # arbitrary loss I set here
def train(model,device,train_dataloader,valid_dataloader,epochs,loss_fn,optimizer,metric):
for epoch in range(epochs):
gc.collect() # memory cleaning垃圾回收机制，减少占用内存
model.train()
train_loss = 0
train_step = 0
pbar = tqdm(train_dataloader)#tqdm参数是一个iterable
for batch in pbar: # you can also write like "for batch in tqdm(train_dataloader"
optimizer.zero_grad() # initialize
train_step += 1
train_input_ids = batch['input_ids'].to(device)
train_attention_mask = batch['attention_mask'].to(device)
train_labels = batch['labels'].squeeze().to(device).long()#long()转化成一维张量
# You can refer to the class "TweetsModel" for understand
# what would be logits
logits = model(train_input_ids, train_attention_mask).to(device)
predictions = torch.argmax(logits, dim=1) # get an index from larger one
detached_predictions = predictions.detach().cpu().numpy()
loss = loss_fn(logits, train_labels)
loss.backward()
optimizer.step()
model.zero_grad()
train_loss += loss.detach().cpu().numpy().item()
pbar.set_postfix({'train_loss':train_loss/train_step})#设置进度条显示信息
pbar.close()
with torch.no_grad():
model.eval()
valid_loss = 0
valid_step = 0
total_valid_score = 0
y_pred = [] # for getting f1_score that is a metric of the competition
y_true = []
pbar = tqdm(valid_dataloader)
for batch in pbar:
valid_step += 1
valid_input_ids = batch['input_ids'].to(device)
valid_attention_mask = batch['attention_mask'].to(device)
valid_labels = batch['labels'].squeeze().to(device).long()
logits = model(valid_input_ids, valid_attention_mask).to(device)
predictions = torch.argmax(logits, dim=1)
detached_predictions = predictions.detach().cpu().numpy()
loss = loss_fn(logits, valid_labels)
valid_loss += loss.detach().cpu().numpy().item()
y_pred.extend(predictions.cpu().numpy())
y_true.extend(valid_labels.cpu().numpy())
valid_loss /= valid_step
f1 = f1_score(y_true,y_pred)
print(f'Epoch [{epoch+1}/{epochs}] Score: {f1}')
print(f'Epoch [{epoch+1}/{epochs}] Valid_loss: {valid_loss}')
if valid_loss < min(valid_loss_min):
print('model improved!')
else:
print('model not improved')
torch.save(model.state_dict(), f'epoch:{epoch+1}_model.pt')#state_dict 是一个字典对象，包含了模型的所有可学习参数（如权重和偏置）及其当前值
print('save checkpoint!')
valid_loss_min.append(valid_loss)
print(f'valid_loss_min:{min(valid_loss_min)}')
best_model_epoch.append(f'/kaggle/working/epoch:{epoch+1}_model.pt')
valid_loss_values.append(valid_loss)
print('='*100)
select_best_model() # refer to below function
print('Train/Valid Completed!!')
del train_dataloader, valid_dataloader # memory cleaning
gc.collect()
def select_best_model():
best_model = best_model_epoch[np.array(valid_loss_values).argmin()]
os.rename(best_model, best_model.split('.pt')[0] + '_best.pt')#重命名文件

复制代码

FishC · 发表于 2024-11-7 00:12:49

Terence888 发表于 2024-11-06 23:49
-------------------------------------------------- ...

这个错误提示是一个通用的CUDA错误，可能由多种原因引起。以下是一些可能的解决方案和调试步骤：

1. **设置CUDA同步：**
- 通过设置环境变量 `CUDA_LAUNCH_BLOCKING=1` 来强制CUDA同步，这样可以帮助你更准确地定位错误发生的位置。

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

复制代码

2. **检查设备分配：**
- 确保所有张量和模型都在同一个设备上。你已经在代码中使用了 `.to(device)`，但请确保 `device` 是正确设置的。

3. **减少批量大小：**
- 尝试减小 `DataLoader` 中的 `batch_size`，以减少每次训练步骤所需的内存。

4. **检查内存使用：**
- 使用 `torch.cuda.memory_summary()` 来检查当前的内存使用情况，帮助你了解内存的分配情况。

5. **清理缓存：**
- 在训练循环中使用 `torch.cuda.empty_cache()` 来释放未使用的显存。

6. **检查模型和数据：**
- 确保模型和数据没有问题，特别是数据的形状和类型是否符合模型的输入要求。

7. **调试代码：**
- 在关键步骤前后添加打印语句，检查每一步的输出和状态。

8. **更新PyTorch和CUDA：**
- 确保你使用的是最新版本的PyTorch和CUDA，可能会修复一些已知的错误。

如果这些步骤没有解决问题，可能需要更详细地检查代码逻辑和数据处理流程。

以上回复来自 FishC AI 小助理，如未能正确解答您的问题，请继续追问。

账号		自动登录	找回密码
密码			立即注册

报错显示存在多个gpu

马上注册，结交更多好友，享用更多功能^_^

浏览过的版块