OP | Posted on 2024-10-27 16:37:49
Converting the code to run on the GPU throws the following error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[32], line 96
     94 for epoch in range(num_epochs):
     95     train_loss = 0
---> 96     for tensor_x, tensor_y in train_iter:  # gradient updates on the training set
     97         tensor_x = tensor_x.float()
     98         tensor_y = tensor_y.float().reshape(-1, 1)

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:630, in _BaseDataLoaderIter.__next__(self)
    627 if self._sampler_iter is None:
    628     # TODO(https://github.com/pytorch/pytorch/issues/76750)
    629     self._reset()  # type: ignore[call-arg]
--> 630 data = self._next_data()
    631 self._num_yielded += 1
    632 if self._dataset_kind == _DatasetKind.Iterable and \
    633         self._IterableDataset_len_called is not None and \
    634         self._num_yielded > self._IterableDataset_len_called:

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:1344, in _MultiProcessingDataLoaderIter._next_data(self)
   1342 else:
   1343     del self._task_info[idx]
-> 1344 return self._process_data(data)

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:1370, in _MultiProcessingDataLoaderIter._process_data(self, data)
   1368 self._try_put_index()
   1369 if isinstance(data, ExceptionWrapper):
-> 1370     data.reraise()
   1371 return data

File /opt/conda/lib/python3.10/site-packages/torch/_utils.py:706, in ExceptionWrapper.reraise(self)
    702 except TypeError:
    703     # If the exception takes multiple arguments, don't try to
    704     # instantiate since we don't know how to
    705     raise RuntimeError(msg) from None
--> 706 raise exception

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 309, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataset.py", line 211, in __getitem__
    return tuple(tensor[index] for tensor in self.tensors)
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataset.py", line 211, in <genexpr>
    return tuple(tensor[index] for tensor in self.tensors)
RuntimeError: CUDA error: initialization error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
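
The inner traceback already points at the cause: the worker raises "RuntimeError: CUDA error: initialization error" while indexing self.tensors, i.e. the TensorDataset holds CUDA tensors, and with num_workers=2 the DataLoader forks worker subprocesses, which cannot (re)initialize CUDA. The usual fix is to keep the dataset on the CPU and move each batch to the GPU inside the training loop. A minimal sketch of that pattern (the random tensors are stand-ins for the real fold data):

import torch
from torch.utils.data import TensorDataset, DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the dataset from CPU tensors so the worker processes never touch CUDA.
X_cpu = torch.randn(1000, 154)   # stand-in for the real feature matrix
y_cpu = torch.randn(1000)        # stand-in for the real targets
train_ds = TensorDataset(X_cpu, y_cpu)

# pin_memory=True speeds up the host-to-device copies below; the workers stay CPU-only.
train_iter = DataLoader(train_ds, batch_size=128, shuffle=True,
                        num_workers=2, pin_memory=True)

for tensor_x, tensor_y in train_iter:
    # Move each batch to the GPU in the main process only.
    tensor_x = tensor_x.to(device, non_blocking=True)
    tensor_y = tensor_y.to(device, non_blocking=True).reshape(-1, 1)
    # ... forward / backward / step exactly as in the loop below ...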
---------------------------------- Source code below
# DNN model with k-fold cross-validation
import numpy as np                  # used below (np.float32, np.mean) but missing from the original imports
import pandas as pd                 # used below (pd.concat) but missing from the original imports
import torch
import torch.nn as nn
from torch.utils import data
from torch.utils.data import Dataset, DataLoader
from torch import optim

# Assumed defined in earlier cells: X, y (pandas objects) and MSLE_loss; device is typically set up as:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the neural-network model
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.hidden_layer1 = nn.Linear(154, 1024)
        self.hidden_layer2 = nn.Linear(1024, 1024)
        self.hidden_layer3 = nn.Linear(1024, 1024)
        self.hidden_layer4 = nn.Linear(1024, 1024)
        self.output_layer = nn.Linear(1024, 1)
        self.dropout = nn.Dropout(p=0.5)
        nn.init.xavier_uniform_(self.hidden_layer1.weight)
        nn.init.xavier_uniform_(self.hidden_layer2.weight)
        nn.init.xavier_uniform_(self.hidden_layer3.weight)
        nn.init.xavier_uniform_(self.hidden_layer4.weight)
        nn.init.xavier_uniform_(self.output_layer.weight)

    def forward(self, x):
        inputs = x
        layer1_out = torch.nn.functional.gelu(self.hidden_layer1(inputs))
        layer1_out = self.dropout(layer1_out)
        layer2_out = torch.nn.functional.gelu(self.hidden_layer2(layer1_out))
        layer2_out = self.dropout(layer2_out)
        layer3_out = torch.nn.functional.gelu(self.hidden_layer3(layer2_out))
        layer3_out = self.dropout(layer3_out)
        layer4_out = torch.nn.functional.gelu(self.hidden_layer4(layer3_out))
        layer4_out = self.dropout(layer4_out)
        output = torch.relu(self.output_layer(layer4_out))  # ReLU keeps predictions non-negative, as the MSLE loss requires
        return output
# Hyperparameters
k = 5
batch_size = 128
num_epochs = 1000
weight_decay = 0

# Initialize the model and the optimizer
# (note: both are created once here, so weights and optimizer state carry over across the k folds)
dnn_model = SimpleNN().to(device)  # move the model to the GPU
optimizer = optim.AdamW(dnn_model.parameters(), lr=0.0001, weight_decay=weight_decay)  # define the optimizer
# Select the training and validation sets for fold i of k-fold cross-validation
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = len(X) // k  # samples beyond k * fold_size are dropped
    X_train, y_train = None, None
    for j in range(k):
        start = j * fold_size
        end = (j + 1) * fold_size
        if j == i:
            X_valid, y_valid = X.iloc[start:end], y.iloc[start:end]
        elif X_train is None:
            X_train, y_train = X.iloc[start:end], y.iloc[start:end]
        else:
            X_train = pd.concat([X_train, X.iloc[start:end]], ignore_index=True)
            y_train = pd.concat([y_train, y.iloc[start:end]], ignore_index=True)
    return X_train, y_train, X_valid, y_valid
# Initialize the per-fold loss lists
train_ls, valid_ls = [], []
for i in range(k):
    X_train, y_train, X_valid, y_valid = get_k_fold_data(k, i, X, y)
    print(f'FOLD {i}')
    print('--------------------------------')

    # Convert the DataFrames to NumPy arrays, then to PyTorch tensors, and move them to the GPU
    X_train = torch.tensor(X_train.astype(np.float32).values, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train.astype(np.float32).values, dtype=torch.float32).to(device)
    X_valid = torch.tensor(X_valid.astype(np.float32).values, dtype=torch.float32).to(device)
    y_valid = torch.tensor(y_valid.astype(np.float32).values, dtype=torch.float32).to(device)

    # Build the datasets
    train_ds = data.TensorDataset(X_train, y_train)
    valid_ds = data.TensorDataset(X_valid, y_valid)
    # Build the data iterators; shuffle=True is equivalent to sampler=RandomSampler(dataset)
    train_iter = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
    valid_iter = DataLoader(dataset=valid_ds, batch_size=batch_size, shuffle=True, num_workers=2)

    # Training loop
    for epoch in range(num_epochs):
        train_loss = 0
        for tensor_x, tensor_y in train_iter:  # gradient updates on the training set
            tensor_x = tensor_x.float()
            tensor_y = tensor_y.float().reshape(-1, 1)
            optimizer.zero_grad()                     # clear the gradients
            pre_train = dnn_model(tensor_x)
            train_l = MSLE_loss(pre_train, tensor_y)  # avoid shadowing a global variable named loss
            train_l.backward()                        # backpropagation
            optimizer.step()                          # gradient-descent step
            train_loss += train_l.item() * len(tensor_x)

        train_loss /= len(train_ds)  # average loss for this epoch

        if epoch % 200 == 0:
            print('Loss: {} Epoch: {}'.format(train_loss, epoch))

        # Note: dnn_model.eval() is never called, so dropout stays active during validation
        with torch.no_grad():
            valid_loss = 0

            for tensor_x, tensor_y in valid_iter:
                tensor_x = tensor_x.float()
                tensor_y = tensor_y.float().reshape(-1, 1)
                pre_valid = dnn_model(tensor_x)
                valid_l = MSLE_loss(pre_valid, tensor_y)
                valid_loss += valid_l.item() * len(tensor_x)

            valid_loss /= len(valid_ds)

            if epoch % 200 == 0:
                print('Valid Loss: {} Epoch: {}'.format(valid_loss, epoch))

    # Append this fold's final losses to the lists
    train_ls.append(train_loss)
    valid_ls.append(valid_loss)

print('Training Ended')
print('Train Average Loss: {} Valid Average Loss: {}'.format(np.mean(train_ls), np.mean(valid_ls)))
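
Since a TensorDataset already lives in memory and indexing it is cheap, the worker subprocesses buy little here, and there are two quicker workarounds. Both lines below are sketches that reuse train_ds, batch_size, and the rest of the loop unchanged:

# Option 1: no worker subprocesses at all, so the fork never happens.
train_iter = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True, num_workers=0)

# Option 2: spawn fresh interpreters instead of forking; unlike forked workers,
# spawned ones may initialize CUDA themselves (at the cost of slower startup).
train_iter = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True,
                        num_workers=2, multiprocessing_context='spawn')

Still, keeping the fold tensors on the CPU and calling .to(device) per batch, as sketched under the traceback above, is the idiomatic pattern, and it combines well with pin_memory=True.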