
How do I modify my code to run on the GPU with CUDA?

Posted on 2024-10-27 16:14:47

# DNN model (k-fold cross-validation)
import torch
import torch.nn as nn
from torch.utils import data
from torch.utils.data import Dataset, DataLoader
from torch import optim
import numpy as np
import pandas as pd

# Define the neural network model
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.hidden_layer1 = nn.Linear(154, 1024)
        self.hidden_layer2 = nn.Linear(1024, 1024)
        self.hidden_layer3 = nn.Linear(1024, 1024)
        self.hidden_layer4 = nn.Linear(1024, 1024)
        self.output_layer = nn.Linear(1024, 1)
        self.dropout = nn.Dropout(p=0.5)
        nn.init.xavier_uniform_(self.hidden_layer1.weight)
        nn.init.xavier_uniform_(self.hidden_layer2.weight)
        nn.init.xavier_uniform_(self.hidden_layer3.weight)
        nn.init.xavier_uniform_(self.hidden_layer4.weight)
        nn.init.xavier_uniform_(self.output_layer.weight)

    def forward(self, x):
        inputs = x
        layer1_out = torch.nn.functional.gelu(self.hidden_layer1(inputs))
        layer1_out = self.dropout(layer1_out)
        layer2_out = torch.nn.functional.gelu(self.hidden_layer2(layer1_out))
        layer2_out = self.dropout(layer2_out)
        layer3_out = torch.nn.functional.gelu(self.hidden_layer3(layer2_out))
        layer3_out = self.dropout(layer3_out)
        layer4_out = torch.nn.functional.gelu(self.hidden_layer4(layer3_out))
        layer4_out = self.dropout(layer4_out)
        output = torch.relu(self.output_layer(layer4_out))
        return output

# Hyperparameters
k = 5
batch_size = 128
num_epochs = 1000
weight_decay = 0

# Initialize the model and optimizer
dnn_model = SimpleNN()
optimizer = optim.AdamW(dnn_model.parameters(), lr=0.0001, weight_decay=weight_decay)

# Select training and validation sets for k-fold cross-validation
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = len(X) // k
    X_train, y_train = None, None
    for j in range(k):
        start = j * fold_size
        end = (j + 1) * fold_size
        if j == i:
            X_valid, y_valid = X.iloc[start:end], y.iloc[start:end]
        elif X_train is None:
            X_train, y_train = X.iloc[start:end], y.iloc[start:end]
        else:
            X_train = pd.concat([X_train, X.iloc[start:end]], ignore_index=True)
            y_train = pd.concat([y_train, y.iloc[start:end]], ignore_index=True)
    return X_train, y_train, X_valid, y_valid

# Initialize the per-fold loss lists
train_ls, valid_ls = [], []

# X, y (DataFrames) and MSLE_loss are assumed to be defined in earlier notebook cells
for i in range(k):
    X_train, y_train, X_valid, y_valid = get_k_fold_data(k, i, X, y)
    print(f'FOLD {i}')
    print('--------------------------------')

    # Convert the DataFrames to NumPy arrays, then to PyTorch tensors
    X_train = torch.tensor(X_train.astype(np.float32).values, dtype=torch.float32)
    y_train = torch.tensor(y_train.astype(np.float32).values, dtype=torch.float32)
    X_valid = torch.tensor(X_valid.astype(np.float32).values, dtype=torch.float32)
    y_valid = torch.tensor(y_valid.astype(np.float32).values, dtype=torch.float32)

    # Build the datasets
    train_ds = data.TensorDataset(X_train, y_train)
    valid_ds = data.TensorDataset(X_valid, y_valid)

    # Get data iterators (shuffle=True is equivalent to sampler=RandomSampler(dataset))
    train_iter = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
    valid_iter = DataLoader(dataset=valid_ds, batch_size=batch_size, shuffle=True, num_workers=2)

    # Training loop
    for epoch in range(num_epochs):
        train_loss = 0
        for tensor_x, tensor_y in train_iter:  # gradient updates on the training set
            tensor_x = tensor_x.float()
            tensor_y = tensor_y.float().reshape(-1, 1)
            optimizer.zero_grad()  # clear the gradients
            pre_train = dnn_model(tensor_x)
            train_l = MSLE_loss(pre_train, tensor_y)  # avoid shadowing a global variable named loss
            train_l.backward()  # backpropagation
            optimizer.step()  # parameter update

            train_loss += train_l.item() * len(tensor_x)

        train_loss /= len(train_ds)  # average loss for this epoch

        if epoch % 200 == 0:
            print('Loss: {}  Epoch: {}'.format(train_loss, epoch))

        with torch.no_grad():
            valid_loss = 0
            for tensor_x, tensor_y in valid_iter:
                tensor_x = tensor_x.float()
                tensor_y = tensor_y.float().reshape(-1, 1)
                pre_valid = dnn_model(tensor_x)
                valid_l = MSLE_loss(pre_valid, tensor_y)
                valid_loss += valid_l.item() * len(tensor_x)

            valid_loss /= len(valid_ds)

            if epoch % 200 == 0:
                print('Valid Loss: {}  Epoch: {}'.format(valid_loss, epoch))

    # Append each fold's final losses to the lists
    train_ls.append(train_loss)
    valid_ls.append(valid_loss)

print('Training Ended')
print('Train Average Loss: {} Valid Average Loss: {}'.format(np.mean(train_ls), np.mean(valid_ls)))


With this many epochs the CPU can't keep up. How should I modify the code to run it on the GPU?

Posted on 2024-10-27 16:15:04
To get your code running on the GPU, you mainly need three steps:

1. **Check whether a GPU is available.**
2. **Move the model to the GPU.**
3. **Move the input and target data to the GPU.**

Here is the modified code:

import torch
import torch.nn as nn
from torch.utils import data
from torch.utils.data import Dataset, DataLoader
from torch import optim
import numpy as np
import pandas as pd

# Check whether a GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the neural network model
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.hidden_layer1 = nn.Linear(154, 1024)
        self.hidden_layer2 = nn.Linear(1024, 1024)
        self.hidden_layer3 = nn.Linear(1024, 1024)
        self.hidden_layer4 = nn.Linear(1024, 1024)
        self.output_layer = nn.Linear(1024, 1)
        self.dropout = nn.Dropout(p=0.5)
        nn.init.xavier_uniform_(self.hidden_layer1.weight)
        nn.init.xavier_uniform_(self.hidden_layer2.weight)
        nn.init.xavier_uniform_(self.hidden_layer3.weight)
        nn.init.xavier_uniform_(self.hidden_layer4.weight)
        nn.init.xavier_uniform_(self.output_layer.weight)

    def forward(self, x):
        inputs = x
        layer1_out = torch.nn.functional.gelu(self.hidden_layer1(inputs))
        layer1_out = self.dropout(layer1_out)
        layer2_out = torch.nn.functional.gelu(self.hidden_layer2(layer1_out))
        layer2_out = self.dropout(layer2_out)
        layer3_out = torch.nn.functional.gelu(self.hidden_layer3(layer2_out))
        layer3_out = self.dropout(layer3_out)
        layer4_out = torch.nn.functional.gelu(self.hidden_layer4(layer3_out))
        layer4_out = self.dropout(layer4_out)
        output = torch.relu(self.output_layer(layer4_out))
        return output

# Hyperparameters
k = 5
batch_size = 128
num_epochs = 1000
weight_decay = 0

# Initialize the model and optimizer
dnn_model = SimpleNN().to(device)  # move the model to the GPU
optimizer = optim.AdamW(dnn_model.parameters(), lr=0.0001, weight_decay=weight_decay)

# Select training and validation sets for k-fold cross-validation
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = len(X) // k
    X_train, y_train = None, None
    for j in range(k):
        start = j * fold_size
        end = (j + 1) * fold_size
        if j == i:
            X_valid, y_valid = X.iloc[start:end], y.iloc[start:end]
        elif X_train is None:
            X_train, y_train = X.iloc[start:end], y.iloc[start:end]
        else:
            X_train = pd.concat([X_train, X.iloc[start:end]], ignore_index=True)
            y_train = pd.concat([y_train, y.iloc[start:end]], ignore_index=True)
    return X_train, y_train, X_valid, y_valid

# Initialize the per-fold loss lists
train_ls, valid_ls = [], []

for i in range(k):
    X_train, y_train, X_valid, y_valid = get_k_fold_data(k, i, X, y)
    print(f'FOLD {i}')
    print('--------------------------------')

    # Convert the DataFrames to NumPy arrays, then to PyTorch tensors, and move them to the GPU
    X_train = torch.tensor(X_train.astype(np.float32).values, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train.astype(np.float32).values, dtype=torch.float32).to(device)
    X_valid = torch.tensor(X_valid.astype(np.float32).values, dtype=torch.float32).to(device)
    y_valid = torch.tensor(y_valid.astype(np.float32).values, dtype=torch.float32).to(device)

    # Build the datasets
    train_ds = data.TensorDataset(X_train, y_train)
    valid_ds = data.TensorDataset(X_valid, y_valid)

    # Get data iterators
    train_iter = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
    valid_iter = DataLoader(dataset=valid_ds, batch_size=batch_size, shuffle=True, num_workers=2)

    # Training loop
    for epoch in range(num_epochs):
        train_loss = 0
        for tensor_x, tensor_y in train_iter:  # gradient updates on the training set
            tensor_x = tensor_x.float()
            tensor_y = tensor_y.float().reshape(-1, 1)
            optimizer.zero_grad()  # clear the gradients
            pre_train = dnn_model(tensor_x)
            train_l = MSLE_loss(pre_train, tensor_y)  # avoid shadowing a global variable named loss
            train_l.backward()  # backpropagation
            optimizer.step()  # parameter update

            train_loss += train_l.item() * len(tensor_x)

        train_loss /= len(train_ds)  # average loss for this epoch

        if epoch % 200 == 0:
            print('Loss: {}  Epoch: {}'.format(train_loss, epoch))

        with torch.no_grad():
            valid_loss = 0
            for tensor_x, tensor_y in valid_iter:
                tensor_x = tensor_x.float()
                tensor_y = tensor_y.float().reshape(-1, 1)
                pre_valid = dnn_model(tensor_x)
                valid_l = MSLE_loss(pre_valid, tensor_y)
                valid_loss += valid_l.item() * len(tensor_x)

            valid_loss /= len(valid_ds)

            if epoch % 200 == 0:
                print('Valid Loss: {}  Epoch: {}'.format(valid_loss, epoch))

    # Append each fold's final losses to the lists
    train_ls.append(train_loss)
    valid_ls.append(valid_loss)

print('Training Ended')
print('Train Average Loss: {} Valid Average Loss: {}'.format(np.mean(train_ls), np.mean(valid_ls)))


**Notes**:
1. When initializing the model, use `.to(device)` to move it to the target device (the GPU).
2. After converting the data to PyTorch tensors, also use `.to(device)` to move it to the GPU.
3. Make sure the CUDA driver and toolkit are installed in your environment (a quick check is sketched below).
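
For point 3, a minimal sketch (not part of the code above) that confirms PyTorch can actually see a CUDA device:

import torch

print(torch.cuda.is_available())           # True if PyTorch can use a CUDA GPU
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))   # model name of the first GPU
    print(torch.version.cuda)              # CUDA version this PyTorch build targets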

This reply comes from the FishC AI assistant; if it did not correctly answer your question, please follow up.

OP | Posted on 2024-10-27 16:37:49
FishC posted on 2024-10-27 16:15:
To get your code running on the GPU, you mainly need three steps:

1. **Check whether a GPU is available.**

After converting the code to run on the GPU, it throws an error:
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[32], line 96
     94 for epoch in range(num_epochs):
     95     train_loss = 0
---> 96     for tensor_x, tensor_y in train_iter:#训练集执行梯度更新
     97         tensor_x = tensor_x.float()
     98         tensor_y = tensor_y.float().reshape(-1, 1)

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:630, in _BaseDataLoaderIter.__next__(self)
    627 if self._sampler_iter is None:
    628     # TODO(https://github.com/pytorch/pytorch/issues/76750)
    629     self._reset()  # type: ignore[call-arg]
--> 630 data = self._next_data()
    631 self._num_yielded += 1
    632 if self._dataset_kind == _DatasetKind.Iterable and \
    633         self._IterableDataset_len_called is not None and \
    634         self._num_yielded > self._IterableDataset_len_called:

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:1344, in _MultiProcessingDataLoaderIter._next_data(self)
   1342 else:
   1343     del self._task_info[idx]
-> 1344     return self._process_data(data)

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:1370, in _MultiProcessingDataLoaderIter._process_data(self, data)
   1368 self._try_put_index()
   1369 if isinstance(data, ExceptionWrapper):
-> 1370     data.reraise()
   1371 return data

File /opt/conda/lib/python3.10/site-packages/torch/_utils.py:706, in ExceptionWrapper.reraise(self)
    702 except TypeError:
    703     # If the exception takes multiple arguments, don't try to
    704     # instantiate since we don't know how to
    705     raise RuntimeError(msg) from None
--> 706 raise exception

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 309, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataset.py", line 211, in __getitem__
    return tuple(tensor[index] for tensor in self.tensors)
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataset.py", line 211, in <genexpr>
    return tuple(tensor[index] for tensor in self.tensors)
RuntimeError: CUDA error: initialization error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


Below is the source code:
# DNN model (k-fold cross-validation)
import torch
import torch.nn as nn
from torch.utils import data
from torch.utils.data import Dataset, DataLoader
from torch import optim
import numpy as np
import pandas as pd

# Check whether a GPU is available (assumed defined in another cell in the original notebook)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the neural network model
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.hidden_layer1 = nn.Linear(154, 1024)
        self.hidden_layer2 = nn.Linear(1024, 1024)
        self.hidden_layer3 = nn.Linear(1024, 1024)
        self.hidden_layer4 = nn.Linear(1024, 1024)
        self.output_layer = nn.Linear(1024, 1)
        self.dropout = nn.Dropout(p=0.5)
        nn.init.xavier_uniform_(self.hidden_layer1.weight)
        nn.init.xavier_uniform_(self.hidden_layer2.weight)
        nn.init.xavier_uniform_(self.hidden_layer3.weight)
        nn.init.xavier_uniform_(self.hidden_layer4.weight)
        nn.init.xavier_uniform_(self.output_layer.weight)

    def forward(self, x):
        inputs = x
        layer1_out = torch.nn.functional.gelu(self.hidden_layer1(inputs))
        layer1_out = self.dropout(layer1_out)
        layer2_out = torch.nn.functional.gelu(self.hidden_layer2(layer1_out))
        layer2_out = self.dropout(layer2_out)
        layer3_out = torch.nn.functional.gelu(self.hidden_layer3(layer2_out))
        layer3_out = self.dropout(layer3_out)
        layer4_out = torch.nn.functional.gelu(self.hidden_layer4(layer3_out))
        layer4_out = self.dropout(layer4_out)
        output = torch.relu(self.output_layer(layer4_out))
        return output

# Hyperparameters
k = 5
batch_size = 128
num_epochs = 1000
weight_decay = 0

# Initialize the model and optimizer
dnn_model = SimpleNN().to(device)  # move the model to the GPU
optimizer = optim.AdamW(dnn_model.parameters(), lr=0.0001, weight_decay=weight_decay)

# Select training and validation sets for k-fold cross-validation
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = len(X) // k
    X_train, y_train = None, None
    for j in range(k):
        start = j * fold_size
        end = (j + 1) * fold_size
        if j == i:
            X_valid, y_valid = X.iloc[start:end], y.iloc[start:end]
        elif X_train is None:
            X_train, y_train = X.iloc[start:end], y.iloc[start:end]
        else:
            X_train = pd.concat([X_train, X.iloc[start:end]], ignore_index=True)
            y_train = pd.concat([y_train, y.iloc[start:end]], ignore_index=True)
    return X_train, y_train, X_valid, y_valid

# Initialize the per-fold loss lists
train_ls, valid_ls = [], []

for i in range(k):
    X_train, y_train, X_valid, y_valid = get_k_fold_data(k, i, X, y)
    print(f'FOLD {i}')
    print('--------------------------------')

    # Convert the DataFrames to NumPy arrays, then to PyTorch tensors, and move them to the GPU
    X_train = torch.tensor(X_train.astype(np.float32).values, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train.astype(np.float32).values, dtype=torch.float32).to(device)
    X_valid = torch.tensor(X_valid.astype(np.float32).values, dtype=torch.float32).to(device)
    y_valid = torch.tensor(y_valid.astype(np.float32).values, dtype=torch.float32).to(device)

    # Build the datasets
    train_ds = data.TensorDataset(X_train, y_train)
    valid_ds = data.TensorDataset(X_valid, y_valid)

    # Get data iterators (shuffle=True is equivalent to sampler=RandomSampler(dataset))
    train_iter = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
    valid_iter = DataLoader(dataset=valid_ds, batch_size=batch_size, shuffle=True, num_workers=2)

    # Training loop
    for epoch in range(num_epochs):
        train_loss = 0
        for tensor_x, tensor_y in train_iter:  # gradient updates on the training set
            tensor_x = tensor_x.float()
            tensor_y = tensor_y.float().reshape(-1, 1)
            optimizer.zero_grad()  # clear the gradients
            pre_train = dnn_model(tensor_x)
            train_l = MSLE_loss(pre_train, tensor_y)  # avoid shadowing a global variable named loss
            train_l.backward()  # backpropagation
            optimizer.step()  # parameter update

            train_loss += train_l.item() * len(tensor_x)

        train_loss /= len(train_ds)  # average loss for this epoch

        if epoch % 200 == 0:
            print('Loss: {}  Epoch: {}'.format(train_loss, epoch))

        with torch.no_grad():
            valid_loss = 0
            for tensor_x, tensor_y in valid_iter:
                tensor_x = tensor_x.float()
                tensor_y = tensor_y.float().reshape(-1, 1)
                pre_valid = dnn_model(tensor_x)
                valid_l = MSLE_loss(pre_valid, tensor_y)
                valid_loss += valid_l.item() * len(tensor_x)

            valid_loss /= len(valid_ds)

            if epoch % 200 == 0:
                print('Valid Loss: {}  Epoch: {}'.format(valid_loss, epoch))

    # Append each fold's final losses to the lists
    train_ls.append(train_loss)
    valid_ls.append(valid_loss)

print('Training Ended')
print('Train Average Loss: {} Valid Average Loss: {}'.format(np.mean(train_ls), np.mean(valid_ls)))

Posted on 2024-10-27 16:52:11
Terence888 posted on 2024-10-27 16:37:
After converting the code to run on the GPU, it throws an error ...


OK, let's analyze and solve this problem step by step.

### Problem Analysis
The error message shows a `CUDA error: initialization error`. This is usually not a problem with the code logic itself but with the environment or with multiprocessing. Your DataLoaders use the `num_workers` parameter, so data is loaded in parallel worker subprocesses; since the dataset tensors now live on the GPU, those workers have to touch CUDA, and a CUDA context cannot be initialized in a forked subprocess, which triggers this initialization error.

### Solution
1. Make sure device selection and data movement are handled correctly in your code.
2. Set `num_workers=0` in the DataLoaders to avoid multiprocess data loading, especially while debugging.

### Modified Code
Here is the modified code you can run:

import torch
import torch.nn as nn
from torch.utils import data
from torch.utils.data import Dataset, DataLoader
from torch import optim
import numpy as np
import pandas as pd

# Check whether a GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the neural network model
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.hidden_layer1 = nn.Linear(154, 1024)
        self.hidden_layer2 = nn.Linear(1024, 1024)
        self.hidden_layer3 = nn.Linear(1024, 1024)
        self.hidden_layer4 = nn.Linear(1024, 1024)
        self.output_layer = nn.Linear(1024, 1)
        self.dropout = nn.Dropout(p=0.5)
        nn.init.xavier_uniform_(self.hidden_layer1.weight)
        nn.init.xavier_uniform_(self.hidden_layer2.weight)
        nn.init.xavier_uniform_(self.hidden_layer3.weight)
        nn.init.xavier_uniform_(self.hidden_layer4.weight)
        nn.init.xavier_uniform_(self.output_layer.weight)

    def forward(self, x):
        inputs = x
        layer1_out = torch.nn.functional.gelu(self.hidden_layer1(inputs))
        layer1_out = self.dropout(layer1_out)
        layer2_out = torch.nn.functional.gelu(self.hidden_layer2(layer1_out))
        layer2_out = self.dropout(layer2_out)
        layer3_out = torch.nn.functional.gelu(self.hidden_layer3(layer2_out))
        layer3_out = self.dropout(layer3_out)
        layer4_out = torch.nn.functional.gelu(self.hidden_layer4(layer3_out))
        layer4_out = self.dropout(layer4_out)
        output = torch.relu(self.output_layer(layer4_out))
        return output

# Hyperparameters
k = 5
batch_size = 128
num_epochs = 1000
weight_decay = 0
learning_rate = 0.0001

# Initialize the model and optimizer
dnn_model = SimpleNN().to(device)
optimizer = optim.AdamW(dnn_model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Select training and validation sets for k-fold cross-validation
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = len(X) // k
    X_train, y_train = None, None
    for j in range(k):
        start = j * fold_size
        end = (j + 1) * fold_size
        X_part, y_part = X[start:end], y[start:end]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = pd.concat([X_train, X_part], ignore_index=True)
            y_train = pd.concat([y_train, y_part], ignore_index=True)
    return X_train, y_train, X_valid, y_valid

# Initialize the per-fold loss lists
train_ls, valid_ls = [], []

for i in range(k):
    X_train, y_train, X_valid, y_valid = get_k_fold_data(k, i, X, y)
    print(f'FOLD {i}')
    print('--------------------------------')

    # Convert the DataFrames to NumPy arrays, then to PyTorch tensors, and move them to the GPU
    X_train = torch.tensor(X_train.astype(np.float32).values, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train.astype(np.float32).values, dtype=torch.float32).to(device)
    X_valid = torch.tensor(X_valid.astype(np.float32).values, dtype=torch.float32).to(device)
    y_valid = torch.tensor(y_valid.astype(np.float32).values, dtype=torch.float32).to(device)

    # Build the datasets
    train_ds = data.TensorDataset(X_train, y_train)
    valid_ds = data.TensorDataset(X_valid, y_valid)

    # Get data iterators (num_workers=0: load in the main process, since the tensors live on the GPU)
    train_iter = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True, num_workers=0)
    valid_iter = DataLoader(dataset=valid_ds, batch_size=batch_size, shuffle=False, num_workers=0)

    # Training loop
    for epoch in range(num_epochs):
        train_loss = 0
        for tensor_x, tensor_y in train_iter:  # gradient updates on the training set
            tensor_x = tensor_x.float()
            tensor_y = tensor_y.float().reshape(-1, 1)
            optimizer.zero_grad()  # clear the gradients
            pre_train = dnn_model(tensor_x)
            train_l = MSLE_loss(pre_train, tensor_y)  # avoid shadowing a global variable named loss
            train_l.backward()  # backpropagation
            optimizer.step()  # parameter update

            train_loss += train_l.item() * len(tensor_x)

        train_loss /= len(train_ds)  # average loss for this epoch

        if epoch % 200 == 0:
            print('Loss: {}  Epoch: {}'.format(train_loss, epoch))

        with torch.no_grad():
            valid_loss = 0
            for tensor_x, tensor_y in valid_iter:
                tensor_x = tensor_x.float()
                tensor_y = tensor_y.float().reshape(-1, 1)
                pre_valid = dnn_model(tensor_x)
                valid_l = MSLE_loss(pre_valid, tensor_y)
                valid_loss += valid_l.item() * len(tensor_x)

            valid_loss /= len(valid_ds)

            if epoch % 200 == 0:
                print('Valid Loss: {}  Epoch: {}'.format(valid_loss, epoch))

    # Append each fold's final losses to the lists
    train_ls.append(train_loss)
    valid_ls.append(valid_loss)

print('Training Ended')
print('Train Average Loss: {} Valid Average Loss: {}'.format(np.mean(train_ls), np.mean(valid_ls)))


### Key Points
1. Set the DataLoader's `num_workers` parameter to `0` to avoid the CUDA initialization error (an alternative that keeps multiple workers is sketched after this list).
2. Make sure all tensors and the model are moved to the GPU: `tensor.to(device)` and `model.to(device)`.
3. Make sure all computation happens on the GPU, including the loss computation and the optimizer step.
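
If you do want to keep multiple workers, a common alternative pattern is to build the dataset from CPU tensors and move each batch to the GPU inside the loop, so the worker processes never touch CUDA. A minimal sketch, assuming `train_ds` is built from CPU tensors and `dnn_model`, `optimizer`, `MSLE_loss` and `device` are defined as above:

# Sketch: keep dataset tensors on the CPU so DataLoader workers stay CUDA-free
train_iter = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True,
                        num_workers=2, pin_memory=True)  # pinned memory speeds up host-to-GPU copies

for tensor_x, tensor_y in train_iter:
    # move each batch to the GPU in the main process only
    tensor_x = tensor_x.to(device, non_blocking=True)
    tensor_y = tensor_y.to(device, non_blocking=True).reshape(-1, 1)
    optimizer.zero_grad()
    pre_train = dnn_model(tensor_x)
    train_l = MSLE_loss(pre_train, tensor_y)
    train_l.backward()
    optimizer.step()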

### Notes
- `MSLE_loss` needs to be defined in your code; this example assumes you have already defined it (a possible definition is sketched below).
- Make sure your environment has enough GPU resources, otherwise you may still hit runtime errors.
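
For reference, a typical mean squared logarithmic error might look like the sketch below. This is an assumption about what your `MSLE_loss` computes, since its definition is not shown in the thread; `torch.log1p(x)` computes log(1 + x), which is safe for the model's non-negative ReLU outputs:

def MSLE_loss(pred, target):
    # mean squared logarithmic error; assumes pred and target are non-negative
    return torch.mean((torch.log1p(pred) - torch.log1p(target)) ** 2)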

With the changes above, you should be able to run your deep neural network on the GPU.

This reply comes from the FishC AI assistant; if it did not correctly answer your question, please follow up.
