鱼C论坛

 找回密码
 立即注册
查看: 114|回复: 3

如何修改代码使用gpu的cuda来运行

[复制链接]
发表于 2024-10-27 16:14:47 | 显示全部楼层 |阅读模式

马上注册,结交更多好友,享用更多功能^_^

您需要 登录 才可以下载或查看,没有账号?立即注册

x
#使用dnn模型(k折交叉验证)
import torch 
import torch.nn as nn
from torch.utils import data
from torch.utils.data import Dataset,DataLoader
from torch import optim

#定义神经网络模型


class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN,self).__init__()
        self.hidden_layer1 = nn.Linear(154,1024)
        self.hidden_layer2 = nn.Linear(1024,1024)
        self.hidden_layer3 = nn.Linear(1024,1024)
        self.hidden_layer4 = nn.Linear(1024,1024)
        self.output_layer = nn.Linear(1024,1)
        self.dropout = nn.Dropout(p=0.5) 
        nn.init.xavier_uniform_(self.hidden_layer1.weight)
        nn.init.xavier_uniform_(self.hidden_layer2.weight)
        nn.init.xavier_uniform_(self.hidden_layer3.weight)
        nn.init.xavier_uniform_(self.hidden_layer4.weight)
        nn.init.xavier_uniform_(self.output_layer.weight)
    def forward(self,x):
        inputs = x
        layer1_out = torch.nn.functional.gelu(self.hidden_layer1(inputs))
        layer1_out = self.dropout(layer1_out)
        layer2_out = torch.nn.functional.gelu(self.hidden_layer2(layer1_out))
        layer2_out = self.dropout(layer2_out)
        layer3_out = torch.nn.functional.gelu(self.hidden_layer3(layer2_out))
        layer3_out = self.dropout(layer3_out)
        layer4_out = torch.nn.functional.gelu(self.hidden_layer4(layer3_out))
        layer4_out = self.dropout(layer4_out)
        output = torch.relu(self.output_layer(layer4_out))
        return output

# 设置超参数
k = 5
batch_size = 128
num_epochs = 1000
weight_decay = 0

#初始化模型和优化器
dnn_model = SimpleNN()
optimizer = optim.AdamW(dnn_model.parameters(),lr=0.0001,weight_decay=weight_decay) #定义优化器


#k折交叉验证选取训练集与验证集
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = len(X) // k
    X_train, y_train = None, None
    for j in range(k):
        start = j * fold_size
        end = (j + 1) * fold_size
        if j == i:
            X_valid, y_valid = X.iloc[start:end], y.iloc[start:end]
        elif X_train is None:
            X_train, y_train = X.iloc[start:end], y.iloc[start:end]
        else:
            X_train = pd.concat([X_train, X.iloc[start:end]], ignore_index=True)
            y_train = pd.concat([y_train, y.iloc[start:end]], ignore_index=True)
    return X_train, y_train, X_valid, y_valid




#初始化列表
train_ls, valid_ls = [], []

for i in range(k):
    X_train, y_train, X_valid, y_valid = get_k_fold_data(k, i, X, y)
    print(f'FOLD {i}')
    print('--------------------------------')
    

    #将DataFrame数据转换为NumPy数组,然后再转换为PyTorch张量
    X_train = torch.tensor(X_train.astype(np.float32).values, dtype=torch.float32)
    y_train = torch.tensor(y_train.astype(np.float32).values, dtype=torch.float32)
    X_valid = torch.tensor(X_valid.astype(np.float32).values, dtype=torch.float32)
    y_valid = torch.tensor(y_valid.astype(np.float32).values, dtype=torch.float32)
    
    #创建数据集
    train_ds = data.TensorDataset(X_train, y_train)
    valid_ds = data.TensorDataset(X_valid, y_valid)

    # 获取一个数据迭代器
    train_iter = DataLoader(dataset=train_ds,batch_size=batch_size,shuffle=True,num_workers=2)#shuffle=True相当于sampler=RandomSampler(dataset)
    valid_iter = DataLoader(dataset=valid_ds,batch_size=batch_size,shuffle=True,num_workers=2)
    
    #开始迭代
    for epoch in range(num_epochs):
        train_loss = 0
        for tensor_x, tensor_y in train_iter:#训练集执行梯度更新
            tensor_x = tensor_x.float()
            tensor_y = tensor_y.float().reshape(-1, 1)
            optimizer.zero_grad() #梯度清零
            pre_train = dnn_model(tensor_x)
            train_l = MSLE_loss(pre_train, tensor_y) #损失应避免与全局变量loss重名
            train_l.backward()#前向传播
            optimizer.step()#梯度下降

            train_loss += train_l.item() * len(tensor_x)
            
        train_loss /= len(train_ds) #每次迭代平均损失
        
        if epoch % 200 == 0:
            print('Loss: {}  Epoch:{}'.format(train_loss, epoch))
            #print(train_l)
        
        
        with torch.no_grad():
            valid_loss = 0
            
            for tensor_x, tensor_y in valid_iter:
                tensor_x = tensor_x.float()
                tensor_y = tensor_y.float().reshape(-1, 1)
                pre_valid = dnn_model(tensor_x)
                valid_l = MSLE_loss(pre_valid, tensor_y)
                valid_loss += valid_l.item() * len(tensor_x)
                  
            valid_loss /= len(valid_ds)
                   
            if epoch % 200 == 0:
                print('Valid Loss: {}  Epoch:{}'.format(valid_loss, epoch))
                #print(valid_l)
        
      #将每折的损失添加到列表中
    train_ls.append(train_loss)
    valid_ls.append(valid_loss)

        
print('Training Ended')
print('Train Average Loss: {} Valid Average Loss: {}'.format(np.mean(train_ls),np.mean(valid_ls)))

迭代次数太多cpu跑不动了, 要怎么修改用gpu跑
想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复

使用道具 举报

发表于 2024-10-27 16:15:04 | 显示全部楼层
为了让你的代码在GPU上运行,主要需要以下几个步骤:

1. **检查是否有可用的GPU**。
2. **将模型移到GPU上**。
3. **将输入数据和目标数据移到GPU上**。

下面是修改后的代码示例:
import torch 
import torch.nn as nn
from torch.utils import data
from torch.utils.data import Dataset, DataLoader
from torch import optim

# 检查GPU是否可用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 定义神经网络模型

class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.hidden_layer1 = nn.Linear(154, 1024)
        self.hidden_layer2 = nn.Linear(1024, 1024)
        self.hidden_layer3 = nn.Linear(1024, 1024)
        self.hidden_layer4 = nn.Linear(1024, 1024)
        self.output_layer = nn.Linear(1024, 1)
        self.dropout = nn.Dropout(p=0.5)
        nn.init.xavier_uniform_(self.hidden_layer1.weight)
        nn.init.xavier_uniform_(self.hidden_layer2.weight)
        nn.init.xavier_uniform_(self.hidden_layer3.weight)
        nn.init.xavier_uniform_(self.hidden_layer4.weight)
        nn.init.xavier_uniform_(self.output_layer.weight)

    def forward(self, x):
        inputs = x
        layer1_out = torch.nn.functional.gelu(self.hidden_layer1(inputs))
        layer1_out = self.dropout(layer1_out)
        layer2_out = torch.nn.functional.gelu(self.hidden_layer2(layer1_out))
        layer2_out = self.dropout(layer2_out)
        layer3_out = torch.nn.functional.gelu(self.hidden_layer3(layer2_out))
        layer3_out = self.dropout(layer3_out)
        layer4_out = torch.nn.functional.gelu(self.hidden_layer4(layer3_out))
        layer4_out = self.dropout(layer4_out)
        output = torch.relu(self.output_layer(layer4_out))
        return output

# 设置超参数
k = 5
batch_size = 128
num_epochs = 1000
weight_decay = 0

# 初始化模型和优化器
dnn_model = SimpleNN().to(device)  # 将模型移到GPU上
optimizer = optim.AdamW(dnn_model.parameters(), lr=0.0001, weight_decay=weight_decay)  # 定义优化器

# k折交叉验证选取训练集与验证集
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = len(X) // k
    X_train, y_train = None, None
    for j in range(k):
        start = j * fold_size
        end = (j + 1) * fold_size
        if j == i:
            X_valid, y_valid = X.iloc, y.iloc
        elif X_train is None:
            X_train, y_train = X.iloc, y.iloc
        else:
            X_train = pd.concat([X_train, X.iloc], ignore_index=True)
            y_train = pd.concat([y_train, y.iloc], ignore_index=True)
    return X_train, y_train, X_valid, y_valid

# 初始化列表
train_ls, valid_ls = [], []

for i in range(k):
    X_train, y_train, X_valid, y_valid = get_k_fold_data(k, i, X, y)
    print(f'FOLD {i}')
    print('--------------------------------')
    
    # 将DataFrame数据转换为NumPy数组,然后再转换为PyTorch张量,并且移动到GPU上
    X_train = torch.tensor(X_train.astype(np.float32).values, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train.astype(np.float32).values, dtype=torch.float32).to(device)
    X_valid = torch.tensor(X_valid.astype(np.float32).values, dtype=torch.float32).to(device)
    y_valid = torch.tensor(y_valid.astype(np.float32).values, dtype=torch.float32).to(device)
    
    # 创建数据集
    train_ds = data.TensorDataset(X_train, y_train)
    valid_ds = data.TensorDataset(X_valid, y_valid)

    # 获取一个数据迭代器
    train_iter = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
    valid_iter = DataLoader(dataset=valid_ds, batch_size=batch_size, shuffle=True, num_workers=2)
    
    # 开始迭代
    for epoch in range(num_epochs):
        train_loss = 0
        for tensor_x, tensor_y in train_iter:  # 训练集执行梯度更新
            tensor_x = tensor_x.float()
            tensor_y = tensor_y.float().reshape(-1, 1)
            optimizer.zero_grad()  # 梯度清零
            pre_train = dnn_model(tensor_x)
            train_l = MSLE_loss(pre_train, tensor_y)  # 损失应避免与全局变量loss重名
            train_l.backward()  # 前向传播
            optimizer.step()  # 梯度下降

            train_loss += train_l.item() * len(tensor_x)
        
        train_loss /= len(train_ds)  # 每次迭代平均损失
        
        if epoch % 200 == 0:
            print('Loss: {}  Epoch:{}'.format(train_loss, epoch))
        
        with torch.no_grad():
            valid_loss = 0
            for tensor_x, tensor_y in valid_iter:
                tensor_x = tensor_x.float()
                tensor_y = tensor_y.float().reshape(-1, 1)
                pre_valid = dnn_model(tensor_x)
                valid_l = MSLE_loss(pre_valid, tensor_y)
                valid_loss += valid_l.item() * len(tensor_x)
            
            valid_loss /= len(valid_ds)
            
            if epoch % 200 == 0:
                print('Valid Loss: {}  Epoch:{}'.format(valid_loss, epoch))
        
    # 将每折的损失添加到列表中
    train_ls.append(train_loss)
    valid_ls.append(valid_loss)

print('Training Ended')
print('Train Average Loss: {} Valid Average Loss: {}'.format(np.mean(train_ls), np.mean(valid_ls)))

**注意**:
1. 在初始化模型时,使用 `.to(device)` 将模型移动到指定的设备(GPU)。
2. 在将数据转换为PyTorch张量后,也使用 `.to(device)` 将数据移动到GPU。
3. 确保你的计算环境中已经安装了CUDA驱动和相关的工具包。

以上回复来自 FishC AI 小助理,如未能正确解答您的问题,请继续追问。
想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复 支持 反对

使用道具 举报

 楼主| 发表于 2024-10-27 16:37:49 | 显示全部楼层
FishC 发表于 2024-10-27 16:15
为了让你的代码在GPU上运行,主要需要以下几个步骤:

1. **检查是否有可用的GPU**。

转换成gpu代码报错
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[32], line 96
     94 for epoch in range(num_epochs):
     95     train_loss = 0
---> 96     for tensor_x, tensor_y in train_iter:#训练集执行梯度更新
     97         tensor_x = tensor_x.float()
     98         tensor_y = tensor_y.float().reshape(-1, 1)

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:630, in _BaseDataLoaderIter.__next__(self)
    627 if self._sampler_iter is None:
    628     # TODO(https://github.com/pytorch/pytorch/issues/76750)
    629     self._reset()  # type: ignore[call-arg]
--> 630 data = self._next_data()
    631 self._num_yielded += 1
    632 if self._dataset_kind == _DatasetKind.Iterable and \
    633         self._IterableDataset_len_called is not None and \
    634         self._num_yielded > self._IterableDataset_len_called:

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:1344, in _MultiProcessingDataLoaderIter._next_data(self)
   1342 else:
   1343     del self._task_info[idx]
-> 1344     return self._process_data(data)

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:1370, in _MultiProcessingDataLoaderIter._process_data(self, data)
   1368 self._try_put_index()
   1369 if isinstance(data, ExceptionWrapper):
-> 1370     data.reraise()
   1371 return data

File /opt/conda/lib/python3.10/site-packages/torch/_utils.py:706, in ExceptionWrapper.reraise(self)
    702 except TypeError:
    703     # If the exception takes multiple arguments, don't try to
    704     # instantiate since we don't know how to
    705     raise RuntimeError(msg) from None
--> 706 raise exception

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 309, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataset.py", line 211, in __getitem__
    return tuple(tensor[index] for tensor in self.tensors)
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataset.py", line 211, in <genexpr>
    return tuple(tensor[index] for tensor in self.tensors)
RuntimeError: CUDA error: initialization error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

----------------------------------以下是源代码
#使用dnn模型(k折交叉验证)
import torch 
import torch.nn as nn
from torch.utils import data
from torch.utils.data import Dataset,DataLoader
from torch import optim

#定义神经网络模型


class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN,self).__init__()
        self.hidden_layer1 = nn.Linear(154,1024)
        self.hidden_layer2 = nn.Linear(1024,1024)
        self.hidden_layer3 = nn.Linear(1024,1024)
        self.hidden_layer4 = nn.Linear(1024,1024)
        self.output_layer = nn.Linear(1024,1)
        self.dropout = nn.Dropout(p=0.5) 
        nn.init.xavier_uniform_(self.hidden_layer1.weight)
        nn.init.xavier_uniform_(self.hidden_layer2.weight)
        nn.init.xavier_uniform_(self.hidden_layer3.weight)
        nn.init.xavier_uniform_(self.hidden_layer4.weight)
        nn.init.xavier_uniform_(self.output_layer.weight)
    def forward(self,x):
        inputs = x
        layer1_out = torch.nn.functional.gelu(self.hidden_layer1(inputs))
        layer1_out = self.dropout(layer1_out)
        layer2_out = torch.nn.functional.gelu(self.hidden_layer2(layer1_out))
        layer2_out = self.dropout(layer2_out)
        layer3_out = torch.nn.functional.gelu(self.hidden_layer3(layer2_out))
        layer3_out = self.dropout(layer3_out)
        layer4_out = torch.nn.functional.gelu(self.hidden_layer4(layer3_out))
        layer4_out = self.dropout(layer4_out)
        output = torch.relu(self.output_layer(layer4_out))
        return output

# 设置超参数
k = 5
batch_size = 128
num_epochs = 1000
weight_decay = 0

#初始化模型和优化器
dnn_model = SimpleNN().to(device)  # 将模型移到GPU上
optimizer = optim.AdamW(dnn_model.parameters(),lr=0.0001,weight_decay=weight_decay) #定义优化器


#k折交叉验证选取训练集与验证集
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = len(X) // k
    X_train, y_train = None, None
    for j in range(k):
        start = j * fold_size
        end = (j + 1) * fold_size
        if j == i:
            X_valid, y_valid = X.iloc[start:end], y.iloc[start:end]
        elif X_train is None:
            X_train, y_train = X.iloc[start:end], y.iloc[start:end]
        else:
            X_train = pd.concat([X_train, X.iloc[start:end]], ignore_index=True)
            y_train = pd.concat([y_train, y.iloc[start:end]], ignore_index=True)
    return X_train, y_train, X_valid, y_valid




#初始化列表
train_ls, valid_ls = [], []

for i in range(k):
    X_train, y_train, X_valid, y_valid = get_k_fold_data(k, i, X, y)
    print(f'FOLD {i}')
    print('--------------------------------')
    

    # 将DataFrame数据转换为NumPy数组,然后再转换为PyTorch张量,并且移动到GPU上
    X_train = torch.tensor(X_train.astype(np.float32).values, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train.astype(np.float32).values, dtype=torch.float32).to(device)
    X_valid = torch.tensor(X_valid.astype(np.float32).values, dtype=torch.float32).to(device)
    y_valid = torch.tensor(y_valid.astype(np.float32).values, dtype=torch.float32).to(device)
    
    
    #创建数据集
    train_ds = data.TensorDataset(X_train, y_train)
    valid_ds = data.TensorDataset(X_valid, y_valid)

    # 获取一个数据迭代器
    train_iter = DataLoader(dataset=train_ds,batch_size=batch_size,shuffle=True,num_workers=2)#shuffle=True相当于sampler=RandomSampler(dataset)
    valid_iter = DataLoader(dataset=valid_ds,batch_size=batch_size,shuffle=True,num_workers=2)
    
    #开始迭代
    for epoch in range(num_epochs):
        train_loss = 0
        for tensor_x, tensor_y in train_iter:#训练集执行梯度更新
            tensor_x = tensor_x.float()
            tensor_y = tensor_y.float().reshape(-1, 1)
            optimizer.zero_grad() #梯度清零
            pre_train = dnn_model(tensor_x)
            train_l = MSLE_loss(pre_train, tensor_y) #损失应避免与全局变量loss重名
            train_l.backward()#前向传播
            optimizer.step()#梯度下降

            train_loss += train_l.item() * len(tensor_x)
            
        train_loss /= len(train_ds) #每次迭代平均损失
        
        if epoch % 200 == 0:
            print('Loss: {}  Epoch:{}'.format(train_loss, epoch))
            #print(train_l)
        
        
        with torch.no_grad():
            valid_loss = 0
            
            for tensor_x, tensor_y in valid_iter:
                tensor_x = tensor_x.float()
                tensor_y = tensor_y.float().reshape(-1, 1)
                pre_valid = dnn_model(tensor_x)
                valid_l = MSLE_loss(pre_valid, tensor_y)
                valid_loss += valid_l.item() * len(tensor_x)
                  
            valid_loss /= len(valid_ds)
                   
            if epoch % 200 == 0:
                print('Valid Loss: {}  Epoch:{}'.format(valid_loss, epoch))
                #print(valid_l)
        
      #将每折的损失添加到列表中
    train_ls.append(train_loss)
    valid_ls.append(valid_loss)

        
print('Training Ended')
print('Train Average Loss: {} Valid Average Loss: {}'.format(np.mean(train_ls),np.mean(valid_ls)))
想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复 支持 反对

使用道具 举报

发表于 2024-10-27 16:52:11 | 显示全部楼层
Terence888 发表于 2024-10-27 16:37
转换成gpu代码报错--------------------------------- ...


好的,让我们一步一步来分析和解决这个问题。

### 问题分析
错误信息显示一个 `CUDA error: initialization error`。这通常不是代码本身的问题,而是由于环境或并行处理引起的。在你的代码里,我们使用 `num_workers` 参数,并行处理数据加载,可能导致CUDA初始化错误。

### 解决方案
1. 确保你的代码中正确处理了设备选择和数据移动。
2. 在数据加载器中设置 `num_workers=0` 以避免多进程数据加载的问题,特别是在调试时。

### 修改后的代码
下面是修改后的代码,你可以按照这个示例来运行:
import torch
import torch.nn as nn
from torch.utils import data
from torch.utils.data import Dataset, DataLoader
from torch import optim
import numpy as np
import pandas as pd

# 检查GPU是否可用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 定义神经网络模型
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.hidden_layer1 = nn.Linear(154, 1024)
        self.hidden_layer2 = nn.Linear(1024, 1024)
        self.hidden_layer3 = nn.Linear(1024, 1024)
        self.hidden_layer4 = nn.Linear(1024, 1024)
        self.output_layer = nn.Linear(1024, 1)
        self.dropout = nn.Dropout(p=0.5)
        nn.init.xavier_uniform_(self.hidden_layer1.weight)
        nn.init.xavier_uniform_(self.hidden_layer2.weight)
        nn.init.xavier_uniform_(self.hidden_layer3.weight)
        nn.init.xavier_uniform_(self.hidden_layer4.weight)
        nn.init.xavier_uniform_(self.output_layer.weight)

    def forward(self, x):
        inputs = x
        layer1_out = torch.nn.functional.gelu(self.hidden_layer1(inputs))
        layer1_out = self.dropout(layer1_out)
        layer2_out = torch.nn.functional.gelu(self.hidden_layer2(layer1_out))
        layer2_out = self.dropout(layer2_out)
        layer3_out = torch.nn.functional.gelu(self.hidden_layer3(layer2_out))
        layer3_out = self.dropout(layer3_out)
        layer4_out = torch.nn.functional.gelu(self.hidden_layer4(layer3_out))
        layer4_out = self.dropout(layer4_out)
        output = torch.relu(self.output_layer(layer4_out))
        return output

# 设置超参数
k = 5
batch_size = 128
num_epochs = 1000
weight_decay = 0
learning_rate = 0.0001

# 初始化模型和优化器
dnn_model = SimpleNN().to(device)
optimizer = optim.AdamW(dnn_model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# k折交叉验证选取训练集与验证集
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = len(X) // k
    X_train, y_train = None, None
    for j in range(k):
        start = j * fold_size
        end = (j + 1) * fold_size
        X_part, y_part = X[start:end], y[start:end]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = pd.concat([X_train, X_part], ignore_index=True)
            y_train = pd.concat([y_train, y_part], ignore_index=True)
    return X_train, y_train, X_valid, y_valid

# 初始化列表
train_ls, valid_ls = [], []

for i in range(k):
    X_train, y_train, X_valid, y_valid = get_k_fold_data(k, i, X, y)
    print(f'FOLD {i}')
    print('--------------------------------')
    
    # 将DataFrame数据转换为NumPy数组,然后再转换为PyTorch张量,并且移动到GPU上
    X_train = torch.tensor(X_train.astype(np.float32).values, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train.astype(np.float32).values, dtype=torch.float32).to(device)
    X_valid = torch.tensor(X_valid.astype(np.float32).values, dtype=torch.float32).to(device)
    y_valid = torch.tensor(y_valid.astype(np.float32).values, dtype=torch.float32).to(device)
    
    # 创建数据集
    train_ds = data.TensorDataset(X_train, y_train)
    valid_ds = data.TensorDataset(X_valid, y_valid)

    # 获取一个数据迭代器
    train_iter = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True, num_workers=0)
    valid_iter = DataLoader(dataset=valid_ds, batch_size=batch_size, shuffle=False, num_workers=0)
    
    # 开始迭代
    for epoch in range(num_epochs):
        train_loss = 0
        for tensor_x, tensor_y in train_iter:  # 训练集执行梯度更新
            tensor_x = tensor_x.float()
            tensor_y = tensor_y.float().reshape(-1, 1)
            optimizer.zero_grad()  # 梯度清零
            pre_train = dnn_model(tensor_x)
            train_l = MSLE_loss(pre_train, tensor_y)  # 损失应避免与全局变量loss重名
            train_l.backward()  # 前向传播
            optimizer.step()  # 梯度下降

            train_loss += train_l.item() * len(tensor_x)
        
        train_loss /= len(train_ds)  # 每次迭代平均损失
        
        if epoch % 200 == 0:
            print('Loss: {}  Epoch: {}'.format(train_loss, epoch))
        
        with torch.no_grad():
            valid_loss = 0
            for tensor_x, tensor_y in valid_iter:
                tensor_x = tensor_x.float()
                tensor_y = tensor_y.float().reshape(-1, 1)
                pre_valid = dnn_model(tensor_x)
                valid_l = MSLE_loss(pre_valid, tensor_y)
                valid_loss += valid_l.item() * len(tensor_x)
            
            valid_loss /= len(valid_ds)
            
            if epoch % 200 == 0:
                print('Valid Loss: {}  Epoch: {}'.format(valid_loss, epoch))
        
    # 将每折的损失添加到列表中
    train_ls.append(train_loss)
    valid_ls.append(valid_loss)

print('Training Ended')
print('Train Average Loss: {} Valid Average Loss: {}'.format(np.mean(train_ls), np.mean(valid_ls)))

### 关键点
1. 将数据加载器的 `num_workers` 参数设置为 `0` 以避免 CUDA 初始化错误。
2. 确保所有的张量和模型都正确地移动到 GPU 上: `tensor.to(device)` 和 `model.to(device)`。
3. 确保所有的计算都在 GPU 上进行,包含损失计算和优化器步骤。

### 注意
- `MSLE_loss` 需要在代码中定义,这个示例假设你已经定义了这个函数。
- 确保你的计算环境中有足够的 GPU 资源,否则可能仍然会遇到运行时错误。

通过上述修改,你应该能够成功地在 GPU 上运行你的深度神经网络模型。

以上回复来自 FishC AI 小助理,如未能正确解答您的问题,请继续追问。
想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复 支持 反对

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

小黑屋|手机版|Archiver|鱼C工作室 ( 粤ICP备18085999号-1 | 粤公网安备 44051102000585号)

GMT+8, 2024-11-21 20:15

Powered by Discuz! X3.4

© 2001-2023 Discuz! Team.

快速回复 返回顶部 返回列表