How to add gradient computation when training a model
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoModel


class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            self.config.add_pooling_layer = False
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        self.model.resize_token_embeddings(len(CFG.tokenizer))
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.num_labels)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, input_ids, attention_mask, token_type_ids):
        outputs = self.model(input_ids, attention_mask, token_type_ids)
        last_hidden_states = outputs[0]
        feature = last_hidden_states[:, 0, :]  # CLS token
        return feature

    def forward(self, input_ids, attention_mask, token_type_ids):
        feature = self.feature(input_ids, attention_mask, token_type_ids)
        output = self.fc(feature)
        return output.squeeze(-1)
# ====================================================
# inference
# ====================================================
import numpy as np
from tqdm.auto import tqdm

def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs["input_ids"] = inputs["input_ids"].to(device)
        inputs["attention_mask"] = inputs["attention_mask"].to(device)
        inputs["token_type_ids"] = inputs["token_type_ids"].to(device)
        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=True, dtype=torch.float16, cache_enabled=True):
                y_preds = model(inputs["input_ids"], inputs["attention_mask"], inputs["token_type_ids"])
        preds.append(y_preds.to("cpu").numpy().astype(np.float32))
    predictions = np.concatenate(preds)
    return predictions
The code above does not compute gradient updates and is only suitable for the test set. How can it be modified so it can be used on the training set?

To use this code on the training set, you need to compute gradients and update the model parameters during training. This can be done with the following steps:
1. **Set the model to training mode** with `model.train()`.
2. **Define a loss function and an optimizer**, e.g. `nn.CrossEntropyLoss()` and `torch.optim.AdamW()`.
3. **Compute the loss and backpropagate** with `loss.backward()`.
4. **Update the model parameters** with `optimizer.step()`.
5. **Zero the gradients** at the start of each training step with `optimizer.zero_grad()`.
Here is an example of how to modify the code for the training set:
# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training function
def train_fn(train_loader, model, device, optimizer, criterion):
    model.train()
    model.to(device)
    tk0 = tqdm(train_loader, total=len(train_loader))
    for inputs, labels in tk0:
        # Move every tensor in the batch dict and the labels to the device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = labels.to(device)
        # Zero the gradients
        optimizer.zero_grad()
        # Forward pass
        outputs = model(inputs["input_ids"], inputs["attention_mask"], inputs["token_type_ids"])
        # Compute the loss
        loss = criterion(outputs, labels)
        # Backward pass
        loss.backward()
        # Update the parameters
        optimizer.step()
        tk0.set_postfix(loss=loss.item())
During training, make sure you pass the correct `train_loader` and that `inputs` and `labels` come from your dataset; with that in place, gradients are computed and parameters updated on the training set. A sketch of how this function could be called is shown below.
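For illustration only, a rough sketch (not from the original notebook) of wiring `train_fn` to a DataLoader. Here `train_dataset` and `CFG.epochs` are placeholders for whatever your notebook defines, and the dataset is assumed to yield `(inputs_dict, label)` pairs:

from torch.utils.data import DataLoader

# train_dataset: your Dataset returning (inputs_dict, label) pairs -- placeholder name
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True,
                          num_workers=2, pin_memory=True, drop_last=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for epoch in range(CFG.epochs):  # CFG.epochs assumed to come from your config object
    train_fn(train_loader, model, device, optimizer, criterion)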
The reply above is from the FishC AI assistant; if it did not fully answer your question, please follow up.

Replying to FishC's post of 2024-11-3 13:14 ("To use this code on the training set, you need to compute gradients and update the model parameters... 1. ..."):
import gc, os
import numpy as np
import torch
from sklearn.metrics import f1_score
from tqdm.auto import tqdm  # progress-bar visualization

# They will be used to pick the best model.pt according to the validation loss
best_model_epoch, valid_loss_values = [], []
valid_loss_min = [1]  # arbitrary initial loss I set here

def train(model, device, train_dataloader, valid_dataloader, epochs, loss_fn, optimizer, metric):
    for epoch in range(epochs):
        gc.collect()  # memory cleaning: garbage collection to reduce memory usage
        model.train()
        train_loss = 0
        train_step = 0
        pbar = tqdm(train_dataloader, total=len(train_dataloader))  # tqdm wraps any iterable

        for batch, labels in pbar:  # you could also write: for batch in tqdm(train_dataloader)
            optimizer.zero_grad()  # initialize
            train_step += 1

            train_input_ids = batch['input_ids'].to(device)  # batch is a dict
            train_attention_mask = batch['attention_mask'].to(device)
            train_token_type_ids = batch['token_type_ids'].to(device)
            train_labels = labels.squeeze().to(device).long()  # squeeze the labels to a 1-D tensor and cast to long

            # See the model class above to understand what the logits are
            logits = model(train_input_ids, train_attention_mask, train_token_type_ids).to(device)
            predictions = torch.argmax(logits, dim=1)  # index of the larger logit
            detached_predictions = predictions.detach().cpu().numpy()

            loss = loss_fn(logits, train_labels)
            loss.backward()
            optimizer.step()
            model.zero_grad()

            train_loss += loss.detach().cpu().numpy().item()
            pbar.set_postfix({'train_loss': train_loss / train_step})  # show the running loss on the progress bar
        pbar.close()

        with torch.no_grad():
            model.eval()
            valid_loss = 0
            valid_step = 0
            total_valid_score = 0

            y_pred = []  # for computing f1_score, the competition metric
            y_true = []

            pbar = tqdm(valid_dataloader)
            for batch, labels in pbar:
                valid_step += 1

                valid_input_ids = batch['input_ids'].to(device)
                valid_attention_mask = batch['attention_mask'].to(device)
                valid_token_type_ids = batch['token_type_ids'].to(device)
                valid_labels = labels.squeeze().to(device).long()

                logits = model(valid_input_ids, valid_attention_mask, valid_token_type_ids).to(device)
                predictions = torch.argmax(logits, dim=1)
                detached_predictions = predictions.detach().cpu().numpy()

                loss = loss_fn(logits, valid_labels)
                valid_loss += loss.detach().cpu().numpy().item()

                y_pred.extend(predictions.cpu().numpy())
                y_true.extend(valid_labels.cpu().numpy())

            valid_loss /= valid_step
            f1 = f1_score(y_true, y_pred)

            print(f'Epoch [{epoch+1}/{epochs}] Score: {f1}')
            print(f'Epoch [{epoch+1}/{epochs}] Valid_loss: {valid_loss}')

            if valid_loss < min(valid_loss_min):
                print('model improved!')
            else:
                print('model not improved')

            # state_dict is a dict holding all learnable parameters (weights and biases) and their current values
            torch.save(model.state_dict(), f'epoch:{epoch+1}_model.pt')
            print('save checkpoint!')
            valid_loss_min.append(valid_loss)
            print(f'valid_loss_min:{min(valid_loss_min)}')

        best_model_epoch.append(f'/kaggle/working/epoch:{epoch+1}_model.pt')
        valid_loss_values.append(valid_loss)
        print('=' * 100)

    select_best_model()  # refer to the function below
    print('Train/Valid Completed!!')

    del train_dataloader, valid_dataloader  # memory cleaning
    gc.collect()

def select_best_model():
    # pick the checkpoint of the epoch with the lowest validation loss and rename the file
    best_model = best_model_epoch[np.argmin(valid_loss_values)]
    os.rename(best_model, best_model.split('.pt')[0] + '_best.pt')
What shape does the code above require `train_dataloader` to have? Is `batch` a dict?
I get the following error:

---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In, line 4
1 print('Training Start!')
2 print('=' * 100)
----> 4 train(model,
5 device,
6 train_dataloader,
7 valid_dataloader,
8 CFG.epochs,
9 loss_fn,
10 optimizer,
11 metric)
13 del model,train_dataloader, valid_dataloader
14 gc.collect()
Cell In, line 17, in train(model, device, train_dataloader, valid_dataloader, epochs, loss_fn, optimizer, metric)
14 train_step = 0
15 pbar = tqdm(train_dataloader, total=len(train_dataloader))#tqdm参数是一个iterable
---> 17 for batch, labels in pbar: # you can also write like "for batch in tqdm(train_dataloader"
18 optimizer.zero_grad() # initialize
19 train_step += 1
ValueError: too many values to unpack (expected 2)
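This error usually means the DataLoader yields a single object per batch (for example one dict), so it cannot be unpacked into `batch, labels`. For the loop `for batch, labels in pbar:` to work, each item from the dataset should be an `(encodings_dict, label)` pair, which the default collate_fn turns into `(batch_dict, labels_tensor)`. A minimal sketch under that assumption; the class name `TrainDataset`, the columns `text`/`target`, and `train_df`/`CFG.tokenizer` are illustrative placeholders, not code from the thread:

import torch
from torch.utils.data import Dataset, DataLoader

class TrainDataset(Dataset):
    # Illustrative dataset: each item is an (encodings_dict, label) pair,
    # which the default collate_fn batches into (batch_dict, labels)
    def __init__(self, df, tokenizer, max_len=128):
        self.texts = df['text'].values      # assumed column names
        self.labels = df['target'].values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            return_tensors='pt',
        )
        inputs = {k: v.squeeze(0) for k, v in enc.items()}  # drop the extra batch dim added by return_tensors='pt'
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return inputs, label

train_dataloader = DataLoader(TrainDataset(train_df, CFG.tokenizer),
                              batch_size=16, shuffle=True)

Alternatively, if your existing dataset returns a single dict that already contains the labels, keep the loop as `for batch in pbar:` and read the labels with something like `labels = batch.pop('labels')`.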