import torch
from torch import nn
from d2l import torch as d2l

#@save
def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device):
    """Train a sequence-to-sequence model."""
    def xavier_init_weights(m):
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
        if type(m) == nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    nn.init.xavier_uniform_(m._parameters[param])
    net.apply(xavier_init_weights)
    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()  # defined earlier in the chapter
    net.train()
    animator = d2l.Animator(xlabel='epoch', ylabel='loss',
                            xlim=[10, num_epochs])
    for epoch in range(num_epochs):
        timer = d2l.Timer()
        metric = d2l.Accumulator(2)  # sum of training loss, no. of tokens
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                               device=device).reshape(-1, 1)
            dec_input = torch.cat([bos, Y[:, :-1]], 1)  # teacher forcing
            Y_hat, _ = net(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # make the loss a scalar for backward()
            d2l.grad_clipping(net, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        if (epoch + 1) % 10 == 0:
            animator.add(epoch + 1, (metric[0] / metric[1],))
    print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')
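For context, the function is typically driven like the sketch below. This is a minimal sketch, assuming the Seq2SeqEncoder and Seq2SeqDecoder classes defined earlier in the chapter are already in scope (EncoderDecoder and load_data_nmt are saved into d2l.torch); the hyperparameter values are illustrative only, not prescriptive.

from d2l import torch as d2l

# Illustrative hyperparameters only.
embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.1
batch_size, num_steps = 64, 10
lr, num_epochs, device = 0.005, 300, d2l.try_gpu()

# load_data_nmt returns a data iterator plus the source/target vocabularies.
train_iter, src_vocab, tgt_vocab = d2l.load_data_nmt(batch_size, num_steps)
encoder = Seq2SeqEncoder(len(src_vocab), embed_size, num_hiddens,
                         num_layers, dropout)
decoder = Seq2SeqDecoder(len(tgt_vocab), embed_size, num_hiddens,
                         num_layers, dropout)
net = d2l.EncoderDecoder(encoder, decoder)
train_seq2seq(net, train_iter, lr, num_epochs, tgt_vocab, device)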
Question 1:
    def xavier_init_weights(m):
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
        if type(m) == nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    nn.init.xavier_uniform_(m._parameters[param])
What does this code do?
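For reference: net.apply(xavier_init_weights) walks every submodule of the network, so every nn.Linear weight matrix and every weight_ih_l* / weight_hh_l* tensor inside an nn.GRU (but not the biases) is re-initialized from the Xavier-uniform distribution, which keeps activation and gradient variance roughly constant across layers. A self-contained sketch with made-up layer sizes:

import torch
from torch import nn

gru = nn.GRU(input_size=8, hidden_size=16, num_layers=2)   # toy sizes
dense = nn.Linear(16, 4)                                   # toy sizes

# What the GRU branch iterates over; only names containing "weight" are touched.
print([n for n in gru._flat_weights_names if "weight" in n])
# ['weight_ih_l0', 'weight_hh_l0', 'weight_ih_l1', 'weight_hh_l1']

nn.init.xavier_uniform_(dense.weight)            # the nn.Linear branch
for name in gru._flat_weights_names:             # the nn.GRU branch
    if "weight" in name:
        nn.init.xavier_uniform_(gru._parameters[name])

# Xavier-uniform samples from U(-a, a) with a = sqrt(6 / (fan_in + fan_out)).
print(float(dense.weight.abs().max()))  # below sqrt(6 / (16 + 4)) ≈ 0.55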
Question 2:
    animator = d2l.Animator(xlabel='epoch', ylabel='loss',
                            xlim=[10, num_epochs])
What is the animator doing?
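For reference: d2l.Animator is only a plotting helper. Each animator.add(x, y) call appends a data point and redraws the curve, so the loss can be watched falling live during training. In train_seq2seq it receives (epoch + 1, average loss per token) every 10 epochs, which is why xlim starts at 10. A rough matplotlib-only stand-in (not d2l's actual implementation) could look like this:

import matplotlib.pyplot as plt

class SimpleAnimator:
    """Collect (x, y) points and redraw one curve after each add() call."""
    def __init__(self, xlabel, ylabel, xlim):
        self.xlabel, self.ylabel, self.xlim = xlabel, ylabel, xlim
        self.xs, self.ys = [], []

    def add(self, x, y):
        self.xs.append(x)
        self.ys.append(y[0] if isinstance(y, (tuple, list)) else y)
        plt.clf()
        plt.plot(self.xs, self.ys)
        plt.xlabel(self.xlabel)
        plt.ylabel(self.ylabel)
        plt.xlim(self.xlim)
        plt.pause(0.001)  # force a redraw when running outside a notebook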
Question 3:
    metric = d2l.Accumulator(2)  # sum of training loss, no. of tokens
What are the shapes of metric and batch?
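For reference: metric is not a tensor at all, it is a two-slot running sum ([total loss, total tokens]); each element of batch is a tensor whose first dimension is the batch size. A quick probe, a sketch assuming the chapter's batch_size=64 and num_steps=10:

from d2l import torch as d2l

batch_size, num_steps = 64, 10   # values used in the chapter
train_iter, src_vocab, tgt_vocab = d2l.load_data_nmt(batch_size, num_steps)

X, X_valid_len, Y, Y_valid_len = next(iter(train_iter))
print(X.shape)            # torch.Size([64, 10]) -- source token ids, (batch_size, num_steps)
print(X_valid_len.shape)  # torch.Size([64])     -- true length of each source sequence
print(Y.shape)            # torch.Size([64, 10]) -- target token ids, (batch_size, num_steps)
print(Y_valid_len.shape)  # torch.Size([64])     -- true length of each target sequence

metric = d2l.Accumulator(2)      # two running floats, both start at 0.0
metric.add(3.5, 120)             # add (sum of loss, number of tokens) for one batch
print(metric[0], metric[1])      # 3.5 120.0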
Question 4:
Why is the last column of Y dropped?
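For reference: with teacher forcing the decoder's input at step t is the ground-truth token from step t-1, with <bos> prepended. Prepending <bos> makes the sequence one token longer, so the last column of Y is dropped to keep dec_input at num_steps and aligned with the label Y; the token after the final one never needs to be predicted. A toy sketch with made-up token ids:

import torch

# Two target sentences, num_steps = 4, made-up token ids; assume id 1 is <bos>.
Y = torch.tensor([[5, 6, 7, 3],
                  [8, 9, 3, 0]])
bos = torch.tensor([1] * Y.shape[0]).reshape(-1, 1)

dec_input = torch.cat([bos, Y[:, :-1]], dim=1)   # shift right, drop last column
print(dec_input)
# tensor([[1, 5, 6, 7],
#         [1, 8, 9, 3]])

# Decoder step t reads the true token t-1 and is trained to predict Y[:, t]:
#   input : <bos> 5 6 7
#   label :   5   6 7 3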