Layer Normalization
Layer normalization normalizes activations along the feature dimension, giving each sample's features zero mean and unit variance. Keeping the feature distribution of every sample on a similar scale in this way helps the network learn and converge.
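Concretely, for one sample x with d features the computation is

LN(x) = γ * (x - μ) / sqrt(σ² + ε) + β

where μ and σ² are the mean and variance of that sample's d features, γ and β are learnable per-feature scale and shift parameters (initialized to 1 and 0), and ε is a small constant for numerical stability.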
import torch
import torch.nn as nn
input_data = torch.randn(10, 5)  # 10 samples, 5 features
out: input_data
tensor([[-1.2586,  0.1137, -0.2599,  0.0824,  0.3824],
        [ 1.8684, -0.9581, -0.0497, -1.7724,  0.2099],
        [ 1.0365,  0.8831,  0.0261, -0.3369,  0.1216],
        [-0.2050,  1.3703,  0.7986,  1.3495, -0.4783],
        [-0.0580,  3.2424, -1.3373,  0.8285, -0.9443],
        [-1.0602,  0.5071, -0.4528, -1.7445, -1.0515],
        [ 1.2190, -0.5350,  0.3590, -0.5628,  0.2565],
        [ 1.3337, -0.6709, -0.5470,  1.1072,  0.8083],
        [-1.2246, -1.3705, -0.4237,  0.2331, -0.5259],
        [-0.5911,  2.2197,  0.0524, -0.1002,  0.0489]])
layer_norm = nn.LayerNorm(5)  # construct a LayerNorm module over the last dimension, which has 5 features
output_data = layer_norm(input_data)  # each sample is normalized over its feature dimension
out: output_data
tensor([[-1.8688,  0.5267, -0.1256,  0.4720,  0.9957],
        [ 1.6403, -0.6678,  0.0741, -1.3327,  0.2860],
        [ 1.3121,  1.0206, -0.6082, -1.2980, -0.4265],
        [-0.9967,  1.0371,  0.2990,  1.0102, -1.3495],
        [-0.2480,  1.7768, -1.0329,  0.2958, -0.7918],
        [-0.3975,  1.6805,  0.4078, -1.3048, -0.3860],
        [ 1.6248, -1.0345,  0.3209, -1.0767,  0.1655],
        [ 1.0957, -1.2726, -1.1262,  0.8281,  0.4750],
        [-0.9657, -1.2163,  0.4098,  1.5379,  0.2342],
        [-0.9398,  1.9408, -0.2803, -0.4368, -0.2839]],
       grad_fn=<NativeLayerNormBackward0>)
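The same result can be reproduced by hand, which makes the computation explicit (a minimal check against the output above; the name manual is just illustrative):

# Per-sample mean and biased variance over the feature dimension
mean = input_data.mean(-1, keepdim=True)
var = input_data.var(-1, keepdim=True, unbiased=False)
manual = (input_data - mean) / torch.sqrt(var + layer_norm.eps)
print(torch.allclose(manual, output_data, atol=1e-6))  # True: matches nn.LayerNorm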
Transformer layer normalization implemented as a class:
class LayerNorm(nn.Module):
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        # Learnable scale (gamma) initialized to ones, shift (beta) initialized to zeros
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        # Small constant for numerical stability
        self.eps = eps

    def forward(self, x):
        # Mean and biased variance over the last dimension, i.e. over each sample's features
        # (unbiased=False matches the variance used by nn.LayerNorm)
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, keepdim=True, unbiased=False)
        # Standardize, then apply the learnable scale and shift
        return self.a_2 * (x - mean) / torch.sqrt(var + self.eps) + self.b_2
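A quick sanity check, assuming the class above is defined alongside the earlier imports: with the same eps, the custom module should agree with nn.LayerNorm, since both start from γ = 1 and β = 0.

custom_ln = LayerNorm(5, eps=1e-5)
torch_ln = nn.LayerNorm(5, eps=1e-5)
x = torch.randn(10, 5)
print(torch.allclose(custom_ln(x), torch_ln(x), atol=1e-6))  # True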