BatchNorm explanation
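As a quick reference for what BatchNorm does: it normalizes each channel over the batch and spatial dimensions using the batch mean and variance, then applies a learnable scale and shift. Below is a minimal sketch (my own illustration, not from the original notes) comparing nn.BatchNorm2d at initialization with the formula written out by hand; eps=1e-5 is PyTorch's default.

import torch
import torch.nn as nn

x = torch.randn(8, 16, 32, 32)              # (batch, channels, height, width)
bn = nn.BatchNorm2d(16)                      # one learnable gamma/beta pair per channel
y = bn(x)                                    # training mode: normalize with batch statistics

mean = x.mean(dim=(0, 2, 3), keepdim=True)                 # per-channel mean over batch and spatial dims
var = x.var(dim=(0, 2, 3), unbiased=False, keepdim=True)   # biased variance, as used for normalization
y_manual = (x - mean) / torch.sqrt(var + 1e-5)             # gamma=1, beta=0 right after initialization
print(torch.allclose(y, y_manual, atol=1e-5))              # True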

The learning rate must not be decayed too quickly. In the first experiment, lr=0.001 with StepLR(step_size=3, gamma=0.1) on a three-layer CNN decayed the learning rate far too fast, and the final accuracy was only 18%. Watch out for vanishing gradients! (See the scheduler sketch after these notes.)
A large dataset is itself a natural form of data augmentation, so augmentation matters less there; with a small dataset and a large model, data augmentation is essential to mitigate overfitting.
On small models, Dropout may hurt performance: with limited capacity, most neurons are doing useful work and there are few redundant ones. If you do use Dropout, reduce the drop probability.
Be careful with pooling on low-resolution images; it easily throws away too much detail.
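To make the learning-rate note concrete, here is a small sketch (my own, not the original experiment code) of how quickly StepLR(step_size=3, gamma=0.1) shrinks an initial lr of 0.001: by epoch 9 it is already 1e-6, so the later epochs barely update the weights.

import torch
import torch.optim as optim

params = [torch.nn.Parameter(torch.zeros(1))]             # dummy parameter just to build an optimizer
opt = optim.SGD(params, lr=0.001, momentum=0.9)
sched = optim.lr_scheduler.StepLR(opt, step_size=3, gamma=0.1)

for epoch in range(10):
    print(epoch, sched.get_last_lr()[0])   # 0.001 for epochs 0-2, 1e-4 for 3-5, 1e-5 for 6-8, 1e-6 at 9
    opt.step()                             # stand-in for one epoch of training
    sched.step()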


CIFAR-10 images are 32×32 pixels with 3 channels (RGB).
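A quick sanity check of those dimensions, assuming the standard torchvision.datasets.CIFAR10 loader and a local ./data directory (a sketch, not the original data-loading code):

import torchvision
import torchvision.transforms as transforms

train_data = torchvision.datasets.CIFAR10(root='./data', train=True, download=True,
                                           transform=transforms.ToTensor())
img, label = train_data[0]
print(img.shape)                                   # torch.Size([3, 32, 32])
print(len(train_data), len(train_data.classes))    # 50000 10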

# kernel_size=3, padding=1, StepLR step_size=4, batch_size=128
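With kernel_size=3, padding=1 and stride=1, each convolution preserves the spatial size, since out = (in + 2·padding − kernel)/stride + 1 = (32 + 2 − 3)/1 + 1 = 32; only the MaxPool2d(2, 2) layers halve the feature map.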

# Experiment 1 (in experiment 2, raising lr to 0.01 improved training markedly)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

def train_model(model, train_loader, test_loader, epochs=10):
    criterion = nn.CrossEntropyLoss()
    # model.parameters(): tells the optimizer which parameters to update. It is not the momentum;
    # it is all of the model's trainable weights and biases, whose gradients drive the direction and size of each update.
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    # Learning-rate scheduler on the optimizer object: converge quickly early on, then fine-tune with a smaller lr for better final accuracy.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.5)
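The function body above stops at the scheduler; below is a minimal sketch of how the rest of the loop might continue (the device handling, accuracy printout, and variable names are my additions following the usual PyTorch pattern, with scheduler.step() called once per epoch).

    # ... continuation of train_model ...
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    for epoch in range(epochs):
        model.train()
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = criterion(model(images), labels)
            loss.backward()
            optimizer.step()
        scheduler.step()                      # decay the learning rate once per epoch

        model.eval()                          # evaluate on the test set
        correct = total = 0
        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(device), labels.to(device)
                preds = model(images).argmax(dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)
        print(f'epoch {epoch}: test accuracy {100 * correct / total:.2f}%')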

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 9, 3, padding=1)
        self.conv2 = nn.Conv2d(9, 27, 3, padding=1)
        self.conv3 = nn.Conv2d(27, 81, 3, padding=1)
        self.conv4 = nn.Conv2d(81, 81, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(in_features=81 * 2 * 2, out_features=10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = self.pool(F.relu(self.conv4(x)))
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        return x
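A quick shape check (a sketch I added): four MaxPool2d(2, 2) steps take the 32×32 input down through 16, 8, 4 to 2, which is why fc1 expects 81 * 2 * 2 input features.

model = CNN()
x = torch.randn(1, 3, 32, 32)
print(model(x).shape)    # torch.Size([1, 10]); spatial size shrinks 32 → 16 → 8 → 4 → 2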

# Experiment 3 (two extra conv layers); accuracy actually dropped compared with experiment 2 😂
# Added to __init__:
self.conv5 = nn.Conv2d(81, 81, 3, padding=1)
self.conv6 = nn.Conv2d(81, 81, 3, padding=1)

# Added to forward, before flattening:
x = F.relu(self.conv5(x))
x = F.relu(self.conv6(x))

# Experiment 4 (no pooling after the fourth conv layer, i.e. one fewer pooling step)
x = F.relu(self.conv4(x))
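With only three pooling steps, the map reaching the classifier is 4×4 instead of 2×2 (matching the final size of 4 in the results table), so the fully connected layer has to change as well; this adjustment is not shown above, but it would look like:

self.fc1 = nn.Linear(in_features=81 * 4 * 4, out_features=10)   # 32 → 16 → 8 → 4 after three poolings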

# Experiment 5 (add BatchNorm)
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.pool = nn.MaxPool2d(2, 2)

        self.conv1 = nn.Conv2d(3, 16, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.conv4 = nn.Conv2d(64, 128, 3, padding=1)
        self.bn4 = nn.BatchNorm2d(128)
        self.conv5 = nn.Conv2d(128, 128, 3, padding=1)
        self.bn5 = nn.BatchNorm2d(128)
        self.conv6 = nn.Conv2d(128, 128, 3, padding=1)
        self.bn6 = nn.BatchNorm2d(128)

        self.fc1 = nn.Linear(128 * 4 * 4, 10)

    def forward(self, x):
        # A BN layer right after each convolution
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))

        x = F.relu(self.bn4(self.conv4(x)))
        x = F.relu(self.bn5(self.conv5(x)))
        x = F.relu(self.bn6(self.conv6(x)))

        x = torch.flatten(x, 1)
        x = self.fc1(x)
        return x

# 第六次实验(数据增强,更换train_data和test_data的transform)
train_transform = transforms.Compose([
transforms.RandomCrop(32, padding=4), # 随机裁剪 + padding
transforms.RandomHorizontalFlip(), # 随机水平翻转
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), # CIFAR-10 均值
(0.2023, 0.1994, 0.2010)) # CIFAR-10 标准差
])

test_transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465),
(0.2023, 0.1994, 0.2010))
])
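A sketch of how these transforms would be attached to the datasets and loaders (the variable names and the batch size of 128 follow the results table; the rest is the standard torchvision/DataLoader pattern, not the original code):

import torch
import torchvision

train_data = torchvision.datasets.CIFAR10(root='./data', train=True, download=True,
                                           transform=train_transform)
test_data = torchvision.datasets.CIFAR10(root='./data', train=False, download=True,
                                         transform=test_transform)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=128, shuffle=True, num_workers=2)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=128, shuffle=False, num_workers=2)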

# Experiment 7 (add Dropout); did not help
# Added to __init__:
self.drop1 = nn.Dropout2d(0.1)
self.drop2 = nn.Dropout2d(0.1)
self.drop3 = nn.Dropout2d(0.1)
self.drop_fc = nn.Dropout(0.5)

# In forward:
x = self.pool(F.relu(self.bn1(self.conv1(x))))
x = self.drop1(x)
x = self.pool(F.relu(self.bn2(self.conv2(x))))
x = self.drop2(x)
x = self.pool(F.relu(self.bn3(self.conv3(x))))
x = self.drop3(x)

x = torch.flatten(x, 1)
x = self.drop_fc(x)
x = self.fc1(x)

# Experiment 8 (batch_size=30, step_size=6)

# Experiment 9 (batch_size=30, step_size=6, replace the CNN with ResNet18)
class BasicBlock(nn.Module):  # basic residual block
    expansion = 1  # output channels = planes * expansion

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        # Each conv is followed by BatchNorm, which cancels any bias, so bias=False saves parameters and a little time
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # An empty nn.Sequential() is the identity mapping, so shortcut(x) is simply x
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)  # residual connection: if channels or spatial size differ, the shortcut's 1×1 conv adapts the input x
        out = F.relu(out)
        return out
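A quick check of the two shortcut cases (a sketch I added): with stride=1 and matching channels the shortcut is the identity; with stride=2 or a channel change, the 1×1 convolution branch reshapes the input so the addition is valid.

block_same = BasicBlock(64, 64, stride=1)    # identity shortcut
block_down = BasicBlock(64, 128, stride=2)   # 1×1 conv shortcut
x = torch.randn(1, 64, 32, 32)
print(block_same(x).shape)   # torch.Size([1, 64, 32, 32])
print(block_down(x).shape)   # torch.Size([1, 128, 16, 16])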


class ResNet(nn.Module):
    # block is the residual-block type, num_blocks gives the number of blocks per stage, num_classes is the number of classes in the dataset
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        # "planes" is equivalent to "channels"; the name is historical and still common in PyTorch code
        self.in_planes = 64

        # First lift the 3 RGB channels to 64 while keeping the spatial size unchanged
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)

        # The residual blocks are split into four stages, which makes intermediate features easy to extract and visualize; _make_layer() lets us generate networks of different depths quickly
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        # A stride-2 convolution replaces pooling for downsampling: the conv is learnable, so it keeps more expressive power
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    # Helper method of the ResNet class that builds a stage out of several residual blocks automatically
    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []  # empty list that will hold the residual blocks
        for s in strides:
            layers.append(block(self.in_planes, planes, s))  # create one residual block
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # F.avg_pool2d is used instead of nn.AvgPool2d because the layer is parameter-free: no state to register or update, the kernel is fixed, and it saves one module object
        x = F.avg_pool2d(x, 4)  # global average pooling over the 4×4 feature map
        x = x.view(x.size(0), -1)
        x = self.linear(x)
        return x


def ResNet18():
    # Number of BasicBlocks per stage; 1 + 2*4*2 + 1 = 18 weighted layers, hence the name ResNet18
    return ResNet(BasicBlock, [2, 2, 2, 2])  # the arguments correspond to the parameters of __init__; the square brackets just build the num_blocks list
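Usage sketch (mine): the ResNet18 model plugs into the same train_model function; on a 32×32 input it produces 10 logits, and this CIFAR-style variant has roughly 11 million parameters.

model = ResNet18()
x = torch.randn(2, 3, 32, 32)
print(model(x).shape)                                  # torch.Size([2, 10])
print(sum(p.numel() for p in model.parameters()))      # roughly 11 million parameters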


ResNet18 architecture diagram (for reference only; my model differs from it considerably).


| Experiment | Model (in-channels, max channels, final map size) | Optimizer & learning rate | Batch size | Epochs | Train acc. | Test acc. |
|---|---|---|---|---|---|---|
| 1 | 4-layer CNN (3, 81, 2) | SGD(lr=0.001, momentum=0.9) | 128 | 10 | 23.08% | 23.31% |
| 2 | 4-layer CNN (3, 81, 2) | SGD(lr=0.01, momentum=0.9) | 128 | 20 | 74.15% | 66.15% |
| 3 | 6-layer CNN (3, 81, 2) | SGD(lr=0.01, momentum=0.9) | 128 | 10 | 51.19% | 52.11% |
| 4 | 6-layer CNN (3, 81, 4) | SGD(lr=0.01, momentum=0.9) | 128 | 10 | 58.59% | 58.72% |
| 5 | 6-layer CNN (3, 81, 4) + BN | SGD(lr=0.01, momentum=0.9) | 128 | 10 | 99.60% | 77.63% |
| 6 | 6-layer CNN (3, 81, 4) + BN | SGD(lr=0.01, momentum=0.9) | 128 | 10 | 80.24% | 79.45% |
| 7 | 6-layer CNN (3, 81, 4) + BN + Dropout | SGD(lr=0.01, momentum=0.9) | 128 | 10 | 67.78% | 73.05% |
| 8 | 6-layer CNN (3, 81, 4) + BN | SGD(lr=0.01, momentum=0.9) | 128 | 30 | 86.85% | 82.76% |
| 9 | ResNet18 | SGD(lr=0.01, momentum=0.9) | 128 | 30 | 98.02% | 90.87% |
| 10 | | | 128 | 10 | | |