References (optional reading):
What is the difference between the Adam and AdamW optimizers
torch.optim
Using transforms.Compose()
StepLR official documentation


Convolution extracts local features and abstracts them layer by layer. Pooling downsamples: it enlarges the receptive field, cuts computation, and adds some translation invariance. The channel count grows with depth to preserve and expand representational capacity. At the end, the spatial information is summarized (flatten or global pooling) and handed to the classifier for the final decision.

As the spatial size shrinks, the channel count is usually increased (e.g. 32→64→128) so that representational power is not lost: more kinds of feature detectors are packed into a smaller grid. Resolution is reduced spatially; capacity is compensated along the channel dimension.

Pooling lowers spatial resolution (discarding some pixel-level detail), but each remaining spatial position carries more channels, i.e. more kinds of feature detectors. Those channels can encode richer, more abstract local information, which partially compensates for the spatial loss, as the shape trace below illustrates.
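A minimal shape-trace sketch of this trade-off, assuming a 28×28 grayscale input like the network below (the conv/pool settings mirror SimpleCNN; the helper block is only for illustration):

import torch
from torch import nn

def block(c_in, c_out):
    # 3x3 conv with padding=1 keeps the spatial size; 2x2 max-pool halves it (floor)
    return nn.Sequential(nn.Conv2d(c_in, c_out, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2))

x = torch.randn(1, 1, 28, 28)  # [batch, channels, H, W]
for c_in, c_out in [(1, 32), (32, 64), (64, 128)]:
    x = block(c_in, c_out)(x)
    print(tuple(x.shape))
# (1, 32, 14, 14) -> (1, 64, 7, 7) -> (1, 128, 3, 3): fewer pixels, more channels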

The computation graph records how to compute gradients; backpropagation is simply evaluating those gradients on the graph. The result is stored in each parameter's .grad, and the optimizer then uses .grad to update the parameter's value.
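A minimal autograd sketch of that pipeline (a toy scalar example, not part of the network below): build the graph in the forward pass, backpropagate, read .grad, then apply one manual gradient-descent step, which is the job optimizer.step() does later in this script.

import torch

w = torch.tensor(2.0, requires_grad=True)
x = torch.tensor(3.0)
loss = (w * x - 1.0) ** 2  # forward pass builds the computation graph
loss.backward()            # backprop: fills w.grad with d(loss)/dw
print(w.grad)              # 2 * (w*x - 1) * x = 2 * 5 * 3 = 30.0
with torch.no_grad():      # parameter updates must not be recorded in the graph
    w -= 0.01 * w.grad     # one gradient-descent step, the role of optimizer.step()
w.grad.zero_()             # reset for the next iteration, like optimizer.zero_grad()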

221
import torch
from torch import nn, optim  # torch.optim implements a variety of optimization algorithms
from torchvision import datasets, transforms  # datasets loads MNIST conveniently and safely; transforms handles preprocessing such as tensor conversion and normalization
from torch.utils.data import DataLoader  # training/validation cannot load all data at once; use mini-batches
import matplotlib.pyplot as plt

# "cuda:0" selects the first GPU explicitly; it is equivalent to "cuda" (which defaults to the first one)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# 0.1307 and 0.3081 are the mean and standard deviation of the full MNIST dataset (officially precomputed)
transform_train = transforms.Compose([
    # Convert a PIL.Image or numpy.ndarray to a torch Tensor, scaling pixel values from 0-255 down to 0-1
    transforms.ToTensor(),
    # Standardize: a roughly zero-centered, symmetric distribution trains better and, judging by the activation derivatives, mitigates vanishing gradients
    transforms.Normalize((0.1307,), (0.3081,))
])
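# Normalize computes (x - mean) / std per channel: a 0-1 pixel value v becomes
# (v - 0.1307) / 0.3081, e.g. v = 0 maps to about -0.42 and v = 1 to about 2.82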
# transforms.Compose groups multiple preprocessing steps; it keeps the code clear and maintainable but is not strictly necessary
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Download the training and test sets
train_data = datasets.MNIST(root='./data', train=True, download=True, transform=transform_train)
test_data = datasets.MNIST(root='./data', train=False, download=True, transform=transform_test)


class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()

        # 1 input channel (grayscale), 32 output channels (feature maps), 3x3 kernel;
        # padding=1 keeps the spatial size unchanged through the convolution itself
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)

        # 2x2 max-pooling window, stride=2 (default): halves the feature map each time,
        # keeping the dominant activations and reducing computation
        self.pool = nn.MaxPool2d(2)
        self.dropout = nn.Dropout(0.25)

        # After the last block the map is 128 channels of 3x3 (28->14->7->3),
        # so fc1 sees 3 * 3 * 128 = 1152 input features
        self.fc1 = nn.Linear(3 * 3 * 128, 256)
        self.fc2 = nn.Linear(256, 10)

    # Input is a [batch, 1, 28, 28] tensor (already scaled and standardized), i.e. raw 28x28 grayscale images
    def forward(self, x):
        # Spatial sizes noted below assume the 28x28 input
        x = torch.relu(self.conv1(x))
        x = self.pool(x)  # 28 -> 14

        x = torch.relu(self.conv2(x))
        x = self.pool(x)  # 14 -> 7

        x = torch.relu(self.conv3(x))
        x = self.pool(x)  # 7 -> 3 (floor division)

        # Flatten into the fully connected layers
        x = torch.flatten(x, 1)
        x = self.dropout(x)
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x
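# Optional sanity check: SimpleCNN()(torch.randn(2, 1, 28, 28)).shape == torch.Size([2, 10])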


# Default arguments guard against calls that omit them
def train_model(model, train_loader, test_loader, epochs=10):
    # An already-instantiated cross-entropy loss object
    criterion = nn.CrossEntropyLoss()
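    # CrossEntropyLoss applies log-softmax internally, which is why forward() returns raw logits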

    # Adam optimizer; weight_decay adds an L2 penalty (coupled into Adam's update, unlike AdamW's decoupled decay)
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

    # Learning-rate scheduler; optim.lr_scheduler is the module. StepLR multiplies the lr by a decay factor (gamma) every step_size steps
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.5)
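    # With lr=0.001, step_size=4, gamma=0.5 and one scheduler.step() per epoch:
    # epochs 1-4 run at 1e-3, epochs 5-8 at 5e-4, epochs 9-10 at 2.5e-4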

    # Records for plotting
    train_losses, train_accuracies, test_accuracies = [], [], []

    for epoch in range(epochs):
        # Training phase: train() enables the training behavior of Dropout/BatchNorm, while eval() turns that randomness off
        model.train()
        running_loss = 0.0
        correct_train = 0
        total_train = 0

        # Move batches to the device one at a time to save memory; the dataset is usually too large to load onto the GPU at once
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            # Forward pass
            outputs = model(inputs)
            # Cross-entropy loss for multi-class classification
            loss = criterion(outputs, labels)
            # Backward pass
            loss.backward()
            # Update the weights
            optimizer.step()

            # .item() converts the scalar tensor to a Python float; running_loss accumulates each batch's loss
            running_loss += loss.item()
            # We only care which class scores highest, not the score itself: torch.max returns (values, indices) along the given dim
            _, predicted = torch.max(outputs, 1)
            total_train += labels.size(0)
            correct_train += (predicted == labels).sum().item()

        # Average per-batch loss
        avg_loss = running_loss / len(train_loader)
        train_acc = 100 * correct_train / total_train

        # Evaluation phase
        model.eval()
        correct_test = 0
        total_test = 0

        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                _, predicted = torch.max(outputs, 1)
                total_test += labels.size(0)
                correct_test += (predicted == labels).sum().item()

        test_acc = 100 * correct_test / total_test

        train_losses.append(avg_loss)
        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)

        print(f"Epoch {epoch + 1}/{epochs}")
        print(f" Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%, Test Acc: {test_acc:.2f}%")
        print(f" Learning Rate: {scheduler.get_last_lr()[0]:.6f}")

        scheduler.step()

    return model, train_losses, train_accuracies, test_accuracies



# Using ax1.plot/ax2.plot instead of plt.plot pins each curve to a specific subplot (there are two here), so nothing lands on the wrong axes
def plot_results(train_losses, train_accuracies, test_accuracies):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    # Loss curve
    ax1.plot(range(1, len(train_losses) + 1), train_losses)
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.set_title('Training Loss Curve')
    ax1.grid(True)

    # Accuracy curves
    ax2.plot(range(1, len(train_accuracies) + 1), train_accuracies, label='Train Accuracy')
    ax2.plot(range(1, len(test_accuracies) + 1), test_accuracies, label='Test Accuracy')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy (%)')
    ax2.set_title('Accuracy Curves')
    ax2.legend()
    ax2.grid(True)

    plt.tight_layout()
    plt.show()


def main():
    train_loader = DataLoader(train_data, batch_size=128, shuffle=True, num_workers=2)
    test_loader = DataLoader(test_data, batch_size=128, shuffle=False, num_workers=2)

    # Move the model to the GPU so model and data live on the same device during training
    model = SimpleCNN().to(device)

    print(f"Number of model parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

    # Train the model
    model, train_losses, train_accuracies, test_accuracies = train_model(model, train_loader, test_loader, epochs=10)

    # Plot the results
    plot_results(train_losses, train_accuracies, test_accuracies)

    # Save the model
    torch.save(model.state_dict(), "mnist_cnn.pth")

    # Final test accuracy
    print(f"\nFinal test accuracy: {test_accuracies[-1]:.2f}%")



    # ===================== Load and verify =====================
    loaded_model = SimpleCNN().to(device)
    loaded_model.load_state_dict(torch.load("mnist_cnn.pth", map_location=device))
    loaded_model.eval()

    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = loaded_model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Test accuracy of the reloaded model: {100 * correct / total:.2f}%")

    # ===================== Single-image prediction =====================
    image, label = test_data[0]
    plt.imshow(image.squeeze(), cmap="gray")
    plt.title(f"True: {label}")
    plt.show()

    image = image.unsqueeze(0).to(device)  # [1, 1, 28, 28]
    with torch.no_grad():
        output = loaded_model(image)
        predicted = torch.argmax(output, 1).item()

    print(f"Predict: {predicted}")



if __name__ == "__main__":
    main()