
FCN
FCN(Fully Convolutional Networks,全卷积网络)是CNN语义分割的开山之作,FCN-8s是其最终优化版本,核心特点是像素级分类、多尺度融合,并通过上采样弥补细节。
传统分类网络(VGG、AlexNet)末尾是全连接层,输出单类别;FCN把全连接全部替换为卷积,让网络可接收任意尺寸输入,输出和原图等尺寸的分割热力图,实现端到端语义分割。
FCN分3个版本:FCN-32s → FCN-16s → FCN-8s(精度最高、最常用)
FCN-8s 核心:跳层融合 + 8 倍上采样
| 版本 | 融合层 | 最终上采样 | 效果 |
|---|---|---|---|
| FCN-32s | 仅 pool5 | 32 倍 | 最差,边缘粗糙 |
| FCN-16s | pool5 + pool4 | 16 倍 | 中等 |
| FCN-8s | pool5 + pool4 + pool3 | 8 倍 | 最优,工业经典基线 |
FCN-8s的网络结构如下:

创新与突破
首个端到端语义分割框架,奠定了后续分割网络的基础(U-Net、SegNet、DeepLab 均受其启发);支持任意尺寸输入,推理高效、结构简单;多尺度跳层融合思路成为分割标配。
VOC2012数据集
PASCAL Visual Object Classes Challenge 2012,由牛津+剑桥共同发布,CV 领域里程碑式数据集
核心用途:图像分类、目标检测、语义分割、实例分割
图像来源:Flickr真实场景,分辨率约300–600像素



代码构建
使用Lumos框架构建FCN-8s网络模型
// Passed as the first argument of every score and deconvolution layer below.
int num_class = 21;
Graph *graph = create_graph();
// Table of the 33 layers in network order; the shortcut layers further down
// refer back to earlier entries by index.
Layer **layers = malloc(33*sizeof(Layer*));
// NOTE(review): constructor arguments are presumably
//   convolution / deconvolution: (filters, kernel, stride, pad, bias flag, activation)
//   maxpool:                     (kernel, stride, pad)
// -- confirm against the Lumos layer API.
// Block 1: two 64-filter convolutions followed by a max-pool.
layers[0] = make_convolutional_layer(64, 3, 1, 1, 1, "relu");
layers[1] = make_convolutional_layer(64, 3, 1, 1, 1, "relu");
layers[2] = make_maxpool_layer(2, 2, 0);
// Block 2: two 128-filter convolutions followed by a max-pool.
layers[3] = make_convolutional_layer(128, 3, 1, 1, 1, "relu");
layers[4] = make_convolutional_layer(128, 3, 1, 1, 1, "relu");
layers[5] = make_maxpool_layer(2, 2, 0);
// Block 3: three 256-filter convolutions followed by a max-pool.
layers[6] = make_convolutional_layer(256, 3, 1, 1, 1, "relu");
layers[7] = make_convolutional_layer(256, 3, 1, 1, 1, "relu");
layers[8] = make_convolutional_layer(256, 3, 1, 1, 1, "relu");
layers[9] = make_maxpool_layer(2, 2, 0);
// ^ pool3 is layers[9]
// Block 4: three 512-filter convolutions followed by a max-pool.
layers[10] = make_convolutional_layer(512, 3, 1, 1, 1, "relu");
layers[11] = make_convolutional_layer(512, 3, 1, 1, 1, "relu");
layers[12] = make_convolutional_layer(512, 3, 1, 1, 1, "relu");
layers[13] = make_maxpool_layer(2, 2, 0);
// ^ pool4 is layers[13]
// Block 5: three 512-filter convolutions followed by a max-pool.
layers[14] = make_convolutional_layer(512, 3, 1, 1, 1, "relu");
layers[15] = make_convolutional_layer(512, 3, 1, 1, 1, "relu");
layers[16] = make_convolutional_layer(512, 3, 1, 1, 1, "relu");
layers[17] = make_maxpool_layer(2, 2, 0);
// ^ pool5 is layers[17]
// Convolutional replacements for the classifier's fully connected layers,
// each followed by dropout.
layers[18] = make_convolutional_layer(4096, 7, 1, 3, 1, "relu"); // fc6
layers[19] = make_dropout_layer(0.5);
layers[20] = make_convolutional_layer(4096, 1, 1, 0, 1, "relu"); // fc7
layers[21] = make_dropout_layer(0.5);
// Skip connections + upsampling
layers[22] = make_convolutional_layer(num_class, 1, 1, 0, 1, "linear"); // score_fr
layers[23] = make_deconvolutional_layer(num_class, 4, 2, 1, 0, "linear"); // up2
// Shortcut referencing layers[14], the first convolution after pool4.
// NOTE(review): presumably this re-routes the data path to pool4's output so
// the next layer can score it -- confirm the meaning of the second argument
// (1 here, 0 for the fuse shortcuts).
layers[24] = make_shortcut_layer(layers[14], 1, "linear");
layers[25] = make_convolutional_layer(num_class, 1, 1, 0, 1, "linear"); // score_pool4
layers[26] = make_shortcut_layer(layers[24], 0, "linear"); // fuse1
layers[27] = make_deconvolutional_layer(num_class, 4, 2, 1, 0, "linear"); // up4
// Same pattern one level up: references layers[10], the first convolution after pool3.
layers[28] = make_shortcut_layer(layers[10], 1, "linear");
layers[29] = make_convolutional_layer(num_class, 1, 1, 0, 1, "linear"); // score_pool3
layers[30] = make_shortcut_layer(layers[28], 0, "linear"); // second fuse, mirrors fuse1
// Final deconvolution; the third argument is 8, matching the 8x upsampling of FCN-8s.
layers[31] = make_deconvolutional_layer(num_class, 16, 8, 4, 0, "linear");
// Loss layer. NOTE(review): 255 is presumably the label value to ignore -- confirm.
layers[32] = make_crossentropy_layer(NULL, 255);
我们使用crossentropy(交叉熵)损失层对每个像素进行分类
特征提取层(VGG前18层)使用VGG预训练权重,上采样deconvolutional采用双线性插值初始化,其余卷积层采用随机初始化
// Append all 33 layers to the graph in index order and initialise their weights.
for (int i = 0; i < 33; ++i){
append_layer2grpah(graph, layers[i]);
Layer *l = layers[i];
// Every convolutional layer gets a Kaiming-uniform kernel and a zero bias.
// NOTE(review): this includes the backbone layers that the surrounding text
// says use pretrained VGG weights; presumably those are loaded later and
// overwrite this initialisation -- confirm.
if (l->type == CONVOLUTIONAL){
init_kaiming_uniform_kernel(l, 0, "fan_in", "relu");
init_constant_bias(l, 0);
}
// Deconvolutional (upsampling) layers get a bilinear-interpolation kernel.
if (l->type == DECONVOLUTIONAL){
init_bilinearinterp_kernel(l);
}
}
接下来创建会话,并设置相关训练超参数
// Create the session for the graph. `type` and `path` are the parameters of
// the enclosing function in the complete listing.
// NOTE(review): 320, 320, 3 presumably describe the input height, width and
// channel count, and 320*320 the per-image label size -- confirm.
Session *sess = create_session(graph, 320, 320, 3, 320*320, num_class, type, path);
// Three-element mean / standard-deviation buffers handed to the normalisation transform.
float *mean = calloc(3, sizeof(float));
float *std = calloc(3, sizeof(float));
mean[0] = 0.485;
mean[1] = 0.456;
mean[2] = 0.406;
std[0] = 0.229;
std[1] = 0.224;
std[2] = 0.225;
// Register normalisation first, then the resize to 320x320.
transform_normalize_sess(sess, mean, std);
transform_resize_sess(sess, 320, 320);
// NOTE(review): arguments are presumably epochs, batch, sub-batch and
// learning rate -- confirm against the Lumos session API.
set_train_params(sess, 200, 20, 20, 1e-4);
// NOTE(review): 0.9 is presumably the momentum and 2e-4 the weight decay -- confirm.
SGDOptimizer_sess(sess, 0.9, 0, 2e-4, 0, 0);
// Bind the training image list and label list, then start training.
init_session(sess, "./data/VOC2012/train.txt", "./data/VOC2012/train_label.txt");
train(sess);
可以看到我们对数据集进行了一定的预处理操作:首先对图像进行归一化,归一化所用的均值和标准差来自ImageNet数据集的统计结果;随后对图像进行缩放(320×320),使其符合网络模型的输入尺寸
我们使用SGD参数优化器进行参数优化
完整代码如下
#include "fcn8.h"
#include <stdio.h>
#include <stdlib.h>
/**
 * Build the 33-layer FCN-8s graph, initialise its weights and run training.
 *
 * The layer table is an automatic array: it is only ever read element by
 * element (append_layer2grpah() and make_shortcut_layer() receive individual
 * Layer pointers), so it needs no heap allocation and cannot leak.
 *
 * @param type  Forwarded unchanged to create_session().
 * @param path  Forwarded unchanged to create_session().
 *
 * If the normalisation buffers cannot be allocated, an error is printed to
 * stderr and the function returns before any graph or session is created.
 */
void fcn8(char *type, char *path)
{
    int num_class = 21;
    Layer *layers[33];
    const int layer_count = (int)(sizeof layers / sizeof layers[0]);

    /* Allocate first so a failure can bail out with nothing else to undo. */
    float *mean = calloc(3, sizeof *mean);
    float *std = calloc(3, sizeof *std);
    if (mean == NULL || std == NULL) {
        fprintf(stderr, "fcn8: failed to allocate normalisation buffers\n");
        free(mean);
        free(std);
        return;
    }
    mean[0] = 0.485f;
    mean[1] = 0.456f;
    mean[2] = 0.406f;
    std[0] = 0.229f;
    std[1] = 0.224f;
    std[2] = 0.225f;

    Graph *graph = create_graph();

    /* Backbone: five convolution blocks, each closed by a max-pool. */
    layers[0] = make_convolutional_layer(64, 3, 1, 1, 1, "relu");
    layers[1] = make_convolutional_layer(64, 3, 1, 1, 1, "relu");
    layers[2] = make_maxpool_layer(2, 2, 0);
    layers[3] = make_convolutional_layer(128, 3, 1, 1, 1, "relu");
    layers[4] = make_convolutional_layer(128, 3, 1, 1, 1, "relu");
    layers[5] = make_maxpool_layer(2, 2, 0);
    layers[6] = make_convolutional_layer(256, 3, 1, 1, 1, "relu");
    layers[7] = make_convolutional_layer(256, 3, 1, 1, 1, "relu");
    layers[8] = make_convolutional_layer(256, 3, 1, 1, 1, "relu");
    layers[9] = make_maxpool_layer(2, 2, 0);   /* pool3 */
    layers[10] = make_convolutional_layer(512, 3, 1, 1, 1, "relu");
    layers[11] = make_convolutional_layer(512, 3, 1, 1, 1, "relu");
    layers[12] = make_convolutional_layer(512, 3, 1, 1, 1, "relu");
    layers[13] = make_maxpool_layer(2, 2, 0);  /* pool4 */
    layers[14] = make_convolutional_layer(512, 3, 1, 1, 1, "relu");
    layers[15] = make_convolutional_layer(512, 3, 1, 1, 1, "relu");
    layers[16] = make_convolutional_layer(512, 3, 1, 1, 1, "relu");
    layers[17] = make_maxpool_layer(2, 2, 0);  /* pool5 */

    /* Convolutional replacements for the fully connected layers. */
    layers[18] = make_convolutional_layer(4096, 7, 1, 3, 1, "relu"); /* fc6 */
    layers[19] = make_dropout_layer(0.5);
    layers[20] = make_convolutional_layer(4096, 1, 1, 0, 1, "relu"); /* fc7 */
    layers[21] = make_dropout_layer(0.5);

    /*
     * Skip connections + upsampling.
     * NOTE(review): the shortcuts with second argument 1 presumably re-route
     * the data path to the input of the referenced layer (pool4 / pool3
     * output) and those with 0 add the earlier branch back in -- confirm
     * against the Lumos shortcut-layer API.
     */
    layers[22] = make_convolutional_layer(num_class, 1, 1, 0, 1, "linear");   /* score_fr */
    layers[23] = make_deconvolutional_layer(num_class, 4, 2, 1, 0, "linear"); /* up2 */
    layers[24] = make_shortcut_layer(layers[14], 1, "linear");
    layers[25] = make_convolutional_layer(num_class, 1, 1, 0, 1, "linear");   /* score_pool4 */
    layers[26] = make_shortcut_layer(layers[24], 0, "linear");                /* fuse1 */
    layers[27] = make_deconvolutional_layer(num_class, 4, 2, 1, 0, "linear"); /* up4 */
    layers[28] = make_shortcut_layer(layers[10], 1, "linear");
    layers[29] = make_convolutional_layer(num_class, 1, 1, 0, 1, "linear");   /* score_pool3 */
    layers[30] = make_shortcut_layer(layers[28], 0, "linear");
    layers[31] = make_deconvolutional_layer(num_class, 16, 8, 4, 0, "linear");
    layers[32] = make_crossentropy_layer(NULL, 255);

    /* Register every layer in order and initialise the trainable ones. */
    for (int i = 0; i < layer_count; ++i) {
        Layer *l = layers[i];
        append_layer2grpah(graph, l);
        if (l->type == CONVOLUTIONAL) {
            init_kaiming_uniform_kernel(l, 0, "fan_in", "relu");
            init_constant_bias(l, 0);
        }
        if (l->type == DECONVOLUTIONAL) {
            init_bilinearinterp_kernel(l);
        }
    }

    Session *sess = create_session(graph, 320, 320, 3, 320*320, num_class, type, path);

    /*
     * mean/std are handed to the session here. Whether the session keeps the
     * pointers is not visible from this file, so they are deliberately not
     * freed -- TODO confirm ownership.
     */
    transform_normalize_sess(sess, mean, std);
    transform_resize_sess(sess, 320, 320);
    set_train_params(sess, 200, 20, 20, 1e-4);
    SGDOptimizer_sess(sess, 0.9, 0, 2e-4, 0, 0);
    init_session(sess, "./data/VOC2012/train.txt", "./data/VOC2012/train_label.txt");
    train(sess);
}
/**
 * Build the same 33-layer FCN-8s graph as the training entry point and run
 * segmentation inference on it.
 *
 * No weight initialisation is performed here; the layers are only appended
 * to the graph. The layer table is an automatic array because it is only
 * read element by element and never handed out as a whole.
 *
 * @param type  Forwarded unchanged to create_session().
 * @param path  Forwarded unchanged to create_session().
 *
 * If the normalisation buffers cannot be allocated, an error is printed to
 * stderr and the function returns before any graph or session is created.
 */
void fcn8_detect(char *type, char *path)
{
    int num_class = 21;
    Layer *layers[33];
    const int layer_count = (int)(sizeof layers / sizeof layers[0]);

    /* Allocate first so a failure can bail out with nothing else to undo. */
    float *mean = calloc(3, sizeof *mean);
    float *std = calloc(3, sizeof *std);
    if (mean == NULL || std == NULL) {
        fprintf(stderr, "fcn8_detect: failed to allocate normalisation buffers\n");
        free(mean);
        free(std);
        return;
    }
    mean[0] = 0.485f;
    mean[1] = 0.456f;
    mean[2] = 0.406f;
    std[0] = 0.229f;
    std[1] = 0.224f;
    std[2] = 0.225f;

    Graph *graph = create_graph();

    /* Backbone: five convolution blocks, each closed by a max-pool. */
    layers[0] = make_convolutional_layer(64, 3, 1, 1, 1, "relu");
    layers[1] = make_convolutional_layer(64, 3, 1, 1, 1, "relu");
    layers[2] = make_maxpool_layer(2, 2, 0);
    layers[3] = make_convolutional_layer(128, 3, 1, 1, 1, "relu");
    layers[4] = make_convolutional_layer(128, 3, 1, 1, 1, "relu");
    layers[5] = make_maxpool_layer(2, 2, 0);
    layers[6] = make_convolutional_layer(256, 3, 1, 1, 1, "relu");
    layers[7] = make_convolutional_layer(256, 3, 1, 1, 1, "relu");
    layers[8] = make_convolutional_layer(256, 3, 1, 1, 1, "relu");
    layers[9] = make_maxpool_layer(2, 2, 0);   /* pool3 */
    layers[10] = make_convolutional_layer(512, 3, 1, 1, 1, "relu");
    layers[11] = make_convolutional_layer(512, 3, 1, 1, 1, "relu");
    layers[12] = make_convolutional_layer(512, 3, 1, 1, 1, "relu");
    layers[13] = make_maxpool_layer(2, 2, 0);  /* pool4 */
    layers[14] = make_convolutional_layer(512, 3, 1, 1, 1, "relu");
    layers[15] = make_convolutional_layer(512, 3, 1, 1, 1, "relu");
    layers[16] = make_convolutional_layer(512, 3, 1, 1, 1, "relu");
    layers[17] = make_maxpool_layer(2, 2, 0);  /* pool5 */

    /* Convolutional replacements for the fully connected layers. */
    layers[18] = make_convolutional_layer(4096, 7, 1, 3, 1, "relu"); /* fc6 */
    layers[19] = make_dropout_layer(0.5);
    layers[20] = make_convolutional_layer(4096, 1, 1, 0, 1, "relu"); /* fc7 */
    layers[21] = make_dropout_layer(0.5);

    /*
     * Skip connections + upsampling.
     * NOTE(review): the shortcuts with second argument 1 presumably re-route
     * the data path to the input of the referenced layer and those with 0 add
     * the earlier branch back in -- confirm against the Lumos API.
     */
    layers[22] = make_convolutional_layer(num_class, 1, 1, 0, 1, "linear");   /* score_fr */
    layers[23] = make_deconvolutional_layer(num_class, 4, 2, 1, 0, "linear"); /* up2 */
    layers[24] = make_shortcut_layer(layers[14], 1, "linear");
    layers[25] = make_convolutional_layer(num_class, 1, 1, 0, 1, "linear");   /* score_pool4 */
    layers[26] = make_shortcut_layer(layers[24], 0, "linear");                /* fuse1 */
    layers[27] = make_deconvolutional_layer(num_class, 4, 2, 1, 0, "linear"); /* up4 */
    layers[28] = make_shortcut_layer(layers[10], 1, "linear");
    layers[29] = make_convolutional_layer(num_class, 1, 1, 0, 1, "linear");   /* score_pool3 */
    layers[30] = make_shortcut_layer(layers[28], 0, "linear");
    layers[31] = make_deconvolutional_layer(num_class, 16, 8, 4, 0, "linear");
    layers[32] = make_crossentropy_layer(NULL, 255);

    /* Register every layer in order; no initialisation on the inference path. */
    for (int i = 0; i < layer_count; ++i) {
        append_layer2grpah(graph, layers[i]);
    }

    Session *sess = create_session(graph, 320, 320, 3, 320*320, num_class, type, path);

    /*
     * mean/std are handed to the session here. Whether the session keeps the
     * pointers is not visible from this file, so they are deliberately not
     * freed -- TODO confirm ownership.
     */
    transform_normalize_sess(sess, mean, std);
    transform_resize_sess(sess, 320, 320);
    set_detect_params(sess);
    /* NOTE(review): inference reads the training lists; confirm this is intended. */
    init_session(sess, "./data/VOC2012/train.txt", "./data/VOC2012/train_label.txt");
    detect_segmentation(sess);
}
在Lumos框架的demo目录下,您能找到fcn8.c文件,这就是我们已实现的FCN-8s模型
// Example invocation: both arguments are forwarded to create_session() as `type` and `path`.
fcn8("gpu", "VGG.lw");
结果展示
测试超参数为:lr(0.001)、batch(4)、decay(1e-4)、momentum(0.9)、epoch(50)



