Here I'd like to share my reading of the code of the DilatedEncoder module.
DilatedEncoder is a module proposed in the CVPR 2021 paper You Only Look One-Level Feature (YOLOF). Its main job is to enlarge the receptive field: the authors introduce it to compensate for the limited receptive field that comes from using only the deepest feature map of the backbone.
paper: CVPR 2021 Open Access Repository
code: https://github.com/megvii-model/YOLOF
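To see why stacking dilated convolutions enlarges the receptive field, here is a minimal sketch (my own illustration, not from the paper): a 3x3 convolution with dilation d covers the same span as a (2d + 1) x (2d + 1) kernel, so the four dilation rates used below, 2, 4, 6 and 8, span 5, 9, 13 and 17 pixels respectively, and the spans accumulate as the residual blocks are stacked.

# Effective span of a dilated k x k convolution: d * (k - 1) + 1 pixels per side.
k = 3
for d in [2, 4, 6, 8]:
    print(f"dilation={d}: a {k}x{k} conv spans {d * (k - 1) + 1} pixels")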
import torch
import torch.nn as nn
#------------------------------------------#
# Definition of the residual block used in
# the residual stage of DilatedEncoder.
# Each block consists of three conv + batch
# normalization + ReLU stages, with kernel
# sizes 1x1, 3x3 and 1x1 respectively.
#------------------------------------------#
class Bottleneck(nn.Module):
    def __init__(self, in_channels=512, mid_channels=128, dilation=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Sequential(  # conv + BN + ReLU, with a 1x1 conv
            nn.Conv2d(in_channels, mid_channels, kernel_size=1, padding=0),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True)
        )
        self.conv2 = nn.Sequential(  # conv + BN + ReLU, with a 3x3 dilated conv
            nn.Conv2d(mid_channels, mid_channels, kernel_size=3, padding=dilation, dilation=dilation),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True)
        )
        self.conv3 = nn.Sequential(  # conv + BN + ReLU, with a 1x1 conv
            nn.Conv2d(mid_channels, in_channels, kernel_size=1, padding=0),
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        out = self.conv1(x)
        out = self.conv2(out)
        out = self.conv3(out)
        out = out + x  # residual (identity) connection
        return out
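#------------------------------------------#
# Quick sanity check (my addition, not in
# the YOLOF repo): conv3 maps back to
# in_channels and the 3x3 conv uses
# padding == dilation, so the block keeps
# both the channel count and the spatial
# size, which is what makes the residual
# addition valid:
#   blk = Bottleneck(512, 128, dilation=2)
#   print(blk(torch.rand(1, 512, 13, 13)).shape)  # torch.Size([1, 512, 13, 13])
#------------------------------------------#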
#------------------------------------------#
# The DilatedEncoder structure
#------------------------------------------#
class DilatedEncoder(nn.Module):
    def __init__(self, in_channels=1024, encoder_channels=512, block_mid_channels=128, num_residual_blocks=4, block_dilations=[2, 4, 6, 8]):
        super(DilatedEncoder, self).__init__()
        self.in_channels = in_channels
        self.encoder_channels = encoder_channels
        self.block_mid_channels = block_mid_channels
        self.num_residual_blocks = num_residual_blocks
        self.block_dilations = block_dilations
        # This assertion ties the length of block_dilations to the number of
        # residual blocks: each block gets exactly one dilation rate.
        assert len(self.block_dilations) == self.num_residual_blocks
        # Build the layers and initialize their weights.
        self._init_layers()
        self._init_weight()
    #--------------------------------------#
    # Build the projector and the residual
    # stage; the latter consists of four
    # residual blocks.
    #--------------------------------------#
    def _init_layers(self):
        #-----------------------------#
        # First, build the projector.
        #-----------------------------#
        # A 1x1 conv + batch normalization ...
        self.lateral_conv = nn.Conv2d(self.in_channels, self.encoder_channels, kernel_size=1)
        self.lateral_norm = nn.BatchNorm2d(self.encoder_channels)
        # ... followed by a 3x3 conv + batch normalization.
        self.fpn_conv = nn.Conv2d(self.encoder_channels, self.encoder_channels, kernel_size=3, padding=1)
        self.fpn_norm = nn.BatchNorm2d(self.encoder_channels)
        #-----------------------------#
        # Next, build the residual stage.
        #-----------------------------#
        encoder_blocks = []
        # The stage holds four residual blocks, each made of three
        # conv + BN + ReLU stages.
        for i in range(self.num_residual_blocks):
            # Assign each block its dilation rate; from left to right the
            # four blocks use 2, 4, 6 and 8.
            dilation = self.block_dilations[i]
            # Instantiate the block.
            encoder_blocks.append(
                Bottleneck(
                    self.encoder_channels,
                    self.block_mid_channels,
                    dilation=dilation
                )
            )
        # Chain the four residual blocks to obtain the residual stage.
        #print(encoder_blocks[0])
        self.dilated_encoder_blocks = nn.Sequential(*encoder_blocks)  # pack up
    def xavier_init(self, layer):
        if isinstance(layer, nn.Conv2d):  # only initialize 2D convolution layers
            #---------------------------------------------#
            # The idea behind Xavier initialization is to
            # keep the variance of the signal the same
            # across layers, in both the forward and the
            # backward pass. Here the weights are drawn
            # from a uniform distribution.
            #---------------------------------------------#
            nn.init.xavier_uniform_(layer.weight, gain=1)
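    #---------------------------------------------#
    # Illustration (my addition, not in the YOLOF
    # repo): with gain=1, xavier_uniform_ draws
    # weights from U(-b, b) with
    # b = sqrt(6 / (fan_in + fan_out)).
    # For lateral_conv (a 1x1 conv, 1024 -> 512)
    # this gives b = sqrt(6 / 1536) = 0.0625.
    #---------------------------------------------#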
    #-------------------------------------------#
    # Initialize the weights of the Dilated
    # Encoder.
    #-------------------------------------------#
    def _init_weight(self):
        # Apply Xavier initialization to the projector's convolutions.
        self.xavier_init(self.lateral_conv)
        self.xavier_init(self.fpn_conv)
        # Set the projector's normalization layers to the identity:
        for m in [self.lateral_norm, self.fpn_norm]:
            nn.init.constant_(m.weight, 1)  # fill the weights with 1
            nn.init.constant_(m.bias, 0)    # fill the biases with 0, i.e. no shift
        # For the four residual blocks of the residual stage:
        for m in self.dilated_encoder_blocks.modules():
            # convolution layers ...
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight, mean=0, std=0.01)  # draw the weights from a normal distribution
                if hasattr(m, 'bias') and m.bias is not None:  # if the conv has a bias term, zero it
                    nn.init.constant_(m.bias, 0)
            # ... and normalization layers.
            if isinstance(m, (nn.GroupNorm, nn.BatchNorm2d, nn.SyncBatchNorm)):
                nn.init.constant_(m.weight, 1)  # constant weight of 1
                nn.init.constant_(m.bias, 0)    # constant bias of 0
    # -------------------------------------------#
    # Forward pass of the DilatedEncoder module.
    # -------------------------------------------#
    def forward(self, feature):  # operates on a single feature map
        out = self.lateral_norm(self.lateral_conv(feature))  # projector, first stage: conv then normalization
        out = self.fpn_norm(self.fpn_conv(out))              # projector, second stage: conv then normalization
        out = self.dilated_encoder_blocks(out)               # finally, pass through the residual stage
        return out
# A quick smoke test: feed a random batch of 13x13 feature maps with 1024
# channels through the encoder and check the output shape.
x = torch.rand(16, 1024, 13, 13)
print('Before the Dilated Encoder:')
print(x.shape)  # torch.Size([16, 1024, 13, 13])
encoder = DilatedEncoder()
y = encoder(x)
print('After the Dilated Encoder:')
print(y.shape)  # torch.Size([16, 512, 13, 13])
#print(encoder)
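As a usage sketch, the encoder is meant to sit on top of the C5 (deepest) feature map of a backbone. The snippet below is my own illustration, assuming a torchvision (>= 0.13) ResNet-18, whose last stage outputs 512 channels; YOLOF itself uses a ResNet-50, whose C5 has 2048 channels.

import torchvision

# Keep everything up to layer4 (drop the average pooling and the fc head)
# so the backbone returns the stride-32 C5 feature map.
backbone = torchvision.models.resnet18(weights=None)
c5_extractor = nn.Sequential(*list(backbone.children())[:-2])

img = torch.rand(1, 3, 416, 416)
c5 = c5_extractor(img)                     # torch.Size([1, 512, 13, 13])
encoder = DilatedEncoder(in_channels=512)  # match C5's channel count
print(encoder(c5).shape)                   # torch.Size([1, 512, 13, 13])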