1. Hand-written 2D convolution in Python (an ugly but easy-to-memorize version)
import numpy as np

def conv2d(img, in_channels, out_channels, kernels, bias, stride=1, padding=0):
    # img: (N, C, H, W); kernels: (out_channels, in_channels, kh, kw); bias: (out_channels,)
    N, C, H, W = img.shape
    kh, kw = kernels.shape[-2:]
    p = padding
    assert C == in_channels, "kernels' input channels do not match with img"
    if p:
        img = np.pad(img, ((0, 0), (0, 0), (p, p), (p, p)), 'constant')  # zero-pad the spatial axes only
    out_h = (H + 2 * padding - kh) // stride + 1
    out_w = (W + 2 * padding - kw) // stride + 1
    outputs = np.zeros([N, out_channels, out_h, out_w])
    for n in range(N):                      # batch
        for out in range(out_channels):     # output channel
            for i in range(in_channels):    # input channel
                for h in range(out_h):
                    for w in range(out_w):
                        for x in range(kh):
                            for y in range(kw):
                                outputs[n][out][h][w] += img[n][i][h * stride + x][w * stride + y] * kernels[out][i][x][y]
            # add the per-channel bias once, after accumulating over all input channels
            outputs[n][out] += bias[out]
    return outputs
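A quick sanity check against PyTorch's F.conv2d (which computes the same cross-correlation); the shapes below are arbitrary and only meant as a sketch:

import numpy as np
import torch
import torch.nn.functional as F

img = np.random.randn(2, 3, 8, 8)        # (N, C, H, W), chosen arbitrarily
kernels = np.random.randn(4, 3, 3, 3)    # (out_channels, in_channels, kh, kw)
bias = np.random.randn(4)
out_np = conv2d(img, 3, 4, kernels, bias, stride=1, padding=1)
out_pt = F.conv2d(torch.from_numpy(img), torch.from_numpy(kernels),
                  torch.from_numpy(bias), stride=1, padding=1)
print(np.allclose(out_np, out_pt.numpy()))  # expected: True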
2. Hand-written self-attention and multi-head self-attention in PyTorch
- Self-attention
from math import sqrt
import torch
import torch.nn as nn

class SelfAttention(nn.Module):
    dim_in: int
    dim_k: int
    dim_v: int

    def __init__(self, dim_in, dim_k, dim_v):
        super(SelfAttention, self).__init__()
        self.dim_in = dim_in
        self.dim_k = dim_k
        self.dim_v = dim_v
        self.linear_q = nn.Linear(dim_in, dim_k, bias=False)
        self.linear_k = nn.Linear(dim_in, dim_k, bias=False)
        self.linear_v = nn.Linear(dim_in, dim_v, bias=False)
        self._norm_fact = 1 / sqrt(dim_k)

    def forward(self, x):
        # x: batch, n, dim_in
        batch, n, dim_in = x.shape
        assert dim_in == self.dim_in
        q = self.linear_q(x)  # batch, n, dim_k
        k = self.linear_k(x)  # batch, n, dim_k
        v = self.linear_v(x)  # batch, n, dim_v
        dist = torch.bmm(q, k.transpose(1, 2)) * self._norm_fact  # batch, n, n
        dist = torch.softmax(dist, dim=-1)  # batch, n, n
        att = torch.bmm(dist, v)
        return att
- Multi-head self-attention
from math import sqrt
import torch
import torch.nn as nn

class MultiHeadSelfAttention(nn.Module):
    dim_in: int     # input dimension
    dim_k: int      # key and query dimension
    dim_v: int      # value dimension
    num_heads: int  # number of heads, for each head, dim_* = dim_* // num_heads

    def __init__(self, dim_in, dim_k, dim_v, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        assert dim_k % num_heads == 0 and dim_v % num_heads == 0, "dim_k and dim_v must be multiple of num_heads"
        self.dim_in = dim_in
        self.dim_k = dim_k
        self.dim_v = dim_v
        self.num_heads = num_heads
        self.linear_q = nn.Linear(dim_in, dim_k, bias=False)
        self.linear_k = nn.Linear(dim_in, dim_k, bias=False)
        self.linear_v = nn.Linear(dim_in, dim_v, bias=False)
        self._norm_fact = 1 / sqrt(dim_k // num_heads)

    def forward(self, x):
        # x: tensor of shape (batch, n, dim_in)
        batch, n, dim_in = x.shape
        assert dim_in == self.dim_in
        nh = self.num_heads
        dk = self.dim_k // nh  # dim_k of each head
        dv = self.dim_v // nh  # dim_v of each head
        q = self.linear_q(x).reshape(batch, n, nh, dk).transpose(1, 2)  # (batch, nh, n, dk)
        k = self.linear_k(x).reshape(batch, n, nh, dk).transpose(1, 2)  # (batch, nh, n, dk)
        v = self.linear_v(x).reshape(batch, n, nh, dv).transpose(1, 2)  # (batch, nh, n, dv)
        dist = torch.matmul(q, k.transpose(2, 3)) * self._norm_fact  # batch, nh, n, n
        dist = torch.softmax(dist, dim=-1)  # batch, nh, n, n
        att = torch.matmul(dist, v)  # batch, nh, n, dv
        att = att.transpose(1, 2).reshape(batch, n, self.dim_v)  # batch, n, dim_v
        return att
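A minimal shape check (the dimensions below are arbitrary):

x = torch.rand(2, 10, 64)        # batch=2, sequence length n=10, dim_in=64
mhsa = MultiHeadSelfAttention(dim_in=64, dim_k=32, dim_v=48, num_heads=8)
print(mhsa(x).shape)             # torch.Size([2, 10, 48])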
3. Image scaling
Steps:
- From the original image and the scale factor, compute the size of the new image and initialize it as a zero matrix.
- Map each pixel (x, y) of the new image back to a position (x', y') in the original image.
- Round (x', y') down to get (xx, yy) and read the values at (xx, yy), (xx+1, yy), (xx, yy+1) and (xx+1, yy+1).
- Use bilinear interpolation to compute the value of pixel (x, y) and write it into the new image.
Bilinear interpolation: decompose each pixel coordinate (x, y) as (i+u, j+v), where i, j are the integer parts and u, v the fractional parts; then f(i+u, j+v) = (1-u)(1-v)·f(i, j) + uv·f(i+1, j+1) + u(1-v)·f(i+1, j) + (1-u)v·f(i, j+1).
OpenCV implementation detail: when mapping a pixel of the new image back to the original image, use SrcX = (dstX + 0.5) * (srcWidth / dstWidth) - 0.5 and SrcY = (dstY + 0.5) * (srcHeight / dstHeight) - 0.5, so that the geometric centers of the original and the new image are aligned. With the naive mapping, shrinking a 5×5 image to 3×3 would map the center pixel (1, 1) of the new image back to (1.67, 1.67) in the original instead of (2, 2).
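A minimal NumPy sketch of the procedure above for a single-channel (gray-scale) image; the function name and the border clamping are my own choices:

import numpy as np

def resize_bilinear(src, dst_h, dst_w):
    src_h, src_w = src.shape
    dst = np.zeros((dst_h, dst_w), dtype=np.float64)
    scale_y, scale_x = src_h / dst_h, src_w / dst_w
    for y in range(dst_h):
        for x in range(dst_w):
            # center-aligned backward mapping (the OpenCV trick above)
            src_y = (y + 0.5) * scale_y - 0.5
            src_x = (x + 0.5) * scale_x - 0.5
            # integer parts (i, j) and fractional parts (u, v)
            i, j = int(np.floor(src_y)), int(np.floor(src_x))
            u, v = src_y - i, src_x - j
            # clamp the four neighbours to the image border
            i0, i1 = np.clip(i, 0, src_h - 1), np.clip(i + 1, 0, src_h - 1)
            j0, j1 = np.clip(j, 0, src_w - 1), np.clip(j + 1, 0, src_w - 1)
            dst[y, x] = ((1 - u) * (1 - v) * src[i0, j0] + u * v * src[i1, j1]
                         + u * (1 - v) * src[i1, j0] + (1 - u) * v * src[i0, j1])
    return dst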
4. Image rotation
Rotation matrix:
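For a counter-clockwise rotation by angle θ about the origin, the standard 2D rotation (the matrix the steps below refer to) is
x' = x·cosθ - y·sinθ
y' = x·sinθ + y·cosθ
i.e. the rotation matrix is [[cosθ, -sinθ], [sinθ, cosθ]].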
Implementation idea:
- Compute min_x and min_y of the rotated image, take (min_x, min_y) (rounded down) as the new coordinate origin, and transform the original image coordinates into this new coordinate system, so that the rotated image does not fall outside the image boundary.
- Initialize a zero matrix for the rotated image and iterate over every point (x, y) in it; map (x, y) back to the original image point (x0, y0) by backward mapping with the inverse of the rotation matrix (np.linalg.inv(a)); as before, split x0 and y0 into integer and fractional parts i+u, j+v and apply bilinear interpolation. This gives the value of every pixel (x, y) of the rotated image; see the sketch below.
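A minimal NumPy sketch of this backward-mapping rotation for a gray-scale image; the function name, the corner-based bounding-box computation and the border handling are my own assumptions:

import numpy as np

def rotate_bilinear(src, theta):
    # rotate a gray-scale image counter-clockwise by theta (radians)
    h, w = src.shape
    rot = np.array([[np.cos(theta), -np.sin(theta)],
                    [np.sin(theta),  np.cos(theta)]])
    # rotate the four corners to find the bounding box of the output image
    corners = np.array([[0, 0], [w - 1, 0], [0, h - 1], [w - 1, h - 1]]).T  # rows are (x, y)
    rotated = rot @ corners
    min_x, min_y = np.floor(rotated.min(axis=1))
    out_w = int(np.ceil(rotated[0].max() - min_x)) + 1
    out_h = int(np.ceil(rotated[1].max() - min_y)) + 1
    dst = np.zeros((out_h, out_w))
    inv = np.linalg.inv(rot)                  # backward mapping: output -> original
    for y in range(out_h):
        for x in range(out_w):
            # shift back to the original coordinate system, then invert the rotation
            x0, y0 = inv @ np.array([x + min_x, y + min_y])
            i, j = int(np.floor(y0)), int(np.floor(x0))
            u, v = y0 - i, x0 - j
            if 0 <= i < h - 1 and 0 <= j < w - 1:
                # bilinear interpolation from the four neighbours
                dst[y, x] = ((1 - u) * (1 - v) * src[i, j] + u * v * src[i + 1, j + 1]
                             + u * (1 - v) * src[i + 1, j] + (1 - u) * v * src[i, j + 1])
    return dst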
5. RoI Pooling implementation details
RoI Pooling quantizes twice while pooling:
The first quantization happens when the RoI is projected onto the feature map: fractional coordinates are rounded to the nearest integer (nearest-neighbour).
The second happens during pooling: when the RoI size is not divisible by the RoI Pooling output size, the fractional part is simply dropped. For example 4/3 = 1.33 becomes 1, so RoI Pooling ends up pooling over 1×2 cells; the pooling itself can be max or average.
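A minimal NumPy sketch of RoI Pooling with the two quantizations (the spatial_scale of 1/16 and the choice of max pooling are typical assumptions of this sketch, and the RoI is assumed to lie inside the feature map):

import numpy as np

def roi_pooling(feat, roi, output_size, spatial_scale=1 / 16.0):
    # feat: (C, H, W) feature map; roi: [x1, y1, x2, y2] in image coordinates
    C, H, W = feat.shape
    # first quantization: project the RoI onto the feature map and round the coordinates
    x1, y1, x2, y2 = [int(round(c * spatial_scale)) for c in roi]
    roi_h = max(y2 - y1 + 1, 1)
    roi_w = max(x2 - x1 + 1, 1)
    # second quantization: the bin size drops its fractional part
    bin_h = max(roi_h // output_size, 1)
    bin_w = max(roi_w // output_size, 1)
    out = np.zeros((C, output_size, output_size))
    for i in range(output_size):
        for j in range(output_size):
            ys, xs = y1 + i * bin_h, x1 + j * bin_w
            # max pooling inside each (bin_h x bin_w) cell
            out[:, i, j] = feat[:, ys:ys + bin_h, xs:xs + bin_w].max(axis=(1, 2))
    return out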
6. RoIAlign implementation details
RoIAlign uses bilinear interpolation to avoid the feature loss caused by quantization:
Divide the RoI evenly into output_size × output_size bins, take four sampling points inside each bin, compute the value of each sampling point by bilinear interpolation, and finally max- or average-pool the four sampling points to obtain the final RoI feature.
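A minimal NumPy sketch of RoIAlign following the description above (again, spatial_scale=1/16, the quarter-point sampling positions and the use of average pooling are assumptions of this sketch):

import numpy as np

def bilinear_sample(feat, y, x):
    # bilinearly sample feat (C, H, W) at the continuous position (y, x)
    C, H, W = feat.shape
    i, j = int(np.floor(y)), int(np.floor(x))
    u, v = y - i, x - j
    i0, i1 = np.clip(i, 0, H - 1), np.clip(i + 1, 0, H - 1)
    j0, j1 = np.clip(j, 0, W - 1), np.clip(j + 1, 0, W - 1)
    return ((1 - u) * (1 - v) * feat[:, i0, j0] + u * v * feat[:, i1, j1]
            + u * (1 - v) * feat[:, i1, j0] + (1 - u) * v * feat[:, i0, j1])

def roi_align(feat, roi, output_size, spatial_scale=1 / 16.0):
    # feat: (C, H, W); roi: [x1, y1, x2, y2] in image coordinates; no rounding anywhere
    C, H, W = feat.shape
    x1, y1, x2, y2 = [c * spatial_scale for c in roi]   # keep fractional coordinates
    bin_h = (y2 - y1) / output_size
    bin_w = (x2 - x1) / output_size
    out = np.zeros((C, output_size, output_size))
    for i in range(output_size):
        for j in range(output_size):
            # 4 sampling points per bin, at the quarter positions
            samples = [bilinear_sample(feat,
                                       y1 + (i + dy) * bin_h,
                                       x1 + (j + dx) * bin_w)
                       for dy in (0.25, 0.75) for dx in (0.25, 0.75)]
            out[:, i, j] = np.mean(samples, axis=0)     # average pooling over the 4 samples
    return out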
7. 2D/3D IoU implementation
# Core idea: the extents of the intersection are
#   inter_h = min(top_y) - max(bottom_y)
#   inter_w = min(right_x) - max(left_x)
def iou_2d(box1, box2):
    '''
    IoU of two 2D boxes.
    Note: the corners are bottom-left and top-right.
    box: [x1, y1, x2, y2]
    '''
    # height and width of the overlap region
    in_h = min(box1[3], box2[3]) - max(box1[1], box2[1])
    in_w = min(box1[2], box2[2]) - max(box1[0], box2[0])
    inter = 0 if in_h < 0 or in_w < 0 else in_h * in_w
    union = (box1[2] - box1[0]) * (box1[3] - box1[1]) + \
            (box2[2] - box2[0]) * (box2[3] - box2[1]) - inter
    iou = inter / union
    return iou
# Same idea: find the corner closest to the origin and the one diagonally opposite
def iou_3d(box1, box2):
    '''
    box: [x1, y1, z1, x2, y2, z2]
    '''
    # volumes of the two boxes
    area1 = (box1[3] - box1[0]) * (box1[4] - box1[1]) * (box1[5] - box1[2])
    area2 = (box2[3] - box2[0]) * (box2[4] - box2[1]) * (box2[5] - box2[2])
    area_sum = area1 + area2
    # the two corners [x1, y1, z1, x2, y2, z2] of the overlapping cuboid
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    z1 = max(box1[2], box2[2])
    x2 = min(box1[3], box2[3])
    y2 = min(box1[4], box2[4])
    z2 = min(box1[5], box2[5])
    if x1 >= x2 or y1 >= y2 or z1 >= z2:
        return 0
    else:
        inter_area = (x2 - x1) * (y2 - y1) * (z2 - z1)
        return inter_area / (area_sum - inter_area)
8. Hand-written NMS
import numpy as np

# from https://github.com/luanshiyinyang/NMS — in my opinion a very concise way to write it
def nms(bboxes, scores, iou_thresh):
    """
    :param bboxes: array of detection boxes, shape (N, 4)
    :param scores: array of confidence scores, shape (N,)
    :param iou_thresh: IoU threshold
    :return: the kept boxes and their scores
    """
    x1 = bboxes[:, 0]
    y1 = bboxes[:, 1]
    x2 = bboxes[:, 2]
    y2 = bboxes[:, 3]
    areas = (y2 - y1) * (x2 - x1)
    # list of kept indices
    result = []
    # sort the boxes by confidence in descending order and keep the indices
    index = scores.argsort()[::-1]
    # for safety, everything below operates on indices
    while index.size > 0:
        # loop while there are boxes left
        i = index[0]
        # keep the box with the highest remaining confidence
        result.append(i)
        # compute the IoU between this box and all remaining boxes
        x11 = np.maximum(x1[i], x1[index[1:]])
        y11 = np.maximum(y1[i], y1[index[1:]])
        x22 = np.minimum(x2[i], x2[index[1:]])
        y22 = np.minimum(y2[i], y2[index[1:]])
        w = np.maximum(0, x22 - x11 + 1)
        h = np.maximum(0, y22 - y11 + 1)
        overlaps = w * h
        ious = overlaps / (areas[i] + areas[index[1:]] - overlaps)
        # keep only the boxes whose IoU is below the threshold
        idx = np.where(ious <= iou_thresh)[0]
        index = index[idx + 1]  # idx is relative to index[1:], so shift by 1
    bboxes, scores = bboxes[result], scores[result]
    return bboxes, scores