1. Hand-written 2D convolution in Python (an ugly but easy-to-memorize version)
import numpy as np

def conv2d(img, in_channels, out_channels, kernels, bias, stride=1, padding=0):
    # img: (N, C, H, W); kernels: (out_channels, in_channels, kh, kw); bias: (out_channels,)
    N, C, H, W = img.shape
    kh, kw = kernels.shape[-2:]
    p = padding
    assert C == in_channels, "kernels' input channels do not match with img"
    if p:
        img = np.pad(img, ((0, 0), (0, 0), (p, p), (p, p)), 'constant')  # zero-pad the spatial axes only
    out_h = (H + 2 * padding - kh) // stride + 1
    out_w = (W + 2 * padding - kw) // stride + 1
    outputs = np.zeros([N, out_channels, out_h, out_w])
    for n in range(N):                      # batch
        for out in range(out_channels):     # output channel
            for i in range(in_channels):    # input channel
                for h in range(out_h):
                    for w in range(out_w):
                        for x in range(kh):
                            for y in range(kw):
                                outputs[n][out][h][w] += img[n][i][h * stride + x][w * stride + y] * kernels[out][i][x][y]
            # add the per-channel bias once, after accumulating over all input channels
            outputs[n][out] += bias[out]
    return outputs
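A quick sanity check against PyTorch's F.conv2d (which computes the same cross-correlation); the shapes below are arbitrary and only meant as a sketch:

import numpy as np
import torch
import torch.nn.functional as F

img = np.random.randn(2, 3, 8, 8)        # (N, C, H, W), chosen arbitrarily
kernels = np.random.randn(4, 3, 3, 3)    # (out_channels, in_channels, kh, kw)
bias = np.random.randn(4)
out_np = conv2d(img, 3, 4, kernels, bias, stride=1, padding=1)
out_pt = F.conv2d(torch.from_numpy(img), torch.from_numpy(kernels),
                  torch.from_numpy(bias), stride=1, padding=1)
print(np.allclose(out_np, out_pt.numpy()))  # expected: True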
2. Hand-written self-attention and multi-head self-attention in PyTorch
- Self-attention
from math import sqrt
import torch
import torch.nn as nn

class SelfAttention(nn.Module):
    dim_in: int
    dim_k: int
    dim_v: int

    def __init__(self, dim_in, dim_k, dim_v):
        super(SelfAttention, self).__init__()
        self.dim_in = dim_in
        self.dim_k = dim_k
        self.dim_v = dim_v
        self.linear_q = nn.Linear(dim_in, dim_k, bias=False)
        self.linear_k = nn.Linear(dim_in, dim_k, bias=False)
        self.linear_v = nn.Linear(dim_in, dim_v, bias=False)
        self._norm_fact = 1 / sqrt(dim_k)

    def forward(self, x):
        # x: batch, n, dim_in
        batch, n, dim_in = x.shape
        assert dim_in == self.dim_in
        q = self.linear_q(x)  # batch, n, dim_k
        k = self.linear_k(x)  # batch, n, dim_k
        v = self.linear_v(x)  # batch, n, dim_v
        dist = torch.bmm(q, k.transpose(1, 2)) * self._norm_fact  # batch, n, n
        dist = torch.softmax(dist, dim=-1)  # batch, n, n
        att = torch.bmm(dist, v)
        return att
- Multi-head self-attention
from math import sqrt
import torch
import torch.nn as nn

class MultiHeadSelfAttention(nn.Module):
    dim_in: int     # input dimension
    dim_k: int      # key and query dimension
    dim_v: int      # value dimension
    num_heads: int  # number of heads, for each head, dim_* = dim_* // num_heads

    def __init__(self, dim_in, dim_k, dim_v, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        assert dim_k % num_heads == 0 and dim_v % num_heads == 0, "dim_k and dim_v must be multiple of num_heads"
        self.dim_in = dim_in
        self.dim_k = dim_k
        self.dim_v = dim_v
        self.num_heads = num_heads
        self.linear_q = nn.Linear(dim_in, dim_k, bias=False)
        self.linear_k = nn.Linear(dim_in, dim_k, bias=False)
        self.linear_v = nn.Linear(dim_in, dim_v, bias=False)
        self._norm_fact = 1 / sqrt(dim_k // num_heads)

    def forward(self, x):
        # x: tensor of shape (batch, n, dim_in)
        batch, n, dim_in = x.shape
        assert dim_in == self.dim_in
        nh = self.num_heads
        dk = self.dim_k // nh  # dim_k of each head
        dv = self.dim_v // nh  # dim_v of each head
        q = self.linear_q(x).reshape(batch, n, nh, dk).transpose(1, 2)  # (batch, nh, n, dk)
        k = self.linear_k(x).reshape(batch, n, nh, dk).transpose(1, 2)  # (batch, nh, n, dk)
        v = self.linear_v(x).reshape(batch, n, nh, dv).transpose(1, 2)  # (batch, nh, n, dv)
        dist = torch.matmul(q, k.transpose(2, 3)) * self._norm_fact  # batch, nh, n, n
        dist = torch.softmax(dist, dim=-1)  # batch, nh, n, n
        att = torch.matmul(dist, v)  # batch, nh, n, dv
        att = att.transpose(1, 2).reshape(batch, n, self.dim_v)  # batch, n, dim_v
        return att
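A minimal shape check (the dimensions below are arbitrary):

x = torch.rand(2, 10, 64)        # batch=2, sequence length n=10, dim_in=64
mhsa = MultiHeadSelfAttention(dim_in=64, dim_k=32, dim_v=48, num_heads=8)
print(mhsa(x).shape)             # torch.Size([2, 10, 48])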
3. Image scaling
Steps:
- From the original image and the scale factor, compute the size of the new image and initialize it as a zero matrix.
- Map each pixel (x, y) of the new image back to a position (x', y') in the original image.
- Round (x', y') down to get (xx, yy) and read the values at (xx, yy), (xx+1, yy), (xx, yy+1) and (xx+1, yy+1).
- Use bilinear interpolation to compute the value of pixel (x, y) and write it into the new image.
Bilinear interpolation: decompose each pixel coordinate (x, y) as (i+u, j+v), where i, j are the integer parts and u, v the fractional parts; then f(i+u, j+v) = (1-u)(1-v)·f(i, j) + uv·f(i+1, j+1) + u(1-v)·f(i+1, j) + (1-u)v·f(i, j+1).
OpenCV implementation detail: when mapping a pixel of the new image back to the original image, use SrcX = (dstX + 0.5) * (srcWidth / dstWidth) - 0.5 and SrcY = (dstY + 0.5) * (srcHeight / dstHeight) - 0.5, so that the geometric centers of the original and the new image are aligned. With the naive mapping, shrinking a 5×5 image to 3×3 would map the center pixel (1, 1) of the new image back to (1.67, 1.67) in the original instead of (2, 2).
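A minimal NumPy sketch of the procedure above for a single-channel (gray-scale) image; the function name and the border clamping are my own choices:

import numpy as np

def resize_bilinear(src, dst_h, dst_w):
    src_h, src_w = src.shape
    dst = np.zeros((dst_h, dst_w), dtype=np.float64)
    scale_y, scale_x = src_h / dst_h, src_w / dst_w
    for y in range(dst_h):
        for x in range(dst_w):
            # center-aligned backward mapping (the OpenCV trick above)
            src_y = (y + 0.5) * scale_y - 0.5
            src_x = (x + 0.5) * scale_x - 0.5
            # integer parts (i, j) and fractional parts (u, v)
            i, j = int(np.floor(src_y)), int(np.floor(src_x))
            u, v = src_y - i, src_x - j
            # clamp the four neighbours to the image border
            i0, i1 = np.clip(i, 0, src_h - 1), np.clip(i + 1, 0, src_h - 1)
            j0, j1 = np.clip(j, 0, src_w - 1), np.clip(j + 1, 0, src_w - 1)
            dst[y, x] = ((1 - u) * (1 - v) * src[i0, j0] + u * v * src[i1, j1]
                         + u * (1 - v) * src[i1, j0] + (1 - u) * v * src[i0, j1])
    return dst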
4. Image rotation
Rotation matrix:
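For a counter-clockwise rotation by angle θ about the origin, the standard 2D rotation (the matrix the steps below refer to) is
x' = x·cosθ - y·sinθ
y' = x·sinθ + y·cosθ
i.e. the rotation matrix is [[cosθ, -sinθ], [sinθ, cosθ]].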
Implementation idea:
- Compute min_x and min_y of the rotated image, take (min_x, min_y) (rounded down) as the new coordinate origin, and transform the original image coordinates into this new coordinate system, so that the rotated image does not fall outside the image boundary.
- Initialize a zero matrix for the rotated image and iterate over every point (x, y) in it; map (x, y) back to the original image point (x0, y0) by backward mapping with the inverse of the rotation matrix (np.linalg.inv(a)); as before, split x0 and y0 into integer and fractional parts i+u, j+v and apply bilinear interpolation. This gives the value of every pixel (x, y) of the rotated image; see the sketch below.
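A minimal NumPy sketch of this backward-mapping rotation for a gray-scale image; the function name, the corner-based bounding-box computation and the border handling are my own assumptions:

import numpy as np

def rotate_bilinear(src, theta):
    # rotate a gray-scale image counter-clockwise by theta (radians)
    h, w = src.shape
    rot = np.array([[np.cos(theta), -np.sin(theta)],
                    [np.sin(theta),  np.cos(theta)]])
    # rotate the four corners to find the bounding box of the output image
    corners = np.array([[0, 0], [w - 1, 0], [0, h - 1], [w - 1, h - 1]]).T  # rows are (x, y)
    rotated = rot @ corners
    min_x, min_y = np.floor(rotated.min(axis=1))
    out_w = int(np.ceil(rotated[0].max() - min_x)) + 1
    out_h = int(np.ceil(rotated[1].max() - min_y)) + 1
    dst = np.zeros((out_h, out_w))
    inv = np.linalg.inv(rot)                  # backward mapping: output -> original
    for y in range(out_h):
        for x in range(out_w):
            # shift back to the original coordinate system, then invert the rotation
            x0, y0 = inv @ np.array([x + min_x, y + min_y])
            i, j = int(np.floor(y0)), int(np.floor(x0))
            u, v = y0 - i, x0 - j
            if 0 <= i < h - 1 and 0 <= j < w - 1:
                # bilinear interpolation from the four neighbours
                dst[y, x] = ((1 - u) * (1 - v) * src[i, j] + u * v * src[i + 1, j + 1]
                             + u * (1 - v) * src[i + 1, j] + (1 - u) * v * src[i, j + 1])
    return dst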
5. RoI Pooling implementation details
RoI Pooling quantizes twice while pooling:
The first quantization happens when the RoI is projected onto the feature map: fractional coordinates are rounded to the nearest integer (nearest-neighbour).
The second happens during pooling: when the RoI size is not divisible by the RoI Pooling output size, the fractional part is simply dropped. For example 4/3 = 1.33 becomes 1, so RoI Pooling ends up pooling over 1×2 cells; the pooling itself can be max or average.
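A minimal NumPy sketch of RoI Pooling with the two quantizations (the spatial_scale of 1/16 and the choice of max pooling are typical assumptions of this sketch, and the RoI is assumed to lie inside the feature map):

import numpy as np

def roi_pooling(feat, roi, output_size, spatial_scale=1 / 16.0):
    # feat: (C, H, W) feature map; roi: [x1, y1, x2, y2] in image coordinates
    C, H, W = feat.shape
    # first quantization: project the RoI onto the feature map and round the coordinates
    x1, y1, x2, y2 = [int(round(c * spatial_scale)) for c in roi]
    roi_h = max(y2 - y1 + 1, 1)
    roi_w = max(x2 - x1 + 1, 1)
    # second quantization: the bin size drops its fractional part
    bin_h = max(roi_h // output_size, 1)
    bin_w = max(roi_w // output_size, 1)
    out = np.zeros((C, output_size, output_size))
    for i in range(output_size):
        for j in range(output_size):
            ys, xs = y1 + i * bin_h, x1 + j * bin_w
            # max pooling inside each (bin_h x bin_w) cell
            out[:, i, j] = feat[:, ys:ys + bin_h, xs:xs + bin_w].max(axis=(1, 2))
    return out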
6. RoIAlign implementation details
RoIAlign uses bilinear interpolation to avoid the feature loss caused by quantization:
Divide the RoI evenly into output_size × output_size bins, take four sampling points inside each bin, compute the value of each sampling point by bilinear interpolation, and finally max- or average-pool the four sampling points to obtain the final RoI feature.
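A minimal NumPy sketch of RoIAlign following the description above (again, spatial_scale=1/16, the quarter-point sampling positions and the use of average pooling are assumptions of this sketch):

import numpy as np

def bilinear_sample(feat, y, x):
    # bilinearly sample feat (C, H, W) at the continuous position (y, x)
    C, H, W = feat.shape
    i, j = int(np.floor(y)), int(np.floor(x))
    u, v = y - i, x - j
    i0, i1 = np.clip(i, 0, H - 1), np.clip(i + 1, 0, H - 1)
    j0, j1 = np.clip(j, 0, W - 1), np.clip(j + 1, 0, W - 1)
    return ((1 - u) * (1 - v) * feat[:, i0, j0] + u * v * feat[:, i1, j1]
            + u * (1 - v) * feat[:, i1, j0] + (1 - u) * v * feat[:, i0, j1])

def roi_align(feat, roi, output_size, spatial_scale=1 / 16.0):
    # feat: (C, H, W); roi: [x1, y1, x2, y2] in image coordinates; no rounding anywhere
    C, H, W = feat.shape
    x1, y1, x2, y2 = [c * spatial_scale for c in roi]   # keep fractional coordinates
    bin_h = (y2 - y1) / output_size
    bin_w = (x2 - x1) / output_size
    out = np.zeros((C, output_size, output_size))
    for i in range(output_size):
        for j in range(output_size):
            # 4 sampling points per bin, at the quarter positions
            samples = [bilinear_sample(feat,
                                       y1 + (i + dy) * bin_h,
                                       x1 + (j + dx) * bin_w)
                       for dy in (0.25, 0.75) for dx in (0.25, 0.75)]
            out[:, i, j] = np.mean(samples, axis=0)     # average pooling over the 4 samples
    return out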
7. 2D/3D IoU implementation
# Core idea: the extents of the intersection are
#   inter_h = min(top_y) - max(bottom_y)
#   inter_w = min(right_x) - max(left_x)
def iou_2d(box1, box2):
    '''
    IoU of two 2D boxes.
    Note: the corners are bottom-left and top-right.
    box: [x1, y1, x2, y2]
    '''
    # height and width of the overlap region
    in_h = min(box1[3], box2[3]) - max(box1[1], box2[1])
    in_w = min(box1[2], box2[2]) - max(box1[0], box2[0])
    inter = 0 if in_h < 0 or in_w < 0 else in_h * in_w
    union = (box1[2] - box1[0]) * (box1[3] - box1[1]) + \
            (box2[2] - box2[0]) * (box2[3] - box2[1]) - inter
    iou = inter / union
    return iou
# Same idea: find the corner closest to the origin and the one diagonally opposite
def iou_3d(box1, box2):
    '''
    box: [x1, y1, z1, x2, y2, z2]
    '''
    # volumes of the two boxes
    area1 = (box1[3] - box1[0]) * (box1[4] - box1[1]) * (box1[5] - box1[2])
    area2 = (box2[3] - box2[0]) * (box2[4] - box2[1]) * (box2[5] - box2[2])
    area_sum = area1 + area2
    # the two corners [x1, y1, z1, x2, y2, z2] of the overlapping cuboid
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    z1 = max(box1[2], box2[2])
    x2 = min(box1[3], box2[3])
    y2 = min(box1[4], box2[4])
    z2 = min(box1[5], box2[5])
    if x1 >= x2 or y1 >= y2 or z1 >= z2:
        return 0
    else:
        inter_area = (x2 - x1) * (y2 - y1) * (z2 - z1)
        return inter_area / (area_sum - inter_area)
8. Hand-written NMS
import numpy as np

# from https://github.com/luanshiyinyang/NMS — in my opinion a very concise way to write it
def nms(bboxes, scores, iou_thresh):
    """
    :param bboxes: array of detection boxes, shape (N, 4)
    :param scores: array of confidence scores, shape (N,)
    :param iou_thresh: IoU threshold
    :return: the kept boxes and their scores
    """
    x1 = bboxes[:, 0]
    y1 = bboxes[:, 1]
    x2 = bboxes[:, 2]
    y2 = bboxes[:, 3]
    areas = (y2 - y1) * (x2 - x1)
    # list of kept indices
    result = []
    # sort the boxes by confidence in descending order and keep the indices
    index = scores.argsort()[::-1]
    # for safety, everything below operates on indices
    while index.size > 0:
        # loop while there are boxes left
        i = index[0]
        # keep the box with the highest remaining confidence
        result.append(i)
        # compute the IoU between this box and all remaining boxes
        x11 = np.maximum(x1[i], x1[index[1:]])
        y11 = np.maximum(y1[i], y1[index[1:]])
        x22 = np.minimum(x2[i], x2[index[1:]])
        y22 = np.minimum(y2[i], y2[index[1:]])
        w = np.maximum(0, x22 - x11 + 1)
        h = np.maximum(0, y22 - y11 + 1)
        overlaps = w * h
        ious = overlaps / (areas[i] + areas[index[1:]] - overlaps)
        # keep only the boxes whose IoU is below the threshold
        idx = np.where(ious <= iou_thresh)[0]
        index = index[idx + 1]  # idx is relative to index[1:], so shift by 1
    bboxes, scores = bboxes[result], scores[result]
    return bboxes, scores