
Dive into Deep Learning · Part 3: SSD Implementation

2024/08/05

Computer Vision · SSD Implementation

Dive into Deep Learning v2 (video) - https://www.bilibili.com/video/BV18p4y1h7Dr
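
All of the code below assumes these imports (a sketch; the original post omits them):

import os

import matplotlib.pyplot as plt
import pandas as pd
import torch
import torchvision
from torch import nn
from torch.utils.data import Dataset, DataLoader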

Bounding Boxes

img = plt.imread('data/dog_cat.png')
plt.imshow(img)

(figure: the dog-and-cat image displayed by imshow)

There are two common ways to represent a bounding box:

  • top-left x, top-left y, bottom-right x, bottom-right y (corner format)
  • center x, center y, width, height (center format)
# Convert between the two representations
def box_center_to_corner(boxes):
    x, y, w, h = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    return torch.stack((x - w/2, y - h/2, x + w/2, y + h/2), axis=-1)

def box_corner_to_center(boxes):
    lx, ly, rx, ry = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    return torch.stack(((lx + rx)/2, (ly + ry)/2, rx - lx, ry - ly), axis=-1)

# Define a few bounding boxes (corner format)
dog_bbox, cat_bbox = [60.0, 45.0, 378.0, 516.0], [400.0, 112.0, 655.0, 493.0]
boxes = torch.tensor((dog_bbox, cat_bbox))

# Round-trip check
box_center_to_corner(box_corner_to_center(boxes)) == boxes
# tensor([[True, True, True, True],
#         [True, True, True, True]])

Drawing Bounding Boxes

def draw_bbox(bbox, color):
    return plt.Rectangle(
        xy=(bbox[0], bbox[1]),
        width=bbox[2] - bbox[0], height=bbox[3] - bbox[1],
        fill=False, edgecolor=color, linewidth=2
    )

fig = plt.imshow(img)
fig.axes.add_patch(draw_bbox(dog_bbox, 'red'))
fig.axes.add_patch(draw_bbox(cat_bbox, 'blue'))

(figure: the image with a red box around the dog and a blue box around the cat)

Dataset

Download: the banana detection dataset

data/banana-detection/
├── bananas_train
│   ├── images/
│   └── label.csv
└── bananas_val
    ├── images/
    └── label.csv

The training and validation sets are already split; the CSV format is:

img_name,label,xmin,ymin,xmax,ymax

Reading the Data

def read_data(is_train=True):
    data_dir = 'data/banana-detection'
    data_dir = os.path.join(data_dir, 'bananas_train' if is_train else 'bananas_val')
    csv_dir = os.path.join(data_dir, 'label.csv')

    # Index by img_name; the remaining columns are the values
    csv_data = pd.read_csv(csv_dir).set_index('img_name')

    images, targets = [], []
    for img_name, target in csv_data.iterrows():
        targets.append(list(target))
        images.append(
            torchvision.io.read_image(
                os.path.join(data_dir, "images", img_name)
            )
        )

    # Shape (num_images, 1, 5); dividing by 256 turns pixel coordinates into fractions
    return images, torch.tensor(targets).unsqueeze(1) / 256


read_data()

Dataset

class BananaDataset(Dataset):
    def __init__(self, is_train) -> None:
        super().__init__()
        self.features, self.labels = read_data(is_train)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx].float(), self.labels[idx]

DataLoader

batch_size = 32
train_loader = DataLoader(BananaDataset(is_train=True), batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(BananaDataset(is_train=False), batch_size, num_workers=4)

# features: torch.Size([32, 3, 256, 256])
# labels:   torch.Size([32, 1, 5])

The loaded images look like this:

x = next(iter(train_loader))
print(x[0].shape)
print(x[1].shape)
images = x[0]  # torch.Size([32, 3, 256, 256])
labels = x[1]  # torch.Size([32, 1, 5])

fig, axes = plt.subplots(2, 5, figsize=(15, 6))
edge_size = 256

for i in range(10):
    # Rearrange from (C, H, W) to (H, W, C);
    # imshow treats float pixels as [0, 1] and integer pixels as [0, 255]
    img = images[i].permute(1, 2, 0).numpy() / 255

    ax = axes[i // 5, i % 5]  # pick the subplot

    # The labels store relative coordinates, so scale by the image size
    ax.add_patch(draw_bbox(labels[i][0][1:5] * edge_size, 'red'))

    ax.imshow(img)  # show the image
plt.show()

(figure: a 2×5 grid of banana images with red ground-truth boxes)

Anchor Boxes

Generation

In SSD, every pixel acts as a center and generates several anchor boxes of different scales and aspect ratios.

Given scales $s = (s_1, s_2, \ldots, s_n)$ and aspect ratios $r = (r_1, r_2, \ldots, r_m)$, the combinations used are:

$(s_1, r_1), (s_1, r_2), \ldots, (s_1, r_m)$

$(s_2, r_1), (s_3, r_1), \ldots, (s_n, r_1)$

i.e. $n + m - 1$ anchors per pixel.

def multibox_prior(data, sizes, ratios):
    in_height, in_width = data.shape[-2:]  # height/width of the feature map
    device, sizes_len, ratios_len = data.device, len(sizes), len(ratios)

    # Anchors generated per pixel
    boxes_per_pixel = sizes_len + ratios_len - 1
    sizes = torch.tensor(sizes, device=device)
    ratios = torch.tensor(ratios, device=device)

    # Unit step between pixel centers
    steps_h, steps_w = 1.0 / in_height, 1.0 / in_width

    # Center coordinates of all pixels:
    # pixel i (i = 0, 1, 2, ...) is centered at (i + 0.5) * step
    center_h = (torch.arange(in_height, device=device) + 0.5) * steps_h
    center_w = (torch.arange(in_width, device=device) + 0.5) * steps_w

    # Pair them up into a grid
    shift_y, shift_x = torch.meshgrid(center_h, center_w, indexing='ij')
    shift_y, shift_x = shift_y.reshape(-1), shift_x.reshape(-1)

    # Width/height combinations of the anchors at a single pixel
    w = torch.cat((sizes * torch.sqrt(ratios[0]), sizes[0] * torch.sqrt(ratios[1:]))) * in_height / in_width  # handle rectangular inputs
    h = torch.cat((sizes / torch.sqrt(ratios[0]), sizes[0] / torch.sqrt(ratios[1:])))

    # Halve to get half-widths/half-heights; repeat once per pixel
    anchor_manipulations = torch.stack((-w, -h, w, h)).T.repeat(in_height * in_width, 1) / 2

    out_grid = torch.stack([shift_x, shift_y, shift_x, shift_y], dim=1).repeat_interleave(boxes_per_pixel, dim=0)
    output = out_grid + anchor_manipulations

    return output.unsqueeze(0)  # add a batch dimension: (1, num_anchors, 4)
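
As a quick sanity check (a sketch), on a 4×4 feature map with one scale and three ratios, each pixel gets $1 + 3 - 1 = 3$ anchors:

fmap = torch.zeros(1, 3, 4, 4)
multibox_prior(fmap, sizes=[0.15], ratios=[1, 2, 0.5]).shape
# torch.Size([1, 48, 4]): 4 * 4 pixels × 3 anchors each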

Drawing them on the image:

img = plt.imread('data/dog_cat.png')
h, w = img.shape[:2]


def show_bboxes(img, anchors, fmap_w, fmap_h):
    fig = plt.imshow(img)
    nums = len(anchors) // (fmap_w * fmap_h)
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w'][:nums]

    for i in range(len(anchors)):
        fig.axes.add_patch(draw_bbox(anchors[i], colors[i % nums]))


def display_anchors(fmap_w, fmap_h, s):
    fmap = torch.zeros((1, 10, fmap_h, fmap_w))  # [batch_size, channels, h, w]

    anchors = multibox_prior(fmap, sizes=s, ratios=[1, 2, 0.5])  # generate anchors for the given s and r
    bbox_scale = torch.tensor((w, h, w, h))
    show_bboxes(img, anchors[0] * bbox_scale, fmap_w, fmap_h)  # scale up to actual pixel coordinates

display_anchors(fmap_w=4, fmap_h=4, s=[0.15])
(figure: anchors centered on a 4×4 grid)

display_anchors(fmap_w=2, fmap_h=2, s=[0.15])
(figure: anchors centered on a 2×2 grid)

display_anchors(fmap_w=1, fmap_h=1, s=[0.8])
(figure: a single set of large anchors covering most of the image)

Intersection over Union (IoU)

Generated anchors need a quality measure: the closer an anchor is to a ground-truth box, the better.

We compare them with intersection over union.
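
For two boxes $A$ and $B$, this is the Jaccard index

$J(A, B) = \dfrac{|A \cap B|}{|A \cup B|}$

which is 0 for disjoint boxes and 1 for identical ones.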

def box_iou(boxes1, boxes2):
    # Lambda expression computing rectangle areas
    box_area = lambda boxes: ((boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]))

    areas1 = box_area(boxes1)  # areas of boxes1
    areas2 = box_area(boxes2)  # areas of boxes2

    # Intersection
    inter_upperlefts = torch.max(boxes1[:, None, :2], boxes2[:, :2])
    inter_lowerrights = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])

    inters = (inter_lowerrights - inter_upperlefts).clamp(min=0)
    inter_areas = inters[:, :, 0] * inters[:, :, 1]
    union_areas = areas1[:, None] + areas2 - inter_areas
    return inter_areas / union_areas
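
For example (a hand-checkable sketch), two 2×2 squares overlapping in a unit square: intersection 1, union $4 + 4 - 1 = 7$:

box_iou(torch.tensor([[0.0, 0.0, 2.0, 2.0]]),
        torch.tensor([[1.0, 1.0, 3.0, 3.0]]))
# tensor([[0.1429]])  (= 1/7)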

Anchor Assignment

Given the ground-truth boxes of a dataset, we assign each anchor to one of them (or to background).

Each anchor is treated as a training example: its class is the class label of its assigned box, and its regression target is the offset from the anchor to that box.

See the theory notes for the exact assignment procedure; a small usage example follows the code below.

(figure: the IoU matrix used for anchor-to-box assignment)
# Assign ground-truth bounding boxes to anchors
def assign_anchor_to_bbox(ground_truth, anchors, device, iou_threshold=0.5):
    num_anchors, num_gt_boxes = anchors.shape[0], ground_truth.shape[0]

    jaccard = box_iou(anchors, ground_truth)
    # For each anchor, the index of its assigned ground-truth box (-1 = none)
    anchors_bbox_map = torch.full((num_anchors,), -1, dtype=torch.long, device=device)
    # Assign by IoU threshold
    max_ious, indices = torch.max(jaccard, dim=1)
    anc_i = torch.nonzero(max_ious >= iou_threshold).reshape(-1)
    box_j = indices[max_ious >= iou_threshold]
    anchors_bbox_map[anc_i] = box_j

    # Then make sure every ground-truth box gets the anchor with the highest IoU
    col_discard = torch.full((num_anchors,), -1)
    row_discard = torch.full((num_gt_boxes,), -1)
    for _ in range(num_gt_boxes):
        max_idx = torch.argmax(jaccard)  # index into the flattened IoU matrix
        box_idx = (max_idx % num_gt_boxes).long()
        anc_idx = torch.div(max_idx, num_gt_boxes, rounding_mode='floor')
        anchors_bbox_map[anc_idx] = box_idx
        jaccard[:, box_idx] = col_discard
        jaccard[anc_idx, :] = row_discard
    return anchors_bbox_map
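
A tiny hand-checkable example (hypothetical boxes in relative coordinates): anchor 1 coincides with the first ground-truth box (IoU 1.0) and anchor 2 overlaps the second heavily (IoU ≈ 0.62), while anchor 0 stays below the threshold:

anchors = torch.tensor([[0.0, 0.0, 0.4, 0.4],
                        [0.1, 0.1, 0.5, 0.5],
                        [0.6, 0.6, 1.0, 1.0]])
gt = torch.tensor([[0.1, 0.1, 0.5, 0.5],
                   [0.55, 0.55, 0.95, 0.95]])
assign_anchor_to_bbox(gt, anchors, device='cpu')
# tensor([-1,  0,  1]): anchor 0 is background, anchor 1 → gt 0, anchor 2 → gt 1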
def offset_boxes(anchors, assigned_bb, eps=1e-6):
    """Transform assigned box pairs into regression offsets"""
    c_anc = box_corner_to_center(anchors)              # to center format
    c_assigned_bb = box_corner_to_center(assigned_bb)  # to center format
    offset_xy = 10 * (c_assigned_bb[:, :2] - c_anc[:, :2]) / c_anc[:, 2:]
    offset_wh = 5 * torch.log(eps + c_assigned_bb[:, 2:] / c_anc[:, 2:])
    return torch.cat([offset_xy, offset_wh], axis=1)
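
Sanity check (a sketch): a box paired with itself should give (near-)zero offsets:

b = torch.tensor([[0.1, 0.1, 0.5, 0.5]])
offset_boxes(b, b)
# ≈ tensor([[0., 0., 0., 0.]]) — xy offsets exactly zero, wh offsets are 5 * log(1 + eps) ≈ 0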

If an anchor is not assigned any ground-truth box, we label it as background.

def multibox_target(anchors, labels):
    """Label anchors with classes and offsets using the ground-truth boxes"""
    batch_size, anchors = labels.shape[0], anchors.squeeze(0)
    batch_offset, batch_mask, batch_class_labels = [], [], []
    device, num_anchors = anchors.device, anchors.shape[0]
    for i in range(batch_size):
        label = labels[i, :, :]
        anchors_bbox_map = assign_anchor_to_bbox(
            label[:, 1:], anchors, device)
        bbox_mask = ((anchors_bbox_map >= 0).float().unsqueeze(-1)).repeat(1, 4)
        # Initialize class labels and assigned box coordinates to zero
        class_labels = torch.zeros(num_anchors, dtype=torch.long,
                                   device=device)
        assigned_bb = torch.zeros((num_anchors, 4), dtype=torch.float32,
                                  device=device)
        # Label the anchor classes from the ground truth;
        # an unassigned anchor is background (class 0)
        indices_true = torch.nonzero(anchors_bbox_map >= 0)
        bb_idx = anchors_bbox_map[indices_true]
        class_labels[indices_true] = label[bb_idx, 0].long() + 1
        assigned_bb[indices_true] = label[bb_idx, 1:]
        # Offset transformation (masked so background anchors contribute zero)
        offset = offset_boxes(anchors, assigned_bb) * bbox_mask
        batch_offset.append(offset.reshape(-1))
        batch_mask.append(bbox_mask.reshape(-1))
        batch_class_labels.append(class_labels)
    bbox_offset = torch.stack(batch_offset)
    bbox_mask = torch.stack(batch_mask)
    class_labels = torch.stack(batch_class_labels)
    return (bbox_offset, bbox_mask, class_labels)

The Network

(figure: the TinySSD architecture — five stages, each producing anchors, class predictions, and offset predictions)

Class Prediction Layer

For each stage, if the feature map is $h \times w$, each pixel generates $a$ anchors, and there are $q$ classes to predict, a fully connected layer would need $hwa(q+1)$ parameters (class 0 is background), which is enormous.

As before, we instead express the per-pixel outputs through channels, keeping the layer convolutional:

  • Input: the input is itself an $h \times w$ feature map, so spatial positions correspond one-to-one
  • Output channels: each anchor needs $q+1$ class scores, so the layer uses $a(q+1)$ channels
nn.Conv2d(num_inputs, num_anchors * (num_classes + 1), kernel_size=3, padding=1)
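
For instance (a sketch with hypothetical numbers: 8 input channels, 5 anchors per pixel, 10 classes), the spatial size is preserved and only the channel count changes:

cls_layer = nn.Conv2d(8, 5 * (10 + 1), kernel_size=3, padding=1)
cls_layer(torch.zeros(2, 8, 20, 20)).shape
# torch.Size([2, 55, 20, 20]) — 55 = 5 * (10 + 1)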

Bounding Box Prediction Layer

Same idea, but each anchor outputs 4 values: the offsets between the anchor and the bounding box.

nn.Conv2d(num_inputs, num_anchors * 4, kernel_size=3, padding=1)

Height/Width-Halving Block (Downsampling)

As the network gets deeper, it keeps downsampling to enlarge the receptive field.

In practice this is a VGG block.

def down_sample_blk(in_channels, out_channels):  # VGG block
    blk = []
    for _ in range(2):
        blk.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
        blk.append(nn.BatchNorm2d(out_channels))
        blk.append(nn.ReLU())
        in_channels = out_channels
    blk.append(nn.MaxPool2d(2))  # halve height and width
    return nn.Sequential(*blk)


block = down_sample_blk(3, 10)
block(torch.zeros(32, 3, 100, 100)).shape
# torch.Size([32, 10, 50, 50]) — successfully halved

Base Network Block

This block performs the initial feature extraction from the image; the later blocks do comparatively little convolution.

We stack three halving blocks, doubling the channel count each time.

def base_net():
    blk = []
    num_filters = [3, 16, 32, 64]  # channel count of each stage
    for i in range(len(num_filters) - 1):
        blk.append(down_sample_blk(num_filters[i], num_filters[i + 1]))
    return nn.Sequential(*blk)

block = base_net()
block(torch.zeros(32, 3, 256, 256)).shape
# torch.Size([32, 64, 32, 32])

Network Skeleton

The network has five blocks in total; each block generates anchors and performs classification and box regression (a shape check follows the code below):

  • base network block: 32×32 feature map
  • halving block (64→128): 16×16
  • halving block (128→128): 8×8
  • halving block (128→128): 4×4
  • global pooling: 1×1
def get_blk(i):
    if i == 0:
        blk = base_net()
    elif i == 1:
        blk = down_sample_blk(64, 128)
    elif i <= 3:
        blk = down_sample_blk(128, 128)
    else:
        blk = nn.AdaptiveMaxPool2d((1, 1))
    return blk
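
A quick shape walkthrough (a sketch) confirms the sizes listed above:

X = torch.zeros(32, 3, 256, 256)
for i in range(5):
    X = get_blk(i)(X)
    print(i, X.shape)
# 0 torch.Size([32, 64, 32, 32])
# 1 torch.Size([32, 128, 16, 16])
# 2 torch.Size([32, 128, 8, 8])
# 3 torch.Size([32, 128, 4, 4])
# 4 torch.Size([32, 128, 1, 1])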

Model

The model produces a lot of output; every block yields:

  • a feature map
  • a set of anchors
  • class predictions
  • offset predictions

The last three are collected across all blocks into the final output.

sizes = [[0.2, 0.272], [0.37, 0.447], [0.54, 0.619], [0.71, 0.79],
         [0.88, 0.961]]
ratios = [[1, 2, 0.5]] * 5
num_anchors = len(sizes[0]) + len(ratios[0]) - 1


# Helpers to flatten and concatenate the per-block predictions
def flatten_pred(pred):
    return torch.flatten(pred.permute(0, 2, 3, 1), start_dim=1)

def concat_preds(preds):
    return torch.cat([flatten_pred(p) for p in preds], dim=1)
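
# Quick shape check (a sketch): maps of different sizes concatenate once flattened
# Y1, Y2 = torch.zeros(2, 55, 20, 20), torch.zeros(2, 33, 10, 10)
# concat_preds([Y1, Y2]).shape  ->  torch.Size([2, 25300])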




class TinySSD(nn.Module):
    def __init__(self, num_classes, **kwargs) -> None:
        super().__init__(**kwargs)
        self.num_classes = num_classes
        self.blk, self.cls, self.predictor = nn.ModuleList(), nn.ModuleList(), nn.ModuleList()

        in_channels = [64, 128, 128, 128, 128]
        # Register every block together with its prediction heads
        for i in range(5):
            self.blk.append(get_blk(i))
            self.cls.append(nn.Conv2d(in_channels[i], num_anchors * (num_classes + 1), kernel_size=3, padding=1))
            self.predictor.append(nn.Conv2d(in_channels[i], num_anchors * 4, kernel_size=3, padding=1))

    def forward(self, X):
        anchors, cls_preds, bbox_preds = [None] * 5, [None] * 5, [None] * 5
        for i in range(5):
            X = self.blk[i](X)
            anchors[i] = multibox_prior(X, sizes=sizes[i], ratios=ratios[i])  # anchors
            cls_preds[i] = self.cls[i](X)         # classification output
            bbox_preds[i] = self.predictor[i](X)  # regression output

        # Collect
        anchors = torch.cat(anchors, dim=1)

        cls_preds = concat_preds(cls_preds)
        cls_preds = cls_preds.reshape(
            cls_preds.shape[0], -1, self.num_classes + 1
        )

        bbox_preds = concat_preds(bbox_preds)

        return anchors, cls_preds, bbox_preds

net = TinySSD(num_classes=1)
X = torch.zeros((32, 3, 256, 256))
anchors, cls_preds, bbox_preds = net(X)

print('output anchors:', anchors.shape)
print('output class preds:', cls_preds.shape)
print('output bbox preds:', bbox_preds.shape)
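
For reference, with a 256×256 input the five maps are 32, 16, 8, 4, and 1 pixels on a side, giving $4 \times (32^2 + 16^2 + 8^2 + 4^2 + 1^2) = 5444$ anchors, so (assuming the batch-dimension fix in multibox_prior above) the prints should show:

output anchors: torch.Size([1, 5444, 4])
output class preds: torch.Size([32, 5444, 2])
output bbox preds: torch.Size([32, 21776])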

Training

  • classification loss: cross-entropy
  • regression loss: L1 norm
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

trainer = torch.optim.SGD(net.parameters(), lr=0.2, weight_decay=5e-4)
cls_loss = nn.CrossEntropyLoss(reduction='none')  # keep per-element losses
bbox_loss = nn.L1Loss(reduction='none')

def calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels, bbox_masks):
    batch_size, num_classes = cls_preds.shape[0], cls_preds.shape[2]
    cls = cls_loss(
        cls_preds.reshape(-1, num_classes),
        cls_labels.reshape(-1)
    ).reshape(batch_size, -1).mean(dim=1)

    bbox = bbox_loss(bbox_preds * bbox_masks, bbox_labels * bbox_masks).mean(dim=1)
    return cls + bbox

# Classification accuracy (count of correct predictions)
def cls_eval(cls_preds, cls_labels):
    return float((cls_preds.argmax(dim=-1).type(cls_labels.dtype) == cls_labels).sum())

# Absolute box-regression error
def bbox_eval(bbox_preds, bbox_labels, bbox_masks):
    return float((torch.abs((bbox_labels - bbox_preds) * bbox_masks)).sum())

After reshaping the predictions, we compute the classification loss and the regression loss separately; their sum is the loss we train on. A training-loop sketch follows.
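
Putting it all together, a minimal training-loop sketch (assuming everything defined above; the epoch count is an arbitrary choice):

num_epochs = 20  # arbitrary
net = net.to(device)
for epoch in range(num_epochs):
    net.train()
    for features, target in train_loader:
        trainer.zero_grad()
        X, Y = features.to(device), target.to(device)
        # forward pass: anchors, class scores, and offset predictions
        anchors, cls_preds, bbox_preds = net(X)
        # label anchors with classes and offsets from the ground truth
        bbox_labels, bbox_masks, cls_labels = multibox_target(anchors, Y)
        # combined loss, averaged over the batch
        l = calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels, bbox_masks)
        l.mean().backward()
        trainer.step()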
