注意
点击 此处 下载完整示例代码
6. 深入了解如何在 Kinetics400 上训练 SlowFast 模型¶
这是一个使用 Gluon CV 工具包进行视频动作识别的教程,一个循序渐进的例子。读者应具备深度学习的基础知识,并熟悉 Gluon API。新用户可以先阅读 A 60-minute Gluon Crash Course。您可以 立即开始训练 或 `深入了解`_。
立即开始训练¶
注意
欢迎跳过本教程,因为训练脚本是完整的且可以直接运行。
下载 完整的 Python 脚本: train_recognizer.py
有关更多训练命令选项,请运行 python train_recognizer.py -h
请查看 模型库 获取重现预训练模型的训练命令。
网络结构¶
首先,让我们将必要的库导入 Python。
from __future__ import division
import argparse, time, logging, os, sys, math
import numpy as np
import mxnet as mx
import gluoncv as gcv
from mxnet import gluon, nd, init, context
from mxnet import autograd as ag
from mxnet.gluon import nn
from mxnet.gluon.data.vision import transforms
from gluoncv.data.transforms import video
from gluoncv.data import Kinetics400
from gluoncv.model_zoo import get_model
from gluoncv.utils import makedirs, LRSequential, LRScheduler, split_and_load, TrainingHistory
在本教程中,我们选择一个广泛采用的模型,SlowFast
。 SlowFast 是一种新的 3D 视频分类模型,旨在达到准确性和效率之间的最佳平衡。它提出了两个分支,快速分支和慢速分支,以处理视频中的不同方面。快速分支通过使用许多但小的视频帧来捕捉运动动态。慢速分支通过使用少量但大的视频帧来捕捉精细外观细节。两个分支的特征通过横向连接相结合。
# number of GPUs to use
num_gpus = 1
ctx = [mx.gpu(i) for i in range(num_gpus)]
# Get the model slowfast_4x16_resnet50_kinetics400 with 400 output classes, without pre-trained weights
net = get_model(name='slowfast_4x16_resnet50_kinetics400', nclass=400)
net.collect_params().reset_ctx(ctx)
print(net)
输出
SlowFast(
(fast_conv1): Conv3D(3 -> 8, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False)
(fast_bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=8)
(fast_relu): Activation(relu)
(fast_maxpool): MaxPool3D(size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), ceil_mode=False, global_pool=False, pool_type=max, layout=NCDHW)
(fast_res2): HybridSequential(
(0): Bottleneck(
(conv1): Conv3D(8 -> 8, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=8)
(conv2): Conv3D(8 -> 8, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=8)
(conv3): Conv3D(8 -> 32, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=32)
(relu): Activation(relu)
(downsample): HybridSequential(
(0): Conv3D(8 -> 32, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=32)
)
)
(1): Bottleneck(
(conv1): Conv3D(32 -> 8, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=8)
(conv2): Conv3D(8 -> 8, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=8)
(conv3): Conv3D(8 -> 32, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=32)
(relu): Activation(relu)
)
(2): Bottleneck(
(conv1): Conv3D(32 -> 8, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=8)
(conv2): Conv3D(8 -> 8, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=8)
(conv3): Conv3D(8 -> 32, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=32)
(relu): Activation(relu)
)
)
(fast_res3): HybridSequential(
(0): Bottleneck(
(conv1): Conv3D(32 -> 16, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=16)
(conv2): Conv3D(16 -> 16, kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=16)
(conv3): Conv3D(16 -> 64, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
(relu): Activation(relu)
(downsample): HybridSequential(
(0): Conv3D(32 -> 64, kernel_size=(1, 1, 1), stride=(1, 2, 2), bias=False)
(1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
)
)
(1): Bottleneck(
(conv1): Conv3D(64 -> 16, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=16)
(conv2): Conv3D(16 -> 16, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=16)
(conv3): Conv3D(16 -> 64, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
(relu): Activation(relu)
)
(2): Bottleneck(
(conv1): Conv3D(64 -> 16, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=16)
(conv2): Conv3D(16 -> 16, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=16)
(conv3): Conv3D(16 -> 64, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
(relu): Activation(relu)
)
(3): Bottleneck(
(conv1): Conv3D(64 -> 16, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=16)
(conv2): Conv3D(16 -> 16, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=16)
(conv3): Conv3D(16 -> 64, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
(relu): Activation(relu)
)
)
(fast_res4): HybridSequential(
(0): Bottleneck(
(conv1): Conv3D(64 -> 32, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=32)
(conv2): Conv3D(32 -> 32, kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=32)
(conv3): Conv3D(32 -> 128, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
(relu): Activation(relu)
(downsample): HybridSequential(
(0): Conv3D(64 -> 128, kernel_size=(1, 1, 1), stride=(1, 2, 2), bias=False)
(1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
)
)
(1): Bottleneck(
(conv1): Conv3D(128 -> 32, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=32)
(conv2): Conv3D(32 -> 32, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=32)
(conv3): Conv3D(32 -> 128, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
(relu): Activation(relu)
)
(2): Bottleneck(
(conv1): Conv3D(128 -> 32, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=32)
(conv2): Conv3D(32 -> 32, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=32)
(conv3): Conv3D(32 -> 128, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
(relu): Activation(relu)
)
(3): Bottleneck(
(conv1): Conv3D(128 -> 32, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=32)
(conv2): Conv3D(32 -> 32, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=32)
(conv3): Conv3D(32 -> 128, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
(relu): Activation(relu)
)
(4): Bottleneck(
(conv1): Conv3D(128 -> 32, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=32)
(conv2): Conv3D(32 -> 32, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=32)
(conv3): Conv3D(32 -> 128, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
(relu): Activation(relu)
)
(5): Bottleneck(
(conv1): Conv3D(128 -> 32, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=32)
(conv2): Conv3D(32 -> 32, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=32)
(conv3): Conv3D(32 -> 128, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
(relu): Activation(relu)
)
)
(fast_res5): HybridSequential(
(0): Bottleneck(
(conv1): Conv3D(128 -> 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
(conv2): Conv3D(64 -> 64, kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
(conv3): Conv3D(64 -> 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(relu): Activation(relu)
(downsample): HybridSequential(
(0): Conv3D(128 -> 256, kernel_size=(1, 1, 1), stride=(1, 2, 2), bias=False)
(1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
)
)
(1): Bottleneck(
(conv1): Conv3D(256 -> 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
(conv2): Conv3D(64 -> 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
(conv3): Conv3D(64 -> 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(relu): Activation(relu)
)
(2): Bottleneck(
(conv1): Conv3D(256 -> 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
(conv2): Conv3D(64 -> 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
(conv3): Conv3D(64 -> 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(relu): Activation(relu)
)
)
(lateral_p1): HybridSequential(
(0): Conv3D(8 -> 16, kernel_size=(5, 1, 1), stride=(8, 1, 1), padding=(2, 0, 0), bias=False)
(1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=16)
(2): Activation(relu)
)
(lateral_res2): HybridSequential(
(0): Conv3D(32 -> 64, kernel_size=(5, 1, 1), stride=(8, 1, 1), padding=(2, 0, 0), bias=False)
(1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
(2): Activation(relu)
)
(lateral_res3): HybridSequential(
(0): Conv3D(64 -> 128, kernel_size=(5, 1, 1), stride=(8, 1, 1), padding=(2, 0, 0), bias=False)
(1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
(2): Activation(relu)
)
(lateral_res4): HybridSequential(
(0): Conv3D(128 -> 256, kernel_size=(5, 1, 1), stride=(8, 1, 1), padding=(2, 0, 0), bias=False)
(1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(2): Activation(relu)
)
(slow_conv1): Conv3D(3 -> 64, kernel_size=(1, 7, 7), stride=(1, 2, 2), padding=(0, 3, 3), bias=False)
(slow_bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
(slow_relu): Activation(relu)
(slow_maxpool): MaxPool3D(size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), ceil_mode=False, global_pool=False, pool_type=max, layout=NCDHW)
(slow_res2): HybridSequential(
(0): Bottleneck(
(conv1): Conv3D(80 -> 64, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
(conv2): Conv3D(64 -> 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
(conv3): Conv3D(64 -> 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(relu): Activation(relu)
(downsample): HybridSequential(
(0): Conv3D(80 -> 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
)
)
(1): Bottleneck(
(conv1): Conv3D(256 -> 64, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
(conv2): Conv3D(64 -> 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
(conv3): Conv3D(64 -> 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(relu): Activation(relu)
)
(2): Bottleneck(
(conv1): Conv3D(256 -> 64, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
(conv2): Conv3D(64 -> 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
(conv3): Conv3D(64 -> 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(relu): Activation(relu)
)
)
(slow_res3): HybridSequential(
(0): Bottleneck(
(conv1): Conv3D(320 -> 128, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
(conv2): Conv3D(128 -> 128, kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
(conv3): Conv3D(128 -> 512, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=512)
(relu): Activation(relu)
(downsample): HybridSequential(
(0): Conv3D(320 -> 512, kernel_size=(1, 1, 1), stride=(1, 2, 2), bias=False)
(1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=512)
)
)
(1): Bottleneck(
(conv1): Conv3D(512 -> 128, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
(conv2): Conv3D(128 -> 128, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
(conv3): Conv3D(128 -> 512, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=512)
(relu): Activation(relu)
)
(2): Bottleneck(
(conv1): Conv3D(512 -> 128, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
(conv2): Conv3D(128 -> 128, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
(conv3): Conv3D(128 -> 512, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=512)
(relu): Activation(relu)
)
(3): Bottleneck(
(conv1): Conv3D(512 -> 128, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
(conv2): Conv3D(128 -> 128, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
(conv3): Conv3D(128 -> 512, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=512)
(relu): Activation(relu)
)
)
(slow_res4): HybridSequential(
(0): Bottleneck(
(conv1): Conv3D(640 -> 256, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(conv2): Conv3D(256 -> 256, kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(conv3): Conv3D(256 -> 1024, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=1024)
(relu): Activation(relu)
(downsample): HybridSequential(
(0): Conv3D(640 -> 1024, kernel_size=(1, 1, 1), stride=(1, 2, 2), bias=False)
(1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=1024)
)
)
(1): Bottleneck(
(conv1): Conv3D(1024 -> 256, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(conv2): Conv3D(256 -> 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(conv3): Conv3D(256 -> 1024, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=1024)
(relu): Activation(relu)
)
(2): Bottleneck(
(conv1): Conv3D(1024 -> 256, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(conv2): Conv3D(256 -> 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(conv3): Conv3D(256 -> 1024, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=1024)
(relu): Activation(relu)
)
(3): Bottleneck(
(conv1): Conv3D(1024 -> 256, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(conv2): Conv3D(256 -> 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(conv3): Conv3D(256 -> 1024, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=1024)
(relu): Activation(relu)
)
(4): Bottleneck(
(conv1): Conv3D(1024 -> 256, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(conv2): Conv3D(256 -> 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(conv3): Conv3D(256 -> 1024, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=1024)
(relu): Activation(relu)
)
(5): Bottleneck(
(conv1): Conv3D(1024 -> 256, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(conv2): Conv3D(256 -> 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(conv3): Conv3D(256 -> 1024, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=1024)
(relu): Activation(relu)
)
)
(slow_res5): HybridSequential(
(0): Bottleneck(
(conv1): Conv3D(1280 -> 512, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=512)
(conv2): Conv3D(512 -> 512, kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=512)
(conv3): Conv3D(512 -> 2048, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=2048)
(relu): Activation(relu)
(downsample): HybridSequential(
(0): Conv3D(1280 -> 2048, kernel_size=(1, 1, 1), stride=(1, 2, 2), bias=False)
(1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=2048)
)
)
(1): Bottleneck(
(conv1): Conv3D(2048 -> 512, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=512)
(conv2): Conv3D(512 -> 512, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=512)
(conv3): Conv3D(512 -> 2048, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=2048)
(relu): Activation(relu)
)
(2): Bottleneck(
(conv1): Conv3D(2048 -> 512, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
(bn1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=512)
(conv2): Conv3D(512 -> 512, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
(bn2): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=512)
(conv3): Conv3D(512 -> 2048, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
(bn3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=2048)
(relu): Activation(relu)
)
)
(avg): GlobalAvgPool3D(size=(1, 1, 1), stride=(1, 1, 1), padding=(0, 0, 0), ceil_mode=True, global_pool=True, pool_type=avg, layout=NCDHW)
(dp): Dropout(p = 0.5, axes=())
(fc): Dense(2304 -> 400, linear)
)
数据增强和数据加载器¶
视频的数据增强与图像不同。例如,如果您想随机裁剪视频序列,则需要确保该序列中的所有视频帧都经过相同的裁剪过程。我们提供了一组新的转换函数,适用于多张图像。请查看 video.py 获取更多详情。此处使用的大多数视频数据增强策略在 [Wang15] 中有介绍。
transform_train = transforms.Compose([
# Fix the input video frames size as 256×340 and randomly sample the cropping width and height from
# {256,224,192,168}. After that, resize the cropped regions to 224 × 224.
video.VideoMultiScaleCrop(size=(224, 224), scale_ratios=[1.0, 0.875, 0.75, 0.66]),
# Randomly flip the video frames horizontally
video.VideoRandomHorizontalFlip(),
# Transpose the video frames from height*width*num_channels to num_channels*height*width
# and map values from [0, 255] to [0,1]
video.VideoToTensor(),
# Normalize the video frames with mean and standard deviation calculated across all images
video.VideoNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
有了转换函数,我们可以为我们的训练数据集定义数据加载器。
# Batch Size for Each GPU
per_device_batch_size = 5
# Number of data loader workers
num_workers = 0
# Calculate effective total batch size
batch_size = per_device_batch_size * num_gpus
# Set train=True for training the model.
# ``new_length`` indicates the number of frames we will cover.
# For SlowFast network, we evenly sample 32 frames for the fast branch and 4 frames for the slow branch.
# This leads to the actual input length of 36 video frames.
train_dataset = Kinetics400(train=True, new_length=64, slowfast=True, transform=transform_train)
print('Load %d training samples.' % len(train_dataset))
train_data = gluon.data.DataLoader(train_dataset, batch_size=batch_size,
shuffle=True, num_workers=num_workers)
输出
Load 375 training samples.
优化器、损失函数和评估指标¶
lr_decay = 0.1
warmup_epoch = 34
total_epoch = 196
num_batches = len(train_data)
lr_scheduler = LRSequential([
LRScheduler('linear', base_lr=0.01, target_lr=0.1,
nepochs=warmup_epoch, iters_per_epoch=num_batches),
LRScheduler('cosine', base_lr=0.1, target_lr=0,
nepochs=total_epoch - warmup_epoch,
iters_per_epoch=num_batches,
step_factor=lr_decay, power=2)
])
# Stochastic gradient descent
optimizer = 'sgd'
# Set parameters
optimizer_params = {'learning_rate': 0.01, 'wd': 0.0001, 'momentum': 0.9}
optimizer_params['lr_scheduler'] = lr_scheduler
# Define our trainer for net
trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
为了优化我们的模型,我们需要一个损失函数。对于分类任务,我们通常使用 softmax 交叉熵作为损失函数。
loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
为简单起见,我们使用准确率作为评估指标来监控训练过程。此外,我们记录评估指标值,并在训练结束时打印出来。
train_metric = mx.metric.Accuracy()
train_history = TrainingHistory(['training-acc'])
训练¶
完成所有准备工作后,我们终于可以开始训练了!以下是脚本。
注意
为了快速完成本教程,我们仅在 Kinetics400 的一小部分子集上训练 0 个 epoch,每个 epoch 100 次迭代。在您的实验中,我们建议对完整的 Kinetics400 数据集设置 epochs=100
。
epochs = 0
for epoch in range(epochs):
tic = time.time()
train_metric.reset()
train_loss = 0
# Loop through each batch of training data
for i, batch in enumerate(train_data):
# Extract data and label
data = split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
label = split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
# AutoGrad
with ag.record():
output = []
for _, X in enumerate(data):
X = X.reshape((-1,) + X.shape[2:])
pred = net(X)
output.append(pred)
loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
# Backpropagation
for l in loss:
l.backward()
# Optimize
trainer.step(batch_size)
# Update metrics
train_loss += sum([l.mean().asscalar() for l in loss])
train_metric.update(label, output)
if i == 100:
break
name, acc = train_metric.get()
# Update history and print metrics
train_history.update([acc])
print('[Epoch %d] train=%f loss=%f time: %f' %
(epoch, acc, train_loss / (i+1), time.time()-tic))
# We can plot the metric scores with:
train_history.plot()

由于使用了微小的子集,准确率数值相当低。您可以在完整的 Kinetics400 数据集上 立即开始训练。