add pare

parent e30a28bff0
commit ad0791fac6
@@ -14,7 +14,7 @@ args:
     key_from_previous: [bbox]
     key_keep: []
     args:
-      ckpt: /nas/home/shuaiqing/Code/EasyMocapPublic/data/models/pose_hrnet_w48_384x288.pth
+      ckpt: data/models/pose_hrnet_w48_384x288.pth
   vis2d:
     module: myeasymocap.io.vis.Vis2D
     skip: False

3  myeasymocap/backbone/pare/backbone/__init__.py  (new file)
@@ -0,0 +1,3 @@
# from .hrnet_pare import *
from .resnet import *
from .mobilenet import *
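
With this package init, the ResNet and MobileNet factories are re-exported (the PARE HRNet import stays commented out), so downstream code can write, for example:

from myeasymocap.backbone.pare.backbone import resnet50, mobilenet_v2
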
631  myeasymocap/backbone/pare/backbone/hrnet.py  (new file)
@@ -0,0 +1,631 @@
# ------------------------------------------------------------------------------
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Bin Xiao (Bin.Xiao@microsoft.com)
# ------------------------------------------------------------------------------
import os

import torch
import torch.nn as nn
try:
    from loguru import logger
except ImportError:
    # `logger` is used below in _check_branches() and init_weights(); fall back
    # to the stdlib so the module stays importable without loguru installed
    import logging
    logger = logging.getLogger(__name__)
import torch.nn.functional as F
from yacs.config import CfgNode as CN

models = [
    'hrnet_w32',
    'hrnet_w48',
]

BN_MOMENTUM = 0.1


def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1,
                               bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion,
                                  momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class HighResolutionModule(nn.Module):
    def __init__(self, num_branches, blocks, num_blocks, num_inchannels,
                 num_channels, fuse_method, multi_scale_output=True):
        super(HighResolutionModule, self).__init__()
        self._check_branches(
            num_branches, blocks, num_blocks, num_inchannels, num_channels)

        self.num_inchannels = num_inchannels
        self.fuse_method = fuse_method
        self.num_branches = num_branches

        self.multi_scale_output = multi_scale_output

        self.branches = self._make_branches(
            num_branches, blocks, num_blocks, num_channels)
        self.fuse_layers = self._make_fuse_layers()
        self.relu = nn.ReLU(True)

    def _check_branches(self, num_branches, blocks, num_blocks,
                        num_inchannels, num_channels):
        if num_branches != len(num_blocks):
            error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format(
                num_branches, len(num_blocks))
            logger.error(error_msg)
            raise ValueError(error_msg)

        if num_branches != len(num_channels):
            error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format(
                num_branches, len(num_channels))
            logger.error(error_msg)
            raise ValueError(error_msg)

        if num_branches != len(num_inchannels):
            error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format(
                num_branches, len(num_inchannels))
            logger.error(error_msg)
            raise ValueError(error_msg)

    def _make_one_branch(self, branch_index, block, num_blocks, num_channels,
                         stride=1):
        downsample = None
        if stride != 1 or \
                self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.num_inchannels[branch_index],
                    num_channels[branch_index] * block.expansion,
                    kernel_size=1, stride=stride, bias=False
                ),
                nn.BatchNorm2d(
                    num_channels[branch_index] * block.expansion,
                    momentum=BN_MOMENTUM
                ),
            )

        layers = []
        layers.append(
            block(
                self.num_inchannels[branch_index],
                num_channels[branch_index],
                stride,
                downsample
            )
        )
        self.num_inchannels[branch_index] = \
            num_channels[branch_index] * block.expansion
        for i in range(1, num_blocks[branch_index]):
            layers.append(
                block(
                    self.num_inchannels[branch_index],
                    num_channels[branch_index]
                )
            )

        return nn.Sequential(*layers)

    def _make_branches(self, num_branches, block, num_blocks, num_channels):
        branches = []

        for i in range(num_branches):
            branches.append(
                self._make_one_branch(i, block, num_blocks, num_channels)
            )

        return nn.ModuleList(branches)

    def _make_fuse_layers(self):
        if self.num_branches == 1:
            return None

        num_branches = self.num_branches
        num_inchannels = self.num_inchannels
        fuse_layers = []
        for i in range(num_branches if self.multi_scale_output else 1):
            fuse_layer = []
            for j in range(num_branches):
                if j > i:
                    fuse_layer.append(
                        nn.Sequential(
                            nn.Conv2d(
                                num_inchannels[j],
                                num_inchannels[i],
                                1, 1, 0, bias=False
                            ),
                            nn.BatchNorm2d(num_inchannels[i]),
                            nn.Upsample(scale_factor=2**(j-i), mode='nearest')
                        )
                    )
                elif j == i:
                    fuse_layer.append(None)
                else:
                    conv3x3s = []
                    for k in range(i-j):
                        if k == i - j - 1:
                            num_outchannels_conv3x3 = num_inchannels[i]
                            conv3x3s.append(
                                nn.Sequential(
                                    nn.Conv2d(
                                        num_inchannels[j],
                                        num_outchannels_conv3x3,
                                        3, 2, 1, bias=False
                                    ),
                                    nn.BatchNorm2d(num_outchannels_conv3x3)
                                )
                            )
                        else:
                            num_outchannels_conv3x3 = num_inchannels[j]
                            conv3x3s.append(
                                nn.Sequential(
                                    nn.Conv2d(
                                        num_inchannels[j],
                                        num_outchannels_conv3x3,
                                        3, 2, 1, bias=False
                                    ),
                                    nn.BatchNorm2d(num_outchannels_conv3x3),
                                    nn.ReLU(True)
                                )
                            )
                    fuse_layer.append(nn.Sequential(*conv3x3s))
            fuse_layers.append(nn.ModuleList(fuse_layer))

        return nn.ModuleList(fuse_layers)

    def get_num_inchannels(self):
        return self.num_inchannels

    def forward(self, x):
        if self.num_branches == 1:
            return [self.branches[0](x[0])]

        for i in range(self.num_branches):
            x[i] = self.branches[i](x[i])

        x_fuse = []

        for i in range(len(self.fuse_layers)):
            y = x[0] if i == 0 else self.fuse_layers[i][0](x[0])
            for j in range(1, self.num_branches):
                if i == j:
                    y = y + x[j]
                else:
                    y = y + self.fuse_layers[i][j](x[j])
            x_fuse.append(self.relu(y))

        return x_fuse


blocks_dict = {
    'BASIC': BasicBlock,
    'BOTTLENECK': Bottleneck
}


class PoseHighResolutionNet(nn.Module):

    def __init__(self, cfg):
        self.inplanes = 64
        extra = cfg['MODEL']['EXTRA']
        super(PoseHighResolutionNet, self).__init__()

        self.cfg = extra

        # stem net
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1,
                               bias=False)
        self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self._make_layer(Bottleneck, 64, 4)

        self.stage2_cfg = extra['STAGE2']
        num_channels = self.stage2_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage2_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))
        ]
        self.transition1 = self._make_transition_layer([256], num_channels)
        self.stage2, pre_stage_channels = self._make_stage(
            self.stage2_cfg, num_channels)

        self.stage3_cfg = extra['STAGE3']
        num_channels = self.stage3_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage3_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))
        ]
        self.transition2 = self._make_transition_layer(
            pre_stage_channels, num_channels)
        self.stage3, pre_stage_channels = self._make_stage(
            self.stage3_cfg, num_channels)

        self.stage4_cfg = extra['STAGE4']
        num_channels = self.stage4_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage4_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))
        ]
        self.transition3 = self._make_transition_layer(
            pre_stage_channels, num_channels)
        self.stage4, pre_stage_channels = self._make_stage(
            self.stage4_cfg, num_channels, multi_scale_output=True)

        self.final_layer = nn.Conv2d(
            in_channels=pre_stage_channels[0],
            out_channels=cfg['MODEL']['NUM_JOINTS'],
            kernel_size=extra['FINAL_CONV_KERNEL'],
            stride=1,
            padding=1 if extra['FINAL_CONV_KERNEL'] == 3 else 0
        )

        self.pretrained_layers = extra['PRETRAINED_LAYERS']

        if extra.DOWNSAMPLE and extra.USE_CONV:
            self.downsample_stage_1 = self._make_downsample_layer(3, num_channel=self.stage2_cfg['NUM_CHANNELS'][0])
            self.downsample_stage_2 = self._make_downsample_layer(2, num_channel=self.stage2_cfg['NUM_CHANNELS'][-1])
            self.downsample_stage_3 = self._make_downsample_layer(1, num_channel=self.stage3_cfg['NUM_CHANNELS'][-1])
        elif not extra.DOWNSAMPLE and extra.USE_CONV:
            self.upsample_stage_2 = self._make_upsample_layer(1, num_channel=self.stage2_cfg['NUM_CHANNELS'][-1])
            self.upsample_stage_3 = self._make_upsample_layer(2, num_channel=self.stage3_cfg['NUM_CHANNELS'][-1])
            self.upsample_stage_4 = self._make_upsample_layer(3, num_channel=self.stage4_cfg['NUM_CHANNELS'][-1])

    def _make_transition_layer(
            self, num_channels_pre_layer, num_channels_cur_layer):
        num_branches_cur = len(num_channels_cur_layer)
        num_branches_pre = len(num_channels_pre_layer)

        transition_layers = []
        for i in range(num_branches_cur):
            if i < num_branches_pre:
                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
                    transition_layers.append(
                        nn.Sequential(
                            nn.Conv2d(
                                num_channels_pre_layer[i],
                                num_channels_cur_layer[i],
                                3, 1, 1, bias=False
                            ),
                            nn.BatchNorm2d(num_channels_cur_layer[i]),
                            nn.ReLU(inplace=True)
                        )
                    )
                else:
                    transition_layers.append(None)
            else:
                conv3x3s = []
                for j in range(i+1-num_branches_pre):
                    inchannels = num_channels_pre_layer[-1]
                    outchannels = num_channels_cur_layer[i] \
                        if j == i-num_branches_pre else inchannels
                    conv3x3s.append(
                        nn.Sequential(
                            nn.Conv2d(
                                inchannels, outchannels, 3, 2, 1, bias=False
                            ),
                            nn.BatchNorm2d(outchannels),
                            nn.ReLU(inplace=True)
                        )
                    )
                transition_layers.append(nn.Sequential(*conv3x3s))

        return nn.ModuleList(transition_layers)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.inplanes, planes * block.expansion,
                    kernel_size=1, stride=stride, bias=False
                ),
                nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def _make_stage(self, layer_config, num_inchannels,
                    multi_scale_output=True):
        num_modules = layer_config['NUM_MODULES']
        num_branches = layer_config['NUM_BRANCHES']
        num_blocks = layer_config['NUM_BLOCKS']
        num_channels = layer_config['NUM_CHANNELS']
        block = blocks_dict[layer_config['BLOCK']]
        fuse_method = layer_config['FUSE_METHOD']

        modules = []
        for i in range(num_modules):
            # multi_scale_output is only used by the last module
            if not multi_scale_output and i == num_modules - 1:
                reset_multi_scale_output = False
            else:
                reset_multi_scale_output = True

            modules.append(
                HighResolutionModule(
                    num_branches,
                    block,
                    num_blocks,
                    num_inchannels,
                    num_channels,
                    fuse_method,
                    reset_multi_scale_output
                )
            )
            num_inchannels = modules[-1].get_num_inchannels()

        return nn.Sequential(*modules), num_inchannels

    def _make_upsample_layer(self, num_layers, num_channel, kernel_size=3):
        layers = []
        for i in range(num_layers):
            layers.append(nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True))
            layers.append(
                nn.Conv2d(
                    in_channels=num_channel, out_channels=num_channel,
                    kernel_size=kernel_size, stride=1, padding=1, bias=False,
                )
            )
            layers.append(nn.BatchNorm2d(num_channel, momentum=BN_MOMENTUM))
            layers.append(nn.ReLU(inplace=True))

        return nn.Sequential(*layers)

    def _make_downsample_layer(self, num_layers, num_channel, kernel_size=3):
        layers = []
        for i in range(num_layers):
            layers.append(
                nn.Conv2d(
                    in_channels=num_channel, out_channels=num_channel,
                    kernel_size=kernel_size, stride=2, padding=1, bias=False,
                )
            )
            layers.append(nn.BatchNorm2d(num_channel, momentum=BN_MOMENTUM))
            layers.append(nn.ReLU(inplace=True))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.layer1(x)

        x_list = []
        for i in range(self.stage2_cfg['NUM_BRANCHES']):
            if self.transition1[i] is not None:
                x_list.append(self.transition1[i](x))
            else:
                x_list.append(x)
        y_list = self.stage2(x_list)

        x_list = []
        for i in range(self.stage3_cfg['NUM_BRANCHES']):
            if self.transition2[i] is not None:
                x_list.append(self.transition2[i](y_list[-1]))
            else:
                x_list.append(y_list[i])
        y_list = self.stage3(x_list)

        x_list = []
        for i in range(self.stage4_cfg['NUM_BRANCHES']):
            if self.transition3[i] is not None:
                x_list.append(self.transition3[i](y_list[-1]))
            else:
                x_list.append(y_list[i])
        x = self.stage4(x_list)

        if self.cfg.DOWNSAMPLE:
            if self.cfg.USE_CONV:
                # Downsampling with strided convolutions
                x1 = self.downsample_stage_1(x[0])
                x2 = self.downsample_stage_2(x[1])
                x3 = self.downsample_stage_3(x[2])
                x = torch.cat([x1, x2, x3, x[3]], 1)
            else:
                # Downsampling with interpolation
                x0_h, x0_w = x[3].size(2), x[3].size(3)
                x1 = F.interpolate(x[0], size=(x0_h, x0_w), mode='bilinear', align_corners=True)
                x2 = F.interpolate(x[1], size=(x0_h, x0_w), mode='bilinear', align_corners=True)
                x3 = F.interpolate(x[2], size=(x0_h, x0_w), mode='bilinear', align_corners=True)
                x = torch.cat([x1, x2, x3, x[3]], 1)
        else:
            if self.cfg.USE_CONV:
                # Upsampling with interpolations + convolutions
                x1 = self.upsample_stage_2(x[1])
                x2 = self.upsample_stage_3(x[2])
                x3 = self.upsample_stage_4(x[3])
                x = torch.cat([x[0], x1, x2, x3], 1)
            else:
                # Upsampling with interpolation
                x0_h, x0_w = x[0].size(2), x[0].size(3)
                x1 = F.interpolate(x[1], size=(x0_h, x0_w), mode='bilinear', align_corners=True)
                x2 = F.interpolate(x[2], size=(x0_h, x0_w), mode='bilinear', align_corners=True)
                x3 = F.interpolate(x[3], size=(x0_h, x0_w), mode='bilinear', align_corners=True)
                x = torch.cat([x[0], x1, x2, x3], 1)

        return x

    def init_weights(self, pretrained=''):
        # logger.info('=> init weights from normal distribution')
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                nn.init.normal_(m.weight, std=0.001)
                for name, _ in m.named_parameters():
                    if name in ['bias']:
                        nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.ConvTranspose2d):
                nn.init.normal_(m.weight, std=0.001)
                for name, _ in m.named_parameters():
                    if name in ['bias']:
                        nn.init.constant_(m.bias, 0)

        if os.path.isfile(pretrained):
            pretrained_state_dict = torch.load(pretrained)
            logger.info('=> loading pretrained model {}'.format(pretrained))

            need_init_state_dict = {}
            for name, m in pretrained_state_dict.items():
                if name.split('.')[0] in self.pretrained_layers \
                        or self.pretrained_layers[0] == '*':  # fixed: `is '*'` -> `== '*'`
                    need_init_state_dict[name] = m
            self.load_state_dict(need_init_state_dict, strict=False)
        elif pretrained:
            # logger.warning('IMPORTANT WARNING!! Please download pre-trained models if you are in TRAINING mode!')
            # raise ValueError('{} is not exist!'.format(pretrained))
            pass


def get_pose_net(cfg, is_train):
    model = PoseHighResolutionNet(cfg)

    if is_train and cfg['MODEL']['INIT_WEIGHTS']:
        model.init_weights(cfg['MODEL']['PRETRAINED'])

    return model


def get_cfg_defaults(pretrained, width=32, downsample=False, use_conv=False):
    # pose_multi_resolution_net related params
    HRNET = CN()
    HRNET.PRETRAINED_LAYERS = [
        'conv1', 'bn1', 'conv2', 'bn2', 'layer1', 'transition1',
        'stage2', 'transition2', 'stage3', 'transition3', 'stage4',
    ]
    HRNET.STEM_INPLANES = 64
    HRNET.FINAL_CONV_KERNEL = 1
    HRNET.STAGE2 = CN()
    HRNET.STAGE2.NUM_MODULES = 1
    HRNET.STAGE2.NUM_BRANCHES = 2
    HRNET.STAGE2.NUM_BLOCKS = [4, 4]
    HRNET.STAGE2.NUM_CHANNELS = [width, width*2]
    HRNET.STAGE2.BLOCK = 'BASIC'
    HRNET.STAGE2.FUSE_METHOD = 'SUM'
    HRNET.STAGE3 = CN()
    HRNET.STAGE3.NUM_MODULES = 4
    HRNET.STAGE3.NUM_BRANCHES = 3
    HRNET.STAGE3.NUM_BLOCKS = [4, 4, 4]
    HRNET.STAGE3.NUM_CHANNELS = [width, width*2, width*4]
    HRNET.STAGE3.BLOCK = 'BASIC'
    HRNET.STAGE3.FUSE_METHOD = 'SUM'
    HRNET.STAGE4 = CN()
    HRNET.STAGE4.NUM_MODULES = 3
    HRNET.STAGE4.NUM_BRANCHES = 4
    HRNET.STAGE4.NUM_BLOCKS = [4, 4, 4, 4]
    HRNET.STAGE4.NUM_CHANNELS = [width, width*2, width*4, width*8]
    HRNET.STAGE4.BLOCK = 'BASIC'
    HRNET.STAGE4.FUSE_METHOD = 'SUM'
    HRNET.DOWNSAMPLE = downsample
    HRNET.USE_CONV = use_conv

    cfg = CN()
    cfg.MODEL = CN()
    cfg.MODEL.INIT_WEIGHTS = True
    cfg.MODEL.PRETRAINED = pretrained  # 'data/pretrained_models/hrnet_w32-36af842e.pth'
    cfg.MODEL.EXTRA = HRNET
    cfg.MODEL.NUM_JOINTS = 24
    return cfg


def hrnet_w32(
        pretrained=True,
        pretrained_ckpt='data/pretrained_models/pose_coco/pose_hrnet_w32_256x192.pth',
        downsample=False,
        use_conv=False,
):
    cfg = get_cfg_defaults(pretrained_ckpt, width=32, downsample=downsample, use_conv=use_conv)
    return get_pose_net(cfg, is_train=True)


def hrnet_w48(
        pretrained=True,
        pretrained_ckpt='data/pretrained_models/pose_coco/pose_hrnet_w48_256x192.pth',
        downsample=False,
        use_conv=False,
):
    cfg = get_cfg_defaults(pretrained_ckpt, width=48, downsample=downsample, use_conv=use_conv)
    return get_pose_net(cfg, is_train=True)
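
A quick shape check for the backbone above; this is a minimal sketch (not part of the commit) that assumes the import path from this repo and skips checkpoint loading by passing an empty path:

import torch
from myeasymocap.backbone.pare.backbone.hrnet import hrnet_w32

# pretrained_ckpt='' makes init_weights() skip loading (os.path.isfile('') is False)
model = hrnet_w32(pretrained_ckpt='', downsample=False, use_conv=False).eval()
with torch.no_grad():
    feat = model(torch.randn(1, 3, 224, 224))
# With DOWNSAMPLE=False and USE_CONV=False, all branches are interpolated to the
# highest resolution (stride 4) and concatenated: 32+64+128+256 = 480 channels
print(feat.shape)  # torch.Size([1, 480, 56, 56])
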
191  myeasymocap/backbone/pare/backbone/mobilenet.py  (new file)
@@ -0,0 +1,191 @@
from torch import nn
try:
    from torch.hub import load_state_dict_from_url
except ImportError:
    from torchvision.models.utils import load_state_dict_from_url


__all__ = ['MobileNetV2', 'mobilenet_v2']


model_urls = {
    'mobilenet_v2': 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth',
}


def _make_divisible(v, divisor, min_value=None):
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8.
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    :param v:
    :param divisor:
    :param min_value:
    :return:
    """
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


class ConvBNReLU(nn.Sequential):
    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1, norm_layer=None):
        padding = (kernel_size - 1) // 2
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        super(ConvBNReLU, self).__init__(
            nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
            norm_layer(out_planes),
            nn.ReLU6(inplace=True)
        )


class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, expand_ratio, norm_layer=None):
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2]

        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        hidden_dim = int(round(inp * expand_ratio))
        self.use_res_connect = self.stride == 1 and inp == oup

        layers = []
        if expand_ratio != 1:
            # pw
            layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1, norm_layer=norm_layer))
        layers.extend([
            # dw
            ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim, norm_layer=norm_layer),
            # pw-linear
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            norm_layer(oup),
        ])
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)


class MobileNetV2(nn.Module):
    def __init__(self,
                 num_classes=1000,
                 width_mult=1.0,
                 inverted_residual_setting=None,
                 round_nearest=8,
                 block=None,
                 norm_layer=None):
        """
        MobileNet V2 main class

        Args:
            num_classes (int): Number of classes
            width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
            inverted_residual_setting: Network structure
            round_nearest (int): Round the number of channels in each layer to be a multiple of this number.
                Set to 1 to turn off rounding
            block: Module specifying inverted residual building block for mobilenet
            norm_layer: Module specifying the normalization layer to use

        """
        super(MobileNetV2, self).__init__()

        if block is None:
            block = InvertedResidual

        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        input_channel = 32
        last_channel = 1280

        if inverted_residual_setting is None:
            inverted_residual_setting = [
                # t, c, n, s
                [1, 16, 1, 1],
                [6, 24, 2, 2],
                [6, 32, 3, 2],
                [6, 64, 4, 2],
                [6, 96, 3, 1],
                [6, 160, 3, 2],
                [6, 320, 1, 1],
            ]

        # only check the first element, assuming user knows t,c,n,s are required
        if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
            raise ValueError("inverted_residual_setting should be non-empty "
                             "or a 4-element list, got {}".format(inverted_residual_setting))

        # building first layer
        input_channel = _make_divisible(input_channel * width_mult, round_nearest)
        self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
        features = [ConvBNReLU(3, input_channel, stride=2, norm_layer=norm_layer)]
        # building inverted residual blocks
        for t, c, n, s in inverted_residual_setting:
            output_channel = _make_divisible(c * width_mult, round_nearest)
            for i in range(n):
                stride = s if i == 0 else 1
                features.append(block(input_channel, output_channel, stride, expand_ratio=t, norm_layer=norm_layer))
                input_channel = output_channel
        # building last several layers
        features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1, norm_layer=norm_layer))
        # make it nn.Sequential
        self.features = nn.Sequential(*features)

        # building classifier
        # self.classifier = nn.Sequential(
        #     nn.Dropout(0.2),
        #     nn.Linear(self.last_channel, num_classes),
        # )

        # weight initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.zeros_(m.bias)

    def _forward_impl(self, x):
        # This exists since TorchScript doesn't support inheritance, so the superclass method
        # (this one) needs to have a name other than `forward` that can be accessed in a subclass
        x = self.features(x)
        # Cannot use "squeeze" as batch-size can be 1 => must use reshape with x.shape[0]
        # x = nn.functional.adaptive_avg_pool2d(x, 1).reshape(x.shape[0], -1)
        # x = self.classifier(x)
        return x

    def forward(self, x):
        return self._forward_impl(x)


def mobilenet_v2(pretrained=False, progress=True, **kwargs):
    """
    Constructs a MobileNetV2 architecture from
    `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>`_.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    model = MobileNetV2(**kwargs)
    if pretrained:
        state_dict = load_state_dict_from_url(model_urls['mobilenet_v2'],
                                              progress=progress)
        model.load_state_dict(state_dict, strict=False)
    return model
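
Since the classifier is commented out, forward() returns the last feature map rather than logits; a minimal sanity-check sketch:

import torch
from myeasymocap.backbone.pare.backbone.mobilenet import mobilenet_v2

backbone = mobilenet_v2(pretrained=False)  # set True to pull ImageNet weights
out = backbone(torch.randn(2, 3, 224, 224))
print(out.shape)  # torch.Size([2, 1280, 7, 7]) -- overall stride 32
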
355  myeasymocap/backbone/pare/backbone/resnet.py  (new file)
@@ -0,0 +1,355 @@
import torch
import torch.nn as nn
try:
    from torch.hub import load_state_dict_from_url
except ImportError:
    from torchvision.models.utils import load_state_dict_from_url

__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152', 'resnext50_32x4d', 'resnext101_32x8d',
           'wide_resnet50_2', 'wide_resnet101_2']


model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
    'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
    'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
    'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth',
    'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth',
}


def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=dilation, groups=groups, bias=False, dilation=dilation)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(BasicBlock, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
    # while original implementation places the stride at the first 1x1 convolution(self.conv1)
    # according to "Deep residual learning for image recognition" https://arxiv.org/abs/1512.03385.
    # This variant is also known as ResNet V1.5 and improves accuracy according to
    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(Bottleneck, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        width = int(planes * (base_width / 64.)) * groups
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super(ResNet, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])
        # self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        # self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def _forward_impl(self, x):
        # See note [TorchScript super()]
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # x = self.avgpool(x)
        # x = torch.flatten(x, 1)
        # x = self.fc(x)

        return x

    def forward(self, x):
        return self._forward_impl(x)


def _resnet(arch, block, layers, pretrained, progress, **kwargs):
    model = ResNet(block, layers, **kwargs)
    if pretrained:
        state_dict = load_state_dict_from_url(model_urls[arch],
                                              progress=progress)
        model.load_state_dict(state_dict, strict=False)
    return model


def resnet18(pretrained=False, progress=True, **kwargs):
    r"""ResNet-18 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
                   **kwargs)


def resnet34(pretrained=False, progress=True, **kwargs):
    r"""ResNet-34 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress,
                   **kwargs)


def resnet50(pretrained=False, progress=True, **kwargs):
    r"""ResNet-50 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress,
                   **kwargs)


def resnet101(pretrained=False, progress=True, **kwargs):
    r"""ResNet-101 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress,
                   **kwargs)


def resnet152(pretrained=False, progress=True, **kwargs):
    r"""ResNet-152 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress,
                   **kwargs)


def resnext50_32x4d(pretrained=False, progress=True, **kwargs):
    r"""ResNeXt-50 32x4d model from
    `"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    kwargs['groups'] = 32
    kwargs['width_per_group'] = 4
    return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3],
                   pretrained, progress, **kwargs)


def resnext101_32x8d(pretrained=False, progress=True, **kwargs):
    r"""ResNeXt-101 32x8d model from
    `"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    kwargs['groups'] = 32
    kwargs['width_per_group'] = 8
    return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3],
                   pretrained, progress, **kwargs)


def wide_resnet50_2(pretrained=False, progress=True, **kwargs):
    r"""Wide ResNet-50-2 model from
    `"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_

    The model is the same as ResNet except for the bottleneck number of channels
    which is twice larger in every block. The number of channels in outer 1x1
    convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
    channels, and in Wide ResNet-50-2 has 2048-1024-2048.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    kwargs['width_per_group'] = 64 * 2
    return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3],
                   pretrained, progress, **kwargs)


def wide_resnet101_2(pretrained=False, progress=True, **kwargs):
    r"""Wide ResNet-101-2 model from
    `"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_

    The model is the same as ResNet except for the bottleneck number of channels
    which is twice larger in every block. The number of channels in outer 1x1
    convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
    channels, and in Wide ResNet-50-2 has 2048-1024-2048.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    kwargs['width_per_group'] = 64 * 2
    return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3],
                   pretrained, progress, **kwargs)
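
With the avgpool/fc head commented out, these factories act as feature extractors; a minimal sketch:

import torch
from myeasymocap.backbone.pare.backbone.resnet import resnet50

backbone = resnet50(pretrained=False)
feat = backbone(torch.randn(1, 3, 224, 224))
# layer4 output at stride 32; channel count matches get_backbone_info('resnet50')
print(feat.shape)  # torch.Size([1, 2048, 7, 7])
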
36  myeasymocap/backbone/pare/backbone/utils.py  (new file)
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

def get_backbone_info(backbone):
    info = {
        'resnet18': {'n_output_channels': 512, 'downsample_rate': 4},
        'resnet34': {'n_output_channels': 512, 'downsample_rate': 4},
        'resnet50': {'n_output_channels': 2048, 'downsample_rate': 4},
        'resnet50_adf_dropout': {'n_output_channels': 2048, 'downsample_rate': 4},
        'resnet50_dropout': {'n_output_channels': 2048, 'downsample_rate': 4},
        'resnet101': {'n_output_channels': 2048, 'downsample_rate': 4},
        'resnet152': {'n_output_channels': 2048, 'downsample_rate': 4},
        'resnext50_32x4d': {'n_output_channels': 2048, 'downsample_rate': 4},
        'resnext101_32x8d': {'n_output_channels': 2048, 'downsample_rate': 4},
        'wide_resnet50_2': {'n_output_channels': 2048, 'downsample_rate': 4},
        'wide_resnet101_2': {'n_output_channels': 2048, 'downsample_rate': 4},
        'mobilenet_v2': {'n_output_channels': 1280, 'downsample_rate': 4},
        'hrnet_w32': {'n_output_channels': 480, 'downsample_rate': 4},
        'hrnet_w48': {'n_output_channels': 720, 'downsample_rate': 4},
        # 'hrnet_w64': {'n_output_channels': 2048, 'downsample_rate': 4},
        'dla34': {'n_output_channels': 512, 'downsample_rate': 4},
    }
    return info[backbone]
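
A lookup table like this typically sizes the first layer of whatever head follows the backbone; a sketch (the head itself is hypothetical, not part of this commit):

from torch import nn
from myeasymocap.backbone.pare.backbone.utils import get_backbone_info

info = get_backbone_info('hrnet_w48')
print(info)  # {'n_output_channels': 720, 'downsample_rate': 4}
# a hypothetical head whose input width follows the chosen backbone:
head = nn.Conv2d(info['n_output_channels'], 256, kernel_size=1)
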
239  myeasymocap/backbone/pare/config.py  (new file)
@@ -0,0 +1,239 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import os
import time
import yaml
import shutil
import argparse
import operator
import itertools
from os.path import join
from functools import reduce
from yacs.config import CfgNode as CN
from typing import Dict, List, Union, Any

# from ..utils.cluster import execute_task_on_cluster

##### CONSTANTS #####
DATASET_NPZ_PATH = 'data/dataset_extras'
DATASET_LMDB_PATH = 'data/lmdb'

MMPOSE_PATH = '/is/cluster/work/mkocabas/projects/mmpose'
MMDET_PATH = '/is/cluster/work/mkocabas/projects/mmdetection'
MMPOSE_CFG = os.path.join(MMPOSE_PATH, 'configs/top_down/hrnet/coco-wholebody/hrnet_w48_coco_wholebody_256x192.py')
MMPOSE_CKPT = os.path.join(MMPOSE_PATH, 'checkpoints/hrnet_w48_coco_wholebody_256x192-643e18cb_20200922.pth')
MMDET_CFG = os.path.join(MMDET_PATH, 'configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py')
MMDET_CKPT = os.path.join(MMDET_PATH, 'checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth')

PW3D_ROOT = 'data/dataset_folders/3dpw'
OH3D_ROOT = 'data/dataset_folders/3doh'

JOINT_REGRESSOR_TRAIN_EXTRA = 'models/pare/data/J_regressor_extra.npy'
JOINT_REGRESSOR_H36M = 'models/pare/data/J_regressor_h36m.npy'
SMPL_MEAN_PARAMS = 'models/pare/data/smpl_mean_params.npz'
SMPL_MODEL_DIR = 'models/pare/data/body_models/smpl'

COCO_OCCLUDERS_FILE = 'data/occlusion_augmentation/coco_train2014_occluders.pkl'
PASCAL_OCCLUDERS_FILE = 'data/occlusion_augmentation/pascal_occluders.pkl'

DATASET_FOLDERS = {
    '3dpw': PW3D_ROOT,
    '3dpw-val': PW3D_ROOT,
    '3dpw-val-cam': PW3D_ROOT,
    '3dpw-test-cam': PW3D_ROOT,
    '3dpw-train-cam': PW3D_ROOT,
    '3dpw-cam': PW3D_ROOT,
    '3dpw-all': PW3D_ROOT,
    '3doh': OH3D_ROOT,
}

DATASET_FILES = [
    # Testing (the two comments were swapped; these entries point at *_test npz files)
    {
        '3dpw-all': join(DATASET_NPZ_PATH, '3dpw_all_test_with_mmpose.npz'),
        '3doh': join(DATASET_NPZ_PATH, '3doh_test.npz'),
    },
    # Training
    {
        '3doh': join(DATASET_NPZ_PATH, '3doh_train.npz'),
        '3dpw': join(DATASET_NPZ_PATH, '3dpw_train.npz'),
    }
]

EVAL_MESH_DATASETS = ['3dpw', '3dpw-val', '3dpw-all', '3doh']

##### CONFIGS #####
hparams = CN()

# General settings
hparams.LOG_DIR = 'logs/experiments'
hparams.METHOD = 'pare'
hparams.EXP_NAME = 'default'
hparams.RUN_TEST = False
hparams.PROJECT_NAME = 'pare'
hparams.SEED_VALUE = -1

hparams.SYSTEM = CN()
hparams.SYSTEM.GPU = ''
hparams.SYSTEM.CLUSTER_NODE = 0.0

# Dataset hparams
hparams.DATASET = CN()
hparams.DATASET.LOAD_TYPE = 'Base'
hparams.DATASET.NOISE_FACTOR = 0.4
hparams.DATASET.ROT_FACTOR = 30
hparams.DATASET.SCALE_FACTOR = 0.25
hparams.DATASET.FLIP_PROB = 0.5
hparams.DATASET.CROP_PROB = 0.0
hparams.DATASET.CROP_FACTOR = 0.0
hparams.DATASET.BATCH_SIZE = 64
hparams.DATASET.NUM_WORKERS = 8
hparams.DATASET.PIN_MEMORY = True
hparams.DATASET.SHUFFLE_TRAIN = True
hparams.DATASET.TRAIN_DS = 'all'
hparams.DATASET.VAL_DS = '3dpw_3doh'
hparams.DATASET.NUM_IMAGES = -1
hparams.DATASET.TRAIN_NUM_IMAGES = -1
hparams.DATASET.TEST_NUM_IMAGES = -1
hparams.DATASET.IMG_RES = 224
hparams.DATASET.USE_HEATMAPS = ''  # 'hm', 'hm_soft', 'part_segm', 'attention'
hparams.DATASET.RENDER_RES = 480
hparams.DATASET.MESH_COLOR = 'pinkish'
hparams.DATASET.FOCAL_LENGTH = 5000.
hparams.DATASET.IGNORE_3D = False
hparams.DATASET.USE_SYNTHETIC_OCCLUSION = False
hparams.DATASET.OCC_AUG_DATASET = 'pascal'
hparams.DATASET.USE_3D_CONF = False
hparams.DATASET.USE_GENDER = False
# This is a bit confusing, but the ratios of the in-the-wild datasets should be
# the same; otherwise the code gets a bit verbose.
hparams.DATASET.DATASETS_AND_RATIOS = 'h36m_mpii_lspet_coco_mpi-inf-3dhp_0.3_0.6_0.6_0.6_0.1'
hparams.DATASET.STAGE_DATASETS = '0+h36m_coco_0.2_0.8 2+h36m_coco_0.4_0.6'
# enable non-parametric representation
hparams.DATASET.NONPARAMETRIC = False

# Optimizer config
hparams.OPTIMIZER = CN()
hparams.OPTIMIZER.TYPE = 'adam'
hparams.OPTIMIZER.LR = 0.0001  # 0.00003
hparams.OPTIMIZER.WD = 0.0

# Training process hparams
hparams.TRAINING = CN()
hparams.TRAINING.RESUME = None
hparams.TRAINING.PRETRAINED = None
hparams.TRAINING.PRETRAINED_LIT = None
hparams.TRAINING.MAX_EPOCHS = 100
hparams.TRAINING.LOG_SAVE_INTERVAL = 50
hparams.TRAINING.LOG_FREQ_TB_IMAGES = 500
hparams.TRAINING.CHECK_VAL_EVERY_N_EPOCH = 1
hparams.TRAINING.RELOAD_DATALOADERS_EVERY_EPOCH = True
hparams.TRAINING.NUM_SMPLIFY_ITERS = 100  # 50
hparams.TRAINING.RUN_SMPLIFY = False
hparams.TRAINING.SMPLIFY_THRESHOLD = 100
hparams.TRAINING.DROPOUT_P = 0.2
hparams.TRAINING.TEST_BEFORE_TRAINING = False
hparams.TRAINING.SAVE_IMAGES = False
hparams.TRAINING.USE_PART_SEGM_LOSS = False
hparams.TRAINING.USE_AMP = False

# Testing process hparams
hparams.TESTING = CN()
hparams.TESTING.SAVE_IMAGES = False
hparams.TESTING.SAVE_FREQ = 1
hparams.TESTING.SAVE_RESULTS = True
hparams.TESTING.SAVE_MESHES = False
hparams.TESTING.SIDEVIEW = True
hparams.TESTING.TEST_ON_TRAIN_END = True
hparams.TESTING.MULTI_SIDEVIEW = False
hparams.TESTING.USE_GT_CAM = False

# PARE method hparams
hparams.PARE = CN()
hparams.PARE.BACKBONE = 'resnet50'  # hrnet_w32-conv, hrnet_w32-interp
hparams.PARE.NUM_JOINTS = 24
hparams.PARE.SOFTMAX_TEMP = 1.
hparams.PARE.NUM_FEATURES_SMPL = 64
hparams.PARE.USE_ATTENTION = False
hparams.PARE.USE_SELF_ATTENTION = False
hparams.PARE.USE_KEYPOINT_ATTENTION = False
hparams.PARE.USE_KEYPOINT_FEATURES_FOR_SMPL_REGRESSION = False
hparams.PARE.USE_POSTCONV_KEYPOINT_ATTENTION = False
hparams.PARE.KEYPOINT_ATTENTION_ACT = 'softmax'
hparams.PARE.USE_SCALE_KEYPOINT_ATTENTION = False
hparams.PARE.USE_FINAL_NONLOCAL = None
hparams.PARE.USE_BRANCH_NONLOCAL = None
hparams.PARE.USE_HMR_REGRESSION = False
hparams.PARE.USE_COATTENTION = False
hparams.PARE.NUM_COATTENTION_ITER = 1
hparams.PARE.COATTENTION_CONV = 'simple'  # 'double_1', 'double_3', 'single_1', 'single_3', 'simple'
hparams.PARE.USE_UPSAMPLING = False
hparams.PARE.DECONV_CONV_KERNEL_SIZE = 4
hparams.PARE.USE_SOFT_ATTENTION = False
hparams.PARE.NUM_BRANCH_ITERATION = 0
hparams.PARE.BRANCH_DEEPER = False
hparams.PARE.NUM_DECONV_LAYERS = 3
hparams.PARE.NUM_DECONV_FILTERS = 256
hparams.PARE.USE_RESNET_CONV_HRNET = False
hparams.PARE.USE_POS_ENC = False

hparams.PARE.ITERATIVE_REGRESSION = False
hparams.PARE.ITER_RESIDUAL = False
hparams.PARE.NUM_ITERATIONS = 3
hparams.PARE.SHAPE_INPUT_TYPE = 'feats.all_pose.shape.cam'
hparams.PARE.POSE_INPUT_TYPE = 'feats.neighbor_pose_feats.all_pose.self_pose.neighbor_pose.shape.cam'

hparams.PARE.POSE_MLP_NUM_LAYERS = 1
hparams.PARE.SHAPE_MLP_NUM_LAYERS = 1
hparams.PARE.POSE_MLP_HIDDEN_SIZE = 256
hparams.PARE.SHAPE_MLP_HIDDEN_SIZE = 256
|
||||||
|
|
||||||
|
hparams.PARE.SHAPE_LOSS_WEIGHT = 0
|
||||||
|
hparams.PARE.KEYPOINT_LOSS_WEIGHT = 5.
|
||||||
|
hparams.PARE.KEYPOINT_NATIVE_LOSS_WEIGHT = 5.
|
||||||
|
hparams.PARE.HEATMAPS_LOSS_WEIGHT = 5.
|
||||||
|
hparams.PARE.SMPL_PART_LOSS_WEIGHT = 1.
|
||||||
|
hparams.PARE.PART_SEGM_LOSS_WEIGHT = 1.
|
||||||
|
hparams.PARE.POSE_LOSS_WEIGHT = 1.
|
||||||
|
hparams.PARE.BETA_LOSS_WEIGHT = 0.001
|
||||||
|
hparams.PARE.OPENPOSE_TRAIN_WEIGHT = 0.
|
||||||
|
hparams.PARE.GT_TRAIN_WEIGHT = 1.
|
||||||
|
hparams.PARE.LOSS_WEIGHT = 60.
|
||||||
|
hparams.PARE.USE_SHAPE_REG = False
|
||||||
|
hparams.PARE.USE_MEAN_CAMSHAPE = False
|
||||||
|
hparams.PARE.USE_MEAN_POSE = False
|
||||||
|
hparams.PARE.INIT_XAVIER = False
|
||||||
|
|
||||||
|
|
||||||
|
def get_hparams_defaults():
|
||||||
|
"""Get a yacs hparamsNode object with default values for my_project."""
|
||||||
|
# Return a clone so that the defaults will not be altered
|
||||||
|
# This is for the "local variable" use pattern
|
||||||
|
return hparams.clone()
|
||||||
|
|
||||||
|
|
||||||
|
def update_hparams(hparams_file):
|
||||||
|
hparams = get_hparams_defaults()
|
||||||
|
hparams.merge_from_file(hparams_file)
|
||||||
|
return hparams.clone()
|
||||||
|
|
||||||
|
|
||||||
|
def update_hparams_from_dict(cfg_dict):
|
||||||
|
hparams = get_hparams_defaults()
|
||||||
|
cfg = hparams.load_cfg(str(cfg_dict))
|
||||||
|
hparams.merge_from_other_cfg(cfg)
|
||||||
|
return hparams.clone()
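
# Usage sketch: load the defaults, then override them from a YAML experiment
# file (the path below is hypothetical; any file with matching keys works):
#   hparams = update_hparams('configs/pare_experiment.yaml')
#   hparams.PARE.BACKBONE    # 'resnet50' unless the YAML overrides it
#   hparams.DATASET.IMG_RES  # 224 by default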
195
myeasymocap/backbone/pare/constants.py
Normal file
@ -0,0 +1,195 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import numpy as np

# Mean and standard deviation for normalizing input image
IMG_NORM_MEAN = [0.485, 0.456, 0.406]
IMG_NORM_STD = [0.229, 0.224, 0.225]

"""
We create a superset of joints containing the OpenPose joints together with the ones that each dataset provides.
We keep a superset of 24 joints such that we include all joints from every dataset.
If a dataset doesn't provide annotations for a specific joint, we simply ignore it.
The joints used here are the following:
"""
JOINT_NAMES = [
    # 25 OpenPose joints (in the order provided by OpenPose)
    'OP Nose',
    'OP Neck',
    'OP RShoulder',
    'OP RElbow',
    'OP RWrist',
    'OP LShoulder',
    'OP LElbow',
    'OP LWrist',
    'OP MidHip',
    'OP RHip',
    'OP RKnee',
    'OP RAnkle',
    'OP LHip',
    'OP LKnee',
    'OP LAnkle',
    'OP REye',
    'OP LEye',
    'OP REar',
    'OP LEar',
    'OP LBigToe',
    'OP LSmallToe',
    'OP LHeel',
    'OP RBigToe',
    'OP RSmallToe',
    'OP RHeel',
    # 24 Ground Truth joints (superset of joints from different datasets)
    'Right Ankle',
    'Right Knee',
    'Right Hip',
    'Left Hip',
    'Left Knee',
    'Left Ankle',
    'Right Wrist',
    'Right Elbow',
    'Right Shoulder',
    'Left Shoulder',
    'Left Elbow',
    'Left Wrist',
    'Neck (LSP)',
    'Top of Head (LSP)',
    'Pelvis (MPII)',
    'Thorax (MPII)',
    'Spine (H36M)',
    'Jaw (H36M)',
    'Head (H36M)',
    'Nose',
    'Left Eye',
    'Right Eye',
    'Left Ear',
    'Right Ear'
]

# Dict containing the joints in numerical order
JOINT_IDS = {JOINT_NAMES[i]: i for i in range(len(JOINT_NAMES))}

# Map joints to SMPL joints
JOINT_MAP = {
    'OP Nose': 24, 'OP Neck': 12, 'OP RShoulder': 17,
    'OP RElbow': 19, 'OP RWrist': 21, 'OP LShoulder': 16,
    'OP LElbow': 18, 'OP LWrist': 20, 'OP MidHip': 0,
    'OP RHip': 2, 'OP RKnee': 5, 'OP RAnkle': 8,
    'OP LHip': 1, 'OP LKnee': 4, 'OP LAnkle': 7,
    'OP REye': 25, 'OP LEye': 26, 'OP REar': 27,
    'OP LEar': 28, 'OP LBigToe': 29, 'OP LSmallToe': 30,
    'OP LHeel': 31, 'OP RBigToe': 32, 'OP RSmallToe': 33, 'OP RHeel': 34,
    'Right Ankle': 8, 'Right Knee': 5, 'Right Hip': 45,
    'Left Hip': 46, 'Left Knee': 4, 'Left Ankle': 7,
    'Right Wrist': 21, 'Right Elbow': 19, 'Right Shoulder': 17,
    'Left Shoulder': 16, 'Left Elbow': 18, 'Left Wrist': 20,
    'Neck (LSP)': 47, 'Top of Head (LSP)': 48,
    'Pelvis (MPII)': 49, 'Thorax (MPII)': 50,
    'Spine (H36M)': 51, 'Jaw (H36M)': 52,
    'Head (H36M)': 53, 'Nose': 24, 'Left Eye': 26,
    'Right Eye': 25, 'Left Ear': 28, 'Right Ear': 27
}

# Joint selectors
# Indices to get the 14 LSP joints from the 17 H36M joints
H36M_TO_J17 = [6, 5, 4, 1, 2, 3, 16, 15, 14, 11, 12, 13, 8, 10, 0, 7, 9]
H36M_TO_J14 = H36M_TO_J17[:14]
# Indices to get the 14 LSP joints from the ground truth joints
J24_TO_J17 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18, 14, 16, 17]
J24_TO_J14 = J24_TO_J17[:14]
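# Example (sketch): these selectors index joint arrays directly, e.g. picking
# the 14 LSP evaluation joints from a hypothetical (24, 3) joint array:
#   lsp14 = joints24[J24_TO_J14]
#   assert lsp14.shape == (14, 3)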

# Permutation of SMPL pose parameters when flipping the shape
SMPL_JOINTS_FLIP_PERM = [0, 2, 1, 3, 5, 4, 6, 8, 7, 9, 11, 10, 12, 14, 13, 15, 17, 16, 19, 18, 21, 20, 23, 22]
SMPL_POSE_FLIP_PERM = []
for i in SMPL_JOINTS_FLIP_PERM:
    SMPL_POSE_FLIP_PERM.append(3*i)
    SMPL_POSE_FLIP_PERM.append(3*i+1)
    SMPL_POSE_FLIP_PERM.append(3*i+2)
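# Sketch of the usual SPIN-style flip built on this permutation: mirroring a
# (72,) axis-angle pose swaps left/right joints and negates y/z components.
#   flipped = pose[SMPL_POSE_FLIP_PERM].copy()
#   flipped[1::3] *= -1  # negate y
#   flipped[2::3] *= -1  # negate z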
# Permutation indices for the 24 ground truth joints
J24_FLIP_PERM = [5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13, 14, 15, 16, 17, 18, 19, 21, 20, 23, 22]
# Permutation indices for the full set of 49 joints
J49_FLIP_PERM = [0, 1, 5, 6, 7, 2, 3, 4, 8, 12, 13, 14, 9, 10, 11, 16, 15, 18, 17, 22, 23, 24, 19, 20, 21]\
              + [25+i for i in J24_FLIP_PERM]

SMPLH_TO_SMPL = np.arange(0, 156).reshape((-1, 3))[
    np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 37])
].reshape(-1)
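# Reading of the index array above (assumption about the SMPL-H layout):
# SMPL-H packs 52 joints (22 body + 15 per hand); keeping the body joints plus
# the first joint of each hand maps its 156-dim pose down to SMPL's 72 dims.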

pw3d_occluded_sequences = [
    'courtyard_backpack',
    'courtyard_basketball',
    'courtyard_bodyScannerMotions',
    'courtyard_box',
    'courtyard_golf',
    'courtyard_jacket',
    'courtyard_laceShoe',
    'downtown_stairs',
    'flat_guitar',
    'flat_packBags',
    'outdoors_climbing',
    'outdoors_crosscountry',
    'outdoors_fencing',
    'outdoors_freestyle',
    'outdoors_golf',
    'outdoors_parcours',
    'outdoors_slalom',
]

pw3d_test_sequences = [
    'flat_packBags_00',
    'downtown_weeklyMarket_00',
    'outdoors_fencing_01',
    'downtown_walkBridge_01',
    'downtown_enterShop_00',
    'downtown_rampAndStairs_00',
    'downtown_bar_00',
    'downtown_runForBus_01',
    'downtown_cafe_00',
    'flat_guitar_01',
    'downtown_runForBus_00',
    'downtown_sitOnStairs_00',
    'downtown_bus_00',
    'downtown_arguing_00',
    'downtown_crossStreets_00',
    'downtown_walkUphill_00',
    'downtown_walking_00',
    'downtown_car_00',
    'downtown_warmWelcome_00',
    'downtown_upstairs_00',
    'downtown_stairs_00',
    'downtown_windowShopping_00',
    'office_phoneCall_00',
    'downtown_downstairs_00'
]

pw3d_cam_sequences = [
    # TEST
    'downtown_downstairs_00',
    'downtown_stairs_00',
    'downtown_rampAndStairs_00',
    'flat_packBags_00',
    'flat_guitar_01',
    'downtown_warmWelcome_00',
    'downtown_walkUphill_00',
    # VALIDATION
    'outdoors_parcours_01',
    'outdoors_crosscountry_00',
    'outdoors_freestyle_01',
    'downtown_walkDownhill_00',
    'outdoors_parcours_00',
]
4
myeasymocap/backbone/pare/head/__init__.py
Normal file
@ -0,0 +1,4 @@
from .pare_head import PareHead
from .hmr_head import HMRHead
# from .smpl_head import SMPLHead
# from .smpl_cam_head import SMPLCamHead
203
myeasymocap/backbone/pare/head/hmr_head.py
Normal file
@ -0,0 +1,203 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import math
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F  # required by the uncertainty activation below

from ..config import SMPL_MEAN_PARAMS
from ..utils.geometry import rot6d_to_rotmat, rotmat_to_rot6d

BN_MOMENTUM = 0.1


class HMRHead(nn.Module):
    def __init__(
            self,
            num_input_features,
            smpl_mean_params=SMPL_MEAN_PARAMS,
            estimate_var=False,
            use_separate_var_branch=False,
            uncertainty_activation='',
            backbone='resnet50',
            use_cam_feats=False,
    ):
        super(HMRHead, self).__init__()

        npose = 24 * 6
        self.npose = npose
        self.estimate_var = estimate_var
        self.use_separate_var_branch = use_separate_var_branch
        self.uncertainty_activation = uncertainty_activation
        self.backbone = backbone
        self.num_input_features = num_input_features
        self.use_cam_feats = use_cam_feats

        if use_cam_feats:
            num_input_features += 7  # 6d rotmat + vfov

        self.avgpool = nn.AdaptiveAvgPool2d(1)  # nn.AvgPool2d(7, stride=1)
        self.fc1 = nn.Linear(num_input_features + npose + 13, 1024)
        self.drop1 = nn.Dropout()
        self.fc2 = nn.Linear(1024, 1024)
        self.drop2 = nn.Dropout()

        if self.estimate_var:
            # estimate variance for pose and shape parameters
            if self.use_separate_var_branch:
                # Decouple var estimation layer using separate linear layers
                self.decpose = nn.Linear(1024, npose)
                self.decshape = nn.Linear(1024, 10)
                self.deccam = nn.Linear(1024, 3)
                self.decpose_var = nn.Linear(1024, npose)
                self.decshape_var = nn.Linear(1024, 10)
                nn.init.xavier_uniform_(self.decpose_var.weight, gain=0.01)
                nn.init.xavier_uniform_(self.decshape_var.weight, gain=0.01)
            else:
                # double the output sizes to estimate var
                self.decpose = nn.Linear(1024, npose * 2)
                self.decshape = nn.Linear(1024, 10 * 2)
                self.deccam = nn.Linear(1024, 3)
        else:
            self.decpose = nn.Linear(1024, npose)
            self.decshape = nn.Linear(1024, 10)
            self.deccam = nn.Linear(1024, 3)

        nn.init.xavier_uniform_(self.decpose.weight, gain=0.01)
        nn.init.xavier_uniform_(self.decshape.weight, gain=0.01)
        nn.init.xavier_uniform_(self.deccam.weight, gain=0.01)

        if self.backbone.startswith('hrnet'):
            self.downsample_module = self._make_head()

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

        mean_params = np.load(smpl_mean_params)
        init_pose = torch.from_numpy(mean_params['pose'][:]).unsqueeze(0)
        init_shape = torch.from_numpy(mean_params['shape'][:].astype('float32')).unsqueeze(0)
        init_cam = torch.from_numpy(mean_params['cam']).unsqueeze(0)
        self.register_buffer('init_pose', init_pose)
        self.register_buffer('init_shape', init_shape)
        self.register_buffer('init_cam', init_cam)

    def _make_head(self):
        # downsampling modules
        downsamp_modules = []
        for i in range(3):
            in_channels = self.num_input_features
            out_channels = self.num_input_features

            downsamp_module = nn.Sequential(
                nn.Conv2d(in_channels=in_channels,
                          out_channels=out_channels,
                          kernel_size=3,
                          stride=2,
                          padding=1),
                nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM),
                nn.ReLU(inplace=True)
            )

            downsamp_modules.append(downsamp_module)

        downsamp_modules = nn.Sequential(*downsamp_modules)

        return downsamp_modules

    def forward(
            self,
            features,
            init_pose=None,
            init_shape=None,
            init_cam=None,
            cam_rotmat=None,
            cam_vfov=None,
            n_iter=3
    ):
        # if self.backbone.startswith('hrnet'):
        #     features = self.downsample_module(features)

        batch_size = features.shape[0]

        if init_pose is None:
            init_pose = self.init_pose.expand(batch_size, -1)
        if init_shape is None:
            init_shape = self.init_shape.expand(batch_size, -1)
        if init_cam is None:
            init_cam = self.init_cam.expand(batch_size, -1)

        xf = self.avgpool(features)
        xf = xf.view(xf.size(0), -1)

        pred_pose = init_pose
        pred_shape = init_shape
        pred_cam = init_cam
        for i in range(n_iter):
            if self.use_cam_feats:
                xc = torch.cat([xf, pred_pose, pred_shape, pred_cam,
                                rotmat_to_rot6d(cam_rotmat), cam_vfov.unsqueeze(-1)], 1)
            else:
                xc = torch.cat([xf, pred_pose, pred_shape, pred_cam], 1)
            xc = self.fc1(xc)
            xc = self.drop1(xc)
            xc = self.fc2(xc)
            xc = self.drop2(xc)
            if self.estimate_var:
                pred_pose = self.decpose(xc)[:, :self.npose] + pred_pose
                pred_shape = self.decshape(xc)[:, :10] + pred_shape
                pred_cam = self.deccam(xc) + pred_cam

                if self.use_separate_var_branch:
                    pred_pose_var = self.decpose_var(xc)
                    pred_shape_var = self.decshape_var(xc)
                else:
                    pred_pose_var = self.decpose(xc)[:, self.npose:]
                    pred_shape_var = self.decshape(xc)[:, 10:]

                if self.uncertainty_activation != '':
                    # Use an activation layer to output uncertainty; getattr
                    # looks up the named activation (e.g. 'softplus') on
                    # torch.nn.functional, like the original eval() call did
                    pred_pose_var = getattr(F, self.uncertainty_activation)(pred_pose_var)
                    pred_shape_var = getattr(F, self.uncertainty_activation)(pred_shape_var)
            else:
                pred_pose = self.decpose(xc) + pred_pose
                pred_shape = self.decshape(xc) + pred_shape
                pred_cam = self.deccam(xc) + pred_cam

        pred_rotmat = rot6d_to_rotmat(pred_pose).view(batch_size, 24, 3, 3)

        output = {
            'pred_pose': pred_rotmat,
            'pred_cam': pred_cam,
            'pred_shape': pred_shape,
            'pred_pose_6d': pred_pose,
        }

        if self.estimate_var:
            output.update({
                'pred_pose_var': torch.cat([pred_pose, pred_pose_var], dim=1),
                'pred_shape_var': torch.cat([pred_shape, pred_shape_var], dim=1),
            })

        return output


def keep_variance(x, min_variance):
    return x + min_variance
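
A minimal forward-pass sketch for HMRHead, assuming ResNet-50 features of shape [N, 2048, 7, 7] and that the SMPL mean-params file referenced by the config exists on disk:

    import torch
    head = HMRHead(num_input_features=2048)
    out = head(torch.randn(2, 2048, 7, 7))
    print(out['pred_pose'].shape)   # torch.Size([2, 24, 3, 3])
    print(out['pred_shape'].shape)  # torch.Size([2, 10])
    print(out['pred_cam'].shape)    # torch.Size([2, 3])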
926
myeasymocap/backbone/pare/head/pare_head.py
Normal file
@ -0,0 +1,926 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

from ..config import SMPL_MEAN_PARAMS
from ..layers.coattention import CoAttention
from ..utils.geometry import rot6d_to_rotmat, get_coord_maps
from ..utils.kp_utils import get_smpl_neighbor_triplets
from ..layers.softargmax import softargmax2d, get_heatmap_preds
from ..layers import LocallyConnected2d, KeypointAttention, interpolate
from ..layers.non_local import dot_product
from ..backbone.resnet import conv3x3, conv1x1, BasicBlock


class logger:
    # Minimal no-op stand-in for loguru's logger; the stubs below cover every
    # logger.* call made in this module (info, warning and debug).
    @staticmethod
    def info(*args, **kwargs):
        pass

    @staticmethod
    def warning(*args, **kwargs):
        pass

    @staticmethod
    def debug(*args, **kwargs):
        pass


BN_MOMENTUM = 0.1


class PareHead(nn.Module):
    def __init__(
            self,
            num_joints,
            num_input_features,
            softmax_temp=1.0,
            num_deconv_layers=3,
            num_deconv_filters=(256, 256, 256),
            num_deconv_kernels=(4, 4, 4),
            num_camera_params=3,
            num_features_smpl=64,
            final_conv_kernel=1,
            iterative_regression=False,
            iter_residual=False,
            num_iterations=3,
            shape_input_type='feats',  # 'feats.pose.shape.cam'
            pose_input_type='feats',  # 'feats.neighbor_pose_feats.all_pose.self_pose.neighbor_pose.shape.cam'
            pose_mlp_num_layers=1,
            shape_mlp_num_layers=1,
            pose_mlp_hidden_size=256,
            shape_mlp_hidden_size=256,
            use_keypoint_features_for_smpl_regression=False,
            use_heatmaps='',
            use_keypoint_attention=False,
            use_postconv_keypoint_attention=False,
            keypoint_attention_act='softmax',
            use_scale_keypoint_attention=False,
            use_branch_nonlocal=None,  # 'concatenation', 'dot_product', 'embedded_gaussian', 'gaussian'
            use_final_nonlocal=None,  # 'concatenation', 'dot_product', 'embedded_gaussian', 'gaussian'
            backbone='resnet',
            use_hmr_regression=False,
            use_coattention=False,
            num_coattention_iter=1,
            coattention_conv='simple',  # 'double_1', 'double_3', 'single_1', 'single_3', 'simple'
            use_upsampling=False,
            use_soft_attention=False,  # Stefan & Otmar 3DV style attention
            num_branch_iteration=0,
            branch_deeper=False,
            use_resnet_conv_hrnet=False,
            use_position_encodings=None,
            use_mean_camshape=False,
            use_mean_pose=False,
            init_xavier=False,
    ):
        super(PareHead, self).__init__()
        self.backbone = backbone
        self.num_joints = num_joints
        self.deconv_with_bias = False
        self.use_heatmaps = use_heatmaps
        self.num_iterations = num_iterations
        self.use_final_nonlocal = use_final_nonlocal
        self.use_branch_nonlocal = use_branch_nonlocal
        self.use_hmr_regression = use_hmr_regression
        self.use_coattention = use_coattention
        self.num_coattention_iter = num_coattention_iter
        self.coattention_conv = coattention_conv
        self.use_soft_attention = use_soft_attention
        self.num_branch_iteration = num_branch_iteration
        self.iter_residual = iter_residual
        self.iterative_regression = iterative_regression
        self.pose_mlp_num_layers = pose_mlp_num_layers
        self.shape_mlp_num_layers = shape_mlp_num_layers
        self.pose_mlp_hidden_size = pose_mlp_hidden_size
        self.shape_mlp_hidden_size = shape_mlp_hidden_size
        self.use_keypoint_attention = use_keypoint_attention
        self.use_keypoint_features_for_smpl_regression = use_keypoint_features_for_smpl_regression
        self.use_position_encodings = use_position_encodings
        self.use_mean_camshape = use_mean_camshape
        self.use_mean_pose = use_mean_pose

        self.num_input_features = num_input_features

        if use_soft_attention:
            # These options should be True by default when soft attention is used
            self.use_keypoint_features_for_smpl_regression = True
            self.use_hmr_regression = True
            self.use_coattention = False
            logger.warning('Coattention cannot be used together with soft attention')
            logger.warning('Overriding use_coattention=False')

        if use_coattention:
            self.use_keypoint_features_for_smpl_regression = False
            logger.warning('"use_keypoint_features_for_smpl_regression" cannot be used together with co-attention')
            logger.warning('Overriding "use_keypoint_features_for_smpl_regression"=False')

        if use_hmr_regression:
            self.iterative_regression = False
            logger.warning('iterative_regression cannot be used together with hmr regression')

        if self.use_heatmaps in ['part_segm', 'attention']:
            logger.info('"Keypoint Attention" should be activated to be able to use part segmentation')
            logger.info('Overriding use_keypoint_attention')
            self.use_keypoint_attention = True

        assert num_iterations > 0, '"num_iterations" should be greater than 0.'

        if use_position_encodings:
            assert backbone.startswith('hrnet'), 'backbone should be hrnet to use position encodings'
            # self.pos_enc = get_coord_maps(size=56)
            self.register_buffer('pos_enc', get_coord_maps(size=56))
            num_input_features += 2
            self.num_input_features = num_input_features

        if backbone.startswith('hrnet'):
            if use_resnet_conv_hrnet:
                logger.info('Using resnet block for keypoint and smpl conv layers...')
                self.keypoint_deconv_layers = self._make_res_conv_layers(
                    input_channels=self.num_input_features,
                    num_channels=num_deconv_filters[-1],
                    num_basic_blocks=num_deconv_layers,
                )
                self.num_input_features = num_input_features
                self.smpl_deconv_layers = self._make_res_conv_layers(
                    input_channels=self.num_input_features,
                    num_channels=num_deconv_filters[-1],
                    num_basic_blocks=num_deconv_layers,
                )
            else:
                self.keypoint_deconv_layers = self._make_conv_layer(
                    num_deconv_layers,
                    num_deconv_filters,
                    (3,)*num_deconv_layers,
                )
                self.num_input_features = num_input_features
                self.smpl_deconv_layers = self._make_conv_layer(
                    num_deconv_layers,
                    num_deconv_filters,
                    (3,)*num_deconv_layers,
                )
        else:
            # part branch that estimates 2d keypoints

            conv_fn = self._make_upsample_layer if use_upsampling else self._make_deconv_layer

            if use_upsampling:
                logger.info('Upsampling is active to increase spatial dimension')
                logger.info(f'Upsampling conv kernels: {num_deconv_kernels}')

            self.keypoint_deconv_layers = conv_fn(
                num_deconv_layers,
                num_deconv_filters,
                num_deconv_kernels,
            )
            # reset inplanes to 2048 -> final resnet layer
            self.num_input_features = num_input_features
            self.smpl_deconv_layers = conv_fn(
                num_deconv_layers,
                num_deconv_filters,
                num_deconv_kernels,
            )

        pose_mlp_inp_dim = num_deconv_filters[-1]
        smpl_final_dim = num_features_smpl
        shape_mlp_inp_dim = num_joints * smpl_final_dim

        if self.use_soft_attention:
            logger.info('Soft attention (Stefan & Otmar 3DV) is active')
            self.keypoint_final_layer = nn.Sequential(
                conv3x3(num_deconv_filters[-1], 256),
                nn.BatchNorm2d(256),
                nn.ReLU(inplace=True),
                conv1x1(256, num_joints+1 if self.use_heatmaps in ('part_segm', 'part_segm_pool') else num_joints),
            )

            soft_att_feature_size = smpl_final_dim  # if use_hmr_regression else pose_mlp_inp_dim
            self.smpl_final_layer = nn.Sequential(
                conv3x3(num_deconv_filters[-1], 256),
                nn.BatchNorm2d(256),
                nn.ReLU(inplace=True),
                conv1x1(256, soft_att_feature_size),
            )
            # pose_mlp_inp_dim = soft_att_feature_size
        else:
            self.keypoint_final_layer = nn.Conv2d(
                in_channels=num_deconv_filters[-1],
                out_channels=num_joints+1 if self.use_heatmaps in ('part_segm', 'part_segm_pool') else num_joints,
                kernel_size=final_conv_kernel,
                stride=1,
                padding=1 if final_conv_kernel == 3 else 0,
            )

            self.smpl_final_layer = nn.Conv2d(
                in_channels=num_deconv_filters[-1],
                out_channels=smpl_final_dim,
                kernel_size=final_conv_kernel,
                stride=1,
                padding=1 if final_conv_kernel == 3 else 0,
            )

        # temperature for softargmax function
        self.register_buffer('temperature', torch.tensor(softmax_temp))

        # if self.iterative_regression or self.num_branch_iteration > 0 or self.use_coattention:
        mean_params = np.load(SMPL_MEAN_PARAMS)
        init_pose = torch.from_numpy(mean_params['pose'][:]).unsqueeze(0)
        init_shape = torch.from_numpy(mean_params['shape'][:].astype('float32')).unsqueeze(0)
        init_cam = torch.from_numpy(mean_params['cam']).unsqueeze(0)
        self.register_buffer('init_pose', init_pose)
        self.register_buffer('init_shape', init_shape)
        self.register_buffer('init_cam', init_cam)

        if self.iterative_regression:
            # enable iterative regression similar to HMR
            # these are the features that can be used as input to final MLPs
            input_type_dim = {
                'feats': 0,  # image features for self
                'neighbor_pose_feats': 2 * 256,  # image features from neighbor joints
                'all_pose': 24 * 6,  # rot6d of all joints from previous iter
                'self_pose': 6,  # rot6d of self
                'neighbor_pose': 2 * 6,  # rot6d of neighbor joints from previous iter
                'shape': 10,  # smpl betas/shape
                'cam': num_camera_params,  # weak perspective camera
            }

            assert 'feats' in shape_input_type, '"feats" should be the default value'
            assert 'feats' in pose_input_type, '"feats" should be the default value'

            self.shape_input_type = shape_input_type.split('.')
            self.pose_input_type = pose_input_type.split('.')

            pose_mlp_inp_dim = pose_mlp_inp_dim + sum([input_type_dim[x] for x in self.pose_input_type])
            shape_mlp_inp_dim = shape_mlp_inp_dim + sum([input_type_dim[x] for x in self.shape_input_type])

            logger.debug(f'Shape MLP takes "{self.shape_input_type}" as input, '
                         f'input dim: {shape_mlp_inp_dim}')
            logger.debug(f'Pose MLP takes "{self.pose_input_type}" as input, '
                         f'input dim: {pose_mlp_inp_dim}')

        self.pose_mlp_inp_dim = pose_mlp_inp_dim
        self.shape_mlp_inp_dim = shape_mlp_inp_dim

        if self.use_hmr_regression:
            logger.info('HMR regression is active...')
            # enable iterative regression similar to HMR

            self.fc1 = nn.Linear(num_joints * smpl_final_dim + (num_joints * 6) + 10 + num_camera_params, 1024)
            self.drop1 = nn.Dropout()
            self.fc2 = nn.Linear(1024, 1024)
            self.drop2 = nn.Dropout()
            self.decpose = nn.Linear(1024, (num_joints * 6))
            self.decshape = nn.Linear(1024, 10)
            self.deccam = nn.Linear(1024, num_camera_params)

            nn.init.xavier_uniform_(self.decpose.weight, gain=0.01)
            nn.init.xavier_uniform_(self.decshape.weight, gain=0.01)
            nn.init.xavier_uniform_(self.deccam.weight, gain=0.01)
        else:
            # here we use 2 different MLPs to estimate shape and camera
            # They take a channelwise downsampled version of smpl features
            self.shape_mlp = self._get_shape_mlp(output_size=10)
            self.cam_mlp = self._get_shape_mlp(output_size=num_camera_params)

            # for pose each joint has a separate MLP
            # weights for these MLPs are not shared
            # hence we use Locally Connected layers
            # TODO support kernel_size > 1 to access context of other joints
            self.pose_mlp = self._get_pose_mlp(num_joints=num_joints, output_size=6)

            if init_xavier:
                nn.init.xavier_uniform_(self.shape_mlp.weight, gain=0.01)
                nn.init.xavier_uniform_(self.cam_mlp.weight, gain=0.01)
                nn.init.xavier_uniform_(self.pose_mlp.weight, gain=0.01)

        if self.use_branch_nonlocal:
            logger.info(f'Branch nonlocal is active, type {self.use_branch_nonlocal}')
            self.branch_2d_nonlocal = eval(self.use_branch_nonlocal).NONLocalBlock2D(
                in_channels=num_deconv_filters[-1],
                sub_sample=False,
                bn_layer=True,
            )

            self.branch_3d_nonlocal = eval(self.use_branch_nonlocal).NONLocalBlock2D(
                in_channels=num_deconv_filters[-1],
                sub_sample=False,
                bn_layer=True,
            )

        if self.use_final_nonlocal:
            logger.info(f'Final nonlocal is active, type {self.use_final_nonlocal}')
            self.final_pose_nonlocal = eval(self.use_final_nonlocal).NONLocalBlock1D(
                in_channels=self.pose_mlp_inp_dim,
                sub_sample=False,
                bn_layer=True,
            )

            self.final_shape_nonlocal = eval(self.use_final_nonlocal).NONLocalBlock1D(
                in_channels=num_features_smpl,
                sub_sample=False,
                bn_layer=True,
            )

        if self.use_keypoint_attention:
            logger.info('Keypoint attention is active')
            self.keypoint_attention = KeypointAttention(
                use_conv=use_postconv_keypoint_attention,
                in_channels=(self.pose_mlp_inp_dim, smpl_final_dim),
                out_channels=(self.pose_mlp_inp_dim, smpl_final_dim),
                act=keypoint_attention_act,
                use_scale=use_scale_keypoint_attention,
            )

        if self.use_coattention:
            logger.info(f'Coattention is active, final conv type {self.coattention_conv}')
            self.coattention = CoAttention(n_channel=num_deconv_filters[-1], final_conv=self.coattention_conv)

        if self.num_branch_iteration > 0:
            logger.info('Branch iteration is active')
            if branch_deeper:
                self.branch_iter_2d_nonlocal = nn.Sequential(
                    conv3x3(num_deconv_filters[-1], 256),
                    nn.BatchNorm2d(256),
                    nn.ReLU(inplace=True),
                    dot_product.NONLocalBlock2D(
                        in_channels=num_deconv_filters[-1],
                        sub_sample=False,
                        bn_layer=True,
                    )
                )

                self.branch_iter_3d_nonlocal = nn.Sequential(
                    conv3x3(num_deconv_filters[-1], 256),
                    nn.BatchNorm2d(256),
                    nn.ReLU(inplace=True),
                    dot_product.NONLocalBlock2D(
                        in_channels=num_deconv_filters[-1],
                        sub_sample=False,
                        bn_layer=True,
                    )
                )
            else:
                self.branch_iter_2d_nonlocal = dot_product.NONLocalBlock2D(
                    in_channels=num_deconv_filters[-1],
                    sub_sample=False,
                    bn_layer=True,
                )

                self.branch_iter_3d_nonlocal = dot_product.NONLocalBlock2D(
                    in_channels=num_deconv_filters[-1],
                    sub_sample=False,
                    bn_layer=True,
                )

    def _get_shape_mlp(self, output_size):
        if self.shape_mlp_num_layers == 1:
            return nn.Linear(self.shape_mlp_inp_dim, output_size)

        module_list = []
        for i in range(self.shape_mlp_num_layers):
            if i == 0:
                module_list.append(
                    nn.Linear(self.shape_mlp_inp_dim, self.shape_mlp_hidden_size)
                )
            elif i == self.shape_mlp_num_layers - 1:
                module_list.append(
                    nn.Linear(self.shape_mlp_hidden_size, output_size)
                )
            else:
                module_list.append(
                    nn.Linear(self.shape_mlp_hidden_size, self.shape_mlp_hidden_size)
                )
        return nn.Sequential(*module_list)

    def _get_pose_mlp(self, num_joints, output_size):
        if self.pose_mlp_num_layers == 1:
            return LocallyConnected2d(
                in_channels=self.pose_mlp_inp_dim,
                out_channels=output_size,
                output_size=[num_joints, 1],
                kernel_size=1,
                stride=1,
            )

        module_list = []
        for i in range(self.pose_mlp_num_layers):
            if i == 0:
                module_list.append(
                    LocallyConnected2d(
                        in_channels=self.pose_mlp_inp_dim,
                        out_channels=self.pose_mlp_hidden_size,
                        output_size=[num_joints, 1],
                        kernel_size=1,
                        stride=1,
                    )
                )
            elif i == self.pose_mlp_num_layers - 1:
                module_list.append(
                    LocallyConnected2d(
                        in_channels=self.pose_mlp_hidden_size,
                        out_channels=output_size,
                        output_size=[num_joints, 1],
                        kernel_size=1,
                        stride=1,
                    )
                )
            else:
                module_list.append(
                    LocallyConnected2d(
                        in_channels=self.pose_mlp_hidden_size,
                        out_channels=self.pose_mlp_hidden_size,
                        output_size=[num_joints, 1],
                        kernel_size=1,
                        stride=1,
                    )
                )
        return nn.Sequential(*module_list)

    def _get_deconv_cfg(self, deconv_kernel):
        if deconv_kernel == 4:
            padding = 1
            output_padding = 0
        elif deconv_kernel == 3:
            padding = 1
            output_padding = 1
        elif deconv_kernel == 2:
            padding = 0
            output_padding = 0

        return deconv_kernel, padding, output_padding
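
    # Note: each (kernel, padding, output_padding) triple above makes a
    # stride-2 ConvTranspose2d exactly double the spatial size, since
    #   out = (in - 1) * 2 - 2 * padding + kernel + output_padding = 2 * in
    # holds for (4, 1, 0), (3, 1, 1) and (2, 0, 0) alike.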

    def _make_conv_layer(self, num_layers, num_filters, num_kernels):
        assert num_layers == len(num_filters), \
            'ERROR: num_conv_layers does not match len(num_conv_filters)'
        assert num_layers == len(num_kernels), \
            'ERROR: num_conv_layers does not match len(num_conv_kernels)'
        layers = []
        for i in range(num_layers):
            kernel, padding, output_padding = \
                self._get_deconv_cfg(num_kernels[i])

            planes = num_filters[i]
            layers.append(
                nn.Conv2d(
                    in_channels=self.num_input_features,
                    out_channels=planes,
                    kernel_size=kernel,
                    stride=1,
                    padding=padding,
                    bias=self.deconv_with_bias))
            layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM))
            layers.append(nn.ReLU(inplace=True))
            self.num_input_features = planes

        return nn.Sequential(*layers)

    def _make_res_conv_layers(self, input_channels, num_channels=64,
                              num_heads=1, num_basic_blocks=2):
        head_layers = []

        # kernel_sizes, strides, paddings = self._get_trans_cfg()
        # for kernel_size, padding, stride in zip(kernel_sizes, paddings, strides):
        head_layers.append(nn.Sequential(
            nn.Conv2d(
                in_channels=input_channels,
                out_channels=num_channels,
                kernel_size=3,
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(num_channels, momentum=BN_MOMENTUM),
            nn.ReLU(inplace=True))
        )

        for i in range(num_heads):
            layers = []
            for _ in range(num_basic_blocks):
                layers.append(nn.Sequential(BasicBlock(num_channels, num_channels)))
            head_layers.append(nn.Sequential(*layers))

        # head_layers.append(nn.Conv2d(in_channels=num_channels, out_channels=output_channels,
        #                              kernel_size=1, stride=1, padding=0))

        return nn.Sequential(*head_layers)

    def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
        assert num_layers == len(num_filters), \
            'ERROR: num_deconv_layers does not match len(num_deconv_filters)'
        assert num_layers == len(num_kernels), \
            'ERROR: num_deconv_layers does not match len(num_deconv_kernels)'

        layers = []
        for i in range(num_layers):
            kernel, padding, output_padding = \
                self._get_deconv_cfg(num_kernels[i])

            planes = num_filters[i]
            layers.append(
                nn.ConvTranspose2d(
                    in_channels=self.num_input_features,
                    out_channels=planes,
                    kernel_size=kernel,
                    stride=2,
                    padding=padding,
                    output_padding=output_padding,
                    bias=self.deconv_with_bias))
            layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM))
            layers.append(nn.ReLU(inplace=True))
            # if self.use_self_attention:
            #     layers.append(SelfAttention(planes))
            self.num_input_features = planes

        return nn.Sequential(*layers)

    def _make_upsample_layer(self, num_layers, num_filters, num_kernels):
        assert num_layers == len(num_filters), \
            'ERROR: num_layers does not match len(num_filters)'
        assert num_layers == len(num_kernels), \
            'ERROR: num_layers does not match len(num_kernels)'

        layers = []
        for i in range(num_layers):
            kernel, padding, output_padding = \
                self._get_deconv_cfg(num_kernels[i])

            planes = num_filters[i]
            layers.append(nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True))
            layers.append(
                nn.Conv2d(in_channels=self.num_input_features, out_channels=planes,
                          kernel_size=kernel, stride=1, padding=padding, bias=self.deconv_with_bias)
            )
            layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM))
            layers.append(nn.ReLU(inplace=True))
            # if self.use_self_attention:
            #     layers.append(SelfAttention(planes))
            self.num_input_features = planes

        return nn.Sequential(*layers)

    def _prepare_pose_mlp_inp(self, feats, pred_pose, pred_shape, pred_cam):
        # feats shape: [N, 256, J, 1]
        # pose shape: [N, 6, J, 1]
        # cam shape: [N, 3]
        # beta shape: [N, 10]
        batch_size, num_joints = pred_pose.shape[0], pred_pose.shape[2]

        joint_triplets = get_smpl_neighbor_triplets()

        inp_list = []

        for inp_type in self.pose_input_type:
            if inp_type == 'feats':
                # add image features
                inp_list.append(feats)

            if inp_type == 'neighbor_pose_feats':
                # add the image features from neighboring joints
                n_pose_feat = []
                for jt in joint_triplets:
                    n_pose_feat.append(
                        feats[:, :, jt[1:]].reshape(batch_size, -1, 1).unsqueeze(-2)
                    )
                n_pose_feat = torch.cat(n_pose_feat, 2)
                inp_list.append(n_pose_feat)

            if inp_type == 'self_pose':
                # add the predicted pose of the joint itself
                inp_list.append(pred_pose)

            if inp_type == 'all_pose':
                # append all of the joint angles
                all_pose = pred_pose.reshape(batch_size, -1, 1)[..., None].repeat(1, 1, num_joints, 1)
                inp_list.append(all_pose)

            if inp_type == 'neighbor_pose':
                # append only the joint angles of neighboring ones
                n_pose = []
                for jt in joint_triplets:
                    n_pose.append(
                        pred_pose[:, :, jt[1:]].reshape(batch_size, -1, 1).unsqueeze(-2)
                    )
                n_pose = torch.cat(n_pose, 2)
                inp_list.append(n_pose)

            if inp_type == 'shape':
                # append shape predictions
                pred_shape = pred_shape[..., None, None].repeat(1, 1, num_joints, 1)
                inp_list.append(pred_shape)

            if inp_type == 'cam':
                # append camera predictions
                pred_cam = pred_cam[..., None, None].repeat(1, 1, num_joints, 1)
                inp_list.append(pred_cam)

        assert len(inp_list) > 0

        return torch.cat(inp_list, 1)

    def _prepare_shape_mlp_inp(self, feats, pred_pose, pred_shape, pred_cam):
        # feats shape: [N, 256, J, 1]
        # pose shape: [N, 6, J, 1]
        # cam shape: [N, 3]
        # beta shape: [N, 10]
        batch_size, num_joints = pred_pose.shape[:2]

        inp_list = []

        for inp_type in self.shape_input_type:
            if inp_type == 'feats':
                # add image features
                inp_list.append(feats)

            if inp_type == 'all_pose':
                # append all of the joint angles
                pred_pose = pred_pose.reshape(batch_size, -1)
                inp_list.append(pred_pose)

            if inp_type == 'shape':
                # append shape predictions
                inp_list.append(pred_shape)

            if inp_type == 'cam':
                # append camera predictions
                inp_list.append(pred_cam)

        assert len(inp_list) > 0

        return torch.cat(inp_list, 1)

    def forward(self, features, gt_segm=None):
        batch_size = features.shape[0]

        init_pose = self.init_pose.expand(batch_size, -1)  # N, Jx6
        init_shape = self.init_shape.expand(batch_size, -1)
        init_cam = self.init_cam.expand(batch_size, -1)

        if self.use_position_encodings:
            features = torch.cat((features, self.pos_enc.repeat(features.shape[0], 1, 1, 1)), 1)

        output = {}

        ############## 2D PART BRANCH FEATURES ##############
        part_feats = self._get_2d_branch_feats(features)

        ############## GET PART ATTENTION MAP ##############
        part_attention = self._get_part_attention_map(part_feats, output)

        ############## 3D SMPL BRANCH FEATURES ##############
        smpl_feats = self._get_3d_smpl_feats(features, part_feats)

        ############## SAMPLE LOCAL FEATURES ##############
        if gt_segm is not None:
            # use the ground-truth part segmentation as the attention map
            gt_segm = F.interpolate(gt_segm.unsqueeze(1).float(), scale_factor=(1/4, 1/4), mode='nearest').long().squeeze(1)
            part_attention = F.one_hot(gt_segm.to('cpu'), num_classes=self.num_joints + 1).permute(0, 3, 1, 2).float()[:, 1:, :, :]
            # follow the input device rather than hardcoding 'cuda'
            part_attention = part_attention.to(features.device)
            # part_attention = F.interpolate(part_attention, scale_factor=1/4, mode='bilinear', align_corners=True)
        point_local_feat, cam_shape_feats = self._get_local_feats(smpl_feats, part_attention, output)

        ############## GET FINAL PREDICTIONS ##############
        pred_pose, pred_shape, pred_cam = self._get_final_preds(
            point_local_feat, cam_shape_feats, init_pose, init_shape, init_cam
        )

        if self.use_coattention:
            for c in range(self.num_coattention_iter):
                smpl_feats, part_feats = self.coattention(smpl_feats, part_feats)
                part_attention = self._get_part_attention_map(part_feats, output)
                point_local_feat, cam_shape_feats = self._get_local_feats(smpl_feats, part_attention, output)
                pred_pose, pred_shape, pred_cam = self._get_final_preds(
                    point_local_feat, cam_shape_feats, pred_pose, pred_shape, pred_cam
                )

        if self.num_branch_iteration > 0:
            for nbi in range(self.num_branch_iteration):
                if self.use_soft_attention:
                    smpl_feats = self.branch_iter_3d_nonlocal(smpl_feats)
                    part_feats = self.branch_iter_2d_nonlocal(part_feats)
                else:
                    smpl_feats = self.branch_iter_3d_nonlocal(smpl_feats)
                    part_feats = smpl_feats

                part_attention = self._get_part_attention_map(part_feats, output)
                point_local_feat, cam_shape_feats = self._get_local_feats(smpl_feats, part_attention, output)
                pred_pose, pred_shape, pred_cam = self._get_final_preds(
                    point_local_feat, cam_shape_feats, pred_pose, pred_shape, pred_cam,
                )

        pred_rotmat = rot6d_to_rotmat(pred_pose).reshape(batch_size, 24, 3, 3)

        output.update({
            'pred_pose': pred_rotmat,
            'pred_cam': pred_cam,
            'pred_shape': pred_shape,
        })
        return output
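
    # Usage sketch (assumes the repo's layer modules and the SMPL mean-params
    # file are available; the 8x8 map stands in for ResNet backbone features):
    #   head = PareHead(num_joints=24, num_input_features=2048)
    #   out = head(torch.randn(2, 2048, 8, 8))
    #   out['pred_pose']: [2, 24, 3, 3]; out['pred_shape']: [2, 10]; out['pred_cam']: [2, 3]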

    def _get_local_feats(self, smpl_feats, part_attention, output):
        cam_shape_feats = self.smpl_final_layer(smpl_feats)

        if self.use_keypoint_attention:
            point_local_feat = self.keypoint_attention(smpl_feats, part_attention)
            cam_shape_feats = self.keypoint_attention(cam_shape_feats, part_attention)
        else:
            point_local_feat = interpolate(smpl_feats, output['pred_kp2d'])
            cam_shape_feats = interpolate(cam_shape_feats, output['pred_kp2d'])
        return point_local_feat, cam_shape_feats

    def _get_2d_branch_feats(self, features):
        part_feats = self.keypoint_deconv_layers(features)
        if self.use_branch_nonlocal:
            part_feats = self.branch_2d_nonlocal(part_feats)
        return part_feats

    def _get_3d_smpl_feats(self, features, part_feats):
        if self.use_keypoint_features_for_smpl_regression:
            smpl_feats = part_feats
        else:
            smpl_feats = self.smpl_deconv_layers(features)
            if self.use_branch_nonlocal:
                smpl_feats = self.branch_3d_nonlocal(smpl_feats)

        return smpl_feats

    def _get_part_attention_map(self, part_feats, output):
        heatmaps = self.keypoint_final_layer(part_feats)

        if self.use_heatmaps == 'hm':
            # returns coords between [-1,1]
            pred_kp2d, confidence = get_heatmap_preds(heatmaps)
            output['pred_kp2d'] = pred_kp2d
            output['pred_kp2d_conf'] = confidence
            output['pred_heatmaps_2d'] = heatmaps
        elif self.use_heatmaps == 'hm_soft':
            pred_kp2d, _ = softargmax2d(heatmaps, self.temperature)
            output['pred_kp2d'] = pred_kp2d
            output['pred_heatmaps_2d'] = heatmaps
        elif self.use_heatmaps == 'part_segm':
            output['pred_segm_mask'] = heatmaps
            heatmaps = heatmaps[:, 1:, :, :]  # remove the first channel which encodes the background
        elif self.use_heatmaps == 'part_segm_pool':
            output['pred_segm_mask'] = heatmaps
            heatmaps = heatmaps[:, 1:, :, :]  # remove the first channel which encodes the background
            pred_kp2d, _ = softargmax2d(heatmaps, self.temperature)  # get_heatmap_preds(heatmaps)
            output['pred_kp2d'] = pred_kp2d

            for k, v in output.items():
                if torch.any(torch.isnan(v)):
                    logger.debug(f'{k} is NaN!')
                if torch.any(torch.isinf(v)):
                    logger.debug(f'{k} is Inf!')
        elif self.use_heatmaps == 'attention':
            output['pred_attention'] = heatmaps
        else:
            # returns coords between [-1,1]
            pred_kp2d, _ = softargmax2d(heatmaps, self.temperature)
            output['pred_kp2d'] = pred_kp2d
            output['pred_heatmaps_2d'] = heatmaps
        return heatmaps

    def _get_final_preds(self, pose_feats, cam_shape_feats, init_pose, init_shape, init_cam):
        if self.use_hmr_regression:
            return self._hmr_get_final_preds(cam_shape_feats, init_pose, init_shape, init_cam)
        else:
            return self._pare_get_final_preds(pose_feats, cam_shape_feats, init_pose, init_shape, init_cam)

    def _hmr_get_final_preds(self, cam_shape_feats, init_pose, init_shape, init_cam):
        if self.use_final_nonlocal:
            cam_shape_feats = self.final_shape_nonlocal(cam_shape_feats)

        xf = torch.flatten(cam_shape_feats, start_dim=1)

        pred_pose = init_pose
        pred_shape = init_shape
        pred_cam = init_cam
        for i in range(3):
            xc = torch.cat([xf, pred_pose, pred_shape, pred_cam], 1)
            xc = self.fc1(xc)
            xc = self.drop1(xc)
            xc = self.fc2(xc)
            xc = self.drop2(xc)
            pred_pose = self.decpose(xc) + pred_pose
            pred_shape = self.decshape(xc) + pred_shape
            pred_cam = self.deccam(xc) + pred_cam

        return pred_pose, pred_shape, pred_cam

    def _pare_get_final_preds(self, pose_feats, cam_shape_feats, init_pose, init_shape, init_cam):
        pose_feats = pose_feats.unsqueeze(-1)  # [N, C, J] -> [N, C, J, 1]

        if init_pose.shape[-1] == 6:
            # This means init_pose comes from a previous iteration
            init_pose = init_pose.transpose(2, 1).unsqueeze(-1)
        else:
            # This means init_pose comes from mean pose
            init_pose = init_pose.reshape(init_pose.shape[0], 6, -1).unsqueeze(-1)

        if self.iterative_regression:

            shape_feats = torch.flatten(cam_shape_feats, start_dim=1)

            pred_pose = init_pose  # [N, 6, J, 1]
            pred_cam = init_cam  # [N, 3]
            pred_shape = init_shape  # [N, 10]

            for i in range(self.num_iterations):
                # pose_feats shape: [N, 256, 24, 1]
                # shape_feats shape: [N, 24*64]
                pose_mlp_inp = self._prepare_pose_mlp_inp(pose_feats, pred_pose, pred_shape, pred_cam)
                shape_mlp_inp = self._prepare_shape_mlp_inp(shape_feats, pred_pose, pred_shape, pred_cam)

                # TODO: this does not work, but let it go since we don't use iterative regression for now.
                # if self.use_final_nonlocal:
                #     pose_mlp_inp = self.final_pose_nonlocal(pose_mlp_inp)
                #     shape_mlp_inp = self.final_shape_nonlocal(shape_mlp_inp)

                if self.iter_residual:
                    pred_pose = self.pose_mlp(pose_mlp_inp) + pred_pose
                    pred_cam = self.cam_mlp(shape_mlp_inp) + pred_cam
                    pred_shape = self.shape_mlp(shape_mlp_inp) + pred_shape
                else:
                    pred_pose = self.pose_mlp(pose_mlp_inp)
                    pred_cam = self.cam_mlp(shape_mlp_inp)
                    pred_shape = self.shape_mlp(shape_mlp_inp) + init_shape
        else:
            shape_feats = cam_shape_feats
            if self.use_final_nonlocal:
                pose_feats = self.final_pose_nonlocal(pose_feats.squeeze(-1)).unsqueeze(-1)
                shape_feats = self.final_shape_nonlocal(shape_feats)

            shape_feats = torch.flatten(shape_feats, start_dim=1)

            pred_pose = self.pose_mlp(pose_feats)
            pred_cam = self.cam_mlp(shape_feats)
            pred_shape = self.shape_mlp(shape_feats)

            if self.use_mean_camshape:
                pred_cam = pred_cam + init_cam
                pred_shape = pred_shape + init_shape

            if self.use_mean_pose:
                pred_pose = pred_pose + init_pose

        pred_pose = pred_pose.squeeze(-1).transpose(2, 1)  # N, J, 6
        return pred_pose, pred_shape, pred_cam

    def forward_pretraining(self, features):
        # TODO: implement pretraining
        kp_feats = self.keypoint_deconv_layers(features)
        heatmaps = self.keypoint_final_layer(kp_feats)

        output = {}

        if self.use_heatmaps == 'hm':
            # returns coords between [-1,1]
            pred_kp2d, confidence = get_heatmap_preds(heatmaps)
            output['pred_kp2d'] = pred_kp2d
            output['pred_kp2d_conf'] = confidence
        elif self.use_heatmaps == 'hm_soft':
|
||||||
|
pred_kp2d, _ = softargmax2d(heatmaps, self.temperature)
|
||||||
|
output['pred_kp2d'] = pred_kp2d
|
||||||
|
else:
|
||||||
|
# returns coords between [-1,1]
|
||||||
|
pred_kp2d, _ = softargmax2d(heatmaps, self.temperature)
|
||||||
|
output['pred_kp2d'] = pred_kp2d
|
||||||
|
|
||||||
|
if self.use_keypoint_features_for_smpl_regression:
|
||||||
|
smpl_feats = kp_feats
|
||||||
|
else:
|
||||||
|
smpl_feats = self.smpl_deconv_layers(features)
|
||||||
|
|
||||||
|
cam_shape_feats = self.smpl_final_layer(smpl_feats)
|
||||||
|
|
||||||
|
output.update({
|
||||||
|
'kp_feats': heatmaps,
|
||||||
|
'heatmaps': heatmaps,
|
||||||
|
'smpl_feats': smpl_feats,
|
||||||
|
'cam_shape_feats': cam_shape_feats,
|
||||||
|
})
|
||||||
|
return output
|
133
myeasymocap/backbone/pare/head/smpl_cam_head.py
Normal file
@ -0,0 +1,133 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import torch
import torch.nn as nn

from .. import config
from .smpl_head import SMPL


class SMPLCamHead(nn.Module):
    def __init__(self, img_res=224):
        super(SMPLCamHead, self).__init__()
        self.smpl = SMPL(config.SMPL_MODEL_DIR, create_transl=False)
        self.add_module('smpl', self.smpl)

        self.img_res = img_res

    def forward(self, rotmat, shape, cam, cam_rotmat, cam_intrinsics,
                bbox_scale, bbox_center, img_w, img_h, normalize_joints2d=False):
        '''
        :param rotmat: rotation matrices (N,J,3,3)
        :param shape: smpl betas
        :param cam: weak perspective camera
        :param normalize_joints2d: bool, normalize joints between -1, 1 if true
        :param cam_rotmat (Nx3x3) camera rotation matrix
        :param cam_intrinsics (Nx3x3) camera intrinsics matrix
        :param bbox_scale (N,) bbox height normalized by 200
        :param bbox_center (N,2) bbox center
        :param img_w (N,) original image width
        :param img_h (N,) original image height
        :return: dict with keys 'vertices', 'joints3d', 'joints2d' if cam is True
        '''
        smpl_output = self.smpl(
            betas=shape,
            body_pose=rotmat[:, 1:].contiguous(),
            global_orient=rotmat[:, 0].unsqueeze(1).contiguous(),
            pose2rot=False,
        )

        output = {
            'smpl_vertices': smpl_output.vertices,
            'smpl_joints3d': smpl_output.joints,
        }

        joints3d = smpl_output.joints

        cam_t = convert_pare_to_full_img_cam(
            pare_cam=cam,
            bbox_height=bbox_scale * 200.,
            bbox_center=bbox_center,
            img_w=img_w,
            img_h=img_h,
            focal_length=cam_intrinsics[:, 0, 0],
            crop_res=self.img_res,
        )

        joints2d = perspective_projection(
            joints3d,
            rotation=cam_rotmat,
            translation=cam_t,
            cam_intrinsics=cam_intrinsics,
        )

        if normalize_joints2d:
            # Normalize keypoints to [-1,1]
            joints2d = joints2d / (self.img_res / 2.)

        output['smpl_joints2d'] = joints2d
        output['pred_cam_t'] = cam_t

        return output


def perspective_projection(points, rotation, translation, cam_intrinsics):
    """
    This function computes the perspective projection of a set of points.
    Input:
        points (bs, N, 3): 3D points
        rotation (bs, 3, 3): Camera rotation
        translation (bs, 3): Camera translation
        cam_intrinsics (bs, 3, 3): Camera intrinsics
    """
    K = cam_intrinsics

    # Transform points
    points = torch.einsum('bij,bkj->bki', rotation, points)
    points = points + translation.unsqueeze(1)

    # Apply perspective distortion
    projected_points = points / points[:, :, -1].unsqueeze(-1)

    # Apply camera intrinsics
    projected_points = torch.einsum('bij,bkj->bki', K, projected_points.float())

    return projected_points[:, :, :-1]


def convert_pare_to_full_img_cam(
        pare_cam, bbox_height, bbox_center,
        img_w, img_h, focal_length, crop_res=224):
    # Converts the weak perspective camera estimated by PARE in
    # bbox coords to a perspective camera in full image coordinates,
    # following https://arxiv.org/pdf/2009.06549.pdf
    s, tx, ty = pare_cam[:, 0], pare_cam[:, 1], pare_cam[:, 2]
    res = crop_res  # crop resolution of the network input (was hardcoded to 224)
    r = bbox_height / res
    tz = 2 * focal_length / (r * res * s)

    cx = 2 * (bbox_center[:, 0] - (img_w / 2.)) / (s * bbox_height)
    cy = 2 * (bbox_center[:, 1] - (img_h / 2.)) / (s * bbox_height)

    cam_t = torch.stack([tx + cx, ty + cy, tz], dim=-1)

    return cam_t
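
# A minimal usage sketch (not part of the original file), with dummy inputs:
# convert a PARE weak-perspective camera to a full-image translation and project
# random joints with the helpers above. Only the shapes are meaningful here.
if __name__ == '__main__':
    N = 2
    cam = torch.tensor([[1.0, 0.0, 0.0]]).repeat(N, 1)         # (s, tx, ty)
    bbox_center = torch.tensor([[320.0, 240.0]]).repeat(N, 1)  # bbox centers in pixels
    bbox_scale = torch.full((N,), 1.2)                         # bbox height / 200
    K = torch.eye(3).repeat(N, 1, 1)
    K[:, 0, 0] = K[:, 1, 1] = 5000.                            # focal length in pixels

    cam_t = convert_pare_to_full_img_cam(
        pare_cam=cam, bbox_height=bbox_scale * 200., bbox_center=bbox_center,
        img_w=torch.full((N,), 640.), img_h=torch.full((N,), 480.),
        focal_length=K[:, 0, 0])
    joints2d = perspective_projection(
        torch.rand(N, 49, 3) + torch.tensor([0., 0., 5.]),  # keep points in front of the camera
        rotation=torch.eye(3).expand(N, 3, 3), translation=cam_t,
        cam_intrinsics=K)
    print(cam_t.shape, joints2d.shape)  # torch.Size([2, 3]) torch.Size([2, 49, 2])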
104
myeasymocap/backbone/pare/head/smpl_head.py
Normal file
@ -0,0 +1,104 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import torch
import numpy as np
import torch.nn as nn

from smplx import SMPL as _SMPL
from smplx.utils import SMPLOutput
from smplx.lbs import vertices2joints

from .. import config, constants
from ..utils.geometry import perspective_projection, convert_weak_perspective_to_perspective


class SMPL(_SMPL):
    """ Extension of the official SMPL implementation to support more joints """

    def __init__(self, *args, **kwargs):
        super(SMPL, self).__init__(*args, **kwargs)
        joints = [constants.JOINT_MAP[i] for i in constants.JOINT_NAMES]
        J_regressor_extra = np.load(config.JOINT_REGRESSOR_TRAIN_EXTRA)
        self.register_buffer('J_regressor_extra', torch.tensor(J_regressor_extra, dtype=torch.float32))
        self.joint_map = torch.tensor(joints, dtype=torch.long)

    def forward(self, *args, **kwargs):
        kwargs['get_skin'] = True
        smpl_output = super(SMPL, self).forward(*args, **kwargs)
        extra_joints = vertices2joints(self.J_regressor_extra, smpl_output.vertices)
        joints = torch.cat([smpl_output.joints, extra_joints], dim=1)
        joints = joints[:, self.joint_map, :]
        output = SMPLOutput(vertices=smpl_output.vertices,
                            global_orient=smpl_output.global_orient,
                            body_pose=smpl_output.body_pose,
                            joints=joints,
                            betas=smpl_output.betas,
                            full_pose=smpl_output.full_pose)
        return output


class SMPLHead(nn.Module):
    def __init__(self, focal_length=5000., img_res=224):
        super(SMPLHead, self).__init__()
        self.smpl = SMPL(config.SMPL_MODEL_DIR, create_transl=False)
        self.add_module('smpl', self.smpl)
        self.focal_length = focal_length
        self.img_res = img_res

    def forward(self, rotmat, shape, cam=None, normalize_joints2d=False):
        '''
        :param rotmat: rotation matrices (N,J,3,3)
        :param shape: smpl betas
        :param cam: weak perspective camera
        :param normalize_joints2d: bool, normalize joints between -1, 1 if true
        :return: dict with keys 'vertices', 'joints3d', 'joints2d' if cam is True
        '''
        smpl_output = self.smpl(
            betas=shape,
            body_pose=rotmat[:, 1:].contiguous(),
            global_orient=rotmat[:, 0].unsqueeze(1).contiguous(),
            pose2rot=False,
        )

        output = {
            'smpl_vertices': smpl_output.vertices,
            'smpl_joints3d': smpl_output.joints,
        }
        if cam is not None:
            joints3d = smpl_output.joints
            batch_size = joints3d.shape[0]
            device = joints3d.device
            cam_t = convert_weak_perspective_to_perspective(
                cam,
                focal_length=self.focal_length,
                img_res=self.img_res,
            )
            joints2d = perspective_projection(
                joints3d,
                rotation=torch.eye(3, device=device).unsqueeze(0).expand(batch_size, -1, -1),
                translation=cam_t,
                focal_length=self.focal_length,
                camera_center=torch.zeros(batch_size, 2, device=device)
            )
            if normalize_joints2d:
                # Normalize keypoints to [-1,1]
                joints2d = joints2d / (self.img_res / 2.)

            output['smpl_joints2d'] = joints2d
            output['pred_cam_t'] = cam_t

        return output
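
# A minimal usage sketch (not part of the original file). It assumes valid SMPL
# model files exist under config.SMPL_MODEL_DIR; without them the SMPL
# constructor will fail. Identity rotations give the T-pose.
if __name__ == '__main__':
    head = SMPLHead()
    rotmat = torch.eye(3).expand(2, 24, 3, 3)           # identity pose, batch of 2
    betas = torch.zeros(2, 10)
    cam = torch.tensor([[1.0, 0.0, 0.0]]).expand(2, 3)  # weak perspective (s, tx, ty)
    out = head(rotmat, betas, cam=cam)
    print(out['smpl_vertices'].shape, out['smpl_joints2d'].shape)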
4
myeasymocap/backbone/pare/layers/__init__.py
Normal file
@ -0,0 +1,4 @@
from .locallyconnected2d import LocallyConnected2d
from .interpolate import interpolate
from .nonlocalattention import NonLocalAttention
from .keypoint_attention import KeypointAttention
126
myeasymocap/backbone/pare/layers/coattention.py
Normal file
@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import torch
import torch.nn as nn
import torch.nn.functional as F

from ..backbone.resnet import conv1x1, conv3x3


class CoAttention(nn.Module):
    def __init__(
            self,
            n_channel,
            final_conv='simple',  # 'double_1', 'double_3', 'single_1', 'single_3', 'simple'
    ):
        super(CoAttention, self).__init__()
        self.linear_e = nn.Linear(n_channel, n_channel, bias=False)
        self.channel = n_channel
        self.gate = nn.Conv2d(n_channel, 1, kernel_size=1, bias=False)
        self.gate_s = nn.Sigmoid()
        self.softmax = nn.Sigmoid()  # kept from the original implementation; unused in forward

        if final_conv.startswith('double'):
            kernel_size = int(final_conv[-1])
            conv = conv1x1 if kernel_size == 1 else conv3x3
            self.final_conv_1 = nn.Sequential(
                conv(n_channel * 2, n_channel),
                nn.BatchNorm2d(n_channel),
                nn.ReLU(inplace=True),
                conv(n_channel, n_channel),
                nn.BatchNorm2d(n_channel),
                nn.ReLU(inplace=True),
            )
            self.final_conv_2 = nn.Sequential(
                conv(n_channel * 2, n_channel),
                nn.BatchNorm2d(n_channel),
                nn.ReLU(inplace=True),
                conv(n_channel, n_channel),
                nn.BatchNorm2d(n_channel),
                nn.ReLU(inplace=True),
            )
        elif final_conv.startswith('single'):
            kernel_size = int(final_conv[-1])
            conv = conv1x1 if kernel_size == 1 else conv3x3
            self.final_conv_1 = nn.Sequential(
                conv(n_channel * 2, n_channel),
                nn.BatchNorm2d(n_channel),
                nn.ReLU(inplace=True),
            )
            self.final_conv_2 = nn.Sequential(
                conv(n_channel * 2, n_channel),
                nn.BatchNorm2d(n_channel),
                nn.ReLU(inplace=True),
            )
        elif final_conv == 'simple':
            self.final_conv_1 = conv1x1(n_channel * 2, n_channel)
            self.final_conv_2 = conv1x1(n_channel * 2, n_channel)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                m.weight.data.normal_(0, 0.01)
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def forward(self, input_1, input_2):
        '''
        input_1: [N, C, H, W]
        input_2: [N, C, H, W]
        '''
        b, c, h, w = input_1.shape
        exemplar, query = input_1, input_2

        exemplar_flat = exemplar.reshape(-1, c, h * w)  # N,C,H*W
        query_flat = query.reshape(-1, c, h * w)

        # Compute coattention scores, S in the paper
        exemplar_t = torch.transpose(exemplar_flat, 1, 2).contiguous()  # batch size x num x dim
        exemplar_corr = self.linear_e(exemplar_t)
        A = torch.bmm(exemplar_corr, query_flat)
        A1 = F.softmax(A.clone(), dim=1)
        B = F.softmax(torch.transpose(A, 1, 2), dim=1)
        query_att = torch.bmm(exemplar_flat, A1)
        exemplar_att = torch.bmm(query_flat, B)

        input1_att = exemplar_att.reshape(-1, c, h, w)
        input2_att = query_att.reshape(-1, c, h, w)

        # Apply gating on S (gated coattention)
        input1_mask = self.gate(input1_att)
        input2_mask = self.gate(input2_att)

        input1_mask = self.gate_s(input1_mask)
        input2_mask = self.gate_s(input2_mask)

        input1_att = input1_att * input1_mask
        input2_att = input2_att * input2_mask

        # Concatenate inputs with their attended version
        input1_att = torch.cat([input1_att, exemplar], 1)
        input2_att = torch.cat([input2_att, query], 1)

        input1 = self.final_conv_1(input1_att)
        input2 = self.final_conv_2(input2_att)

        return input1, input2
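
# A minimal usage sketch (not part of the original file): co-attention between
# two random feature maps of equal shape.
if __name__ == '__main__':
    coatt = CoAttention(n_channel=256, final_conv='simple')
    f1 = torch.rand(2, 256, 14, 14)
    f2 = torch.rand(2, 256, 14, 14)
    o1, o2 = coatt(f1, f2)
    print(o1.shape, o2.shape)  # both torch.Size([2, 256, 14, 14])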
35
myeasymocap/backbone/pare/layers/interpolate.py
Normal file
@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import torch


def interpolate(feat, uv):
    '''
    :param feat: [B, C, H, W] image features
    :param uv: [B, 2, N] uv coordinates in the image plane, range [-1, 1]
    :return: [B, C, N] image features at the uv coordinates
    '''
    if uv.shape[-1] != 2:
        uv = uv.transpose(1, 2)  # [B, N, 2]
    uv = uv.unsqueeze(2)  # [B, N, 1, 2]
    # NOTE: on newer PyTorch versions training results can degrade because the
    # default behaviour of F.grid_sample changed; align_corners=True restores
    # the old behaviour. The argument only exists from PyTorch 1.3 onwards.
    version = tuple(int(v) for v in torch.__version__.split('.')[:2])
    if version < (1, 3):
        samples = torch.nn.functional.grid_sample(feat, uv)  # [B, C, N, 1]
    else:
        samples = torch.nn.functional.grid_sample(feat, uv, align_corners=True)  # [B, C, N, 1]
    return samples[:, :, :, 0]  # [B, C, N]
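
# A minimal usage sketch (not part of the original file): sampling a feature map
# at normalized uv coordinates.
if __name__ == '__main__':
    feat = torch.rand(2, 64, 56, 56)
    uv = torch.rand(2, 2, 24) * 2 - 1  # 24 query points per sample, in [-1, 1]
    print(interpolate(feat, uv).shape)  # torch.Size([2, 64, 24])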
56
myeasymocap/backbone/pare/layers/keypoint_attention.py
Normal file
@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F


class KeypointAttention(nn.Module):
    def __init__(self, use_conv=False, in_channels=(256, 64), out_channels=(256, 64), act='softmax', use_scale=False):
        super(KeypointAttention, self).__init__()
        self.use_conv = use_conv
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.act = act
        self.use_scale = use_scale
        if use_conv:
            self.conv1x1_pose = nn.Conv1d(in_channels[0], out_channels[0], kernel_size=1)
            self.conv1x1_shape_cam = nn.Conv1d(in_channels[1], out_channels[1], kernel_size=1)

    def forward(self, features, heatmaps):
        batch_size, num_joints, height, width = heatmaps.shape

        if self.use_scale:
            scale = 1.0 / np.sqrt(height * width)
            heatmaps = heatmaps * scale

        if self.act == 'softmax':
            normalized_heatmap = F.softmax(heatmaps.reshape(batch_size, num_joints, -1), dim=-1)
        elif self.act == 'sigmoid':
            normalized_heatmap = torch.sigmoid(heatmaps.reshape(batch_size, num_joints, -1))
        else:
            raise ValueError(f'Unknown activation: {self.act}')
        features = features.reshape(batch_size, -1, height * width)

        attended_features = torch.matmul(normalized_heatmap, features.transpose(2, 1))
        attended_features = attended_features.transpose(2, 1)

        if self.use_conv:
            if attended_features.shape[1] == self.in_channels[0]:
                attended_features = self.conv1x1_pose(attended_features)
            else:
                attended_features = self.conv1x1_shape_cam(attended_features)

        return attended_features
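
# A minimal usage sketch (not part of the original file): pooling per-joint
# features with softmax-normalized heatmaps as attention weights.
if __name__ == '__main__':
    att = KeypointAttention(use_conv=False)
    features = torch.rand(2, 256, 56, 56)
    heatmaps = torch.rand(2, 24, 56, 56)
    print(att(features, heatmaps).shape)  # torch.Size([2, 256, 24])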
49
myeasymocap/backbone/pare/layers/locallyconnected2d.py
Normal file
@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import torch
import torch.nn as nn
from torch.nn.modules.utils import _pair


class LocallyConnected2d(nn.Module):
    def __init__(self, in_channels, out_channels, output_size, kernel_size, stride, bias=False):
        super(LocallyConnected2d, self).__init__()
        output_size = _pair(output_size)
        self.weight = nn.Parameter(
            torch.randn(1, out_channels, in_channels, output_size[0], output_size[1], kernel_size ** 2),
            requires_grad=True,
        )
        if bias:
            self.bias = nn.Parameter(
                torch.randn(1, out_channels, output_size[0], output_size[1]), requires_grad=True
            )
        else:
            self.register_parameter('bias', None)
        self.kernel_size = _pair(kernel_size)
        self.stride = _pair(stride)

    def forward(self, x):
        _, c, h, w = x.size()
        kh, kw = self.kernel_size
        dh, dw = self.stride
        x = x.unfold(2, kh, dh).unfold(3, kw, dw)
        x = x.contiguous().view(*x.size()[:-2], -1)
        # Sum over the in_channels and kernel_size dims
        out = (x.unsqueeze(1) * self.weight).sum([2, -1])
        if self.bias is not None:
            out += self.bias
        return out
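
# A minimal usage sketch (not part of the original file): a locally connected
# layer has untied weights per output location, unlike a convolution.
if __name__ == '__main__':
    lc = LocallyConnected2d(in_channels=64, out_channels=3, output_size=24,
                            kernel_size=1, stride=1)
    print(lc(torch.rand(2, 64, 24, 24)).shape)  # torch.Size([2, 3, 24, 24])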
152
myeasymocap/backbone/pare/layers/non_local/dot_product.py
Normal file
@ -0,0 +1,152 @@
import torch
from torch import nn
from torch.nn import functional as F


class _NonLocalBlockND(nn.Module):
    def __init__(self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True):
        super(_NonLocalBlockND, self).__init__()

        assert dimension in [1, 2, 3]

        self.dimension = dimension
        self.sub_sample = sub_sample

        self.in_channels = in_channels
        self.inter_channels = inter_channels

        if self.inter_channels is None:
            self.inter_channels = in_channels // 2
            if self.inter_channels == 0:
                self.inter_channels = 1

        if dimension == 3:
            conv_nd = nn.Conv3d
            max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
            bn = nn.BatchNorm3d
        elif dimension == 2:
            conv_nd = nn.Conv2d
            max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
            bn = nn.BatchNorm2d
        else:
            conv_nd = nn.Conv1d
            max_pool_layer = nn.MaxPool1d(kernel_size=2)
            bn = nn.BatchNorm1d

        self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
                         kernel_size=1, stride=1, padding=0)

        if bn_layer:
            self.W = nn.Sequential(
                conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels,
                        kernel_size=1, stride=1, padding=0),
                bn(self.in_channels)
            )
            nn.init.constant_(self.W[1].weight, 0)
            nn.init.constant_(self.W[1].bias, 0)
        else:
            self.W = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels,
                             kernel_size=1, stride=1, padding=0)
            nn.init.constant_(self.W.weight, 0)
            nn.init.constant_(self.W.bias, 0)

        self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
                             kernel_size=1, stride=1, padding=0)

        self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
                           kernel_size=1, stride=1, padding=0)

        if sub_sample:
            self.g = nn.Sequential(self.g, max_pool_layer)
            self.phi = nn.Sequential(self.phi, max_pool_layer)

    def forward(self, x, return_nl_map=False):
        """
        :param x: (b, c, t, h, w)
        :param return_nl_map: if True return z, nl_map, else only return z.
        :return:
        """
        batch_size = x.size(0)

        g_x = self.g(x).view(batch_size, self.inter_channels, -1)
        g_x = g_x.permute(0, 2, 1)

        theta_x = self.theta(x).view(batch_size, self.inter_channels, -1)
        theta_x = theta_x.permute(0, 2, 1)
        phi_x = self.phi(x).view(batch_size, self.inter_channels, -1)
        f = torch.matmul(theta_x, phi_x)
        N = f.size(-1)
        f_div_C = f / N

        y = torch.matmul(f_div_C, g_x)
        y = y.permute(0, 2, 1).contiguous()
        y = y.view(batch_size, self.inter_channels, *x.size()[2:])
        W_y = self.W(y)
        z = W_y + x

        if return_nl_map:
            return z, f_div_C
        return z


class NONLocalBlock1D(_NonLocalBlockND):
    def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
        super(NONLocalBlock1D, self).__init__(in_channels,
                                              inter_channels=inter_channels,
                                              dimension=1, sub_sample=sub_sample,
                                              bn_layer=bn_layer)


class NONLocalBlock2D(_NonLocalBlockND):
    def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
        super(NONLocalBlock2D, self).__init__(in_channels,
                                              inter_channels=inter_channels,
                                              dimension=2, sub_sample=sub_sample,
                                              bn_layer=bn_layer)


class NONLocalBlock3D(_NonLocalBlockND):
    def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
        super(NONLocalBlock3D, self).__init__(in_channels,
                                              inter_channels=inter_channels,
                                              dimension=3, sub_sample=sub_sample,
                                              bn_layer=bn_layer)


if __name__ == '__main__':
    img = torch.zeros(2, 256, 24)
    net = NONLocalBlock1D(
        in_channels=256, inter_channels=None, sub_sample=False, bn_layer=True
    )
    out = net(img)
    print(out.size())

    img = torch.zeros(2, 256, 56, 56)
    net = NONLocalBlock2D(
        in_channels=256, inter_channels=None, sub_sample=False, bn_layer=True
    )
    out = net(img)
    print(out.size())
57
myeasymocap/backbone/pare/layers/nonlocalattention.py
Normal file
@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import torch
import torch.nn as nn
import torch.nn.functional as F


class NonLocalAttention(nn.Module):
    def __init__(
            self,
            in_channels=256,
            out_channels=256,
    ):
        super(NonLocalAttention, self).__init__()
        self.conv1x1 = nn.Conv1d(in_channels, out_channels, kernel_size=1)

    def forward(self, input):
        '''
        input [N, Feats, J, 1]
        output [N, Feats, J, 1]
        '''
        batch_size, n_feats, n_joints, _ = input.shape
        input = input.squeeze(-1)

        # Compute attention weights
        attention = torch.matmul(input.transpose(2, 1), input)
        norm_attention = F.softmax(attention, dim=-1)

        # Compute final dot product
        out = torch.matmul(input, norm_attention)
        out = self.conv1x1(out)

        out = out.unsqueeze(-1)  # [N, F, J, 1]
        return out


if __name__ == '__main__':
    nla = NonLocalAttention()
    inp = torch.rand(32, 256, 24, 1)
    out = nla(inp)
    print(out.shape)
154
myeasymocap/backbone/pare/layers/softargmax.py
Normal file
@ -0,0 +1,154 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import torch
import torch.nn.functional as F


def _softmax(tensor, temperature, dim=-1):
    return F.softmax(tensor * temperature, dim=dim)


def softargmax1d(
        heatmaps,
        temperature=None,
        normalize_keypoints=True,
):
    dtype, device = heatmaps.dtype, heatmaps.device
    if temperature is None:
        temperature = torch.tensor(1.0, dtype=dtype, device=device)
    batch_size, num_channels, dim = heatmaps.shape
    points = torch.arange(0, dim, device=device, dtype=dtype).reshape(1, 1, dim).expand(batch_size, -1, -1)
    normalized_heatmap = _softmax(
        heatmaps.reshape(batch_size, num_channels, -1),
        temperature=temperature.reshape(1, -1, 1),
        dim=-1)

    # Should be BxJ
    keypoints = (normalized_heatmap.reshape(batch_size, -1, dim) * points).sum(dim=-1)

    if normalize_keypoints:
        # Normalize keypoints to [-1, 1]
        keypoints = (keypoints / (dim - 1) * 2 - 1)

    return keypoints, normalized_heatmap.reshape(
        batch_size, -1, dim)


def softargmax2d(
        heatmaps,
        temperature=None,
        normalize_keypoints=True,
):
    dtype, device = heatmaps.dtype, heatmaps.device
    if temperature is None:
        temperature = torch.tensor(1.0, dtype=dtype, device=device)
    batch_size, num_channels, height, width = heatmaps.shape
    x = torch.arange(0, width, device=device, dtype=dtype).reshape(1, 1, 1, width).expand(batch_size, -1, height, -1)
    y = torch.arange(0, height, device=device, dtype=dtype).reshape(1, 1, height, 1).expand(batch_size, -1, -1, width)
    # Should be Bx2xHxW
    points = torch.cat([x, y], dim=1)
    normalized_heatmap = _softmax(
        heatmaps.reshape(batch_size, num_channels, -1),
        temperature=temperature.reshape(1, -1, 1),
        dim=-1)

    # Should be BxJx2
    keypoints = (
        normalized_heatmap.reshape(batch_size, -1, 1, height * width) *
        points.reshape(batch_size, 1, 2, -1)).sum(dim=-1)

    if normalize_keypoints:
        # Normalize keypoints to [-1, 1]
        keypoints[:, :, 0] = (keypoints[:, :, 0] / (width - 1) * 2 - 1)
        keypoints[:, :, 1] = (keypoints[:, :, 1] / (height - 1) * 2 - 1)

    return keypoints, normalized_heatmap.reshape(
        batch_size, -1, height, width)


def softargmax3d(
        heatmaps,
        temperature=None,
        normalize_keypoints=True,
):
    dtype, device = heatmaps.dtype, heatmaps.device
    if temperature is None:
        temperature = torch.tensor(1.0, dtype=dtype, device=device)
    batch_size, num_channels, height, width, depth = heatmaps.shape
    x = torch.arange(0, width, device=device, dtype=dtype).reshape(1, 1, 1, width, 1).expand(batch_size, -1, height, -1, depth)
    y = torch.arange(0, height, device=device, dtype=dtype).reshape(1, 1, height, 1, 1).expand(batch_size, -1, -1, width, depth)
    z = torch.arange(0, depth, device=device, dtype=dtype).reshape(1, 1, 1, 1, depth).expand(batch_size, -1, height, width, -1)
    # Should be Bx3xHxWxD
    points = torch.cat([x, y, z], dim=1)
    normalized_heatmap = _softmax(
        heatmaps.reshape(batch_size, num_channels, -1),
        temperature=temperature.reshape(1, -1, 1),
        dim=-1)

    # Should be BxJx3
    keypoints = (
        normalized_heatmap.reshape(batch_size, -1, 1, height * width * depth) *
        points.reshape(batch_size, 1, 3, -1)).sum(dim=-1)

    if normalize_keypoints:
        # Normalize keypoints to [-1, 1]
        keypoints[:, :, 0] = (keypoints[:, :, 0] / (width - 1) * 2 - 1)
        keypoints[:, :, 1] = (keypoints[:, :, 1] / (height - 1) * 2 - 1)
        keypoints[:, :, 2] = (keypoints[:, :, 2] / (depth - 1) * 2 - 1)

    return keypoints, normalized_heatmap.reshape(
        batch_size, -1, height, width, depth)


def get_heatmap_preds(batch_heatmaps, normalize_keypoints=True):
    '''
    get predictions from score maps
    batch_heatmaps: torch.Tensor([batch_size, num_joints, height, width])
    '''
    assert batch_heatmaps.ndim == 4, 'batch_heatmaps should be 4-dim'

    batch_size = batch_heatmaps.shape[0]
    num_joints = batch_heatmaps.shape[1]
    height = batch_heatmaps.shape[2]
    width = batch_heatmaps.shape[3]
    heatmaps_reshaped = batch_heatmaps.reshape((batch_size, num_joints, -1))

    maxvals, idx = torch.max(heatmaps_reshaped, 2)

    maxvals = maxvals.reshape((batch_size, num_joints, 1))
    idx = idx.reshape((batch_size, num_joints, 1))

    preds = idx.repeat(1, 1, 2).float()

    preds[:, :, 0] = (preds[:, :, 0]) % width
    preds[:, :, 1] = torch.floor((preds[:, :, 1]) / width)

    pred_mask = torch.gt(maxvals, 0.0).repeat(1, 1, 2)
    pred_mask = pred_mask.float()

    preds *= pred_mask

    if normalize_keypoints:
        # Normalize keypoints to [-1, 1]
        preds[:, :, 0] = (preds[:, :, 0] / (width - 1) * 2 - 1)
        preds[:, :, 1] = (preds[:, :, 1] / (height - 1) * 2 - 1)

    return preds, maxvals
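
# A minimal usage sketch (not part of the original file): soft-argmax vs. hard
# argmax on a synthetic heatmap peaked at a single pixel.
if __name__ == '__main__':
    hm = torch.zeros(1, 1, 64, 64)
    hm[0, 0, 10, 20] = 10.  # peak at (x=20, y=10)
    soft_kp, _ = softargmax2d(hm, temperature=torch.tensor(10.))
    hard_kp, conf = get_heatmap_preds(hm)
    print(soft_kp, hard_kp)  # both near the normalized peak location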
262
myeasymocap/backbone/pare/pare.py
Normal file
@ -0,0 +1,262 @@
import os
from os.path import join

import torch
import torch.nn as nn

from .config import update_hparams
# from .head import PareHead, SMPLHead, SMPLCamHead
from .head import PareHead
from .backbone.utils import get_backbone_info
from .backbone.hrnet import hrnet_w32
from easymocap.multistage.torchgeometry import rotation_matrix_to_axis_angle


def try_to_download():
    model_dir = os.path.join(os.path.dirname(__file__), '..', '..', 'models', 'pare')
    cmd = 'wget https://www.dropbox.com/s/aeulffqzb3zmh8x/pare-github-data.zip'
    os.system(cmd)
    os.makedirs(model_dir, exist_ok=True)
    cmd = 'unzip pare-github-data.zip -d {}'.format(model_dir)
    os.system(cmd)


CFG = 'models/pare/data/pare/checkpoints/pare_w_3dpw_config.yaml'
CKPT = 'models/pare/data/pare/checkpoints/pare_w_3dpw_checkpoint.ckpt'


class PARE(nn.Module):
    def __init__(
            self,
            num_joints=24,
            softmax_temp=1.0,
            num_features_smpl=64,
            backbone='resnet50',
            focal_length=5000.,
            img_res=224,
            pretrained=None,
            iterative_regression=False,
            iter_residual=False,
            num_iterations=3,
            shape_input_type='feats',  # 'feats.all_pose.shape.cam',
            pose_input_type='feats',  # 'feats.neighbor_pose_feats.all_pose.self_pose.neighbor_pose.shape.cam'
            pose_mlp_num_layers=1,
            shape_mlp_num_layers=1,
            pose_mlp_hidden_size=256,
            shape_mlp_hidden_size=256,
            use_keypoint_features_for_smpl_regression=False,
            use_heatmaps='',
            use_keypoint_attention=False,
            keypoint_attention_act='softmax',
            use_postconv_keypoint_attention=False,
            use_scale_keypoint_attention=False,
            use_final_nonlocal=None,
            use_branch_nonlocal=None,
            use_hmr_regression=False,
            use_coattention=False,
            num_coattention_iter=1,
            coattention_conv='simple',
            deconv_conv_kernel_size=4,
            use_upsampling=False,
            use_soft_attention=False,
            num_branch_iteration=0,
            branch_deeper=False,
            num_deconv_layers=3,
            num_deconv_filters=256,
            use_resnet_conv_hrnet=False,
            use_position_encodings=None,
            use_mean_camshape=False,
            use_mean_pose=False,
            init_xavier=False,
            use_cam=False,
    ):
        super(PARE, self).__init__()
        # NOTE: only hrnet_w32 is imported above; any other backbone has to be
        # imported into this module before eval() can resolve it.
        if backbone.startswith('hrnet'):
            # hrnet_w32-conv, hrnet_w32-interp
            backbone, use_conv = backbone.split('-')
            self.backbone = eval(backbone)(
                pretrained=True,
                downsample=False,
                use_conv=(use_conv == 'conv')
            )
        else:
            self.backbone = eval(backbone)(pretrained=True)

        self.head = PareHead(
            num_joints=num_joints,
            num_input_features=get_backbone_info(backbone)['n_output_channels'],
            softmax_temp=softmax_temp,
            num_deconv_layers=num_deconv_layers,
            num_deconv_filters=[num_deconv_filters] * num_deconv_layers,
            num_deconv_kernels=[deconv_conv_kernel_size] * num_deconv_layers,
            num_features_smpl=num_features_smpl,
            final_conv_kernel=1,
            iterative_regression=iterative_regression,
            iter_residual=iter_residual,
            num_iterations=num_iterations,
            shape_input_type=shape_input_type,
            pose_input_type=pose_input_type,
            pose_mlp_num_layers=pose_mlp_num_layers,
            shape_mlp_num_layers=shape_mlp_num_layers,
            pose_mlp_hidden_size=pose_mlp_hidden_size,
            shape_mlp_hidden_size=shape_mlp_hidden_size,
            use_keypoint_features_for_smpl_regression=use_keypoint_features_for_smpl_regression,
            use_heatmaps=use_heatmaps,
            use_keypoint_attention=use_keypoint_attention,
            use_postconv_keypoint_attention=use_postconv_keypoint_attention,
            keypoint_attention_act=keypoint_attention_act,
            use_scale_keypoint_attention=use_scale_keypoint_attention,
            use_branch_nonlocal=use_branch_nonlocal,  # 'concatenation', 'dot_product', 'embedded_gaussian', 'gaussian'
            use_final_nonlocal=use_final_nonlocal,  # 'concatenation', 'dot_product', 'embedded_gaussian', 'gaussian'
            backbone=backbone,
            use_hmr_regression=use_hmr_regression,
            use_coattention=use_coattention,
            num_coattention_iter=num_coattention_iter,
            coattention_conv=coattention_conv,
            use_upsampling=use_upsampling,
            use_soft_attention=use_soft_attention,
            num_branch_iteration=num_branch_iteration,
            branch_deeper=branch_deeper,
            use_resnet_conv_hrnet=use_resnet_conv_hrnet,
            use_position_encodings=use_position_encodings,
            use_mean_camshape=use_mean_camshape,
            use_mean_pose=use_mean_pose,
            init_xavier=init_xavier,
        )

        self.use_cam = use_cam
        # The SMPL heads are not needed here, since forward() only returns
        # SMPL parameters:
        # if self.use_cam:
        #     self.smpl = SMPLCamHead(img_res=img_res)
        # else:
        #     self.smpl = SMPLHead(focal_length=focal_length, img_res=img_res)

        if pretrained is not None:
            self.load_pretrained(pretrained)

    def forward(
            self,
            images,
            gt_segm=None,
    ):
        features = self.backbone(images)
        hmr_output = self.head(features, gt_segm=gt_segm)
        rotmat = hmr_output['pred_pose']
        shape = hmr_output['pred_shape']
        rotmat_flat = rotmat.reshape(-1, 3, 3)
        rvec_flat = rotation_matrix_to_axis_angle(rotmat_flat)
        rvec = rvec_flat.reshape(*rotmat.shape[:-2], 3)
        rvec = rvec.reshape(*rvec.shape[:-2], -1)
        return {
            'Rh': rvec[..., :3],
            'Th': torch.zeros_like(rvec[..., :3]),
            'poses': rvec[..., 3:],
            'shapes': shape,
        }


from ..basetopdown import BaseTopDownModelCache


class NullSPIN:
    def __init__(self, ckpt) -> None:
        self.name = 'spin'

    def __call__(self, bbox, images, imgname):
        from easymocap.mytools.reader import read_smpl
        basename = os.path.basename(imgname)
        # NOTE: self.output is expected to be set externally before calling.
        cachename = join(self.output, self.name, basename.replace('.jpg', '.json'))
        if os.path.exists(cachename):
            params = read_smpl(cachename)
            params = params[0]
            params = {key: val[0] for key, val in params.items() if key != 'id'}
            ret = {
                'params': params
            }
            return ret
        else:
            raise FileNotFoundError(f'Missing cached SPIN result: {cachename}')


class MyPARE(BaseTopDownModelCache):
    def __init__(self, ckpt) -> None:
        super().__init__('pare', bbox_scale=1.1, res_input=224)
        if not os.path.exists(CFG):
            from ...io.model import try_to_download_SMPL
            try_to_download_SMPL('models/pare')
        self.model_cfg = update_hparams(CFG)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model = self._build_model()
        self._load_pretrained_model(CKPT)
        self.model.eval()
        self.model.to(self.device)

    def __call__(self, bbox, images, imgnames):
        return super().__call__(bbox[0], images, imgnames)

    def _build_model(self):
        # ========= Define PARE model ========= #
        model_cfg = self.model_cfg

        if model_cfg.METHOD == 'pare':
            model = PARE(
                backbone=model_cfg.PARE.BACKBONE,
                num_joints=model_cfg.PARE.NUM_JOINTS,
                softmax_temp=model_cfg.PARE.SOFTMAX_TEMP,
                num_features_smpl=model_cfg.PARE.NUM_FEATURES_SMPL,
                focal_length=model_cfg.DATASET.FOCAL_LENGTH,
                img_res=model_cfg.DATASET.IMG_RES,
                pretrained=model_cfg.TRAINING.PRETRAINED,
                iterative_regression=model_cfg.PARE.ITERATIVE_REGRESSION,
                num_iterations=model_cfg.PARE.NUM_ITERATIONS,
                iter_residual=model_cfg.PARE.ITER_RESIDUAL,
                shape_input_type=model_cfg.PARE.SHAPE_INPUT_TYPE,
                pose_input_type=model_cfg.PARE.POSE_INPUT_TYPE,
                pose_mlp_num_layers=model_cfg.PARE.POSE_MLP_NUM_LAYERS,
                shape_mlp_num_layers=model_cfg.PARE.SHAPE_MLP_NUM_LAYERS,
                pose_mlp_hidden_size=model_cfg.PARE.POSE_MLP_HIDDEN_SIZE,
                shape_mlp_hidden_size=model_cfg.PARE.SHAPE_MLP_HIDDEN_SIZE,
                use_keypoint_features_for_smpl_regression=model_cfg.PARE.USE_KEYPOINT_FEATURES_FOR_SMPL_REGRESSION,
                use_heatmaps=model_cfg.DATASET.USE_HEATMAPS,
                use_keypoint_attention=model_cfg.PARE.USE_KEYPOINT_ATTENTION,
                use_postconv_keypoint_attention=model_cfg.PARE.USE_POSTCONV_KEYPOINT_ATTENTION,
                use_scale_keypoint_attention=model_cfg.PARE.USE_SCALE_KEYPOINT_ATTENTION,
                keypoint_attention_act=model_cfg.PARE.KEYPOINT_ATTENTION_ACT,
                use_final_nonlocal=model_cfg.PARE.USE_FINAL_NONLOCAL,
                use_branch_nonlocal=model_cfg.PARE.USE_BRANCH_NONLOCAL,
                use_hmr_regression=model_cfg.PARE.USE_HMR_REGRESSION,
                use_coattention=model_cfg.PARE.USE_COATTENTION,
                num_coattention_iter=model_cfg.PARE.NUM_COATTENTION_ITER,
                coattention_conv=model_cfg.PARE.COATTENTION_CONV,
                use_upsampling=model_cfg.PARE.USE_UPSAMPLING,
                deconv_conv_kernel_size=model_cfg.PARE.DECONV_CONV_KERNEL_SIZE,
                use_soft_attention=model_cfg.PARE.USE_SOFT_ATTENTION,
                num_branch_iteration=model_cfg.PARE.NUM_BRANCH_ITERATION,
                branch_deeper=model_cfg.PARE.BRANCH_DEEPER,
                num_deconv_layers=model_cfg.PARE.NUM_DECONV_LAYERS,
                num_deconv_filters=model_cfg.PARE.NUM_DECONV_FILTERS,
                use_resnet_conv_hrnet=model_cfg.PARE.USE_RESNET_CONV_HRNET,
                use_position_encodings=model_cfg.PARE.USE_POS_ENC,
                use_mean_camshape=model_cfg.PARE.USE_MEAN_CAMSHAPE,
                use_mean_pose=model_cfg.PARE.USE_MEAN_POSE,
                init_xavier=model_cfg.PARE.INIT_XAVIER,
            ).to(self.device)
        else:
            raise ValueError(f'Unknown METHOD: {model_cfg.METHOD}')

        return model

    def _load_pretrained_model(self, ckpt):
        # ========= Load pretrained weights ========= #
        state_dict = torch.load(ckpt, map_location='cpu')['state_dict']
        new_state_dict = {}
        for pk in state_dict.keys():
            if pk.startswith('model.'):
                new_state_dict[pk.replace('model.', '')] = state_dict[pk]
            else:
                new_state_dict[pk] = state_dict[pk]

        self.model.load_state_dict(new_state_dict, strict=False)


if __name__ == '__main__':
    pass
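
# A minimal usage sketch (not part of the original file): a single forward pass
# of the PARE wrapper on a random crop. This assumes the HRNet-W32 ImageNet
# checkpoint is available where hrnet_w32(pretrained=True) expects it; the head
# weights stay random, so only the output shapes are meaningful.
def _demo_forward():
    model = PARE(backbone='hrnet_w32-conv', num_joints=24)
    model.eval()
    with torch.no_grad():
        out = model(torch.rand(1, 3, 224, 224))
    for k, v in out.items():
        print(k, v.shape)  # Rh (1, 3), Th (1, 3), poses (1, 69), shapes (1, 10)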
722
myeasymocap/backbone/pare/utils/geometry.py
Normal file
@ -0,0 +1,722 @@
|
|||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
from torch.nn import functional as F
|
||||||
|
|
||||||
|
"""
|
||||||
|
Useful geometric operations, e.g. Perspective projection and a differentiable Rodrigues formula
|
||||||
|
Parts of the code are taken from https://github.com/MandyMo/pytorch_HMR
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def batch_rot2aa(Rs):
|
||||||
|
"""
|
||||||
|
Rs is B x 3 x 3
|
||||||
|
void cMathUtil::RotMatToAxisAngle(const tMatrix& mat, tVector& out_axis,
|
||||||
|
double& out_theta)
|
||||||
|
{
|
||||||
|
double c = 0.5 * (mat(0, 0) + mat(1, 1) + mat(2, 2) - 1);
|
||||||
|
c = cMathUtil::Clamp(c, -1.0, 1.0);
|
||||||
|
|
||||||
|
out_theta = std::acos(c);
|
||||||
|
|
||||||
|
if (std::abs(out_theta) < 0.00001)
|
||||||
|
{
|
||||||
|
out_axis = tVector(0, 0, 1, 0);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
double m21 = mat(2, 1) - mat(1, 2);
|
||||||
|
double m02 = mat(0, 2) - mat(2, 0);
|
||||||
|
double m10 = mat(1, 0) - mat(0, 1);
|
||||||
|
double denom = std::sqrt(m21 * m21 + m02 * m02 + m10 * m10);
|
||||||
|
out_axis[0] = m21 / denom;
|
||||||
|
out_axis[1] = m02 / denom;
|
||||||
|
out_axis[2] = m10 / denom;
|
||||||
|
out_axis[3] = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
cos = 0.5 * (torch.stack([torch.trace(x) for x in Rs]) - 1)
|
||||||
|
cos = torch.clamp(cos, -1, 1)
|
||||||
|
|
||||||
|
theta = torch.acos(cos)
|
||||||
|
|
||||||
|
m21 = Rs[:, 2, 1] - Rs[:, 1, 2]
|
||||||
|
m02 = Rs[:, 0, 2] - Rs[:, 2, 0]
|
||||||
|
m10 = Rs[:, 1, 0] - Rs[:, 0, 1]
|
||||||
|
denom = torch.sqrt(m21 * m21 + m02 * m02 + m10 * m10)
|
||||||
|
|
||||||
|
axis0 = torch.where(torch.abs(theta) < 0.00001, m21, m21 / denom)
|
||||||
|
axis1 = torch.where(torch.abs(theta) < 0.00001, m02, m02 / denom)
|
||||||
|
axis2 = torch.where(torch.abs(theta) < 0.00001, m10, m10 / denom)
|
||||||
|
|
||||||
|
return theta.unsqueeze(1) * torch.stack([axis0, axis1, axis2], 1)
|
||||||
|
|
||||||
|
|
||||||
|
def batch_rodrigues(theta):
    """Convert axis-angle representation to rotation matrix.
    Args:
        theta: size = [B, 3]
    Returns:
        Rotation matrix corresponding to the quaternion -- size = [B, 3, 3]
    """
    l1norm = torch.norm(theta + 1e-8, p=2, dim=1)
    angle = torch.unsqueeze(l1norm, -1)
    normalized = torch.div(theta, angle)
    angle = angle * 0.5
    v_cos = torch.cos(angle)
    v_sin = torch.sin(angle)
    quat = torch.cat([v_cos, v_sin * normalized], dim=1)
    return quat_to_rotmat(quat)

def quat_to_rotmat(quat):
    """Convert quaternion coefficients to rotation matrix.
    Args:
        quat: size = [B, 4] 4 <===> (w, x, y, z)
    Returns:
        Rotation matrix corresponding to the quaternion -- size = [B, 3, 3]
    """
    norm_quat = quat
    norm_quat = norm_quat / norm_quat.norm(p=2, dim=1, keepdim=True)
    w, x, y, z = norm_quat[:, 0], norm_quat[:, 1], norm_quat[:, 2], norm_quat[:, 3]

    B = quat.size(0)

    w2, x2, y2, z2 = w.pow(2), x.pow(2), y.pow(2), z.pow(2)
    wx, wy, wz = w * x, w * y, w * z
    xy, xz, yz = x * y, x * z, y * z

    rotMat = torch.stack([w2 + x2 - y2 - z2, 2 * xy - 2 * wz, 2 * wy + 2 * xz,
                          2 * wz + 2 * xy, w2 - x2 + y2 - z2, 2 * yz - 2 * wx,
                          2 * xz - 2 * wy, 2 * wx + 2 * yz, w2 - x2 - y2 + z2], dim=1).view(B, 3, 3)
    return rotMat

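# Illustrative check (added for exposition, not part of the original file):
# the identity quaternion (w, x, y, z) = (1, 0, 0, 0) maps to the identity
# matrix. Never called by library code.
def _example_quat_identity():
    q = torch.tensor([[1.0, 0.0, 0.0, 0.0]])
    assert torch.allclose(quat_to_rotmat(q), torch.eye(3).unsqueeze(0))
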
def rot6d_to_rotmat(x):
    """Convert 6D rotation representation to 3x3 rotation matrix.
    Based on Zhou et al., "On the Continuity of Rotation Representations in Neural Networks", CVPR 2019
    Input:
        (B,6) Batch of 6-D rotation representations
    Output:
        (B,3,3) Batch of corresponding rotation matrices
    """
    x = x.reshape(-1, 3, 2)
    a1 = x[:, :, 0]
    a2 = x[:, :, 1]
    b1 = F.normalize(a1)
    b2 = F.normalize(a2 - torch.einsum('bi,bi->b', b1, a2).unsqueeze(-1) * b1)
    b3 = torch.cross(b1, b2)
    return torch.stack((b1, b2, b3), dim=-1)


def rotmat_to_rot6d(x):
    rotmat = x.reshape(-1, 3, 3)
    rot6d = rotmat[:, :, :2].reshape(x.shape[0], -1)
    return rot6d

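# Illustrative round-trip check (added for exposition, not part of the
# original file): rot6d_to_rotmat inverts rotmat_to_rot6d for valid
# rotations, since Gram-Schmidt leaves orthonormal columns unchanged.
def _example_rot6d_roundtrip():
    R = batch_rodrigues(torch.tensor([[0.1, 0.2, 0.3]]))  # (1, 3, 3)
    R_rec = rot6d_to_rotmat(rotmat_to_rot6d(R))
    assert torch.allclose(R, R_rec, atol=1e-5)
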
def rotation_matrix_to_angle_axis(rotation_matrix):
    """
    This function is borrowed from https://github.com/kornia/kornia

    Convert 3x4 rotation matrix to Rodrigues vector

    Args:
        rotation_matrix (Tensor): rotation matrix.

    Returns:
        Tensor: Rodrigues vector transformation.

    Shape:
        - Input: :math:`(N, 3, 4)`
        - Output: :math:`(N, 3)`

    Example:
        >>> input = torch.rand(2, 3, 4)  # Nx3x4
        >>> output = tgm.rotation_matrix_to_angle_axis(input)  # Nx3
    """
    if rotation_matrix.shape[1:] == (3, 3):
        rot_mat = rotation_matrix.reshape(-1, 3, 3)
        hom = torch.tensor([0, 0, 1], dtype=torch.float32,
                           device=rotation_matrix.device).reshape(1, 3, 1).expand(rot_mat.shape[0], -1, -1)
        rotation_matrix = torch.cat([rot_mat, hom], dim=-1)

    quaternion = rotation_matrix_to_quaternion(rotation_matrix)
    aa = quaternion_to_angle_axis(quaternion)
    aa[torch.isnan(aa)] = 0.0
    return aa

def quaternion_to_angle_axis(quaternion: torch.Tensor) -> torch.Tensor:
    """
    This function is borrowed from https://github.com/kornia/kornia

    Convert quaternion vector to angle axis of rotation.

    Adapted from ceres C++ library: ceres-solver/include/ceres/rotation.h

    Args:
        quaternion (torch.Tensor): tensor with quaternions.

    Return:
        torch.Tensor: tensor with angle axis of rotation.

    Shape:
        - Input: :math:`(*, 4)` where `*` means, any number of dimensions
        - Output: :math:`(*, 3)`

    Example:
        >>> quaternion = torch.rand(2, 4)  # Nx4
        >>> angle_axis = tgm.quaternion_to_angle_axis(quaternion)  # Nx3
    """
    if not torch.is_tensor(quaternion):
        raise TypeError("Input type is not a torch.Tensor. Got {}".format(
            type(quaternion)))

    if not quaternion.shape[-1] == 4:
        raise ValueError("Input must be a tensor of shape Nx4 or 4. Got {}"
                         .format(quaternion.shape))
    # unpack input and compute conversion
    q1: torch.Tensor = quaternion[..., 1]
    q2: torch.Tensor = quaternion[..., 2]
    q3: torch.Tensor = quaternion[..., 3]
    sin_squared_theta: torch.Tensor = q1 * q1 + q2 * q2 + q3 * q3

    sin_theta: torch.Tensor = torch.sqrt(sin_squared_theta)
    cos_theta: torch.Tensor = quaternion[..., 0]
    two_theta: torch.Tensor = 2.0 * torch.where(
        cos_theta < 0.0,
        torch.atan2(-sin_theta, -cos_theta),
        torch.atan2(sin_theta, cos_theta))

    k_pos: torch.Tensor = two_theta / sin_theta
    k_neg: torch.Tensor = 2.0 * torch.ones_like(sin_theta)
    k: torch.Tensor = torch.where(sin_squared_theta > 0.0, k_pos, k_neg)

    angle_axis: torch.Tensor = torch.zeros_like(quaternion)[..., :3]
    angle_axis[..., 0] += q1 * k
    angle_axis[..., 1] += q2 * k
    angle_axis[..., 2] += q3 * k
    return angle_axis

def rotation_matrix_to_quaternion(rotation_matrix, eps=1e-6):
    """
    This function is borrowed from https://github.com/kornia/kornia

    Convert 3x4 rotation matrix to 4d quaternion vector

    This algorithm is based on algorithm described in
    https://github.com/KieranWynn/pyquaternion/blob/master/pyquaternion/quaternion.py#L201

    Args:
        rotation_matrix (Tensor): the rotation matrix to convert.

    Return:
        Tensor: the rotation in quaternion

    Shape:
        - Input: :math:`(N, 3, 4)`
        - Output: :math:`(N, 4)`

    Example:
        >>> input = torch.rand(4, 3, 4)  # Nx3x4
        >>> output = tgm.rotation_matrix_to_quaternion(input)  # Nx4
    """
    if not torch.is_tensor(rotation_matrix):
        raise TypeError("Input type is not a torch.Tensor. Got {}".format(
            type(rotation_matrix)))

    if len(rotation_matrix.shape) > 3:
        raise ValueError(
            "Input size must be a three dimensional tensor. Got {}".format(
                rotation_matrix.shape))
    if not rotation_matrix.shape[-2:] == (3, 4):
        raise ValueError(
            "Input size must be a N x 3 x 4 tensor. Got {}".format(
                rotation_matrix.shape))

    rmat_t = torch.transpose(rotation_matrix, 1, 2)

    mask_d2 = rmat_t[:, 2, 2] < eps

    mask_d0_d1 = rmat_t[:, 0, 0] > rmat_t[:, 1, 1]
    mask_d0_nd1 = rmat_t[:, 0, 0] < -rmat_t[:, 1, 1]

    t0 = 1 + rmat_t[:, 0, 0] - rmat_t[:, 1, 1] - rmat_t[:, 2, 2]
    q0 = torch.stack([rmat_t[:, 1, 2] - rmat_t[:, 2, 1],
                      t0, rmat_t[:, 0, 1] + rmat_t[:, 1, 0],
                      rmat_t[:, 2, 0] + rmat_t[:, 0, 2]], -1)
    t0_rep = t0.repeat(4, 1).t()

    t1 = 1 - rmat_t[:, 0, 0] + rmat_t[:, 1, 1] - rmat_t[:, 2, 2]
    q1 = torch.stack([rmat_t[:, 2, 0] - rmat_t[:, 0, 2],
                      rmat_t[:, 0, 1] + rmat_t[:, 1, 0],
                      t1, rmat_t[:, 1, 2] + rmat_t[:, 2, 1]], -1)
    t1_rep = t1.repeat(4, 1).t()

    t2 = 1 - rmat_t[:, 0, 0] - rmat_t[:, 1, 1] + rmat_t[:, 2, 2]
    q2 = torch.stack([rmat_t[:, 0, 1] - rmat_t[:, 1, 0],
                      rmat_t[:, 2, 0] + rmat_t[:, 0, 2],
                      rmat_t[:, 1, 2] + rmat_t[:, 2, 1], t2], -1)
    t2_rep = t2.repeat(4, 1).t()

    t3 = 1 + rmat_t[:, 0, 0] + rmat_t[:, 1, 1] + rmat_t[:, 2, 2]
    q3 = torch.stack([t3, rmat_t[:, 1, 2] - rmat_t[:, 2, 1],
                      rmat_t[:, 2, 0] - rmat_t[:, 0, 2],
                      rmat_t[:, 0, 1] - rmat_t[:, 1, 0]], -1)
    t3_rep = t3.repeat(4, 1).t()

    mask_c0 = mask_d2 * mask_d0_d1
    mask_c1 = mask_d2 * ~mask_d0_d1
    mask_c2 = ~mask_d2 * mask_d0_nd1
    mask_c3 = ~mask_d2 * ~mask_d0_nd1
    mask_c0 = mask_c0.view(-1, 1).type_as(q0)
    mask_c1 = mask_c1.view(-1, 1).type_as(q1)
    mask_c2 = mask_c2.view(-1, 1).type_as(q2)
    mask_c3 = mask_c3.view(-1, 1).type_as(q3)

    q = q0 * mask_c0 + q1 * mask_c1 + q2 * mask_c2 + q3 * mask_c3
    q /= torch.sqrt(t0_rep * mask_c0 + t1_rep * mask_c1 +  # noqa
                    t2_rep * mask_c2 + t3_rep * mask_c3)  # noqa
    q *= 0.5
    return q

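# Illustrative check (added for exposition, not part of the original file):
# the identity rotation, padded to the expected N x 3 x 4 shape, maps to the
# identity quaternion (w, x, y, z) = (1, 0, 0, 0).
def _example_rotmat_to_quaternion():
    R34 = torch.cat([torch.eye(3).unsqueeze(0), torch.zeros(1, 3, 1)], dim=-1)
    q = rotation_matrix_to_quaternion(R34)   # (1, 4)
    assert torch.allclose(q, torch.tensor([[1.0, 0.0, 0.0, 0.0]]))
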
def convert_perspective_to_weak_perspective(
        perspective_camera,
        focal_length=5000.,
        img_res=224,
):
    # Convert a perspective camera translation [tx, ty, tz] to a weak
    # perspective camera [s, tx, ty] given the bounding box size
    # (the inverse of convert_weak_perspective_to_perspective below)
    # if isinstance(focal_length, torch.Tensor):
    #     focal_length = focal_length[:, 0]

    weak_perspective_camera = torch.stack(
        [
            2 * focal_length / (img_res * perspective_camera[:, 2] + 1e-9),
            perspective_camera[:, 0],
            perspective_camera[:, 1],
        ],
        dim=-1
    )
    return weak_perspective_camera

def convert_weak_perspective_to_perspective(
        weak_perspective_camera,
        focal_length=5000.,
        img_res=224,
):
    # Convert Weak Perspective Camera [s, tx, ty] to camera translation [tx, ty, tz]
    # in 3D given the bounding box size
    # This camera translation can be used in a full perspective projection
    # if isinstance(focal_length, torch.Tensor):
    #     focal_length = focal_length[:, 0]

    perspective_camera = torch.stack(
        [
            weak_perspective_camera[:, 1],
            weak_perspective_camera[:, 2],
            2 * focal_length / (img_res * weak_perspective_camera[:, 0] + 1e-9)
        ],
        dim=-1
    )
    return perspective_camera

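# Illustrative check (added for exposition, not part of the original file):
# the two conversions above are mutual inverses for a positive scale s.
def _example_camera_roundtrip():
    wp = torch.tensor([[1.2, 0.05, -0.1]])                # [s, tx, ty]
    t = convert_weak_perspective_to_perspective(wp)       # [tx, ty, tz]
    assert torch.allclose(convert_perspective_to_weak_perspective(t), wp,
                          atol=1e-4)
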
def perspective_projection(points, rotation, translation,
                           focal_length, camera_center):
    """
    This function computes the perspective projection of a set of points.
    Input:
        points (bs, N, 3): 3D points
        rotation (bs, 3, 3): Camera rotation
        translation (bs, 3): Camera translation
        focal_length (bs,) or scalar: Focal length
        camera_center (bs, 2): Camera center
    """
    batch_size = points.shape[0]
    K = torch.zeros([batch_size, 3, 3], device=points.device)
    K[:, 0, 0] = focal_length
    K[:, 1, 1] = focal_length
    K[:, 2, 2] = 1.
    K[:, :-1, -1] = camera_center

    # Transform points
    points = torch.einsum('bij,bkj->bki', rotation, points)
    points = points + translation.unsqueeze(1)

    # Apply perspective distortion
    projected_points = points / points[:, :, -1].unsqueeze(-1)

    # Apply camera intrinsics
    projected_points = torch.einsum('bij,bkj->bki', K, projected_points)

    return projected_points[:, :, :-1]

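# Illustrative usage sketch (added for exposition, not part of the original
# file): project a batch of points in front of an identity camera.
def _example_perspective_projection():
    points = torch.randn(2, 10, 3) + torch.tensor([0., 0., 5.])  # keep z > 0
    rotation = torch.eye(3).unsqueeze(0).expand(2, -1, -1)
    translation = torch.zeros(2, 3)
    camera_center = torch.full((2, 2), 112.)   # principal point in pixels
    uv = perspective_projection(points, rotation, translation,
                                focal_length=5000., camera_center=camera_center)
    assert uv.shape == (2, 10, 2)
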
def weak_perspective_projection(points, rotation, weak_cam_params, focal_length, camera_center, img_res):
    """
    This function computes the perspective projection of a set of points,
    first converting the weak-perspective camera to a translation.
    Input:
        points (bs, N, 3): 3D points
        rotation (bs, 3, 3): Camera rotation
        weak_cam_params (bs, 3): Weak perspective camera [s, tx, ty]
        focal_length (bs,) or scalar: Focal length
        camera_center (bs, 2): Camera center
    """
    batch_size = points.shape[0]
    K = torch.zeros([batch_size, 3, 3], device=points.device)
    K[:, 0, 0] = focal_length
    K[:, 1, 1] = focal_length
    K[:, 2, 2] = 1.
    K[:, :-1, -1] = camera_center

    translation = convert_weak_perspective_to_perspective(weak_cam_params, focal_length, img_res)

    # Transform points
    points = torch.einsum('bij,bkj->bki', rotation, points)
    points = points + translation.unsqueeze(1)

    # Apply perspective distortion
    projected_points = points / points[:, :, -1].unsqueeze(-1)

    # Apply camera intrinsics
    projected_points = torch.einsum('bij,bkj->bki', K, projected_points)

    return projected_points[:, :, :-1]

def estimate_translation_np(S, joints_2d, joints_conf, focal_length=5000., img_size=224.):
    """Find camera translation that brings 3D joints S closest to the corresponding 2D joints joints_2d.
    Input:
        S: (25, 3) 3D joint locations
        joints: (25, 3) 2D joint locations and confidence
    Returns:
        (3,) camera translation vector
    """

    num_joints = S.shape[0]
    # focal length
    f = np.array([focal_length, focal_length])
    # optical center
    center = np.array([img_size / 2., img_size / 2.])

    # transformations
    Z = np.reshape(np.tile(S[:, 2], (2, 1)).T, -1)
    XY = np.reshape(S[:, 0:2], -1)
    O = np.tile(center, num_joints)
    F = np.tile(f, num_joints)
    weight2 = np.reshape(np.tile(np.sqrt(joints_conf), (2, 1)).T, -1)

    # least squares
    Q = np.array([F * np.tile(np.array([1, 0]), num_joints),
                  F * np.tile(np.array([0, 1]), num_joints),
                  O - np.reshape(joints_2d, -1)]).T
    c = (np.reshape(joints_2d, -1) - O) * Z - F * XY

    # weighted least squares
    W = np.diagflat(weight2)
    Q = np.dot(W, Q)
    c = np.dot(W, c)

    # square matrix
    A = np.dot(Q.T, Q)
    b = np.dot(Q.T, c)

    # solution
    trans = np.linalg.solve(A, b)

    return trans

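# Note (added for exposition, not part of the original file): each joint
# contributes two rows to the linear system above, encoding the projection
# equations f * (X + tx) = (px - cx) * (Z + tz) and
# f * (Y + ty) = (py - cy) * (Z + tz), so solving the weighted normal
# equations yields the translation that best reprojects S onto joints_2d.
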
def estimate_translation(S, joints_2d, focal_length=5000., img_size=224., use_all_joints=False, rotation=None):
    """Find camera translation that brings 3D joints S closest to the corresponding 2D joints joints_2d.
    Input:
        S: (B, 49, 3) 3D joint locations
        joints: (B, 49, 3) 2D joint locations and confidence
    Returns:
        (B, 3) camera translation vectors
    """

    device = S.device

    if rotation is not None:
        S = torch.einsum('bij,bkj->bki', rotation, S)

    # Use only joints 25:49 (GT joints)
    if use_all_joints:
        S = S.cpu().numpy()
        joints_2d = joints_2d.cpu().numpy()
    else:
        S = S[:, 25:, :].cpu().numpy()
        joints_2d = joints_2d[:, 25:, :].cpu().numpy()

    joints_conf = joints_2d[:, :, -1]
    joints_2d = joints_2d[:, :, :-1]
    trans = np.zeros((S.shape[0], 3), dtype=np.float32)
    # Find the translation for each example in the batch
    for i in range(S.shape[0]):
        S_i = S[i]
        joints_i = joints_2d[i]
        conf_i = joints_conf[i]
        trans[i] = estimate_translation_np(S_i, joints_i, conf_i, focal_length=focal_length, img_size=img_size)
    return torch.from_numpy(trans).to(device)

def estimate_translation_cam(S, joints_2d, focal_length=(5000., 5000.), img_size=(224., 224.),
                             use_all_joints=False, rotation=None):
    """Find camera translation that brings 3D joints S closest to the corresponding 2D joints joints_2d.
    Input:
        S: (B, 49, 3) 3D joint locations
        joints: (B, 49, 3) 2D joint locations and confidence
    Returns:
        (B, 3) camera translation vectors
    """

    def estimate_translation_np(S, joints_2d, joints_conf, focal_length=(5000., 5000.), img_size=(224., 224.)):
        """Find camera translation that brings 3D joints S closest to the corresponding 2D joints joints_2d.
        Input:
            S: (25, 3) 3D joint locations
            joints: (25, 3) 2D joint locations and confidence
        Returns:
            (3,) camera translation vector
        """

        num_joints = S.shape[0]
        # focal length
        f = np.array([focal_length[0], focal_length[1]])
        # optical center
        center = np.array([img_size[0] / 2., img_size[1] / 2.])

        # transformations
        Z = np.reshape(np.tile(S[:, 2], (2, 1)).T, -1)
        XY = np.reshape(S[:, 0:2], -1)
        O = np.tile(center, num_joints)
        F = np.tile(f, num_joints)
        weight2 = np.reshape(np.tile(np.sqrt(joints_conf), (2, 1)).T, -1)

        # least squares
        Q = np.array([F * np.tile(np.array([1, 0]), num_joints),
                      F * np.tile(np.array([0, 1]), num_joints),
                      O - np.reshape(joints_2d, -1)]).T
        c = (np.reshape(joints_2d, -1) - O) * Z - F * XY

        # weighted least squares
        W = np.diagflat(weight2)
        Q = np.dot(W, Q)
        c = np.dot(W, c)

        # square matrix
        A = np.dot(Q.T, Q)
        b = np.dot(Q.T, c)

        # solution
        trans = np.linalg.solve(A, b)

        return trans

    device = S.device

    if rotation is not None:
        S = torch.einsum('bij,bkj->bki', rotation, S)

    # Use only joints 25:49 (GT joints)
    if use_all_joints:
        S = S.cpu().numpy()
        joints_2d = joints_2d.cpu().numpy()
    else:
        S = S[:, 25:, :].cpu().numpy()
        joints_2d = joints_2d[:, 25:, :].cpu().numpy()

    joints_conf = joints_2d[:, :, -1]
    joints_2d = joints_2d[:, :, :-1]
    trans = np.zeros((S.shape[0], 3), dtype=np.float32)
    # Find the translation for each example in the batch
    for i in range(S.shape[0]):
        S_i = S[i]
        joints_i = joints_2d[i]
        conf_i = joints_conf[i]
        trans[i] = estimate_translation_np(S_i, joints_i, conf_i, focal_length=focal_length, img_size=img_size)
    return torch.from_numpy(trans).to(device)

def get_coord_maps(size=56):
    xx_ones = torch.ones([1, size], dtype=torch.int32)
    xx_ones = xx_ones.unsqueeze(-1)

    xx_range = torch.arange(size, dtype=torch.int32).unsqueeze(0)
    xx_range = xx_range.unsqueeze(1)

    xx_channel = torch.matmul(xx_ones, xx_range)
    xx_channel = xx_channel.unsqueeze(-1)

    yy_ones = torch.ones([1, size], dtype=torch.int32)
    yy_ones = yy_ones.unsqueeze(1)

    yy_range = torch.arange(size, dtype=torch.int32).unsqueeze(0)
    yy_range = yy_range.unsqueeze(-1)

    yy_channel = torch.matmul(yy_range, yy_ones)
    yy_channel = yy_channel.unsqueeze(-1)

    xx_channel = xx_channel.permute(0, 3, 1, 2)
    yy_channel = yy_channel.permute(0, 3, 1, 2)

    xx_channel = xx_channel.float() / (size - 1)
    yy_channel = yy_channel.float() / (size - 1)

    xx_channel = xx_channel * 2 - 1
    yy_channel = yy_channel * 2 - 1

    out = torch.cat([xx_channel, yy_channel], dim=1)
    return out

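# Illustrative check (added for exposition, not part of the original file):
# the coordinate maps cover [-1, 1] in the (1, 2, H, W) layout that
# CoordConv-style layers expect.
def _example_coord_maps():
    cm = get_coord_maps(size=56)
    assert cm.shape == (1, 2, 56, 56)
    assert cm.min() == -1 and cm.max() == 1
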
def look_at(eye, at=np.array([0, 0, 0]), up=np.array([0, 0, 1]), eps=1e-5):
    at = at.astype(float).reshape(1, 3)
    up = up.astype(float).reshape(1, 3)

    eye = eye.reshape(-1, 3)
    up = up.repeat(eye.shape[0] // up.shape[0], axis=0)
    eps = np.array([eps]).reshape(1, 1).repeat(up.shape[0], axis=0)

    z_axis = eye - at
    z_axis /= np.max(np.stack([np.linalg.norm(z_axis, axis=1, keepdims=True), eps]))

    x_axis = np.cross(up, z_axis)
    x_axis /= np.max(np.stack([np.linalg.norm(x_axis, axis=1, keepdims=True), eps]))

    y_axis = np.cross(z_axis, x_axis)
    y_axis /= np.max(np.stack([np.linalg.norm(y_axis, axis=1, keepdims=True), eps]))

    r_mat = np.concatenate((x_axis.reshape(-1, 3, 1), y_axis.reshape(-1, 3, 1), z_axis.reshape(-1, 3, 1)), axis=2)

    return r_mat

def to_sphere(u, v):
    theta = 2 * np.pi * u
    phi = np.arccos(1 - 2 * v)
    cx = np.sin(phi) * np.cos(theta)
    cy = np.sin(phi) * np.sin(theta)
    cz = np.cos(phi)
    s = np.stack([cx, cy, cz])
    return s


def sample_on_sphere(range_u=(0, 1), range_v=(0, 1)):
    u = np.random.uniform(*range_u)
    v = np.random.uniform(*range_v)
    return to_sphere(u, v)

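# Illustrative check (added for exposition, not part of the original file):
# samples drawn by sample_on_sphere lie on the unit sphere, so every sample
# has unit norm.
def _example_sample_on_sphere():
    s = sample_on_sphere()
    assert np.isclose(np.linalg.norm(s), 1.0)
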
def sample_pose_on_sphere(range_v=(0, 1), range_u=(0, 1), radius=1, up=[0, 1, 0]):
    # sample location on unit sphere
    loc = sample_on_sphere(range_u, range_v)

    # sample radius if necessary
    if isinstance(radius, tuple):
        radius = np.random.uniform(*radius)

    loc = loc * radius
    R = look_at(loc, up=np.array(up))[0]

    RT = np.concatenate([R, loc.reshape(3, 1)], axis=1)
    RT = torch.Tensor(RT.astype(np.float32))
    return RT

def rectify_pose(camera_r, body_aa, rotate_x=False):
    body_r = batch_rodrigues(body_aa).reshape(-1, 3, 3)

    if rotate_x:
        rotate_x = torch.tensor([[[1.0, 0.0, 0.0], [0.0, -1.0, 0.0], [0.0, 0.0, -1.0]]])
        body_r = body_r @ rotate_x

    final_r = camera_r @ body_r
    body_aa = batch_rot2aa(final_r)
    return body_aa

def batch_euler2matrix(r):
    return quaternion_to_rotation_matrix(euler_to_quaternion(r))

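# Illustrative check (added for exposition, not part of the original file):
# zero Euler angles give the identity rotation.
def _example_euler_identity():
    assert torch.allclose(batch_euler2matrix(torch.zeros(1, 3)),
                          torch.eye(3).unsqueeze(0))
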
def euler_to_quaternion(r):
    x = r[..., 0]
    y = r[..., 1]
    z = r[..., 2]

    z = z / 2.0
    y = y / 2.0
    x = x / 2.0
    cz = torch.cos(z)
    sz = torch.sin(z)
    cy = torch.cos(y)
    sy = torch.sin(y)
    cx = torch.cos(x)
    sx = torch.sin(x)
    quaternion = torch.zeros_like(r.repeat(1, 2))[..., :4].to(r.device)
    quaternion[..., 0] += cx * cy * cz - sx * sy * sz
    quaternion[..., 1] += cx * sy * sz + cy * cz * sx
    quaternion[..., 2] += cx * cz * sy - sx * cy * sz
    quaternion[..., 3] += cx * cy * sz + sx * cz * sy
    return quaternion

def quaternion_to_rotation_matrix(quat):
    """Convert quaternion coefficients to rotation matrix.
    Args:
        quat: size = [B, 4] 4 <===> (w, x, y, z)
    Returns:
        Rotation matrix corresponding to the quaternion -- size = [B, 3, 3]
    """
    norm_quat = quat
    norm_quat = norm_quat / norm_quat.norm(p=2, dim=1, keepdim=True)
    w, x, y, z = norm_quat[:, 0], norm_quat[:, 1], norm_quat[:, 2], norm_quat[:, 3]

    B = quat.size(0)

    w2, x2, y2, z2 = w.pow(2), x.pow(2), y.pow(2), z.pow(2)
    wx, wy, wz = w * x, w * y, w * z
    xy, xz, yz = x * y, x * z, y * z

    rotMat = torch.stack([w2 + x2 - y2 - z2, 2 * xy - 2 * wz, 2 * wy + 2 * xz,
                          2 * wz + 2 * xy, w2 - x2 + y2 - z2, 2 * yz - 2 * wx,
                          2 * xz - 2 * wy, 2 * wx + 2 * yz, w2 - x2 - y2 + z2], dim=1).view(B, 3, 3)
    return rotMat

def euler_angles_from_rotmat(R):
    """
    Compute Euler angles for rotation around the x, y, z axes
    from a rotation matrix
    R: 4x4 rotation matrix
    https://www.gregslabaugh.net/publications/euler.pdf
    """
    r21 = np.round(R[:, 2, 0].item(), 4)
    if abs(r21) != 1:
        y_angle1 = -1 * torch.asin(R[:, 2, 0])
        y_angle2 = math.pi + torch.asin(R[:, 2, 0])
        cy1, cy2 = torch.cos(y_angle1), torch.cos(y_angle2)

        x_angle1 = torch.atan2(R[:, 2, 1] / cy1, R[:, 2, 2] / cy1)
        x_angle2 = torch.atan2(R[:, 2, 1] / cy2, R[:, 2, 2] / cy2)
        z_angle1 = torch.atan2(R[:, 1, 0] / cy1, R[:, 0, 0] / cy1)
        z_angle2 = torch.atan2(R[:, 1, 0] / cy2, R[:, 0, 0] / cy2)

        s1 = (x_angle1, y_angle1, z_angle1)
        s2 = (x_angle2, y_angle2, z_angle2)
        s = (s1, s2)

    else:
        z_angle = torch.tensor([0], device=R.device).float()
        if r21 == -1:
            y_angle = torch.tensor([math.pi / 2], device=R.device).float()
            x_angle = z_angle + torch.atan2(R[:, 0, 1], R[:, 0, 2])
        else:
            y_angle = -torch.tensor([math.pi / 2], device=R.device).float()
            x_angle = -z_angle + torch.atan2(-R[:, 0, 1], R[:, 0, 2])
        s = ((x_angle, y_angle, z_angle),)
    return s
1130
myeasymocap/backbone/pare/utils/kp_utils.py
Normal file
File diff suppressed because it is too large