diff --git a/config/1v1p/hrnet_pare_finetune.yml b/config/1v1p/hrnet_pare_finetune.yml index 6dbfcd4..2228e9e 100644 --- a/config/1v1p/hrnet_pare_finetune.yml +++ b/config/1v1p/hrnet_pare_finetune.yml @@ -14,7 +14,7 @@ args: key_from_previous: [bbox] key_keep: [] args: - ckpt: /nas/home/shuaiqing/Code/EasyMocapPublic/data/models/pose_hrnet_w48_384x288.pth + ckpt: data/models/pose_hrnet_w48_384x288.pth vis2d: module: myeasymocap.io.vis.Vis2D skip: False diff --git a/myeasymocap/backbone/pare/backbone/__init__.py b/myeasymocap/backbone/pare/backbone/__init__.py new file mode 100644 index 0000000..327e346 --- /dev/null +++ b/myeasymocap/backbone/pare/backbone/__init__.py @@ -0,0 +1,3 @@ +# from .hrnet_pare import * +from .resnet import * +from .mobilenet import * \ No newline at end of file diff --git a/myeasymocap/backbone/pare/backbone/hrnet.py b/myeasymocap/backbone/pare/backbone/hrnet.py new file mode 100644 index 0000000..d4bca12 --- /dev/null +++ b/myeasymocap/backbone/pare/backbone/hrnet.py @@ -0,0 +1,631 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ +import os + +import torch +import torch.nn as nn +# from loguru import logger +import torch.nn.functional as F +from yacs.config import CfgNode as CN + +models = [ + 'hrnet_w32', + 'hrnet_w48', +] + +BN_MOMENTUM = 0.1 + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class HighResolutionModule(nn.Module): + def __init__(self, 
num_branches, blocks, num_blocks, num_inchannels, + num_channels, fuse_method, multi_scale_output=True): + super(HighResolutionModule, self).__init__() + self._check_branches( + num_branches, blocks, num_blocks, num_inchannels, num_channels) + + self.num_inchannels = num_inchannels + self.fuse_method = fuse_method + self.num_branches = num_branches + + self.multi_scale_output = multi_scale_output + + self.branches = self._make_branches( + num_branches, blocks, num_blocks, num_channels) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(True) + + def _check_branches(self, num_branches, blocks, num_blocks, + num_inchannels, num_channels): + if num_branches != len(num_blocks): + error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format( + num_branches, len(num_blocks)) + logger.error(error_msg) + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format( + num_branches, len(num_channels)) + logger.error(error_msg) + raise ValueError(error_msg) + + if num_branches != len(num_inchannels): + error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format( + num_branches, len(num_inchannels)) + logger.error(error_msg) + raise ValueError(error_msg) + + def _make_one_branch(self, branch_index, block, num_blocks, num_channels, + stride=1): + downsample = None + if stride != 1 or \ + self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.num_inchannels[branch_index], + num_channels[branch_index] * block.expansion, + kernel_size=1, stride=stride, bias=False + ), + nn.BatchNorm2d( + num_channels[branch_index] * block.expansion, + momentum=BN_MOMENTUM + ), + ) + + layers = [] + layers.append( + block( + self.num_inchannels[branch_index], + num_channels[branch_index], + stride, + downsample + ) + ) + self.num_inchannels[branch_index] = \ + num_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append( + block( + self.num_inchannels[branch_index], + num_channels[branch_index] + ) + ) + + return nn.Sequential(*layers) + + def _make_branches(self, num_branches, block, num_blocks, num_channels): + branches = [] + + for i in range(num_branches): + branches.append( + self._make_one_branch(i, block, num_blocks, num_channels) + ) + + return nn.ModuleList(branches) + + def _make_fuse_layers(self): + if self.num_branches == 1: + return None + + num_branches = self.num_branches + num_inchannels = self.num_inchannels + fuse_layers = [] + for i in range(num_branches if self.multi_scale_output else 1): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + nn.Conv2d( + num_inchannels[j], + num_inchannels[i], + 1, 1, 0, bias=False + ), + nn.BatchNorm2d(num_inchannels[i]), + nn.Upsample(scale_factor=2**(j-i), mode='nearest') + ) + ) + elif j == i: + fuse_layer.append(None) + else: + conv3x3s = [] + for k in range(i-j): + if k == i - j - 1: + num_outchannels_conv3x3 = num_inchannels[i] + conv3x3s.append( + nn.Sequential( + nn.Conv2d( + num_inchannels[j], + num_outchannels_conv3x3, + 3, 2, 1, bias=False + ), + nn.BatchNorm2d(num_outchannels_conv3x3) + ) + ) + else: + num_outchannels_conv3x3 = num_inchannels[j] + conv3x3s.append( + nn.Sequential( + nn.Conv2d( + num_inchannels[j], + num_outchannels_conv3x3, + 3, 2, 1, bias=False + ), + nn.BatchNorm2d(num_outchannels_conv3x3), + nn.ReLU(True) + ) + ) + fuse_layer.append(nn.Sequential(*conv3x3s)) + 
fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def get_num_inchannels(self): + return self.num_inchannels + + def forward(self, x): + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = self.branches[i](x[i]) + + x_fuse = [] + + for i in range(len(self.fuse_layers)): + y = x[0] if i == 0 else self.fuse_layers[i][0](x[0]) + for j in range(1, self.num_branches): + if i == j: + y = y + x[j] + else: + y = y + self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + + return x_fuse + + +blocks_dict = { + 'BASIC': BasicBlock, + 'BOTTLENECK': Bottleneck +} + + +class PoseHighResolutionNet(nn.Module): + + def __init__(self, cfg): + self.inplanes = 64 + extra = cfg['MODEL']['EXTRA'] + super(PoseHighResolutionNet, self).__init__() + + self.cfg = extra + + # stem net + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, + bias=False) + self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.layer1 = self._make_layer(Bottleneck, 64, 4) + + self.stage2_cfg = extra['STAGE2'] + num_channels = self.stage2_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage2_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels)) + ] + self.transition1 = self._make_transition_layer([256], num_channels) + self.stage2, pre_stage_channels = self._make_stage( + self.stage2_cfg, num_channels) + + self.stage3_cfg = extra['STAGE3'] + num_channels = self.stage3_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage3_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels)) + ] + self.transition2 = self._make_transition_layer( + pre_stage_channels, num_channels) + self.stage3, pre_stage_channels = self._make_stage( + self.stage3_cfg, num_channels) + + self.stage4_cfg = extra['STAGE4'] + num_channels = self.stage4_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage4_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels)) + ] + self.transition3 = self._make_transition_layer( + pre_stage_channels, num_channels) + self.stage4, pre_stage_channels = self._make_stage( + self.stage4_cfg, num_channels, multi_scale_output=True) + + self.final_layer = nn.Conv2d( + in_channels=pre_stage_channels[0], + out_channels=cfg['MODEL']['NUM_JOINTS'], + kernel_size=extra['FINAL_CONV_KERNEL'], + stride=1, + padding=1 if extra['FINAL_CONV_KERNEL'] == 3 else 0 + ) + + self.pretrained_layers = extra['PRETRAINED_LAYERS'] + + if extra.DOWNSAMPLE and extra.USE_CONV: + self.downsample_stage_1 = self._make_downsample_layer(3, num_channel=self.stage2_cfg['NUM_CHANNELS'][0]) + self.downsample_stage_2 = self._make_downsample_layer(2, num_channel=self.stage2_cfg['NUM_CHANNELS'][-1]) + self.downsample_stage_3 = self._make_downsample_layer(1, num_channel=self.stage3_cfg['NUM_CHANNELS'][-1]) + elif not extra.DOWNSAMPLE and extra.USE_CONV: + self.upsample_stage_2 = self._make_upsample_layer(1, num_channel=self.stage2_cfg['NUM_CHANNELS'][-1]) + self.upsample_stage_3 = self._make_upsample_layer(2, num_channel=self.stage3_cfg['NUM_CHANNELS'][-1]) + self.upsample_stage_4 = self._make_upsample_layer(3, num_channel=self.stage4_cfg['NUM_CHANNELS'][-1]) + + def _make_transition_layer( + self, num_channels_pre_layer, num_channels_cur_layer): + 
num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + nn.Conv2d( + num_channels_pre_layer[i], + num_channels_cur_layer[i], + 3, 1, 1, bias=False + ), + nn.BatchNorm2d(num_channels_cur_layer[i]), + nn.ReLU(inplace=True) + ) + ) + else: + transition_layers.append(None) + else: + conv3x3s = [] + for j in range(i+1-num_branches_pre): + inchannels = num_channels_pre_layer[-1] + outchannels = num_channels_cur_layer[i] \ + if j == i-num_branches_pre else inchannels + conv3x3s.append( + nn.Sequential( + nn.Conv2d( + inchannels, outchannels, 3, 2, 1, bias=False + ), + nn.BatchNorm2d(outchannels), + nn.ReLU(inplace=True) + ) + ) + transition_layers.append(nn.Sequential(*conv3x3s)) + + return nn.ModuleList(transition_layers) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False + ), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _make_stage(self, layer_config, num_inchannels, + multi_scale_output=True): + num_modules = layer_config['NUM_MODULES'] + num_branches = layer_config['NUM_BRANCHES'] + num_blocks = layer_config['NUM_BLOCKS'] + num_channels = layer_config['NUM_CHANNELS'] + block = blocks_dict[layer_config['BLOCK']] + fuse_method = layer_config['FUSE_METHOD'] + + modules = [] + for i in range(num_modules): + # multi_scale_output is only used last module + if not multi_scale_output and i == num_modules - 1: + reset_multi_scale_output = False + else: + reset_multi_scale_output = True + + modules.append( + HighResolutionModule( + num_branches, + block, + num_blocks, + num_inchannels, + num_channels, + fuse_method, + reset_multi_scale_output + ) + ) + num_inchannels = modules[-1].get_num_inchannels() + + return nn.Sequential(*modules), num_inchannels + + def _make_upsample_layer(self, num_layers, num_channel, kernel_size=3): + layers = [] + for i in range(num_layers): + layers.append(nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)) + layers.append( + nn.Conv2d( + in_channels=num_channel, out_channels=num_channel, + kernel_size=kernel_size, stride=1, padding=1, bias=False, + ) + ) + layers.append(nn.BatchNorm2d(num_channel, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + + return nn.Sequential(*layers) + + def _make_downsample_layer(self, num_layers, num_channel, kernel_size=3): + layers = [] + for i in range(num_layers): + layers.append( + nn.Conv2d( + in_channels=num_channel, out_channels=num_channel, + kernel_size=kernel_size, stride=2, padding=1, bias=False, + ) + ) + layers.append(nn.BatchNorm2d(num_channel, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + x = self.layer1(x) + + x_list = [] + for i in range(self.stage2_cfg['NUM_BRANCHES']): + if self.transition1[i] is 
not None: + x_list.append(self.transition1[i](x)) + else: + x_list.append(x) + y_list = self.stage2(x_list) + + x_list = [] + for i in range(self.stage3_cfg['NUM_BRANCHES']): + if self.transition2[i] is not None: + x_list.append(self.transition2[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage3(x_list) + + x_list = [] + for i in range(self.stage4_cfg['NUM_BRANCHES']): + if self.transition3[i] is not None: + x_list.append(self.transition3[i](y_list[-1])) + else: + x_list.append(y_list[i]) + x = self.stage4(x_list) + + if self.cfg.DOWNSAMPLE: + if self.cfg.USE_CONV: + # Downsampling with strided convolutions + x1 = self.downsample_stage_1(x[0]) + x2 = self.downsample_stage_2(x[1]) + x3 = self.downsample_stage_3(x[2]) + x = torch.cat([x1, x2, x3, x[3]], 1) + else: + # Downsampling with interpolation + x0_h, x0_w = x[3].size(2), x[3].size(3) + x1 = F.interpolate(x[0], size=(x0_h, x0_w), mode='bilinear', align_corners=True) + x2 = F.interpolate(x[1], size=(x0_h, x0_w), mode='bilinear', align_corners=True) + x3 = F.interpolate(x[2], size=(x0_h, x0_w), mode='bilinear', align_corners=True) + x = torch.cat([x1, x2, x3, x[3]], 1) + else: + if self.cfg.USE_CONV: + # Upsampling with interpolations + convolutions + x1 = self.upsample_stage_2(x[1]) + x2 = self.upsample_stage_3(x[2]) + x3 = self.upsample_stage_4(x[3]) + x = torch.cat([x[0], x1, x2, x3], 1) + else: + # Upsampling with interpolation + x0_h, x0_w = x[0].size(2), x[0].size(3) + x1 = F.interpolate(x[1], size=(x0_h, x0_w), mode='bilinear', align_corners=True) + x2 = F.interpolate(x[2], size=(x0_h, x0_w), mode='bilinear', align_corners=True) + x3 = F.interpolate(x[3], size=(x0_h, x0_w), mode='bilinear', align_corners=True) + x = torch.cat([x[0], x1, x2, x3], 1) + + return x + + def init_weights(self, pretrained=''): + # logger.info('=> init weights from normal distribution') + for m in self.modules(): + if isinstance(m, nn.Conv2d): + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + nn.init.normal_(m.weight, std=0.001) + for name, _ in m.named_parameters(): + if name in ['bias']: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.ConvTranspose2d): + nn.init.normal_(m.weight, std=0.001) + for name, _ in m.named_parameters(): + if name in ['bias']: + nn.init.constant_(m.bias, 0) + + if os.path.isfile(pretrained): + pretrained_state_dict = torch.load(pretrained) + # logger.info('=> loading pretrained model {}'.format(pretrained)) + + need_init_state_dict = {} + for name, m in pretrained_state_dict.items(): + if name.split('.')[0] in self.pretrained_layers \ + or self.pretrained_layers[0] == '*': + need_init_state_dict[name] = m + self.load_state_dict(need_init_state_dict, strict=False) + elif pretrained: + # logger.warning('IMPORTANT WARNING!!
Please download pre-trained models if you are in TRAINING mode!') + # raise ValueError('{} is not exist!'.format(pretrained)) + pass + + +def get_pose_net(cfg, is_train): + model = PoseHighResolutionNet(cfg) + + if is_train and cfg['MODEL']['INIT_WEIGHTS']: + model.init_weights(cfg['MODEL']['PRETRAINED']) + + return model + + +def get_cfg_defaults(pretrained, width=32, downsample=False, use_conv=False): + # pose_multi_resoluton_net related params + HRNET = CN() + HRNET.PRETRAINED_LAYERS = [ + 'conv1', 'bn1', 'conv2', 'bn2', 'layer1', 'transition1', + 'stage2', 'transition2', 'stage3', 'transition3', 'stage4', + ] + HRNET.STEM_INPLANES = 64 + HRNET.FINAL_CONV_KERNEL = 1 + HRNET.STAGE2 = CN() + HRNET.STAGE2.NUM_MODULES = 1 + HRNET.STAGE2.NUM_BRANCHES = 2 + HRNET.STAGE2.NUM_BLOCKS = [4, 4] + HRNET.STAGE2.NUM_CHANNELS = [width, width*2] + HRNET.STAGE2.BLOCK = 'BASIC' + HRNET.STAGE2.FUSE_METHOD = 'SUM' + HRNET.STAGE3 = CN() + HRNET.STAGE3.NUM_MODULES = 4 + HRNET.STAGE3.NUM_BRANCHES = 3 + HRNET.STAGE3.NUM_BLOCKS = [4, 4, 4] + HRNET.STAGE3.NUM_CHANNELS = [width, width*2, width*4] + HRNET.STAGE3.BLOCK = 'BASIC' + HRNET.STAGE3.FUSE_METHOD = 'SUM' + HRNET.STAGE4 = CN() + HRNET.STAGE4.NUM_MODULES = 3 + HRNET.STAGE4.NUM_BRANCHES = 4 + HRNET.STAGE4.NUM_BLOCKS = [4, 4, 4, 4] + HRNET.STAGE4.NUM_CHANNELS = [width, width*2, width*4, width*8] + HRNET.STAGE4.BLOCK = 'BASIC' + HRNET.STAGE4.FUSE_METHOD = 'SUM' + HRNET.DOWNSAMPLE = downsample + HRNET.USE_CONV = use_conv + + cfg = CN() + cfg.MODEL = CN() + cfg.MODEL.INIT_WEIGHTS = True + cfg.MODEL.PRETRAINED = pretrained # 'data/pretrained_models/hrnet_w32-36af842e.pth' + cfg.MODEL.EXTRA = HRNET + cfg.MODEL.NUM_JOINTS = 24 + return cfg + + +def hrnet_w32( + pretrained=True, + pretrained_ckpt='data/pretrained_models/pose_coco/pose_hrnet_w32_256x192.pth', + downsample=False, + use_conv=False, +): + cfg = get_cfg_defaults(pretrained_ckpt, width=32, downsample=downsample, use_conv=use_conv) + return get_pose_net(cfg, is_train=True) + + +def hrnet_w48( + pretrained=True, + pretrained_ckpt='data/pretrained_models/pose_coco/pose_hrnet_w48_256x192.pth', + downsample=False, + use_conv=False, +): + cfg = get_cfg_defaults(pretrained_ckpt, width=48, downsample=downsample, use_conv=use_conv) + return get_pose_net(cfg, is_train=True) \ No newline at end of file diff --git a/myeasymocap/backbone/pare/backbone/mobilenet.py b/myeasymocap/backbone/pare/backbone/mobilenet.py new file mode 100644 index 0000000..53d9652 --- /dev/null +++ b/myeasymocap/backbone/pare/backbone/mobilenet.py @@ -0,0 +1,191 @@ +from torch import nn +try: + from torch.hub import load_state_dict_from_url +except ImportError: + from torchvision.models.utils import load_state_dict_from_url + + +__all__ = ['MobileNetV2', 'mobilenet_v2'] + + +model_urls = { + 'mobilenet_v2': 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth', +} + + +def _make_divisible(v, divisor, min_value=None): + """ + This function is taken from the original tf repo. + It ensures that all layers have a channel number that is divisible by 8 + It can be seen here: + https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py + :param v: + :param divisor: + :param min_value: + :return: + """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
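+    # e.g. for v = 32 * 0.35 = 11.2 the nearest multiple of 8 is 8, which is below 0.9 * v, so new_v is bumped up to 16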
+ if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class ConvBNReLU(nn.Sequential): + def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1, norm_layer=None): + padding = (kernel_size - 1) // 2 + if norm_layer is None: + norm_layer = nn.BatchNorm2d + super(ConvBNReLU, self).__init__( + nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False), + norm_layer(out_planes), + nn.ReLU6(inplace=True) + ) + + +class InvertedResidual(nn.Module): + def __init__(self, inp, oup, stride, expand_ratio, norm_layer=None): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2] + + if norm_layer is None: + norm_layer = nn.BatchNorm2d + + hidden_dim = int(round(inp * expand_ratio)) + self.use_res_connect = self.stride == 1 and inp == oup + + layers = [] + if expand_ratio != 1: + # pw + layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1, norm_layer=norm_layer)) + layers.extend([ + # dw + ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim, norm_layer=norm_layer), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + norm_layer(oup), + ]) + self.conv = nn.Sequential(*layers) + + def forward(self, x): + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) + + +class MobileNetV2(nn.Module): + def __init__(self, + num_classes=1000, + width_mult=1.0, + inverted_residual_setting=None, + round_nearest=8, + block=None, + norm_layer=None): + """ + MobileNet V2 main class + + Args: + num_classes (int): Number of classes + width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount + inverted_residual_setting: Network structure + round_nearest (int): Round the number of channels in each layer to be a multiple of this number + Set to 1 to turn off rounding + block: Module specifying inverted residual building block for mobilenet + norm_layer: Module specifying the normalization layer to use + + """ + super(MobileNetV2, self).__init__() + + if block is None: + block = InvertedResidual + + if norm_layer is None: + norm_layer = nn.BatchNorm2d + + input_channel = 32 + last_channel = 1280 + + if inverted_residual_setting is None: + inverted_residual_setting = [ + # t, c, n, s + [1, 16, 1, 1], + [6, 24, 2, 2], + [6, 32, 3, 2], + [6, 64, 4, 2], + [6, 96, 3, 1], + [6, 160, 3, 2], + [6, 320, 1, 1], + ] + + # only check the first element, assuming user knows t,c,n,s are required + if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4: + raise ValueError("inverted_residual_setting should be non-empty " + "or a 4-element list, got {}".format(inverted_residual_setting)) + + # building first layer + input_channel = _make_divisible(input_channel * width_mult, round_nearest) + self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) + features = [ConvBNReLU(3, input_channel, stride=2, norm_layer=norm_layer)] + # building inverted residual blocks + for t, c, n, s in inverted_residual_setting: + output_channel = _make_divisible(c * width_mult, round_nearest) + for i in range(n): + stride = s if i == 0 else 1 + features.append(block(input_channel, output_channel, stride, expand_ratio=t, norm_layer=norm_layer)) + input_channel = output_channel + # building last several layers + features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1, norm_layer=norm_layer)) + # make it nn.Sequential + self.features = nn.Sequential(*features) + + # building classifier + # self.classifier 
= nn.Sequential( + # nn.Dropout(0.2), + # nn.Linear(self.last_channel, num_classes), + # ) + + # weight initialization + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + nn.init.zeros_(m.bias) + + def _forward_impl(self, x): + # This exists since TorchScript doesn't support inheritance, so the superclass method + # (this one) needs to have a name other than `forward` that can be accessed in a subclass + x = self.features(x) + # Cannot use "squeeze" as batch-size can be 1 => must use reshape with x.shape[0] + # x = nn.functional.adaptive_avg_pool2d(x, 1).reshape(x.shape[0], -1) + # x = self.classifier(x) + return x + + def forward(self, x): + return self._forward_impl(x) + + +def mobilenet_v2(pretrained=False, progress=True, **kwargs): + """ + Constructs a MobileNetV2 architecture from + `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + model = MobileNetV2(**kwargs) + if pretrained: + state_dict = load_state_dict_from_url(model_urls['mobilenet_v2'], + progress=progress) + model.load_state_dict(state_dict, strict=False) + return model diff --git a/myeasymocap/backbone/pare/backbone/resnet.py b/myeasymocap/backbone/pare/backbone/resnet.py new file mode 100644 index 0000000..80e6b64 --- /dev/null +++ b/myeasymocap/backbone/pare/backbone/resnet.py @@ -0,0 +1,355 @@ +import torch +import torch.nn as nn +try: + from torch.hub import load_state_dict_from_url +except ImportError: + from torchvision.models.utils import load_state_dict_from_url + +__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', + 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', + 'wide_resnet50_2', 'wide_resnet101_2'] + + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', + 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth', + 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth', + 'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth', + 'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth', +} + + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=dilation, groups=groups, bias=False, dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, + base_width=64, dilation=1, norm_layer=None): + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or 
base_width != 64: + raise ValueError('BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) + # while original implementation places the stride at the first 1x1 convolution(self.conv1) + # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. + # This variant is also known as ResNet V1.5 and improves accuracy according to + # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. + + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, + base_width=64, dilation=1, norm_layer=None): + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.)) * groups + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + + def __init__(self, block, layers, num_classes=1000, zero_init_residual=False, + groups=1, width_per_group=64, replace_stride_with_dilation=None, + norm_layer=None): + super(ResNet, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, + 
dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2, + dilate=replace_stride_with_dilation[2]) + # self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + # self.fc = nn.Linear(512 * block.expansion, num_classes) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. + # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + if zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + nn.init.constant_(m.bn3.weight, 0) + elif isinstance(m, BasicBlock): + nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, self.groups, + self.base_width, previous_dilation, norm_layer)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, + base_width=self.base_width, dilation=self.dilation, + norm_layer=norm_layer)) + + return nn.Sequential(*layers) + + def _forward_impl(self, x): + # See note [TorchScript super()] + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + # x = self.avgpool(x) + # x = torch.flatten(x, 1) + # x = self.fc(x) + + return x + + def forward(self, x): + return self._forward_impl(x) + + +def _resnet(arch, block, layers, pretrained, progress, **kwargs): + model = ResNet(block, layers, **kwargs) + if pretrained: + state_dict = load_state_dict_from_url(model_urls[arch], + progress=progress) + model.load_state_dict(state_dict, strict=False) + return model + + +def resnet18(pretrained=False, progress=True, **kwargs): + r"""ResNet-18 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, + **kwargs) + + +def resnet34(pretrained=False, progress=True, **kwargs): + r"""ResNet-34 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, + **kwargs) + + +def resnet50(pretrained=False, progress=True, **kwargs): + r"""ResNet-50 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, 
displays a progress bar of the download to stderr + """ + return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress, + **kwargs) + + +def resnet101(pretrained=False, progress=True, **kwargs): + r"""ResNet-101 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress, + **kwargs) + + +def resnet152(pretrained=False, progress=True, **kwargs): + r"""ResNet-152 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress, + **kwargs) + + +def resnext50_32x4d(pretrained=False, progress=True, **kwargs): + r"""ResNeXt-50 32x4d model from + `"Aggregated Residual Transformation for Deep Neural Networks" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['groups'] = 32 + kwargs['width_per_group'] = 4 + return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], + pretrained, progress, **kwargs) + + +def resnext101_32x8d(pretrained=False, progress=True, **kwargs): + r"""ResNeXt-101 32x8d model from + `"Aggregated Residual Transformation for Deep Neural Networks" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['groups'] = 32 + kwargs['width_per_group'] = 8 + return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], + pretrained, progress, **kwargs) + + +def wide_resnet50_2(pretrained=False, progress=True, **kwargs): + r"""Wide ResNet-50-2 model from + `"Wide Residual Networks" `_ + + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['width_per_group'] = 64 * 2 + return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], + pretrained, progress, **kwargs) + + +def wide_resnet101_2(pretrained=False, progress=True, **kwargs): + r"""Wide ResNet-101-2 model from + `"Wide Residual Networks" `_ + + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['width_per_group'] = 64 * 2 + return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], + pretrained, progress, **kwargs) diff --git a/myeasymocap/backbone/pare/backbone/utils.py b/myeasymocap/backbone/pare/backbone/utils.py new file mode 100644 index 0000000..e0dbc9b --- /dev/null +++ b/myeasymocap/backbone/pare/backbone/utils.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. +# +# Contact: ps-license@tuebingen.mpg.de + +def get_backbone_info(backbone): + info = { + 'resnet18': {'n_output_channels': 512, 'downsample_rate': 4}, + 'resnet34': {'n_output_channels': 512, 'downsample_rate': 4}, + 'resnet50': {'n_output_channels': 2048, 'downsample_rate': 4}, + 'resnet50_adf_dropout': {'n_output_channels': 2048, 'downsample_rate': 4}, + 'resnet50_dropout': {'n_output_channels': 2048, 'downsample_rate': 4}, + 'resnet101': {'n_output_channels': 2048, 'downsample_rate': 4}, + 'resnet152': {'n_output_channels': 2048, 'downsample_rate': 4}, + 'resnext50_32x4d': {'n_output_channels': 2048, 'downsample_rate': 4}, + 'resnext101_32x8d': {'n_output_channels': 2048, 'downsample_rate': 4}, + 'wide_resnet50_2': {'n_output_channels': 2048, 'downsample_rate': 4}, + 'wide_resnet101_2': {'n_output_channels': 2048, 'downsample_rate': 4}, + 'mobilenet_v2': {'n_output_channels': 1280, 'downsample_rate': 4}, + 'hrnet_w32': {'n_output_channels': 480, 'downsample_rate': 4}, + 'hrnet_w48': {'n_output_channels': 720, 'downsample_rate': 4}, + # 'hrnet_w64': {'n_output_channels': 2048, 'downsample_rate': 4}, + 'dla34': {'n_output_channels': 512, 'downsample_rate': 4}, + } + return info[backbone] \ No newline at end of file diff --git a/myeasymocap/backbone/pare/config.py b/myeasymocap/backbone/pare/config.py new file mode 100644 index 0000000..3720e05 --- /dev/null +++ b/myeasymocap/backbone/pare/config.py @@ -0,0 +1,239 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. 
+# +# Contact: ps-license@tuebingen.mpg.de + +import os +import time +import yaml +import shutil +import argparse +import operator +import itertools +from os.path import join +from functools import reduce +from yacs.config import CfgNode as CN +from typing import Dict, List, Union, Any + +# from ..utils.cluster import execute_task_on_cluster + +##### CONSTANTS ##### +DATASET_NPZ_PATH = 'data/dataset_extras' +DATASET_LMDB_PATH = 'data/lmdb' + +MMPOSE_PATH = '/is/cluster/work/mkocabas/projects/mmpose' +MMDET_PATH = '/is/cluster/work/mkocabas/projects/mmdetection' +MMPOSE_CFG = os.path.join(MMPOSE_PATH, 'configs/top_down/hrnet/coco-wholebody/hrnet_w48_coco_wholebody_256x192.py') +MMPOSE_CKPT = os.path.join(MMPOSE_PATH, 'checkpoints/hrnet_w48_coco_wholebody_256x192-643e18cb_20200922.pth') +MMDET_CFG = os.path.join(MMDET_PATH, 'configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py') +MMDET_CKPT = os.path.join(MMDET_PATH, 'checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth') + +PW3D_ROOT = 'data/dataset_folders/3dpw' +OH3D_ROOT = 'data/dataset_folders/3doh' + +JOINT_REGRESSOR_TRAIN_EXTRA = 'models/pare/data/J_regressor_extra.npy' +JOINT_REGRESSOR_H36M = 'models/pare/data/J_regressor_h36m.npy' +SMPL_MEAN_PARAMS = 'models/pare/data/smpl_mean_params.npz' +SMPL_MODEL_DIR = 'models/pare/data/body_models/smpl' + +COCO_OCCLUDERS_FILE = 'data/occlusion_augmentation/coco_train2014_occluders.pkl' +PASCAL_OCCLUDERS_FILE = 'data/occlusion_augmentation/pascal_occluders.pkl' + +DATASET_FOLDERS = { + '3dpw': PW3D_ROOT, + '3dpw-val': PW3D_ROOT, + '3dpw-val-cam': PW3D_ROOT, + '3dpw-test-cam': PW3D_ROOT, + '3dpw-train-cam': PW3D_ROOT, + '3dpw-cam': PW3D_ROOT, + '3dpw-all': PW3D_ROOT, + '3doh': OH3D_ROOT, +} + +DATASET_FILES = [ + # Training + { + '3dpw-all': join(DATASET_NPZ_PATH, '3dpw_all_test_with_mmpose.npz'), + '3doh': join(DATASET_NPZ_PATH, '3doh_test.npz'), + }, + # Testing + { + '3doh': join(DATASET_NPZ_PATH, '3doh_train.npz'), + '3dpw': join(DATASET_NPZ_PATH, '3dpw_train.npz'), + } +] + +EVAL_MESH_DATASETS = ['3dpw', '3dpw-val', '3dpw-all', '3doh'] + +##### CONFIGS ##### +hparams = CN() + +# General settings +hparams.LOG_DIR = 'logs/experiments' +hparams.METHOD = 'pare' +hparams.EXP_NAME = 'default' +hparams.RUN_TEST = False +hparams.PROJECT_NAME = 'pare' +hparams.SEED_VALUE = -1 + +hparams.SYSTEM = CN() +hparams.SYSTEM.GPU = '' +hparams.SYSTEM.CLUSTER_NODE = 0.0 + +# Dataset hparams +hparams.DATASET = CN() +hparams.DATASET.LOAD_TYPE = 'Base' +hparams.DATASET.NOISE_FACTOR = 0.4 +hparams.DATASET.ROT_FACTOR = 30 +hparams.DATASET.SCALE_FACTOR = 0.25 +hparams.DATASET.FLIP_PROB = 0.5 +hparams.DATASET.CROP_PROB = 0.0 +hparams.DATASET.CROP_FACTOR = 0.0 +hparams.DATASET.BATCH_SIZE = 64 +hparams.DATASET.NUM_WORKERS = 8 +hparams.DATASET.PIN_MEMORY = True +hparams.DATASET.SHUFFLE_TRAIN = True +hparams.DATASET.TRAIN_DS = 'all' +hparams.DATASET.VAL_DS = '3dpw_3doh' +hparams.DATASET.NUM_IMAGES = -1 +hparams.DATASET.TRAIN_NUM_IMAGES = -1 +hparams.DATASET.TEST_NUM_IMAGES = -1 +hparams.DATASET.IMG_RES = 224 +hparams.DATASET.USE_HEATMAPS = '' # 'hm', 'hm_soft', 'part_segm', 'attention' +hparams.DATASET.RENDER_RES = 480 +hparams.DATASET.MESH_COLOR = 'pinkish' +hparams.DATASET.FOCAL_LENGTH = 5000. 
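+# NOTE: 5000 px is the fixed focal length assumed for the IMG_RES crop when converting the predicted weak-perspective camera to a perspective one (the usual SPIN/HMR convention)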
+hparams.DATASET.IGNORE_3D = False +hparams.DATASET.USE_SYNTHETIC_OCCLUSION = False +hparams.DATASET.OCC_AUG_DATASET = 'pascal' +hparams.DATASET.USE_3D_CONF = False +hparams.DATASET.USE_GENDER = False +# this is a bit confusing but for the in the wild dataset ratios should be same, otherwise the code +# will be a bit verbose +hparams.DATASET.DATASETS_AND_RATIOS = 'h36m_mpii_lspet_coco_mpi-inf-3dhp_0.3_0.6_0.6_0.6_0.1' +hparams.DATASET.STAGE_DATASETS = '0+h36m_coco_0.2_0.8 2+h36m_coco_0.4_0.6' +# enable non parametric representation +hparams.DATASET.NONPARAMETRIC = False + +# optimizer config +hparams.OPTIMIZER = CN() +hparams.OPTIMIZER.TYPE = 'adam' +hparams.OPTIMIZER.LR = 0.0001 # 0.00003 +hparams.OPTIMIZER.WD = 0.0 + +# Training process hparams +hparams.TRAINING = CN() +hparams.TRAINING.RESUME = None +hparams.TRAINING.PRETRAINED = None +hparams.TRAINING.PRETRAINED_LIT = None +hparams.TRAINING.MAX_EPOCHS = 100 +hparams.TRAINING.LOG_SAVE_INTERVAL = 50 +hparams.TRAINING.LOG_FREQ_TB_IMAGES = 500 +hparams.TRAINING.CHECK_VAL_EVERY_N_EPOCH = 1 +hparams.TRAINING.RELOAD_DATALOADERS_EVERY_EPOCH = True +hparams.TRAINING.NUM_SMPLIFY_ITERS = 100 # 50 +hparams.TRAINING.RUN_SMPLIFY = False +hparams.TRAINING.SMPLIFY_THRESHOLD = 100 +hparams.TRAINING.DROPOUT_P = 0.2 +hparams.TRAINING.TEST_BEFORE_TRAINING = False +hparams.TRAINING.SAVE_IMAGES = False +hparams.TRAINING.USE_PART_SEGM_LOSS = False +hparams.TRAINING.USE_AMP = False + +# Training process hparams +hparams.TESTING = CN() +hparams.TESTING.SAVE_IMAGES = False +hparams.TESTING.SAVE_FREQ = 1 +hparams.TESTING.SAVE_RESULTS = True +hparams.TESTING.SAVE_MESHES = False +hparams.TESTING.SIDEVIEW = True +hparams.TESTING.TEST_ON_TRAIN_END = True +hparams.TESTING.MULTI_SIDEVIEW = False +hparams.TESTING.USE_GT_CAM = False + +# PARE method hparams +hparams.PARE = CN() +hparams.PARE.BACKBONE = 'resnet50' # hrnet_w32-conv, hrnet_w32-interp +hparams.PARE.NUM_JOINTS = 24 +hparams.PARE.SOFTMAX_TEMP = 1. +hparams.PARE.NUM_FEATURES_SMPL = 64 +hparams.PARE.USE_ATTENTION = False +hparams.PARE.USE_SELF_ATTENTION = False +hparams.PARE.USE_KEYPOINT_ATTENTION = False +hparams.PARE.USE_KEYPOINT_FEATURES_FOR_SMPL_REGRESSION = False +hparams.PARE.USE_POSTCONV_KEYPOINT_ATTENTION = False +hparams.PARE.KEYPOINT_ATTENTION_ACT = 'softmax' +hparams.PARE.USE_SCALE_KEYPOINT_ATTENTION = False +hparams.PARE.USE_FINAL_NONLOCAL = None +hparams.PARE.USE_BRANCH_NONLOCAL = None +hparams.PARE.USE_HMR_REGRESSION = False +hparams.PARE.USE_COATTENTION = False +hparams.PARE.NUM_COATTENTION_ITER = 1 +hparams.PARE.COATTENTION_CONV = 'simple' # 'double_1', 'double_3', 'single_1', 'single_3', 'simple' +hparams.PARE.USE_UPSAMPLING = False +hparams.PARE.DECONV_CONV_KERNEL_SIZE = 4 +hparams.PARE.USE_SOFT_ATTENTION = False +hparams.PARE.NUM_BRANCH_ITERATION = 0 +hparams.PARE.BRANCH_DEEPER = False +hparams.PARE.NUM_DECONV_LAYERS = 3 +hparams.PARE.NUM_DECONV_FILTERS = 256 +hparams.PARE.USE_RESNET_CONV_HRNET = False +hparams.PARE.USE_POS_ENC = False + +hparams.PARE.ITERATIVE_REGRESSION = False +hparams.PARE.ITER_RESIDUAL = False +hparams.PARE.NUM_ITERATIONS = 3 +hparams.PARE.SHAPE_INPUT_TYPE = 'feats.all_pose.shape.cam' +hparams.PARE.POSE_INPUT_TYPE = 'feats.neighbor_pose_feats.all_pose.self_pose.neighbor_pose.shape.cam' + +hparams.PARE.POSE_MLP_NUM_LAYERS = 1 +hparams.PARE.SHAPE_MLP_NUM_LAYERS = 1 +hparams.PARE.POSE_MLP_HIDDEN_SIZE = 256 +hparams.PARE.SHAPE_MLP_HIDDEN_SIZE = 256 + +hparams.PARE.SHAPE_LOSS_WEIGHT = 0 +hparams.PARE.KEYPOINT_LOSS_WEIGHT = 5. +hparams.PARE.KEYPOINT_NATIVE_LOSS_WEIGHT = 5. 
+hparams.PARE.HEATMAPS_LOSS_WEIGHT = 5. +hparams.PARE.SMPL_PART_LOSS_WEIGHT = 1. +hparams.PARE.PART_SEGM_LOSS_WEIGHT = 1. +hparams.PARE.POSE_LOSS_WEIGHT = 1. +hparams.PARE.BETA_LOSS_WEIGHT = 0.001 +hparams.PARE.OPENPOSE_TRAIN_WEIGHT = 0. +hparams.PARE.GT_TRAIN_WEIGHT = 1. +hparams.PARE.LOSS_WEIGHT = 60. +hparams.PARE.USE_SHAPE_REG = False +hparams.PARE.USE_MEAN_CAMSHAPE = False +hparams.PARE.USE_MEAN_POSE = False +hparams.PARE.INIT_XAVIER = False + + +def get_hparams_defaults(): + """Get a yacs hparamsNode object with default values for my_project.""" + # Return a clone so that the defaults will not be altered + # This is for the "local variable" use pattern + return hparams.clone() + + +def update_hparams(hparams_file): + hparams = get_hparams_defaults() + hparams.merge_from_file(hparams_file) + return hparams.clone() + + +def update_hparams_from_dict(cfg_dict): + hparams = get_hparams_defaults() + cfg = hparams.load_cfg(str(cfg_dict)) + hparams.merge_from_other_cfg(cfg) + return hparams.clone() \ No newline at end of file diff --git a/myeasymocap/backbone/pare/constants.py b/myeasymocap/backbone/pare/constants.py new file mode 100644 index 0000000..fcd77db --- /dev/null +++ b/myeasymocap/backbone/pare/constants.py @@ -0,0 +1,195 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. +# +# Contact: ps-license@tuebingen.mpg.de + +import numpy as np + +# Mean and standard deviation for normalizing input image +IMG_NORM_MEAN = [0.485, 0.456, 0.406] +IMG_NORM_STD = [0.229, 0.224, 0.225] + +""" +We create a superset of joints containing the OpenPose joints together with the ones that each dataset provides. +We keep a superset of 24 joints such that we include all joints from every dataset. +If a dataset doesn't provide annotations for a specific joint, we simply ignore it. 
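+In total JOINT_NAMES contains 49 entries: the 25 OpenPose (BODY_25) keypoints listed first, followed by the 24 ground-truth joints.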
+The joints used here are the following: +""" +JOINT_NAMES = [ +# 25 OpenPose joints (in the order provided by OpenPose) +'OP Nose', +'OP Neck', +'OP RShoulder', +'OP RElbow', +'OP RWrist', +'OP LShoulder', +'OP LElbow', +'OP LWrist', +'OP MidHip', +'OP RHip', +'OP RKnee', +'OP RAnkle', +'OP LHip', +'OP LKnee', +'OP LAnkle', +'OP REye', +'OP LEye', +'OP REar', +'OP LEar', +'OP LBigToe', +'OP LSmallToe', +'OP LHeel', +'OP RBigToe', +'OP RSmallToe', +'OP RHeel', +# 24 Ground Truth joints (superset of joints from different datasets) +'Right Ankle', +'Right Knee', +'Right Hip', +'Left Hip', +'Left Knee', +'Left Ankle', +'Right Wrist', +'Right Elbow', +'Right Shoulder', +'Left Shoulder', +'Left Elbow', +'Left Wrist', +'Neck (LSP)', +'Top of Head (LSP)', +'Pelvis (MPII)', +'Thorax (MPII)', +'Spine (H36M)', +'Jaw (H36M)', +'Head (H36M)', +'Nose', +'Left Eye', +'Right Eye', +'Left Ear', +'Right Ear' +] + +# Dict containing the joints in numerical order +JOINT_IDS = {JOINT_NAMES[i]: i for i in range(len(JOINT_NAMES))} + +# Map joints to SMPL joints +JOINT_MAP = { +'OP Nose': 24, 'OP Neck': 12, 'OP RShoulder': 17, +'OP RElbow': 19, 'OP RWrist': 21, 'OP LShoulder': 16, +'OP LElbow': 18, 'OP LWrist': 20, 'OP MidHip': 0, +'OP RHip': 2, 'OP RKnee': 5, 'OP RAnkle': 8, +'OP LHip': 1, 'OP LKnee': 4, 'OP LAnkle': 7, +'OP REye': 25, 'OP LEye': 26, 'OP REar': 27, +'OP LEar': 28, 'OP LBigToe': 29, 'OP LSmallToe': 30, +'OP LHeel': 31, 'OP RBigToe': 32, 'OP RSmallToe': 33, 'OP RHeel': 34, +'Right Ankle': 8, 'Right Knee': 5, 'Right Hip': 45, +'Left Hip': 46, 'Left Knee': 4, 'Left Ankle': 7, +'Right Wrist': 21, 'Right Elbow': 19, 'Right Shoulder': 17, +'Left Shoulder': 16, 'Left Elbow': 18, 'Left Wrist': 20, +'Neck (LSP)': 47, 'Top of Head (LSP)': 48, +'Pelvis (MPII)': 49, 'Thorax (MPII)': 50, +'Spine (H36M)': 51, 'Jaw (H36M)': 52, +'Head (H36M)': 53, 'Nose': 24, 'Left Eye': 26, +'Right Eye': 25, 'Left Ear': 28, 'Right Ear': 27 +} + +# Joint selectors +# Indices to get the 14 LSP joints from the 17 H36M joints +H36M_TO_J17 = [6, 5, 4, 1, 2, 3, 16, 15, 14, 11, 12, 13, 8, 10, 0, 7, 9] +H36M_TO_J14 = H36M_TO_J17[:14] +# Indices to get the 14 LSP joints from the ground truth joints +J24_TO_J17 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18, 14, 16, 17] +J24_TO_J14 = J24_TO_J17[:14] + +# Permutation of SMPL pose parameters when flipping the shape +SMPL_JOINTS_FLIP_PERM = [0, 2, 1, 3, 5, 4, 6, 8, 7, 9, 11, 10, 12, 14, 13, 15, 17, 16, 19, 18, 21, 20, 23, 22] +SMPL_POSE_FLIP_PERM = [] +for i in SMPL_JOINTS_FLIP_PERM: + SMPL_POSE_FLIP_PERM.append(3*i) + SMPL_POSE_FLIP_PERM.append(3*i+1) + SMPL_POSE_FLIP_PERM.append(3*i+2) +# Permutation indices for the 24 ground truth joints +J24_FLIP_PERM = [5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13, 14, 15, 16, 17, 18, 19, 21, 20, 23, 22] +# Permutation indices for the full set of 49 joints +J49_FLIP_PERM = [0, 1, 5, 6, 7, 2, 3, 4, 8, 12, 13, 14, 9, 10, 11, 16, 15, 18, 17, 22, 23, 24, 19, 20, 21]\ + + [25+i for i in J24_FLIP_PERM] + +SMPLH_TO_SMPL = np.arange(0, 156).reshape((-1, 3))[ + np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 37]) +].reshape(-1) + +pw3d_occluded_sequences = [ + 'courtyard_backpack', + 'courtyard_basketball', + 'courtyard_bodyScannerMotions', + 'courtyard_box', + 'courtyard_golf', + 'courtyard_jacket', + 'courtyard_laceShoe', + 'downtown_stairs', + 'flat_guitar', + 'flat_packBags', + 'outdoors_climbing', + 'outdoors_crosscountry', + 'outdoors_fencing', + 'outdoors_freestyle', + 'outdoors_golf', + 'outdoors_parcours', 
+ 'outdoors_slalom', +] + +pw3d_test_sequences = [ + 'flat_packBags_00', + 'downtown_weeklyMarket_00', + 'outdoors_fencing_01', + 'downtown_walkBridge_01', + 'downtown_enterShop_00', + 'downtown_rampAndStairs_00', + 'downtown_bar_00', + 'downtown_runForBus_01', + 'downtown_cafe_00', + 'flat_guitar_01', + 'downtown_runForBus_00', + 'downtown_sitOnStairs_00', + 'downtown_bus_00', + 'downtown_arguing_00', + 'downtown_crossStreets_00', + 'downtown_walkUphill_00', + 'downtown_walking_00', + 'downtown_car_00', + 'downtown_warmWelcome_00', + 'downtown_upstairs_00', + 'downtown_stairs_00', + 'downtown_windowShopping_00', + 'office_phoneCall_00', + 'downtown_downstairs_00' +] + +pw3d_cam_sequences = [ + # TEST + 'downtown_downstairs_00', + 'downtown_stairs_00', + 'downtown_rampAndStairs_00', + 'flat_packBags_00', + 'flat_guitar_01', + 'downtown_warmWelcome_00', + 'downtown_walkUphill_00', + # VALIDATION + 'outdoors_parcours_01', + 'outdoors_crosscountry_00', + 'outdoors_freestyle_01', + 'downtown_walkDownhill_00', + 'outdoors_parcours_00', +] + diff --git a/myeasymocap/backbone/pare/head/__init__.py b/myeasymocap/backbone/pare/head/__init__.py new file mode 100644 index 0000000..35dd4ca --- /dev/null +++ b/myeasymocap/backbone/pare/head/__init__.py @@ -0,0 +1,4 @@ +from .pare_head import PareHead +from .hmr_head import HMRHead +# from .smpl_head import SMPLHead +# from .smpl_cam_head import SMPLCamHead \ No newline at end of file diff --git a/myeasymocap/backbone/pare/head/hmr_head.py b/myeasymocap/backbone/pare/head/hmr_head.py new file mode 100644 index 0000000..c80b6c2 --- /dev/null +++ b/myeasymocap/backbone/pare/head/hmr_head.py @@ -0,0 +1,203 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. 
+# +# Contact: ps-license@tuebingen.mpg.de + +import math +import torch +import numpy as np +import torch.nn as nn + +from ..config import SMPL_MEAN_PARAMS +from ..utils.geometry import rot6d_to_rotmat, rotmat_to_rot6d + +BN_MOMENTUM = 0.1 + + +class HMRHead(nn.Module): + def __init__( + self, + num_input_features, + smpl_mean_params=SMPL_MEAN_PARAMS, + estimate_var=False, + use_separate_var_branch=False, + uncertainty_activation='', + backbone='resnet50', + use_cam_feats=False, + ): + super(HMRHead, self).__init__() + + npose = 24 * 6 + self.npose = npose + self.estimate_var = estimate_var + self.use_separate_var_branch = use_separate_var_branch + self.uncertainty_activation = uncertainty_activation + self.backbone = backbone + self.num_input_features = num_input_features + self.use_cam_feats = use_cam_feats + + if use_cam_feats: + num_input_features += 7 # 6d rotmat + vfov + + self.avgpool = nn.AdaptiveAvgPool2d(1) # nn.AvgPool2d(7, stride=1) + self.fc1 = nn.Linear(num_input_features + npose + 13, 1024) + self.drop1 = nn.Dropout() + self.fc2 = nn.Linear(1024, 1024) + self.drop2 = nn.Dropout() + + if self.estimate_var: + # estimate variance for pose and shape parameters + if self.use_separate_var_branch: + # Decouple var estimation layer using separate linear layers + self.decpose = nn.Linear(1024, npose) + self.decshape = nn.Linear(1024, 10) + self.deccam = nn.Linear(1024, 3) + self.decpose_var = nn.Linear(1024, npose) + self.decshape_var = nn.Linear(1024, 10) + nn.init.xavier_uniform_(self.decpose_var.weight, gain=0.01) + nn.init.xavier_uniform_(self.decshape_var.weight, gain=0.01) + else: + # double the output sizes to estimate var + self.decpose = nn.Linear(1024, npose * 2) + self.decshape = nn.Linear(1024, 10 * 2) + self.deccam = nn.Linear(1024, 3) + else: + self.decpose = nn.Linear(1024, npose) + self.decshape = nn.Linear(1024, 10) + self.deccam = nn.Linear(1024, 3) + + nn.init.xavier_uniform_(self.decpose.weight, gain=0.01) + nn.init.xavier_uniform_(self.decshape.weight, gain=0.01) + nn.init.xavier_uniform_(self.deccam.weight, gain=0.01) + + if self.backbone.startswith('hrnet'): + self.downsample_module = self._make_head() + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + mean_params = np.load(smpl_mean_params) + init_pose = torch.from_numpy(mean_params['pose'][:]).unsqueeze(0) + init_shape = torch.from_numpy(mean_params['shape'][:].astype('float32')).unsqueeze(0) + init_cam = torch.from_numpy(mean_params['cam']).unsqueeze(0) + self.register_buffer('init_pose', init_pose) + self.register_buffer('init_shape', init_shape) + self.register_buffer('init_cam', init_cam) + + def _make_head(self): + # downsampling modules + downsamp_modules = [] + for i in range(3): + in_channels = self.num_input_features + out_channels = self.num_input_features + + downsamp_module = nn.Sequential( + nn.Conv2d(in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=2, + padding=1), + nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM), + nn.ReLU(inplace=True) + ) + + downsamp_modules.append(downsamp_module) + + downsamp_modules = nn.Sequential(*downsamp_modules) + + return downsamp_modules + + def forward( + self, + features, + init_pose=None, + init_shape=None, + init_cam=None, + cam_rotmat=None, + cam_vfov=None, + n_iter=3 + ): + # if self.backbone.startswith('hrnet'): + # features = self.downsample_module(features) + + batch_size = features.shape[0] + + if init_pose is None: + init_pose = self.init_pose.expand(batch_size, -1) + if init_shape is None: + init_shape = self.init_shape.expand(batch_size, -1) + if init_cam is None: + init_cam = self.init_cam.expand(batch_size, -1) + + xf = self.avgpool(features) + xf = xf.view(xf.size(0), -1) + + pred_pose = init_pose + pred_shape = init_shape + pred_cam = init_cam + for i in range(n_iter): + if self.use_cam_feats: + xc = torch.cat([xf, pred_pose, pred_shape, pred_cam, + rotmat_to_rot6d(cam_rotmat), cam_vfov.unsqueeze(-1)], 1) + else: + xc = torch.cat([xf, pred_pose, pred_shape, pred_cam], 1) + xc = self.fc1(xc) + xc = self.drop1(xc) + xc = self.fc2(xc) + xc = self.drop2(xc) + if self.estimate_var: + pred_pose = self.decpose(xc)[:,:self.npose] + pred_pose + pred_shape = self.decshape(xc)[:,:10] + pred_shape + pred_cam = self.deccam(xc) + pred_cam + + if self.use_separate_var_branch: + pred_pose_var = self.decpose_var(xc) + pred_shape_var = self.decshape_var(xc) + else: + pred_pose_var = self.decpose(xc)[:,self.npose:] + pred_shape_var = self.decshape(xc)[:,10:] + + if self.uncertainty_activation != '': + # Use an activation layer to output uncertainty + pred_pose_var = eval(f'F.{self.uncertainty_activation}')(pred_pose_var) + pred_shape_var = eval(f'F.{self.uncertainty_activation}')(pred_shape_var) + else: + pred_pose = self.decpose(xc) + pred_pose + pred_shape = self.decshape(xc) + pred_shape + pred_cam = self.deccam(xc) + pred_cam + + pred_rotmat = rot6d_to_rotmat(pred_pose).view(batch_size, 24, 3, 3) + + output = { + 'pred_pose': pred_rotmat, + 'pred_cam': pred_cam, + 'pred_shape': pred_shape, + 'pred_pose_6d': pred_pose, + } + + if self.estimate_var: + output.update({ + 'pred_pose_var': torch.cat([pred_pose, pred_pose_var], dim=1), + 'pred_shape_var': torch.cat([pred_shape, pred_shape_var], dim=1), + }) + + return output + +def keep_variance(x, min_variance): + return x + min_variance \ No newline at end of file diff --git a/myeasymocap/backbone/pare/head/pare_head.py b/myeasymocap/backbone/pare/head/pare_head.py new file mode 100644 index 0000000..941d42b --- /dev/null +++ b/myeasymocap/backbone/pare/head/pare_head.py @@ -0,0 +1,926 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der 
Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. +# +# Contact: ps-license@tuebingen.mpg.de + +import torch +import numpy as np +import torch.nn as nn +import torch.nn.functional as F + +from ..config import SMPL_MEAN_PARAMS +from ..layers.coattention import CoAttention +from ..utils.geometry import rot6d_to_rotmat, get_coord_maps +from ..utils.kp_utils import get_smpl_neighbor_triplets +from ..layers.softargmax import softargmax2d, get_heatmap_preds +from ..layers import LocallyConnected2d, KeypointAttention, interpolate +from ..layers.non_local import dot_product +from ..backbone.resnet import conv3x3, conv1x1, BasicBlock + +class logger: + @staticmethod + def info(*args, **kwargs): + pass +BN_MOMENTUM = 0.1 + + +class PareHead(nn.Module): + def __init__( + self, + num_joints, + num_input_features, + softmax_temp=1.0, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + num_camera_params=3, + num_features_smpl=64, + final_conv_kernel=1, + iterative_regression=False, + iter_residual=False, + num_iterations=3, + shape_input_type='feats', # 'feats.pose.shape.cam' + pose_input_type='feats', # 'feats.neighbor_pose_feats.all_pose.self_pose.neighbor_pose.shape.cam' + pose_mlp_num_layers=1, + shape_mlp_num_layers=1, + pose_mlp_hidden_size=256, + shape_mlp_hidden_size=256, + use_keypoint_features_for_smpl_regression=False, + use_heatmaps='', + use_keypoint_attention=False, + use_postconv_keypoint_attention=False, + keypoint_attention_act='softmax', + use_scale_keypoint_attention=False, + use_branch_nonlocal=None, # 'concatenation', 'dot_product', 'embedded_gaussian', 'gaussian' + use_final_nonlocal=None, # 'concatenation', 'dot_product', 'embedded_gaussian', 'gaussian' + backbone='resnet', + use_hmr_regression=False, + use_coattention=False, + num_coattention_iter=1, + coattention_conv='simple', # 'double_1', 'double_3', 'single_1', 'single_3', 'simple' + use_upsampling=False, + use_soft_attention=False, # Stefan & Otmar 3DV style attention + num_branch_iteration=0, + branch_deeper=False, + use_resnet_conv_hrnet=False, + use_position_encodings=None, + use_mean_camshape=False, + use_mean_pose=False, + init_xavier=False, + ): + super(PareHead, self).__init__() + self.backbone = backbone + self.num_joints = num_joints + self.deconv_with_bias = False + self.use_heatmaps = use_heatmaps + self.num_iterations = num_iterations + self.use_final_nonlocal = use_final_nonlocal + self.use_branch_nonlocal = use_branch_nonlocal + self.use_hmr_regression = use_hmr_regression + self.use_coattention = use_coattention + self.num_coattention_iter = num_coattention_iter + self.coattention_conv = coattention_conv + self.use_soft_attention = use_soft_attention + self.num_branch_iteration = num_branch_iteration + self.iter_residual = iter_residual + self.iterative_regression = iterative_regression + self.pose_mlp_num_layers = pose_mlp_num_layers + self.shape_mlp_num_layers = shape_mlp_num_layers + self.pose_mlp_hidden_size = pose_mlp_hidden_size + 
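+        # NOTE: the module-level `logger` stub defined at the top of this file only
+        # implements `info`; the `logger.warning(...)` and `logger.debug(...)` calls
+        # used later in this class would raise AttributeError unless those methods are
+        # stubbed out as well (or the stub is replaced with a real logger, e.g. loguru).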
self.shape_mlp_hidden_size = shape_mlp_hidden_size + self.use_keypoint_attention = use_keypoint_attention + self.use_keypoint_features_for_smpl_regression = use_keypoint_features_for_smpl_regression + self.use_position_encodings = use_position_encodings + self.use_mean_camshape = use_mean_camshape + self.use_mean_pose = use_mean_pose + + self.num_input_features = num_input_features + + if use_soft_attention: + # These options should be True by default when soft attention is used + self.use_keypoint_features_for_smpl_regression = True + self.use_hmr_regression = True + self.use_coattention = False + logger.warning('Coattention cannot be used together with soft attention') + logger.warning('Overriding use_coattention=False') + + if use_coattention: + self.use_keypoint_features_for_smpl_regression = False + logger.warning('\"use_keypoint_features_for_smpl_regression\" cannot be used together with co-attention') + logger.warning('Overriding \"use_keypoint_features_for_smpl_regression\"=False') + + if use_hmr_regression: + self.iterative_regression = False + logger.warning('iterative_regression cannot be used together with hmr regression') + + if self.use_heatmaps in ['part_segm', 'attention']: + logger.info('\"Keypoint Attention\" should be activated to be able to use part segmentation') + logger.info('Overriding use_keypoint_attention') + self.use_keypoint_attention = True + + assert num_iterations > 0, '\"num_iterations\" should be greater than 0.' + + if use_position_encodings: + assert backbone.startswith('hrnet'), 'backbone should be hrnet to use position encodings' + # self.pos_enc = get_coord_maps(size=56) + self.register_buffer('pos_enc', get_coord_maps(size=56)) + num_input_features += 2 + self.num_input_features = num_input_features + + if backbone.startswith('hrnet'): + if use_resnet_conv_hrnet: + logger.info('Using resnet block for keypoint and smpl conv layers...') + self.keypoint_deconv_layers = self._make_res_conv_layers( + input_channels=self.num_input_features, + num_channels=num_deconv_filters[-1], + num_basic_blocks=num_deconv_layers, + ) + self.num_input_features = num_input_features + self.smpl_deconv_layers = self._make_res_conv_layers( + input_channels=self.num_input_features, + num_channels=num_deconv_filters[-1], + num_basic_blocks=num_deconv_layers, + ) + else: + self.keypoint_deconv_layers = self._make_conv_layer( + num_deconv_layers, + num_deconv_filters, + (3,)*num_deconv_layers, + ) + self.num_input_features = num_input_features + self.smpl_deconv_layers = self._make_conv_layer( + num_deconv_layers, + num_deconv_filters, + (3,)*num_deconv_layers, + ) + else: + # part branch that estimates 2d keypoints + + conv_fn = self._make_upsample_layer if use_upsampling else self._make_deconv_layer + + if use_upsampling: + logger.info('Upsampling is active to increase spatial dimension') + logger.info(f'Upsampling conv kernels: {num_deconv_kernels}') + + self.keypoint_deconv_layers = conv_fn( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + # reset inplanes to 2048 -> final resnet layer + self.num_input_features = num_input_features + self.smpl_deconv_layers = conv_fn( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + + pose_mlp_inp_dim = num_deconv_filters[-1] + smpl_final_dim = num_features_smpl + shape_mlp_inp_dim = num_joints * smpl_final_dim + + if self.use_soft_attention: + logger.info('Soft attention (Stefan & Otmar 3DV) is active') + self.keypoint_final_layer = nn.Sequential( + conv3x3(num_deconv_filters[-1], 256), + 
nn.BatchNorm2d(256), + nn.ReLU(inplace=True), + conv1x1(256, num_joints+1 if self.use_heatmaps in ('part_segm', 'part_segm_pool') else num_joints), + ) + + soft_att_feature_size = smpl_final_dim # if use_hmr_regression else pose_mlp_inp_dim + self.smpl_final_layer = nn.Sequential( + conv3x3(num_deconv_filters[-1], 256), + nn.BatchNorm2d(256), + nn.ReLU(inplace=True), + conv1x1(256, soft_att_feature_size), + ) + # pose_mlp_inp_dim = soft_att_feature_size + else: + self.keypoint_final_layer = nn.Conv2d( + in_channels=num_deconv_filters[-1], + out_channels=num_joints+1 if self.use_heatmaps in ('part_segm', 'part_segm_pool') else num_joints, + kernel_size=final_conv_kernel, + stride=1, + padding=1 if final_conv_kernel == 3 else 0, + ) + + self.smpl_final_layer = nn.Conv2d( + in_channels=num_deconv_filters[-1], + out_channels=smpl_final_dim, + kernel_size=final_conv_kernel, + stride=1, + padding=1 if final_conv_kernel == 3 else 0, + ) + + # temperature for softargmax function + self.register_buffer('temperature', torch.tensor(softmax_temp)) + + # if self.iterative_regression or self.num_branch_iteration > 0 or self.use_coattention: + mean_params = np.load(SMPL_MEAN_PARAMS) + init_pose = torch.from_numpy(mean_params['pose'][:]).unsqueeze(0) + init_shape = torch.from_numpy(mean_params['shape'][:].astype('float32')).unsqueeze(0) + init_cam = torch.from_numpy(mean_params['cam']).unsqueeze(0) + self.register_buffer('init_pose', init_pose) + self.register_buffer('init_shape', init_shape) + self.register_buffer('init_cam', init_cam) + + if self.iterative_regression: + # enable iterative regression similar to HMR + # these are the features that can be used as input to final MLPs + input_type_dim = { + 'feats': 0, # image features for self + 'neighbor_pose_feats': 2 * 256, # image features from neighbor joints + 'all_pose': 24 * 6, # rot6d of all joints from previous iter + 'self_pose': 6, # rot6d of self + 'neighbor_pose': 2 * 6, # rot6d of neighbor joints from previous iter + 'shape': 10, # smpl betas/shape + 'cam': num_camera_params, # weak perspective camera + } + + assert 'feats' in shape_input_type, '\"feats\" should be the default value' + assert 'feats' in pose_input_type, '\"feats\" should be the default value' + + self.shape_input_type = shape_input_type.split('.') + self.pose_input_type = pose_input_type.split('.') + + pose_mlp_inp_dim = pose_mlp_inp_dim + sum([input_type_dim[x] for x in self.pose_input_type]) + shape_mlp_inp_dim = shape_mlp_inp_dim + sum([input_type_dim[x] for x in self.shape_input_type]) + + logger.debug(f'Shape MLP takes \"{self.shape_input_type}\" as input, ' + f'input dim: {shape_mlp_inp_dim}') + logger.debug(f'Pose MLP takes \"{self.pose_input_type}\" as input, ' + f'input dim: {pose_mlp_inp_dim}') + + self.pose_mlp_inp_dim = pose_mlp_inp_dim + self.shape_mlp_inp_dim = shape_mlp_inp_dim + + if self.use_hmr_regression: + logger.info(f'HMR regression is active...') + # enable iterative regression similar to HMR + + self.fc1 = nn.Linear(num_joints * smpl_final_dim + (num_joints * 6) + 10 + num_camera_params, 1024) + self.drop1 = nn.Dropout() + self.fc2 = nn.Linear(1024, 1024) + self.drop2 = nn.Dropout() + self.decpose = nn.Linear(1024, (num_joints * 6)) + self.decshape = nn.Linear(1024, 10) + self.deccam = nn.Linear(1024, num_camera_params) + + nn.init.xavier_uniform_(self.decpose.weight, gain=0.01) + nn.init.xavier_uniform_(self.decshape.weight, gain=0.01) + nn.init.xavier_uniform_(self.deccam.weight, gain=0.01) + else: + # here we use 2 different MLPs to estimate shape 
and camera + # They take a channelwise downsampled version of smpl features + self.shape_mlp = self._get_shape_mlp(output_size=10) + self.cam_mlp = self._get_shape_mlp(output_size=num_camera_params) + + # for pose each joint has a separate MLP + # weights for these MLPs are not shared + # hence we use Locally Connected layers + # TODO support kernel_size > 1 to access context of other joints + self.pose_mlp = self._get_pose_mlp(num_joints=num_joints, output_size=6) + + if init_xavier: + nn.init.xavier_uniform_(self.shape_mlp.weight, gain=0.01) + nn.init.xavier_uniform_(self.cam_mlp.weight, gain=0.01) + nn.init.xavier_uniform_(self.pose_mlp.weight, gain=0.01) + + if self.use_branch_nonlocal: + logger.info(f'Branch nonlocal is active, type {self.use_branch_nonlocal}') + self.branch_2d_nonlocal = eval(self.use_branch_nonlocal).NONLocalBlock2D( + in_channels=num_deconv_filters[-1], + sub_sample=False, + bn_layer=True, + ) + + self.branch_3d_nonlocal = eval(self.use_branch_nonlocal).NONLocalBlock2D( + in_channels=num_deconv_filters[-1], + sub_sample=False, + bn_layer=True, + ) + + if self.use_final_nonlocal: + logger.info(f'Final nonlocal is active, type {self.use_final_nonlocal}') + self.final_pose_nonlocal = eval(self.use_final_nonlocal).NONLocalBlock1D( + in_channels=self.pose_mlp_inp_dim, + sub_sample=False, + bn_layer=True, + ) + + self.final_shape_nonlocal = eval(self.use_final_nonlocal).NONLocalBlock1D( + in_channels=num_features_smpl, + sub_sample=False, + bn_layer=True, + ) + + if self.use_keypoint_attention: + logger.info('Keypoint attention is active') + self.keypoint_attention = KeypointAttention( + use_conv=use_postconv_keypoint_attention, + in_channels=(self.pose_mlp_inp_dim, smpl_final_dim), + out_channels=(self.pose_mlp_inp_dim, smpl_final_dim), + act=keypoint_attention_act, + use_scale=use_scale_keypoint_attention, + ) + + if self.use_coattention: + logger.info(f'Coattention is active, final conv type {self.coattention_conv}') + self.coattention = CoAttention(n_channel=num_deconv_filters[-1], final_conv=self.coattention_conv) + + if self.num_branch_iteration > 0: + logger.info(f'Branch iteration is active') + if branch_deeper: + self.branch_iter_2d_nonlocal = nn.Sequential( + conv3x3(num_deconv_filters[-1], 256), + nn.BatchNorm2d(256), + nn.ReLU(inplace=True), + dot_product.NONLocalBlock2D( + in_channels=num_deconv_filters[-1], + sub_sample=False, + bn_layer=True, + ) + ) + + self.branch_iter_3d_nonlocal = nn.Sequential( + conv3x3(num_deconv_filters[-1], 256), + nn.BatchNorm2d(256), + nn.ReLU(inplace=True), + dot_product.NONLocalBlock2D( + in_channels=num_deconv_filters[-1], + sub_sample=False, + bn_layer=True, + ) + ) + else: + self.branch_iter_2d_nonlocal = dot_product.NONLocalBlock2D( + in_channels=num_deconv_filters[-1], + sub_sample=False, + bn_layer=True, + ) + + self.branch_iter_3d_nonlocal = dot_product.NONLocalBlock2D( + in_channels=num_deconv_filters[-1], + sub_sample=False, + bn_layer=True, + ) + + def _get_shape_mlp(self, output_size): + if self.shape_mlp_num_layers == 1: + return nn.Linear(self.shape_mlp_inp_dim, output_size) + + module_list = [] + for i in range(self.shape_mlp_num_layers): + if i == 0: + module_list.append( + nn.Linear(self.shape_mlp_inp_dim, self.shape_mlp_hidden_size) + ) + elif i == self.shape_mlp_num_layers - 1: + module_list.append( + nn.Linear(self.shape_mlp_hidden_size, output_size) + ) + else: + module_list.append( + nn.Linear(self.shape_mlp_hidden_size, self.shape_mlp_hidden_size) + ) + return nn.Sequential(*module_list) + + def 
_get_pose_mlp(self, num_joints, output_size): + if self.pose_mlp_num_layers == 1: + return LocallyConnected2d( + in_channels=self.pose_mlp_inp_dim, + out_channels=output_size, + output_size=[num_joints, 1], + kernel_size=1, + stride=1, + ) + + module_list = [] + for i in range(self.pose_mlp_num_layers): + if i == 0: + module_list.append( + LocallyConnected2d( + in_channels=self.pose_mlp_inp_dim, + out_channels=self.pose_mlp_hidden_size, + output_size=[num_joints, 1], + kernel_size=1, + stride=1, + ) + ) + elif i == self.pose_mlp_num_layers - 1: + module_list.append( + LocallyConnected2d( + in_channels=self.pose_mlp_hidden_size, + out_channels=output_size, + output_size=[num_joints, 1], + kernel_size=1, + stride=1, + ) + ) + else: + module_list.append( + LocallyConnected2d( + in_channels=self.pose_mlp_hidden_size, + out_channels=self.pose_mlp_hidden_size, + output_size=[num_joints, 1], + kernel_size=1, + stride=1, + ) + ) + return nn.Sequential(*module_list) + + def _get_deconv_cfg(self, deconv_kernel): + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + + return deconv_kernel, padding, output_padding + + def _make_conv_layer(self, num_layers, num_filters, num_kernels): + assert num_layers == len(num_filters), \ + 'ERROR: num_conv_layers is different len(num_conv_filters)' + assert num_layers == len(num_kernels), \ + 'ERROR: num_conv_layers is different len(num_conv_filters)' + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append( + nn.Conv2d( + in_channels=self.num_input_features, + out_channels=planes, + kernel_size=kernel, + stride=1, + padding=padding, + bias=self.deconv_with_bias)) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + self.num_input_features = planes + + return nn.Sequential(*layers) + + def _make_res_conv_layers(self, input_channels, num_channels=64, + num_heads=1, num_basic_blocks=2): + head_layers = [] + + # kernel_sizes, strides, paddings = self._get_trans_cfg() + # for kernel_size, padding, stride in zip(kernel_sizes, paddings, strides): + head_layers.append(nn.Sequential( + nn.Conv2d( + in_channels=input_channels, + out_channels=num_channels, + kernel_size=3, + stride=1, + padding=1 + ), + nn.BatchNorm2d(num_channels, momentum=BN_MOMENTUM), + nn.ReLU(inplace=True)) + ) + + for i in range(num_heads): + layers = [] + for _ in range(num_basic_blocks): + layers.append(nn.Sequential(BasicBlock(num_channels, num_channels))) + head_layers.append(nn.Sequential(*layers)) + + # head_layers.append(nn.Conv2d(in_channels=num_channels, out_channels=output_channels, + # kernel_size=1, stride=1, padding=0)) + + return nn.Sequential(*head_layers) + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + assert num_layers == len(num_filters), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + assert num_layers == len(num_kernels), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append( + nn.ConvTranspose2d( + in_channels=self.num_input_features, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=self.deconv_with_bias)) 
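+            # With the default kernel_size=4 / stride=2 / padding=1 configuration each
+            # ConvTranspose2d doubles the spatial resolution, so the default three deconv
+            # layers upsample the backbone feature map by 8x (e.g. 7x7 -> 56x56 for a
+            # ResNet-50 feature map computed from a 224x224 crop).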
+ layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + # if self.use_self_attention: + # layers.append(SelfAttention(planes)) + self.num_input_features = planes + + return nn.Sequential(*layers) + + def _make_upsample_layer(self, num_layers, num_filters, num_kernels): + assert num_layers == len(num_filters), \ + 'ERROR: num_layers is different len(num_filters)' + assert num_layers == len(num_kernels), \ + 'ERROR: num_layers is different len(num_filters)' + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append(nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)) + layers.append( + nn.Conv2d(in_channels=self.num_input_features, out_channels=planes, + kernel_size=kernel, stride=1, padding=padding, bias=self.deconv_with_bias) + ) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + # if self.use_self_attention: + # layers.append(SelfAttention(planes)) + self.num_input_features = planes + + return nn.Sequential(*layers) + + def _prepare_pose_mlp_inp(self, feats, pred_pose, pred_shape, pred_cam): + # feats shape: [N, 256, J, 1] + # pose shape: [N, 6, J, 1] + # cam shape: [N, 3] + # beta shape: [N, 10] + batch_size, num_joints = pred_pose.shape[0], pred_pose.shape[2] + + joint_triplets = get_smpl_neighbor_triplets() + + inp_list = [] + + for inp_type in self.pose_input_type: + if inp_type == 'feats': + # add image features + inp_list.append(feats) + + if inp_type == 'neighbor_pose_feats': + # add the image features from neighboring joints + n_pose_feat = [] + for jt in joint_triplets: + n_pose_feat.append( + feats[:, :, jt[1:]].reshape(batch_size, -1, 1).unsqueeze(-2) + ) + n_pose_feat = torch.cat(n_pose_feat, 2) + inp_list.append(n_pose_feat) + + if inp_type == 'self_pose': + # add image features + inp_list.append(pred_pose) + + if inp_type == 'all_pose': + # append all of the joint angels + all_pose = pred_pose.reshape(batch_size, -1, 1)[..., None].repeat(1, 1, num_joints, 1) + inp_list.append(all_pose) + + if inp_type == 'neighbor_pose': + # append only the joint angles of neighboring ones + n_pose = [] + for jt in joint_triplets: + n_pose.append( + pred_pose[:,:,jt[1:]].reshape(batch_size, -1, 1).unsqueeze(-2) + ) + n_pose = torch.cat(n_pose, 2) + inp_list.append(n_pose) + + if inp_type == 'shape': + # append shape predictions + pred_shape = pred_shape[..., None, None].repeat(1, 1, num_joints, 1) + inp_list.append(pred_shape) + + if inp_type == 'cam': + # append camera predictions + pred_cam = pred_cam[..., None, None].repeat(1, 1, num_joints, 1) + inp_list.append(pred_cam) + + assert len(inp_list) > 0 + + # for i,inp in enumerate(inp_list): + # print(i, inp.shape) + + return torch.cat(inp_list, 1) + + def _prepare_shape_mlp_inp(self, feats, pred_pose, pred_shape, pred_cam): + # feats shape: [N, 256, J, 1] + # pose shape: [N, 6, J, 1] + # cam shape: [N, 3] + # beta shape: [N, 10] + batch_size, num_joints = pred_pose.shape[:2] + + inp_list = [] + + for inp_type in self.shape_input_type: + if inp_type == 'feats': + # add image features + inp_list.append(feats) + + if inp_type == 'all_pose': + # append all of the joint angels + pred_pose = pred_pose.reshape(batch_size, -1) + inp_list.append(pred_pose) + + if inp_type == 'shape': + # append shape predictions + inp_list.append(pred_shape) + + if inp_type == 'cam': + # append camera predictions + inp_list.append(pred_cam) + + 
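+        # At this point `inp_list` holds the flattened image features plus whichever of
+        # all_pose / shape / cam were requested via `shape_input_type`; they are
+        # concatenated along dim 1 below into a vector of size `shape_mlp_inp_dim`.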
assert len(inp_list) > 0 + + return torch.cat(inp_list, 1) + + def forward(self, features, gt_segm=None): + batch_size = features.shape[0] + + init_pose = self.init_pose.expand(batch_size, -1) # N, Jx6 + init_shape = self.init_shape.expand(batch_size, -1) + init_cam = self.init_cam.expand(batch_size, -1) + + if self.use_position_encodings: + features = torch.cat((features, self.pos_enc.repeat(features.shape[0], 1, 1, 1)), 1) + + output = {} + + ############## 2D PART BRANCH FEATURES ############## + part_feats = self._get_2d_branch_feats(features) + + ############## GET PART ATTENTION MAP ############## + part_attention = self._get_part_attention_map(part_feats, output) + + ############## 3D SMPL BRANCH FEATURES ############## + smpl_feats = self._get_3d_smpl_feats(features, part_feats) + + ############## SAMPLE LOCAL FEATURES ############## + if gt_segm is not None: + # logger.debug(gt_segm.shape) + # import IPython; IPython.embed(); exit() + gt_segm = F.interpolate(gt_segm.unsqueeze(1).float(), scale_factor=(1/4, 1/4), mode='nearest').long().squeeze(1) + part_attention = F.one_hot(gt_segm.to('cpu'), num_classes=self.num_joints + 1).permute(0,3,1,2).float()[:,1:,:,:] + part_attention = part_attention.to('cuda') + # part_attention = F.interpolate(part_attention, scale_factor=1/4, mode='bilinear', align_corners=True) + # import IPython; IPython.embed(); exit() + point_local_feat, cam_shape_feats = self._get_local_feats(smpl_feats, part_attention, output) + + ############## GET FINAL PREDICTIONS ############## + pred_pose, pred_shape, pred_cam = self._get_final_preds( + point_local_feat, cam_shape_feats, init_pose, init_shape, init_cam + ) + + if self.use_coattention: + for c in range(self.num_coattention_iter): + smpl_feats, part_feats = self.coattention(smpl_feats, part_feats) + part_attention = self._get_part_attention_map(part_feats, output) + point_local_feat, cam_shape_feats = self._get_local_feats(smpl_feats, part_attention, output) + pred_pose, pred_shape, pred_cam = self._get_final_preds( + point_local_feat, cam_shape_feats, pred_pose, pred_shape, pred_cam + ) + + if self.num_branch_iteration > 0: + for nbi in range(self.num_branch_iteration): + if self.use_soft_attention: + smpl_feats = self.branch_iter_3d_nonlocal(smpl_feats) + part_feats = self.branch_iter_2d_nonlocal(part_feats) + else: + smpl_feats = self.branch_iter_3d_nonlocal(smpl_feats) + part_feats = smpl_feats + + part_attention = self._get_part_attention_map(part_feats, output) + point_local_feat, cam_shape_feats = self._get_local_feats(smpl_feats, part_attention, output) + pred_pose, pred_shape, pred_cam = self._get_final_preds( + point_local_feat, cam_shape_feats, pred_pose, pred_shape, pred_cam, + ) + + pred_rotmat = rot6d_to_rotmat(pred_pose).reshape(batch_size, 24, 3, 3) + + output.update({ + 'pred_pose': pred_rotmat, + 'pred_cam': pred_cam, + 'pred_shape': pred_shape, + }) + return output + + def _get_local_feats(self, smpl_feats, part_attention, output): + cam_shape_feats = self.smpl_final_layer(smpl_feats) + + if self.use_keypoint_attention: + point_local_feat = self.keypoint_attention(smpl_feats, part_attention) + cam_shape_feats = self.keypoint_attention(cam_shape_feats, part_attention) + else: + point_local_feat = interpolate(smpl_feats, output['pred_kp2d']) + cam_shape_feats = interpolate(cam_shape_feats, output['pred_kp2d']) + return point_local_feat, cam_shape_feats + + def _get_2d_branch_feats(self, features): + part_feats = self.keypoint_deconv_layers(features) + if self.use_branch_nonlocal: + part_feats 
= self.branch_2d_nonlocal(part_feats) + return part_feats + + def _get_3d_smpl_feats(self, features, part_feats): + if self.use_keypoint_features_for_smpl_regression: + smpl_feats = part_feats + else: + smpl_feats = self.smpl_deconv_layers(features) + if self.use_branch_nonlocal: + smpl_feats = self.branch_3d_nonlocal(smpl_feats) + + return smpl_feats + + def _get_part_attention_map(self, part_feats, output): + heatmaps = self.keypoint_final_layer(part_feats) + + if self.use_heatmaps == 'hm': + # returns coords between [-1,1] + pred_kp2d, confidence = get_heatmap_preds(heatmaps) + output['pred_kp2d'] = pred_kp2d + output['pred_kp2d_conf'] = confidence + output['pred_heatmaps_2d'] = heatmaps + elif self.use_heatmaps == 'hm_soft': + pred_kp2d, _ = softargmax2d(heatmaps, self.temperature) + output['pred_kp2d'] = pred_kp2d + output['pred_heatmaps_2d'] = heatmaps + elif self.use_heatmaps == 'part_segm': + output['pred_segm_mask'] = heatmaps + heatmaps = heatmaps[:,1:,:,:] # remove the first channel which encodes the background + elif self.use_heatmaps == 'part_segm_pool': + output['pred_segm_mask'] = heatmaps + heatmaps = heatmaps[:,1:,:,:] # remove the first channel which encodes the background + pred_kp2d, _ = softargmax2d(heatmaps, self.temperature) # get_heatmap_preds(heatmaps) + output['pred_kp2d'] = pred_kp2d + + for k, v in output.items(): + if torch.any(torch.isnan(v)): + logger.debug(f'{k} is Nan!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!') + if torch.any(torch.isinf(v)): + logger.debug(f'{k} is Inf!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!') + + # if torch.any(torch.isnan(pred_kp2d)): + # print('pred_kp2d nan', pred_kp2d.min(), pred_kp2d.max()) + # if torch.any(torch.isnan(heatmaps)): + # print('heatmap nan', heatmaps.min(), heatmaps.max()) + # + # if torch.any(torch.isinf(pred_kp2d)): + # print('pred_kp2d inf', pred_kp2d.min(), pred_kp2d.max()) + # if torch.any(torch.isinf(heatmaps)): + # print('heatmap inf', heatmaps.min(), heatmaps.max()) + + elif self.use_heatmaps == 'attention': + output['pred_attention'] = heatmaps + else: + # returns coords between [-1,1] + pred_kp2d, _ = softargmax2d(heatmaps, self.temperature) + output['pred_kp2d'] = pred_kp2d + output['pred_heatmaps_2d'] = heatmaps + return heatmaps + + def _get_final_preds(self, pose_feats, cam_shape_feats, init_pose, init_shape, init_cam): + if self.use_hmr_regression: + return self._hmr_get_final_preds(cam_shape_feats, init_pose, init_shape, init_cam) + else: + return self._pare_get_final_preds(pose_feats, cam_shape_feats, init_pose, init_shape, init_cam) + + def _hmr_get_final_preds(self, cam_shape_feats, init_pose, init_shape, init_cam): + if self.use_final_nonlocal: + cam_shape_feats = self.final_shape_nonlocal(cam_shape_feats) + + xf = torch.flatten(cam_shape_feats, start_dim=1) + + pred_pose = init_pose + pred_shape = init_shape + pred_cam = init_cam + for i in range(3): + xc = torch.cat([xf, pred_pose, pred_shape, pred_cam], 1) + xc = self.fc1(xc) + xc = self.drop1(xc) + xc = self.fc2(xc) + xc = self.drop2(xc) + pred_pose = self.decpose(xc) + pred_pose + pred_shape = self.decshape(xc) + pred_shape + pred_cam = self.deccam(xc) + pred_cam + + return pred_pose, pred_shape, pred_cam + + def _pare_get_final_preds(self, pose_feats, cam_shape_feats, init_pose, init_shape, init_cam): + pose_feats = pose_feats.unsqueeze(-1) # + + if init_pose.shape[-1] == 6: + # This means init_pose comes from a previous iteration + init_pose = init_pose.transpose(2,1).unsqueeze(-1) + else: + # This means init pose comes from mean pose 
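+            # reshape the flat [N, 24*6] mean-pose vector into the [N, 6, J, 1] layout
+            # expected by the per-joint pose MLP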
+ init_pose = init_pose.reshape(init_pose.shape[0], 6, -1).unsqueeze(-1) + + if self.iterative_regression: + + shape_feats = torch.flatten(cam_shape_feats, start_dim=1) + + pred_pose = init_pose # [N, 6, J, 1] + pred_cam = init_cam # [N, 3] + pred_shape = init_shape # [N, 10] + + # import IPython; IPython.embed(); exit(1) + + for i in range(self.num_iterations): + # pose_feats shape: [N, 256, 24, 1] + # shape_feats shape: [N, 24*64] + pose_mlp_inp = self._prepare_pose_mlp_inp(pose_feats, pred_pose, pred_shape, pred_cam) + shape_mlp_inp = self._prepare_shape_mlp_inp(shape_feats, pred_pose, pred_shape, pred_cam) + + # print('pose_mlp_inp', pose_mlp_inp.shape) + # print('shape_mlp_inp', shape_mlp_inp.shape) + # TODO: this does not work but let it go since we dont use iterative regression for now. + # if self.use_final_nonlocal: + # pose_mlp_inp = self.final_pose_nonlocal(pose_mlp_inp) + # shape_mlp_inp = self.final_shape_nonlocal(shape_mlp_inp) + + if self.iter_residual: + pred_pose = self.pose_mlp(pose_mlp_inp) + pred_pose + pred_cam = self.cam_mlp(shape_mlp_inp) + pred_cam + pred_shape = self.shape_mlp(shape_mlp_inp) + pred_shape + else: + pred_pose = self.pose_mlp(pose_mlp_inp) + pred_cam = self.cam_mlp(shape_mlp_inp) + pred_shape = self.shape_mlp(shape_mlp_inp) + init_shape + else: + shape_feats = cam_shape_feats + if self.use_final_nonlocal: + pose_feats = self.final_pose_nonlocal(pose_feats.squeeze(-1)).unsqueeze(-1) + shape_feats = self.final_shape_nonlocal(shape_feats) + + shape_feats = torch.flatten(shape_feats, start_dim=1) + + pred_pose = self.pose_mlp(pose_feats) + pred_cam = self.cam_mlp(shape_feats) + pred_shape = self.shape_mlp(shape_feats) + + if self.use_mean_camshape: + pred_cam = pred_cam + init_cam + pred_shape = pred_shape + init_shape + + if self.use_mean_pose: + pred_pose = pred_pose + init_pose + + + pred_pose = pred_pose.squeeze(-1).transpose(2, 1) # N, J, 6 + return pred_pose, pred_shape, pred_cam + + def forward_pretraining(self, features): + # TODO: implement pretraining + kp_feats = self.keypoint_deconv_layers(features) + heatmaps = self.keypoint_final_layer(kp_feats) + + output = {} + + if self.use_heatmaps == 'hm': + # returns coords between [-1,1] + pred_kp2d, confidence = get_heatmap_preds(heatmaps) + output['pred_kp2d'] = pred_kp2d + output['pred_kp2d_conf'] = confidence + elif self.use_heatmaps == 'hm_soft': + pred_kp2d, _ = softargmax2d(heatmaps, self.temperature) + output['pred_kp2d'] = pred_kp2d + else: + # returns coords between [-1,1] + pred_kp2d, _ = softargmax2d(heatmaps, self.temperature) + output['pred_kp2d'] = pred_kp2d + + if self.use_keypoint_features_for_smpl_regression: + smpl_feats = kp_feats + else: + smpl_feats = self.smpl_deconv_layers(features) + + cam_shape_feats = self.smpl_final_layer(smpl_feats) + + output.update({ + 'kp_feats': heatmaps, + 'heatmaps': heatmaps, + 'smpl_feats': smpl_feats, + 'cam_shape_feats': cam_shape_feats, + }) + return output \ No newline at end of file diff --git a/myeasymocap/backbone/pare/head/smpl_cam_head.py b/myeasymocap/backbone/pare/head/smpl_cam_head.py new file mode 100644 index 0000000..e3f0edb --- /dev/null +++ b/myeasymocap/backbone/pare/head/smpl_cam_head.py @@ -0,0 +1,133 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. 
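+#
+# NOTE: `convert_pare_to_full_img_cam` at the bottom of this file accepts a `crop_res`
+# argument but uses a hard-coded `res = 224` internally, so passing a different crop
+# resolution currently has no effect; the depth term it computes simplifies to
+# tz = 2 * focal_length / (s * bbox_height).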
+# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. +# +# Contact: ps-license@tuebingen.mpg.de + +import torch +import torch.nn as nn + +from .. import config +from .smpl_head import SMPL + + +class SMPLCamHead(nn.Module): + def __init__(self, img_res=224): + super(SMPLCamHead, self).__init__() + self.smpl = SMPL(config.SMPL_MODEL_DIR, create_transl=False) + self.add_module('smpl', self.smpl) + + self.img_res = img_res + + def forward(self, rotmat, shape, cam, cam_rotmat, cam_intrinsics, + bbox_scale, bbox_center, img_w, img_h, normalize_joints2d=False): + ''' + :param rotmat: rotation in euler angles format (N,J,3,3) + :param shape: smpl betas + :param cam: weak perspective camera + :param normalize_joints2d: bool, normalize joints between -1, 1 if true + :param cam_rotmat (Nx3x3) camera rotation matrix + :param cam_intrinsics (Nx3x3) camera intrinsics matrix + :param bbox_scale (N,) bbox height normalized by 200 + :param bbox_center (N,2) bbox center + :param img_w (N,) original image width + :param img_h (N,) original image height + :return: dict with keys 'vertices', 'joints3d', 'joints2d' if cam is True + ''' + smpl_output = self.smpl( + betas=shape, + body_pose=rotmat[:, 1:].contiguous(), + global_orient=rotmat[:, 0].unsqueeze(1).contiguous(), + pose2rot=False, + ) + + output = { + 'smpl_vertices': smpl_output.vertices, + 'smpl_joints3d': smpl_output.joints, + } + + joints3d = smpl_output.joints + + cam_t = convert_pare_to_full_img_cam( + pare_cam=cam, + bbox_height=bbox_scale * 200., + bbox_center=bbox_center, + img_w=img_w, + img_h=img_h, + focal_length=cam_intrinsics[:, 0, 0], + crop_res=self.img_res, + ) + + joints2d = perspective_projection( + joints3d, + rotation=cam_rotmat, + translation=cam_t, + cam_intrinsics=cam_intrinsics, + ) + + # logger.debug(f'PARE cam: {cam}') + # logger.debug(f'FIMG cam: {cam_t}') + # logger.debug(f'joints2d: {joints2d}') + + + if normalize_joints2d: + # Normalize keypoints to [-1,1] + joints2d = joints2d / (self.img_res / 2.) + + output['smpl_joints2d'] = joints2d + output['pred_cam_t'] = cam_t + + return output + + +def perspective_projection(points, rotation, translation, cam_intrinsics): + """ + This function computes the perspective projection of a set of points. 
+ Input: + points (bs, N, 3): 3D points + rotation (bs, 3, 3): Camera rotation + translation (bs, 3): Camera translation + cam_intrinsics (bs, 3, 3): Camera intrinsics + """ + K = cam_intrinsics + + # Transform points + points = torch.einsum('bij,bkj->bki', rotation, points) + points = points + translation.unsqueeze(1) + + # Apply perspective distortion + projected_points = points / points[:,:,-1].unsqueeze(-1) + + # Apply camera intrinsics + projected_points = torch.einsum('bij,bkj->bki', K, projected_points.float()) + + return projected_points[:, :, :-1] + + +def convert_pare_to_full_img_cam( + pare_cam, bbox_height, bbox_center, + img_w, img_h, focal_length, crop_res=224): + # Converts weak perspective camera estimated by PARE in + # bbox coords to perspective camera in full image coordinates + # from https://arxiv.org/pdf/2009.06549.pdf + s, tx, ty = pare_cam[:, 0], pare_cam[:, 1], pare_cam[:, 2] + res = 224 + r = bbox_height / res + tz = 2 * focal_length / (r * res * s) + + cx = 2 * (bbox_center[:, 0] - (img_w / 2.)) / (s * bbox_height) + cy = 2 * (bbox_center[:, 1] - (img_h / 2.)) / (s * bbox_height) + + cam_t = torch.stack([tx + cx, ty + cy, tz], dim=-1) + + return cam_t \ No newline at end of file diff --git a/myeasymocap/backbone/pare/head/smpl_head.py b/myeasymocap/backbone/pare/head/smpl_head.py new file mode 100644 index 0000000..623dafb --- /dev/null +++ b/myeasymocap/backbone/pare/head/smpl_head.py @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. +# +# Contact: ps-license@tuebingen.mpg.de + +import torch +import numpy as np +import torch.nn as nn + +from smplx import SMPL as _SMPL +from smplx.utils import SMPLOutput +from smplx.lbs import vertices2joints + +from .. 
import config, constants +from ..utils.geometry import perspective_projection, convert_weak_perspective_to_perspective + + +class SMPL(_SMPL): + """ Extension of the official SMPL implementation to support more joints """ + + def __init__(self, *args, **kwargs): + super(SMPL, self).__init__(*args, **kwargs) + joints = [constants.JOINT_MAP[i] for i in constants.JOINT_NAMES] + J_regressor_extra = np.load(config.JOINT_REGRESSOR_TRAIN_EXTRA) + self.register_buffer('J_regressor_extra', torch.tensor(J_regressor_extra, dtype=torch.float32)) + self.joint_map = torch.tensor(joints, dtype=torch.long) + + def forward(self, *args, **kwargs): + kwargs['get_skin'] = True + smpl_output = super(SMPL, self).forward(*args, **kwargs) + extra_joints = vertices2joints(self.J_regressor_extra, smpl_output.vertices) + joints = torch.cat([smpl_output.joints, extra_joints], dim=1) + joints = joints[:, self.joint_map, :] + output = SMPLOutput(vertices=smpl_output.vertices, + global_orient=smpl_output.global_orient, + body_pose=smpl_output.body_pose, + joints=joints, + betas=smpl_output.betas, + full_pose=smpl_output.full_pose) + return output + + +class SMPLHead(nn.Module): + def __init__(self, focal_length=5000., img_res=224): + super(SMPLHead, self).__init__() + self.smpl = SMPL(config.SMPL_MODEL_DIR, create_transl=False) + self.add_module('smpl', self.smpl) + self.focal_length = focal_length + self.img_res = img_res + + def forward(self, rotmat, shape, cam=None, normalize_joints2d=False): + ''' + :param rotmat: rotation in euler angles format (N,J,3,3) + :param shape: smpl betas + :param cam: weak perspective camera + :param normalize_joints2d: bool, normalize joints between -1, 1 if true + :return: dict with keys 'vertices', 'joints3d', 'joints2d' if cam is True + ''' + smpl_output = self.smpl( + betas=shape, + body_pose=rotmat[:, 1:].contiguous(), + global_orient=rotmat[:, 0].unsqueeze(1).contiguous(), + pose2rot=False, + ) + + output = { + 'smpl_vertices': smpl_output.vertices, + 'smpl_joints3d': smpl_output.joints, + } + if cam is not None: + joints3d = smpl_output.joints + batch_size = joints3d.shape[0] + device = joints3d.device + cam_t = convert_weak_perspective_to_perspective( + cam, + focal_length=self.focal_length, + img_res=self.img_res, + ) + joints2d = perspective_projection( + joints3d, + rotation=torch.eye(3, device=device).unsqueeze(0).expand(batch_size, -1, -1), + translation=cam_t, + focal_length=self.focal_length, + camera_center=torch.zeros(batch_size, 2, device=device) + ) + if normalize_joints2d: + # Normalize keypoints to [-1,1] + joints2d = joints2d / (self.img_res / 2.) + + output['smpl_joints2d'] = joints2d + output['pred_cam_t'] = cam_t + + return output diff --git a/myeasymocap/backbone/pare/layers/__init__.py b/myeasymocap/backbone/pare/layers/__init__.py new file mode 100644 index 0000000..106fb2e --- /dev/null +++ b/myeasymocap/backbone/pare/layers/__init__.py @@ -0,0 +1,4 @@ +from .locallyconnected2d import LocallyConnected2d +from .interpolate import interpolate +from .nonlocalattention import NonLocalAttention +from .keypoint_attention import KeypointAttention \ No newline at end of file diff --git a/myeasymocap/backbone/pare/layers/coattention.py b/myeasymocap/backbone/pare/layers/coattention.py new file mode 100644 index 0000000..d9efa70 --- /dev/null +++ b/myeasymocap/backbone/pare/layers/coattention.py @@ -0,0 +1,126 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. 
(MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. +# +# Contact: ps-license@tuebingen.mpg.de + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..backbone.resnet import conv1x1, conv3x3 + + +class CoAttention(nn.Module): + def __init__( + self, + n_channel, + final_conv='simple', # 'double_1', 'double_3', 'single_1', 'single_3', 'simple' + ): + super(CoAttention, self).__init__() + self.linear_e = nn.Linear(n_channel, n_channel, bias=False) + self.channel = n_channel + # self.dim = all_dim + self.gate = nn.Conv2d(n_channel, 1, kernel_size=1, bias=False) + self.gate_s = nn.Sigmoid() + self.softmax = nn.Sigmoid() + + if final_conv.startswith('double'): + kernel_size = int(final_conv[-1]) + conv = conv1x1 if kernel_size == 1 else conv3x3 + self.final_conv_1 = nn.Sequential( + conv(n_channel * 2, n_channel), + nn.BatchNorm2d(n_channel), + nn.ReLU(inplace=True), + conv(n_channel, n_channel), + nn.BatchNorm2d(n_channel), + nn.ReLU(inplace=True), + ) + self.final_conv_2 = nn.Sequential( + conv(n_channel * 2, n_channel), + nn.BatchNorm2d(n_channel), + nn.ReLU(inplace=True), + conv(n_channel, n_channel), + nn.BatchNorm2d(n_channel), + nn.ReLU(inplace=True), + ) + elif final_conv.startswith('single'): + kernel_size = int(final_conv[-1]) + conv = conv1x1 if kernel_size == 1 else conv3x3 + self.final_conv_1 = nn.Sequential( + conv(n_channel*2, n_channel), + nn.BatchNorm2d(n_channel), + nn.ReLU(inplace=True), + ) + self.final_conv_2 = nn.Sequential( + conv(n_channel*2, n_channel), + nn.BatchNorm2d(n_channel), + nn.ReLU(inplace=True), + ) + elif final_conv == 'simple': + self.final_conv_1 = conv1x1(n_channel * 2, n_channel) + self.final_conv_2 = conv1x1(n_channel * 2, n_channel) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, 0.01) + # init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + # init.xavier_normal(m.weight.data) + # m.bias.data.fill_(0) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def forward(self, input_1, input_2): + ''' + input_1: [N, C, H, W] + input_2: [N, C, H, W] + ''' + + b, c, h, w = input_1.shape + exemplar, query = input_1, input_2 + + exemplar_flat = exemplar.reshape(-1, c, h*w) # N,C,H*W + query_flat = query.reshape(-1, c, h*w) + + # Compute coattention scores, S in the paper + exemplar_t = torch.transpose(exemplar_flat, 1, 2).contiguous() # batch size x dim x num + exemplar_corr = self.linear_e(exemplar_t) + A = torch.bmm(exemplar_corr, query_flat) + A1 = F.softmax(A.clone(), dim=1) + B = F.softmax(torch.transpose(A, 1, 2), dim=1) + query_att = torch.bmm(exemplar_flat, A1) + exemplar_att = torch.bmm(query_flat, B) + + input1_att = exemplar_att.reshape(-1, c, h, w) + input2_att = query_att.reshape(-1, c, h, w) + + # Apply gating on S, section gated coattention + input1_mask = self.gate(input1_att) + input2_mask = self.gate(input2_att) + + input1_mask = self.gate_s(input1_mask) + input2_mask 
= self.gate_s(input2_mask) + + input1_att = input1_att * input1_mask + input2_att = input2_att * input2_mask + + # Concatenate inputs with their attended version + input1_att = torch.cat([input1_att, exemplar], 1) + input2_att = torch.cat([input2_att, query], 1) + + input1 = self.final_conv_1(input1_att) + input2 = self.final_conv_2(input2_att) + + return input1, input2 diff --git a/myeasymocap/backbone/pare/layers/interpolate.py b/myeasymocap/backbone/pare/layers/interpolate.py new file mode 100644 index 0000000..754eb2a --- /dev/null +++ b/myeasymocap/backbone/pare/layers/interpolate.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. +# +# Contact: ps-license@tuebingen.mpg.de + +import torch + +def interpolate(feat, uv): + ''' + + :param feat: [B, C, H, W] image features + :param uv: [B, 2, N] uv coordinates in the image plane, range [-1, 1] + :return: [B, C, N] image features at the uv coordinates + ''' + if uv.shape[-1] != 2: + uv = uv.transpose(1, 2) # [B, N, 2] + uv = uv.unsqueeze(2) # [B, N, 1, 2] + # NOTE: for newer PyTorch, it seems that training results are degraded due to implementation diff in F.grid_sample + # for old versions, simply remove the aligned_corners argument. + if int(torch.__version__.split('.')[1]) < 4: + samples = torch.nn.functional.grid_sample(feat, uv) # [B, C, N, 1] + else: + samples = torch.nn.functional.grid_sample(feat, uv, align_corners=True) # [B, C, N, 1] + return samples[:, :, :, 0] # [B, C, N] \ No newline at end of file diff --git a/myeasymocap/backbone/pare/layers/keypoint_attention.py b/myeasymocap/backbone/pare/layers/keypoint_attention.py new file mode 100644 index 0000000..0039ac2 --- /dev/null +++ b/myeasymocap/backbone/pare/layers/keypoint_attention.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. 
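+#
+# NOTE: KeypointAttention below turns each joint's heatmap into a spatial weighting
+# (softmax, or sigmoid when configured) and uses it to pool the feature map, i.e.
+#   attended[b, :, j] = sum_hw normalized_heatmap[b, j, hw] * features[b, :, hw],
+# yielding one feature vector per joint ([N, C, J]) for the per-joint pose MLPs.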
+# +# Contact: ps-license@tuebingen.mpg.de + +import torch +import numpy as np +import torch.nn as nn +import torch.nn.functional as F + +class KeypointAttention(nn.Module): + def __init__(self, use_conv=False, in_channels=(256, 64), out_channels=(256, 64), act='softmax', use_scale=False): + super(KeypointAttention, self).__init__() + self.use_conv = use_conv + self.in_channels = in_channels + self.out_channels = out_channels + self.act = act + self.use_scale = use_scale + if use_conv: + self.conv1x1_pose = nn.Conv1d(in_channels[0], out_channels[0], kernel_size=1) + self.conv1x1_shape_cam = nn.Conv1d(in_channels[1], out_channels[1], kernel_size=1) + + def forward(self, features, heatmaps): + batch_size, num_joints, height, width = heatmaps.shape + + if self.use_scale: + scale = 1.0 / np.sqrt(height * width) + heatmaps = heatmaps * scale + + if self.act == 'softmax': + normalized_heatmap = F.softmax(heatmaps.reshape(batch_size, num_joints, -1), dim=-1) + elif self.act == 'sigmoid': + normalized_heatmap = torch.sigmoid(heatmaps.reshape(batch_size, num_joints, -1)) + features = features.reshape(batch_size, -1, height*width) + + attended_features = torch.matmul(normalized_heatmap, features.transpose(2,1)) + attended_features = attended_features.transpose(2,1) + + if self.use_conv: + if attended_features.shape[1] == self.in_channels[0]: + attended_features = self.conv1x1_pose(attended_features) + else: + attended_features = self.conv1x1_shape_cam(attended_features) + + return attended_features \ No newline at end of file diff --git a/myeasymocap/backbone/pare/layers/locallyconnected2d.py b/myeasymocap/backbone/pare/layers/locallyconnected2d.py new file mode 100644 index 0000000..6c54cee --- /dev/null +++ b/myeasymocap/backbone/pare/layers/locallyconnected2d.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. 
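+#
+# NOTE: as used in this repo (kernel_size=1, output_size=[num_joints, 1]),
+# LocallyConnected2d behaves like a 1x1 convolution whose weights are NOT shared across
+# positions: every joint gets its own small linear layer, which is how the PARE head
+# evaluates a separate pose MLP per joint in a single batched operation.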
+# +# Contact: ps-license@tuebingen.mpg.de + +import torch +import torch.nn as nn +from torch.nn.modules.utils import _pair + + +class LocallyConnected2d(nn.Module): + def __init__(self, in_channels, out_channels, output_size, kernel_size, stride, bias=False): + super(LocallyConnected2d, self).__init__() + output_size = _pair(output_size) + self.weight = nn.Parameter( + torch.randn(1, out_channels, in_channels, output_size[0], output_size[1], kernel_size ** 2), + requires_grad=True, + ) + if bias: + self.bias = nn.Parameter( + torch.randn(1, out_channels, output_size[0], output_size[1]), requires_grad=True + ) + else: + self.register_parameter('bias', None) + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + + def forward(self, x): + _, c, h, w = x.size() + kh, kw = self.kernel_size + dh, dw = self.stride + x = x.unfold(2, kh, dh).unfold(3, kw, dw) + x = x.contiguous().view(*x.size()[:-2], -1) + # Sum in in_channel and kernel_size dims + out = (x.unsqueeze(1) * self.weight).sum([2, -1]) + if self.bias is not None: + out += self.bias + return out \ No newline at end of file diff --git a/myeasymocap/backbone/pare/layers/non_local/__init__.py b/myeasymocap/backbone/pare/layers/non_local/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/myeasymocap/backbone/pare/layers/non_local/dot_product.py b/myeasymocap/backbone/pare/layers/non_local/dot_product.py new file mode 100644 index 0000000..e020e71 --- /dev/null +++ b/myeasymocap/backbone/pare/layers/non_local/dot_product.py @@ -0,0 +1,152 @@ +import torch +from torch import nn +from torch.nn import functional as F + + +class _NonLocalBlockND(nn.Module): + def __init__(self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True): + super(_NonLocalBlockND, self).__init__() + + assert dimension in [1, 2, 3] + + self.dimension = dimension + self.sub_sample = sub_sample + + self.in_channels = in_channels + self.inter_channels = inter_channels + + if self.inter_channels is None: + self.inter_channels = in_channels // 2 + if self.inter_channels == 0: + self.inter_channels = 1 + + if dimension == 3: + conv_nd = nn.Conv3d + max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2)) + bn = nn.BatchNorm3d + elif dimension == 2: + conv_nd = nn.Conv2d + max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2)) + bn = nn.BatchNorm2d + else: + conv_nd = nn.Conv1d + max_pool_layer = nn.MaxPool1d(kernel_size=(2)) + bn = nn.BatchNorm1d + + self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, + kernel_size=1, stride=1, padding=0) + + if bn_layer: + self.W = nn.Sequential( + conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, + kernel_size=1, stride=1, padding=0), + bn(self.in_channels) + ) + nn.init.constant_(self.W[1].weight, 0) + nn.init.constant_(self.W[1].bias, 0) + else: + self.W = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, + kernel_size=1, stride=1, padding=0) + nn.init.constant_(self.W.weight, 0) + nn.init.constant_(self.W.bias, 0) + + self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, + kernel_size=1, stride=1, padding=0) + + self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, + kernel_size=1, stride=1, padding=0) + + if sub_sample: + self.g = nn.Sequential(self.g, max_pool_layer) + self.phi = nn.Sequential(self.phi, max_pool_layer) + + def forward(self, x, return_nl_map=False): + """ + :param x: (b, c, t, h, w) + :param return_nl_map: if True return z, nl_map, else 
only return z. + :return: + """ + + batch_size = x.size(0) + + g_x = self.g(x).view(batch_size, self.inter_channels, -1) + g_x = g_x.permute(0, 2, 1) + + theta_x = self.theta(x).view(batch_size, self.inter_channels, -1) + theta_x = theta_x.permute(0, 2, 1) + phi_x = self.phi(x).view(batch_size, self.inter_channels, -1) + f = torch.matmul(theta_x, phi_x) + N = f.size(-1) + f_div_C = f / N + + y = torch.matmul(f_div_C, g_x) + y = y.permute(0, 2, 1).contiguous() + y = y.view(batch_size, self.inter_channels, *x.size()[2:]) + W_y = self.W(y) + z = W_y + x + + if return_nl_map: + return z, f_div_C + return z + + +class NONLocalBlock1D(_NonLocalBlockND): + def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): + super(NONLocalBlock1D, self).__init__(in_channels, + inter_channels=inter_channels, + dimension=1, sub_sample=sub_sample, + bn_layer=bn_layer) + + +class NONLocalBlock2D(_NonLocalBlockND): + def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): + super(NONLocalBlock2D, self).__init__(in_channels, + inter_channels=inter_channels, + dimension=2, sub_sample=sub_sample, + bn_layer=bn_layer) + + +class NONLocalBlock3D(_NonLocalBlockND): + def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): + super(NONLocalBlock3D, self).__init__(in_channels, + inter_channels=inter_channels, + dimension=3, sub_sample=sub_sample, + bn_layer=bn_layer) + + +if __name__ == '__main__': + import torch + + img = torch.zeros(2, 256, 24) + net = NONLocalBlock1D( + in_channels=256, inter_channels=None, sub_sample=False, bn_layer=True + ) + out = net(img) + print(out.size()) + + img = torch.zeros(2, 256, 56, 56) + net = NONLocalBlock2D( + in_channels=256, inter_channels=None, sub_sample=False, bn_layer=True + ) + out = net(img) + print(out.size()) + + + # for (sub_sample_, bn_layer_) in [(True, True), (False, False), (True, False), (False, True)]: + # img = torch.zeros(2, 256, 24) + # net = NONLocalBlock1D(256, inter_channels=24, sub_sample=sub_sample_, bn_layer=bn_layer_) + # out = net(img) + # print(out.size()) + # + # img = torch.zeros(2, 3, 20, 20) + # net = NONLocalBlock2D(3, sub_sample=sub_sample_, bn_layer=bn_layer_) + # out = net(img) + # print(out.size()) + # + # img = torch.randn(2, 3, 8, 20, 20) + # net = NONLocalBlock3D(3, sub_sample=sub_sample_, bn_layer=bn_layer_) + # out = net(img) + # print(out.size()) + + + diff --git a/myeasymocap/backbone/pare/layers/nonlocalattention.py b/myeasymocap/backbone/pare/layers/nonlocalattention.py new file mode 100644 index 0000000..f63c663 --- /dev/null +++ b/myeasymocap/backbone/pare/layers/nonlocalattention.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. 
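+#
+# NOTE: NonLocalAttention below is a simple joint-to-joint attention: it computes an
+# affinity matrix between the J per-joint feature vectors (input^T @ input), softmaxes
+# it, re-weights the features with it and applies a 1x1 Conv1d (256 -> 256 by default),
+# returning a tensor in the same [N, C, J, 1] layout used elsewhere in the PARE head.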
+# +# Contact: ps-license@tuebingen.mpg.de + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class NonLocalAttention(nn.Module): + def __init__( + self, + in_channels=256, + out_channels=256, + ): + super(NonLocalAttention, self).__init__() + self.conv1x1 = nn.Conv1d(in_channels, out_channels, kernel_size=1) + + def forward(self, input): + ''' + input [N, Feats, J, 1] + output [N, Feats, J, 1] + ''' + batch_size, n_feats, n_joints, _ = input.shape + input = input.squeeze(-1) + + # Compute attention weights + attention = torch.matmul(input.transpose(2, 1), input) + norm_attention = F.softmax(attention, dim=-1) + + # Compute final dot product + out = torch.matmul(input, norm_attention) + out = self.conv1x1(out) + + out = out.unsqueeze(-1) # [N, F, J, 1] + return out + + +if __name__ == '__main__': + nla = NonLocalAttention() + + inp = torch.rand(32, 256, 24, 1) + + out = nla(inp) + print(out.shape) \ No newline at end of file diff --git a/myeasymocap/backbone/pare/layers/softargmax.py b/myeasymocap/backbone/pare/layers/softargmax.py new file mode 100644 index 0000000..e3ddf12 --- /dev/null +++ b/myeasymocap/backbone/pare/layers/softargmax.py @@ -0,0 +1,154 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. 
+# +# Contact: ps-license@tuebingen.mpg.de + +import torch +import torch.nn.functional as F + + +def _softmax(tensor, temperature, dim=-1): + return F.softmax(tensor * temperature, dim=dim) + + +def softargmax1d( + heatmaps, + temperature=None, + normalize_keypoints=True, +): + dtype, device = heatmaps.dtype, heatmaps.device + if temperature is None: + temperature = torch.tensor(1.0, dtype=dtype, device=device) + batch_size, num_channels, dim = heatmaps.shape + points = torch.arange(0, dim, device=device, dtype=dtype).reshape(1, 1, dim).expand(batch_size, -1, -1) + # y = torch.arange(0, height, device=device, dtype=dtype).reshape(1, 1, height, 1).expand(batch_size, -1, -1, width) + # Should be Bx2xHxW + + # points = torch.cat([x, y], dim=1) + normalized_heatmap = _softmax( + heatmaps.reshape(batch_size, num_channels, -1), + temperature=temperature.reshape(1, -1, 1), + dim=-1) + + # Should be BxJx2 + keypoints = (normalized_heatmap.reshape(batch_size, -1, dim) * points).sum(dim=-1) + + if normalize_keypoints: + # Normalize keypoints to [-1, 1] + keypoints = (keypoints / (dim - 1) * 2 - 1) + + return keypoints, normalized_heatmap.reshape( + batch_size, -1, dim) + + +def softargmax2d( + heatmaps, + temperature=None, + normalize_keypoints=True, +): + dtype, device = heatmaps.dtype, heatmaps.device + if temperature is None: + temperature = torch.tensor(1.0, dtype=dtype, device=device) + batch_size, num_channels, height, width = heatmaps.shape + x = torch.arange(0, width, device=device, dtype=dtype).reshape(1, 1, 1, width).expand(batch_size, -1, height, -1) + y = torch.arange(0, height, device=device, dtype=dtype).reshape(1, 1, height, 1).expand(batch_size, -1, -1, width) + # Should be Bx2xHxW + points = torch.cat([x, y], dim=1) + normalized_heatmap = _softmax( + heatmaps.reshape(batch_size, num_channels, -1), + temperature=temperature.reshape(1, -1, 1), + dim=-1) + + # Should be BxJx2 + keypoints = ( + normalized_heatmap.reshape(batch_size, -1, 1, height * width) * + points.reshape(batch_size, 1, 2, -1)).sum(dim=-1) + + if normalize_keypoints: + # Normalize keypoints to [-1, 1] + keypoints[:, :, 0] = (keypoints[:, :, 0] / (width - 1) * 2 - 1) + keypoints[:, :, 1] = (keypoints[:, :, 1] / (height - 1) * 2 - 1) + + return keypoints, normalized_heatmap.reshape( + batch_size, -1, height, width) + + +def softargmax3d( + heatmaps, + temperature=None, + normalize_keypoints=True, +): + dtype, device = heatmaps.dtype, heatmaps.device + if temperature is None: + temperature = torch.tensor(1.0, dtype=dtype, device=device) + batch_size, num_channels, height, width, depth = heatmaps.shape + x = torch.arange(0, width, device=device, dtype=dtype).reshape(1, 1, 1, width, 1).expand(batch_size, -1, height, -1, depth) + y = torch.arange(0, height, device=device, dtype=dtype).reshape(1, 1, height, 1, 1).expand(batch_size, -1, -1, width, depth) + z = torch.arange(0, depth, device=device, dtype=dtype).reshape(1, 1, 1, 1, depth).expand(batch_size, -1, height, width, -1) + # Should be Bx2xHxW + points = torch.cat([x, y, z], dim=1) + normalized_heatmap = _softmax( + heatmaps.reshape(batch_size, num_channels, -1), + temperature=temperature.reshape(1, -1, 1), + dim=-1) + + # Should be BxJx3 + keypoints = ( + normalized_heatmap.reshape(batch_size, -1, 1, height * width * depth) * + points.reshape(batch_size, 1, 3, -1)).sum(dim=-1) + + if normalize_keypoints: + # Normalize keypoints to [-1, 1] + keypoints[:, :, 0] = (keypoints[:, :, 0] / (width - 1) * 2 - 1) + keypoints[:, :, 1] = (keypoints[:, :, 1] / (height - 1) * 2 - 1) 
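+        # the depth (z) coordinate is rescaled to [-1, 1] the same way, using the depth dimension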
+ keypoints[:, :, 2] = (keypoints[:, :, 2] / (depth - 1) * 2 - 1) + + return keypoints, normalized_heatmap.reshape( + batch_size, -1, height, width, depth) + + +def get_heatmap_preds(batch_heatmaps, normalize_keypoints=True): + ''' + get predictions from score maps + heatmaps: numpy.ndarray([batch_size, num_joints, height, width]) + ''' + assert batch_heatmaps.ndim == 4, 'batch_images should be 4-ndim' + + batch_size = batch_heatmaps.shape[0] + num_joints = batch_heatmaps.shape[1] + height = batch_heatmaps.shape[2] + width = batch_heatmaps.shape[3] + heatmaps_reshaped = batch_heatmaps.reshape((batch_size, num_joints, -1)) + + maxvals, idx = torch.max(heatmaps_reshaped, 2) + + maxvals = maxvals.reshape((batch_size, num_joints, 1)) + idx = idx.reshape((batch_size, num_joints, 1)) + + preds = idx.repeat(1, 1, 2).float() + + preds[:, :, 0] = (preds[:, :, 0]) % width + preds[:, :, 1] = torch.floor((preds[:, :, 1]) / width) + + pred_mask = torch.gt(maxvals, 0.0).repeat(1, 1, 2) + pred_mask = pred_mask.float() + + preds *= pred_mask + + if normalize_keypoints: + # Normalize keypoints to [-1, 1] + preds[:, :, 0] = (preds[:, :, 0] / (width - 1) * 2 - 1) + preds[:, :, 1] = (preds[:, :, 1] / (height - 1) * 2 - 1) + + return preds, maxvals \ No newline at end of file diff --git a/myeasymocap/backbone/pare/pare.py b/myeasymocap/backbone/pare/pare.py new file mode 100644 index 0000000..2925e67 --- /dev/null +++ b/myeasymocap/backbone/pare/pare.py @@ -0,0 +1,262 @@ +import os +import torch +import torch.nn as nn +from .config import update_hparams +# from .head import PareHead, SMPLHead, SMPLCamHead +from .head import PareHead +from .backbone.utils import get_backbone_info +from .backbone.hrnet import hrnet_w32 +from os.path import join +from easymocap.multistage.torchgeometry import rotation_matrix_to_axis_angle +import cv2 + +def try_to_download(): + model_dir = os.path.join(os.path.dirname(__file__), '..', '..', 'models', 'pare') + cmd = 'wget https://www.dropbox.com/s/aeulffqzb3zmh8x/pare-github-data.zip' + os.system(cmd) + os.makedirs(model_dir, exist_ok=True) + cmd = 'unzip pare-github-data.zip -d {}'.format(model_dir) + os.system(cmd) + +CFG = 'models/pare/data/pare/checkpoints/pare_w_3dpw_config.yaml' +CKPT = 'models/pare/data/pare/checkpoints/pare_w_3dpw_checkpoint.ckpt' + +class PARE(nn.Module): + def __init__( + self, + num_joints=24, + softmax_temp=1.0, + num_features_smpl=64, + backbone='resnet50', + focal_length=5000., + img_res=224, + pretrained=None, + iterative_regression=False, + iter_residual=False, + num_iterations=3, + shape_input_type='feats', # 'feats.all_pose.shape.cam', + pose_input_type='feats', # 'feats.neighbor_pose_feats.all_pose.self_pose.neighbor_pose.shape.cam' + pose_mlp_num_layers=1, + shape_mlp_num_layers=1, + pose_mlp_hidden_size=256, + shape_mlp_hidden_size=256, + use_keypoint_features_for_smpl_regression=False, + use_heatmaps='', + use_keypoint_attention=False, + keypoint_attention_act='softmax', + use_postconv_keypoint_attention=False, + use_scale_keypoint_attention=False, + use_final_nonlocal=None, + use_branch_nonlocal=None, + use_hmr_regression=False, + use_coattention=False, + num_coattention_iter=1, + coattention_conv='simple', + deconv_conv_kernel_size=4, + use_upsampling=False, + use_soft_attention=False, + num_branch_iteration=0, + branch_deeper=False, + num_deconv_layers=3, + num_deconv_filters=256, + use_resnet_conv_hrnet=False, + use_position_encodings=None, + use_mean_camshape=False, + use_mean_pose=False, + init_xavier=False, + use_cam=False, + ): + 
super(PARE, self).__init__() + if backbone.startswith('hrnet'): + backbone, use_conv = backbone.split('-') + # hrnet_w32-conv, hrnet_w32-interp + self.backbone = eval(backbone)( + pretrained=True, + downsample=False, + use_conv=(use_conv == 'conv') + ) + else: + self.backbone = eval(backbone)(pretrained=True) + + # self.backbone = eval(backbone)(pretrained=True) + self.head = PareHead( + num_joints=num_joints, + num_input_features=get_backbone_info(backbone)['n_output_channels'], + softmax_temp=softmax_temp, + num_deconv_layers=num_deconv_layers, + num_deconv_filters=[num_deconv_filters] * num_deconv_layers, + num_deconv_kernels=[deconv_conv_kernel_size] * num_deconv_layers, + num_features_smpl=num_features_smpl, + final_conv_kernel=1, + iterative_regression=iterative_regression, + iter_residual=iter_residual, + num_iterations=num_iterations, + shape_input_type=shape_input_type, + pose_input_type=pose_input_type, + pose_mlp_num_layers=pose_mlp_num_layers, + shape_mlp_num_layers=shape_mlp_num_layers, + pose_mlp_hidden_size=pose_mlp_hidden_size, + shape_mlp_hidden_size=shape_mlp_hidden_size, + use_keypoint_features_for_smpl_regression=use_keypoint_features_for_smpl_regression, + use_heatmaps=use_heatmaps, + use_keypoint_attention=use_keypoint_attention, + use_postconv_keypoint_attention=use_postconv_keypoint_attention, + keypoint_attention_act=keypoint_attention_act, + use_scale_keypoint_attention=use_scale_keypoint_attention, + use_branch_nonlocal=use_branch_nonlocal, # 'concatenation', 'dot_product', 'embedded_gaussian', 'gaussian' + use_final_nonlocal=use_final_nonlocal, # 'concatenation', 'dot_product', 'embedded_gaussian', 'gaussian' + backbone=backbone, + use_hmr_regression=use_hmr_regression, + use_coattention=use_coattention, + num_coattention_iter=num_coattention_iter, + coattention_conv=coattention_conv, + use_upsampling=use_upsampling, + use_soft_attention=use_soft_attention, + num_branch_iteration=num_branch_iteration, + branch_deeper=branch_deeper, + use_resnet_conv_hrnet=use_resnet_conv_hrnet, + use_position_encodings=use_position_encodings, + use_mean_camshape=use_mean_camshape, + use_mean_pose=use_mean_pose, + init_xavier=init_xavier, + ) + + self.use_cam = use_cam + # if self.use_cam: + # self.smpl = SMPLCamHead( + # img_res=img_res, + # ) + # else: + # self.smpl = SMPLHead( + # focal_length=focal_length, + # img_res=img_res + # ) + + if pretrained is not None: + self.load_pretrained(pretrained) + + def forward( + self, + images, + gt_segm=None, + ): + features = self.backbone(images) + hmr_output = self.head(features, gt_segm=gt_segm) + rotmat = hmr_output['pred_pose'] + shape = hmr_output['pred_shape'] + rotmat_flat = rotmat.reshape(-1, 3, 3) + rvec_flat = rotation_matrix_to_axis_angle(rotmat_flat) + rvec = rvec_flat.reshape(*rotmat.shape[:-2], 3) + rvec = rvec.reshape(*rvec.shape[:-2], -1) + return { + 'Rh': rvec[..., :3], + 'Th': torch.zeros_like(rvec[..., :3]), + 'poses': rvec[..., 3:], + 'shapes': shape, + } + +from ..basetopdown import BaseTopDownModelCache +import pickle + +class NullSPIN: + def __init__(self, ckpt) -> None: + self.name = 'spin' + + def __call__(self, bbox, images, imgname): + from easymocap.mytools.reader import read_smpl + basename = os.path.basename(imgname) + cachename = join(self.output, self.name, basename.replace('.jpg', '.json')) + if os.path.exists(cachename): + params = read_smpl(cachename) + params = params[0] + params = {key:val[0] for key, val in params.items() if key != 'id'} + ret = { + 'params': params + } + return ret + else: + 
import ipdb; ipdb.set_trace() + +class MyPARE(BaseTopDownModelCache): + def __init__(self, ckpt) -> None: + super().__init__('pare', bbox_scale=1.1, res_input=224) + if not os.path.exists(CFG): + from ...io.model import try_to_download_SMPL + try_to_download_SMPL('models/pare') + self.model_cfg = update_hparams(CFG) + self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + self.model = self._build_model() + self._load_pretrained_model(CKPT) + self.model.eval() + self.model.to(self.device) + + def __call__(self, bbox, images, imgnames): + return super().__call__(bbox[0], images, imgnames) + + def _build_model(self): + # ========= Define PARE model ========= # + model_cfg = self.model_cfg + + if model_cfg.METHOD == 'pare': + model = PARE( + backbone=model_cfg.PARE.BACKBONE, + num_joints=model_cfg.PARE.NUM_JOINTS, + softmax_temp=model_cfg.PARE.SOFTMAX_TEMP, + num_features_smpl=model_cfg.PARE.NUM_FEATURES_SMPL, + focal_length=model_cfg.DATASET.FOCAL_LENGTH, + img_res=model_cfg.DATASET.IMG_RES, + pretrained=model_cfg.TRAINING.PRETRAINED, + iterative_regression=model_cfg.PARE.ITERATIVE_REGRESSION, + num_iterations=model_cfg.PARE.NUM_ITERATIONS, + iter_residual=model_cfg.PARE.ITER_RESIDUAL, + shape_input_type=model_cfg.PARE.SHAPE_INPUT_TYPE, + pose_input_type=model_cfg.PARE.POSE_INPUT_TYPE, + pose_mlp_num_layers=model_cfg.PARE.POSE_MLP_NUM_LAYERS, + shape_mlp_num_layers=model_cfg.PARE.SHAPE_MLP_NUM_LAYERS, + pose_mlp_hidden_size=model_cfg.PARE.POSE_MLP_HIDDEN_SIZE, + shape_mlp_hidden_size=model_cfg.PARE.SHAPE_MLP_HIDDEN_SIZE, + use_keypoint_features_for_smpl_regression=model_cfg.PARE.USE_KEYPOINT_FEATURES_FOR_SMPL_REGRESSION, + use_heatmaps=model_cfg.DATASET.USE_HEATMAPS, + use_keypoint_attention=model_cfg.PARE.USE_KEYPOINT_ATTENTION, + use_postconv_keypoint_attention=model_cfg.PARE.USE_POSTCONV_KEYPOINT_ATTENTION, + use_scale_keypoint_attention=model_cfg.PARE.USE_SCALE_KEYPOINT_ATTENTION, + keypoint_attention_act=model_cfg.PARE.KEYPOINT_ATTENTION_ACT, + use_final_nonlocal=model_cfg.PARE.USE_FINAL_NONLOCAL, + use_branch_nonlocal=model_cfg.PARE.USE_BRANCH_NONLOCAL, + use_hmr_regression=model_cfg.PARE.USE_HMR_REGRESSION, + use_coattention=model_cfg.PARE.USE_COATTENTION, + num_coattention_iter=model_cfg.PARE.NUM_COATTENTION_ITER, + coattention_conv=model_cfg.PARE.COATTENTION_CONV, + use_upsampling=model_cfg.PARE.USE_UPSAMPLING, + deconv_conv_kernel_size=model_cfg.PARE.DECONV_CONV_KERNEL_SIZE, + use_soft_attention=model_cfg.PARE.USE_SOFT_ATTENTION, + num_branch_iteration=model_cfg.PARE.NUM_BRANCH_ITERATION, + branch_deeper=model_cfg.PARE.BRANCH_DEEPER, + num_deconv_layers=model_cfg.PARE.NUM_DECONV_LAYERS, + num_deconv_filters=model_cfg.PARE.NUM_DECONV_FILTERS, + use_resnet_conv_hrnet=model_cfg.PARE.USE_RESNET_CONV_HRNET, + use_position_encodings=model_cfg.PARE.USE_POS_ENC, + use_mean_camshape=model_cfg.PARE.USE_MEAN_CAMSHAPE, + use_mean_pose=model_cfg.PARE.USE_MEAN_POSE, + init_xavier=model_cfg.PARE.INIT_XAVIER, + ).to(self.device) + else: + exit() + + return model + + def _load_pretrained_model(self, ckpt): + # ========= Load pretrained weights ========= # + state_dict = torch.load(ckpt, map_location='cpu')['state_dict'] + pretrained_keys = state_dict.keys() + new_state_dict = {} + for pk in pretrained_keys: + if pk.startswith('model.'): + new_state_dict[pk.replace('model.', '')] = state_dict[pk] + else: + new_state_dict[pk] = state_dict[pk] + + self.model.load_state_dict(new_state_dict, strict=False) + +if __name__ == '__main__': + pass diff --git 
a/myeasymocap/backbone/pare/utils/geometry.py b/myeasymocap/backbone/pare/utils/geometry.py new file mode 100644 index 0000000..eb5a1d5 --- /dev/null +++ b/myeasymocap/backbone/pare/utils/geometry.py @@ -0,0 +1,722 @@ +import torch +import numpy as np +from torch.nn import functional as F + +""" +Useful geometric operations, e.g. Perspective projection and a differentiable Rodrigues formula +Parts of the code are taken from https://github.com/MandyMo/pytorch_HMR +""" + + +def batch_rot2aa(Rs): + """ + Rs is B x 3 x 3 + void cMathUtil::RotMatToAxisAngle(const tMatrix& mat, tVector& out_axis, + double& out_theta) + { + double c = 0.5 * (mat(0, 0) + mat(1, 1) + mat(2, 2) - 1); + c = cMathUtil::Clamp(c, -1.0, 1.0); + + out_theta = std::acos(c); + + if (std::abs(out_theta) < 0.00001) + { + out_axis = tVector(0, 0, 1, 0); + } + else + { + double m21 = mat(2, 1) - mat(1, 2); + double m02 = mat(0, 2) - mat(2, 0); + double m10 = mat(1, 0) - mat(0, 1); + double denom = std::sqrt(m21 * m21 + m02 * m02 + m10 * m10); + out_axis[0] = m21 / denom; + out_axis[1] = m02 / denom; + out_axis[2] = m10 / denom; + out_axis[3] = 0; + } + } + """ + cos = 0.5 * (torch.stack([torch.trace(x) for x in Rs]) - 1) + cos = torch.clamp(cos, -1, 1) + + theta = torch.acos(cos) + + m21 = Rs[:, 2, 1] - Rs[:, 1, 2] + m02 = Rs[:, 0, 2] - Rs[:, 2, 0] + m10 = Rs[:, 1, 0] - Rs[:, 0, 1] + denom = torch.sqrt(m21 * m21 + m02 * m02 + m10 * m10) + + axis0 = torch.where(torch.abs(theta) < 0.00001, m21, m21 / denom) + axis1 = torch.where(torch.abs(theta) < 0.00001, m02, m02 / denom) + axis2 = torch.where(torch.abs(theta) < 0.00001, m10, m10 / denom) + + return theta.unsqueeze(1) * torch.stack([axis0, axis1, axis2], 1) + + +def batch_rodrigues(theta): + """Convert axis-angle representation to rotation matrix. + Args: + theta: size = [B, 3] + Returns: + Rotation matrix corresponding to the quaternion -- size = [B, 3, 3] + """ + l1norm = torch.norm(theta + 1e-8, p = 2, dim = 1) + angle = torch.unsqueeze(l1norm, -1) + normalized = torch.div(theta, angle) + angle = angle * 0.5 + v_cos = torch.cos(angle) + v_sin = torch.sin(angle) + quat = torch.cat([v_cos, v_sin * normalized], dim = 1) + return quat_to_rotmat(quat) + + +def quat_to_rotmat(quat): + """Convert quaternion coefficients to rotation matrix. + Args: + quat: size = [B, 4] 4 <===>(w, x, y, z) + Returns: + Rotation matrix corresponding to the quaternion -- size = [B, 3, 3] + """ + norm_quat = quat + norm_quat = norm_quat/norm_quat.norm(p=2, dim=1, keepdim=True) + w, x, y, z = norm_quat[:,0], norm_quat[:,1], norm_quat[:,2], norm_quat[:,3] + + B = quat.size(0) + + w2, x2, y2, z2 = w.pow(2), x.pow(2), y.pow(2), z.pow(2) + wx, wy, wz = w*x, w*y, w*z + xy, xz, yz = x*y, x*z, y*z + + rotMat = torch.stack([w2 + x2 - y2 - z2, 2*xy - 2*wz, 2*wy + 2*xz, + 2*wz + 2*xy, w2 - x2 + y2 - z2, 2*yz - 2*wx, + 2*xz - 2*wy, 2*wx + 2*yz, w2 - x2 - y2 + z2], dim=1).view(B, 3, 3) + return rotMat + + +def rot6d_to_rotmat(x): + """Convert 6D rotation representation to 3x3 rotation matrix. 
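+    Orthonormalizes the two 3D column vectors (Gram-Schmidt) and completes the frame with a cross product.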
+ Based on Zhou et al., "On the Continuity of Rotation Representations in Neural Networks", CVPR 2019 + Input: + (B,6) Batch of 6-D rotation representations + Output: + (B,3,3) Batch of corresponding rotation matrices + """ + x = x.reshape(-1,3,2) + a1 = x[:, :, 0] + a2 = x[:, :, 1] + b1 = F.normalize(a1) + b2 = F.normalize(a2 - torch.einsum('bi,bi->b', b1, a2).unsqueeze(-1) * b1) + b3 = torch.cross(b1, b2) + return torch.stack((b1, b2, b3), dim=-1) + + +def rotmat_to_rot6d(x): + rotmat = x.reshape(-1, 3, 3) + rot6d = rotmat[:, :, :2].reshape(x.shape[0], -1) + return rot6d + + +def rotation_matrix_to_angle_axis(rotation_matrix): + """ + This function is borrowed from https://github.com/kornia/kornia + + Convert 3x4 rotation matrix to Rodrigues vector + + Args: + rotation_matrix (Tensor): rotation matrix. + + Returns: + Tensor: Rodrigues vector transformation. + + Shape: + - Input: :math:`(N, 3, 4)` + - Output: :math:`(N, 3)` + + Example: + >>> input = torch.rand(2, 3, 4) # Nx4x4 + >>> output = tgm.rotation_matrix_to_angle_axis(input) # Nx3 + """ + if rotation_matrix.shape[1:] == (3,3): + rot_mat = rotation_matrix.reshape(-1, 3, 3) + hom = torch.tensor([0, 0, 1], dtype=torch.float32, + device=rotation_matrix.device).reshape(1, 3, 1).expand(rot_mat.shape[0], -1, -1) + rotation_matrix = torch.cat([rot_mat, hom], dim=-1) + + quaternion = rotation_matrix_to_quaternion(rotation_matrix) + aa = quaternion_to_angle_axis(quaternion) + aa[torch.isnan(aa)] = 0.0 + return aa + + +def quaternion_to_angle_axis(quaternion: torch.Tensor) -> torch.Tensor: + """ + This function is borrowed from https://github.com/kornia/kornia + + Convert quaternion vector to angle axis of rotation. + + Adapted from ceres C++ library: ceres-solver/include/ceres/rotation.h + + Args: + quaternion (torch.Tensor): tensor with quaternions. + + Return: + torch.Tensor: tensor with angle axis of rotation. + + Shape: + - Input: :math:`(*, 4)` where `*` means, any number of dimensions + - Output: :math:`(*, 3)` + + Example: + >>> quaternion = torch.rand(2, 4) # Nx4 + >>> angle_axis = tgm.quaternion_to_angle_axis(quaternion) # Nx3 + """ + if not torch.is_tensor(quaternion): + raise TypeError("Input type is not a torch.Tensor. Got {}".format( + type(quaternion))) + + if not quaternion.shape[-1] == 4: + raise ValueError("Input must be a tensor of shape Nx4 or 4. 
Got {}" + .format(quaternion.shape)) + # unpack input and compute conversion + q1: torch.Tensor = quaternion[..., 1] + q2: torch.Tensor = quaternion[..., 2] + q3: torch.Tensor = quaternion[..., 3] + sin_squared_theta: torch.Tensor = q1 * q1 + q2 * q2 + q3 * q3 + + sin_theta: torch.Tensor = torch.sqrt(sin_squared_theta) + cos_theta: torch.Tensor = quaternion[..., 0] + two_theta: torch.Tensor = 2.0 * torch.where( + cos_theta < 0.0, + torch.atan2(-sin_theta, -cos_theta), + torch.atan2(sin_theta, cos_theta)) + + k_pos: torch.Tensor = two_theta / sin_theta + k_neg: torch.Tensor = 2.0 * torch.ones_like(sin_theta) + k: torch.Tensor = torch.where(sin_squared_theta > 0.0, k_pos, k_neg) + + angle_axis: torch.Tensor = torch.zeros_like(quaternion)[..., :3] + angle_axis[..., 0] += q1 * k + angle_axis[..., 1] += q2 * k + angle_axis[..., 2] += q3 * k + return angle_axis + + +def rotation_matrix_to_quaternion(rotation_matrix, eps=1e-6): + """ + This function is borrowed from https://github.com/kornia/kornia + + Convert 3x4 rotation matrix to 4d quaternion vector + + This algorithm is based on algorithm described in + https://github.com/KieranWynn/pyquaternion/blob/master/pyquaternion/quaternion.py#L201 + + Args: + rotation_matrix (Tensor): the rotation matrix to convert. + + Return: + Tensor: the rotation in quaternion + + Shape: + - Input: :math:`(N, 3, 4)` + - Output: :math:`(N, 4)` + + Example: + >>> input = torch.rand(4, 3, 4) # Nx3x4 + >>> output = tgm.rotation_matrix_to_quaternion(input) # Nx4 + """ + if not torch.is_tensor(rotation_matrix): + raise TypeError("Input type is not a torch.Tensor. Got {}".format( + type(rotation_matrix))) + + if len(rotation_matrix.shape) > 3: + raise ValueError( + "Input size must be a three dimensional tensor. Got {}".format( + rotation_matrix.shape)) + if not rotation_matrix.shape[-2:] == (3, 4): + raise ValueError( + "Input size must be a N x 3 x 4 tensor. 
Got {}".format( + rotation_matrix.shape)) + + rmat_t = torch.transpose(rotation_matrix, 1, 2) + + mask_d2 = rmat_t[:, 2, 2] < eps + + mask_d0_d1 = rmat_t[:, 0, 0] > rmat_t[:, 1, 1] + mask_d0_nd1 = rmat_t[:, 0, 0] < -rmat_t[:, 1, 1] + + t0 = 1 + rmat_t[:, 0, 0] - rmat_t[:, 1, 1] - rmat_t[:, 2, 2] + q0 = torch.stack([rmat_t[:, 1, 2] - rmat_t[:, 2, 1], + t0, rmat_t[:, 0, 1] + rmat_t[:, 1, 0], + rmat_t[:, 2, 0] + rmat_t[:, 0, 2]], -1) + t0_rep = t0.repeat(4, 1).t() + + t1 = 1 - rmat_t[:, 0, 0] + rmat_t[:, 1, 1] - rmat_t[:, 2, 2] + q1 = torch.stack([rmat_t[:, 2, 0] - rmat_t[:, 0, 2], + rmat_t[:, 0, 1] + rmat_t[:, 1, 0], + t1, rmat_t[:, 1, 2] + rmat_t[:, 2, 1]], -1) + t1_rep = t1.repeat(4, 1).t() + + t2 = 1 - rmat_t[:, 0, 0] - rmat_t[:, 1, 1] + rmat_t[:, 2, 2] + q2 = torch.stack([rmat_t[:, 0, 1] - rmat_t[:, 1, 0], + rmat_t[:, 2, 0] + rmat_t[:, 0, 2], + rmat_t[:, 1, 2] + rmat_t[:, 2, 1], t2], -1) + t2_rep = t2.repeat(4, 1).t() + + t3 = 1 + rmat_t[:, 0, 0] + rmat_t[:, 1, 1] + rmat_t[:, 2, 2] + q3 = torch.stack([t3, rmat_t[:, 1, 2] - rmat_t[:, 2, 1], + rmat_t[:, 2, 0] - rmat_t[:, 0, 2], + rmat_t[:, 0, 1] - rmat_t[:, 1, 0]], -1) + t3_rep = t3.repeat(4, 1).t() + + mask_c0 = mask_d2 * mask_d0_d1 + mask_c1 = mask_d2 * ~mask_d0_d1 + mask_c2 = ~mask_d2 * mask_d0_nd1 + mask_c3 = ~mask_d2 * ~mask_d0_nd1 + mask_c0 = mask_c0.view(-1, 1).type_as(q0) + mask_c1 = mask_c1.view(-1, 1).type_as(q1) + mask_c2 = mask_c2.view(-1, 1).type_as(q2) + mask_c3 = mask_c3.view(-1, 1).type_as(q3) + + q = q0 * mask_c0 + q1 * mask_c1 + q2 * mask_c2 + q3 * mask_c3 + q /= torch.sqrt(t0_rep * mask_c0 + t1_rep * mask_c1 + # noqa + t2_rep * mask_c2 + t3_rep * mask_c3) # noqa + q *= 0.5 + return q + +def convert_perspective_to_weak_perspective( + perspective_camera, + focal_length=5000., + img_res=224, +): + # Convert Weak Perspective Camera [s, tx, ty] to camera translation [tx, ty, tz] + # in 3D given the bounding box size + # This camera translation can be used in a full perspective projection + # if isinstance(focal_length, torch.Tensor): + # focal_length = focal_length[:, 0] + + weak_perspective_camera = torch.stack( + [ + + 2 * focal_length / (img_res * perspective_camera[:, 2] + 1e-9), + perspective_camera[:, 0], + perspective_camera[:, 1], + ], + dim=-1 + ) + return weak_perspective_camera + + +def convert_weak_perspective_to_perspective( + weak_perspective_camera, + focal_length=5000., + img_res=224, +): + # Convert Weak Perspective Camera [s, tx, ty] to camera translation [tx, ty, tz] + # in 3D given the bounding box size + # This camera translation can be used in a full perspective projection + # if isinstance(focal_length, torch.Tensor): + # focal_length = focal_length[:, 0] + + perspective_camera = torch.stack( + [ + weak_perspective_camera[:, 1], + weak_perspective_camera[:, 2], + 2 * focal_length / (img_res * weak_perspective_camera[:, 0] + 1e-9) + ], + dim=-1 + ) + return perspective_camera + + +def perspective_projection(points, rotation, translation, + focal_length, camera_center): + """ + This function computes the perspective projection of a set of points. + Input: + points (bs, N, 3): 3D points + rotation (bs, 3, 3): Camera rotation + translation (bs, 3): Camera translation + focal_length (bs,) or scalar: Focal length + camera_center (bs, 2): Camera center + """ + batch_size = points.shape[0] + K = torch.zeros([batch_size, 3, 3], device=points.device) + K[:,0,0] = focal_length + K[:,1,1] = focal_length + K[:,2,2] = 1. 
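+    # the principal point (camera center) fills the last column of the first two rows of K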
+ K[:,:-1, -1] = camera_center + + # Transform points + points = torch.einsum('bij,bkj->bki', rotation, points) + points = points + translation.unsqueeze(1) + + # Apply perspective distortion + projected_points = points / points[:,:,-1].unsqueeze(-1) + + # Apply camera intrinsics + projected_points = torch.einsum('bij,bkj->bki', K, projected_points) + + return projected_points[:, :, :-1] + + +def weak_perspective_projection(points, rotation, weak_cam_params, focal_length, camera_center, img_res): + """ + This function computes the perspective projection of a set of points. + Input: + points (bs, N, 3): 3D points + rotation (bs, 3, 3): Camera rotation + translation (bs, 3): Camera translation + focal_length (bs,) or scalar: Focal length + camera_center (bs, 2): Camera center + """ + batch_size = points.shape[0] + K = torch.zeros([batch_size, 3, 3], device=points.device) + K[:,0,0] = focal_length + K[:,1,1] = focal_length + K[:,2,2] = 1. + K[:,:-1, -1] = camera_center + + translation = convert_weak_perspective_to_perspective(weak_cam_params, focal_length, img_res) + + # Transform points + points = torch.einsum('bij,bkj->bki', rotation, points) + points = points + translation.unsqueeze(1) + + # Apply perspective distortion + projected_points = points / points[:,:,-1].unsqueeze(-1) + + # Apply camera intrinsics + projected_points = torch.einsum('bij,bkj->bki', K, projected_points) + + return projected_points[:, :, :-1] + + +def estimate_translation_np(S, joints_2d, joints_conf, focal_length=5000., img_size=224.): + """Find camera translation that brings 3D joints S closest to 2D the corresponding joints_2d. + Input: + S: (25, 3) 3D joint locations + joints: (25, 3) 2D joint locations and confidence + Returns: + (3,) camera translation vector + """ + + num_joints = S.shape[0] + # focal length + f = np.array([focal_length,focal_length]) + # optical center + center = np.array([img_size/2., img_size/2.]) + + # transformations + Z = np.reshape(np.tile(S[:,2],(2,1)).T,-1) + XY = np.reshape(S[:,0:2],-1) + O = np.tile(center,num_joints) + F = np.tile(f,num_joints) + weight2 = np.reshape(np.tile(np.sqrt(joints_conf),(2,1)).T,-1) + + # least squares + Q = np.array([F*np.tile(np.array([1,0]),num_joints), F*np.tile(np.array([0,1]),num_joints), O-np.reshape(joints_2d,-1)]).T + c = (np.reshape(joints_2d,-1)-O)*Z - F*XY + + # weighted least squares + W = np.diagflat(weight2) + Q = np.dot(W,Q) + c = np.dot(W,c) + + # square matrix + A = np.dot(Q.T,Q) + b = np.dot(Q.T,c) + + # solution + trans = np.linalg.solve(A, b) + + return trans + + +def estimate_translation(S, joints_2d, focal_length=5000., img_size=224., use_all_joints=False, rotation=None): + """Find camera translation that brings 3D joints S closest to 2D the corresponding joints_2d. 
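+    The translation is solved per batch element with a confidence-weighted linear least-squares fit.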
+ Input: + S: (B, 49, 3) 3D joint locations + joints: (B, 49, 3) 2D joint locations and confidence + Returns: + (B, 3) camera translation vectors + """ + + device = S.device + + if rotation is not None: + S = torch.einsum('bij,bkj->bki', rotation, S) + + # Use only joints 25:49 (GT joints) + if use_all_joints: + S = S.cpu().numpy() + joints_2d = joints_2d.cpu().numpy() + else: + S = S[:, 25:, :].cpu().numpy() + joints_2d = joints_2d[:, 25:, :].cpu().numpy() + + joints_conf = joints_2d[:, :, -1] + joints_2d = joints_2d[:, :, :-1] + trans = np.zeros((S.shape[0], 3), dtype=np.float32) + # Find the translation for each example in the batch + for i in range(S.shape[0]): + S_i = S[i] + joints_i = joints_2d[i] + conf_i = joints_conf[i] + trans[i] = estimate_translation_np(S_i, joints_i, conf_i, focal_length=focal_length, img_size=img_size) + return torch.from_numpy(trans).to(device) + + +def estimate_translation_cam(S, joints_2d, focal_length=(5000., 5000.), img_size=(224., 224.), + use_all_joints=False, rotation=None): + """Find camera translation that brings 3D joints S closest to 2D the corresponding joints_2d. + Input: + S: (B, 49, 3) 3D joint locations + joints: (B, 49, 3) 2D joint locations and confidence + Returns: + (B, 3) camera translation vectors + """ + + def estimate_translation_np(S, joints_2d, joints_conf, focal_length=(5000., 5000.), img_size=(224., 224.)): + """Find camera translation that brings 3D joints S closest to 2D the corresponding joints_2d. + Input: + S: (25, 3) 3D joint locations + joints: (25, 3) 2D joint locations and confidence + Returns: + (3,) camera translation vector + """ + + num_joints = S.shape[0] + # focal length + f = np.array([focal_length[0], focal_length[1]]) + # optical center + center = np.array([img_size[0] / 2., img_size[1] / 2.]) + + # transformations + Z = np.reshape(np.tile(S[:, 2], (2, 1)).T, -1) + XY = np.reshape(S[:, 0:2], -1) + O = np.tile(center, num_joints) + F = np.tile(f, num_joints) + weight2 = np.reshape(np.tile(np.sqrt(joints_conf), (2, 1)).T, -1) + + # least squares + Q = np.array([F * np.tile(np.array([1, 0]), num_joints), F * np.tile(np.array([0, 1]), num_joints), + O - np.reshape(joints_2d, -1)]).T + c = (np.reshape(joints_2d, -1) - O) * Z - F * XY + + # weighted least squares + W = np.diagflat(weight2) + Q = np.dot(W, Q) + c = np.dot(W, c) + + # square matrix + A = np.dot(Q.T, Q) + b = np.dot(Q.T, c) + + # solution + trans = np.linalg.solve(A, b) + + return trans + + device = S.device + + if rotation is not None: + S = torch.einsum('bij,bkj->bki', rotation, S) + + # Use only joints 25:49 (GT joints) + if use_all_joints: + S = S.cpu().numpy() + joints_2d = joints_2d.cpu().numpy() + else: + S = S[:, 25:, :].cpu().numpy() + joints_2d = joints_2d[:, 25:, :].cpu().numpy() + + joints_conf = joints_2d[:, :, -1] + joints_2d = joints_2d[:, :, :-1] + trans = np.zeros((S.shape[0], 3), dtype=np.float32) + # Find the translation for each example in the batch + for i in range(S.shape[0]): + S_i = S[i] + joints_i = joints_2d[i] + conf_i = joints_conf[i] + trans[i] = estimate_translation_np(S_i, joints_i, conf_i, focal_length=focal_length, img_size=img_size) + return torch.from_numpy(trans).to(device) + + +def get_coord_maps(size=56): + xx_ones = torch.ones([1, size], dtype=torch.int32) + xx_ones = xx_ones.unsqueeze(-1) + + xx_range = torch.arange(size, dtype=torch.int32).unsqueeze(0) + xx_range = xx_range.unsqueeze(1) + + xx_channel = torch.matmul(xx_ones, xx_range) + xx_channel = xx_channel.unsqueeze(-1) + + yy_ones = torch.ones([1, size], 
dtype=torch.int32) + yy_ones = yy_ones.unsqueeze(1) + + yy_range = torch.arange(size, dtype=torch.int32).unsqueeze(0) + yy_range = yy_range.unsqueeze(-1) + + yy_channel = torch.matmul(yy_range, yy_ones) + yy_channel = yy_channel.unsqueeze(-1) + + xx_channel = xx_channel.permute(0, 3, 1, 2) + yy_channel = yy_channel.permute(0, 3, 1, 2) + + xx_channel = xx_channel.float() / (size - 1) + yy_channel = yy_channel.float() / (size - 1) + + xx_channel = xx_channel * 2 - 1 + yy_channel = yy_channel * 2 - 1 + + out = torch.cat([xx_channel, yy_channel], dim=1) + return out + + +def look_at(eye, at=np.array([0, 0, 0]), up=np.array([0, 0, 1]), eps=1e-5): + at = at.astype(float).reshape(1, 3) + up = up.astype(float).reshape(1, 3) + + eye = eye.reshape(-1, 3) + up = up.repeat(eye.shape[0] // up.shape[0], axis=0) + eps = np.array([eps]).reshape(1, 1).repeat(up.shape[0], axis=0) + + z_axis = eye - at + z_axis /= np.max(np.stack([np.linalg.norm(z_axis, axis=1, keepdims=True), eps])) + + x_axis = np.cross(up, z_axis) + x_axis /= np.max(np.stack([np.linalg.norm(x_axis, axis=1, keepdims=True), eps])) + + y_axis = np.cross(z_axis, x_axis) + y_axis /= np.max(np.stack([np.linalg.norm(y_axis, axis=1, keepdims=True), eps])) + + r_mat = np.concatenate((x_axis.reshape(-1, 3, 1), y_axis.reshape(-1, 3, 1), z_axis.reshape(-1, 3, 1)), axis=2) + + return r_mat + + +def to_sphere(u, v): + theta = 2 * np.pi * u + phi = np.arccos(1 - 2 * v) + cx = np.sin(phi) * np.cos(theta) + cy = np.sin(phi) * np.sin(theta) + cz = np.cos(phi) + s = np.stack([cx, cy, cz]) + return s + + +def sample_on_sphere(range_u=(0, 1), range_v=(0, 1)): + u = np.random.uniform(*range_u) + v = np.random.uniform(*range_v) + return to_sphere(u, v) + + +def sample_pose_on_sphere(range_v=(0,1), range_u=(0,1), radius=1, up=[0,1,0]): + # sample location on unit sphere + loc = sample_on_sphere(range_u, range_v) + + # sample radius if necessary + if isinstance(radius, tuple): + radius = np.random.uniform(*radius) + + loc = loc * radius + R = look_at(loc, up=np.array(up))[0] + + RT = np.concatenate([R, loc.reshape(3, 1)], axis=1) + RT = torch.Tensor(RT.astype(np.float32)) + return RT + + +def rectify_pose(camera_r, body_aa, rotate_x=False): + body_r = batch_rodrigues(body_aa).reshape(-1,3,3) + + if rotate_x: + rotate_x = torch.tensor([[[1.0, 0.0, 0.0], [0.0, -1.0, 0.0], [0.0, 0.0, -1.0]]]) + body_r = body_r @ rotate_x + + final_r = camera_r @ body_r + body_aa = batch_rot2aa(final_r) + return body_aa + + +def batch_euler2matrix(r): + return quaternion_to_rotation_matrix(euler_to_quaternion(r)) + + +def euler_to_quaternion(r): + x = r[..., 0] + y = r[..., 1] + z = r[..., 2] + + z = z/2.0 + y = y/2.0 + x = x/2.0 + cz = torch.cos(z) + sz = torch.sin(z) + cy = torch.cos(y) + sy = torch.sin(y) + cx = torch.cos(x) + sx = torch.sin(x) + quaternion = torch.zeros_like(r.repeat(1,2))[..., :4].to(r.device) + quaternion[..., 0] += cx*cy*cz - sx*sy*sz + quaternion[..., 1] += cx*sy*sz + cy*cz*sx + quaternion[..., 2] += cx*cz*sy - sx*cy*sz + quaternion[..., 3] += cx*cy*sz + sx*cz*sy + return quaternion + + +def quaternion_to_rotation_matrix(quat): + """Convert quaternion coefficients to rotation matrix. 
+ Args: + quat: size = [B, 4] 4 <===>(w, x, y, z) + Returns: + Rotation matrix corresponding to the quaternion -- size = [B, 3, 3] + """ + norm_quat = quat + norm_quat = norm_quat / norm_quat.norm(p=2, dim=1, keepdim=True) + w, x, y, z = norm_quat[:, 0], norm_quat[:, 1], norm_quat[:, 2], norm_quat[:, 3] + + B = quat.size(0) + + w2, x2, y2, z2 = w.pow(2), x.pow(2), y.pow(2), z.pow(2) + wx, wy, wz = w * x, w * y, w * z + xy, xz, yz = x * y, x * z, y * z + + rotMat = torch.stack([w2 + x2 - y2 - z2, 2 * xy - 2 * wz, 2 * wy + 2 * xz, + 2 * wz + 2 * xy, w2 - x2 + y2 - z2, 2 * yz - 2 * wx, + 2 * xz - 2 * wy, 2 * wx + 2 * yz, w2 - x2 - y2 + z2], dim=1).view(B, 3, 3) + return rotMat + +def euler_angles_from_rotmat(R): + """ + computer euler angles for rotation around x, y, z axis + from rotation amtrix + R: 4x4 rotation matrix + https://www.gregslabaugh.net/publications/euler.pdf + """ + r21 = np.round(R[:, 2, 0].item(), 4) + if abs(r21) != 1: + y_angle1 = -1 * torch.asin(R[:, 2, 0]) + y_angle2 = math.pi + torch.asin(R[:, 2, 0]) + cy1, cy2 = torch.cos(y_angle1), torch.cos(y_angle2) + + x_angle1 = torch.atan2(R[:, 2, 1] / cy1, R[:, 2, 2] / cy1) + x_angle2 = torch.atan2(R[:, 2, 1] / cy2, R[:, 2, 2] / cy2) + z_angle1 = torch.atan2(R[:, 1, 0] / cy1, R[:, 0, 0] / cy1) + z_angle2 = torch.atan2(R[:, 1, 0] / cy2, R[:, 0, 0] / cy2) + + s1 = (x_angle1, y_angle1, z_angle1) + s2 = (x_angle2, y_angle2, z_angle2) + s = (s1, s2) + + else: + z_angle = torch.tensor([0], device=R.device).float() + if r21 == -1: + y_angle = torch.tensor([math.pi / 2], device=R.device).float() + x_angle = z_angle + torch.atan2(R[:, 0, 1], R[:, 0, 2]) + else: + y_angle = -torch.tensor([math.pi / 2], device=R.device).float() + x_angle = -z_angle + torch.atan2(-R[:, 0, 1], R[:, 0, 2]) + s = ((x_angle, y_angle, z_angle),) + return s \ No newline at end of file diff --git a/myeasymocap/backbone/pare/utils/kp_utils.py b/myeasymocap/backbone/pare/utils/kp_utils.py new file mode 100644 index 0000000..e3246d0 --- /dev/null +++ b/myeasymocap/backbone/pare/utils/kp_utils.py @@ -0,0 +1,1130 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. +# +# Contact: ps-license@tuebingen.mpg.de + +import numpy as np + + +def keypoint_hflip(kp, img_width): + # Flip a keypoint horizontally around the y-axis + # kp N,2 + if len(kp.shape) == 2: + kp[:,0] = (img_width - 1.) - kp[:,0] + elif len(kp.shape) == 3: + kp[:, :, 0] = (img_width - 1.) 
- kp[:, :, 0] + return kp + + +def convert_kps(joints2d, src, dst): + src_names = eval(f'get_{src}_joint_names')() + dst_names = eval(f'get_{dst}_joint_names')() + + out_joints2d = np.zeros((joints2d.shape[0], len(dst_names), joints2d.shape[-1])) + + for idx, jn in enumerate(dst_names): + if jn in src_names: + out_joints2d[:, idx] = joints2d[:, src_names.index(jn)] + + return out_joints2d + + +def get_perm_idxs(src, dst): + src_names = eval(f'get_{src}_joint_names')() + dst_names = eval(f'get_{dst}_joint_names')() + idxs = [src_names.index(h) for h in dst_names if h in src_names] + return idxs + + +def get_mpii3d_test_joint_names(): + return [ + 'headtop', # 'head_top', + 'neck', + 'rshoulder',# 'right_shoulder', + 'relbow',# 'right_elbow', + 'rwrist',# 'right_wrist', + 'lshoulder',# 'left_shoulder', + 'lelbow', # 'left_elbow', + 'lwrist', # 'left_wrist', + 'rhip', # 'right_hip', + 'rknee', # 'right_knee', + 'rankle',# 'right_ankle', + 'lhip',# 'left_hip', + 'lknee',# 'left_knee', + 'lankle',# 'left_ankle' + 'hip',# 'pelvis', + 'Spine (H36M)',# 'spine', + 'Head (H36M)',# 'head' + ] + + +def get_mpii3d_joint_names(): + return [ + 'spine3', # 0, + 'spine4', # 1, + 'spine2', # 2, + 'Spine (H36M)', #'spine', # 3, + 'hip', # 'pelvis', # 4, + 'neck', # 5, + 'Head (H36M)', # 'head', # 6, + "headtop", # 'head_top', # 7, + 'left_clavicle', # 8, + "lshoulder", # 'left_shoulder', # 9, + "lelbow", # 'left_elbow',# 10, + "lwrist", # 'left_wrist',# 11, + 'left_hand',# 12, + 'right_clavicle',# 13, + 'rshoulder',# 'right_shoulder',# 14, + 'relbow',# 'right_elbow',# 15, + 'rwrist',# 'right_wrist',# 16, + 'right_hand',# 17, + 'lhip', # left_hip',# 18, + 'lknee', # 'left_knee',# 19, + 'lankle', #left ankle # 20 + 'left_foot', # 21 + 'left_toe', # 22 + "rhip", # 'right_hip',# 23 + "rknee", # 'right_knee',# 24 + "rankle", #'right_ankle', # 25 + 'right_foot',# 26 + 'right_toe' # 27 + ] + + +# def get_insta_joint_names(): +# return [ +# 'rheel' , # 0 +# 'rknee' , # 1 +# 'rhip' , # 2 +# 'lhip' , # 3 +# 'lknee' , # 4 +# 'lheel' , # 5 +# 'rwrist' , # 6 +# 'relbow' , # 7 +# 'rshoulder' , # 8 +# 'lshoulder' , # 9 +# 'lelbow' , # 10 +# 'lwrist' , # 11 +# 'neck' , # 12 +# 'headtop' , # 13 +# 'nose' , # 14 +# 'leye' , # 15 +# 'reye' , # 16 +# 'lear' , # 17 +# 'rear' , # 18 +# 'lbigtoe' , # 19 +# 'rbigtoe' , # 20 +# 'lsmalltoe' , # 21 +# 'rsmalltoe' , # 22 +# 'lankle' , # 23 +# 'rankle' , # 24 +# ] + + +def get_insta_joint_names(): + return [ + 'OP RHeel', + 'OP RKnee', + 'OP RHip', + 'OP LHip', + 'OP LKnee', + 'OP LHeel', + 'OP RWrist', + 'OP RElbow', + 'OP RShoulder', + 'OP LShoulder', + 'OP LElbow', + 'OP LWrist', + 'OP Neck', + 'headtop', + 'OP Nose', + 'OP LEye', + 'OP REye', + 'OP LEar', + 'OP REar', + 'OP LBigToe', + 'OP RBigToe', + 'OP LSmallToe', + 'OP RSmallToe', + 'OP LAnkle', + 'OP RAnkle', + ] + + +def get_mmpose_joint_names(): + # this naming is for the first 23 joints of MMPose + # does not include hands and face + return [ + 'OP Nose', # 1 + 'OP LEye', # 2 + 'OP REye', # 3 + 'OP LEar', # 4 + 'OP REar', # 5 + 'OP LShoulder', # 6 + 'OP RShoulder', # 7 + 'OP LElbow', # 8 + 'OP RElbow', # 9 + 'OP LWrist', # 10 + 'OP RWrist', # 11 + 'OP LHip', # 12 + 'OP RHip', # 13 + 'OP LKnee', # 14 + 'OP RKnee', # 15 + 'OP LAnkle', # 16 + 'OP RAnkle', # 17 + 'OP LBigToe', # 18 + 'OP LSmallToe', # 19 + 'OP LHeel', # 20 + 'OP RBigToe', # 21 + 'OP RSmallToe', # 22 + 'OP RHeel', # 23 + ] + + +def get_insta_skeleton(): + return np.array( + [ + [0 , 1], + [1 , 2], + [2 , 3], + [3 , 4], + [4 , 5], + [6 , 7], + [7 , 8], + [8 , 9], 
+ [9 ,10], + [2 , 8], + [3 , 9], + [10,11], + [8 ,12], + [9 ,12], + [12,13], + [12,14], + [14,15], + [14,16], + [15,17], + [16,18], + [0 ,20], + [20,22], + [5 ,19], + [19,21], + [5 ,23], + [0 ,24], + ]) + + +def get_staf_skeleton(): + return np.array( + [ + [0, 1], + [1, 2], + [2, 3], + [3, 4], + [1, 5], + [5, 6], + [6, 7], + [1, 8], + [8, 9], + [9, 10], + [10, 11], + [8, 12], + [12, 13], + [13, 14], + [0, 15], + [0, 16], + [15, 17], + [16, 18], + [2, 9], + [5, 12], + [1, 19], + [20, 19], + ] + ) + + +def get_staf_joint_names(): + return [ + 'OP Nose', # 0, + 'OP Neck', # 1, + 'OP RShoulder', # 2, + 'OP RElbow', # 3, + 'OP RWrist', # 4, + 'OP LShoulder', # 5, + 'OP LElbow', # 6, + 'OP LWrist', # 7, + 'OP MidHip', # 8, + 'OP RHip', # 9, + 'OP RKnee', # 10, + 'OP RAnkle', # 11, + 'OP LHip', # 12, + 'OP LKnee', # 13, + 'OP LAnkle', # 14, + 'OP REye', # 15, + 'OP LEye', # 16, + 'OP REar', # 17, + 'OP LEar', # 18, + 'Neck (LSP)', # 19, + 'Top of Head (LSP)', # 20, + ] + + +def get_spin_op_joint_names(): + return [ + 'OP Nose', # 0 + 'OP Neck', # 1 + 'OP RShoulder', # 2 + 'OP RElbow', # 3 + 'OP RWrist', # 4 + 'OP LShoulder', # 5 + 'OP LElbow', # 6 + 'OP LWrist', # 7 + 'OP MidHip', # 8 + 'OP RHip', # 9 + 'OP RKnee', # 10 + 'OP RAnkle', # 11 + 'OP LHip', # 12 + 'OP LKnee', # 13 + 'OP LAnkle', # 14 + 'OP REye', # 15 + 'OP LEye', # 16 + 'OP REar', # 17 + 'OP LEar', # 18 + 'OP LBigToe', # 19 + 'OP LSmallToe', # 20 + 'OP LHeel', # 21 + 'OP RBigToe', # 22 + 'OP RSmallToe', # 23 + 'OP RHeel', # 24 + ] + + +def get_openpose_joint_names(): + return [ + 'OP Nose', # 0 + 'OP Neck', # 1 + 'OP RShoulder', # 2 + 'OP RElbow', # 3 + 'OP RWrist', # 4 + 'OP LShoulder', # 5 + 'OP LElbow', # 6 + 'OP LWrist', # 7 + 'OP MidHip', # 8 + 'OP RHip', # 9 + 'OP RKnee', # 10 + 'OP RAnkle', # 11 + 'OP LHip', # 12 + 'OP LKnee', # 13 + 'OP LAnkle', # 14 + 'OP REye', # 15 + 'OP LEye', # 16 + 'OP REar', # 17 + 'OP LEar', # 18 + 'OP LBigToe', # 19 + 'OP LSmallToe', # 20 + 'OP LHeel', # 21 + 'OP RBigToe', # 22 + 'OP RSmallToe', # 23 + 'OP RHeel', # 24 + ] + + +def get_spin_joint_names(): + return [ + 'OP Nose', # 0 + 'OP Neck', # 1 + 'OP RShoulder', # 2 + 'OP RElbow', # 3 + 'OP RWrist', # 4 + 'OP LShoulder', # 5 + 'OP LElbow', # 6 + 'OP LWrist', # 7 + 'OP MidHip', # 8 + 'OP RHip', # 9 + 'OP RKnee', # 10 + 'OP RAnkle', # 11 + 'OP LHip', # 12 + 'OP LKnee', # 13 + 'OP LAnkle', # 14 + 'OP REye', # 15 + 'OP LEye', # 16 + 'OP REar', # 17 + 'OP LEar', # 18 + 'OP LBigToe', # 19 + 'OP LSmallToe', # 20 + 'OP LHeel', # 21 + 'OP RBigToe', # 22 + 'OP RSmallToe', # 23 + 'OP RHeel', # 24 + 'rankle', # 25 + 'rknee', # 26 + 'rhip', # 27 + 'lhip', # 28 + 'lknee', # 29 + 'lankle', # 30 + 'rwrist', # 31 + 'relbow', # 32 + 'rshoulder', # 33 + 'lshoulder', # 34 + 'lelbow', # 35 + 'lwrist', # 36 + 'neck', # 37 + 'headtop', # 38 + 'hip', # 39 'Pelvis (MPII)', # 39 + 'thorax', # 40 'Thorax (MPII)', # 40 + 'Spine (H36M)', # 41 + 'Jaw (H36M)', # 42 + 'Head (H36M)', # 43 + 'nose', # 44 + 'leye', # 45 'Left Eye', # 45 + 'reye', # 46 'Right Eye', # 46 + 'lear', # 47 'Left Ear', # 47 + 'rear', # 48 'Right Ear', # 48 + ] + +def get_muco3dhp_joint_names(): + return [ + 'headtop', + 'thorax', + 'rshoulder', + 'relbow', + 'rwrist', + 'lshoulder', + 'lelbow', + 'lwrist', + 'rhip', + 'rknee', + 'rankle', + 'lhip', + 'lknee', + 'lankle', + 'hip', + 'Spine (H36M)', + 'Head (H36M)', + 'R_Hand', + 'L_Hand', + 'R_Toe', + 'L_Toe' + ] + +def get_h36m_joint_names(): + return [ + 'hip', # 0 + 'lhip', # 1 + 'lknee', # 2 + 'lankle', # 3 + 'rhip', # 4 + 'rknee', # 5 + 'rankle', # 
6 + 'Spine (H36M)', # 7 + 'neck', # 8 + 'Head (H36M)', # 9 + 'headtop', # 10 + 'lshoulder', # 11 + 'lelbow', # 12 + 'lwrist', # 13 + 'rshoulder', # 14 + 'relbow', # 15 + 'rwrist', # 16 + ] + + +def get_spin_skeleton(): + return np.array( + [ + [0 , 1], + [1 , 2], + [2 , 3], + [3 , 4], + [1 , 5], + [5 , 6], + [6 , 7], + [1 , 8], + [8 , 9], + [9 ,10], + [10,11], + [8 ,12], + [12,13], + [13,14], + [0 ,15], + [0 ,16], + [15,17], + [16,18], + [21,19], + [19,20], + [14,21], + [11,24], + [24,22], + [22,23], + [0 ,38], + ] + ) + + +def get_openpose_skeleton(): + return np.array( + [ + [0 , 1], + [1 , 2], + [2 , 3], + [3 , 4], + [1 , 5], + [5 , 6], + [6 , 7], + [1 , 8], + [8 , 9], + [9 ,10], + [10,11], + [8 ,12], + [12,13], + [13,14], + [0 ,15], + [0 ,16], + [15,17], + [16,18], + [21,19], + [19,20], + [14,21], + [11,24], + [24,22], + [22,23], + ] + ) + + +def get_posetrack_joint_names(): + return [ + "nose", + "neck", + "headtop", + "lear", + "rear", + "lshoulder", + "rshoulder", + "lelbow", + "relbow", + "lwrist", + "rwrist", + "lhip", + "rhip", + "lknee", + "rknee", + "lankle", + "rankle" + ] + + +def get_posetrack_original_kp_names(): + return [ + 'nose', + 'head_bottom', + 'head_top', + 'left_ear', + 'right_ear', + 'left_shoulder', + 'right_shoulder', + 'left_elbow', + 'right_elbow', + 'left_wrist', + 'right_wrist', + 'left_hip', + 'right_hip', + 'left_knee', + 'right_knee', + 'left_ankle', + 'right_ankle' + ] + + +def get_pennaction_joint_names(): + return [ + "headtop", # 0 + "lshoulder", # 1 + "rshoulder", # 2 + "lelbow", # 3 + "relbow", # 4 + "lwrist", # 5 + "rwrist", # 6 + "lhip" , # 7 + "rhip" , # 8 + "lknee", # 9 + "rknee" , # 10 + "lankle", # 11 + "rankle" # 12 + ] + + +def get_common_joint_names(): + return [ + "rankle", # 0 "lankle", # 0 + "rknee", # 1 "lknee", # 1 + "rhip", # 2 "lhip", # 2 + "lhip", # 3 "rhip", # 3 + "lknee", # 4 "rknee", # 4 + "lankle", # 5 "rankle", # 5 + "rwrist", # 6 "lwrist", # 6 + "relbow", # 7 "lelbow", # 7 + "rshoulder", # 8 "lshoulder", # 8 + "lshoulder", # 9 "rshoulder", # 9 + "lelbow", # 10 "relbow", # 10 + "lwrist", # 11 "rwrist", # 11 + "neck", # 12 "neck", # 12 + "headtop", # 13 "headtop", # 13 + ] + + +def get_common_paper_joint_names(): + return [ + "Right Ankle", # 0 "lankle", # 0 + "Right Knee", # 1 "lknee", # 1 + "Right Hip", # 2 "lhip", # 2 + "Left Hip", # 3 "rhip", # 3 + "Left Knee", # 4 "rknee", # 4 + "Left Ankle", # 5 "rankle", # 5 + "Right Wrist", # 6 "lwrist", # 6 + "Right Elbow", # 7 "lelbow", # 7 + "Right Shoulder", # 8 "lshoulder", # 8 + "Left Shoulder", # 9 "rshoulder", # 9 + "Left Elbow", # 10 "relbow", # 10 + "Left Wrist", # 11 "rwrist", # 11 + "Neck", # 12 "neck", # 12 + "Head", # 13 "headtop", # 13 + ] + + +def get_common_skeleton(): + return np.array( + [ + [ 0, 1 ], + [ 1, 2 ], + [ 3, 4 ], + [ 4, 5 ], + [ 6, 7 ], + [ 7, 8 ], + [ 8, 2 ], + [ 8, 9 ], + [ 9, 3 ], + [ 2, 3 ], + [ 8, 12], + [ 9, 10], + [12, 9 ], + [10, 11], + [12, 13], + ] + ) + + +def get_coco_joint_names(): + return [ + "nose", # 0 + "leye", # 1 + "reye", # 2 + "lear", # 3 + "rear", # 4 + "lshoulder", # 5 + "rshoulder", # 6 + "lelbow", # 7 + "relbow", # 8 + "lwrist", # 9 + "rwrist", # 10 + "lhip", # 11 + "rhip", # 12 + "lknee", # 13 + "rknee", # 14 + "lankle", # 15 + "rankle", # 16 + ] + + +def get_ochuman_joint_names(): + return [ + 'rshoulder', + 'relbow', + 'rwrist', + 'lshoulder', + 'lelbow', + 'lwrist', + 'rhip', + 'rknee', + 'rankle', + 'lhip', + 'lknee', + 'lankle', + 'headtop', + 'neck', + 'rear', + 'lear', + 'nose', + 'reye', + 'leye' + ] + + +def 
get_crowdpose_joint_names(): + return [ + 'lshoulder', + 'rshoulder', + 'lelbow', + 'relbow', + 'lwrist', + 'rwrist', + 'lhip', + 'rhip', + 'lknee', + 'rknee', + 'lankle', + 'rankle', + 'headtop', + 'neck' + ] + +def get_coco_skeleton(): + # 0 - nose, + # 1 - leye, + # 2 - reye, + # 3 - lear, + # 4 - rear, + # 5 - lshoulder, + # 6 - rshoulder, + # 7 - lelbow, + # 8 - relbow, + # 9 - lwrist, + # 10 - rwrist, + # 11 - lhip, + # 12 - rhip, + # 13 - lknee, + # 14 - rknee, + # 15 - lankle, + # 16 - rankle, + return np.array( + [ + [15, 13], + [13, 11], + [16, 14], + [14, 12], + [11, 12], + [ 5, 11], + [ 6, 12], + [ 5, 6 ], + [ 5, 7 ], + [ 6, 8 ], + [ 7, 9 ], + [ 8, 10], + [ 1, 2 ], + [ 0, 1 ], + [ 0, 2 ], + [ 1, 3 ], + [ 2, 4 ], + [ 3, 5 ], + [ 4, 6 ] + ] + ) + + +def get_mpii_joint_names(): + return [ + "rankle", # 0 + "rknee", # 1 + "rhip", # 2 + "lhip", # 3 + "lknee", # 4 + "lankle", # 5 + "hip", # 6 + "thorax", # 7 + "neck", # 8 + "headtop", # 9 + "rwrist", # 10 + "relbow", # 11 + "rshoulder", # 12 + "lshoulder", # 13 + "lelbow", # 14 + "lwrist", # 15 + ] + + +def get_mpii_skeleton(): + # 0 - rankle, + # 1 - rknee, + # 2 - rhip, + # 3 - lhip, + # 4 - lknee, + # 5 - lankle, + # 6 - hip, + # 7 - thorax, + # 8 - neck, + # 9 - headtop, + # 10 - rwrist, + # 11 - relbow, + # 12 - rshoulder, + # 13 - lshoulder, + # 14 - lelbow, + # 15 - lwrist, + return np.array( + [ + [ 0, 1 ], + [ 1, 2 ], + [ 2, 6 ], + [ 6, 3 ], + [ 3, 4 ], + [ 4, 5 ], + [ 6, 7 ], + [ 7, 8 ], + [ 8, 9 ], + [ 7, 12], + [12, 11], + [11, 10], + [ 7, 13], + [13, 14], + [14, 15] + ] + ) + + +def get_aich_joint_names(): + return [ + "rshoulder", # 0 + "relbow", # 1 + "rwrist", # 2 + "lshoulder", # 3 + "lelbow", # 4 + "lwrist", # 5 + "rhip", # 6 + "rknee", # 7 + "rankle", # 8 + "lhip", # 9 + "lknee", # 10 + "lankle", # 11 + "headtop", # 12 + "neck", # 13 + ] + + +def get_aich_skeleton(): + # 0 - rshoulder, + # 1 - relbow, + # 2 - rwrist, + # 3 - lshoulder, + # 4 - lelbow, + # 5 - lwrist, + # 6 - rhip, + # 7 - rknee, + # 8 - rankle, + # 9 - lhip, + # 10 - lknee, + # 11 - lankle, + # 12 - headtop, + # 13 - neck, + return np.array( + [ + [ 0, 1 ], + [ 1, 2 ], + [ 3, 4 ], + [ 4, 5 ], + [ 6, 7 ], + [ 7, 8 ], + [ 9, 10], + [10, 11], + [12, 13], + [13, 0 ], + [13, 3 ], + [ 0, 6 ], + [ 3, 9 ] + ] + ) + + +def get_3dpw_joint_names(): + return [ + "nose", # 0 + "thorax", # 1 + "rshoulder", # 2 + "relbow", # 3 + "rwrist", # 4 + "lshoulder", # 5 + "lelbow", # 6 + "lwrist", # 7 + "rhip", # 8 + "rknee", # 9 + "rankle", # 10 + "lhip", # 11 + "lknee", # 12 + "lankle", # 13 + ] + + +def get_3dpw_skeleton(): + return np.array( + [ + [ 0, 1 ], + [ 1, 2 ], + [ 2, 3 ], + [ 3, 4 ], + [ 1, 5 ], + [ 5, 6 ], + [ 6, 7 ], + [ 2, 8 ], + [ 5, 11], + [ 8, 11], + [ 8, 9 ], + [ 9, 10], + [11, 12], + [12, 13] + ] + ) + + +def get_smplcoco_joint_names(): + return [ + "rankle", # 0 + "rknee", # 1 + "rhip", # 2 + "lhip", # 3 + "lknee", # 4 + "lankle", # 5 + "rwrist", # 6 + "relbow", # 7 + "rshoulder", # 8 + "lshoulder", # 9 + "lelbow", # 10 + "lwrist", # 11 + "neck", # 12 + "headtop", # 13 + "nose", # 14 + "leye", # 15 + "reye", # 16 + "lear", # 17 + "rear", # 18 + ] + + +def get_smplcoco_skeleton(): + return np.array( + [ + [ 0, 1 ], + [ 1, 2 ], + [ 3, 4 ], + [ 4, 5 ], + [ 6, 7 ], + [ 7, 8 ], + [ 8, 12], + [12, 9 ], + [ 9, 10], + [10, 11], + [12, 13], + [14, 15], + [15, 17], + [16, 18], + [14, 16], + [ 8, 2 ], + [ 9, 3 ], + [ 2, 3 ], + ] + ) + + +def get_smpl_joint_names(): + return [ + 'hips', # 0 + 'leftUpLeg', # 1 + 'rightUpLeg', # 2 + 'spine', # 3 + 'leftLeg', # 4 + 
'rightLeg', # 5 + 'spine1', # 6 + 'leftFoot', # 7 + 'rightFoot', # 8 + 'spine2', # 9 + 'leftToeBase', # 10 + 'rightToeBase', # 11 + 'neck', # 12 + 'leftShoulder', # 13 + 'rightShoulder', # 14 + 'head', # 15 + 'leftArm', # 16 + 'rightArm', # 17 + 'leftForeArm', # 18 + 'rightForeArm', # 19 + 'leftHand', # 20 + 'rightHand', # 21 + 'leftHandIndex1', # 22 + 'rightHandIndex1', # 23 + ] + + +def get_smpl_paper_joint_names(): + return [ + 'Hips', # 0 + 'Left Hip', # 1 + 'Right Hip', # 2 + 'Spine', # 3 + 'Left Knee', # 4 + 'Right Knee', # 5 + 'Spine_1', # 6 + 'Left Ankle', # 7 + 'Right Ankle', # 8 + 'Spine_2', # 9 + 'Left Toe', # 10 + 'Right Toe', # 11 + 'Neck', # 12 + 'Left Shoulder', # 13 + 'Right Shoulder', # 14 + 'Head', # 15 + 'Left Arm', # 16 + 'Right Arm', # 17 + 'Left Elbow', # 18 + 'Right Elbow', # 19 + 'Left Hand', # 20 + 'Right Hand', # 21 + 'Left Thumb', # 22 + 'Right Thumb', # 23 + ] + + +def get_smpl_neighbor_triplets(): + return [ + [ 0, 1, 2 ], # 0 + [ 1, 4, 0 ], # 1 + [ 2, 0, 5 ], # 2 + [ 3, 0, 6 ], # 3 + [ 4, 7, 1 ], # 4 + [ 5, 2, 8 ], # 5 + [ 6, 3, 9 ], # 6 + [ 7, 10, 4 ], # 7 + [ 8, 5, 11], # 8 + [ 9, 13, 14], # 9 + [10, 7, 4 ], # 10 + [11, 8, 5 ], # 11 + [12, 9, 15], # 12 + [13, 16, 9 ], # 13 + [14, 9, 17], # 14 + [15, 9, 12], # 15 + [16, 18, 13], # 16 + [17, 14, 19], # 17 + [18, 20, 16], # 18 + [19, 17, 21], # 19 + [20, 22, 18], # 20 + [21, 19, 23], # 21 + [22, 20, 18], # 22 + [23, 19, 21], # 23 + ] + + +def get_smpl_skeleton(): + return np.array( + [ + [ 0, 1 ], + [ 0, 2 ], + [ 0, 3 ], + [ 1, 4 ], + [ 2, 5 ], + [ 3, 6 ], + [ 4, 7 ], + [ 5, 8 ], + [ 6, 9 ], + [ 7, 10], + [ 8, 11], + [ 9, 12], + [ 9, 13], + [ 9, 14], + [12, 15], + [13, 16], + [14, 17], + [16, 18], + [17, 19], + [18, 20], + [19, 21], + [20, 22], + [21, 23], + ] + ) + + +def map_spin_joints_to_smpl(): + # this function primarily will be used to copy 2D keypoint + # confidences to pose parameters + return [ + [(39, 27, 28), 0], # hip,lhip,rhip->hips + [(28,), 1], # lhip->leftUpLeg + [(27,), 2], # rhip->rightUpLeg + [(41, 27, 28, 39), 3], # Spine->spine + [(29,), 4], # lknee->leftLeg + [(26,), 5], # rknee->rightLeg + [(41, 40, 33, 34,), 6], # spine, thorax ->spine1 + [(30,), 7], # lankle->leftFoot + [(25,), 8], # rankle->rightFoot + [(40, 33, 34), 9], # thorax,shoulders->spine2 + [(30,), 10], # lankle -> leftToe + [(25,), 11], # rankle -> rightToe + [(37, 42, 33, 34), 12], # neck, shoulders -> neck + [(34,), 13], # lshoulder->leftShoulder + [(33,), 14], # rshoulder->rightShoulder + [(33, 34, 38, 43, 44, 45, 46, 47, 48,), 15], # nose, eyes, ears, headtop, shoulders->head + [(34,), 16], # lshoulder->leftArm + [(33,), 17], # rshoulder->rightArm + [(35,), 18], # lelbow->leftForeArm + [(32,), 19], # relbow->rightForeArm + [(36,), 20], # lwrist->leftHand + [(31,), 21], # rwrist->rightHand + [(36,), 22], # lhand -> leftHandIndex + [(31,), 23], # rhand -> rightHandIndex + ] + + +def map_smpl_to_common(): + return [ + [(11, 8), 0], # rightToe, rightFoot -> rankle + [(5,), 1], # rightleg -> rknee, + [(2,), 2], # rhip + [(1,), 3], # lhip + [(4,), 4], # leftLeg -> lknee + [(10, 7), 5], # lefttoe, leftfoot -> lankle + [(21, 23), 6], # rwrist + [(18,), 7], # relbow + [(17, 14), 8], # rshoulder + [(16, 13), 9], # lshoulder + [(19,), 10], # lelbow + [(20, 22), 11], # lwrist + [(0, 3, 6, 9, 12), 12], # neck + [(15,), 13], # headtop + ] + + +def relation_among_spin_joints(): + # this function primarily will be used to copy 2D keypoint + # confidences to 3D joints + return [ + [(), 25], + [(), 26], + [(39,), 27], + [(39,), 28], + 
[(), 29], + [(), 30], + [(), 31], + [(), 32], + [(), 33], + [(), 34], + [(), 35], + [(), 36], + [(40,42,44,43,38,33,34,), 37], + [(43,44,45,46,47,48,33,34,), 38], + [(27,28,), 39], + [(27,28,37,41,42,), 40], + [(27,28,39,40,), 41], + [(37,38,44,45,46,47,48,), 42], + [(44,45,46,47,48,38,42,37,33,34,), 43], + [(44,45,46,47,48,38,42,37,33,34), 44], + [(44,45,46,47,48,38,42,37,33,34), 45], + [(44,45,46,47,48,38,42,37,33,34), 46], + [(44,45,46,47,48,38,42,37,33,34), 47], + [(44,45,46,47,48,38,42,37,33,34), 48], + ] \ No newline at end of file