add pare

parent e30a28bff0
commit ad0791fac6
@@ -14,7 +14,7 @@ args:
     key_from_previous: [bbox]
     key_keep: []
     args:
-      ckpt: /nas/home/shuaiqing/Code/EasyMocapPublic/data/models/pose_hrnet_w48_384x288.pth
+      ckpt: data/models/pose_hrnet_w48_384x288.pth
   vis2d:
     module: myeasymocap.io.vis.Vis2D
     skip: False

3  myeasymocap/backbone/pare/backbone/__init__.py  (new file)
@@ -0,0 +1,3 @@
# from .hrnet_pare import *
from .resnet import *
from .mobilenet import *
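
With this package init, the ResNet and MobileNet factories are re-exported (the PARE HRNet import stays commented out), so downstream code can write, for example:

from myeasymocap.backbone.pare.backbone import resnet50, mobilenet_v2
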
631  myeasymocap/backbone/pare/backbone/hrnet.py  (new file)
@@ -0,0 +1,631 @@
# ------------------------------------------------------------------------------
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Bin Xiao (Bin.Xiao@microsoft.com)
# ------------------------------------------------------------------------------
import os

import torch
import torch.nn as nn
try:
    from loguru import logger
except ImportError:
    # `logger` is used below in _check_branches() and init_weights(); fall back
    # to the stdlib so the module stays importable without loguru installed
    import logging
    logger = logging.getLogger(__name__)
import torch.nn.functional as F
from yacs.config import CfgNode as CN

models = [
    'hrnet_w32',
    'hrnet_w48',
]

BN_MOMENTUM = 0.1


def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1,
                               bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion,
                                  momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class HighResolutionModule(nn.Module):
    def __init__(self, num_branches, blocks, num_blocks, num_inchannels,
                 num_channels, fuse_method, multi_scale_output=True):
        super(HighResolutionModule, self).__init__()
        self._check_branches(
            num_branches, blocks, num_blocks, num_inchannels, num_channels)

        self.num_inchannels = num_inchannels
        self.fuse_method = fuse_method
        self.num_branches = num_branches

        self.multi_scale_output = multi_scale_output

        self.branches = self._make_branches(
            num_branches, blocks, num_blocks, num_channels)
        self.fuse_layers = self._make_fuse_layers()
        self.relu = nn.ReLU(True)

    def _check_branches(self, num_branches, blocks, num_blocks,
                        num_inchannels, num_channels):
        if num_branches != len(num_blocks):
            error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format(
                num_branches, len(num_blocks))
            logger.error(error_msg)
            raise ValueError(error_msg)

        if num_branches != len(num_channels):
            error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format(
                num_branches, len(num_channels))
            logger.error(error_msg)
            raise ValueError(error_msg)

        if num_branches != len(num_inchannels):
            error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format(
                num_branches, len(num_inchannels))
            logger.error(error_msg)
            raise ValueError(error_msg)

    def _make_one_branch(self, branch_index, block, num_blocks, num_channels,
                         stride=1):
        downsample = None
        if stride != 1 or \
                self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.num_inchannels[branch_index],
                    num_channels[branch_index] * block.expansion,
                    kernel_size=1, stride=stride, bias=False
                ),
                nn.BatchNorm2d(
                    num_channels[branch_index] * block.expansion,
                    momentum=BN_MOMENTUM
                ),
            )

        layers = []
        layers.append(
            block(
                self.num_inchannels[branch_index],
                num_channels[branch_index],
                stride,
                downsample
            )
        )
        self.num_inchannels[branch_index] = \
            num_channels[branch_index] * block.expansion
        for i in range(1, num_blocks[branch_index]):
            layers.append(
                block(
                    self.num_inchannels[branch_index],
                    num_channels[branch_index]
                )
            )

        return nn.Sequential(*layers)

    def _make_branches(self, num_branches, block, num_blocks, num_channels):
        branches = []

        for i in range(num_branches):
            branches.append(
                self._make_one_branch(i, block, num_blocks, num_channels)
            )

        return nn.ModuleList(branches)

    def _make_fuse_layers(self):
        if self.num_branches == 1:
            return None

        num_branches = self.num_branches
        num_inchannels = self.num_inchannels
        fuse_layers = []
        for i in range(num_branches if self.multi_scale_output else 1):
            fuse_layer = []
            for j in range(num_branches):
                if j > i:
                    fuse_layer.append(
                        nn.Sequential(
                            nn.Conv2d(
                                num_inchannels[j],
                                num_inchannels[i],
                                1, 1, 0, bias=False
                            ),
                            nn.BatchNorm2d(num_inchannels[i]),
                            nn.Upsample(scale_factor=2**(j-i), mode='nearest')
                        )
                    )
                elif j == i:
                    fuse_layer.append(None)
                else:
                    conv3x3s = []
                    for k in range(i-j):
                        if k == i - j - 1:
                            num_outchannels_conv3x3 = num_inchannels[i]
                            conv3x3s.append(
                                nn.Sequential(
                                    nn.Conv2d(
                                        num_inchannels[j],
                                        num_outchannels_conv3x3,
                                        3, 2, 1, bias=False
                                    ),
                                    nn.BatchNorm2d(num_outchannels_conv3x3)
                                )
                            )
                        else:
                            num_outchannels_conv3x3 = num_inchannels[j]
                            conv3x3s.append(
                                nn.Sequential(
                                    nn.Conv2d(
                                        num_inchannels[j],
                                        num_outchannels_conv3x3,
                                        3, 2, 1, bias=False
                                    ),
                                    nn.BatchNorm2d(num_outchannels_conv3x3),
                                    nn.ReLU(True)
                                )
                            )
                    fuse_layer.append(nn.Sequential(*conv3x3s))
            fuse_layers.append(nn.ModuleList(fuse_layer))

        return nn.ModuleList(fuse_layers)

    def get_num_inchannels(self):
        return self.num_inchannels

    def forward(self, x):
        if self.num_branches == 1:
            return [self.branches[0](x[0])]

        for i in range(self.num_branches):
            x[i] = self.branches[i](x[i])

        x_fuse = []

        for i in range(len(self.fuse_layers)):
            y = x[0] if i == 0 else self.fuse_layers[i][0](x[0])
            for j in range(1, self.num_branches):
                if i == j:
                    y = y + x[j]
                else:
                    y = y + self.fuse_layers[i][j](x[j])
            x_fuse.append(self.relu(y))

        return x_fuse


blocks_dict = {
    'BASIC': BasicBlock,
    'BOTTLENECK': Bottleneck
}


class PoseHighResolutionNet(nn.Module):

    def __init__(self, cfg):
        self.inplanes = 64
        extra = cfg['MODEL']['EXTRA']
        super(PoseHighResolutionNet, self).__init__()

        self.cfg = extra

        # stem net
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1,
                               bias=False)
        self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self._make_layer(Bottleneck, 64, 4)

        self.stage2_cfg = extra['STAGE2']
        num_channels = self.stage2_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage2_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))
        ]
        self.transition1 = self._make_transition_layer([256], num_channels)
        self.stage2, pre_stage_channels = self._make_stage(
            self.stage2_cfg, num_channels)

        self.stage3_cfg = extra['STAGE3']
        num_channels = self.stage3_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage3_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))
        ]
        self.transition2 = self._make_transition_layer(
            pre_stage_channels, num_channels)
        self.stage3, pre_stage_channels = self._make_stage(
            self.stage3_cfg, num_channels)

        self.stage4_cfg = extra['STAGE4']
        num_channels = self.stage4_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage4_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))
        ]
        self.transition3 = self._make_transition_layer(
            pre_stage_channels, num_channels)
        self.stage4, pre_stage_channels = self._make_stage(
            self.stage4_cfg, num_channels, multi_scale_output=True)

        self.final_layer = nn.Conv2d(
            in_channels=pre_stage_channels[0],
            out_channels=cfg['MODEL']['NUM_JOINTS'],
            kernel_size=extra['FINAL_CONV_KERNEL'],
            stride=1,
            padding=1 if extra['FINAL_CONV_KERNEL'] == 3 else 0
        )

        self.pretrained_layers = extra['PRETRAINED_LAYERS']

        if extra.DOWNSAMPLE and extra.USE_CONV:
            self.downsample_stage_1 = self._make_downsample_layer(3, num_channel=self.stage2_cfg['NUM_CHANNELS'][0])
            self.downsample_stage_2 = self._make_downsample_layer(2, num_channel=self.stage2_cfg['NUM_CHANNELS'][-1])
            self.downsample_stage_3 = self._make_downsample_layer(1, num_channel=self.stage3_cfg['NUM_CHANNELS'][-1])
        elif not extra.DOWNSAMPLE and extra.USE_CONV:
            self.upsample_stage_2 = self._make_upsample_layer(1, num_channel=self.stage2_cfg['NUM_CHANNELS'][-1])
            self.upsample_stage_3 = self._make_upsample_layer(2, num_channel=self.stage3_cfg['NUM_CHANNELS'][-1])
            self.upsample_stage_4 = self._make_upsample_layer(3, num_channel=self.stage4_cfg['NUM_CHANNELS'][-1])

    def _make_transition_layer(
            self, num_channels_pre_layer, num_channels_cur_layer):
        num_branches_cur = len(num_channels_cur_layer)
        num_branches_pre = len(num_channels_pre_layer)

        transition_layers = []
        for i in range(num_branches_cur):
            if i < num_branches_pre:
                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
                    transition_layers.append(
                        nn.Sequential(
                            nn.Conv2d(
                                num_channels_pre_layer[i],
                                num_channels_cur_layer[i],
                                3, 1, 1, bias=False
                            ),
                            nn.BatchNorm2d(num_channels_cur_layer[i]),
                            nn.ReLU(inplace=True)
                        )
                    )
                else:
                    transition_layers.append(None)
            else:
                conv3x3s = []
                for j in range(i+1-num_branches_pre):
                    inchannels = num_channels_pre_layer[-1]
                    outchannels = num_channels_cur_layer[i] \
                        if j == i-num_branches_pre else inchannels
                    conv3x3s.append(
                        nn.Sequential(
                            nn.Conv2d(
                                inchannels, outchannels, 3, 2, 1, bias=False
                            ),
                            nn.BatchNorm2d(outchannels),
                            nn.ReLU(inplace=True)
                        )
                    )
                transition_layers.append(nn.Sequential(*conv3x3s))

        return nn.ModuleList(transition_layers)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.inplanes, planes * block.expansion,
                    kernel_size=1, stride=stride, bias=False
                ),
                nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def _make_stage(self, layer_config, num_inchannels,
                    multi_scale_output=True):
        num_modules = layer_config['NUM_MODULES']
        num_branches = layer_config['NUM_BRANCHES']
        num_blocks = layer_config['NUM_BLOCKS']
        num_channels = layer_config['NUM_CHANNELS']
        block = blocks_dict[layer_config['BLOCK']]
        fuse_method = layer_config['FUSE_METHOD']

        modules = []
        for i in range(num_modules):
            # multi_scale_output is only used by the last module
            if not multi_scale_output and i == num_modules - 1:
                reset_multi_scale_output = False
            else:
                reset_multi_scale_output = True

            modules.append(
                HighResolutionModule(
                    num_branches,
                    block,
                    num_blocks,
                    num_inchannels,
                    num_channels,
                    fuse_method,
                    reset_multi_scale_output
                )
            )
            num_inchannels = modules[-1].get_num_inchannels()

        return nn.Sequential(*modules), num_inchannels

    def _make_upsample_layer(self, num_layers, num_channel, kernel_size=3):
        layers = []
        for i in range(num_layers):
            layers.append(nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True))
            layers.append(
                nn.Conv2d(
                    in_channels=num_channel, out_channels=num_channel,
                    kernel_size=kernel_size, stride=1, padding=1, bias=False,
                )
            )
            layers.append(nn.BatchNorm2d(num_channel, momentum=BN_MOMENTUM))
            layers.append(nn.ReLU(inplace=True))

        return nn.Sequential(*layers)

    def _make_downsample_layer(self, num_layers, num_channel, kernel_size=3):
        layers = []
        for i in range(num_layers):
            layers.append(
                nn.Conv2d(
                    in_channels=num_channel, out_channels=num_channel,
                    kernel_size=kernel_size, stride=2, padding=1, bias=False,
                )
            )
            layers.append(nn.BatchNorm2d(num_channel, momentum=BN_MOMENTUM))
            layers.append(nn.ReLU(inplace=True))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.layer1(x)

        x_list = []
        for i in range(self.stage2_cfg['NUM_BRANCHES']):
            if self.transition1[i] is not None:
                x_list.append(self.transition1[i](x))
            else:
                x_list.append(x)
        y_list = self.stage2(x_list)

        x_list = []
        for i in range(self.stage3_cfg['NUM_BRANCHES']):
            if self.transition2[i] is not None:
                x_list.append(self.transition2[i](y_list[-1]))
            else:
                x_list.append(y_list[i])
        y_list = self.stage3(x_list)

        x_list = []
        for i in range(self.stage4_cfg['NUM_BRANCHES']):
            if self.transition3[i] is not None:
                x_list.append(self.transition3[i](y_list[-1]))
            else:
                x_list.append(y_list[i])
        x = self.stage4(x_list)

        if self.cfg.DOWNSAMPLE:
            if self.cfg.USE_CONV:
                # Downsampling with strided convolutions
                x1 = self.downsample_stage_1(x[0])
                x2 = self.downsample_stage_2(x[1])
                x3 = self.downsample_stage_3(x[2])
                x = torch.cat([x1, x2, x3, x[3]], 1)
            else:
                # Downsampling with interpolation
                x0_h, x0_w = x[3].size(2), x[3].size(3)
                x1 = F.interpolate(x[0], size=(x0_h, x0_w), mode='bilinear', align_corners=True)
                x2 = F.interpolate(x[1], size=(x0_h, x0_w), mode='bilinear', align_corners=True)
                x3 = F.interpolate(x[2], size=(x0_h, x0_w), mode='bilinear', align_corners=True)
                x = torch.cat([x1, x2, x3, x[3]], 1)
        else:
            if self.cfg.USE_CONV:
                # Upsampling with interpolations + convolutions
                x1 = self.upsample_stage_2(x[1])
                x2 = self.upsample_stage_3(x[2])
                x3 = self.upsample_stage_4(x[3])
                x = torch.cat([x[0], x1, x2, x3], 1)
            else:
                # Upsampling with interpolation
                x0_h, x0_w = x[0].size(2), x[0].size(3)
                x1 = F.interpolate(x[1], size=(x0_h, x0_w), mode='bilinear', align_corners=True)
                x2 = F.interpolate(x[2], size=(x0_h, x0_w), mode='bilinear', align_corners=True)
                x3 = F.interpolate(x[3], size=(x0_h, x0_w), mode='bilinear', align_corners=True)
                x = torch.cat([x[0], x1, x2, x3], 1)

        return x

    def init_weights(self, pretrained=''):
        # logger.info('=> init weights from normal distribution')
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                nn.init.normal_(m.weight, std=0.001)
                for name, _ in m.named_parameters():
                    if name in ['bias']:
                        nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.ConvTranspose2d):
                nn.init.normal_(m.weight, std=0.001)
                for name, _ in m.named_parameters():
                    if name in ['bias']:
                        nn.init.constant_(m.bias, 0)

        if os.path.isfile(pretrained):
            pretrained_state_dict = torch.load(pretrained)
            logger.info('=> loading pretrained model {}'.format(pretrained))

            need_init_state_dict = {}
            for name, m in pretrained_state_dict.items():
                if name.split('.')[0] in self.pretrained_layers \
                        or self.pretrained_layers[0] == '*':  # fixed: `is '*'` -> `== '*'`
                    need_init_state_dict[name] = m
            self.load_state_dict(need_init_state_dict, strict=False)
        elif pretrained:
            # logger.warning('IMPORTANT WARNING!! Please download pre-trained models if you are in TRAINING mode!')
            # raise ValueError('{} is not exist!'.format(pretrained))
            pass


def get_pose_net(cfg, is_train):
    model = PoseHighResolutionNet(cfg)

    if is_train and cfg['MODEL']['INIT_WEIGHTS']:
        model.init_weights(cfg['MODEL']['PRETRAINED'])

    return model


def get_cfg_defaults(pretrained, width=32, downsample=False, use_conv=False):
    # pose_multi_resolution_net related params
    HRNET = CN()
    HRNET.PRETRAINED_LAYERS = [
        'conv1', 'bn1', 'conv2', 'bn2', 'layer1', 'transition1',
        'stage2', 'transition2', 'stage3', 'transition3', 'stage4',
    ]
    HRNET.STEM_INPLANES = 64
    HRNET.FINAL_CONV_KERNEL = 1
    HRNET.STAGE2 = CN()
    HRNET.STAGE2.NUM_MODULES = 1
    HRNET.STAGE2.NUM_BRANCHES = 2
    HRNET.STAGE2.NUM_BLOCKS = [4, 4]
    HRNET.STAGE2.NUM_CHANNELS = [width, width*2]
    HRNET.STAGE2.BLOCK = 'BASIC'
    HRNET.STAGE2.FUSE_METHOD = 'SUM'
    HRNET.STAGE3 = CN()
    HRNET.STAGE3.NUM_MODULES = 4
    HRNET.STAGE3.NUM_BRANCHES = 3
    HRNET.STAGE3.NUM_BLOCKS = [4, 4, 4]
    HRNET.STAGE3.NUM_CHANNELS = [width, width*2, width*4]
    HRNET.STAGE3.BLOCK = 'BASIC'
    HRNET.STAGE3.FUSE_METHOD = 'SUM'
    HRNET.STAGE4 = CN()
    HRNET.STAGE4.NUM_MODULES = 3
    HRNET.STAGE4.NUM_BRANCHES = 4
    HRNET.STAGE4.NUM_BLOCKS = [4, 4, 4, 4]
    HRNET.STAGE4.NUM_CHANNELS = [width, width*2, width*4, width*8]
    HRNET.STAGE4.BLOCK = 'BASIC'
    HRNET.STAGE4.FUSE_METHOD = 'SUM'
    HRNET.DOWNSAMPLE = downsample
    HRNET.USE_CONV = use_conv

    cfg = CN()
    cfg.MODEL = CN()
    cfg.MODEL.INIT_WEIGHTS = True
    cfg.MODEL.PRETRAINED = pretrained  # 'data/pretrained_models/hrnet_w32-36af842e.pth'
    cfg.MODEL.EXTRA = HRNET
    cfg.MODEL.NUM_JOINTS = 24
    return cfg


def hrnet_w32(
        pretrained=True,
        pretrained_ckpt='data/pretrained_models/pose_coco/pose_hrnet_w32_256x192.pth',
        downsample=False,
        use_conv=False,
):
    cfg = get_cfg_defaults(pretrained_ckpt, width=32, downsample=downsample, use_conv=use_conv)
    return get_pose_net(cfg, is_train=True)


def hrnet_w48(
        pretrained=True,
        pretrained_ckpt='data/pretrained_models/pose_coco/pose_hrnet_w48_256x192.pth',
        downsample=False,
        use_conv=False,
):
    cfg = get_cfg_defaults(pretrained_ckpt, width=48, downsample=downsample, use_conv=use_conv)
    return get_pose_net(cfg, is_train=True)
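
A quick shape check for the backbone above; this is a minimal sketch (not part of the commit) that assumes the import path from this repo and skips checkpoint loading by passing an empty path:

import torch
from myeasymocap.backbone.pare.backbone.hrnet import hrnet_w32

# pretrained_ckpt='' makes init_weights() skip loading (os.path.isfile('') is False)
model = hrnet_w32(pretrained_ckpt='', downsample=False, use_conv=False).eval()
with torch.no_grad():
    feat = model(torch.randn(1, 3, 224, 224))
# With DOWNSAMPLE=False and USE_CONV=False, all branches are interpolated to the
# highest resolution (stride 4) and concatenated: 32+64+128+256 = 480 channels
print(feat.shape)  # torch.Size([1, 480, 56, 56])
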
191  myeasymocap/backbone/pare/backbone/mobilenet.py  (new file)
@@ -0,0 +1,191 @@
from torch import nn
try:
    from torch.hub import load_state_dict_from_url
except ImportError:
    from torchvision.models.utils import load_state_dict_from_url


__all__ = ['MobileNetV2', 'mobilenet_v2']


model_urls = {
    'mobilenet_v2': 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth',
}


def _make_divisible(v, divisor, min_value=None):
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8.
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    :param v:
    :param divisor:
    :param min_value:
    :return:
    """
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


class ConvBNReLU(nn.Sequential):
    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1, norm_layer=None):
        padding = (kernel_size - 1) // 2
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        super(ConvBNReLU, self).__init__(
            nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
            norm_layer(out_planes),
            nn.ReLU6(inplace=True)
        )


class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, expand_ratio, norm_layer=None):
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2]

        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        hidden_dim = int(round(inp * expand_ratio))
        self.use_res_connect = self.stride == 1 and inp == oup

        layers = []
        if expand_ratio != 1:
            # pw
            layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1, norm_layer=norm_layer))
        layers.extend([
            # dw
            ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim, norm_layer=norm_layer),
            # pw-linear
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            norm_layer(oup),
        ])
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)


class MobileNetV2(nn.Module):
    def __init__(self,
                 num_classes=1000,
                 width_mult=1.0,
                 inverted_residual_setting=None,
                 round_nearest=8,
                 block=None,
                 norm_layer=None):
        """
        MobileNet V2 main class

        Args:
            num_classes (int): Number of classes
            width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
            inverted_residual_setting: Network structure
            round_nearest (int): Round the number of channels in each layer to be a multiple of this number.
                Set to 1 to turn off rounding
            block: Module specifying inverted residual building block for mobilenet
            norm_layer: Module specifying the normalization layer to use

        """
        super(MobileNetV2, self).__init__()

        if block is None:
            block = InvertedResidual

        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        input_channel = 32
        last_channel = 1280

        if inverted_residual_setting is None:
            inverted_residual_setting = [
                # t, c, n, s
                [1, 16, 1, 1],
                [6, 24, 2, 2],
                [6, 32, 3, 2],
                [6, 64, 4, 2],
                [6, 96, 3, 1],
                [6, 160, 3, 2],
                [6, 320, 1, 1],
            ]

        # only check the first element, assuming user knows t,c,n,s are required
        if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
            raise ValueError("inverted_residual_setting should be non-empty "
                             "or a 4-element list, got {}".format(inverted_residual_setting))

        # building first layer
        input_channel = _make_divisible(input_channel * width_mult, round_nearest)
        self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
        features = [ConvBNReLU(3, input_channel, stride=2, norm_layer=norm_layer)]
        # building inverted residual blocks
        for t, c, n, s in inverted_residual_setting:
            output_channel = _make_divisible(c * width_mult, round_nearest)
            for i in range(n):
                stride = s if i == 0 else 1
                features.append(block(input_channel, output_channel, stride, expand_ratio=t, norm_layer=norm_layer))
                input_channel = output_channel
        # building last several layers
        features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1, norm_layer=norm_layer))
        # make it nn.Sequential
        self.features = nn.Sequential(*features)

        # building classifier
        # self.classifier = nn.Sequential(
        #     nn.Dropout(0.2),
        #     nn.Linear(self.last_channel, num_classes),
        # )

        # weight initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.zeros_(m.bias)

    def _forward_impl(self, x):
        # This exists since TorchScript doesn't support inheritance, so the superclass method
        # (this one) needs to have a name other than `forward` that can be accessed in a subclass
        x = self.features(x)
        # Cannot use "squeeze" as batch-size can be 1 => must use reshape with x.shape[0]
        # x = nn.functional.adaptive_avg_pool2d(x, 1).reshape(x.shape[0], -1)
        # x = self.classifier(x)
        return x

    def forward(self, x):
        return self._forward_impl(x)


def mobilenet_v2(pretrained=False, progress=True, **kwargs):
    """
    Constructs a MobileNetV2 architecture from
    `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>`_.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    model = MobileNetV2(**kwargs)
    if pretrained:
        state_dict = load_state_dict_from_url(model_urls['mobilenet_v2'],
                                              progress=progress)
        model.load_state_dict(state_dict, strict=False)
    return model
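
Since the classifier is commented out, forward() returns the last feature map rather than logits; a minimal sanity-check sketch:

import torch
from myeasymocap.backbone.pare.backbone.mobilenet import mobilenet_v2

backbone = mobilenet_v2(pretrained=False)  # set True to pull ImageNet weights
out = backbone(torch.randn(2, 3, 224, 224))
print(out.shape)  # torch.Size([2, 1280, 7, 7]) -- overall stride 32
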
355  myeasymocap/backbone/pare/backbone/resnet.py  (new file)
@@ -0,0 +1,355 @@
import torch
import torch.nn as nn
try:
    from torch.hub import load_state_dict_from_url
except ImportError:
    from torchvision.models.utils import load_state_dict_from_url

__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152', 'resnext50_32x4d', 'resnext101_32x8d',
           'wide_resnet50_2', 'wide_resnet101_2']


model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
    'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
    'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
    'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth',
    'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth',
}


def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=dilation, groups=groups, bias=False, dilation=dilation)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(BasicBlock, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
    # while original implementation places the stride at the first 1x1 convolution(self.conv1)
    # according to "Deep residual learning for image recognition" https://arxiv.org/abs/1512.03385.
    # This variant is also known as ResNet V1.5 and improves accuracy according to
    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(Bottleneck, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        width = int(planes * (base_width / 64.)) * groups
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super(ResNet, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])
        # self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        # self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def _forward_impl(self, x):
        # See note [TorchScript super()]
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # x = self.avgpool(x)
        # x = torch.flatten(x, 1)
        # x = self.fc(x)

        return x

    def forward(self, x):
        return self._forward_impl(x)


def _resnet(arch, block, layers, pretrained, progress, **kwargs):
    model = ResNet(block, layers, **kwargs)
    if pretrained:
        state_dict = load_state_dict_from_url(model_urls[arch],
                                              progress=progress)
        model.load_state_dict(state_dict, strict=False)
    return model


def resnet18(pretrained=False, progress=True, **kwargs):
    r"""ResNet-18 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
                   **kwargs)


def resnet34(pretrained=False, progress=True, **kwargs):
    r"""ResNet-34 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress,
                   **kwargs)


def resnet50(pretrained=False, progress=True, **kwargs):
    r"""ResNet-50 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress,
                   **kwargs)


def resnet101(pretrained=False, progress=True, **kwargs):
    r"""ResNet-101 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress,
                   **kwargs)


def resnet152(pretrained=False, progress=True, **kwargs):
    r"""ResNet-152 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress,
                   **kwargs)


def resnext50_32x4d(pretrained=False, progress=True, **kwargs):
    r"""ResNeXt-50 32x4d model from
    `"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    kwargs['groups'] = 32
    kwargs['width_per_group'] = 4
    return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3],
                   pretrained, progress, **kwargs)


def resnext101_32x8d(pretrained=False, progress=True, **kwargs):
    r"""ResNeXt-101 32x8d model from
    `"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    kwargs['groups'] = 32
    kwargs['width_per_group'] = 8
    return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3],
                   pretrained, progress, **kwargs)


def wide_resnet50_2(pretrained=False, progress=True, **kwargs):
    r"""Wide ResNet-50-2 model from
    `"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_

    The model is the same as ResNet except for the bottleneck number of channels
    which is twice larger in every block. The number of channels in outer 1x1
    convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
    channels, and in Wide ResNet-50-2 has 2048-1024-2048.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    kwargs['width_per_group'] = 64 * 2
    return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3],
                   pretrained, progress, **kwargs)


def wide_resnet101_2(pretrained=False, progress=True, **kwargs):
    r"""Wide ResNet-101-2 model from
    `"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_

    The model is the same as ResNet except for the bottleneck number of channels
    which is twice larger in every block. The number of channels in outer 1x1
    convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
    channels, and in Wide ResNet-50-2 has 2048-1024-2048.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    kwargs['width_per_group'] = 64 * 2
    return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3],
                   pretrained, progress, **kwargs)
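
With the avgpool/fc head commented out, these factories act as feature extractors; a minimal sketch:

import torch
from myeasymocap.backbone.pare.backbone.resnet import resnet50

backbone = resnet50(pretrained=False)
feat = backbone(torch.randn(1, 3, 224, 224))
# layer4 output at stride 32; channel count matches get_backbone_info('resnet50')
print(feat.shape)  # torch.Size([1, 2048, 7, 7])
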
36  myeasymocap/backbone/pare/backbone/utils.py  (new file)
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

def get_backbone_info(backbone):
    info = {
        'resnet18': {'n_output_channels': 512, 'downsample_rate': 4},
        'resnet34': {'n_output_channels': 512, 'downsample_rate': 4},
        'resnet50': {'n_output_channels': 2048, 'downsample_rate': 4},
        'resnet50_adf_dropout': {'n_output_channels': 2048, 'downsample_rate': 4},
        'resnet50_dropout': {'n_output_channels': 2048, 'downsample_rate': 4},
        'resnet101': {'n_output_channels': 2048, 'downsample_rate': 4},
        'resnet152': {'n_output_channels': 2048, 'downsample_rate': 4},
        'resnext50_32x4d': {'n_output_channels': 2048, 'downsample_rate': 4},
        'resnext101_32x8d': {'n_output_channels': 2048, 'downsample_rate': 4},
        'wide_resnet50_2': {'n_output_channels': 2048, 'downsample_rate': 4},
        'wide_resnet101_2': {'n_output_channels': 2048, 'downsample_rate': 4},
        'mobilenet_v2': {'n_output_channels': 1280, 'downsample_rate': 4},
        'hrnet_w32': {'n_output_channels': 480, 'downsample_rate': 4},
        'hrnet_w48': {'n_output_channels': 720, 'downsample_rate': 4},
        # 'hrnet_w64': {'n_output_channels': 2048, 'downsample_rate': 4},
        'dla34': {'n_output_channels': 512, 'downsample_rate': 4},
    }
    return info[backbone]
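
A lookup table like this typically sizes the first layer of whatever head follows the backbone; a sketch (the head itself is hypothetical, not part of this commit):

from torch import nn
from myeasymocap.backbone.pare.backbone.utils import get_backbone_info

info = get_backbone_info('hrnet_w48')
print(info)  # {'n_output_channels': 720, 'downsample_rate': 4}
# a hypothetical head whose input width follows the chosen backbone:
head = nn.Conv2d(info['n_output_channels'], 256, kernel_size=1)
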
239  myeasymocap/backbone/pare/config.py  (new file)
@@ -0,0 +1,239 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import os
import time
import yaml
import shutil
import argparse
import operator
import itertools
from os.path import join
from functools import reduce
from yacs.config import CfgNode as CN
from typing import Dict, List, Union, Any

# from ..utils.cluster import execute_task_on_cluster

##### CONSTANTS #####
DATASET_NPZ_PATH = 'data/dataset_extras'
DATASET_LMDB_PATH = 'data/lmdb'

MMPOSE_PATH = '/is/cluster/work/mkocabas/projects/mmpose'
MMDET_PATH = '/is/cluster/work/mkocabas/projects/mmdetection'
MMPOSE_CFG = os.path.join(MMPOSE_PATH, 'configs/top_down/hrnet/coco-wholebody/hrnet_w48_coco_wholebody_256x192.py')
MMPOSE_CKPT = os.path.join(MMPOSE_PATH, 'checkpoints/hrnet_w48_coco_wholebody_256x192-643e18cb_20200922.pth')
MMDET_CFG = os.path.join(MMDET_PATH, 'configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py')
MMDET_CKPT = os.path.join(MMDET_PATH, 'checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth')

PW3D_ROOT = 'data/dataset_folders/3dpw'
OH3D_ROOT = 'data/dataset_folders/3doh'

JOINT_REGRESSOR_TRAIN_EXTRA = 'models/pare/data/J_regressor_extra.npy'
JOINT_REGRESSOR_H36M = 'models/pare/data/J_regressor_h36m.npy'
SMPL_MEAN_PARAMS = 'models/pare/data/smpl_mean_params.npz'
SMPL_MODEL_DIR = 'models/pare/data/body_models/smpl'

COCO_OCCLUDERS_FILE = 'data/occlusion_augmentation/coco_train2014_occluders.pkl'
PASCAL_OCCLUDERS_FILE = 'data/occlusion_augmentation/pascal_occluders.pkl'

DATASET_FOLDERS = {
    '3dpw': PW3D_ROOT,
    '3dpw-val': PW3D_ROOT,
    '3dpw-val-cam': PW3D_ROOT,
    '3dpw-test-cam': PW3D_ROOT,
    '3dpw-train-cam': PW3D_ROOT,
    '3dpw-cam': PW3D_ROOT,
    '3dpw-all': PW3D_ROOT,
    '3doh': OH3D_ROOT,
}

DATASET_FILES = [
    # Testing (the two comments were swapped; these entries point at *_test npz files)
    {
        '3dpw-all': join(DATASET_NPZ_PATH, '3dpw_all_test_with_mmpose.npz'),
        '3doh': join(DATASET_NPZ_PATH, '3doh_test.npz'),
    },
    # Training
    {
        '3doh': join(DATASET_NPZ_PATH, '3doh_train.npz'),
        '3dpw': join(DATASET_NPZ_PATH, '3dpw_train.npz'),
    }
]

EVAL_MESH_DATASETS = ['3dpw', '3dpw-val', '3dpw-all', '3doh']

##### CONFIGS #####
hparams = CN()

# General settings
hparams.LOG_DIR = 'logs/experiments'
hparams.METHOD = 'pare'
hparams.EXP_NAME = 'default'
hparams.RUN_TEST = False
hparams.PROJECT_NAME = 'pare'
hparams.SEED_VALUE = -1

hparams.SYSTEM = CN()
hparams.SYSTEM.GPU = ''
hparams.SYSTEM.CLUSTER_NODE = 0.0

# Dataset hparams
hparams.DATASET = CN()
hparams.DATASET.LOAD_TYPE = 'Base'
hparams.DATASET.NOISE_FACTOR = 0.4
hparams.DATASET.ROT_FACTOR = 30
hparams.DATASET.SCALE_FACTOR = 0.25
hparams.DATASET.FLIP_PROB = 0.5
hparams.DATASET.CROP_PROB = 0.0
hparams.DATASET.CROP_FACTOR = 0.0
hparams.DATASET.BATCH_SIZE = 64
hparams.DATASET.NUM_WORKERS = 8
hparams.DATASET.PIN_MEMORY = True
hparams.DATASET.SHUFFLE_TRAIN = True
hparams.DATASET.TRAIN_DS = 'all'
hparams.DATASET.VAL_DS = '3dpw_3doh'
hparams.DATASET.NUM_IMAGES = -1
hparams.DATASET.TRAIN_NUM_IMAGES = -1
hparams.DATASET.TEST_NUM_IMAGES = -1
hparams.DATASET.IMG_RES = 224
hparams.DATASET.USE_HEATMAPS = ''  # 'hm', 'hm_soft', 'part_segm', 'attention'
hparams.DATASET.RENDER_RES = 480
hparams.DATASET.MESH_COLOR = 'pinkish'
hparams.DATASET.FOCAL_LENGTH = 5000.
hparams.DATASET.IGNORE_3D = False
hparams.DATASET.USE_SYNTHETIC_OCCLUSION = False
hparams.DATASET.OCC_AUG_DATASET = 'pascal'
hparams.DATASET.USE_3D_CONF = False
hparams.DATASET.USE_GENDER = False
# This is a bit confusing, but the ratios of the in-the-wild datasets should be
# the same; otherwise the code gets a bit verbose.
hparams.DATASET.DATASETS_AND_RATIOS = 'h36m_mpii_lspet_coco_mpi-inf-3dhp_0.3_0.6_0.6_0.6_0.1'
hparams.DATASET.STAGE_DATASETS = '0+h36m_coco_0.2_0.8 2+h36m_coco_0.4_0.6'
# enable non-parametric representation
hparams.DATASET.NONPARAMETRIC = False

# Optimizer config
hparams.OPTIMIZER = CN()
hparams.OPTIMIZER.TYPE = 'adam'
hparams.OPTIMIZER.LR = 0.0001  # 0.00003
hparams.OPTIMIZER.WD = 0.0

# Training process hparams
hparams.TRAINING = CN()
hparams.TRAINING.RESUME = None
hparams.TRAINING.PRETRAINED = None
hparams.TRAINING.PRETRAINED_LIT = None
hparams.TRAINING.MAX_EPOCHS = 100
hparams.TRAINING.LOG_SAVE_INTERVAL = 50
hparams.TRAINING.LOG_FREQ_TB_IMAGES = 500
hparams.TRAINING.CHECK_VAL_EVERY_N_EPOCH = 1
hparams.TRAINING.RELOAD_DATALOADERS_EVERY_EPOCH = True
hparams.TRAINING.NUM_SMPLIFY_ITERS = 100  # 50
hparams.TRAINING.RUN_SMPLIFY = False
hparams.TRAINING.SMPLIFY_THRESHOLD = 100
hparams.TRAINING.DROPOUT_P = 0.2
hparams.TRAINING.TEST_BEFORE_TRAINING = False
hparams.TRAINING.SAVE_IMAGES = False
hparams.TRAINING.USE_PART_SEGM_LOSS = False
hparams.TRAINING.USE_AMP = False

# Testing process hparams
hparams.TESTING = CN()
hparams.TESTING.SAVE_IMAGES = False
hparams.TESTING.SAVE_FREQ = 1
hparams.TESTING.SAVE_RESULTS = True
hparams.TESTING.SAVE_MESHES = False
hparams.TESTING.SIDEVIEW = True
hparams.TESTING.TEST_ON_TRAIN_END = True
hparams.TESTING.MULTI_SIDEVIEW = False
hparams.TESTING.USE_GT_CAM = False

# PARE method hparams
hparams.PARE = CN()
hparams.PARE.BACKBONE = 'resnet50'  # hrnet_w32-conv, hrnet_w32-interp
hparams.PARE.NUM_JOINTS = 24
hparams.PARE.SOFTMAX_TEMP = 1.
hparams.PARE.NUM_FEATURES_SMPL = 64
hparams.PARE.USE_ATTENTION = False
hparams.PARE.USE_SELF_ATTENTION = False
hparams.PARE.USE_KEYPOINT_ATTENTION = False
hparams.PARE.USE_KEYPOINT_FEATURES_FOR_SMPL_REGRESSION = False
hparams.PARE.USE_POSTCONV_KEYPOINT_ATTENTION = False
hparams.PARE.KEYPOINT_ATTENTION_ACT = 'softmax'
hparams.PARE.USE_SCALE_KEYPOINT_ATTENTION = False
hparams.PARE.USE_FINAL_NONLOCAL = None
hparams.PARE.USE_BRANCH_NONLOCAL = None
hparams.PARE.USE_HMR_REGRESSION = False
hparams.PARE.USE_COATTENTION = False
hparams.PARE.NUM_COATTENTION_ITER = 1
hparams.PARE.COATTENTION_CONV = 'simple'  # 'double_1', 'double_3', 'single_1', 'single_3', 'simple'
hparams.PARE.USE_UPSAMPLING = False
hparams.PARE.DECONV_CONV_KERNEL_SIZE = 4
hparams.PARE.USE_SOFT_ATTENTION = False
hparams.PARE.NUM_BRANCH_ITERATION = 0
hparams.PARE.BRANCH_DEEPER = False
hparams.PARE.NUM_DECONV_LAYERS = 3
hparams.PARE.NUM_DECONV_FILTERS = 256
hparams.PARE.USE_RESNET_CONV_HRNET = False
hparams.PARE.USE_POS_ENC = False

hparams.PARE.ITERATIVE_REGRESSION = False
hparams.PARE.ITER_RESIDUAL = False
hparams.PARE.NUM_ITERATIONS = 3
hparams.PARE.SHAPE_INPUT_TYPE = 'feats.all_pose.shape.cam'
hparams.PARE.POSE_INPUT_TYPE = 'feats.neighbor_pose_feats.all_pose.self_pose.neighbor_pose.shape.cam'

hparams.PARE.POSE_MLP_NUM_LAYERS = 1
hparams.PARE.SHAPE_MLP_NUM_LAYERS = 1
hparams.PARE.POSE_MLP_HIDDEN_SIZE = 256
hparams.PARE.SHAPE_MLP_HIDDEN_SIZE = 256
|
||||||
|
|
||||||
|
hparams.PARE.SHAPE_LOSS_WEIGHT = 0
|
||||||
|
hparams.PARE.KEYPOINT_LOSS_WEIGHT = 5.
|
||||||
|
hparams.PARE.KEYPOINT_NATIVE_LOSS_WEIGHT = 5.
|
||||||
|
hparams.PARE.HEATMAPS_LOSS_WEIGHT = 5.
|
||||||
|
hparams.PARE.SMPL_PART_LOSS_WEIGHT = 1.
|
||||||
|
hparams.PARE.PART_SEGM_LOSS_WEIGHT = 1.
|
||||||
|
hparams.PARE.POSE_LOSS_WEIGHT = 1.
|
||||||
|
hparams.PARE.BETA_LOSS_WEIGHT = 0.001
|
||||||
|
hparams.PARE.OPENPOSE_TRAIN_WEIGHT = 0.
|
||||||
|
hparams.PARE.GT_TRAIN_WEIGHT = 1.
|
||||||
|
hparams.PARE.LOSS_WEIGHT = 60.
|
||||||
|
hparams.PARE.USE_SHAPE_REG = False
|
||||||
|
hparams.PARE.USE_MEAN_CAMSHAPE = False
|
||||||
|
hparams.PARE.USE_MEAN_POSE = False
|
||||||
|
hparams.PARE.INIT_XAVIER = False
|
||||||
|
|
||||||
|
|
||||||
|
def get_hparams_defaults():
|
||||||
|
"""Get a yacs hparamsNode object with default values for my_project."""
|
||||||
|
# Return a clone so that the defaults will not be altered
|
||||||
|
# This is for the "local variable" use pattern
|
||||||
|
return hparams.clone()
|
||||||
|
|
||||||
|
|
||||||
|
def update_hparams(hparams_file):
|
||||||
|
hparams = get_hparams_defaults()
|
||||||
|
hparams.merge_from_file(hparams_file)
|
||||||
|
return hparams.clone()
|
||||||
|
|
||||||
|
|
||||||
|
def update_hparams_from_dict(cfg_dict):
|
||||||
|
hparams = get_hparams_defaults()
|
||||||
|
cfg = hparams.load_cfg(str(cfg_dict))
|
||||||
|
hparams.merge_from_other_cfg(cfg)
|
||||||
|
return hparams.clone()
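
# Usage sketch: load the defaults, then override them from a YAML experiment
# file (the path below is hypothetical; any file with matching keys works):
#   hparams = update_hparams('configs/pare_experiment.yaml')
#   hparams.PARE.BACKBONE    # 'resnet50' unless the YAML overrides it
#   hparams.DATASET.IMG_RES  # 224 by default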
195
myeasymocap/backbone/pare/constants.py
Normal file
@ -0,0 +1,195 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import numpy as np

# Mean and standard deviation for normalizing input image
IMG_NORM_MEAN = [0.485, 0.456, 0.406]
IMG_NORM_STD = [0.229, 0.224, 0.225]

"""
We create a superset of joints containing the OpenPose joints together with the ones that each dataset provides.
We keep a superset of 24 joints such that we include all joints from every dataset.
If a dataset doesn't provide annotations for a specific joint, we simply ignore it.
The joints used here are the following:
"""
JOINT_NAMES = [
    # 25 OpenPose joints (in the order provided by OpenPose)
    'OP Nose',
    'OP Neck',
    'OP RShoulder',
    'OP RElbow',
    'OP RWrist',
    'OP LShoulder',
    'OP LElbow',
    'OP LWrist',
    'OP MidHip',
    'OP RHip',
    'OP RKnee',
    'OP RAnkle',
    'OP LHip',
    'OP LKnee',
    'OP LAnkle',
    'OP REye',
    'OP LEye',
    'OP REar',
    'OP LEar',
    'OP LBigToe',
    'OP LSmallToe',
    'OP LHeel',
    'OP RBigToe',
    'OP RSmallToe',
    'OP RHeel',
    # 24 Ground Truth joints (superset of joints from different datasets)
    'Right Ankle',
    'Right Knee',
    'Right Hip',
    'Left Hip',
    'Left Knee',
    'Left Ankle',
    'Right Wrist',
    'Right Elbow',
    'Right Shoulder',
    'Left Shoulder',
    'Left Elbow',
    'Left Wrist',
    'Neck (LSP)',
    'Top of Head (LSP)',
    'Pelvis (MPII)',
    'Thorax (MPII)',
    'Spine (H36M)',
    'Jaw (H36M)',
    'Head (H36M)',
    'Nose',
    'Left Eye',
    'Right Eye',
    'Left Ear',
    'Right Ear'
]

# Dict containing the joints in numerical order
JOINT_IDS = {JOINT_NAMES[i]: i for i in range(len(JOINT_NAMES))}

# Map joints to SMPL joints
JOINT_MAP = {
    'OP Nose': 24, 'OP Neck': 12, 'OP RShoulder': 17,
    'OP RElbow': 19, 'OP RWrist': 21, 'OP LShoulder': 16,
    'OP LElbow': 18, 'OP LWrist': 20, 'OP MidHip': 0,
    'OP RHip': 2, 'OP RKnee': 5, 'OP RAnkle': 8,
    'OP LHip': 1, 'OP LKnee': 4, 'OP LAnkle': 7,
    'OP REye': 25, 'OP LEye': 26, 'OP REar': 27,
    'OP LEar': 28, 'OP LBigToe': 29, 'OP LSmallToe': 30,
    'OP LHeel': 31, 'OP RBigToe': 32, 'OP RSmallToe': 33, 'OP RHeel': 34,
    'Right Ankle': 8, 'Right Knee': 5, 'Right Hip': 45,
    'Left Hip': 46, 'Left Knee': 4, 'Left Ankle': 7,
    'Right Wrist': 21, 'Right Elbow': 19, 'Right Shoulder': 17,
    'Left Shoulder': 16, 'Left Elbow': 18, 'Left Wrist': 20,
    'Neck (LSP)': 47, 'Top of Head (LSP)': 48,
    'Pelvis (MPII)': 49, 'Thorax (MPII)': 50,
    'Spine (H36M)': 51, 'Jaw (H36M)': 52,
    'Head (H36M)': 53, 'Nose': 24, 'Left Eye': 26,
    'Right Eye': 25, 'Left Ear': 28, 'Right Ear': 27
}

# Joint selectors
# Indices to get the 14 LSP joints from the 17 H36M joints
H36M_TO_J17 = [6, 5, 4, 1, 2, 3, 16, 15, 14, 11, 12, 13, 8, 10, 0, 7, 9]
H36M_TO_J14 = H36M_TO_J17[:14]
# Indices to get the 14 LSP joints from the ground truth joints
J24_TO_J17 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18, 14, 16, 17]
J24_TO_J14 = J24_TO_J17[:14]
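# Example (sketch): these selectors index joint arrays directly, e.g. picking
# the 14 LSP evaluation joints from a hypothetical (24, 3) joint array:
#   lsp14 = joints24[J24_TO_J14]
#   assert lsp14.shape == (14, 3)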

# Permutation of SMPL pose parameters when flipping the shape
SMPL_JOINTS_FLIP_PERM = [0, 2, 1, 3, 5, 4, 6, 8, 7, 9, 11, 10, 12, 14, 13, 15, 17, 16, 19, 18, 21, 20, 23, 22]
SMPL_POSE_FLIP_PERM = []
for i in SMPL_JOINTS_FLIP_PERM:
    SMPL_POSE_FLIP_PERM.append(3*i)
    SMPL_POSE_FLIP_PERM.append(3*i+1)
    SMPL_POSE_FLIP_PERM.append(3*i+2)
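# Sketch of the usual SPIN-style flip built on this permutation: mirroring a
# (72,) axis-angle pose swaps left/right joints and negates y/z components.
#   flipped = pose[SMPL_POSE_FLIP_PERM].copy()
#   flipped[1::3] *= -1  # negate y
#   flipped[2::3] *= -1  # negate z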
# Permutation indices for the 24 ground truth joints
J24_FLIP_PERM = [5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13, 14, 15, 16, 17, 18, 19, 21, 20, 23, 22]
# Permutation indices for the full set of 49 joints
J49_FLIP_PERM = [0, 1, 5, 6, 7, 2, 3, 4, 8, 12, 13, 14, 9, 10, 11, 16, 15, 18, 17, 22, 23, 24, 19, 20, 21]\
              + [25+i for i in J24_FLIP_PERM]

SMPLH_TO_SMPL = np.arange(0, 156).reshape((-1, 3))[
    np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 37])
].reshape(-1)
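# Reading of the index array above (assumption about the SMPL-H layout):
# SMPL-H packs 52 joints (22 body + 15 per hand); keeping the body joints plus
# the first joint of each hand maps its 156-dim pose down to SMPL's 72 dims.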

pw3d_occluded_sequences = [
    'courtyard_backpack',
    'courtyard_basketball',
    'courtyard_bodyScannerMotions',
    'courtyard_box',
    'courtyard_golf',
    'courtyard_jacket',
    'courtyard_laceShoe',
    'downtown_stairs',
    'flat_guitar',
    'flat_packBags',
    'outdoors_climbing',
    'outdoors_crosscountry',
    'outdoors_fencing',
    'outdoors_freestyle',
    'outdoors_golf',
    'outdoors_parcours',
    'outdoors_slalom',
]

pw3d_test_sequences = [
    'flat_packBags_00',
    'downtown_weeklyMarket_00',
    'outdoors_fencing_01',
    'downtown_walkBridge_01',
    'downtown_enterShop_00',
    'downtown_rampAndStairs_00',
    'downtown_bar_00',
    'downtown_runForBus_01',
    'downtown_cafe_00',
    'flat_guitar_01',
    'downtown_runForBus_00',
    'downtown_sitOnStairs_00',
    'downtown_bus_00',
    'downtown_arguing_00',
    'downtown_crossStreets_00',
    'downtown_walkUphill_00',
    'downtown_walking_00',
    'downtown_car_00',
    'downtown_warmWelcome_00',
    'downtown_upstairs_00',
    'downtown_stairs_00',
    'downtown_windowShopping_00',
    'office_phoneCall_00',
    'downtown_downstairs_00'
]

pw3d_cam_sequences = [
    # TEST
    'downtown_downstairs_00',
    'downtown_stairs_00',
    'downtown_rampAndStairs_00',
    'flat_packBags_00',
    'flat_guitar_01',
    'downtown_warmWelcome_00',
    'downtown_walkUphill_00',
    # VALIDATION
    'outdoors_parcours_01',
    'outdoors_crosscountry_00',
    'outdoors_freestyle_01',
    'downtown_walkDownhill_00',
    'outdoors_parcours_00',
]
4
myeasymocap/backbone/pare/head/__init__.py
Normal file
@ -0,0 +1,4 @@
from .pare_head import PareHead
from .hmr_head import HMRHead
# from .smpl_head import SMPLHead
# from .smpl_cam_head import SMPLCamHead
203
myeasymocap/backbone/pare/head/hmr_head.py
Normal file
@ -0,0 +1,203 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import math
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F  # required by the uncertainty activation below

from ..config import SMPL_MEAN_PARAMS
from ..utils.geometry import rot6d_to_rotmat, rotmat_to_rot6d

BN_MOMENTUM = 0.1


class HMRHead(nn.Module):
    def __init__(
            self,
            num_input_features,
            smpl_mean_params=SMPL_MEAN_PARAMS,
            estimate_var=False,
            use_separate_var_branch=False,
            uncertainty_activation='',
            backbone='resnet50',
            use_cam_feats=False,
    ):
        super(HMRHead, self).__init__()

        npose = 24 * 6
        self.npose = npose
        self.estimate_var = estimate_var
        self.use_separate_var_branch = use_separate_var_branch
        self.uncertainty_activation = uncertainty_activation
        self.backbone = backbone
        self.num_input_features = num_input_features
        self.use_cam_feats = use_cam_feats

        if use_cam_feats:
            num_input_features += 7  # 6d rotmat + vfov

        self.avgpool = nn.AdaptiveAvgPool2d(1)  # nn.AvgPool2d(7, stride=1)
        self.fc1 = nn.Linear(num_input_features + npose + 13, 1024)
        self.drop1 = nn.Dropout()
        self.fc2 = nn.Linear(1024, 1024)
        self.drop2 = nn.Dropout()

        if self.estimate_var:
            # estimate variance for pose and shape parameters
            if self.use_separate_var_branch:
                # Decouple var estimation layer using separate linear layers
                self.decpose = nn.Linear(1024, npose)
                self.decshape = nn.Linear(1024, 10)
                self.deccam = nn.Linear(1024, 3)
                self.decpose_var = nn.Linear(1024, npose)
                self.decshape_var = nn.Linear(1024, 10)
                nn.init.xavier_uniform_(self.decpose_var.weight, gain=0.01)
                nn.init.xavier_uniform_(self.decshape_var.weight, gain=0.01)
            else:
                # double the output sizes to estimate var
                self.decpose = nn.Linear(1024, npose * 2)
                self.decshape = nn.Linear(1024, 10 * 2)
                self.deccam = nn.Linear(1024, 3)
        else:
            self.decpose = nn.Linear(1024, npose)
            self.decshape = nn.Linear(1024, 10)
            self.deccam = nn.Linear(1024, 3)

        nn.init.xavier_uniform_(self.decpose.weight, gain=0.01)
        nn.init.xavier_uniform_(self.decshape.weight, gain=0.01)
        nn.init.xavier_uniform_(self.deccam.weight, gain=0.01)

        if self.backbone.startswith('hrnet'):
            self.downsample_module = self._make_head()

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

        mean_params = np.load(smpl_mean_params)
        init_pose = torch.from_numpy(mean_params['pose'][:]).unsqueeze(0)
        init_shape = torch.from_numpy(mean_params['shape'][:].astype('float32')).unsqueeze(0)
        init_cam = torch.from_numpy(mean_params['cam']).unsqueeze(0)
        self.register_buffer('init_pose', init_pose)
        self.register_buffer('init_shape', init_shape)
        self.register_buffer('init_cam', init_cam)

    def _make_head(self):
        # downsampling modules
        downsamp_modules = []
        for i in range(3):
            in_channels = self.num_input_features
            out_channels = self.num_input_features

            downsamp_module = nn.Sequential(
                nn.Conv2d(in_channels=in_channels,
                          out_channels=out_channels,
                          kernel_size=3,
                          stride=2,
                          padding=1),
                nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM),
                nn.ReLU(inplace=True)
            )

            downsamp_modules.append(downsamp_module)

        downsamp_modules = nn.Sequential(*downsamp_modules)

        return downsamp_modules

    def forward(
            self,
            features,
            init_pose=None,
            init_shape=None,
            init_cam=None,
            cam_rotmat=None,
            cam_vfov=None,
            n_iter=3
    ):
        # if self.backbone.startswith('hrnet'):
        #     features = self.downsample_module(features)

        batch_size = features.shape[0]

        if init_pose is None:
            init_pose = self.init_pose.expand(batch_size, -1)
        if init_shape is None:
            init_shape = self.init_shape.expand(batch_size, -1)
        if init_cam is None:
            init_cam = self.init_cam.expand(batch_size, -1)

        xf = self.avgpool(features)
        xf = xf.view(xf.size(0), -1)

        pred_pose = init_pose
        pred_shape = init_shape
        pred_cam = init_cam
        for i in range(n_iter):
            if self.use_cam_feats:
                xc = torch.cat([xf, pred_pose, pred_shape, pred_cam,
                                rotmat_to_rot6d(cam_rotmat), cam_vfov.unsqueeze(-1)], 1)
            else:
                xc = torch.cat([xf, pred_pose, pred_shape, pred_cam], 1)
            xc = self.fc1(xc)
            xc = self.drop1(xc)
            xc = self.fc2(xc)
            xc = self.drop2(xc)
            if self.estimate_var:
                pred_pose = self.decpose(xc)[:, :self.npose] + pred_pose
                pred_shape = self.decshape(xc)[:, :10] + pred_shape
                pred_cam = self.deccam(xc) + pred_cam

                if self.use_separate_var_branch:
                    pred_pose_var = self.decpose_var(xc)
                    pred_shape_var = self.decshape_var(xc)
                else:
                    pred_pose_var = self.decpose(xc)[:, self.npose:]
                    pred_shape_var = self.decshape(xc)[:, 10:]

                if self.uncertainty_activation != '':
                    # Use an activation layer to output uncertainty; getattr
                    # looks up the named activation (e.g. 'softplus') on
                    # torch.nn.functional, like the original eval() call did
                    pred_pose_var = getattr(F, self.uncertainty_activation)(pred_pose_var)
                    pred_shape_var = getattr(F, self.uncertainty_activation)(pred_shape_var)
            else:
                pred_pose = self.decpose(xc) + pred_pose
                pred_shape = self.decshape(xc) + pred_shape
                pred_cam = self.deccam(xc) + pred_cam

        pred_rotmat = rot6d_to_rotmat(pred_pose).view(batch_size, 24, 3, 3)

        output = {
            'pred_pose': pred_rotmat,
            'pred_cam': pred_cam,
            'pred_shape': pred_shape,
            'pred_pose_6d': pred_pose,
        }

        if self.estimate_var:
            output.update({
                'pred_pose_var': torch.cat([pred_pose, pred_pose_var], dim=1),
                'pred_shape_var': torch.cat([pred_shape, pred_shape_var], dim=1),
            })

        return output


def keep_variance(x, min_variance):
    return x + min_variance
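
A minimal forward-pass sketch for HMRHead, assuming ResNet-50 features of shape [N, 2048, 7, 7] and that the SMPL mean-params file referenced by the config exists on disk:

    import torch
    head = HMRHead(num_input_features=2048)
    out = head(torch.randn(2, 2048, 7, 7))
    print(out['pred_pose'].shape)   # torch.Size([2, 24, 3, 3])
    print(out['pred_shape'].shape)  # torch.Size([2, 10])
    print(out['pred_cam'].shape)    # torch.Size([2, 3])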
926
myeasymocap/backbone/pare/head/pare_head.py
Normal file
@ -0,0 +1,926 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

from ..config import SMPL_MEAN_PARAMS
from ..layers.coattention import CoAttention
from ..utils.geometry import rot6d_to_rotmat, get_coord_maps
from ..utils.kp_utils import get_smpl_neighbor_triplets
from ..layers.softargmax import softargmax2d, get_heatmap_preds
from ..layers import LocallyConnected2d, KeypointAttention, interpolate
from ..layers.non_local import dot_product
from ..backbone.resnet import conv3x3, conv1x1, BasicBlock


class logger:
    # Minimal no-op stand-in for loguru's logger; the stubs below cover every
    # logger.* call made in this module (info, warning and debug).
    @staticmethod
    def info(*args, **kwargs):
        pass

    @staticmethod
    def warning(*args, **kwargs):
        pass

    @staticmethod
    def debug(*args, **kwargs):
        pass


BN_MOMENTUM = 0.1


class PareHead(nn.Module):
    def __init__(
            self,
            num_joints,
            num_input_features,
            softmax_temp=1.0,
            num_deconv_layers=3,
            num_deconv_filters=(256, 256, 256),
            num_deconv_kernels=(4, 4, 4),
            num_camera_params=3,
            num_features_smpl=64,
            final_conv_kernel=1,
            iterative_regression=False,
            iter_residual=False,
            num_iterations=3,
            shape_input_type='feats',  # 'feats.pose.shape.cam'
            pose_input_type='feats',  # 'feats.neighbor_pose_feats.all_pose.self_pose.neighbor_pose.shape.cam'
            pose_mlp_num_layers=1,
            shape_mlp_num_layers=1,
            pose_mlp_hidden_size=256,
            shape_mlp_hidden_size=256,
            use_keypoint_features_for_smpl_regression=False,
            use_heatmaps='',
            use_keypoint_attention=False,
            use_postconv_keypoint_attention=False,
            keypoint_attention_act='softmax',
            use_scale_keypoint_attention=False,
            use_branch_nonlocal=None,  # 'concatenation', 'dot_product', 'embedded_gaussian', 'gaussian'
            use_final_nonlocal=None,  # 'concatenation', 'dot_product', 'embedded_gaussian', 'gaussian'
            backbone='resnet',
            use_hmr_regression=False,
            use_coattention=False,
            num_coattention_iter=1,
            coattention_conv='simple',  # 'double_1', 'double_3', 'single_1', 'single_3', 'simple'
            use_upsampling=False,
            use_soft_attention=False,  # Stefan & Otmar 3DV style attention
            num_branch_iteration=0,
            branch_deeper=False,
            use_resnet_conv_hrnet=False,
            use_position_encodings=None,
            use_mean_camshape=False,
            use_mean_pose=False,
            init_xavier=False,
    ):
        super(PareHead, self).__init__()
        self.backbone = backbone
        self.num_joints = num_joints
        self.deconv_with_bias = False
        self.use_heatmaps = use_heatmaps
        self.num_iterations = num_iterations
        self.use_final_nonlocal = use_final_nonlocal
        self.use_branch_nonlocal = use_branch_nonlocal
        self.use_hmr_regression = use_hmr_regression
        self.use_coattention = use_coattention
        self.num_coattention_iter = num_coattention_iter
        self.coattention_conv = coattention_conv
        self.use_soft_attention = use_soft_attention
        self.num_branch_iteration = num_branch_iteration
        self.iter_residual = iter_residual
        self.iterative_regression = iterative_regression
        self.pose_mlp_num_layers = pose_mlp_num_layers
        self.shape_mlp_num_layers = shape_mlp_num_layers
        self.pose_mlp_hidden_size = pose_mlp_hidden_size
        self.shape_mlp_hidden_size = shape_mlp_hidden_size
        self.use_keypoint_attention = use_keypoint_attention
        self.use_keypoint_features_for_smpl_regression = use_keypoint_features_for_smpl_regression
        self.use_position_encodings = use_position_encodings
        self.use_mean_camshape = use_mean_camshape
        self.use_mean_pose = use_mean_pose

        self.num_input_features = num_input_features

        if use_soft_attention:
            # These options should be True by default when soft attention is used
            self.use_keypoint_features_for_smpl_regression = True
            self.use_hmr_regression = True
            self.use_coattention = False
            logger.warning('Coattention cannot be used together with soft attention')
            logger.warning('Overriding use_coattention=False')

        if use_coattention:
            self.use_keypoint_features_for_smpl_regression = False
            logger.warning('"use_keypoint_features_for_smpl_regression" cannot be used together with co-attention')
            logger.warning('Overriding "use_keypoint_features_for_smpl_regression"=False')

        if use_hmr_regression:
            self.iterative_regression = False
            logger.warning('iterative_regression cannot be used together with hmr regression')

        if self.use_heatmaps in ['part_segm', 'attention']:
            logger.info('"Keypoint Attention" should be activated to be able to use part segmentation')
            logger.info('Overriding use_keypoint_attention')
            self.use_keypoint_attention = True

        assert num_iterations > 0, '"num_iterations" should be greater than 0.'

        if use_position_encodings:
            assert backbone.startswith('hrnet'), 'backbone should be hrnet to use position encodings'
            # self.pos_enc = get_coord_maps(size=56)
            self.register_buffer('pos_enc', get_coord_maps(size=56))
            num_input_features += 2
            self.num_input_features = num_input_features

        if backbone.startswith('hrnet'):
            if use_resnet_conv_hrnet:
                logger.info('Using resnet block for keypoint and smpl conv layers...')
                self.keypoint_deconv_layers = self._make_res_conv_layers(
                    input_channels=self.num_input_features,
                    num_channels=num_deconv_filters[-1],
                    num_basic_blocks=num_deconv_layers,
                )
                self.num_input_features = num_input_features
                self.smpl_deconv_layers = self._make_res_conv_layers(
                    input_channels=self.num_input_features,
                    num_channels=num_deconv_filters[-1],
                    num_basic_blocks=num_deconv_layers,
                )
            else:
                self.keypoint_deconv_layers = self._make_conv_layer(
                    num_deconv_layers,
                    num_deconv_filters,
                    (3,)*num_deconv_layers,
                )
                self.num_input_features = num_input_features
                self.smpl_deconv_layers = self._make_conv_layer(
                    num_deconv_layers,
                    num_deconv_filters,
                    (3,)*num_deconv_layers,
                )
        else:
            # part branch that estimates 2d keypoints

            conv_fn = self._make_upsample_layer if use_upsampling else self._make_deconv_layer

            if use_upsampling:
                logger.info('Upsampling is active to increase spatial dimension')
                logger.info(f'Upsampling conv kernels: {num_deconv_kernels}')

            self.keypoint_deconv_layers = conv_fn(
                num_deconv_layers,
                num_deconv_filters,
                num_deconv_kernels,
            )
            # reset inplanes to 2048 -> final resnet layer
            self.num_input_features = num_input_features
            self.smpl_deconv_layers = conv_fn(
                num_deconv_layers,
                num_deconv_filters,
                num_deconv_kernels,
            )

        pose_mlp_inp_dim = num_deconv_filters[-1]
        smpl_final_dim = num_features_smpl
        shape_mlp_inp_dim = num_joints * smpl_final_dim

        if self.use_soft_attention:
            logger.info('Soft attention (Stefan & Otmar 3DV) is active')
            self.keypoint_final_layer = nn.Sequential(
                conv3x3(num_deconv_filters[-1], 256),
                nn.BatchNorm2d(256),
                nn.ReLU(inplace=True),
                conv1x1(256, num_joints+1 if self.use_heatmaps in ('part_segm', 'part_segm_pool') else num_joints),
            )

            soft_att_feature_size = smpl_final_dim  # if use_hmr_regression else pose_mlp_inp_dim
            self.smpl_final_layer = nn.Sequential(
                conv3x3(num_deconv_filters[-1], 256),
                nn.BatchNorm2d(256),
                nn.ReLU(inplace=True),
                conv1x1(256, soft_att_feature_size),
            )
            # pose_mlp_inp_dim = soft_att_feature_size
        else:
            self.keypoint_final_layer = nn.Conv2d(
                in_channels=num_deconv_filters[-1],
                out_channels=num_joints+1 if self.use_heatmaps in ('part_segm', 'part_segm_pool') else num_joints,
                kernel_size=final_conv_kernel,
                stride=1,
                padding=1 if final_conv_kernel == 3 else 0,
            )

            self.smpl_final_layer = nn.Conv2d(
                in_channels=num_deconv_filters[-1],
                out_channels=smpl_final_dim,
                kernel_size=final_conv_kernel,
                stride=1,
                padding=1 if final_conv_kernel == 3 else 0,
            )

        # temperature for softargmax function
        self.register_buffer('temperature', torch.tensor(softmax_temp))

        # if self.iterative_regression or self.num_branch_iteration > 0 or self.use_coattention:
        mean_params = np.load(SMPL_MEAN_PARAMS)
        init_pose = torch.from_numpy(mean_params['pose'][:]).unsqueeze(0)
        init_shape = torch.from_numpy(mean_params['shape'][:].astype('float32')).unsqueeze(0)
        init_cam = torch.from_numpy(mean_params['cam']).unsqueeze(0)
        self.register_buffer('init_pose', init_pose)
        self.register_buffer('init_shape', init_shape)
        self.register_buffer('init_cam', init_cam)

        if self.iterative_regression:
            # enable iterative regression similar to HMR
            # these are the features that can be used as input to final MLPs
            input_type_dim = {
                'feats': 0,  # image features for self
                'neighbor_pose_feats': 2 * 256,  # image features from neighbor joints
                'all_pose': 24 * 6,  # rot6d of all joints from previous iter
                'self_pose': 6,  # rot6d of self
                'neighbor_pose': 2 * 6,  # rot6d of neighbor joints from previous iter
                'shape': 10,  # smpl betas/shape
                'cam': num_camera_params,  # weak perspective camera
            }

            assert 'feats' in shape_input_type, '"feats" should be the default value'
            assert 'feats' in pose_input_type, '"feats" should be the default value'

            self.shape_input_type = shape_input_type.split('.')
            self.pose_input_type = pose_input_type.split('.')

            pose_mlp_inp_dim = pose_mlp_inp_dim + sum([input_type_dim[x] for x in self.pose_input_type])
            shape_mlp_inp_dim = shape_mlp_inp_dim + sum([input_type_dim[x] for x in self.shape_input_type])

            logger.debug(f'Shape MLP takes "{self.shape_input_type}" as input, '
                         f'input dim: {shape_mlp_inp_dim}')
            logger.debug(f'Pose MLP takes "{self.pose_input_type}" as input, '
                         f'input dim: {pose_mlp_inp_dim}')

        self.pose_mlp_inp_dim = pose_mlp_inp_dim
        self.shape_mlp_inp_dim = shape_mlp_inp_dim

        if self.use_hmr_regression:
            logger.info('HMR regression is active...')
            # enable iterative regression similar to HMR

            self.fc1 = nn.Linear(num_joints * smpl_final_dim + (num_joints * 6) + 10 + num_camera_params, 1024)
            self.drop1 = nn.Dropout()
            self.fc2 = nn.Linear(1024, 1024)
            self.drop2 = nn.Dropout()
            self.decpose = nn.Linear(1024, (num_joints * 6))
            self.decshape = nn.Linear(1024, 10)
            self.deccam = nn.Linear(1024, num_camera_params)

            nn.init.xavier_uniform_(self.decpose.weight, gain=0.01)
            nn.init.xavier_uniform_(self.decshape.weight, gain=0.01)
            nn.init.xavier_uniform_(self.deccam.weight, gain=0.01)
        else:
            # here we use 2 different MLPs to estimate shape and camera
            # They take a channelwise downsampled version of smpl features
            self.shape_mlp = self._get_shape_mlp(output_size=10)
            self.cam_mlp = self._get_shape_mlp(output_size=num_camera_params)

            # for pose each joint has a separate MLP
            # weights for these MLPs are not shared
            # hence we use Locally Connected layers
            # TODO support kernel_size > 1 to access context of other joints
            self.pose_mlp = self._get_pose_mlp(num_joints=num_joints, output_size=6)

            if init_xavier:
                nn.init.xavier_uniform_(self.shape_mlp.weight, gain=0.01)
                nn.init.xavier_uniform_(self.cam_mlp.weight, gain=0.01)
                nn.init.xavier_uniform_(self.pose_mlp.weight, gain=0.01)

        if self.use_branch_nonlocal:
            logger.info(f'Branch nonlocal is active, type {self.use_branch_nonlocal}')
            self.branch_2d_nonlocal = eval(self.use_branch_nonlocal).NONLocalBlock2D(
                in_channels=num_deconv_filters[-1],
                sub_sample=False,
                bn_layer=True,
            )

            self.branch_3d_nonlocal = eval(self.use_branch_nonlocal).NONLocalBlock2D(
                in_channels=num_deconv_filters[-1],
                sub_sample=False,
                bn_layer=True,
            )

        if self.use_final_nonlocal:
            logger.info(f'Final nonlocal is active, type {self.use_final_nonlocal}')
            self.final_pose_nonlocal = eval(self.use_final_nonlocal).NONLocalBlock1D(
                in_channels=self.pose_mlp_inp_dim,
                sub_sample=False,
                bn_layer=True,
            )

            self.final_shape_nonlocal = eval(self.use_final_nonlocal).NONLocalBlock1D(
                in_channels=num_features_smpl,
                sub_sample=False,
                bn_layer=True,
            )

        if self.use_keypoint_attention:
            logger.info('Keypoint attention is active')
            self.keypoint_attention = KeypointAttention(
                use_conv=use_postconv_keypoint_attention,
                in_channels=(self.pose_mlp_inp_dim, smpl_final_dim),
                out_channels=(self.pose_mlp_inp_dim, smpl_final_dim),
                act=keypoint_attention_act,
                use_scale=use_scale_keypoint_attention,
            )

        if self.use_coattention:
            logger.info(f'Coattention is active, final conv type {self.coattention_conv}')
            self.coattention = CoAttention(n_channel=num_deconv_filters[-1], final_conv=self.coattention_conv)

        if self.num_branch_iteration > 0:
            logger.info('Branch iteration is active')
            if branch_deeper:
                self.branch_iter_2d_nonlocal = nn.Sequential(
                    conv3x3(num_deconv_filters[-1], 256),
                    nn.BatchNorm2d(256),
                    nn.ReLU(inplace=True),
                    dot_product.NONLocalBlock2D(
                        in_channels=num_deconv_filters[-1],
                        sub_sample=False,
                        bn_layer=True,
                    )
                )

                self.branch_iter_3d_nonlocal = nn.Sequential(
                    conv3x3(num_deconv_filters[-1], 256),
                    nn.BatchNorm2d(256),
                    nn.ReLU(inplace=True),
                    dot_product.NONLocalBlock2D(
                        in_channels=num_deconv_filters[-1],
                        sub_sample=False,
                        bn_layer=True,
                    )
                )
            else:
                self.branch_iter_2d_nonlocal = dot_product.NONLocalBlock2D(
                    in_channels=num_deconv_filters[-1],
                    sub_sample=False,
                    bn_layer=True,
                )

                self.branch_iter_3d_nonlocal = dot_product.NONLocalBlock2D(
                    in_channels=num_deconv_filters[-1],
                    sub_sample=False,
                    bn_layer=True,
                )

    def _get_shape_mlp(self, output_size):
        if self.shape_mlp_num_layers == 1:
            return nn.Linear(self.shape_mlp_inp_dim, output_size)

        module_list = []
        for i in range(self.shape_mlp_num_layers):
            if i == 0:
                module_list.append(
                    nn.Linear(self.shape_mlp_inp_dim, self.shape_mlp_hidden_size)
                )
            elif i == self.shape_mlp_num_layers - 1:
                module_list.append(
                    nn.Linear(self.shape_mlp_hidden_size, output_size)
                )
            else:
                module_list.append(
                    nn.Linear(self.shape_mlp_hidden_size, self.shape_mlp_hidden_size)
                )
        return nn.Sequential(*module_list)

    def _get_pose_mlp(self, num_joints, output_size):
        if self.pose_mlp_num_layers == 1:
            return LocallyConnected2d(
                in_channels=self.pose_mlp_inp_dim,
                out_channels=output_size,
                output_size=[num_joints, 1],
                kernel_size=1,
                stride=1,
            )

        module_list = []
        for i in range(self.pose_mlp_num_layers):
            if i == 0:
                module_list.append(
                    LocallyConnected2d(
                        in_channels=self.pose_mlp_inp_dim,
                        out_channels=self.pose_mlp_hidden_size,
                        output_size=[num_joints, 1],
                        kernel_size=1,
                        stride=1,
                    )
                )
            elif i == self.pose_mlp_num_layers - 1:
                module_list.append(
                    LocallyConnected2d(
                        in_channels=self.pose_mlp_hidden_size,
                        out_channels=output_size,
                        output_size=[num_joints, 1],
                        kernel_size=1,
                        stride=1,
                    )
                )
            else:
                module_list.append(
                    LocallyConnected2d(
                        in_channels=self.pose_mlp_hidden_size,
                        out_channels=self.pose_mlp_hidden_size,
                        output_size=[num_joints, 1],
                        kernel_size=1,
                        stride=1,
                    )
                )
        return nn.Sequential(*module_list)

    def _get_deconv_cfg(self, deconv_kernel):
        if deconv_kernel == 4:
            padding = 1
            output_padding = 0
        elif deconv_kernel == 3:
            padding = 1
            output_padding = 1
        elif deconv_kernel == 2:
            padding = 0
            output_padding = 0

        return deconv_kernel, padding, output_padding
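
    # Note: each (kernel, padding, output_padding) triple above makes a
    # stride-2 ConvTranspose2d exactly double the spatial size, since
    #   out = (in - 1) * 2 - 2 * padding + kernel + output_padding = 2 * in
    # holds for (4, 1, 0), (3, 1, 1) and (2, 0, 0) alike.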

    def _make_conv_layer(self, num_layers, num_filters, num_kernels):
        assert num_layers == len(num_filters), \
            'ERROR: num_conv_layers does not match len(num_conv_filters)'
        assert num_layers == len(num_kernels), \
            'ERROR: num_conv_layers does not match len(num_conv_kernels)'
        layers = []
        for i in range(num_layers):
            kernel, padding, output_padding = \
                self._get_deconv_cfg(num_kernels[i])

            planes = num_filters[i]
            layers.append(
                nn.Conv2d(
                    in_channels=self.num_input_features,
                    out_channels=planes,
                    kernel_size=kernel,
                    stride=1,
                    padding=padding,
                    bias=self.deconv_with_bias))
            layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM))
            layers.append(nn.ReLU(inplace=True))
            self.num_input_features = planes

        return nn.Sequential(*layers)

    def _make_res_conv_layers(self, input_channels, num_channels=64,
                              num_heads=1, num_basic_blocks=2):
        head_layers = []

        # kernel_sizes, strides, paddings = self._get_trans_cfg()
        # for kernel_size, padding, stride in zip(kernel_sizes, paddings, strides):
        head_layers.append(nn.Sequential(
            nn.Conv2d(
                in_channels=input_channels,
                out_channels=num_channels,
                kernel_size=3,
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(num_channels, momentum=BN_MOMENTUM),
            nn.ReLU(inplace=True))
        )

        for i in range(num_heads):
            layers = []
            for _ in range(num_basic_blocks):
                layers.append(nn.Sequential(BasicBlock(num_channels, num_channels)))
            head_layers.append(nn.Sequential(*layers))

        # head_layers.append(nn.Conv2d(in_channels=num_channels, out_channels=output_channels,
        #                              kernel_size=1, stride=1, padding=0))

        return nn.Sequential(*head_layers)

    def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
        assert num_layers == len(num_filters), \
            'ERROR: num_deconv_layers does not match len(num_deconv_filters)'
        assert num_layers == len(num_kernels), \
            'ERROR: num_deconv_layers does not match len(num_deconv_kernels)'

        layers = []
        for i in range(num_layers):
            kernel, padding, output_padding = \
                self._get_deconv_cfg(num_kernels[i])

            planes = num_filters[i]
            layers.append(
                nn.ConvTranspose2d(
                    in_channels=self.num_input_features,
                    out_channels=planes,
                    kernel_size=kernel,
                    stride=2,
                    padding=padding,
                    output_padding=output_padding,
                    bias=self.deconv_with_bias))
            layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM))
            layers.append(nn.ReLU(inplace=True))
            # if self.use_self_attention:
            #     layers.append(SelfAttention(planes))
            self.num_input_features = planes

        return nn.Sequential(*layers)

    def _make_upsample_layer(self, num_layers, num_filters, num_kernels):
        assert num_layers == len(num_filters), \
            'ERROR: num_layers does not match len(num_filters)'
        assert num_layers == len(num_kernels), \
            'ERROR: num_layers does not match len(num_kernels)'

        layers = []
        for i in range(num_layers):
            kernel, padding, output_padding = \
                self._get_deconv_cfg(num_kernels[i])

            planes = num_filters[i]
            layers.append(nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True))
            layers.append(
                nn.Conv2d(in_channels=self.num_input_features, out_channels=planes,
                          kernel_size=kernel, stride=1, padding=padding, bias=self.deconv_with_bias)
            )
            layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM))
            layers.append(nn.ReLU(inplace=True))
            # if self.use_self_attention:
            #     layers.append(SelfAttention(planes))
            self.num_input_features = planes

        return nn.Sequential(*layers)

    def _prepare_pose_mlp_inp(self, feats, pred_pose, pred_shape, pred_cam):
        # feats shape: [N, 256, J, 1]
        # pose shape: [N, 6, J, 1]
        # cam shape: [N, 3]
        # beta shape: [N, 10]
        batch_size, num_joints = pred_pose.shape[0], pred_pose.shape[2]

        joint_triplets = get_smpl_neighbor_triplets()

        inp_list = []

        for inp_type in self.pose_input_type:
            if inp_type == 'feats':
                # add image features
                inp_list.append(feats)

            if inp_type == 'neighbor_pose_feats':
                # add the image features from neighboring joints
                n_pose_feat = []
                for jt in joint_triplets:
                    n_pose_feat.append(
                        feats[:, :, jt[1:]].reshape(batch_size, -1, 1).unsqueeze(-2)
                    )
                n_pose_feat = torch.cat(n_pose_feat, 2)
                inp_list.append(n_pose_feat)

            if inp_type == 'self_pose':
                # add the predicted pose of the joint itself
                inp_list.append(pred_pose)

            if inp_type == 'all_pose':
                # append all of the joint angles
                all_pose = pred_pose.reshape(batch_size, -1, 1)[..., None].repeat(1, 1, num_joints, 1)
                inp_list.append(all_pose)

            if inp_type == 'neighbor_pose':
                # append only the joint angles of neighboring ones
                n_pose = []
                for jt in joint_triplets:
                    n_pose.append(
                        pred_pose[:, :, jt[1:]].reshape(batch_size, -1, 1).unsqueeze(-2)
                    )
                n_pose = torch.cat(n_pose, 2)
                inp_list.append(n_pose)

            if inp_type == 'shape':
                # append shape predictions
                pred_shape = pred_shape[..., None, None].repeat(1, 1, num_joints, 1)
                inp_list.append(pred_shape)

            if inp_type == 'cam':
                # append camera predictions
                pred_cam = pred_cam[..., None, None].repeat(1, 1, num_joints, 1)
                inp_list.append(pred_cam)

        assert len(inp_list) > 0

        return torch.cat(inp_list, 1)

    def _prepare_shape_mlp_inp(self, feats, pred_pose, pred_shape, pred_cam):
        # feats shape: [N, 256, J, 1]
        # pose shape: [N, 6, J, 1]
        # cam shape: [N, 3]
        # beta shape: [N, 10]
        batch_size, num_joints = pred_pose.shape[:2]

        inp_list = []

        for inp_type in self.shape_input_type:
            if inp_type == 'feats':
                # add image features
                inp_list.append(feats)

            if inp_type == 'all_pose':
                # append all of the joint angles
                pred_pose = pred_pose.reshape(batch_size, -1)
                inp_list.append(pred_pose)

            if inp_type == 'shape':
                # append shape predictions
                inp_list.append(pred_shape)

            if inp_type == 'cam':
                # append camera predictions
                inp_list.append(pred_cam)

        assert len(inp_list) > 0

        return torch.cat(inp_list, 1)

    def forward(self, features, gt_segm=None):
        batch_size = features.shape[0]

        init_pose = self.init_pose.expand(batch_size, -1)  # N, Jx6
        init_shape = self.init_shape.expand(batch_size, -1)
        init_cam = self.init_cam.expand(batch_size, -1)

        if self.use_position_encodings:
            features = torch.cat((features, self.pos_enc.repeat(features.shape[0], 1, 1, 1)), 1)

        output = {}

        ############## 2D PART BRANCH FEATURES ##############
        part_feats = self._get_2d_branch_feats(features)

        ############## GET PART ATTENTION MAP ##############
        part_attention = self._get_part_attention_map(part_feats, output)

        ############## 3D SMPL BRANCH FEATURES ##############
        smpl_feats = self._get_3d_smpl_feats(features, part_feats)

        ############## SAMPLE LOCAL FEATURES ##############
        if gt_segm is not None:
            # use the ground-truth part segmentation as the attention map
            gt_segm = F.interpolate(gt_segm.unsqueeze(1).float(), scale_factor=(1/4, 1/4), mode='nearest').long().squeeze(1)
            part_attention = F.one_hot(gt_segm.to('cpu'), num_classes=self.num_joints + 1).permute(0, 3, 1, 2).float()[:, 1:, :, :]
            # follow the input device rather than hardcoding 'cuda'
            part_attention = part_attention.to(features.device)
            # part_attention = F.interpolate(part_attention, scale_factor=1/4, mode='bilinear', align_corners=True)
        point_local_feat, cam_shape_feats = self._get_local_feats(smpl_feats, part_attention, output)

        ############## GET FINAL PREDICTIONS ##############
        pred_pose, pred_shape, pred_cam = self._get_final_preds(
            point_local_feat, cam_shape_feats, init_pose, init_shape, init_cam
        )

        if self.use_coattention:
            for c in range(self.num_coattention_iter):
                smpl_feats, part_feats = self.coattention(smpl_feats, part_feats)
                part_attention = self._get_part_attention_map(part_feats, output)
                point_local_feat, cam_shape_feats = self._get_local_feats(smpl_feats, part_attention, output)
                pred_pose, pred_shape, pred_cam = self._get_final_preds(
                    point_local_feat, cam_shape_feats, pred_pose, pred_shape, pred_cam
                )

        if self.num_branch_iteration > 0:
            for nbi in range(self.num_branch_iteration):
                if self.use_soft_attention:
                    smpl_feats = self.branch_iter_3d_nonlocal(smpl_feats)
                    part_feats = self.branch_iter_2d_nonlocal(part_feats)
                else:
                    smpl_feats = self.branch_iter_3d_nonlocal(smpl_feats)
                    part_feats = smpl_feats

                part_attention = self._get_part_attention_map(part_feats, output)
                point_local_feat, cam_shape_feats = self._get_local_feats(smpl_feats, part_attention, output)
                pred_pose, pred_shape, pred_cam = self._get_final_preds(
                    point_local_feat, cam_shape_feats, pred_pose, pred_shape, pred_cam,
                )

        pred_rotmat = rot6d_to_rotmat(pred_pose).reshape(batch_size, 24, 3, 3)

        output.update({
            'pred_pose': pred_rotmat,
            'pred_cam': pred_cam,
            'pred_shape': pred_shape,
        })
        return output
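
    # Usage sketch (assumes the repo's layer modules and the SMPL mean-params
    # file are available; the 8x8 map stands in for ResNet backbone features):
    #   head = PareHead(num_joints=24, num_input_features=2048)
    #   out = head(torch.randn(2, 2048, 8, 8))
    #   out['pred_pose']: [2, 24, 3, 3]; out['pred_shape']: [2, 10]; out['pred_cam']: [2, 3]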

    def _get_local_feats(self, smpl_feats, part_attention, output):
        cam_shape_feats = self.smpl_final_layer(smpl_feats)

        if self.use_keypoint_attention:
            point_local_feat = self.keypoint_attention(smpl_feats, part_attention)
            cam_shape_feats = self.keypoint_attention(cam_shape_feats, part_attention)
        else:
            point_local_feat = interpolate(smpl_feats, output['pred_kp2d'])
            cam_shape_feats = interpolate(cam_shape_feats, output['pred_kp2d'])
        return point_local_feat, cam_shape_feats

    def _get_2d_branch_feats(self, features):
        part_feats = self.keypoint_deconv_layers(features)
        if self.use_branch_nonlocal:
            part_feats = self.branch_2d_nonlocal(part_feats)
        return part_feats

    def _get_3d_smpl_feats(self, features, part_feats):
        if self.use_keypoint_features_for_smpl_regression:
            smpl_feats = part_feats
        else:
            smpl_feats = self.smpl_deconv_layers(features)
            if self.use_branch_nonlocal:
                smpl_feats = self.branch_3d_nonlocal(smpl_feats)

        return smpl_feats

    def _get_part_attention_map(self, part_feats, output):
        heatmaps = self.keypoint_final_layer(part_feats)

        if self.use_heatmaps == 'hm':
            # returns coords between [-1,1]
            pred_kp2d, confidence = get_heatmap_preds(heatmaps)
            output['pred_kp2d'] = pred_kp2d
            output['pred_kp2d_conf'] = confidence
            output['pred_heatmaps_2d'] = heatmaps
        elif self.use_heatmaps == 'hm_soft':
            pred_kp2d, _ = softargmax2d(heatmaps, self.temperature)
            output['pred_kp2d'] = pred_kp2d
            output['pred_heatmaps_2d'] = heatmaps
        elif self.use_heatmaps == 'part_segm':
            output['pred_segm_mask'] = heatmaps
            heatmaps = heatmaps[:, 1:, :, :]  # remove the first channel which encodes the background
        elif self.use_heatmaps == 'part_segm_pool':
            output['pred_segm_mask'] = heatmaps
            heatmaps = heatmaps[:, 1:, :, :]  # remove the first channel which encodes the background
            pred_kp2d, _ = softargmax2d(heatmaps, self.temperature)  # get_heatmap_preds(heatmaps)
            output['pred_kp2d'] = pred_kp2d

            for k, v in output.items():
                if torch.any(torch.isnan(v)):
                    logger.debug(f'{k} is NaN!')
                if torch.any(torch.isinf(v)):
                    logger.debug(f'{k} is Inf!')
        elif self.use_heatmaps == 'attention':
            output['pred_attention'] = heatmaps
        else:
            # returns coords between [-1,1]
            pred_kp2d, _ = softargmax2d(heatmaps, self.temperature)
            output['pred_kp2d'] = pred_kp2d
            output['pred_heatmaps_2d'] = heatmaps
        return heatmaps

    def _get_final_preds(self, pose_feats, cam_shape_feats, init_pose, init_shape, init_cam):
        if self.use_hmr_regression:
            return self._hmr_get_final_preds(cam_shape_feats, init_pose, init_shape, init_cam)
        else:
            return self._pare_get_final_preds(pose_feats, cam_shape_feats, init_pose, init_shape, init_cam)

    def _hmr_get_final_preds(self, cam_shape_feats, init_pose, init_shape, init_cam):
        if self.use_final_nonlocal:
            cam_shape_feats = self.final_shape_nonlocal(cam_shape_feats)

        xf = torch.flatten(cam_shape_feats, start_dim=1)

        pred_pose = init_pose
        pred_shape = init_shape
        pred_cam = init_cam
        for i in range(3):
            xc = torch.cat([xf, pred_pose, pred_shape, pred_cam], 1)
            xc = self.fc1(xc)
            xc = self.drop1(xc)
            xc = self.fc2(xc)
            xc = self.drop2(xc)
            pred_pose = self.decpose(xc) + pred_pose
            pred_shape = self.decshape(xc) + pred_shape
            pred_cam = self.deccam(xc) + pred_cam

        return pred_pose, pred_shape, pred_cam

    def _pare_get_final_preds(self, pose_feats, cam_shape_feats, init_pose, init_shape, init_cam):
        pose_feats = pose_feats.unsqueeze(-1)  # [N, C, J] -> [N, C, J, 1]

        if init_pose.shape[-1] == 6:
            # This means init_pose comes from a previous iteration
            init_pose = init_pose.transpose(2, 1).unsqueeze(-1)
        else:
            # This means init_pose comes from mean pose
            init_pose = init_pose.reshape(init_pose.shape[0], 6, -1).unsqueeze(-1)

        if self.iterative_regression:

            shape_feats = torch.flatten(cam_shape_feats, start_dim=1)

            pred_pose = init_pose  # [N, 6, J, 1]
            pred_cam = init_cam  # [N, 3]
            pred_shape = init_shape  # [N, 10]

            for i in range(self.num_iterations):
                # pose_feats shape: [N, 256, 24, 1]
                # shape_feats shape: [N, 24*64]
                pose_mlp_inp = self._prepare_pose_mlp_inp(pose_feats, pred_pose, pred_shape, pred_cam)
                shape_mlp_inp = self._prepare_shape_mlp_inp(shape_feats, pred_pose, pred_shape, pred_cam)

                # TODO: this does not work, but let it go since we don't use iterative regression for now.
                # if self.use_final_nonlocal:
                #     pose_mlp_inp = self.final_pose_nonlocal(pose_mlp_inp)
                #     shape_mlp_inp = self.final_shape_nonlocal(shape_mlp_inp)

                if self.iter_residual:
                    pred_pose = self.pose_mlp(pose_mlp_inp) + pred_pose
                    pred_cam = self.cam_mlp(shape_mlp_inp) + pred_cam
                    pred_shape = self.shape_mlp(shape_mlp_inp) + pred_shape
                else:
                    pred_pose = self.pose_mlp(pose_mlp_inp)
                    pred_cam = self.cam_mlp(shape_mlp_inp)
                    pred_shape = self.shape_mlp(shape_mlp_inp) + init_shape
        else:
            shape_feats = cam_shape_feats
            if self.use_final_nonlocal:
                pose_feats = self.final_pose_nonlocal(pose_feats.squeeze(-1)).unsqueeze(-1)
                shape_feats = self.final_shape_nonlocal(shape_feats)

            shape_feats = torch.flatten(shape_feats, start_dim=1)

            pred_pose = self.pose_mlp(pose_feats)
            pred_cam = self.cam_mlp(shape_feats)
            pred_shape = self.shape_mlp(shape_feats)

            if self.use_mean_camshape:
                pred_cam = pred_cam + init_cam
                pred_shape = pred_shape + init_shape

            if self.use_mean_pose:
                pred_pose = pred_pose + init_pose

        pred_pose = pred_pose.squeeze(-1).transpose(2, 1)  # N, J, 6
        return pred_pose, pred_shape, pred_cam

    def forward_pretraining(self, features):
        # TODO: implement pretraining
        kp_feats = self.keypoint_deconv_layers(features)
        heatmaps = self.keypoint_final_layer(kp_feats)

        output = {}

        if self.use_heatmaps == 'hm':
            # returns coords between [-1,1]
            pred_kp2d, confidence = get_heatmap_preds(heatmaps)
            output['pred_kp2d'] = pred_kp2d
            output['pred_kp2d_conf'] = confidence
        elif self.use_heatmaps == 'hm_soft':
|
||||||
|
pred_kp2d, _ = softargmax2d(heatmaps, self.temperature)
|
||||||
|
output['pred_kp2d'] = pred_kp2d
|
||||||
|
else:
|
||||||
|
# returns coords between [-1,1]
|
||||||
|
pred_kp2d, _ = softargmax2d(heatmaps, self.temperature)
|
||||||
|
output['pred_kp2d'] = pred_kp2d
|
||||||
|
|
||||||
|
if self.use_keypoint_features_for_smpl_regression:
|
||||||
|
smpl_feats = kp_feats
|
||||||
|
else:
|
||||||
|
smpl_feats = self.smpl_deconv_layers(features)
|
||||||
|
|
||||||
|
cam_shape_feats = self.smpl_final_layer(smpl_feats)
|
||||||
|
|
||||||
|
output.update({
|
||||||
|
'kp_feats': heatmaps,
|
||||||
|
'heatmaps': heatmaps,
|
||||||
|
'smpl_feats': smpl_feats,
|
||||||
|
'cam_shape_feats': cam_shape_feats,
|
||||||
|
})
|
||||||
|
return output
|
133
myeasymocap/backbone/pare/head/smpl_cam_head.py
Normal file
@ -0,0 +1,133 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import torch
import torch.nn as nn

from .. import config
from .smpl_head import SMPL


class SMPLCamHead(nn.Module):
    def __init__(self, img_res=224):
        super(SMPLCamHead, self).__init__()
        self.smpl = SMPL(config.SMPL_MODEL_DIR, create_transl=False)
        self.add_module('smpl', self.smpl)

        self.img_res = img_res

    def forward(self, rotmat, shape, cam, cam_rotmat, cam_intrinsics,
                bbox_scale, bbox_center, img_w, img_h, normalize_joints2d=False):
        '''
        :param rotmat: rotation matrices (N,J,3,3)
        :param shape: smpl betas
        :param cam: weak perspective camera
        :param normalize_joints2d: bool, normalize joints between -1, 1 if true
        :param cam_rotmat (Nx3x3) camera rotation matrix
        :param cam_intrinsics (Nx3x3) camera intrinsics matrix
        :param bbox_scale (N,) bbox height normalized by 200
        :param bbox_center (N,2) bbox center
        :param img_w (N,) original image width
        :param img_h (N,) original image height
        :return: dict with keys 'vertices', 'joints3d', 'joints2d' if cam is True
        '''
        smpl_output = self.smpl(
            betas=shape,
            body_pose=rotmat[:, 1:].contiguous(),
            global_orient=rotmat[:, 0].unsqueeze(1).contiguous(),
            pose2rot=False,
        )

        output = {
            'smpl_vertices': smpl_output.vertices,
            'smpl_joints3d': smpl_output.joints,
        }

        joints3d = smpl_output.joints

        cam_t = convert_pare_to_full_img_cam(
            pare_cam=cam,
            bbox_height=bbox_scale * 200.,
            bbox_center=bbox_center,
            img_w=img_w,
            img_h=img_h,
            focal_length=cam_intrinsics[:, 0, 0],
            crop_res=self.img_res,
        )

        joints2d = perspective_projection(
            joints3d,
            rotation=cam_rotmat,
            translation=cam_t,
            cam_intrinsics=cam_intrinsics,
        )

        if normalize_joints2d:
            # Normalize keypoints to [-1,1]
            joints2d = joints2d / (self.img_res / 2.)

        output['smpl_joints2d'] = joints2d
        output['pred_cam_t'] = cam_t

        return output


def perspective_projection(points, rotation, translation, cam_intrinsics):
    """
    This function computes the perspective projection of a set of points.
    Input:
        points (bs, N, 3): 3D points
        rotation (bs, 3, 3): Camera rotation
        translation (bs, 3): Camera translation
        cam_intrinsics (bs, 3, 3): Camera intrinsics
    """
    K = cam_intrinsics

    # Transform points
    points = torch.einsum('bij,bkj->bki', rotation, points)
    points = points + translation.unsqueeze(1)

    # Apply perspective distortion
    projected_points = points / points[:, :, -1].unsqueeze(-1)

    # Apply camera intrinsics
    projected_points = torch.einsum('bij,bkj->bki', K, projected_points.float())

    return projected_points[:, :, :-1]


def convert_pare_to_full_img_cam(
        pare_cam, bbox_height, bbox_center,
        img_w, img_h, focal_length, crop_res=224):
    # Converts the weak perspective camera estimated by PARE in
    # bbox coords to a perspective camera in full image coordinates,
    # following https://arxiv.org/pdf/2009.06549.pdf
    s, tx, ty = pare_cam[:, 0], pare_cam[:, 1], pare_cam[:, 2]
    res = crop_res  # crop resolution of the network input (was hardcoded to 224)
    r = bbox_height / res
    tz = 2 * focal_length / (r * res * s)

    cx = 2 * (bbox_center[:, 0] - (img_w / 2.)) / (s * bbox_height)
    cy = 2 * (bbox_center[:, 1] - (img_h / 2.)) / (s * bbox_height)

    cam_t = torch.stack([tx + cx, ty + cy, tz], dim=-1)

    return cam_t
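
# A minimal usage sketch (not part of the original file), with dummy inputs:
# convert a PARE weak-perspective camera to a full-image translation and project
# random joints with the helpers above. Only the shapes are meaningful here.
if __name__ == '__main__':
    N = 2
    cam = torch.tensor([[1.0, 0.0, 0.0]]).repeat(N, 1)         # (s, tx, ty)
    bbox_center = torch.tensor([[320.0, 240.0]]).repeat(N, 1)  # bbox centers in pixels
    bbox_scale = torch.full((N,), 1.2)                         # bbox height / 200
    K = torch.eye(3).repeat(N, 1, 1)
    K[:, 0, 0] = K[:, 1, 1] = 5000.                            # focal length in pixels

    cam_t = convert_pare_to_full_img_cam(
        pare_cam=cam, bbox_height=bbox_scale * 200., bbox_center=bbox_center,
        img_w=torch.full((N,), 640.), img_h=torch.full((N,), 480.),
        focal_length=K[:, 0, 0])
    joints2d = perspective_projection(
        torch.rand(N, 49, 3) + torch.tensor([0., 0., 5.]),  # keep points in front of the camera
        rotation=torch.eye(3).expand(N, 3, 3), translation=cam_t,
        cam_intrinsics=K)
    print(cam_t.shape, joints2d.shape)  # torch.Size([2, 3]) torch.Size([2, 49, 2])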
104
myeasymocap/backbone/pare/head/smpl_head.py
Normal file
@ -0,0 +1,104 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import torch
import numpy as np
import torch.nn as nn

from smplx import SMPL as _SMPL
from smplx.utils import SMPLOutput
from smplx.lbs import vertices2joints

from .. import config, constants
from ..utils.geometry import perspective_projection, convert_weak_perspective_to_perspective


class SMPL(_SMPL):
    """ Extension of the official SMPL implementation to support more joints """

    def __init__(self, *args, **kwargs):
        super(SMPL, self).__init__(*args, **kwargs)
        joints = [constants.JOINT_MAP[i] for i in constants.JOINT_NAMES]
        J_regressor_extra = np.load(config.JOINT_REGRESSOR_TRAIN_EXTRA)
        self.register_buffer('J_regressor_extra', torch.tensor(J_regressor_extra, dtype=torch.float32))
        self.joint_map = torch.tensor(joints, dtype=torch.long)

    def forward(self, *args, **kwargs):
        kwargs['get_skin'] = True
        smpl_output = super(SMPL, self).forward(*args, **kwargs)
        extra_joints = vertices2joints(self.J_regressor_extra, smpl_output.vertices)
        joints = torch.cat([smpl_output.joints, extra_joints], dim=1)
        joints = joints[:, self.joint_map, :]
        output = SMPLOutput(vertices=smpl_output.vertices,
                            global_orient=smpl_output.global_orient,
                            body_pose=smpl_output.body_pose,
                            joints=joints,
                            betas=smpl_output.betas,
                            full_pose=smpl_output.full_pose)
        return output


class SMPLHead(nn.Module):
    def __init__(self, focal_length=5000., img_res=224):
        super(SMPLHead, self).__init__()
        self.smpl = SMPL(config.SMPL_MODEL_DIR, create_transl=False)
        self.add_module('smpl', self.smpl)
        self.focal_length = focal_length
        self.img_res = img_res

    def forward(self, rotmat, shape, cam=None, normalize_joints2d=False):
        '''
        :param rotmat: rotation matrices (N,J,3,3)
        :param shape: smpl betas
        :param cam: weak perspective camera
        :param normalize_joints2d: bool, normalize joints between -1, 1 if true
        :return: dict with keys 'vertices', 'joints3d', 'joints2d' if cam is True
        '''
        smpl_output = self.smpl(
            betas=shape,
            body_pose=rotmat[:, 1:].contiguous(),
            global_orient=rotmat[:, 0].unsqueeze(1).contiguous(),
            pose2rot=False,
        )

        output = {
            'smpl_vertices': smpl_output.vertices,
            'smpl_joints3d': smpl_output.joints,
        }
        if cam is not None:
            joints3d = smpl_output.joints
            batch_size = joints3d.shape[0]
            device = joints3d.device
            cam_t = convert_weak_perspective_to_perspective(
                cam,
                focal_length=self.focal_length,
                img_res=self.img_res,
            )
            joints2d = perspective_projection(
                joints3d,
                rotation=torch.eye(3, device=device).unsqueeze(0).expand(batch_size, -1, -1),
                translation=cam_t,
                focal_length=self.focal_length,
                camera_center=torch.zeros(batch_size, 2, device=device)
            )
            if normalize_joints2d:
                # Normalize keypoints to [-1,1]
                joints2d = joints2d / (self.img_res / 2.)

            output['smpl_joints2d'] = joints2d
            output['pred_cam_t'] = cam_t

        return output
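
# A minimal usage sketch (not part of the original file). It assumes valid SMPL
# model files exist under config.SMPL_MODEL_DIR; without them the SMPL
# constructor will fail. Identity rotations give the T-pose.
if __name__ == '__main__':
    head = SMPLHead()
    rotmat = torch.eye(3).expand(2, 24, 3, 3)           # identity pose, batch of 2
    betas = torch.zeros(2, 10)
    cam = torch.tensor([[1.0, 0.0, 0.0]]).expand(2, 3)  # weak perspective (s, tx, ty)
    out = head(rotmat, betas, cam=cam)
    print(out['smpl_vertices'].shape, out['smpl_joints2d'].shape)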
4
myeasymocap/backbone/pare/layers/__init__.py
Normal file
@ -0,0 +1,4 @@
from .locallyconnected2d import LocallyConnected2d
from .interpolate import interpolate
from .nonlocalattention import NonLocalAttention
from .keypoint_attention import KeypointAttention
126
myeasymocap/backbone/pare/layers/coattention.py
Normal file
@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import torch
import torch.nn as nn
import torch.nn.functional as F

from ..backbone.resnet import conv1x1, conv3x3


class CoAttention(nn.Module):
    def __init__(
            self,
            n_channel,
            final_conv='simple',  # 'double_1', 'double_3', 'single_1', 'single_3', 'simple'
    ):
        super(CoAttention, self).__init__()
        self.linear_e = nn.Linear(n_channel, n_channel, bias=False)
        self.channel = n_channel
        self.gate = nn.Conv2d(n_channel, 1, kernel_size=1, bias=False)
        self.gate_s = nn.Sigmoid()
        self.softmax = nn.Sigmoid()  # kept from the original implementation; unused in forward

        if final_conv.startswith('double'):
            kernel_size = int(final_conv[-1])
            conv = conv1x1 if kernel_size == 1 else conv3x3
            self.final_conv_1 = nn.Sequential(
                conv(n_channel * 2, n_channel),
                nn.BatchNorm2d(n_channel),
                nn.ReLU(inplace=True),
                conv(n_channel, n_channel),
                nn.BatchNorm2d(n_channel),
                nn.ReLU(inplace=True),
            )
            self.final_conv_2 = nn.Sequential(
                conv(n_channel * 2, n_channel),
                nn.BatchNorm2d(n_channel),
                nn.ReLU(inplace=True),
                conv(n_channel, n_channel),
                nn.BatchNorm2d(n_channel),
                nn.ReLU(inplace=True),
            )
        elif final_conv.startswith('single'):
            kernel_size = int(final_conv[-1])
            conv = conv1x1 if kernel_size == 1 else conv3x3
            self.final_conv_1 = nn.Sequential(
                conv(n_channel * 2, n_channel),
                nn.BatchNorm2d(n_channel),
                nn.ReLU(inplace=True),
            )
            self.final_conv_2 = nn.Sequential(
                conv(n_channel * 2, n_channel),
                nn.BatchNorm2d(n_channel),
                nn.ReLU(inplace=True),
            )
        elif final_conv == 'simple':
            self.final_conv_1 = conv1x1(n_channel * 2, n_channel)
            self.final_conv_2 = conv1x1(n_channel * 2, n_channel)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                m.weight.data.normal_(0, 0.01)
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def forward(self, input_1, input_2):
        '''
        input_1: [N, C, H, W]
        input_2: [N, C, H, W]
        '''
        b, c, h, w = input_1.shape
        exemplar, query = input_1, input_2

        exemplar_flat = exemplar.reshape(-1, c, h * w)  # N,C,H*W
        query_flat = query.reshape(-1, c, h * w)

        # Compute coattention scores, S in the paper
        exemplar_t = torch.transpose(exemplar_flat, 1, 2).contiguous()  # batch size x num x dim
        exemplar_corr = self.linear_e(exemplar_t)
        A = torch.bmm(exemplar_corr, query_flat)
        A1 = F.softmax(A.clone(), dim=1)
        B = F.softmax(torch.transpose(A, 1, 2), dim=1)
        query_att = torch.bmm(exemplar_flat, A1)
        exemplar_att = torch.bmm(query_flat, B)

        input1_att = exemplar_att.reshape(-1, c, h, w)
        input2_att = query_att.reshape(-1, c, h, w)

        # Apply gating on S (gated coattention)
        input1_mask = self.gate(input1_att)
        input2_mask = self.gate(input2_att)

        input1_mask = self.gate_s(input1_mask)
        input2_mask = self.gate_s(input2_mask)

        input1_att = input1_att * input1_mask
        input2_att = input2_att * input2_mask

        # Concatenate inputs with their attended version
        input1_att = torch.cat([input1_att, exemplar], 1)
        input2_att = torch.cat([input2_att, query], 1)

        input1 = self.final_conv_1(input1_att)
        input2 = self.final_conv_2(input2_att)

        return input1, input2
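
# A minimal usage sketch (not part of the original file): co-attention between
# two random feature maps of equal shape.
if __name__ == '__main__':
    coatt = CoAttention(n_channel=256, final_conv='simple')
    f1 = torch.rand(2, 256, 14, 14)
    f2 = torch.rand(2, 256, 14, 14)
    o1, o2 = coatt(f1, f2)
    print(o1.shape, o2.shape)  # both torch.Size([2, 256, 14, 14])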
35
myeasymocap/backbone/pare/layers/interpolate.py
Normal file
@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import torch


def interpolate(feat, uv):
    '''
    :param feat: [B, C, H, W] image features
    :param uv: [B, 2, N] uv coordinates in the image plane, range [-1, 1]
    :return: [B, C, N] image features at the uv coordinates
    '''
    if uv.shape[-1] != 2:
        uv = uv.transpose(1, 2)  # [B, N, 2]
    uv = uv.unsqueeze(2)  # [B, N, 1, 2]
    # NOTE: on newer PyTorch versions training results can degrade because the
    # default behaviour of F.grid_sample changed; align_corners=True restores
    # the old behaviour. The argument only exists from PyTorch 1.3 onwards.
    version = tuple(int(v) for v in torch.__version__.split('.')[:2])
    if version < (1, 3):
        samples = torch.nn.functional.grid_sample(feat, uv)  # [B, C, N, 1]
    else:
        samples = torch.nn.functional.grid_sample(feat, uv, align_corners=True)  # [B, C, N, 1]
    return samples[:, :, :, 0]  # [B, C, N]
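
# A minimal usage sketch (not part of the original file): sampling a feature map
# at normalized uv coordinates.
if __name__ == '__main__':
    feat = torch.rand(2, 64, 56, 56)
    uv = torch.rand(2, 2, 24) * 2 - 1  # 24 query points per sample, in [-1, 1]
    print(interpolate(feat, uv).shape)  # torch.Size([2, 64, 24])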
56
myeasymocap/backbone/pare/layers/keypoint_attention.py
Normal file
@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F


class KeypointAttention(nn.Module):
    def __init__(self, use_conv=False, in_channels=(256, 64), out_channels=(256, 64), act='softmax', use_scale=False):
        super(KeypointAttention, self).__init__()
        self.use_conv = use_conv
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.act = act
        self.use_scale = use_scale
        if use_conv:
            self.conv1x1_pose = nn.Conv1d(in_channels[0], out_channels[0], kernel_size=1)
            self.conv1x1_shape_cam = nn.Conv1d(in_channels[1], out_channels[1], kernel_size=1)

    def forward(self, features, heatmaps):
        batch_size, num_joints, height, width = heatmaps.shape

        if self.use_scale:
            scale = 1.0 / np.sqrt(height * width)
            heatmaps = heatmaps * scale

        if self.act == 'softmax':
            normalized_heatmap = F.softmax(heatmaps.reshape(batch_size, num_joints, -1), dim=-1)
        elif self.act == 'sigmoid':
            normalized_heatmap = torch.sigmoid(heatmaps.reshape(batch_size, num_joints, -1))
        else:
            raise ValueError(f'Unknown activation: {self.act}')
        features = features.reshape(batch_size, -1, height * width)

        attended_features = torch.matmul(normalized_heatmap, features.transpose(2, 1))
        attended_features = attended_features.transpose(2, 1)

        if self.use_conv:
            if attended_features.shape[1] == self.in_channels[0]:
                attended_features = self.conv1x1_pose(attended_features)
            else:
                attended_features = self.conv1x1_shape_cam(attended_features)

        return attended_features
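
# A minimal usage sketch (not part of the original file): pooling per-joint
# features with softmax-normalized heatmaps as attention weights.
if __name__ == '__main__':
    att = KeypointAttention(use_conv=False)
    features = torch.rand(2, 256, 56, 56)
    heatmaps = torch.rand(2, 24, 56, 56)
    print(att(features, heatmaps).shape)  # torch.Size([2, 256, 24])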
49
myeasymocap/backbone/pare/layers/locallyconnected2d.py
Normal file
@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import torch
import torch.nn as nn
from torch.nn.modules.utils import _pair


class LocallyConnected2d(nn.Module):
    def __init__(self, in_channels, out_channels, output_size, kernel_size, stride, bias=False):
        super(LocallyConnected2d, self).__init__()
        output_size = _pair(output_size)
        self.weight = nn.Parameter(
            torch.randn(1, out_channels, in_channels, output_size[0], output_size[1], kernel_size ** 2),
            requires_grad=True,
        )
        if bias:
            self.bias = nn.Parameter(
                torch.randn(1, out_channels, output_size[0], output_size[1]), requires_grad=True
            )
        else:
            self.register_parameter('bias', None)
        self.kernel_size = _pair(kernel_size)
        self.stride = _pair(stride)

    def forward(self, x):
        _, c, h, w = x.size()
        kh, kw = self.kernel_size
        dh, dw = self.stride
        x = x.unfold(2, kh, dh).unfold(3, kw, dw)
        x = x.contiguous().view(*x.size()[:-2], -1)
        # Sum over the in_channels and kernel_size dims
        out = (x.unsqueeze(1) * self.weight).sum([2, -1])
        if self.bias is not None:
            out += self.bias
        return out
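
# A minimal usage sketch (not part of the original file): a locally connected
# layer has untied weights per output location, unlike a convolution.
if __name__ == '__main__':
    lc = LocallyConnected2d(in_channels=64, out_channels=3, output_size=24,
                            kernel_size=1, stride=1)
    print(lc(torch.rand(2, 64, 24, 24)).shape)  # torch.Size([2, 3, 24, 24])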
152
myeasymocap/backbone/pare/layers/non_local/dot_product.py
Normal file
@ -0,0 +1,152 @@
import torch
from torch import nn
from torch.nn import functional as F


class _NonLocalBlockND(nn.Module):
    def __init__(self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True):
        super(_NonLocalBlockND, self).__init__()

        assert dimension in [1, 2, 3]

        self.dimension = dimension
        self.sub_sample = sub_sample

        self.in_channels = in_channels
        self.inter_channels = inter_channels

        if self.inter_channels is None:
            self.inter_channels = in_channels // 2
            if self.inter_channels == 0:
                self.inter_channels = 1

        if dimension == 3:
            conv_nd = nn.Conv3d
            max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
            bn = nn.BatchNorm3d
        elif dimension == 2:
            conv_nd = nn.Conv2d
            max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
            bn = nn.BatchNorm2d
        else:
            conv_nd = nn.Conv1d
            max_pool_layer = nn.MaxPool1d(kernel_size=2)
            bn = nn.BatchNorm1d

        self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
                         kernel_size=1, stride=1, padding=0)

        if bn_layer:
            self.W = nn.Sequential(
                conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels,
                        kernel_size=1, stride=1, padding=0),
                bn(self.in_channels)
            )
            nn.init.constant_(self.W[1].weight, 0)
            nn.init.constant_(self.W[1].bias, 0)
        else:
            self.W = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels,
                             kernel_size=1, stride=1, padding=0)
            nn.init.constant_(self.W.weight, 0)
            nn.init.constant_(self.W.bias, 0)

        self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
                             kernel_size=1, stride=1, padding=0)

        self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
                           kernel_size=1, stride=1, padding=0)

        if sub_sample:
            self.g = nn.Sequential(self.g, max_pool_layer)
            self.phi = nn.Sequential(self.phi, max_pool_layer)

    def forward(self, x, return_nl_map=False):
        """
        :param x: (b, c, t, h, w)
        :param return_nl_map: if True return z, nl_map, else only return z.
        :return:
        """
        batch_size = x.size(0)

        g_x = self.g(x).view(batch_size, self.inter_channels, -1)
        g_x = g_x.permute(0, 2, 1)

        theta_x = self.theta(x).view(batch_size, self.inter_channels, -1)
        theta_x = theta_x.permute(0, 2, 1)
        phi_x = self.phi(x).view(batch_size, self.inter_channels, -1)
        f = torch.matmul(theta_x, phi_x)
        N = f.size(-1)
        f_div_C = f / N

        y = torch.matmul(f_div_C, g_x)
        y = y.permute(0, 2, 1).contiguous()
        y = y.view(batch_size, self.inter_channels, *x.size()[2:])
        W_y = self.W(y)
        z = W_y + x

        if return_nl_map:
            return z, f_div_C
        return z


class NONLocalBlock1D(_NonLocalBlockND):
    def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
        super(NONLocalBlock1D, self).__init__(in_channels,
                                              inter_channels=inter_channels,
                                              dimension=1, sub_sample=sub_sample,
                                              bn_layer=bn_layer)


class NONLocalBlock2D(_NonLocalBlockND):
    def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
        super(NONLocalBlock2D, self).__init__(in_channels,
                                              inter_channels=inter_channels,
                                              dimension=2, sub_sample=sub_sample,
                                              bn_layer=bn_layer)


class NONLocalBlock3D(_NonLocalBlockND):
    def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
        super(NONLocalBlock3D, self).__init__(in_channels,
                                              inter_channels=inter_channels,
                                              dimension=3, sub_sample=sub_sample,
                                              bn_layer=bn_layer)


if __name__ == '__main__':
    img = torch.zeros(2, 256, 24)
    net = NONLocalBlock1D(
        in_channels=256, inter_channels=None, sub_sample=False, bn_layer=True
    )
    out = net(img)
    print(out.size())

    img = torch.zeros(2, 256, 56, 56)
    net = NONLocalBlock2D(
        in_channels=256, inter_channels=None, sub_sample=False, bn_layer=True
    )
    out = net(img)
    print(out.size())
57
myeasymocap/backbone/pare/layers/nonlocalattention.py
Normal file
@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import torch
import torch.nn as nn
import torch.nn.functional as F


class NonLocalAttention(nn.Module):
    def __init__(
            self,
            in_channels=256,
            out_channels=256,
    ):
        super(NonLocalAttention, self).__init__()
        self.conv1x1 = nn.Conv1d(in_channels, out_channels, kernel_size=1)

    def forward(self, input):
        '''
        input [N, Feats, J, 1]
        output [N, Feats, J, 1]
        '''
        batch_size, n_feats, n_joints, _ = input.shape
        input = input.squeeze(-1)

        # Compute attention weights
        attention = torch.matmul(input.transpose(2, 1), input)
        norm_attention = F.softmax(attention, dim=-1)

        # Compute final dot product
        out = torch.matmul(input, norm_attention)
        out = self.conv1x1(out)

        out = out.unsqueeze(-1)  # [N, F, J, 1]
        return out


if __name__ == '__main__':
    nla = NonLocalAttention()
    inp = torch.rand(32, 256, 24, 1)
    out = nla(inp)
    print(out.shape)
154
myeasymocap/backbone/pare/layers/softargmax.py
Normal file
@ -0,0 +1,154 @@
# -*- coding: utf-8 -*-

# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
# holder of all proprietary rights on this computer program.
# You can only use this computer program if you have closed
# a license agreement with MPG or you get the right to use the computer
# program from someone who is authorized to grant you that right.
# Any use of the computer program without a valid license is prohibited and
# liable to prosecution.
#
# Copyright©2019 Max-Planck-Gesellschaft zur Förderung
# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
# for Intelligent Systems. All rights reserved.
#
# Contact: ps-license@tuebingen.mpg.de

import torch
import torch.nn.functional as F


def _softmax(tensor, temperature, dim=-1):
    return F.softmax(tensor * temperature, dim=dim)


def softargmax1d(
        heatmaps,
        temperature=None,
        normalize_keypoints=True,
):
    dtype, device = heatmaps.dtype, heatmaps.device
    if temperature is None:
        temperature = torch.tensor(1.0, dtype=dtype, device=device)
    batch_size, num_channels, dim = heatmaps.shape
    points = torch.arange(0, dim, device=device, dtype=dtype).reshape(1, 1, dim).expand(batch_size, -1, -1)
    normalized_heatmap = _softmax(
        heatmaps.reshape(batch_size, num_channels, -1),
        temperature=temperature.reshape(1, -1, 1),
        dim=-1)

    # Should be BxJ
    keypoints = (normalized_heatmap.reshape(batch_size, -1, dim) * points).sum(dim=-1)

    if normalize_keypoints:
        # Normalize keypoints to [-1, 1]
        keypoints = (keypoints / (dim - 1) * 2 - 1)

    return keypoints, normalized_heatmap.reshape(
        batch_size, -1, dim)


def softargmax2d(
        heatmaps,
        temperature=None,
        normalize_keypoints=True,
):
    dtype, device = heatmaps.dtype, heatmaps.device
    if temperature is None:
        temperature = torch.tensor(1.0, dtype=dtype, device=device)
    batch_size, num_channels, height, width = heatmaps.shape
    x = torch.arange(0, width, device=device, dtype=dtype).reshape(1, 1, 1, width).expand(batch_size, -1, height, -1)
    y = torch.arange(0, height, device=device, dtype=dtype).reshape(1, 1, height, 1).expand(batch_size, -1, -1, width)
    # Should be Bx2xHxW
    points = torch.cat([x, y], dim=1)
    normalized_heatmap = _softmax(
        heatmaps.reshape(batch_size, num_channels, -1),
        temperature=temperature.reshape(1, -1, 1),
        dim=-1)

    # Should be BxJx2
    keypoints = (
        normalized_heatmap.reshape(batch_size, -1, 1, height * width) *
        points.reshape(batch_size, 1, 2, -1)).sum(dim=-1)

    if normalize_keypoints:
        # Normalize keypoints to [-1, 1]
        keypoints[:, :, 0] = (keypoints[:, :, 0] / (width - 1) * 2 - 1)
        keypoints[:, :, 1] = (keypoints[:, :, 1] / (height - 1) * 2 - 1)

    return keypoints, normalized_heatmap.reshape(
        batch_size, -1, height, width)


def softargmax3d(
        heatmaps,
        temperature=None,
        normalize_keypoints=True,
):
    dtype, device = heatmaps.dtype, heatmaps.device
    if temperature is None:
        temperature = torch.tensor(1.0, dtype=dtype, device=device)
    batch_size, num_channels, height, width, depth = heatmaps.shape
    x = torch.arange(0, width, device=device, dtype=dtype).reshape(1, 1, 1, width, 1).expand(batch_size, -1, height, -1, depth)
    y = torch.arange(0, height, device=device, dtype=dtype).reshape(1, 1, height, 1, 1).expand(batch_size, -1, -1, width, depth)
    z = torch.arange(0, depth, device=device, dtype=dtype).reshape(1, 1, 1, 1, depth).expand(batch_size, -1, height, width, -1)
    # Should be Bx3xHxWxD
    points = torch.cat([x, y, z], dim=1)
    normalized_heatmap = _softmax(
        heatmaps.reshape(batch_size, num_channels, -1),
        temperature=temperature.reshape(1, -1, 1),
        dim=-1)

    # Should be BxJx3
    keypoints = (
        normalized_heatmap.reshape(batch_size, -1, 1, height * width * depth) *
        points.reshape(batch_size, 1, 3, -1)).sum(dim=-1)

    if normalize_keypoints:
        # Normalize keypoints to [-1, 1]
        keypoints[:, :, 0] = (keypoints[:, :, 0] / (width - 1) * 2 - 1)
        keypoints[:, :, 1] = (keypoints[:, :, 1] / (height - 1) * 2 - 1)
        keypoints[:, :, 2] = (keypoints[:, :, 2] / (depth - 1) * 2 - 1)

    return keypoints, normalized_heatmap.reshape(
        batch_size, -1, height, width, depth)


def get_heatmap_preds(batch_heatmaps, normalize_keypoints=True):
    '''
    get predictions from score maps
    batch_heatmaps: torch.Tensor([batch_size, num_joints, height, width])
    '''
    assert batch_heatmaps.ndim == 4, 'batch_heatmaps should be 4-dim'

    batch_size = batch_heatmaps.shape[0]
    num_joints = batch_heatmaps.shape[1]
    height = batch_heatmaps.shape[2]
    width = batch_heatmaps.shape[3]
    heatmaps_reshaped = batch_heatmaps.reshape((batch_size, num_joints, -1))

    maxvals, idx = torch.max(heatmaps_reshaped, 2)

    maxvals = maxvals.reshape((batch_size, num_joints, 1))
    idx = idx.reshape((batch_size, num_joints, 1))

    preds = idx.repeat(1, 1, 2).float()

    preds[:, :, 0] = (preds[:, :, 0]) % width
    preds[:, :, 1] = torch.floor((preds[:, :, 1]) / width)

    pred_mask = torch.gt(maxvals, 0.0).repeat(1, 1, 2)
    pred_mask = pred_mask.float()

    preds *= pred_mask

    if normalize_keypoints:
        # Normalize keypoints to [-1, 1]
        preds[:, :, 0] = (preds[:, :, 0] / (width - 1) * 2 - 1)
        preds[:, :, 1] = (preds[:, :, 1] / (height - 1) * 2 - 1)

    return preds, maxvals
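
# A minimal usage sketch (not part of the original file): soft-argmax vs. hard
# argmax on a synthetic heatmap peaked at a single pixel.
if __name__ == '__main__':
    hm = torch.zeros(1, 1, 64, 64)
    hm[0, 0, 10, 20] = 10.  # peak at (x=20, y=10)
    soft_kp, _ = softargmax2d(hm, temperature=torch.tensor(10.))
    hard_kp, conf = get_heatmap_preds(hm)
    print(soft_kp, hard_kp)  # both near the normalized peak location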
262
myeasymocap/backbone/pare/pare.py
Normal file
@ -0,0 +1,262 @@
import os
from os.path import join

import torch
import torch.nn as nn

from .config import update_hparams
# from .head import PareHead, SMPLHead, SMPLCamHead
from .head import PareHead
from .backbone.utils import get_backbone_info
from .backbone.hrnet import hrnet_w32
from easymocap.multistage.torchgeometry import rotation_matrix_to_axis_angle


def try_to_download():
    model_dir = os.path.join(os.path.dirname(__file__), '..', '..', 'models', 'pare')
    cmd = 'wget https://www.dropbox.com/s/aeulffqzb3zmh8x/pare-github-data.zip'
    os.system(cmd)
    os.makedirs(model_dir, exist_ok=True)
    cmd = 'unzip pare-github-data.zip -d {}'.format(model_dir)
    os.system(cmd)


CFG = 'models/pare/data/pare/checkpoints/pare_w_3dpw_config.yaml'
CKPT = 'models/pare/data/pare/checkpoints/pare_w_3dpw_checkpoint.ckpt'


class PARE(nn.Module):
    def __init__(
            self,
            num_joints=24,
            softmax_temp=1.0,
            num_features_smpl=64,
            backbone='resnet50',
            focal_length=5000.,
            img_res=224,
            pretrained=None,
            iterative_regression=False,
            iter_residual=False,
            num_iterations=3,
            shape_input_type='feats',  # 'feats.all_pose.shape.cam',
            pose_input_type='feats',  # 'feats.neighbor_pose_feats.all_pose.self_pose.neighbor_pose.shape.cam'
            pose_mlp_num_layers=1,
            shape_mlp_num_layers=1,
            pose_mlp_hidden_size=256,
            shape_mlp_hidden_size=256,
            use_keypoint_features_for_smpl_regression=False,
            use_heatmaps='',
            use_keypoint_attention=False,
            keypoint_attention_act='softmax',
            use_postconv_keypoint_attention=False,
            use_scale_keypoint_attention=False,
            use_final_nonlocal=None,
            use_branch_nonlocal=None,
            use_hmr_regression=False,
            use_coattention=False,
            num_coattention_iter=1,
            coattention_conv='simple',
            deconv_conv_kernel_size=4,
            use_upsampling=False,
            use_soft_attention=False,
            num_branch_iteration=0,
            branch_deeper=False,
            num_deconv_layers=3,
            num_deconv_filters=256,
            use_resnet_conv_hrnet=False,
            use_position_encodings=None,
            use_mean_camshape=False,
            use_mean_pose=False,
            init_xavier=False,
            use_cam=False,
    ):
        super(PARE, self).__init__()
        # NOTE: only hrnet_w32 is imported above; any other backbone has to be
        # imported into this module before eval() can resolve it.
        if backbone.startswith('hrnet'):
            # hrnet_w32-conv, hrnet_w32-interp
            backbone, use_conv = backbone.split('-')
            self.backbone = eval(backbone)(
                pretrained=True,
                downsample=False,
                use_conv=(use_conv == 'conv')
            )
        else:
            self.backbone = eval(backbone)(pretrained=True)

        self.head = PareHead(
            num_joints=num_joints,
            num_input_features=get_backbone_info(backbone)['n_output_channels'],
            softmax_temp=softmax_temp,
            num_deconv_layers=num_deconv_layers,
            num_deconv_filters=[num_deconv_filters] * num_deconv_layers,
            num_deconv_kernels=[deconv_conv_kernel_size] * num_deconv_layers,
            num_features_smpl=num_features_smpl,
            final_conv_kernel=1,
            iterative_regression=iterative_regression,
            iter_residual=iter_residual,
            num_iterations=num_iterations,
            shape_input_type=shape_input_type,
            pose_input_type=pose_input_type,
            pose_mlp_num_layers=pose_mlp_num_layers,
            shape_mlp_num_layers=shape_mlp_num_layers,
            pose_mlp_hidden_size=pose_mlp_hidden_size,
            shape_mlp_hidden_size=shape_mlp_hidden_size,
            use_keypoint_features_for_smpl_regression=use_keypoint_features_for_smpl_regression,
            use_heatmaps=use_heatmaps,
            use_keypoint_attention=use_keypoint_attention,
            use_postconv_keypoint_attention=use_postconv_keypoint_attention,
            keypoint_attention_act=keypoint_attention_act,
            use_scale_keypoint_attention=use_scale_keypoint_attention,
            use_branch_nonlocal=use_branch_nonlocal,  # 'concatenation', 'dot_product', 'embedded_gaussian', 'gaussian'
            use_final_nonlocal=use_final_nonlocal,  # 'concatenation', 'dot_product', 'embedded_gaussian', 'gaussian'
            backbone=backbone,
            use_hmr_regression=use_hmr_regression,
            use_coattention=use_coattention,
            num_coattention_iter=num_coattention_iter,
            coattention_conv=coattention_conv,
            use_upsampling=use_upsampling,
            use_soft_attention=use_soft_attention,
            num_branch_iteration=num_branch_iteration,
            branch_deeper=branch_deeper,
            use_resnet_conv_hrnet=use_resnet_conv_hrnet,
            use_position_encodings=use_position_encodings,
            use_mean_camshape=use_mean_camshape,
            use_mean_pose=use_mean_pose,
            init_xavier=init_xavier,
        )

        self.use_cam = use_cam
        # The SMPL heads are not needed here, since forward() only returns
        # SMPL parameters:
        # if self.use_cam:
        #     self.smpl = SMPLCamHead(img_res=img_res)
        # else:
        #     self.smpl = SMPLHead(focal_length=focal_length, img_res=img_res)

        if pretrained is not None:
            self.load_pretrained(pretrained)

    def forward(
            self,
            images,
            gt_segm=None,
    ):
        features = self.backbone(images)
        hmr_output = self.head(features, gt_segm=gt_segm)
        rotmat = hmr_output['pred_pose']
        shape = hmr_output['pred_shape']
        rotmat_flat = rotmat.reshape(-1, 3, 3)
        rvec_flat = rotation_matrix_to_axis_angle(rotmat_flat)
        rvec = rvec_flat.reshape(*rotmat.shape[:-2], 3)
        rvec = rvec.reshape(*rvec.shape[:-2], -1)
        return {
            'Rh': rvec[..., :3],
            'Th': torch.zeros_like(rvec[..., :3]),
            'poses': rvec[..., 3:],
            'shapes': shape,
        }


from ..basetopdown import BaseTopDownModelCache


class NullSPIN:
    def __init__(self, ckpt) -> None:
        self.name = 'spin'

    def __call__(self, bbox, images, imgname):
        from easymocap.mytools.reader import read_smpl
        basename = os.path.basename(imgname)
        # NOTE: self.output is expected to be set externally before calling.
        cachename = join(self.output, self.name, basename.replace('.jpg', '.json'))
        if os.path.exists(cachename):
            params = read_smpl(cachename)
            params = params[0]
            params = {key: val[0] for key, val in params.items() if key != 'id'}
            ret = {
                'params': params
            }
            return ret
        else:
            raise FileNotFoundError(f'Missing cached SPIN result: {cachename}')


class MyPARE(BaseTopDownModelCache):
    def __init__(self, ckpt) -> None:
        super().__init__('pare', bbox_scale=1.1, res_input=224)
        if not os.path.exists(CFG):
            from ...io.model import try_to_download_SMPL
            try_to_download_SMPL('models/pare')
        self.model_cfg = update_hparams(CFG)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model = self._build_model()
        self._load_pretrained_model(CKPT)
        self.model.eval()
        self.model.to(self.device)

    def __call__(self, bbox, images, imgnames):
        return super().__call__(bbox[0], images, imgnames)

    def _build_model(self):
        # ========= Define PARE model ========= #
        model_cfg = self.model_cfg

        if model_cfg.METHOD == 'pare':
            model = PARE(
                backbone=model_cfg.PARE.BACKBONE,
                num_joints=model_cfg.PARE.NUM_JOINTS,
                softmax_temp=model_cfg.PARE.SOFTMAX_TEMP,
                num_features_smpl=model_cfg.PARE.NUM_FEATURES_SMPL,
                focal_length=model_cfg.DATASET.FOCAL_LENGTH,
                img_res=model_cfg.DATASET.IMG_RES,
                pretrained=model_cfg.TRAINING.PRETRAINED,
                iterative_regression=model_cfg.PARE.ITERATIVE_REGRESSION,
                num_iterations=model_cfg.PARE.NUM_ITERATIONS,
                iter_residual=model_cfg.PARE.ITER_RESIDUAL,
                shape_input_type=model_cfg.PARE.SHAPE_INPUT_TYPE,
                pose_input_type=model_cfg.PARE.POSE_INPUT_TYPE,
                pose_mlp_num_layers=model_cfg.PARE.POSE_MLP_NUM_LAYERS,
                shape_mlp_num_layers=model_cfg.PARE.SHAPE_MLP_NUM_LAYERS,
                pose_mlp_hidden_size=model_cfg.PARE.POSE_MLP_HIDDEN_SIZE,
                shape_mlp_hidden_size=model_cfg.PARE.SHAPE_MLP_HIDDEN_SIZE,
                use_keypoint_features_for_smpl_regression=model_cfg.PARE.USE_KEYPOINT_FEATURES_FOR_SMPL_REGRESSION,
                use_heatmaps=model_cfg.DATASET.USE_HEATMAPS,
                use_keypoint_attention=model_cfg.PARE.USE_KEYPOINT_ATTENTION,
                use_postconv_keypoint_attention=model_cfg.PARE.USE_POSTCONV_KEYPOINT_ATTENTION,
                use_scale_keypoint_attention=model_cfg.PARE.USE_SCALE_KEYPOINT_ATTENTION,
                keypoint_attention_act=model_cfg.PARE.KEYPOINT_ATTENTION_ACT,
                use_final_nonlocal=model_cfg.PARE.USE_FINAL_NONLOCAL,
                use_branch_nonlocal=model_cfg.PARE.USE_BRANCH_NONLOCAL,
                use_hmr_regression=model_cfg.PARE.USE_HMR_REGRESSION,
                use_coattention=model_cfg.PARE.USE_COATTENTION,
                num_coattention_iter=model_cfg.PARE.NUM_COATTENTION_ITER,
                coattention_conv=model_cfg.PARE.COATTENTION_CONV,
                use_upsampling=model_cfg.PARE.USE_UPSAMPLING,
                deconv_conv_kernel_size=model_cfg.PARE.DECONV_CONV_KERNEL_SIZE,
                use_soft_attention=model_cfg.PARE.USE_SOFT_ATTENTION,
                num_branch_iteration=model_cfg.PARE.NUM_BRANCH_ITERATION,
                branch_deeper=model_cfg.PARE.BRANCH_DEEPER,
                num_deconv_layers=model_cfg.PARE.NUM_DECONV_LAYERS,
                num_deconv_filters=model_cfg.PARE.NUM_DECONV_FILTERS,
                use_resnet_conv_hrnet=model_cfg.PARE.USE_RESNET_CONV_HRNET,
                use_position_encodings=model_cfg.PARE.USE_POS_ENC,
                use_mean_camshape=model_cfg.PARE.USE_MEAN_CAMSHAPE,
                use_mean_pose=model_cfg.PARE.USE_MEAN_POSE,
                init_xavier=model_cfg.PARE.INIT_XAVIER,
            ).to(self.device)
        else:
            raise ValueError(f'Unknown METHOD: {model_cfg.METHOD}')

        return model

    def _load_pretrained_model(self, ckpt):
        # ========= Load pretrained weights ========= #
        state_dict = torch.load(ckpt, map_location='cpu')['state_dict']
        new_state_dict = {}
        for pk in state_dict.keys():
            if pk.startswith('model.'):
                new_state_dict[pk.replace('model.', '')] = state_dict[pk]
            else:
                new_state_dict[pk] = state_dict[pk]

        self.model.load_state_dict(new_state_dict, strict=False)


if __name__ == '__main__':
    pass
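
# A minimal usage sketch (not part of the original file): a single forward pass
# of the PARE wrapper on a random crop. This assumes the HRNet-W32 ImageNet
# checkpoint is available where hrnet_w32(pretrained=True) expects it; the head
# weights stay random, so only the output shapes are meaningful.
def _demo_forward():
    model = PARE(backbone='hrnet_w32-conv', num_joints=24)
    model.eval()
    with torch.no_grad():
        out = model(torch.rand(1, 3, 224, 224))
    for k, v in out.items():
        print(k, v.shape)  # Rh (1, 3), Th (1, 3), poses (1, 69), shapes (1, 10)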
722
myeasymocap/backbone/pare/utils/geometry.py
Normal file
@ -0,0 +1,722 @@
|
|||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
from torch.nn import functional as F
|
||||||
|
|
||||||
|
"""
|
||||||
|
Useful geometric operations, e.g. Perspective projection and a differentiable Rodrigues formula
|
||||||
|
Parts of the code are taken from https://github.com/MandyMo/pytorch_HMR
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def batch_rot2aa(Rs):
|
||||||
|
"""
|
||||||
|
Rs is B x 3 x 3
|
||||||
|
void cMathUtil::RotMatToAxisAngle(const tMatrix& mat, tVector& out_axis,
|
||||||
|
double& out_theta)
|
||||||
|
{
|
||||||
|
double c = 0.5 * (mat(0, 0) + mat(1, 1) + mat(2, 2) - 1);
|
||||||
|
c = cMathUtil::Clamp(c, -1.0, 1.0);
|
||||||
|
|
||||||
|
out_theta = std::acos(c);
|
||||||
|
|
||||||
|
if (std::abs(out_theta) < 0.00001)
|
||||||
|
{
|
||||||
|
out_axis = tVector(0, 0, 1, 0);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
double m21 = mat(2, 1) - mat(1, 2);
|
||||||
|
double m02 = mat(0, 2) - mat(2, 0);
|
||||||
|
double m10 = mat(1, 0) - mat(0, 1);
|
||||||
|
double denom = std::sqrt(m21 * m21 + m02 * m02 + m10 * m10);
|
||||||
|
out_axis[0] = m21 / denom;
|
||||||
|
out_axis[1] = m02 / denom;
|
||||||
|
out_axis[2] = m10 / denom;
|
||||||
|
out_axis[3] = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
cos = 0.5 * (torch.stack([torch.trace(x) for x in Rs]) - 1)
|
||||||
|
cos = torch.clamp(cos, -1, 1)
|
||||||
|
|
||||||
|
theta = torch.acos(cos)
|
||||||
|
|
||||||
|
m21 = Rs[:, 2, 1] - Rs[:, 1, 2]
|
||||||
|
m02 = Rs[:, 0, 2] - Rs[:, 2, 0]
|
||||||
|
m10 = Rs[:, 1, 0] - Rs[:, 0, 1]
|
||||||
|
denom = torch.sqrt(m21 * m21 + m02 * m02 + m10 * m10)
|
||||||
|
|
||||||
|
axis0 = torch.where(torch.abs(theta) < 0.00001, m21, m21 / denom)
|
||||||
|
axis1 = torch.where(torch.abs(theta) < 0.00001, m02, m02 / denom)
|
||||||
|
axis2 = torch.where(torch.abs(theta) < 0.00001, m10, m10 / denom)
|
||||||
|
|
||||||
|
return theta.unsqueeze(1) * torch.stack([axis0, axis1, axis2], 1)
|
||||||
|
|
||||||
|
|
||||||
|
def batch_rodrigues(theta):
    """Convert axis-angle representation to rotation matrix.
    Args:
        theta: size = [B, 3]
    Returns:
        Rotation matrix corresponding to the quaternion -- size = [B, 3, 3]
    """
    l1norm = torch.norm(theta + 1e-8, p=2, dim=1)
    angle = torch.unsqueeze(l1norm, -1)
    normalized = torch.div(theta, angle)
    angle = angle * 0.5
    v_cos = torch.cos(angle)
    v_sin = torch.sin(angle)
    quat = torch.cat([v_cos, v_sin * normalized], dim=1)
    return quat_to_rotmat(quat)

def quat_to_rotmat(quat):
    """Convert quaternion coefficients to rotation matrix.
    Args:
        quat: size = [B, 4] 4 <===> (w, x, y, z)
    Returns:
        Rotation matrix corresponding to the quaternion -- size = [B, 3, 3]
    """
    norm_quat = quat
    norm_quat = norm_quat / norm_quat.norm(p=2, dim=1, keepdim=True)
    w, x, y, z = norm_quat[:, 0], norm_quat[:, 1], norm_quat[:, 2], norm_quat[:, 3]

    B = quat.size(0)

    w2, x2, y2, z2 = w.pow(2), x.pow(2), y.pow(2), z.pow(2)
    wx, wy, wz = w * x, w * y, w * z
    xy, xz, yz = x * y, x * z, y * z

    rotMat = torch.stack([w2 + x2 - y2 - z2, 2 * xy - 2 * wz, 2 * wy + 2 * xz,
                          2 * wz + 2 * xy, w2 - x2 + y2 - z2, 2 * yz - 2 * wx,
                          2 * xz - 2 * wy, 2 * wx + 2 * yz, w2 - x2 - y2 + z2], dim=1).view(B, 3, 3)
    return rotMat

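# Illustrative check (added for exposition, not part of the original file):
# the identity quaternion (w, x, y, z) = (1, 0, 0, 0) maps to the identity
# matrix. Never called by library code.
def _example_quat_identity():
    q = torch.tensor([[1.0, 0.0, 0.0, 0.0]])
    assert torch.allclose(quat_to_rotmat(q), torch.eye(3).unsqueeze(0))
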
def rot6d_to_rotmat(x):
    """Convert 6D rotation representation to 3x3 rotation matrix.
    Based on Zhou et al., "On the Continuity of Rotation Representations in Neural Networks", CVPR 2019
    Input:
        (B,6) Batch of 6-D rotation representations
    Output:
        (B,3,3) Batch of corresponding rotation matrices
    """
    x = x.reshape(-1, 3, 2)
    a1 = x[:, :, 0]
    a2 = x[:, :, 1]
    b1 = F.normalize(a1)
    b2 = F.normalize(a2 - torch.einsum('bi,bi->b', b1, a2).unsqueeze(-1) * b1)
    b3 = torch.cross(b1, b2)
    return torch.stack((b1, b2, b3), dim=-1)


def rotmat_to_rot6d(x):
    rotmat = x.reshape(-1, 3, 3)
    rot6d = rotmat[:, :, :2].reshape(x.shape[0], -1)
    return rot6d

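# Illustrative round-trip check (added for exposition, not part of the
# original file): rot6d_to_rotmat inverts rotmat_to_rot6d for valid
# rotations, since Gram-Schmidt leaves orthonormal columns unchanged.
def _example_rot6d_roundtrip():
    R = batch_rodrigues(torch.tensor([[0.1, 0.2, 0.3]]))  # (1, 3, 3)
    R_rec = rot6d_to_rotmat(rotmat_to_rot6d(R))
    assert torch.allclose(R, R_rec, atol=1e-5)
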
def rotation_matrix_to_angle_axis(rotation_matrix):
    """
    This function is borrowed from https://github.com/kornia/kornia

    Convert 3x4 rotation matrix to Rodrigues vector

    Args:
        rotation_matrix (Tensor): rotation matrix.

    Returns:
        Tensor: Rodrigues vector transformation.

    Shape:
        - Input: :math:`(N, 3, 4)`
        - Output: :math:`(N, 3)`

    Example:
        >>> input = torch.rand(2, 3, 4)  # Nx3x4
        >>> output = tgm.rotation_matrix_to_angle_axis(input)  # Nx3
    """
    if rotation_matrix.shape[1:] == (3, 3):
        rot_mat = rotation_matrix.reshape(-1, 3, 3)
        hom = torch.tensor([0, 0, 1], dtype=torch.float32,
                           device=rotation_matrix.device).reshape(1, 3, 1).expand(rot_mat.shape[0], -1, -1)
        rotation_matrix = torch.cat([rot_mat, hom], dim=-1)

    quaternion = rotation_matrix_to_quaternion(rotation_matrix)
    aa = quaternion_to_angle_axis(quaternion)
    aa[torch.isnan(aa)] = 0.0
    return aa

def quaternion_to_angle_axis(quaternion: torch.Tensor) -> torch.Tensor:
    """
    This function is borrowed from https://github.com/kornia/kornia

    Convert quaternion vector to angle axis of rotation.

    Adapted from ceres C++ library: ceres-solver/include/ceres/rotation.h

    Args:
        quaternion (torch.Tensor): tensor with quaternions.

    Return:
        torch.Tensor: tensor with angle axis of rotation.

    Shape:
        - Input: :math:`(*, 4)` where `*` means, any number of dimensions
        - Output: :math:`(*, 3)`

    Example:
        >>> quaternion = torch.rand(2, 4)  # Nx4
        >>> angle_axis = tgm.quaternion_to_angle_axis(quaternion)  # Nx3
    """
    if not torch.is_tensor(quaternion):
        raise TypeError("Input type is not a torch.Tensor. Got {}".format(
            type(quaternion)))

    if not quaternion.shape[-1] == 4:
        raise ValueError("Input must be a tensor of shape Nx4 or 4. Got {}"
                         .format(quaternion.shape))
    # unpack input and compute conversion
    q1: torch.Tensor = quaternion[..., 1]
    q2: torch.Tensor = quaternion[..., 2]
    q3: torch.Tensor = quaternion[..., 3]
    sin_squared_theta: torch.Tensor = q1 * q1 + q2 * q2 + q3 * q3

    sin_theta: torch.Tensor = torch.sqrt(sin_squared_theta)
    cos_theta: torch.Tensor = quaternion[..., 0]
    two_theta: torch.Tensor = 2.0 * torch.where(
        cos_theta < 0.0,
        torch.atan2(-sin_theta, -cos_theta),
        torch.atan2(sin_theta, cos_theta))

    k_pos: torch.Tensor = two_theta / sin_theta
    k_neg: torch.Tensor = 2.0 * torch.ones_like(sin_theta)
    k: torch.Tensor = torch.where(sin_squared_theta > 0.0, k_pos, k_neg)

    angle_axis: torch.Tensor = torch.zeros_like(quaternion)[..., :3]
    angle_axis[..., 0] += q1 * k
    angle_axis[..., 1] += q2 * k
    angle_axis[..., 2] += q3 * k
    return angle_axis

def rotation_matrix_to_quaternion(rotation_matrix, eps=1e-6):
    """
    This function is borrowed from https://github.com/kornia/kornia

    Convert 3x4 rotation matrix to 4d quaternion vector

    This algorithm is based on algorithm described in
    https://github.com/KieranWynn/pyquaternion/blob/master/pyquaternion/quaternion.py#L201

    Args:
        rotation_matrix (Tensor): the rotation matrix to convert.

    Return:
        Tensor: the rotation in quaternion

    Shape:
        - Input: :math:`(N, 3, 4)`
        - Output: :math:`(N, 4)`

    Example:
        >>> input = torch.rand(4, 3, 4)  # Nx3x4
        >>> output = tgm.rotation_matrix_to_quaternion(input)  # Nx4
    """
    if not torch.is_tensor(rotation_matrix):
        raise TypeError("Input type is not a torch.Tensor. Got {}".format(
            type(rotation_matrix)))

    if len(rotation_matrix.shape) > 3:
        raise ValueError(
            "Input size must be a three dimensional tensor. Got {}".format(
                rotation_matrix.shape))
    if not rotation_matrix.shape[-2:] == (3, 4):
        raise ValueError(
            "Input size must be a N x 3 x 4 tensor. Got {}".format(
                rotation_matrix.shape))

    rmat_t = torch.transpose(rotation_matrix, 1, 2)

    mask_d2 = rmat_t[:, 2, 2] < eps

    mask_d0_d1 = rmat_t[:, 0, 0] > rmat_t[:, 1, 1]
    mask_d0_nd1 = rmat_t[:, 0, 0] < -rmat_t[:, 1, 1]

    t0 = 1 + rmat_t[:, 0, 0] - rmat_t[:, 1, 1] - rmat_t[:, 2, 2]
    q0 = torch.stack([rmat_t[:, 1, 2] - rmat_t[:, 2, 1],
                      t0, rmat_t[:, 0, 1] + rmat_t[:, 1, 0],
                      rmat_t[:, 2, 0] + rmat_t[:, 0, 2]], -1)
    t0_rep = t0.repeat(4, 1).t()

    t1 = 1 - rmat_t[:, 0, 0] + rmat_t[:, 1, 1] - rmat_t[:, 2, 2]
    q1 = torch.stack([rmat_t[:, 2, 0] - rmat_t[:, 0, 2],
                      rmat_t[:, 0, 1] + rmat_t[:, 1, 0],
                      t1, rmat_t[:, 1, 2] + rmat_t[:, 2, 1]], -1)
    t1_rep = t1.repeat(4, 1).t()

    t2 = 1 - rmat_t[:, 0, 0] - rmat_t[:, 1, 1] + rmat_t[:, 2, 2]
    q2 = torch.stack([rmat_t[:, 0, 1] - rmat_t[:, 1, 0],
                      rmat_t[:, 2, 0] + rmat_t[:, 0, 2],
                      rmat_t[:, 1, 2] + rmat_t[:, 2, 1], t2], -1)
    t2_rep = t2.repeat(4, 1).t()

    t3 = 1 + rmat_t[:, 0, 0] + rmat_t[:, 1, 1] + rmat_t[:, 2, 2]
    q3 = torch.stack([t3, rmat_t[:, 1, 2] - rmat_t[:, 2, 1],
                      rmat_t[:, 2, 0] - rmat_t[:, 0, 2],
                      rmat_t[:, 0, 1] - rmat_t[:, 1, 0]], -1)
    t3_rep = t3.repeat(4, 1).t()

    mask_c0 = mask_d2 * mask_d0_d1
    mask_c1 = mask_d2 * ~mask_d0_d1
    mask_c2 = ~mask_d2 * mask_d0_nd1
    mask_c3 = ~mask_d2 * ~mask_d0_nd1
    mask_c0 = mask_c0.view(-1, 1).type_as(q0)
    mask_c1 = mask_c1.view(-1, 1).type_as(q1)
    mask_c2 = mask_c2.view(-1, 1).type_as(q2)
    mask_c3 = mask_c3.view(-1, 1).type_as(q3)

    q = q0 * mask_c0 + q1 * mask_c1 + q2 * mask_c2 + q3 * mask_c3
    q /= torch.sqrt(t0_rep * mask_c0 + t1_rep * mask_c1 +  # noqa
                    t2_rep * mask_c2 + t3_rep * mask_c3)  # noqa
    q *= 0.5
    return q

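# Illustrative check (added for exposition, not part of the original file):
# the identity rotation, padded to the expected N x 3 x 4 shape, maps to the
# identity quaternion (w, x, y, z) = (1, 0, 0, 0).
def _example_rotmat_to_quaternion():
    R34 = torch.cat([torch.eye(3).unsqueeze(0), torch.zeros(1, 3, 1)], dim=-1)
    q = rotation_matrix_to_quaternion(R34)   # (1, 4)
    assert torch.allclose(q, torch.tensor([[1.0, 0.0, 0.0, 0.0]]))
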
def convert_perspective_to_weak_perspective(
        perspective_camera,
        focal_length=5000.,
        img_res=224,
):
    # Convert a perspective camera translation [tx, ty, tz] to a weak
    # perspective camera [s, tx, ty] given the bounding box size
    # (the inverse of convert_weak_perspective_to_perspective below)
    # if isinstance(focal_length, torch.Tensor):
    #     focal_length = focal_length[:, 0]

    weak_perspective_camera = torch.stack(
        [
            2 * focal_length / (img_res * perspective_camera[:, 2] + 1e-9),
            perspective_camera[:, 0],
            perspective_camera[:, 1],
        ],
        dim=-1
    )
    return weak_perspective_camera

def convert_weak_perspective_to_perspective(
        weak_perspective_camera,
        focal_length=5000.,
        img_res=224,
):
    # Convert Weak Perspective Camera [s, tx, ty] to camera translation [tx, ty, tz]
    # in 3D given the bounding box size
    # This camera translation can be used in a full perspective projection
    # if isinstance(focal_length, torch.Tensor):
    #     focal_length = focal_length[:, 0]

    perspective_camera = torch.stack(
        [
            weak_perspective_camera[:, 1],
            weak_perspective_camera[:, 2],
            2 * focal_length / (img_res * weak_perspective_camera[:, 0] + 1e-9)
        ],
        dim=-1
    )
    return perspective_camera

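# Illustrative check (added for exposition, not part of the original file):
# the two conversions above are mutual inverses for a positive scale s.
def _example_camera_roundtrip():
    wp = torch.tensor([[1.2, 0.05, -0.1]])                # [s, tx, ty]
    t = convert_weak_perspective_to_perspective(wp)       # [tx, ty, tz]
    assert torch.allclose(convert_perspective_to_weak_perspective(t), wp,
                          atol=1e-4)
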
def perspective_projection(points, rotation, translation,
                           focal_length, camera_center):
    """
    This function computes the perspective projection of a set of points.
    Input:
        points (bs, N, 3): 3D points
        rotation (bs, 3, 3): Camera rotation
        translation (bs, 3): Camera translation
        focal_length (bs,) or scalar: Focal length
        camera_center (bs, 2): Camera center
    """
    batch_size = points.shape[0]
    K = torch.zeros([batch_size, 3, 3], device=points.device)
    K[:, 0, 0] = focal_length
    K[:, 1, 1] = focal_length
    K[:, 2, 2] = 1.
    K[:, :-1, -1] = camera_center

    # Transform points
    points = torch.einsum('bij,bkj->bki', rotation, points)
    points = points + translation.unsqueeze(1)

    # Apply perspective distortion
    projected_points = points / points[:, :, -1].unsqueeze(-1)

    # Apply camera intrinsics
    projected_points = torch.einsum('bij,bkj->bki', K, projected_points)

    return projected_points[:, :, :-1]

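# Illustrative usage sketch (added for exposition, not part of the original
# file): project a batch of points in front of an identity camera.
def _example_perspective_projection():
    points = torch.randn(2, 10, 3) + torch.tensor([0., 0., 5.])  # keep z > 0
    rotation = torch.eye(3).unsqueeze(0).expand(2, -1, -1)
    translation = torch.zeros(2, 3)
    camera_center = torch.full((2, 2), 112.)   # principal point in pixels
    uv = perspective_projection(points, rotation, translation,
                                focal_length=5000., camera_center=camera_center)
    assert uv.shape == (2, 10, 2)
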
def weak_perspective_projection(points, rotation, weak_cam_params, focal_length, camera_center, img_res):
    """
    This function computes the perspective projection of a set of points,
    first converting the weak-perspective camera to a translation.
    Input:
        points (bs, N, 3): 3D points
        rotation (bs, 3, 3): Camera rotation
        weak_cam_params (bs, 3): Weak perspective camera [s, tx, ty]
        focal_length (bs,) or scalar: Focal length
        camera_center (bs, 2): Camera center
    """
    batch_size = points.shape[0]
    K = torch.zeros([batch_size, 3, 3], device=points.device)
    K[:, 0, 0] = focal_length
    K[:, 1, 1] = focal_length
    K[:, 2, 2] = 1.
    K[:, :-1, -1] = camera_center

    translation = convert_weak_perspective_to_perspective(weak_cam_params, focal_length, img_res)

    # Transform points
    points = torch.einsum('bij,bkj->bki', rotation, points)
    points = points + translation.unsqueeze(1)

    # Apply perspective distortion
    projected_points = points / points[:, :, -1].unsqueeze(-1)

    # Apply camera intrinsics
    projected_points = torch.einsum('bij,bkj->bki', K, projected_points)

    return projected_points[:, :, :-1]

def estimate_translation_np(S, joints_2d, joints_conf, focal_length=5000., img_size=224.):
    """Find camera translation that brings 3D joints S closest to the corresponding 2D joints joints_2d.
    Input:
        S: (25, 3) 3D joint locations
        joints: (25, 3) 2D joint locations and confidence
    Returns:
        (3,) camera translation vector
    """

    num_joints = S.shape[0]
    # focal length
    f = np.array([focal_length, focal_length])
    # optical center
    center = np.array([img_size / 2., img_size / 2.])

    # transformations
    Z = np.reshape(np.tile(S[:, 2], (2, 1)).T, -1)
    XY = np.reshape(S[:, 0:2], -1)
    O = np.tile(center, num_joints)
    F = np.tile(f, num_joints)
    weight2 = np.reshape(np.tile(np.sqrt(joints_conf), (2, 1)).T, -1)

    # least squares
    Q = np.array([F * np.tile(np.array([1, 0]), num_joints),
                  F * np.tile(np.array([0, 1]), num_joints),
                  O - np.reshape(joints_2d, -1)]).T
    c = (np.reshape(joints_2d, -1) - O) * Z - F * XY

    # weighted least squares
    W = np.diagflat(weight2)
    Q = np.dot(W, Q)
    c = np.dot(W, c)

    # square matrix
    A = np.dot(Q.T, Q)
    b = np.dot(Q.T, c)

    # solution
    trans = np.linalg.solve(A, b)

    return trans

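# Note (added for exposition, not part of the original file): each joint
# contributes two rows to the linear system above, encoding the projection
# equations f * (X + tx) = (px - cx) * (Z + tz) and
# f * (Y + ty) = (py - cy) * (Z + tz), so solving the weighted normal
# equations yields the translation that best reprojects S onto joints_2d.
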
def estimate_translation(S, joints_2d, focal_length=5000., img_size=224., use_all_joints=False, rotation=None):
    """Find camera translation that brings 3D joints S closest to the corresponding 2D joints joints_2d.
    Input:
        S: (B, 49, 3) 3D joint locations
        joints: (B, 49, 3) 2D joint locations and confidence
    Returns:
        (B, 3) camera translation vectors
    """

    device = S.device

    if rotation is not None:
        S = torch.einsum('bij,bkj->bki', rotation, S)

    # Use only joints 25:49 (GT joints)
    if use_all_joints:
        S = S.cpu().numpy()
        joints_2d = joints_2d.cpu().numpy()
    else:
        S = S[:, 25:, :].cpu().numpy()
        joints_2d = joints_2d[:, 25:, :].cpu().numpy()

    joints_conf = joints_2d[:, :, -1]
    joints_2d = joints_2d[:, :, :-1]
    trans = np.zeros((S.shape[0], 3), dtype=np.float32)
    # Find the translation for each example in the batch
    for i in range(S.shape[0]):
        S_i = S[i]
        joints_i = joints_2d[i]
        conf_i = joints_conf[i]
        trans[i] = estimate_translation_np(S_i, joints_i, conf_i, focal_length=focal_length, img_size=img_size)
    return torch.from_numpy(trans).to(device)

def estimate_translation_cam(S, joints_2d, focal_length=(5000., 5000.), img_size=(224., 224.),
                             use_all_joints=False, rotation=None):
    """Find camera translation that brings 3D joints S closest to the corresponding 2D joints joints_2d.
    Input:
        S: (B, 49, 3) 3D joint locations
        joints: (B, 49, 3) 2D joint locations and confidence
    Returns:
        (B, 3) camera translation vectors
    """

    def estimate_translation_np(S, joints_2d, joints_conf, focal_length=(5000., 5000.), img_size=(224., 224.)):
        """Find camera translation that brings 3D joints S closest to the corresponding 2D joints joints_2d.
        Input:
            S: (25, 3) 3D joint locations
            joints: (25, 3) 2D joint locations and confidence
        Returns:
            (3,) camera translation vector
        """

        num_joints = S.shape[0]
        # focal length
        f = np.array([focal_length[0], focal_length[1]])
        # optical center
        center = np.array([img_size[0] / 2., img_size[1] / 2.])

        # transformations
        Z = np.reshape(np.tile(S[:, 2], (2, 1)).T, -1)
        XY = np.reshape(S[:, 0:2], -1)
        O = np.tile(center, num_joints)
        F = np.tile(f, num_joints)
        weight2 = np.reshape(np.tile(np.sqrt(joints_conf), (2, 1)).T, -1)

        # least squares
        Q = np.array([F * np.tile(np.array([1, 0]), num_joints),
                      F * np.tile(np.array([0, 1]), num_joints),
                      O - np.reshape(joints_2d, -1)]).T
        c = (np.reshape(joints_2d, -1) - O) * Z - F * XY

        # weighted least squares
        W = np.diagflat(weight2)
        Q = np.dot(W, Q)
        c = np.dot(W, c)

        # square matrix
        A = np.dot(Q.T, Q)
        b = np.dot(Q.T, c)

        # solution
        trans = np.linalg.solve(A, b)

        return trans

    device = S.device

    if rotation is not None:
        S = torch.einsum('bij,bkj->bki', rotation, S)

    # Use only joints 25:49 (GT joints)
    if use_all_joints:
        S = S.cpu().numpy()
        joints_2d = joints_2d.cpu().numpy()
    else:
        S = S[:, 25:, :].cpu().numpy()
        joints_2d = joints_2d[:, 25:, :].cpu().numpy()

    joints_conf = joints_2d[:, :, -1]
    joints_2d = joints_2d[:, :, :-1]
    trans = np.zeros((S.shape[0], 3), dtype=np.float32)
    # Find the translation for each example in the batch
    for i in range(S.shape[0]):
        S_i = S[i]
        joints_i = joints_2d[i]
        conf_i = joints_conf[i]
        trans[i] = estimate_translation_np(S_i, joints_i, conf_i, focal_length=focal_length, img_size=img_size)
    return torch.from_numpy(trans).to(device)

def get_coord_maps(size=56):
    xx_ones = torch.ones([1, size], dtype=torch.int32)
    xx_ones = xx_ones.unsqueeze(-1)

    xx_range = torch.arange(size, dtype=torch.int32).unsqueeze(0)
    xx_range = xx_range.unsqueeze(1)

    xx_channel = torch.matmul(xx_ones, xx_range)
    xx_channel = xx_channel.unsqueeze(-1)

    yy_ones = torch.ones([1, size], dtype=torch.int32)
    yy_ones = yy_ones.unsqueeze(1)

    yy_range = torch.arange(size, dtype=torch.int32).unsqueeze(0)
    yy_range = yy_range.unsqueeze(-1)

    yy_channel = torch.matmul(yy_range, yy_ones)
    yy_channel = yy_channel.unsqueeze(-1)

    xx_channel = xx_channel.permute(0, 3, 1, 2)
    yy_channel = yy_channel.permute(0, 3, 1, 2)

    xx_channel = xx_channel.float() / (size - 1)
    yy_channel = yy_channel.float() / (size - 1)

    xx_channel = xx_channel * 2 - 1
    yy_channel = yy_channel * 2 - 1

    out = torch.cat([xx_channel, yy_channel], dim=1)
    return out

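# Illustrative check (added for exposition, not part of the original file):
# the coordinate maps cover [-1, 1] in the (1, 2, H, W) layout that
# CoordConv-style layers expect.
def _example_coord_maps():
    cm = get_coord_maps(size=56)
    assert cm.shape == (1, 2, 56, 56)
    assert cm.min() == -1 and cm.max() == 1
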
def look_at(eye, at=np.array([0, 0, 0]), up=np.array([0, 0, 1]), eps=1e-5):
    at = at.astype(float).reshape(1, 3)
    up = up.astype(float).reshape(1, 3)

    eye = eye.reshape(-1, 3)
    up = up.repeat(eye.shape[0] // up.shape[0], axis=0)
    eps = np.array([eps]).reshape(1, 1).repeat(up.shape[0], axis=0)

    z_axis = eye - at
    z_axis /= np.max(np.stack([np.linalg.norm(z_axis, axis=1, keepdims=True), eps]))

    x_axis = np.cross(up, z_axis)
    x_axis /= np.max(np.stack([np.linalg.norm(x_axis, axis=1, keepdims=True), eps]))

    y_axis = np.cross(z_axis, x_axis)
    y_axis /= np.max(np.stack([np.linalg.norm(y_axis, axis=1, keepdims=True), eps]))

    r_mat = np.concatenate((x_axis.reshape(-1, 3, 1), y_axis.reshape(-1, 3, 1), z_axis.reshape(-1, 3, 1)), axis=2)

    return r_mat

def to_sphere(u, v):
    theta = 2 * np.pi * u
    phi = np.arccos(1 - 2 * v)
    cx = np.sin(phi) * np.cos(theta)
    cy = np.sin(phi) * np.sin(theta)
    cz = np.cos(phi)
    s = np.stack([cx, cy, cz])
    return s


def sample_on_sphere(range_u=(0, 1), range_v=(0, 1)):
    u = np.random.uniform(*range_u)
    v = np.random.uniform(*range_v)
    return to_sphere(u, v)

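# Illustrative check (added for exposition, not part of the original file):
# samples drawn by sample_on_sphere lie on the unit sphere, so every sample
# has unit norm.
def _example_sample_on_sphere():
    s = sample_on_sphere()
    assert np.isclose(np.linalg.norm(s), 1.0)
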
def sample_pose_on_sphere(range_v=(0, 1), range_u=(0, 1), radius=1, up=[0, 1, 0]):
    # sample location on unit sphere
    loc = sample_on_sphere(range_u, range_v)

    # sample radius if necessary
    if isinstance(radius, tuple):
        radius = np.random.uniform(*radius)

    loc = loc * radius
    R = look_at(loc, up=np.array(up))[0]

    RT = np.concatenate([R, loc.reshape(3, 1)], axis=1)
    RT = torch.Tensor(RT.astype(np.float32))
    return RT

def rectify_pose(camera_r, body_aa, rotate_x=False):
    body_r = batch_rodrigues(body_aa).reshape(-1, 3, 3)

    if rotate_x:
        rotate_x = torch.tensor([[[1.0, 0.0, 0.0], [0.0, -1.0, 0.0], [0.0, 0.0, -1.0]]])
        body_r = body_r @ rotate_x

    final_r = camera_r @ body_r
    body_aa = batch_rot2aa(final_r)
    return body_aa

def batch_euler2matrix(r):
    return quaternion_to_rotation_matrix(euler_to_quaternion(r))

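# Illustrative check (added for exposition, not part of the original file):
# zero Euler angles give the identity rotation.
def _example_euler_identity():
    assert torch.allclose(batch_euler2matrix(torch.zeros(1, 3)),
                          torch.eye(3).unsqueeze(0))
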
def euler_to_quaternion(r):
    x = r[..., 0]
    y = r[..., 1]
    z = r[..., 2]

    z = z / 2.0
    y = y / 2.0
    x = x / 2.0
    cz = torch.cos(z)
    sz = torch.sin(z)
    cy = torch.cos(y)
    sy = torch.sin(y)
    cx = torch.cos(x)
    sx = torch.sin(x)
    quaternion = torch.zeros_like(r.repeat(1, 2))[..., :4].to(r.device)
    quaternion[..., 0] += cx * cy * cz - sx * sy * sz
    quaternion[..., 1] += cx * sy * sz + cy * cz * sx
    quaternion[..., 2] += cx * cz * sy - sx * cy * sz
    quaternion[..., 3] += cx * cy * sz + sx * cz * sy
    return quaternion

def quaternion_to_rotation_matrix(quat):
    """Convert quaternion coefficients to rotation matrix.
    Args:
        quat: size = [B, 4] 4 <===> (w, x, y, z)
    Returns:
        Rotation matrix corresponding to the quaternion -- size = [B, 3, 3]
    """
    norm_quat = quat
    norm_quat = norm_quat / norm_quat.norm(p=2, dim=1, keepdim=True)
    w, x, y, z = norm_quat[:, 0], norm_quat[:, 1], norm_quat[:, 2], norm_quat[:, 3]

    B = quat.size(0)

    w2, x2, y2, z2 = w.pow(2), x.pow(2), y.pow(2), z.pow(2)
    wx, wy, wz = w * x, w * y, w * z
    xy, xz, yz = x * y, x * z, y * z

    rotMat = torch.stack([w2 + x2 - y2 - z2, 2 * xy - 2 * wz, 2 * wy + 2 * xz,
                          2 * wz + 2 * xy, w2 - x2 + y2 - z2, 2 * yz - 2 * wx,
                          2 * xz - 2 * wy, 2 * wx + 2 * yz, w2 - x2 - y2 + z2], dim=1).view(B, 3, 3)
    return rotMat

def euler_angles_from_rotmat(R):
    """
    Compute Euler angles for rotation around the x, y, z axes
    from a rotation matrix
    R: 4x4 rotation matrix
    https://www.gregslabaugh.net/publications/euler.pdf
    """
    r21 = np.round(R[:, 2, 0].item(), 4)
    if abs(r21) != 1:
        y_angle1 = -1 * torch.asin(R[:, 2, 0])
        y_angle2 = math.pi + torch.asin(R[:, 2, 0])
        cy1, cy2 = torch.cos(y_angle1), torch.cos(y_angle2)

        x_angle1 = torch.atan2(R[:, 2, 1] / cy1, R[:, 2, 2] / cy1)
        x_angle2 = torch.atan2(R[:, 2, 1] / cy2, R[:, 2, 2] / cy2)
        z_angle1 = torch.atan2(R[:, 1, 0] / cy1, R[:, 0, 0] / cy1)
        z_angle2 = torch.atan2(R[:, 1, 0] / cy2, R[:, 0, 0] / cy2)

        s1 = (x_angle1, y_angle1, z_angle1)
        s2 = (x_angle2, y_angle2, z_angle2)
        s = (s1, s2)

    else:
        z_angle = torch.tensor([0], device=R.device).float()
        if r21 == -1:
            y_angle = torch.tensor([math.pi / 2], device=R.device).float()
            x_angle = z_angle + torch.atan2(R[:, 0, 1], R[:, 0, 2])
        else:
            y_angle = -torch.tensor([math.pi / 2], device=R.device).float()
            x_angle = -z_angle + torch.atan2(-R[:, 0, 1], R[:, 0, 2])
        s = ((x_angle, y_angle, z_angle),)
    return s
1130
myeasymocap/backbone/pare/utils/kp_utils.py
Normal file
File diff suppressed because it is too large