🚀 add model checkpoint

shuaiqing 2023-06-19 17:23:44 +08:00
parent e7800a1356
commit 11f13d6953
12 changed files with 793 additions and 22 deletions

View File

@@ -26,7 +26,8 @@ args:
key_from_previous: [bbox]
args:
# ckpt: /nas/public/EasyMocapModels/hrnetv2_w18_coco_wholebody_hand_256x256-1c028db7_20210908.pth
ckpt: /nas/public/EasyMocapModels/hand/resnet_kp2d_clean.pt
ckpt: models/hand_resnet_kp2d_clean.pt
url: 1LTK7e9oAS6B3drmQyXwTZild6k87fEZa
mode: resnet
vis2d:
module: myeasymocap.io.vis.Vis2D
@@ -42,13 +43,14 @@ args:
key_from_previous: [bbox]
key_keep: [meta, cameras, imgnames] # keep these keys through to the final output
args:
ckpt: models/manol_pca45_noflat.ckpt
ckpt: models/hand_manol_pca45_noflat.ckpt
url: '1KTi_oJ_udLRK3WZ3xyHzBUd6vKAApfT8'
# TODO: add visualization for Init MANO
at_final:
load_hand_model: # load the hand model
module: myeasymocap.io.model.MANOLoader
args:
cfg_path: config/model/mano.yml
cfg_path: config/model/manol.yml
model_path: models/manov1.2/MANO_LEFT.pkl #models/handmesh/data/MANO_RIGHT.pkl # load mano model
regressor_path: models/manov1.2/J_regressor_mano_LEFT.txt #models/handmesh/data/J_regressor_mano_RIGHT.txt
num_pca_comps: 45
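Each ckpt now has a matching url (a Google Drive file id), which lets the pipeline fetch missing checkpoints automatically. A minimal sketch of that download-if-missing pattern, mirroring the gdown_models helper added to basetopdown.py later in this commit (the gdown CLI is assumed to be installed):

import os

def ensure_checkpoint(ckpt, url):
    # Fetch the checkpoint from Google Drive when it is missing locally.
    if not os.path.exists(ckpt) and url is not None:
        os.makedirs(os.path.dirname(ckpt), exist_ok=True)
        os.system('gdown "{}" -O {}'.format(url, ckpt))
    assert os.path.exists(ckpt), f'{ckpt} not exists'

ensure_checkpoint('models/hand_resnet_kp2d_clean.pt', '1LTK7e9oAS6B3drmQyXwTZild6k87fEZa')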

View File

@@ -26,7 +26,9 @@ args:
key_from_previous: [bbox]
args:
# ckpt: /nas/public/EasyMocapModels/hrnetv2_w18_coco_wholebody_hand_256x256-1c028db7_20210908.pth
ckpt: /nas/public/EasyMocapModels/hand/resnet_kp2d_clean.pt
# ckpt: /nas/public/EasyMocapModels/hand/resnet_kp2d_clean.pt
ckpt: models/hand_resnet_kp2d_clean.pt
url: 1LTK7e9oAS6B3drmQyXwTZild6k87fEZa
mode: resnet
vis2d:
module: myeasymocap.io.vis.Vis2D
@@ -42,13 +44,14 @@ args:
key_from_previous: [bbox]
key_keep: [meta, cameras, imgnames] # keep these keys through to the final output
args:
ckpt: models/manol_pca45_noflat.ckpt
ckpt: models/hand_manol_pca45_noflat.ckpt
url: '1KTi_oJ_udLRK3WZ3xyHzBUd6vKAApfT8'
# TODO: add visualization for Init MANO
at_final:
load_hand_model: # load the hand model
module: myeasymocap.io.model.MANOLoader
args:
cfg_path: config/model/mano.yml
cfg_path: config/model/manol.yml
model_path: models/manov1.2/MANO_LEFT.pkl #models/handmesh/data/MANO_RIGHT.pkl # load mano model
regressor_path: models/manov1.2/J_regressor_mano_LEFT.txt #models/handmesh/data/J_regressor_mano_RIGHT.txt
num_pca_comps: 45

View File

@@ -33,11 +33,13 @@ class FileStorage(object):
self._write(' rows: {}'.format(value.shape[0]))
self._write(' cols: {}'.format(value.shape[1]))
self._write(' dt: d')
self._write(' data: [{}]'.format(', '.join(['{:.3f}'.format(i) for i in value.reshape(-1)])))
self._write(' data: [{}]'.format(', '.join(['{:.6f}'.format(i) for i in value.reshape(-1)])))
elif dt == 'list':
self._write('{}:'.format(key))
for elem in value:
self._write(' - "{}"'.format(elem))
elif dt == 'int':
self._write('{}: {}'.format(key, value))
def read(self, key, dt='mat'):
if dt == 'mat':
@@ -52,6 +54,8 @@ class FileStorage(object):
if val != 'none':
results.append(val)
output = results
elif dt == 'int':
output = int(self.fs.getNode(key).real())
else:
raise NotImplementedError
return output
@@ -114,6 +118,13 @@ def read_camera(intri_name, extri_name, cam_names=[]):
cams[cam] = {}
cams[cam]['K'] = intri.read('K_{}'.format( cam))
cams[cam]['invK'] = np.linalg.inv(cams[cam]['K'])
H = intri.read('H_{}'.format(cam), dt='int')
W = intri.read('W_{}'.format(cam), dt='int')
if H is None or W is None:
print('[camera] no H or W for {}'.format(cam))
H, W = -1, -1
cams[cam]['H'] = H
cams[cam]['W'] = W
Rvec = extri.read('R_{}'.format(cam))
Tvec = extri.read('T_{}'.format(cam))
assert Rvec is not None, cam
@@ -129,6 +140,10 @@ def read_camera(intri_name, extri_name, cam_names=[]):
cams[cam]['P'] = P[cam]
cams[cam]['dist'] = intri.read('dist_{}'.format(cam))
if cams[cam]['dist'] is None:
cams[cam]['dist'] = intri.read('D_{}'.format(cam))
if cams[cam]['dist'] is None:
print('[camera] no dist for {}'.format(cam))
cams['basenames'] = cam_names
return cams
@@ -155,6 +170,9 @@ def write_camera(camera, path):
key = key_.split('.')[0]
intri.write('K_{}'.format(key), val['K'])
intri.write('dist_{}'.format(key), val['dist'])
if 'H' in val.keys() and 'W' in val.keys():
intri.write('H_{}'.format(key), val['H'], dt='int')
intri.write('W_{}'.format(key), val['W'], dt='int')
if 'Rvec' not in val.keys():
val['Rvec'] = cv2.Rodrigues(val['R'])[0]
extri.write('R_{}'.format(key), val['Rvec'])
@@ -174,7 +192,7 @@ def camera_from_img(img):
class Undistort:
distortMap = {}
@classmethod
def image(cls, frame, K, dist, sub=None):
def image(cls, frame, K, dist, sub=None, interp=cv2.INTER_NEAREST):
if sub is None:
return cv2.undistort(frame, K, dist, None)
else:
@@ -183,7 +201,7 @@ class Undistort:
mapx, mapy = cv2.initUndistortRectifyMap(K, dist, None, K, (w,h), 5)
cls.distortMap[sub] = (mapx, mapy)
mapx, mapy = cls.distortMap[sub]
img = cv2.remap(frame, mapx, mapy, cv2.INTER_NEAREST)
img = cv2.remap(frame, mapx, mapy, interp)
return img
@staticmethod
@@ -203,6 +221,21 @@ class Undistort:
bbox = np.array([kpts[0, 0], kpts[0, 1], kpts[1, 0], kpts[1, 1], bbox[4]])
return bbox
class Distort:
@staticmethod
def points(keypoints, K, dist):
pass # TODO: distorting keypoints is not implemented yet
@staticmethod
def bbox(bbox, K, dist):
keypoints = np.array([[bbox[0], bbox[1]], [bbox[2], bbox[3]]], dtype=np.float32)
k3d = cv2.convertPointsToHomogeneous(keypoints)
k3d = (np.linalg.inv(K) @ k3d[:, 0].T).T[:, None]
k2d, _ = cv2.projectPoints(k3d, np.zeros((3,)), np.zeros((3,)), K, dist)
k2d = k2d[:, 0]
bbox = np.array([k2d[0,0], k2d[0,1], k2d[1, 0], k2d[1, 1], bbox[-1]])
return bbox
def unproj(kpts, invK):
homo = np.hstack([kpts[:, :2], np.ones_like(kpts[:, :1])])
homo = homo @ invK.T
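Taken together, these changes let the camera files round-trip the image resolution and give callers control over undistortion interpolation. A hedged usage sketch (write_camera is assumed to emit intri.yml/extri.yml under the given directory, as elsewhere in EasyMocap):

import cv2
import numpy as np

cameras = {
    '01': {
        'K': np.array([[1000., 0., 960.], [0., 1000., 540.], [0., 0., 1.]]),
        'dist': np.zeros((1, 5)), 'R': np.eye(3), 'T': np.zeros((3, 1)),
        'H': 1080, 'W': 1920,  # persisted through the new dt='int' branch
    }
}
write_camera(cameras, 'output')
cams = read_camera('output/intri.yml', 'output/extri.yml', ['01'])
print(cams['01']['H'], cams['01']['W'])  # 1080 1920 (-1 -1 when the keys are absent)

frame = np.zeros((1080, 1920, 3), np.uint8)
# interp still defaults to cv2.INTER_NEAREST but can now be overridden per call
und = Undistort.image(frame, cams['01']['K'], cams['01']['dist'], sub='01', interp=cv2.INTER_LINEAR)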

View File

@@ -2,14 +2,15 @@
@ Date: 2020-11-28 17:23:04
@ Author: Qing Shuai
@ LastEditors: Qing Shuai
@ LastEditTime: 2022-08-12 21:50:56
@ LastEditTime: 2022-10-27 15:13:56
@ FilePath: /EasyMocapPublic/easymocap/mytools/vis_base.py
'''
import cv2
import numpy as np
import json
def generate_colorbar(N = 20, cmap = 'jet', rand=True):
def generate_colorbar(N = 20, cmap = 'jet', rand=True,
ret_float=False, ret_array=False, ret_rgb=False):
bar = ((np.arange(N)/(N-1))*255).astype(np.uint8).reshape(-1, 1)
colorbar = cv2.applyColorMap(bar, cv2.COLORMAP_JET).squeeze()
if False:
@@ -22,7 +23,12 @@ def generate_colorbar(N = 20, cmap = 'jet', rand=True):
rgb = colorbar[index, :]
else:
rgb = colorbar
rgb = rgb.tolist()
if ret_rgb:
rgb = rgb[:, ::-1]
if ret_float:
rgb = rgb/255.
if not ret_array:
rgb = rgb.tolist()
return rgb
# colors_bar_rgb = generate_colorbar(cmap='hsv')
@@ -69,9 +75,11 @@ def get_rgb(index):
# elif index == 0:
# return (245, 150, 150)
col = list(colors_bar_rgb[index%len(colors_bar_rgb)])[::-1]
else:
elif isinstance(index, str):
col = colors_table.get(index, (1, 0, 0))
col = tuple([int(c*255) for c in col[::-1]])
else:
raise TypeError('index should be int or str')
return col
def get_rgb_01(index):
@@ -150,14 +158,16 @@ def plot_keypoints(img, points, pid, config, vis_conf=False, use_limb_color=True
cv2.putText(img, '{:.1f}'.format(c), (int(x), int(y)),
cv2.FONT_HERSHEY_SIMPLEX, text_size, col, 2)
def plot_keypoints_auto(img, points, pid, vis_conf=False, use_limb_color=True, scale=1, lw=-1):
def plot_keypoints_auto(img, points, pid, vis_conf=False, use_limb_color=True, scale=1, lw=-1, config_name=None, lw_factor=1):
from ..dataset.config import CONFIG
config_name = {25: 'body25', 21: 'hand', 42:'handlr', 17: 'coco', 1:'points', 67:'bodyhand', 137: 'total', 79:'up'}[len(points)]
if config_name is None:
config_name = {25: 'body25', 15: 'body15', 21: 'hand', 42:'handlr', 17: 'coco', 1:'points', 67:'bodyhand', 137: 'total', 79:'up',
19:'ochuman'}[len(points)]
config = CONFIG[config_name]
if lw == -1:
lw = img.shape[0]//200
if config_name == 'hand':
lw = img.shape[0]//1000
lw = img.shape[0]//100
lw = max(lw, 1)
for ii, (i, j) in enumerate(config['kintree']):
if i >= len(points) or j >= len(points):
@@ -169,9 +179,9 @@ def plot_keypoints_auto(img, points, pid, vis_conf=False, use_limb_color=True, s
col = get_rgb(config['colors'][ii])
else:
col = get_rgb(pid)
if pt1[0] < 0 or pt1[1] < 0 or pt1[0] > 10000 or pt1[1] > 10000:
if pt1[0] < -10000 or pt1[1] < -10000 or pt1[0] > 10000 or pt1[1] > 10000:
continue
if pt2[0] < 0 or pt2[1] < 0 or pt2[0] > 10000 or pt2[1] > 10000:
if pt2[0] < -10000 or pt2[1] < -10000 or pt2[0] > 10000 or pt2[1] > 10000:
continue
if pt1[-1] > 0.01 and pt2[-1] > 0.01:
image = cv2.line(
@@ -191,12 +201,13 @@ def plot_keypoints_auto(img, points, pid, vis_conf=False, use_limb_color=True, s
if c > 0.01:
col = get_rgb(pid)
if len(points) == 1:
cv2.circle(img, (int(x+0.5), int(y+0.5)), lw*10, col, lw*2)
plot_cross(img, int(x+0.5), int(y+0.5), width=lw*5, col=col, lw=lw*2)
_lw = max(0, int(lw * lw_factor))
cv2.circle(img, (int(x+0.5), int(y+0.5)), _lw*2, col, lw*2)
plot_cross(img, int(x+0.5), int(y+0.5), width=_lw, col=col, lw=lw*2)
else:
cv2.circle(img, (int(x+0.5), int(y+0.5)), lw*2, col, -1)
if vis_conf:
cv2.putText(img, '{:.1f}'.format(c), (int(x), int(y)), cv2.FONT_HERSHEY_SIMPLEX, 1, col, 2)
cv2.putText(img, '{:.1f}'.format(c), (int(x), int(y)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, col, 2)
def plot_keypoints_total(img, annots, scale, pid_offset=0):
_lw = img.shape[0] // 150
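A short sketch of the extended colour helpers (the module path is from the file header above; note that cv2.applyColorMap returns BGR, which is why ret_rgb flips the channel order):

from easymocap.mytools.vis_base import generate_colorbar, get_rgb

# float RGB array in [0, 1], e.g. for matplotlib
colors = generate_colorbar(N=10, ret_float=True, ret_array=True, ret_rgb=True)
print(colors.shape)  # (10, 3)
print(get_rgb(3))    # int index -> palette colour (BGR tuple for OpenCV)
print(get_rgb('k'))  # str index -> colors_table lookup, falls back to (1, 0, 0)
# any other index type now raises TypeError instead of being treated as a name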

View File

@@ -239,3 +239,10 @@ def get_preds_from_heatmaps(batch_heatmaps):
coords = coords.astype(np.float32) * 4
pred = np.dstack((coords, maxvals))
return pred
def gdown_models(ckpt, url):
print('Try to download model from {} to {}'.format(url, ckpt))
os.makedirs(os.path.dirname(ckpt), exist_ok=True)
cmd = 'gdown "{}" -O {}'.format(url, ckpt)
print('\n', cmd, '\n')
os.system(cmd)

View File

View File

@@ -0,0 +1,89 @@
import os
import cv2
import torch
import torch.nn as nn
import numpy as np
import math
# https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_rhd2d_256x256-95b20dd8_20210330.pth
# https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_onehand10k_256x256_dark-a2f80c64_20210330.pth
from .hrnet import PoseHighResolutionNet
from ..basetopdown import BaseTopDownModelCache, get_preds_from_heatmaps, gdown_models
class TopDownAsMMPose(nn.Module):
def __init__(self, backbone, head):
super().__init__()
self.backbone = backbone
self.head = head
def forward(self, x):
feat_list = self.backbone(x)
size = feat_list[0].shape[-2:]
resized_inputs = [
nn.functional.interpolate(feat, size, mode='bilinear', align_corners=False) \
for feat in feat_list
]
resized_inputs = torch.cat(resized_inputs, 1)
out = self.head(resized_inputs)
pred = get_preds_from_heatmaps(out.detach().cpu().numpy())
return {'keypoints': pred}
class MyHand2D(BaseTopDownModelCache):
def __init__(self, ckpt, url=None, mode='hrnet'):
if mode == 'hrnet':
super().__init__(name='hand2d', bbox_scale=1.1, res_input=256)
backbone = PoseHighResolutionNet(inp_ch=3, out_ch=21, W=18, multi_scale_final=True, add_final_layer=False)
checkpoint = torch.load(ckpt, map_location='cpu')['state_dict']
self.load_checkpoint(backbone, checkpoint, prefix='backbone.', strict=True)
head = nn.Sequential(
nn.Conv2d(270, 270, kernel_size=1),
nn.BatchNorm2d(270),
nn.ReLU(inplace=True),
nn.Conv2d(270, 21, kernel_size=1)
)
self.load_checkpoint(head, checkpoint, prefix='keypoint_head.final_layer.', strict=True)
# self.model = nn.Sequential(backbone, head)
self.model = TopDownAsMMPose(backbone, head)
elif mode == 'resnet':
super().__init__(name='hand2d', bbox_scale=1.1, res_input=256, mean=[0., 0., 0.], std=[1., 1., 1.])
from .resnet import ResNet_Deconv
if not os.path.exists(ckpt) and url is not None:
gdown_models(ckpt, url)
assert os.path.exists(ckpt), f'{ckpt} not exists'
checkpoint = torch.load(ckpt, map_location='cpu')['state_dict']
model = ResNet_Deconv()
self.load_checkpoint(model, checkpoint, prefix='model.', strict=True)
self.model = model
self.model.eval()
self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
self.model.to(self.device)
def __call__(self, bbox, images, imgnames):
squeeze = False
if not isinstance(images, list):
images = [images]
imgnames = [imgnames]
bbox = [bbox]
squeeze = True
nViews = len(images)
kpts_all = []
for nv in range(nViews):
if bbox[nv].shape[0] == 0:
kpts_all.append(np.zeros((21, 3)))
continue
_bbox = bbox[nv]
if len(_bbox.shape) == 1:
_bbox = _bbox[None]
output = super().__call__(_bbox, images[nv], imgnames[nv])
kpts = output['params']['keypoints']
conf = kpts[..., -1:]
kpts = self.batch_affine_transform(kpts, output['params']['inv_trans'])
kpts = np.concatenate([kpts, conf], axis=-1)
if len(kpts.shape) == 3:
kpts = kpts[0]
kpts_all.append(kpts)
kpts_all = np.stack(kpts_all)
if squeeze:
kpts_all = kpts_all[0]
return {
'keypoints': kpts_all
}
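A minimal smoke-test sketch for the resnet branch (demo.jpg and the bbox values are placeholders; BaseTopDownModelCache is assumed to handle the crop, normalization, and cache bookkeeping):

import cv2
import numpy as np

est = MyHand2D(ckpt='models/hand_resnet_kp2d_clean.pt',
               url='1LTK7e9oAS6B3drmQyXwTZild6k87fEZa', mode='resnet')
img = cv2.imread('demo.jpg')
bbox = np.array([[100., 100., 356., 356., 1.0]])  # (l, t, r, b, conf)
out = est(bbox, img, 'demo.jpg')   # single view: inputs are wrapped, output squeezed
print(out['keypoints'].shape)      # (21, 3): x, y, confidence in image coordinates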

View File

@@ -0,0 +1,161 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
import os
import os.path as osp
import torch
import torch.nn as nn
from torchvision.models.resnet import BasicBlock, Bottleneck
from torchvision.models.resnet import model_urls  # NOTE: model_urls was removed in torchvision >= 0.13; this requires an older torchvision
from ..basetopdown import get_preds_from_heatmaps
def make_conv_layers(feat_dims, kernel=3, stride=1, padding=1, bnrelu_final=True):
layers = []
for i in range(len(feat_dims)-1):
layers.append(
nn.Conv2d(
in_channels=feat_dims[i],
out_channels=feat_dims[i+1],
kernel_size=kernel,
stride=stride,
padding=padding
))
# Do not use BN and ReLU for final estimation
if i < len(feat_dims)-2 or (i == len(feat_dims)-2 and bnrelu_final):
layers.append(nn.BatchNorm2d(feat_dims[i+1]))
layers.append(nn.ReLU(inplace=True))
return nn.Sequential(*layers)
def make_deconv_layers(feat_dims, bnrelu_final=True):
layers = []
for i in range(len(feat_dims)-1):
layers.append(
nn.ConvTranspose2d(
in_channels=feat_dims[i],
out_channels=feat_dims[i+1],
kernel_size=4,
stride=2,
padding=1,
output_padding=0,
bias=False))
# Do not use BN and ReLU for final estimation
if i < len(feat_dims)-2 or (i == len(feat_dims)-2 and bnrelu_final):
layers.append(nn.BatchNorm2d(feat_dims[i+1]))
layers.append(nn.ReLU(inplace=True))
return nn.Sequential(*layers)
class ResNetBackbone(nn.Module):
def __init__(self, resnet_type):
resnet_spec = {18: (BasicBlock, [2, 2, 2, 2], [64, 64, 128, 256, 512], 'resnet18'),
34: (BasicBlock, [3, 4, 6, 3], [64, 64, 128, 256, 512], 'resnet34'),
50: (Bottleneck, [3, 4, 6, 3], [64, 256, 512, 1024, 2048], 'resnet50'),
101: (Bottleneck, [3, 4, 23, 3], [64, 256, 512, 1024, 2048], 'resnet101'),
152: (Bottleneck, [3, 8, 36, 3], [64, 256, 512, 1024, 2048], 'resnet152')}
block, layers, channels, name = resnet_spec[resnet_type]
self.name = name
self.inplanes = 64
super(ResNetBackbone, self).__init__()
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=False) # RGB
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
for m in self.modules():
if isinstance(m, nn.Conv2d):
# nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
nn.init.normal_(m.weight, mean=0, std=0.001)
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
return x
def init_weights(self):
org_resnet = torch.utils.model_zoo.load_url(model_urls[self.name])
# drop the original resnet fc layer; pop with a None default so a missing fc key does not raise
org_resnet.pop('fc.weight', None)
org_resnet.pop('fc.bias', None)
self.load_state_dict(org_resnet)
print("Initialize resnet from model zoo")
class ResNet_Deconv(nn.Module):
def __init__(self):
super().__init__()
self.hm2d_size = 64
self.resnet = ResNetBackbone(50)
self.deconv = make_deconv_layers([2048, 256, 256, 256])
self.conv_hm2d = make_conv_layers([256, 21],kernel=1,stride=1,padding=0,bnrelu_final=False)
self.resnet.init_weights()
self.deconv.apply(self.init_weights)
self.conv_hm2d.apply(self.init_weights)
@staticmethod
def init_weights(m):
if type(m) == nn.ConvTranspose2d:
nn.init.normal_(m.weight,std=0.001)
elif type(m) == nn.Conv2d:
nn.init.normal_(m.weight,std=0.001)
nn.init.constant_(m.bias, 0)
elif type(m) == nn.BatchNorm2d:
nn.init.constant_(m.weight,1)
nn.init.constant_(m.bias,0)
elif type(m) == nn.Linear:
nn.init.normal_(m.weight,std=0.01)
nn.init.constant_(m.bias,0)
def forward(self, img):
x_feat = self.resnet(img)
x_feat = self.deconv(x_feat)
x_hm2d = self.conv_hm2d(x_feat)
pred = get_preds_from_heatmaps(x_hm2d.detach().cpu().numpy())
return {
'keypoints': pred
}
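A quick shape check (a sketch; the 256x256 input matches res_input=256 used by MyHand2D, and the three deconv stages bring the 8x8 ResNet feature map up to the 64x64 heatmap size):

import torch

# note: constructing ResNet_Deconv downloads the ImageNet resnet50 weights on first run
model = ResNet_Deconv().eval()
with torch.no_grad():
    out = model(torch.zeros(1, 3, 256, 256))
print(out['keypoints'].shape)  # (1, 21, 3): heatmap argmax rescaled by 4, plus confidence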

View File

View File

@@ -0,0 +1,35 @@
import os
import numpy as np
import torch
from ..basetopdown import BaseTopDownModelCache, gdown_models
import pickle
from .models import hmr
class MyHMR(BaseTopDownModelCache):
def __init__(self, ckpt, url=None):
super().__init__('handhmr', bbox_scale=1., res_input=224)
self.model = hmr()
self.model.eval()
if not os.path.exists(ckpt) and url is not None:
gdown_models(ckpt, url)
assert os.path.exists(ckpt), f'{ckpt} not exists'
checkpoint = torch.load(ckpt, map_location='cpu')
self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
state_dict = checkpoint['state_dict']
prefix = 'model.'
self.load_checkpoint(self.model, state_dict, prefix, strict=True)
self.model.to(self.device)
def __call__(self, bbox, images, imgnames):
output = super().__call__(bbox, images, imgnames)
Rh = output['params']['poses'][:3].copy()
poses = output['params']['poses'][3:]
Th = np.zeros_like(Rh)
Th[2] = 1.
output['params'] = {
'Rh': Rh,
'Th': Th,
'poses': poses,
'shapes': output['params']['shapes'],
}
return output

View File

@@ -0,0 +1,256 @@
'''
Date: 2021-10-25 11:51:37 am
Author: dihuangdh
Descriptions:
-----
LastEditTime: 2021-10-25 1:50:40 pm
LastEditors: dihuangdh
'''
import torch
from torchvision.transforms import Normalize
import numpy as np
import cv2
from .models import hmr
class constants:
FOCAL_LENGTH = 5000.
IMG_RES = 224
# Mean and standard deviation for normalizing input image
IMG_NORM_MEAN = [0.485, 0.456, 0.406]
IMG_NORM_STD = [0.229, 0.224, 0.225]
def get_transform(center, scale, res, rot=0):
"""Generate transformation matrix."""
h = 200 * scale
t = np.zeros((3, 3))
t[0, 0] = float(res[1]) / h
t[1, 1] = float(res[0]) / h
t[0, 2] = res[1] * (-float(center[0]) / h + .5)
t[1, 2] = res[0] * (-float(center[1]) / h + .5)
t[2, 2] = 1
if not rot == 0:
rot = -rot # To match direction of rotation from cropping
rot_mat = np.zeros((3,3))
rot_rad = rot * np.pi / 180
sn,cs = np.sin(rot_rad), np.cos(rot_rad)
rot_mat[0,:2] = [cs, -sn]
rot_mat[1,:2] = [sn, cs]
rot_mat[2,2] = 1
# Need to rotate around center
t_mat = np.eye(3)
t_mat[0,2] = -res[1]/2
t_mat[1,2] = -res[0]/2
t_inv = t_mat.copy()
t_inv[:2,2] *= -1
t = np.dot(t_inv,np.dot(rot_mat,np.dot(t_mat,t)))
return t
def transform(pt, center, scale, res, invert=0, rot=0):
"""Transform pixel location to different reference."""
t = get_transform(center, scale, res, rot=rot)
if invert:
t = np.linalg.inv(t)
new_pt = np.array([pt[0]-1, pt[1]-1, 1.]).T
new_pt = np.dot(t, new_pt)
return new_pt[:2].astype(int)+1
def crop(img, center, scale, res, rot=0, bias=0):
"""Crop image according to the supplied bounding box."""
# Upper left point
ul = np.array(transform([1, 1], center, scale, res, invert=1))-1
# Bottom right point
br = np.array(transform([res[0]+1,
res[1]+1], center, scale, res, invert=1))-1
# Padding so that when rotated proper amount of context is included
pad = int(np.linalg.norm(br - ul) / 2 - float(br[1] - ul[1]) / 2)
if not rot == 0:
ul -= pad
br += pad
new_shape = [br[1] - ul[1], br[0] - ul[0]]
if len(img.shape) > 2:
new_shape += [img.shape[2]]
new_img = np.zeros(new_shape) + bias
# Range to fill new array
new_x = max(0, -ul[0]), min(br[0], len(img[0])) - ul[0]
new_y = max(0, -ul[1]), min(br[1], len(img)) - ul[1]
# Range to sample from original image
old_x = max(0, ul[0]), min(len(img[0]), br[0])
old_y = max(0, ul[1]), min(len(img), br[1])
new_img[new_y[0]:new_y[1], new_x[0]:new_x[1]] = img[old_y[0]:old_y[1],
old_x[0]:old_x[1]]
if not rot == 0:
# Remove padding (NOTE: scipy.misc.imrotate is not imported here and was removed in SciPy >= 1.2)
new_img = scipy.misc.imrotate(new_img, rot)
new_img = new_img[pad:-pad, pad:-pad]
new_img = cv2.resize(new_img, (res[0], res[1]))
return new_img
def process_image(img, bbox, input_res=224):
"""Read image, do preprocessing and possibly crop it according to the bounding box.
If there are bounding box annotations, use them to crop the image.
If no bounding box is specified but openpose detections are available, use them to get the bounding box.
"""
img = img[:, :, ::-1].copy()
normalize_img = Normalize(mean=constants.IMG_NORM_MEAN, std=constants.IMG_NORM_STD)
l, t, r, b = bbox[:4]
center = [(l+r)/2, (t+b)/2]
width = max(r-l, b-t)
scale = width/200.0
img = crop(img, center, scale, (input_res, input_res))
img = img.astype(np.float32) / 255.
img = torch.from_numpy(img).permute(2,0,1)
norm_img = normalize_img(img.clone())[None]
return img, norm_img
def solve_translation(X, x, K):
A = np.zeros((2*X.shape[0], 3))
b = np.zeros((2*X.shape[0], 1))
fx, fy = K[0, 0], K[1, 1]
cx, cy = K[0, 2], K[1, 2]
for nj in range(X.shape[0]):
A[2*nj, 0] = 1
A[2*nj + 1, 1] = 1
A[2*nj, 2] = -(x[nj, 0] - cx)/fx
A[2*nj+1, 2] = -(x[nj, 1] - cy)/fy
b[2*nj, 0] = X[nj, 2]*(x[nj, 0] - cx)/fx - X[nj, 0]
b[2*nj+1, 0] = X[nj, 2]*(x[nj, 1] - cy)/fy - X[nj, 1]
A[2*nj:2*nj+2, :] *= x[nj, 2]
b[2*nj:2*nj+2, :] *= x[nj, 2]
trans = np.linalg.inv(A.T @ A) @ A.T @ b
return trans.T[0]
def estimate_translation_np(S, joints_2d, joints_conf, K):
"""Find camera translation that brings 3D joints S closest to 2D the corresponding joints_2d.
Input:
S: (25, 3) 3D joint locations
joints: (25, 3) 2D joint locations and confidence
Returns:
(3,) camera translation vector
"""
num_joints = S.shape[0]
# focal length
f = np.array([K[0, 0], K[1, 1]])
# optical center
center = np.array([K[0, 2], K[1, 2]])
# transformations
Z = np.reshape(np.tile(S[:,2],(2,1)).T,-1)
XY = np.reshape(S[:,0:2],-1)
O = np.tile(center,num_joints)
F = np.tile(f,num_joints)
weight2 = np.reshape(np.tile(np.sqrt(joints_conf),(2,1)).T,-1)
# least squares
Q = np.array([F*np.tile(np.array([1,0]),num_joints), F*np.tile(np.array([0,1]),num_joints), O-np.reshape(joints_2d,-1)]).T
c = (np.reshape(joints_2d,-1)-O)*Z - F*XY
# weighted least squares
W = np.diagflat(weight2)
Q = np.dot(W,Q)
c = np.dot(W,c)
# square matrix
A = np.dot(Q.T,Q)
b = np.dot(Q.T,c)
# solution
trans = np.linalg.solve(A, b)
return trans
class HMR:
def __init__(self, checkpoint, device) -> None:
model = hmr().to(device)
checkpoint = torch.load(checkpoint, map_location='cpu')
state_dict = checkpoint['state_dict']
# update state_dict, remove 'model.'
for key in list(state_dict.keys()):
state_dict[key[6:]] = state_dict.pop(key)
model.load_state_dict(state_dict, strict=False)
# Load SMPL model
model.eval()
self.model = model
self.device = device
def forward(self, img, bbox, use_rh_th=True):
# Preprocess input image and generate predictions
img, norm_img = process_image(img, bbox, input_res=constants.IMG_RES)
with torch.no_grad():
pred_rotmat, pred_betas, pred_camera = self.model(norm_img.to(self.device))
results = {
'shapes': pred_betas.detach().cpu().numpy()
}
results['poses'] = pred_rotmat.detach().cpu().numpy()
if use_rh_th:
body_params = {
'poses': results['poses'],
'shapes': results['shapes'],
'Rh': results['poses'][:, :3].copy(),
'Th': np.zeros((1, 3)),
}
body_params['Th'][0, 2] = 5
body_params['poses'][:, :3] = 0
results = body_params
return results
def __call__(self, body_model, img, bbox, kpts, camera, ret_vertices=True):
body_params = self.forward(img.copy(), bbox)
body_params = body_model.check_params(body_params)
# only use the hand joints to estimate the translation
nJoints = 21
keypoints3d = body_model(return_verts=False, return_tensor=False, **body_params)[0]
trans = solve_translation(keypoints3d[:nJoints], kpts[:nJoints], camera['K'])
body_params['Th'] += trans[None, :]
if body_params['Th'][0, 2] < 0:
body_params['Th'] = -body_params['Th']
Rhold = cv2.Rodrigues(body_params['Rh'])[0]
rotx = cv2.Rodrigues(np.pi*np.array([1., 0, 0]))[0]
Rhold = rotx @ Rhold
body_params['Rh'] = cv2.Rodrigues(Rhold)[0].reshape(1, 3)
# convert to world coordinate
Rhold = cv2.Rodrigues(body_params['Rh'])[0]
Thold = body_params['Th']
Rh = camera['R'].T @ Rhold
Th = (camera['R'].T @ (Thold.T - camera['T'])).T
body_params['Th'] = Th
body_params['Rh'] = cv2.Rodrigues(Rh)[0].reshape(1, 3)
keypoints3d = body_model(return_verts=False, return_tensor=False, **body_params)[0]
results = {'body_params': body_params, 'keypoints3d': keypoints3d}
if ret_vertices:
vertices = body_model(return_verts=True, return_tensor=False, **body_params)[0]
results['vertices'] = vertices
return results
def init_with_hmr(body_model, spin_model, img, bbox, kpts, camera):
body_params = spin_model.forward(img.copy(), bbox)
body_params = body_model.check_params(body_params)
# only use body joints to estimate the translation
nJoints = 15
keypoints3d = body_model(return_verts=False, return_tensor=False, **body_params)[0]
trans = estimate_translation_np(keypoints3d[:nJoints], kpts[:nJoints, :2], kpts[:nJoints, 2], camera['K'])
body_params['Th'] += trans[None, :]
# convert to world coordinate
Rhold = cv2.Rodrigues(body_params['Rh'])[0]
Thold = body_params['Th']
Rh = camera['R'].T @ Rhold
Th = (camera['R'].T @ (Thold.T - camera['T'])).T
body_params['Th'] = Th
body_params['Rh'] = cv2.Rodrigues(Rh)[0].reshape(1, 3)
vertices = body_model(return_verts=True, return_tensor=False, **body_params)[0]
keypoints3d = body_model(return_verts=False, return_tensor=False, **body_params)[0]
results = {'body_params': body_params, 'vertices': vertices, 'keypoints3d': keypoints3d}
return results
if __name__ == '__main__':
pass
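solve_translation above sets up a confidence-weighted linear system: for each joint, K @ (X + t) must project onto the observed pixel, which gives two equations in the three unknowns of t. A small synthetic check of that claim (a sketch with arbitrary ground truth):

import numpy as np

K = np.array([[500., 0., 128.], [0., 500., 128.], [0., 0., 1.]])
X = np.random.randn(21, 3)                # 3D joints around the origin
t_gt = np.array([0.1, -0.2, 5.0])         # ground-truth translation
uv = (K @ (X + t_gt).T).T
x = np.concatenate([uv[:, :2] / uv[:, 2:], np.ones((21, 1))], axis=1)  # (u, v, conf)
print(solve_translation(X, x, K))         # recovers ~[0.1, -0.2, 5.0]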

View File

@@ -0,0 +1,174 @@
'''
Date: 2021-10-25 11:51:29 am
Author: dihuangdh
Descriptions:
-----
LastEditTime: 2021-10-25 11:51:58 am
LastEditors: dihuangdh
'''
import torch
import torch.nn as nn
import torchvision.models.resnet as resnet
import numpy as np
import math
class Bottleneck(nn.Module):
""" Redefinition of Bottleneck residual block
Adapted from the official PyTorch implementation
"""
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class HMR(nn.Module):
""" SMPL Iterative Regressor with ResNet50 backbone
"""
def __init__(self, block, layers):
self.inplanes = 64
super(HMR, self).__init__()
npose = 3 + 45
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AvgPool2d(7, stride=1)
self.fc1 = nn.Linear(512 * block.expansion + npose + 13, 1024)
self.drop1 = nn.Dropout()
self.fc2 = nn.Linear(1024, 1024)
self.drop2 = nn.Dropout()
self.decpose = nn.Linear(1024, npose)
self.decshape = nn.Linear(1024, 10)
self.deccam = nn.Linear(1024, 3)
nn.init.xavier_uniform_(self.decpose.weight, gain=0.01)
nn.init.xavier_uniform_(self.decshape.weight, gain=0.01)
nn.init.xavier_uniform_(self.deccam.weight, gain=0.01)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
init_pose = torch.zeros(npose).unsqueeze(0)
init_shape = torch.zeros(10).unsqueeze(0)
init_cam = torch.zeros(3).unsqueeze(0)
self.register_buffer('init_pose', init_pose)
self.register_buffer('init_shape', init_shape)
self.register_buffer('init_cam', init_cam)
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x, init_pose=None, init_shape=None, init_cam=None, n_iter=3):
batch_size = x.shape[0]
if init_pose is None:
init_pose = self.init_pose.expand(batch_size, -1)
if init_shape is None:
init_shape = self.init_shape.expand(batch_size, -1)
if init_cam is None:
init_cam = self.init_cam.expand(batch_size, -1)
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x1 = self.layer1(x)
x2 = self.layer2(x1)
x3 = self.layer3(x2)
x4 = self.layer4(x3)
xf = self.avgpool(x4)
xf = xf.view(xf.size(0), -1)
pred_pose = init_pose
pred_shape = init_shape
pred_cam = init_cam
for i in range(n_iter):
xc = torch.cat([xf, pred_pose, pred_shape, pred_cam],1)
xc = self.fc1(xc)
xc = self.drop1(xc)
xc = self.fc2(xc)
xc = self.drop2(xc)
pred_pose = self.decpose(xc) + pred_pose
pred_shape = self.decshape(xc) + pred_shape
pred_cam = self.deccam(xc) + pred_cam
# pred_rotmat = rot6d_to_rotmat(pred_pose).view(batch_size, 24, 3, 3)
return {
'poses': pred_pose,
'shapes': pred_shape,
'cam': pred_cam
}
def hmr(pretrained=True, **kwargs):
""" Constructs an HMR model with ResNet50 backbone.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = HMR(Bottleneck, [3, 4, 6, 3], **kwargs)
if pretrained:
resnet_imagenet = resnet.resnet50(pretrained=True)
model.load_state_dict(resnet_imagenet.state_dict(),strict=False)
return model
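A smoke-test sketch for the regressor (pretrained=False skips the ImageNet download; 224 matches constants.IMG_RES, so avgpool sees the expected 7x7 feature map):

import torch

model = hmr(pretrained=False).eval()
with torch.no_grad():
    out = model(torch.zeros(1, 3, 224, 224))
print(out['poses'].shape, out['shapes'].shape, out['cam'].shape)
# torch.Size([1, 48]) torch.Size([1, 10]) torch.Size([1, 3])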