From 11f13d69537f998830fb5c388afdb3479f22e93c Mon Sep 17 00:00:00 2001 From: shuaiqing Date: Mon, 19 Jun 2023 17:23:44 +0800 Subject: [PATCH] :rocket: add model check point --- config/1v1p/fixhand.yml | 8 +- config/1v1p/hand_detect_finetune.yml | 9 +- easymocap/mytools/camera_utils.py | 39 +++- easymocap/mytools/vis_base.py | 35 ++-- myeasymocap/backbone/basetopdown.py | 9 +- myeasymocap/backbone/hand2d/__init__.py | 0 myeasymocap/backbone/hand2d/hand2d.py | 89 ++++++++ myeasymocap/backbone/hand2d/resnet.py | 161 +++++++++++++++ myeasymocap/backbone/hmr/__init__.py | 0 myeasymocap/backbone/hmr/hmr.py | 35 ++++ myeasymocap/backbone/hmr/hmr_api.py | 256 ++++++++++++++++++++++++ myeasymocap/backbone/hmr/models.py | 174 ++++++++++++++++ 12 files changed, 793 insertions(+), 22 deletions(-) create mode 100644 myeasymocap/backbone/hand2d/__init__.py create mode 100644 myeasymocap/backbone/hand2d/hand2d.py create mode 100644 myeasymocap/backbone/hand2d/resnet.py create mode 100644 myeasymocap/backbone/hmr/__init__.py create mode 100644 myeasymocap/backbone/hmr/hmr.py create mode 100644 myeasymocap/backbone/hmr/hmr_api.py create mode 100644 myeasymocap/backbone/hmr/models.py diff --git a/config/1v1p/fixhand.yml b/config/1v1p/fixhand.yml index 97ef022..1cb25df 100644 --- a/config/1v1p/fixhand.yml +++ b/config/1v1p/fixhand.yml @@ -26,7 +26,8 @@ args: key_from_previous: [bbox] args: # ckpt: /nas/public/EasyMocapModels/hrnetv2_w18_coco_wholebody_hand_256x256-1c028db7_20210908.pth - ckpt: /nas/public/EasyMocapModels/hand/resnet_kp2d_clean.pt + ckpt: models/hand_resnet_kp2d_clean.pt + url: 1LTK7e9oAS6B3drmQyXwTZild6k87fEZa mode: resnet vis2d: module: myeasymocap.io.vis.Vis2D @@ -42,13 +43,14 @@ args: key_from_previous: [bbox] key_keep: [meta, cameras, imgnames] # 将这些参数都保留到最后的输出中 args: - ckpt: models/manol_pca45_noflat.ckpt + ckpt: models/hand_manol_pca45_noflat.ckpt + url: '1KTi_oJ_udLRK3WZ3xyHzBUd6vKAApfT8' # TODO: add visualize for Init MANO at_final: load_hand_model: # 载入身体模型 module: myeasymocap.io.model.MANOLoader args: - cfg_path: config/model/mano.yml + cfg_path: config/model/manol.yml model_path: models/manov1.2/MANO_LEFT.pkl #models/handmesh/data/MANO_RIGHT.pkl # load mano model regressor_path: models/manov1.2/J_regressor_mano_LEFT.txt #models/handmesh/data/J_regressor_mano_RIGHT.txt num_pca_comps: 45 diff --git a/config/1v1p/hand_detect_finetune.yml b/config/1v1p/hand_detect_finetune.yml index 4bb357b..c4eb5e2 100644 --- a/config/1v1p/hand_detect_finetune.yml +++ b/config/1v1p/hand_detect_finetune.yml @@ -26,7 +26,9 @@ args: key_from_previous: [bbox] args: # ckpt: /nas/public/EasyMocapModels/hrnetv2_w18_coco_wholebody_hand_256x256-1c028db7_20210908.pth - ckpt: /nas/public/EasyMocapModels/hand/resnet_kp2d_clean.pt + # ckpt: /nas/public/EasyMocapModels/hand/resnet_kp2d_clean.pt + ckpt: models/hand_resnet_kp2d_clean.pt + url: 1LTK7e9oAS6B3drmQyXwTZild6k87fEZa mode: resnet vis2d: module: myeasymocap.io.vis.Vis2D @@ -42,13 +44,14 @@ args: key_from_previous: [bbox] key_keep: [meta, cameras, imgnames] # 将这些参数都保留到最后的输出中 args: - ckpt: models/manol_pca45_noflat.ckpt + ckpt: models/hand_manol_pca45_noflat.ckpt + url: '1KTi_oJ_udLRK3WZ3xyHzBUd6vKAApfT8' # TODO: add visualize for Init MANO at_final: load_hand_model: # 载入身体模型 module: myeasymocap.io.model.MANOLoader args: - cfg_path: config/model/mano.yml + cfg_path: config/model/manol.yml model_path: models/manov1.2/MANO_LEFT.pkl #models/handmesh/data/MANO_RIGHT.pkl # load mano model regressor_path: models/manov1.2/J_regressor_mano_LEFT.txt 
#models/handmesh/data/J_regressor_mano_RIGHT.txt num_pca_comps: 45 diff --git a/easymocap/mytools/camera_utils.py b/easymocap/mytools/camera_utils.py index 7ada0d2..666b5cf 100644 --- a/easymocap/mytools/camera_utils.py +++ b/easymocap/mytools/camera_utils.py @@ -33,11 +33,13 @@ class FileStorage(object): self._write(' rows: {}'.format(value.shape[0])) self._write(' cols: {}'.format(value.shape[1])) self._write(' dt: d') - self._write(' data: [{}]'.format(', '.join(['{:.3f}'.format(i) for i in value.reshape(-1)]))) + self._write(' data: [{}]'.format(', '.join(['{:.6f}'.format(i) for i in value.reshape(-1)]))) elif dt == 'list': self._write('{}:'.format(key)) for elem in value: self._write(' - "{}"'.format(elem)) + elif dt == 'int': + self._write('{}: {}'.format(key, value)) def read(self, key, dt='mat'): if dt == 'mat': @@ -52,6 +54,8 @@ class FileStorage(object): if val != 'none': results.append(val) output = results + elif dt == 'int': + output = int(self.fs.getNode(key).real()) else: raise NotImplementedError return output @@ -114,6 +118,13 @@ def read_camera(intri_name, extri_name, cam_names=[]): cams[cam] = {} cams[cam]['K'] = intri.read('K_{}'.format( cam)) cams[cam]['invK'] = np.linalg.inv(cams[cam]['K']) + H = intri.read('H_{}'.format(cam), dt='int') + W = intri.read('W_{}'.format(cam), dt='int') + if H is None or W is None: + print('[camera] no H or W for {}'.format(cam)) + H, W = -1, -1 + cams[cam]['H'] = H + cams[cam]['W'] = W Rvec = extri.read('R_{}'.format(cam)) Tvec = extri.read('T_{}'.format(cam)) assert Rvec is not None, cam @@ -129,6 +140,10 @@ def read_camera(intri_name, extri_name, cam_names=[]): cams[cam]['P'] = P[cam] cams[cam]['dist'] = intri.read('dist_{}'.format(cam)) + if cams[cam]['dist'] is None: + cams[cam]['dist'] = intri.read('D_{}'.format(cam)) + if cams[cam]['dist'] is None: + print('[camera] no dist for {}'.format(cam)) cams['basenames'] = cam_names return cams @@ -155,6 +170,9 @@ def write_camera(camera, path): key = key_.split('.')[0] intri.write('K_{}'.format(key), val['K']) intri.write('dist_{}'.format(key), val['dist']) + if 'H' in val.keys() and 'W' in val.keys(): + intri.write('H_{}'.format(key), val['H'], dt='int') + intri.write('W_{}'.format(key), val['W'], dt='int') if 'Rvec' not in val.keys(): val['Rvec'] = cv2.Rodrigues(val['R'])[0] extri.write('R_{}'.format(key), val['Rvec']) @@ -174,7 +192,7 @@ def camera_from_img(img): class Undistort: distortMap = {} @classmethod - def image(cls, frame, K, dist, sub=None): + def image(cls, frame, K, dist, sub=None, interp=cv2.INTER_NEAREST): if sub is None: return cv2.undistort(frame, K, dist, None) else: @@ -183,7 +201,7 @@ class Undistort: mapx, mapy = cv2.initUndistortRectifyMap(K, dist, None, K, (w,h), 5) cls.distortMap[sub] = (mapx, mapy) mapx, mapy = cls.distortMap[sub] - img = cv2.remap(frame, mapx, mapy, cv2.INTER_NEAREST) + img = cv2.remap(frame, mapx, mapy, interp) return img @staticmethod @@ -203,6 +221,21 @@ class Undistort: bbox = np.array([kpts[0, 0], kpts[0, 1], kpts[1, 0], kpts[1, 1], bbox[4]]) return bbox +class Distort: + @staticmethod + def points(keypoints, K, dist): + pass + + @staticmethod + def bbox(bbox, K, dist): + keypoints = np.array([[bbox[0], bbox[1]], [bbox[2], bbox[3]]], dtype=np.float32) + k3d = cv2.convertPointsToHomogeneous(keypoints) + k3d = (np.linalg.inv(K) @ k3d[:, 0].T).T[:, None] + k2d, _ = cv2.projectPoints(k3d, np.zeros((3,)), np.zeros((3,)), K, dist) + k2d = k2d[:, 0] + bbox = np.array([k2d[0,0], k2d[0,1], k2d[1, 0], k2d[1, 1], bbox[-1]]) + return bbox + def 
unproj(kpts, invK): homo = np.hstack([kpts[:, :2], np.ones_like(kpts[:, :1])]) homo = homo @ invK.T diff --git a/easymocap/mytools/vis_base.py b/easymocap/mytools/vis_base.py index 490c63a..7036bbe 100644 --- a/easymocap/mytools/vis_base.py +++ b/easymocap/mytools/vis_base.py @@ -2,14 +2,15 @@ @ Date: 2020-11-28 17:23:04 @ Author: Qing Shuai @ LastEditors: Qing Shuai - @ LastEditTime: 2022-08-12 21:50:56 + @ LastEditTime: 2022-10-27 15:13:56 @ FilePath: /EasyMocapPublic/easymocap/mytools/vis_base.py ''' import cv2 import numpy as np import json -def generate_colorbar(N = 20, cmap = 'jet', rand=True): +def generate_colorbar(N = 20, cmap = 'jet', rand=True, + ret_float=False, ret_array=False, ret_rgb=False): bar = ((np.arange(N)/(N-1))*255).astype(np.uint8).reshape(-1, 1) colorbar = cv2.applyColorMap(bar, cv2.COLORMAP_JET).squeeze() if False: @@ -22,7 +23,12 @@ def generate_colorbar(N = 20, cmap = 'jet', rand=True): rgb = colorbar[index, :] else: rgb = colorbar - rgb = rgb.tolist() + if ret_rgb: + rgb = rgb[:, ::-1] + if ret_float: + rgb = rgb/255. + if not ret_array: + rgb = rgb.tolist() return rgb # colors_bar_rgb = generate_colorbar(cmap='hsv') @@ -69,9 +75,11 @@ def get_rgb(index): # elif index == 0: # return (245, 150, 150) col = list(colors_bar_rgb[index%len(colors_bar_rgb)])[::-1] - else: + elif isinstance(index, str): col = colors_table.get(index, (1, 0, 0)) col = tuple([int(c*255) for c in col[::-1]]) + else: + raise TypeError('index should be int or str') return col def get_rgb_01(index): @@ -150,14 +158,16 @@ def plot_keypoints(img, points, pid, config, vis_conf=False, use_limb_color=True cv2.putText(img, '{:.1f}'.format(c), (int(x), int(y)), cv2.FONT_HERSHEY_SIMPLEX, text_size, col, 2) -def plot_keypoints_auto(img, points, pid, vis_conf=False, use_limb_color=True, scale=1, lw=-1): +def plot_keypoints_auto(img, points, pid, vis_conf=False, use_limb_color=True, scale=1, lw=-1, config_name=None, lw_factor=1): from ..dataset.config import CONFIG - config_name = {25: 'body25', 21: 'hand', 42:'handlr', 17: 'coco', 1:'points', 67:'bodyhand', 137: 'total', 79:'up'}[len(points)] + if config_name is None: + config_name = {25: 'body25', 15: 'body15', 21: 'hand', 42:'handlr', 17: 'coco', 1:'points', 67:'bodyhand', 137: 'total', 79:'up', + 19:'ochuman'}[len(points)] config = CONFIG[config_name] if lw == -1: lw = img.shape[0]//200 if config_name == 'hand': - lw = img.shape[0]//1000 + lw = img.shape[0]//100 lw = max(lw, 1) for ii, (i, j) in enumerate(config['kintree']): if i >= len(points) or j >= len(points): @@ -169,9 +179,9 @@ def plot_keypoints_auto(img, points, pid, vis_conf=False, use_limb_color=True, s col = get_rgb(config['colors'][ii]) else: col = get_rgb(pid) - if pt1[0] < 0 or pt1[1] < 0 or pt1[0] > 10000 or pt1[1] > 10000: + if pt1[0] < -10000 or pt1[1] < -10000 or pt1[0] > 10000 or pt1[1] > 10000: continue - if pt2[0] < 0 or pt2[1] < 0 or pt2[0] > 10000 or pt2[1] > 10000: + if pt2[0] < -10000 or pt2[1] < -10000 or pt2[0] > 10000 or pt2[1] > 10000: continue if pt1[-1] > 0.01 and pt2[-1] > 0.01: image = cv2.line( @@ -191,12 +201,13 @@ def plot_keypoints_auto(img, points, pid, vis_conf=False, use_limb_color=True, s if c > 0.01: col = get_rgb(pid) if len(points) == 1: - cv2.circle(img, (int(x+0.5), int(y+0.5)), lw*10, col, lw*2) - plot_cross(img, int(x+0.5), int(y+0.5), width=lw*5, col=col, lw=lw*2) + _lw = max(0, int(lw * lw_factor)) + cv2.circle(img, (int(x+0.5), int(y+0.5)), _lw*2, col, lw*2) + plot_cross(img, int(x+0.5), int(y+0.5), width=_lw, col=col, lw=lw*2) else: 
cv2.circle(img, (int(x+0.5), int(y+0.5)), lw*2, col, -1) if vis_conf: - cv2.putText(img, '{:.1f}'.format(c), (int(x), int(y)), cv2.FONT_HERSHEY_SIMPLEX, 1, col, 2) + cv2.putText(img, '{:.1f}'.format(c), (int(x), int(y)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, col, 2) def plot_keypoints_total(img, annots, scale, pid_offset=0): _lw = img.shape[0] // 150 diff --git a/myeasymocap/backbone/basetopdown.py b/myeasymocap/backbone/basetopdown.py index 5ff6b1d..e85e2a8 100644 --- a/myeasymocap/backbone/basetopdown.py +++ b/myeasymocap/backbone/basetopdown.py @@ -238,4 +238,11 @@ def get_preds_from_heatmaps(batch_heatmaps): coords[n][p] += np.sign(diff) * .25 coords = coords.astype(np.float32) * 4 pred = np.dstack((coords, maxvals)) - return pred \ No newline at end of file + return pred + +def gdown_models(ckpt, url): + print('Try to download model from {} to {}'.format(url, ckpt)) + os.makedirs(os.path.dirname(ckpt), exist_ok=True) + cmd = 'gdown "{}" -O {}'.format(url, ckpt) + print('\n', cmd, '\n') + os.system(cmd) \ No newline at end of file diff --git a/myeasymocap/backbone/hand2d/__init__.py b/myeasymocap/backbone/hand2d/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/myeasymocap/backbone/hand2d/hand2d.py b/myeasymocap/backbone/hand2d/hand2d.py new file mode 100644 index 0000000..4cb5ae6 --- /dev/null +++ b/myeasymocap/backbone/hand2d/hand2d.py @@ -0,0 +1,89 @@ +import os +import cv2 +import torch +import torch.nn as nn +import numpy as np +import math +# https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_rhd2d_256x256-95b20dd8_20210330.pth +# https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_onehand10k_256x256_dark-a2f80c64_20210330.pth +from .hrnet import PoseHighResolutionNet +from ..basetopdown import BaseTopDownModelCache, get_preds_from_heatmaps, gdown_models + +class TopDownAsMMPose(nn.Module): + def __init__(self, backbone, head): + super().__init__() + self.bacbone = backbone + self.head = head + + def forward(self, x): + feat_list = self.bacbone(x) + size = feat_list[0].shape[-2:] + resized_inputs = [ + nn.functional.interpolate(feat, size, mode='bilinear', align_corners=False) \ + for feat in feat_list + ] + resized_inputs = torch.cat(resized_inputs, 1) + out = self.head(resized_inputs) + pred = get_preds_from_heatmaps(out.detach().cpu().numpy()) + return {'keypoints': pred} + +class MyHand2D(BaseTopDownModelCache): + def __init__(self, ckpt, url=None, mode='hrnet'): + if mode == 'hrnet': + super().__init__(name='hand2d', bbox_scale=1.1, res_input=256) + backbone = PoseHighResolutionNet(inp_ch=3, out_ch=21, W=18, multi_scale_final=True, add_final_layer=False) + checkpoint = torch.load(ckpt, map_location='cpu')['state_dict'] + self.load_checkpoint(backbone, checkpoint, prefix='backbone.', strict=True) + head = nn.Sequential( + nn.Conv2d(270, 270, kernel_size=1), + nn.BatchNorm2d(270), + nn.ReLU(inplace=True), + nn.Conv2d(270, 21, kernel_size=1) + ) + self.load_checkpoint(head, checkpoint, prefix='keypoint_head.final_layer.', strict=True) + # self.model = nn.Sequential(backbone, head) + self.model = TopDownAsMMPose(backbone, head) + elif mode == 'resnet': + super().__init__(name='hand2d', bbox_scale=1.1, res_input=256, mean=[0., 0., 0.], std=[1., 1., 1.]) + from .resnet import ResNet_Deconv + if not os.path.exists(ckpt) and url is not None: + gdown_models(ckpt, url) + assert os.path.exists(ckpt), f'{ckpt} not exists' + checkpoint = torch.load(ckpt, map_location='cpu')['state_dict'] + model = ResNet_Deconv() + self.load_checkpoint(model, checkpoint, 
prefix='model.', strict=True) + self.model = model + self.model.eval() + self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + self.model.to(self.device) + + def __call__(self, bbox, images, imgnames): + squeeze = False + if not isinstance(images, list): + images = [images] + imgnames = [imgnames] + bbox = [bbox] + squeeze = True + nViews = len(images) + kpts_all = [] + for nv in range(nViews): + if bbox[nv].shape[0] == 0: + kpts_all.append(np.zeros((21, 3))) + continue + _bbox = bbox[nv] + if len(_bbox.shape) == 1: + _bbox = _bbox[None] + output = super().__call__(_bbox, images[nv], imgnames[nv]) + kpts = output['params']['keypoints'] + conf = kpts[..., -1:] + kpts = self.batch_affine_transform(kpts, output['params']['inv_trans']) + kpts = np.concatenate([kpts, conf], axis=-1) + if len(kpts.shape) == 3: + kpts = kpts[0] + kpts_all.append(kpts) + kpts_all = np.stack(kpts_all) + if squeeze: + kpts_all = kpts_all[0] + return { + 'keypoints': kpts_all + } \ No newline at end of file diff --git a/myeasymocap/backbone/hand2d/resnet.py b/myeasymocap/backbone/hand2d/resnet.py new file mode 100644 index 0000000..343f332 --- /dev/null +++ b/myeasymocap/backbone/hand2d/resnet.py @@ -0,0 +1,161 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# + +import os +import os.path as osp +import torch +import torch.nn as nn +from torchvision.models.resnet import BasicBlock, Bottleneck +from torchvision.models.resnet import model_urls +from ..basetopdown import get_preds_from_heatmaps + +def make_conv_layers(feat_dims, kernel=3, stride=1, padding=1, bnrelu_final=True): + layers = [] + for i in range(len(feat_dims)-1): + layers.append( + nn.Conv2d( + in_channels=feat_dims[i], + out_channels=feat_dims[i+1], + kernel_size=kernel, + stride=stride, + padding=padding + )) + # Do not use BN and ReLU for final estimation + if i < len(feat_dims)-2 or (i == len(feat_dims)-2 and bnrelu_final): + layers.append(nn.BatchNorm2d(feat_dims[i+1])) + layers.append(nn.ReLU(inplace=True)) + + return nn.Sequential(*layers) + +def make_deconv_layers(feat_dims, bnrelu_final=True): + layers = [] + for i in range(len(feat_dims)-1): + layers.append( + nn.ConvTranspose2d( + in_channels=feat_dims[i], + out_channels=feat_dims[i+1], + kernel_size=4, + stride=2, + padding=1, + output_padding=0, + bias=False)) + + # Do not use BN and ReLU for final estimation + if i < len(feat_dims)-2 or (i == len(feat_dims)-2 and bnrelu_final): + layers.append(nn.BatchNorm2d(feat_dims[i+1])) + layers.append(nn.ReLU(inplace=True)) + + return nn.Sequential(*layers) + + +class ResNetBackbone(nn.Module): + + def __init__(self, resnet_type): + + resnet_spec = {18: (BasicBlock, [2, 2, 2, 2], [64, 64, 128, 256, 512], 'resnet18'), + 34: (BasicBlock, [3, 4, 6, 3], [64, 64, 128, 256, 512], 'resnet34'), + 50: (Bottleneck, [3, 4, 6, 3], [64, 256, 512, 1024, 2048], 'resnet50'), + 101: (Bottleneck, [3, 4, 23, 3], [64, 256, 512, 1024, 2048], 'resnet101'), + 152: (Bottleneck, [3, 8, 36, 3], [64, 256, 512, 1024, 2048], 'resnet152')} + block, layers, channels, name = resnet_spec[resnet_type] + + self.name = name + self.inplanes = 64 + super(ResNetBackbone, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, + bias=False) # RGB + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, 
padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + nn.init.normal_(m.weight, mean=0, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + return x + + def init_weights(self): + org_resnet = torch.utils.model_zoo.load_url(model_urls[self.name]) + # drop orginal resnet fc layer, add 'None' in case of no fc layer, that will raise error + org_resnet.pop('fc.weight', None) + org_resnet.pop('fc.bias', None) + + self.load_state_dict(org_resnet) + print("Initialize resnet from model zoo") + +class ResNet_Deconv(nn.Module): + def __init__(self): + super().__init__() + self.hm2d_size = 64 + + self.resnet = ResNetBackbone(50) + self.deconv = make_deconv_layers([2048, 256, 256, 256]) + self.conv_hm2d = make_conv_layers([256, 21],kernel=1,stride=1,padding=0,bnrelu_final=False) + + self.resnet.init_weights() + self.deconv.apply(self.init_weights) + self.conv_hm2d.apply(self.init_weights) + + @staticmethod + def init_weights(m): + if type(m) == nn.ConvTranspose2d: + nn.init.normal_(m.weight,std=0.001) + elif type(m) == nn.Conv2d: + nn.init.normal_(m.weight,std=0.001) + nn.init.constant_(m.bias, 0) + elif type(m) == nn.BatchNorm2d: + nn.init.constant_(m.weight,1) + nn.init.constant_(m.bias,0) + elif type(m) == nn.Linear: + nn.init.normal_(m.weight,std=0.01) + nn.init.constant_(m.bias,0) + + def forward(self, img): + x_feat = self.resnet(img) + x_feat = self.deconv(x_feat) + + x_hm2d = self.conv_hm2d(x_feat) + pred = get_preds_from_heatmaps(x_hm2d.detach().cpu().numpy()) + return { + 'keypoints': pred + } \ No newline at end of file diff --git a/myeasymocap/backbone/hmr/__init__.py b/myeasymocap/backbone/hmr/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/myeasymocap/backbone/hmr/hmr.py b/myeasymocap/backbone/hmr/hmr.py new file mode 100644 index 0000000..5d5df32 --- /dev/null +++ b/myeasymocap/backbone/hmr/hmr.py @@ -0,0 +1,35 @@ +import os +import numpy as np +import torch +from ..basetopdown import BaseTopDownModelCache, gdown_models +import pickle +from .models import hmr + +class MyHMR(BaseTopDownModelCache): + def __init__(self, ckpt, url=None): + super().__init__('handhmr', bbox_scale=1., res_input=224) + self.model = hmr() + self.model.eval() + if not os.path.exists(ckpt) and url is not None: + gdown_models(ckpt, url) + assert os.path.exists(ckpt), f'{ckpt} not exists' + checkpoint = torch.load(ckpt) + self.device = torch.device('cuda') if 
torch.cuda.is_available() else torch.device('cpu') + state_dict = checkpoint['state_dict'] + prefix = 'model.' + self.load_checkpoint(self.model, state_dict, prefix, strict=True) + self.model.to(self.device) + + def __call__(self, bbox, images, imgnames): + output = super().__call__(bbox, images, imgnames) + Rh = output['params']['poses'][:3].copy() + poses = output['params']['poses'][3:] + Th = np.zeros_like(Rh) + Th[2] = 1. + output['params'] = { + 'Rh': Rh, + 'Th': Th, + 'poses': poses, + 'shapes': output['params']['shapes'], + } + return output \ No newline at end of file diff --git a/myeasymocap/backbone/hmr/hmr_api.py b/myeasymocap/backbone/hmr/hmr_api.py new file mode 100644 index 0000000..0e5a167 --- /dev/null +++ b/myeasymocap/backbone/hmr/hmr_api.py @@ -0,0 +1,256 @@ +''' +Date: 2021-10-25 11:51:37 am +Author: dihuangdh +Descriptions: +----- +LastEditTime: 2021-10-25 1:50:40 pm +LastEditors: dihuangdh +''' + +import torch +from torchvision.transforms import Normalize +import numpy as np +import cv2 + +from .models import hmr + + +class constants: + FOCAL_LENGTH = 5000. + IMG_RES = 224 + + # Mean and standard deviation for normalizing input image + IMG_NORM_MEAN = [0.485, 0.456, 0.406] + IMG_NORM_STD = [0.229, 0.224, 0.225] + +def get_transform(center, scale, res, rot=0): + """Generate transformation matrix.""" + h = 200 * scale + t = np.zeros((3, 3)) + t[0, 0] = float(res[1]) / h + t[1, 1] = float(res[0]) / h + t[0, 2] = res[1] * (-float(center[0]) / h + .5) + t[1, 2] = res[0] * (-float(center[1]) / h + .5) + t[2, 2] = 1 + if not rot == 0: + rot = -rot # To match direction of rotation from cropping + rot_mat = np.zeros((3,3)) + rot_rad = rot * np.pi / 180 + sn,cs = np.sin(rot_rad), np.cos(rot_rad) + rot_mat[0,:2] = [cs, -sn] + rot_mat[1,:2] = [sn, cs] + rot_mat[2,2] = 1 + # Need to rotate around center + t_mat = np.eye(3) + t_mat[0,2] = -res[1]/2 + t_mat[1,2] = -res[0]/2 + t_inv = t_mat.copy() + t_inv[:2,2] *= -1 + t = np.dot(t_inv,np.dot(rot_mat,np.dot(t_mat,t))) + return t + +def transform(pt, center, scale, res, invert=0, rot=0): + """Transform pixel location to different reference.""" + t = get_transform(center, scale, res, rot=rot) + if invert: + t = np.linalg.inv(t) + new_pt = np.array([pt[0]-1, pt[1]-1, 1.]).T + new_pt = np.dot(t, new_pt) + return new_pt[:2].astype(int)+1 + +def crop(img, center, scale, res, rot=0, bias=0): + """Crop image according to the supplied bounding box.""" + # Upper left point + ul = np.array(transform([1, 1], center, scale, res, invert=1))-1 + # Bottom right point + br = np.array(transform([res[0]+1, + res[1]+1], center, scale, res, invert=1))-1 + + # Padding so that when rotated proper amount of context is included + pad = int(np.linalg.norm(br - ul) / 2 - float(br[1] - ul[1]) / 2) + if not rot == 0: + ul -= pad + br += pad + + new_shape = [br[1] - ul[1], br[0] - ul[0]] + if len(img.shape) > 2: + new_shape += [img.shape[2]] + new_img = np.zeros(new_shape) + bias + + # Range to fill new array + new_x = max(0, -ul[0]), min(br[0], len(img[0])) - ul[0] + new_y = max(0, -ul[1]), min(br[1], len(img)) - ul[1] + # Range to sample from original image + old_x = max(0, ul[0]), min(len(img[0]), br[0]) + old_y = max(0, ul[1]), min(len(img), br[1]) + new_img[new_y[0]:new_y[1], new_x[0]:new_x[1]] = img[old_y[0]:old_y[1], + old_x[0]:old_x[1]] + + if not rot == 0: + # Remove padding + new_img = scipy.misc.imrotate(new_img, rot) + new_img = new_img[pad:-pad, pad:-pad] + new_img = cv2.resize(new_img, (res[0], res[1])) + return new_img + +def process_image(img, 
bbox, input_res=224): + """Read image, do preprocessing and possibly crop it according to the bounding box. + If there are bounding box annotations, use them to crop the image. + If no bounding box is specified but openpose detections are available, use them to get the bounding box. + """ + img = img[:, :, ::-1].copy() + normalize_img = Normalize(mean=constants.IMG_NORM_MEAN, std=constants.IMG_NORM_STD) + l, t, r, b = bbox[:4] + center = [(l+r)/2, (t+b)/2] + width = max(r-l, b-t) + scale = width/200.0 + img = crop(img, center, scale, (input_res, input_res)) + img = img.astype(np.float32) / 255. + img = torch.from_numpy(img).permute(2,0,1) + norm_img = normalize_img(img.clone())[None] + return img, norm_img + +def solve_translation(X, x, K): + A = np.zeros((2*X.shape[0], 3)) + b = np.zeros((2*X.shape[0], 1)) + fx, fy = K[0, 0], K[1, 1] + cx, cy = K[0, 2], K[1, 2] + for nj in range(X.shape[0]): + A[2*nj, 0] = 1 + A[2*nj + 1, 1] = 1 + A[2*nj, 2] = -(x[nj, 0] - cx)/fx + A[2*nj+1, 2] = -(x[nj, 1] - cy)/fy + b[2*nj, 0] = X[nj, 2]*(x[nj, 0] - cx)/fx - X[nj, 0] + b[2*nj+1, 0] = X[nj, 2]*(x[nj, 1] - cy)/fy - X[nj, 1] + A[2*nj:2*nj+2, :] *= x[nj, 2] + b[2*nj:2*nj+2, :] *= x[nj, 2] + trans = np.linalg.inv(A.T @ A) @ A.T @ b + return trans.T[0] + +def estimate_translation_np(S, joints_2d, joints_conf, K): + """Find camera translation that brings 3D joints S closest to 2D the corresponding joints_2d. + Input: + S: (25, 3) 3D joint locations + joints: (25, 3) 2D joint locations and confidence + Returns: + (3,) camera translation vector + """ + num_joints = S.shape[0] + # focal length + f = np.array([K[0, 0], K[1, 1]]) + # optical center + center = np.array([K[0, 2], K[1, 2]]) + + # transformations + Z = np.reshape(np.tile(S[:,2],(2,1)).T,-1) + XY = np.reshape(S[:,0:2],-1) + O = np.tile(center,num_joints) + F = np.tile(f,num_joints) + weight2 = np.reshape(np.tile(np.sqrt(joints_conf),(2,1)).T,-1) + + # least squares + Q = np.array([F*np.tile(np.array([1,0]),num_joints), F*np.tile(np.array([0,1]),num_joints), O-np.reshape(joints_2d,-1)]).T + c = (np.reshape(joints_2d,-1)-O)*Z - F*XY + + # weighted least squares + W = np.diagflat(weight2) + Q = np.dot(W,Q) + c = np.dot(W,c) + + # square matrix + A = np.dot(Q.T,Q) + b = np.dot(Q.T,c) + + # solution + trans = np.linalg.solve(A, b) + + return trans + + +class HMR: + def __init__(self, checkpoint, device) -> None: + model = hmr().to(device) + checkpoint = torch.load(checkpoint) + state_dict = checkpoint['state_dict'] + # update state_dict, remove 'model.' 
+ for key in list(state_dict.keys()): + state_dict[key[6:]] = state_dict.pop(key) + model.load_state_dict(state_dict, strict=False) + # Load SMPL model + model.eval() + self.model = model + self.device = device + + def forward(self, img, bbox, use_rh_th=True): + # Preprocess input image and generate predictions + img, norm_img = process_image(img, bbox, input_res=constants.IMG_RES) + with torch.no_grad(): + pred_rotmat, pred_betas, pred_camera = self.model(norm_img.to(self.device)) + results = { + 'shapes': pred_betas.detach().cpu().numpy() + } + results['poses'] = pred_rotmat.detach().cpu().numpy() + if use_rh_th: + body_params = { + 'poses': results['poses'], + 'shapes': results['shapes'], + 'Rh': results['poses'][:, :3].copy(), + 'Th': np.zeros((1, 3)), + } + body_params['Th'][0, 2] = 5 + body_params['poses'][:, :3] = 0 + results = body_params + return results + + def __call__(self, body_model, img, bbox, kpts, camera, ret_vertices=True): + body_params = self.forward(img.copy(), bbox) + body_params = body_model.check_params(body_params) + # only use body joints to estimation translation + nJoints = 21 + keypoints3d = body_model(return_verts=False, return_tensor=False, **body_params)[0] + trans = solve_translation(keypoints3d[:nJoints], kpts[:nJoints], camera['K']) + body_params['Th'] += trans[None, :] + if body_params['Th'][0, 2] < 0: + body_params['Th'] = -body_params['Th'] + Rhold = cv2.Rodrigues(body_params['Rh'])[0] + rotx = cv2.Rodrigues(np.pi*np.array([1., 0, 0]))[0] + Rhold = rotx @ Rhold + body_params['Rh'] = cv2.Rodrigues(Rhold)[0].reshape(1, 3) + # convert to world coordinate + Rhold = cv2.Rodrigues(body_params['Rh'])[0] + Thold = body_params['Th'] + Rh = camera['R'].T @ Rhold + Th = (camera['R'].T @ (Thold.T - camera['T'])).T + body_params['Th'] = Th + body_params['Rh'] = cv2.Rodrigues(Rh)[0].reshape(1, 3) + keypoints3d = body_model(return_verts=False, return_tensor=False, **body_params)[0] + results = {'body_params': body_params, 'keypoints3d': keypoints3d} + if ret_vertices: + vertices = body_model(return_verts=True, return_tensor=False, **body_params)[0] + results['vertices'] = vertices + return results + +def init_with_hmr(body_model, spin_model, img, bbox, kpts, camera): + body_params = spin_model.forward(img.copy(), bbox) + body_params = body_model.check_params(body_params) + # only use body joints to estimation translation + nJoints = 15 + keypoints3d = body_model(return_verts=False, return_tensor=False, **body_params)[0] + trans = estimate_translation_np(keypoints3d[:nJoints], kpts[:nJoints, :2], kpts[:nJoints, 2], camera['K']) + body_params['Th'] += trans[None, :] + # convert to world coordinate + Rhold = cv2.Rodrigues(body_params['Rh'])[0] + Thold = body_params['Th'] + Rh = camera['R'].T @ Rhold + Th = (camera['R'].T @ (Thold.T - camera['T'])).T + body_params['Th'] = Th + body_params['Rh'] = cv2.Rodrigues(Rh)[0].reshape(1, 3) + vertices = body_model(return_verts=True, return_tensor=False, **body_params)[0] + keypoints3d = body_model(return_verts=False, return_tensor=False, **body_params)[0] + results = {'body_params': body_params, 'vertices': vertices, 'keypoints3d': keypoints3d} + return results + +class + +if __name__ == '__main__': + pass \ No newline at end of file diff --git a/myeasymocap/backbone/hmr/models.py b/myeasymocap/backbone/hmr/models.py new file mode 100644 index 0000000..6dba947 --- /dev/null +++ b/myeasymocap/backbone/hmr/models.py @@ -0,0 +1,174 @@ +''' +Date: 2021-10-25 11:51:29 am +Author: dihuangdh +Descriptions: +----- +LastEditTime: 
2021-10-25 11:51:58 am +LastEditors: dihuangdh +''' + +import torch +import torch.nn as nn +import torchvision.models.resnet as resnet +import numpy as np +import math + + +class Bottleneck(nn.Module): + """ Redefinition of Bottleneck residual block + Adapted from the official PyTorch implementation + """ + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + +class HMR(nn.Module): + """ SMPL Iterative Regressor with ResNet50 backbone + """ + + def __init__(self, block, layers): + self.inplanes = 64 + super(HMR, self).__init__() + npose = 3 + 45 + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + self.avgpool = nn.AvgPool2d(7, stride=1) + self.fc1 = nn.Linear(512 * block.expansion + npose + 13, 1024) + self.drop1 = nn.Dropout() + self.fc2 = nn.Linear(1024, 1024) + self.drop2 = nn.Dropout() + self.decpose = nn.Linear(1024, npose) + self.decshape = nn.Linear(1024, 10) + self.deccam = nn.Linear(1024, 3) + nn.init.xavier_uniform_(self.decpose.weight, gain=0.01) + nn.init.xavier_uniform_(self.decshape.weight, gain=0.01) + nn.init.xavier_uniform_(self.deccam.weight, gain=0.01) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + init_pose = torch.zeros(npose).unsqueeze(0) + init_shape = torch.zeros(10).unsqueeze(0) + init_cam = torch.zeros(3).unsqueeze(0) + self.register_buffer('init_pose', init_pose) + self.register_buffer('init_shape', init_shape) + self.register_buffer('init_cam', init_cam) + + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + + def forward(self, x, init_pose=None, init_shape=None, init_cam=None, n_iter=3): + + batch_size = x.shape[0] + + if init_pose is None: + init_pose = self.init_pose.expand(batch_size, -1) + if init_shape is None: + init_shape = self.init_shape.expand(batch_size, -1) + if init_cam is None: + init_cam = self.init_cam.expand(batch_size, -1) + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x1 = self.layer1(x) + x2 = self.layer2(x1) + x3 = self.layer3(x2) + x4 = self.layer4(x3) + + xf = self.avgpool(x4) + xf = xf.view(xf.size(0), -1) + + pred_pose = init_pose + pred_shape = init_shape + pred_cam = init_cam + for i in range(n_iter): + xc = torch.cat([xf, pred_pose, pred_shape, pred_cam],1) + xc = self.fc1(xc) + xc = self.drop1(xc) + xc = self.fc2(xc) + xc = self.drop2(xc) + pred_pose = self.decpose(xc) + pred_pose + pred_shape = self.decshape(xc) + pred_shape + pred_cam = self.deccam(xc) + pred_cam + + # pred_rotmat = rot6d_to_rotmat(pred_pose).view(batch_size, 24, 3, 3) + return { + 'poses': pred_pose, + 'shapes': pred_shape, + 'cam': pred_cam + } + +def hmr(pretrained=True, **kwargs): + """ Constructs an HMR model with ResNet50 backbone. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = HMR(Bottleneck, [3, 4, 6, 3], **kwargs) + if pretrained: + resnet_imagenet = resnet.resnet50(pretrained=True) + model.load_state_dict(resnet_imagenet.state_dict(),strict=False) + return model \ No newline at end of file
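
Usage note on the new checkpoints: both configs now pair a local ckpt path with a Google Drive id (url), and MyHand2D / MyHMR download the file on first use through the gdown_models helper added to myeasymocap/backbone/basetopdown.py. A minimal sketch of that path, assuming the gdown CLI is installed:

    import os

    def gdown_models(ckpt, url):
        # same logic as the helper in basetopdown.py: fetch the Drive file id into the checkpoint path
        os.makedirs(os.path.dirname(ckpt), exist_ok=True)
        os.system('gdown "{}" -O {}'.format(url, ckpt))

    ckpt = 'models/hand_resnet_kp2d_clean.pt'        # values from config/1v1p/fixhand.yml
    url = '1LTK7e9oAS6B3drmQyXwTZild6k87fEZa'
    if not os.path.exists(ckpt):                     # the backbones only download when the file is missing
        gdown_models(ckpt, url)
    assert os.path.exists(ckpt), f'{ckpt} not exists'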
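Usage note on the camera I/O changes: FileStorage gains a dt='int' branch, so write_camera/read_camera now round-trip the per-camera image size as H_{cam}/W_{cam}, read_camera falls back to -1 when they are absent, and the distortion is also looked up under D_{cam} when dist_{cam} is missing. A small sketch with made-up calibration values (the output directory is hypothetical):

    import numpy as np
    from easymocap.mytools.camera_utils import read_camera, write_camera

    cameras = {
        '01': {
            'K': np.array([[900., 0., 960.], [0., 900., 540.], [0., 0., 1.]]),
            'dist': np.zeros((1, 5)),
            'R': np.eye(3),
            'T': np.zeros((3, 1)),
            'H': 1080, 'W': 1920,   # new: stored as plain ints via the dt='int' branch
        }
    }
    write_camera(cameras, 'output/calib')                                   # writes intri.yml / extri.yml
    cams = read_camera('output/calib/intri.yml', 'output/calib/extri.yml')
    print(cams['01']['H'], cams['01']['W'])                                 # 1080 1920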
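Usage note on the new Distort class: Distort.bbox maps a box from undistorted image coordinates back into the raw image by normalizing the two corners with K^-1 and reprojecting them through the distortion coefficients (Distort.points is still a stub). A small sketch with made-up intrinsics:

    import numpy as np
    from easymocap.mytools.camera_utils import Distort

    K = np.array([[900., 0., 960.], [0., 900., 540.], [0., 0., 1.]])
    dist = np.array([[-0.3, 0.1, 0., 0., 0.]])
    bbox_undist = np.array([400., 300., 800., 700., 1.0])   # l, t, r, b, conf in the undistorted image
    bbox_raw = Distort.bbox(bbox_undist, K, dist)            # the inverse direction of Undistort.bbox
    print(bbox_raw)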
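Usage note on vis_base: generate_colorbar gains ret_float/ret_array/ret_rgb switches, and plot_keypoints_auto accepts an explicit config_name (plus body15/ochuman skeletons) and an lw_factor for single-point markers. For example:

    from easymocap.mytools.vis_base import generate_colorbar, plot_keypoints_auto

    palette = generate_colorbar(N=10, rand=False, ret_rgb=True, ret_float=True, ret_array=True)
    # (10, 3) float array in RGB order with values in [0, 1], instead of the default BGR uint8 list

    # plot_keypoints_auto(img, keypoints, pid=0, config_name='body15', lw_factor=2)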
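Usage note on the resnet mode of MyHand2D: it wraps ResNet_Deconv, which regresses 21 heatmaps at 64x64 and decodes them with get_preds_from_heatmaps (per-joint argmax, a quarter-pixel shift toward the larger neighbour, then x4 back to the 256 crop). A bare-model sketch, assuming a torchvision old enough to expose model_urls (instantiation also pulls ImageNet ResNet-50 weights):

    import torch
    from myeasymocap.backbone.hand2d.resnet import ResNet_Deconv

    model = ResNet_Deconv().eval()
    crop = torch.zeros(1, 3, 256, 256)   # MyHand2D feeds 256x256 crops with mean=0, std=1 in this mode
    with torch.no_grad():
        out = model(crop)
    print(out['keypoints'].shape)        # (1, 21, 3): x, y in crop pixels plus the heatmap peak as confidence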
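Usage note on the hand HMR regressor: hmr() builds a ResNet-50 trunk with the iterative regression head, where npose = 3 + 45 (global orientation plus 45 MANO pose parameters, matching num_pca_comps: 45 in the configs). A quick shape check:

    import torch
    from myeasymocap.backbone.hmr.models import hmr

    model = hmr(pretrained=False).eval()     # pretrained=True additionally loads ImageNet ResNet-50 weights
    img = torch.zeros(1, 3, 224, 224)        # MyHMR crops to 224x224 (res_input=224)
    with torch.no_grad():
        out = model(img)
    print(out['poses'].shape, out['shapes'].shape, out['cam'].shape)   # (1, 48) (1, 10) (1, 3)
    # MyHMR.__call__ then splits poses into Rh = poses[:3] and the remaining 45 pose values,
    # and initializes Th with z = 1 for the later fitting stages.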
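Usage note on the translation solve in hmr_api.py: solve_translation stacks, for every joint, the two linear equations (x - cx)*(X_z + t_z) = fx*(X_x + t_x) and its y analogue, weighted by the 2D confidence, and solves for the camera translation t. The synthetic check below rebuilds that system directly (confidence weights dropped; it is kept self-contained because hmr_api.py currently ends in a dangling `class` statement that breaks importing the module):

    import numpy as np

    K = np.array([[1000., 0., 500.], [0., 1000., 500.], [0., 0., 1.]])
    X = np.random.rand(21, 3)                      # 3D joints in the model frame
    t_gt = np.array([0.1, -0.05, 2.0])             # translation to recover
    x = (K @ (X + t_gt).T).T
    x = x[:, :2] / x[:, 2:]                        # perfect 2D observations

    fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
    A = np.zeros((2 * len(X), 3))
    b = np.zeros(2 * len(X))
    A[0::2, 0] = 1.0
    A[0::2, 2] = -(x[:, 0] - cx) / fx
    A[1::2, 1] = 1.0
    A[1::2, 2] = -(x[:, 1] - cy) / fy
    b[0::2] = X[:, 2] * (x[:, 0] - cx) / fx - X[:, 0]
    b[1::2] = X[:, 2] * (x[:, 1] - cy) / fy - X[:, 1]
    t_est, *_ = np.linalg.lstsq(A, b, rcond=None)
    print(t_est)                                   # recovers approximately [0.1, -0.05, 2.0]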