🚧 support multi-gpu for keypoints

Qing Shuai 2022-08-21 16:01:18 +08:00
parent 95f15b14d8
commit cd3f184f04
2 changed files with 187 additions and 25 deletions

apps/preprocess/extract_keypoints.py

@@ -2,8 +2,8 @@
   @ Date: 2021-08-19 22:06:22
   @ Author: Qing Shuai
   @ LastEditors: Qing Shuai
-  @ LastEditTime: 2021-12-02 21:19:41
-  @ FilePath: /EasyMocap/apps/preprocess/extract_keypoints.py
+  @ LastEditTime: 2022-05-23 22:58:57
+  @ FilePath: /EasyMocapPublic/apps/preprocess/extract_keypoints.py
 '''
 import os
 from os.path import join
@@ -27,6 +27,7 @@ config = {
         'vis': False,
         'ext': '.jpg'
     },
+    'openposecrop': {},
     'feet':{
         'root': '',
         'res': 1,
@@ -78,7 +79,7 @@ config = {
         'min_detection_confidence':0.3,
         'min_tracking_confidence': 0.1,
         'static_image_mode': False,
-    }
+    },
 }

 if __name__ == "__main__":
@@ -91,24 +92,58 @@ if __name__ == "__main__":
         help="sub directory name to store the generated annotation files, default to be annots")
     # Detection Control
     parser.add_argument('--mode', type=str, default='openpose', choices=[
-        'openpose', 'feet', 'feetcrop', 'yolo-hrnet', 'yolo', 'hrnet',
+        'openpose', 'feet', 'feetcrop', 'openposecrop',
+        'yolo-hrnet', 'yolo', 'hrnet',
         'mp-pose', 'mp-holistic', 'mp-handl', 'mp-handr', 'mp-face'],
         help="model to extract joints from image")
     # Openpose
     parser.add_argument('--openpose', type=str,
-        default='/media/qing/Project/openpose')
+        default=os.environ.get('openpose', 'openpose'))
     parser.add_argument('--openpose_res', type=float, default=1)
+    parser.add_argument('--gpus', type=int, default=[], nargs='+')
     parser.add_argument('--ext', type=str, default='.jpg')
+    parser.add_argument('--tmp', type=str, default=os.path.abspath('tmp'))
     parser.add_argument('--hand', action='store_true')
     parser.add_argument('--face', action='store_true')
     parser.add_argument('--wild', action='store_true',
         help='remove crowd class of yolo')
+    parser.add_argument('--reverse', action='store_true')
     parser.add_argument('--force', action='store_true')
     args = parser.parse_args()
     config['yolo']['isWild'] = args.wild
     mode = args.mode
+    if not os.path.exists(join(args.path, 'images')) and os.path.exists(join(args.path, 'videos')):
+        # images are not extracted yet: extract them from the videos first
+        cmd = f'''python3 apps/preprocess/extract_image.py {args.path}'''
+        os.system(cmd)
     subs = load_subs(args.path, args.subs)
+    if len(args.gpus) != 0:
+        # distribute the subdirectories over the GPUs and run one worker per GPU
+        from easymocap.mytools.debug_utils import run_cmd
+        nproc = len(args.gpus)
+        plist = []
+        for i in range(nproc):
+            if len(subs[i::nproc]) == 0:
+                continue
+            cmd = f'export CUDA_VISIBLE_DEVICES={args.gpus[i]} && python3 apps/preprocess/extract_keypoints.py {args.path} --mode {args.mode} --subs {" ".join(subs[i::nproc])}'
+            cmd += f' --annot {args.annot} --ext {args.ext}'
+            if args.hand:
+                cmd += ' --hand'
+            if args.face:
+                cmd += ' --face'
+            if args.force:
+                cmd += ' --force'
+            cmd += f' --tmp {args.tmp}'
+            cmd += ' &'
+            print(cmd)
+            p = run_cmd(cmd, bg=False)
+            plist.extend(p)
+        for p in plist:
+            p.join()
+        exit()
     global_tasks = []
+    if len(subs) == 0:
+        subs = ['']
     for sub in subs:
         config[mode]['force'] = args.force
         image_root = join(args.path, 'images', sub)
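The scheduling above is plain round-robin: GPU i takes every len(gpus)-th subdirectory via the stride slice subs[i::nproc], re-invokes this script on that shard with CUDA_VISIBLE_DEVICES pinned to a single device, and the parent joins all workers before exiting. A self-contained sketch of the same pattern, with subprocess.Popen standing in for EasyMocap's run_cmd helper (whose exact return type is not shown in this diff):

import subprocess

def shard_over_gpus(subs, gpus, script='apps/preprocess/extract_keypoints.py', path='data'):
    # GPU i processes subs[i], subs[i+n], subs[i+2n], ... with n = len(gpus)
    procs = []
    for i, gpu in enumerate(gpus):
        shard = subs[i::len(gpus)]
        if not shard:
            continue
        cmd = (f'CUDA_VISIBLE_DEVICES={gpu} python3 {script} {path} '
               f'--subs {" ".join(shard)}')
        procs.append(subprocess.Popen(cmd, shell=True))
    for p in procs:
        p.wait()  # block until every shard has finished

# e.g. five views on two GPUs: GPU 0 -> cam0, cam2, cam4; GPU 1 -> cam1, cam3
shard_over_gpus(['cam0', 'cam1', 'cam2', 'cam3', 'cam4'], gpus=[0, 1])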
@@ -116,7 +151,7 @@ if __name__ == "__main__":
         tmp_root = join(args.path, mode, sub)
         if os.path.exists(annot_root) and not args.force:
             # check that the number of annotations matches the number of images
-            if len(os.listdir(image_root)) == len(os.listdir(annot_root)):
+            if len(os.listdir(image_root)) == len(os.listdir(annot_root)) and mode != 'feetcrop':
                 print('[Skip] detection {}'.format(annot_root))
                 continue
         if mode == 'openpose':
@@ -132,10 +167,11 @@ if __name__ == "__main__":
             config[mode]['openpose'] = args.openpose
             estimator = FeetEstimator(openpose=args.openpose)
             estimator.detect_foot(image_root, annot_root, args.ext)
-        elif mode == 'yolo':
+        elif mode == 'yolo' or mode == 'openposecrop':
             from easymocap.estimator.yolohrnet_wrapper import extract_bbox
-            config[mode]['ext'] = args.ext
-            extract_bbox(image_root, annot_root, **config[mode])
+            config['yolo']['force'] = False  # do not re-detect existing boxes
+            config['yolo']['ext'] = args.ext
+            extract_bbox(image_root, annot_root, **config['yolo'])
         elif mode == 'hrnet':
             from easymocap.estimator.yolohrnet_wrapper import extract_hrnet
             config[mode]['ext'] = args.ext
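Design note: 'openposecrop' is a two-stage mode. The branch above first reuses the YOLO detector to obtain person boxes (with 'force' left False, so existing detections are not recomputed); the separate if-branch in the next hunk then runs OpenPose only on the per-person crops.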
@@ -147,10 +183,15 @@ if __name__ == "__main__":
             from easymocap.estimator.mediapipe_wrapper import extract_2d
             config[mode]['ext'] = args.ext
             extract_2d(image_root, annot_root, config[mode], mode=mode.replace('mp-', ''))
-        if mode == 'feetcrop':
+        if mode == 'feetcrop' or mode == 'openposecrop':
             from easymocap.estimator.openpose_wrapper import FeetEstimatorByCrop
             config[mode]['openpose'] = args.openpose
-            estimator = FeetEstimatorByCrop(openpose=args.openpose)
+            estimator = FeetEstimatorByCrop(openpose=args.openpose,
+                tmpdir=join(args.tmp, '{}_{}'.format(os.path.basename(args.path), sub)),
+                fullbody=mode == 'openposecrop',
+                hand=(mode == 'openposecrop') or args.hand,
+                face=args.face)
             estimator.detect_foot(image_root, annot_root, args.ext)
     for task in global_tasks:
         task.join()
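With these changes, a multi-view dataset can be processed across several GPUs in one invocation, e.g. python3 apps/preprocess/extract_keypoints.py <data> --mode openposecrop --gpus 0 1 (a hypothetical invocation composed from the flags added above; <data> stands for a dataset root containing images/<sub> folders or videos). Each worker writes its crops to a per-sequence directory derived from --tmp.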

easymocap/estimator/SPIN/spin_api.py

@@ -2,8 +2,8 @@
   @ Date: 2020-10-23 20:07:49
   @ Author: Qing Shuai
   @ LastEditors: Qing Shuai
-  @ LastEditTime: 2021-03-05 13:43:01
-  @ FilePath: /EasyMocap/code/estimator/SPIN/spin_api.py
+  @ LastEditTime: 2022-07-14 12:44:30
+  @ FilePath: /EasyMocapPublic/easymocap/estimator/SPIN/spin_api.py
 '''
 """
 Demo code
@@ -32,7 +32,6 @@ Running the previous command will save the results in ```examples/im1010_{shape,
 """
 import torch
-from torchvision.transforms import Normalize
 import numpy as np
 import cv2
@@ -46,6 +45,81 @@ class constants:
     IMG_NORM_MEAN = [0.485, 0.456, 0.406]
     IMG_NORM_STD = [0.229, 0.224, 0.225]

+def normalize(tensor, mean, std, inplace: bool = False):
+    """Normalize a tensor image with mean and standard deviation.
+
+    .. note::
+        This transform acts out of place by default, i.e., it does not mutate the input tensor.
+
+    See :class:`~torchvision.transforms.Normalize` for more details.
+
+    Args:
+        tensor (Tensor): Tensor image of size (C, H, W) or (B, C, H, W) to be normalized.
+        mean (sequence): Sequence of means for each channel.
+        std (sequence): Sequence of standard deviations for each channel.
+        inplace(bool,optional): Bool to make this operation inplace.
+
+    Returns:
+        Tensor: Normalized Tensor image.
+    """
+    if not isinstance(tensor, torch.Tensor):
+        raise TypeError('Input tensor should be a torch tensor. Got {}.'.format(type(tensor)))
+    if tensor.ndim < 3:
+        raise ValueError('Expected tensor to be a tensor image of size (..., C, H, W). Got tensor.size() = '
+                         '{}.'.format(tensor.size()))
+    if not inplace:
+        tensor = tensor.clone()
+    dtype = tensor.dtype
+    mean = torch.as_tensor(mean, dtype=dtype, device=tensor.device)
+    std = torch.as_tensor(std, dtype=dtype, device=tensor.device)
+    if (std == 0).any():
+        raise ValueError('std evaluated to zero after conversion to {}, leading to division by zero.'.format(dtype))
+    if mean.ndim == 1:
+        mean = mean.view(-1, 1, 1)
+    if std.ndim == 1:
+        std = std.view(-1, 1, 1)
+    tensor.sub_(mean).div_(std)
+    return tensor
+
+class Normalize(torch.nn.Module):
+    """Normalize a tensor image with mean and standard deviation.
+    Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
+    channels, this transform will normalize each channel of the input
+    ``torch.*Tensor`` i.e.,
+    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``
+
+    .. note::
+        This transform acts out of place, i.e., it does not mutate the input tensor.
+
+    Args:
+        mean (sequence): Sequence of means for each channel.
+        std (sequence): Sequence of standard deviations for each channel.
+        inplace(bool,optional): Bool to make this operation in-place.
+    """
+    def __init__(self, mean, std, inplace=False):
+        super().__init__()
+        self.mean = mean
+        self.std = std
+        self.inplace = inplace
+
+    def forward(self, tensor):
+        """
+        Args:
+            tensor (Tensor): Tensor image to be normalized.
+
+        Returns:
+            Tensor: Normalized Tensor image.
+        """
+        return normalize(tensor, self.mean, self.std, self.inplace)
+
+    def __repr__(self):
+        return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)
+
 def get_transform(center, scale, res, rot=0):
     """Generate transformation matrix."""
@@ -116,7 +190,7 @@ def crop(img, center, scale, res, rot=0, bias=0):
         new_img = new_img[pad:-pad, pad:-pad]
     new_img = cv2.resize(new_img, (res[0], res[1]))
     return new_img

 def process_image(img, bbox, input_res=224):
     """Read image, do preprocessing and possibly crop it according to the bounding box.
     If there are bounding box annotations, use them to crop the image.
@@ -134,6 +208,23 @@ def process_image(img, bbox, input_res=224):
     norm_img = normalize_img(img.clone())[None]
     return img, norm_img

+def solve_translation(X, x, K):
+    A = np.zeros((2*X.shape[0], 3))
+    b = np.zeros((2*X.shape[0], 1))
+    fx, fy = K[0, 0], K[1, 1]
+    cx, cy = K[0, 2], K[1, 2]
+    for nj in range(X.shape[0]):
+        A[2*nj, 0] = 1
+        A[2*nj + 1, 1] = 1
+        A[2*nj, 2] = -(x[nj, 0] - cx)/fx
+        A[2*nj+1, 2] = -(x[nj, 1] - cy)/fy
+        b[2*nj, 0] = X[nj, 2]*(x[nj, 0] - cx)/fx - X[nj, 0]
+        b[2*nj+1, 0] = X[nj, 2]*(x[nj, 1] - cy)/fy - X[nj, 1]
+        A[2*nj:2*nj+2, :] *= x[nj, 2]
+        b[2*nj:2*nj+2, :] *= x[nj, 2]
+    trans = np.linalg.inv(A.T @ A) @ A.T @ b
+    return trans.T[0]
+
 def estimate_translation_np(S, joints_2d, joints_conf, K):
     """Find camera translation that brings the 3D joints S closest to the corresponding 2D joints_2d.
     Input:
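The system that solve_translation assembles is the pinhole projection constraint, linearized in the unknown translation t = (t_x, t_y, t_z). For joint j with 3D position (X_j, Y_j, Z_j), detected pixel (u_j, v_j) and confidence c_j, requiring f_x (X_j + t_x) / (Z_j + t_z) + c_x = u_j and rearranging yields rows that are linear in t:

t_x - \frac{u_j - c_x}{f_x}\, t_z = \frac{u_j - c_x}{f_x}\, Z_j - X_j,
\qquad
t_y - \frac{v_j - c_y}{f_y}\, t_z = \frac{v_j - c_y}{f_y}\, Z_j - Y_j

Both rows of joint j are scaled by its confidence c_j = x[nj, 2], and the stacked weighted system A t = b is solved in closed form via the normal equations, t = (A^\top A)^{-1} A^\top b, which is exactly the final np.linalg.inv(A.T @ A) @ A.T @ b line.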
@@ -197,16 +288,46 @@ class SPIN:
             p, _ = cv2.Rodrigues(rotmat[i])
             poses[0, 3*i:3*i+3] = p[:, 0]
         results['poses'] = poses
-        if use_rh_th:
-            body_params = {
-                'poses': results['poses'],
-                'shapes': results['shapes'],
-                'Rh': results['poses'][:, :3].copy(),
-                'Th': np.zeros((1, 3)),
-            }
-            body_params['Th'][0, 2] = 5
-            body_params['poses'][:, :3] = 0
-            results = body_params
+        body_params = {
+            'Rh': poses[:, :3],
+            'poses': poses[:, 3:],
+            'shapes': results['shapes'],
+        }
+        results = body_params
         return results

+    def __call__(self, body_model, img, bbox, kpts, camera, ret_vertices=True):
+        body_params = self.forward(img.copy(), bbox)
+        # TODO: bug: this encode introduces errors in the keypoints
+        kpts1 = body_model.keypoints(body_params, return_tensor=False)[0]
+        body_params = body_model.encode(body_params)
+        # use only the body joints to estimate the translation
+        nJoints = 15
+        keypoints3d = body_model.keypoints(body_params, return_tensor=False)[0]
+        kpts_diff = np.linalg.norm(kpts1 - keypoints3d, axis=-1).max()
+        # print('Encode and decode error: {}'.format(kpts_diff))
+        trans = solve_translation(keypoints3d[:nJoints], kpts[:nJoints], camera['K'])
+        body_params['Th'] += trans[None, :]
+        if body_params['Th'][0, 2] < 0:
+            print(' [WARN in SPIN] solved a negative position of human {}'.format(body_params['Th'][0]))
+            body_params['Th'] = -body_params['Th']
+            Rhold = cv2.Rodrigues(body_params['Rh'])[0]
+            rotx = cv2.Rodrigues(np.pi*np.array([1., 0, 0]))[0]
+            Rhold = rotx @ Rhold
+            body_params['Rh'] = cv2.Rodrigues(Rhold)[0].reshape(1, 3)
+        # convert to world coordinate
+        if False:
+            Rhold = cv2.Rodrigues(body_params['Rh'])[0]
+            Thold = body_params['Th']
+            Rh = camera['R'].T @ Rhold
+            Th = (camera['R'].T @ (Thold.T - camera['T'])).T
+            body_params['Th'] = Th
+            body_params['Rh'] = cv2.Rodrigues(Rh)[0].reshape(1, 3)
+        keypoints3d = body_model.keypoints(body_params, return_tensor=False)[0]
+        results = {'body_params': body_params, 'keypoints3d': keypoints3d}
+        if ret_vertices:
+            vertices = body_model(return_verts=True, return_tensor=False, **body_params)[0]
+            results['vertices'] = vertices
+        return results

 def init_with_spin(body_model, spin_model, img, bbox, kpts, camera):
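As a closing check, the translation solver is easy to verify in isolation on synthetic data. A minimal sketch (not part of this commit; assumes the repository root is on PYTHONPATH so the module path matches the FilePath header above):

import numpy as np
from easymocap.estimator.SPIN.spin_api import solve_translation

np.random.seed(0)
K = np.array([[1000., 0., 320.],
              [0., 1000., 240.],
              [0., 0., 1.]])
X = np.random.randn(15, 3)               # body joints in camera coordinates
t_gt = np.array([0.1, -0.2, 5.0])        # ground-truth translation, in front of the camera
Xc = X + t_gt
x = (K @ (Xc / Xc[:, 2:3]).T).T          # pinhole projection to pixel coordinates
x[:, 2] = 1.0                            # third column doubles as the confidence weight
t = solve_translation(X, x, K)
assert np.allclose(t, t_gt)              # noise-free input -> exact recovery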