🚧 support multi-gpu for keypoints

Qing Shuai 2022-08-21 16:01:18 +08:00
parent 95f15b14d8
commit cd3f184f04
2 changed files with 187 additions and 25 deletions

apps/preprocess/extract_keypoints.py

@@ -2,8 +2,8 @@
@ Date: 2021-08-19 22:06:22
@ Author: Qing Shuai
@ LastEditors: Qing Shuai
@ LastEditTime: 2021-12-02 21:19:41
@ FilePath: /EasyMocap/apps/preprocess/extract_keypoints.py
@ LastEditTime: 2022-05-23 22:58:57
@ FilePath: /EasyMocapPublic/apps/preprocess/extract_keypoints.py
'''
import os
from os.path import join
@@ -27,6 +27,7 @@ config = {
'vis': False,
'ext': '.jpg'
},
'openposecrop': {},
'feet':{
'root': '',
'res': 1,
@@ -78,7 +79,7 @@ config = {
'min_detection_confidence':0.3,
'min_tracking_confidence': 0.1,
'static_image_mode': False,
}
},
}
if __name__ == "__main__":
@@ -91,24 +92,58 @@ if __name__ == "__main__":
help="sub directory name to store the generated annotation files, default to be annots")
# Detection Control
parser.add_argument('--mode', type=str, default='openpose', choices=[
'openpose', 'feet', 'feetcrop', 'yolo-hrnet', 'yolo', 'hrnet',
'openpose', 'feet', 'feetcrop', 'openposecrop',
'yolo-hrnet', 'yolo', 'hrnet',
'mp-pose', 'mp-holistic', 'mp-handl', 'mp-handr', 'mp-face'],
help="model to extract joints from image")
# Openpose
parser.add_argument('--openpose', type=str,
default='/media/qing/Project/openpose')
default=os.environ.get('openpose', 'openpose'))
parser.add_argument('--openpose_res', type=float, default=1)
parser.add_argument('--gpus', type=int, default=[], nargs='+')
parser.add_argument('--ext', type=str, default='.jpg')
parser.add_argument('--tmp', type=str, default=os.path.abspath('tmp'))
parser.add_argument('--hand', action='store_true')
parser.add_argument('--face', action='store_true')
parser.add_argument('--wild', action='store_true',
help='remove the crowd class from yolo detections')
parser.add_argument('--reverse', action='store_true')
parser.add_argument('--force', action='store_true')
args = parser.parse_args()
config['yolo']['isWild'] = args.wild
mode = args.mode
if not os.path.exists(join(args.path, 'images')) and os.path.exists(join(args.path, 'videos')):
# extract images from the videos by default
cmd = f'''python3 apps/preprocess/extract_image.py {args.path}'''
os.system(cmd)
subs = load_subs(args.path, args.subs)
if len(args.gpus) != 0:
# spawn one worker process per GPU via run_cmd
from easymocap.mytools.debug_utils import run_cmd
nproc = len(args.gpus)
plist = []
for i in range(nproc):
if len(subs[i::nproc]) == 0:
continue
cmd = f'export CUDA_VISIBLE_DEVICES={args.gpus[i]} && python3 apps/preprocess/extract_keypoints.py {args.path} --mode {args.mode} --subs {" ".join(subs[i::nproc])}'
cmd += f' --annot {args.annot} --ext {args.ext}'
if args.hand:
cmd += ' --hand'
if args.face:
cmd += ' --face'
if args.force:
cmd += ' --force'
cmd += f' --tmp {args.tmp}'
cmd += ' &'
print(cmd)
p = run_cmd(cmd, bg=False)
plist.extend(p)
for p in plist:
p.join()
exit()
global_tasks = []
if len(subs) == 0:
subs = ['']
for sub in subs:
config[mode]['force'] = args.force
image_root = join(args.path, 'images', sub)
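A note on the new multi-GPU path above: the subdirectories are sharded round-robin across the listed GPUs via subs[i::nproc], and the script re-invokes itself once per GPU with CUDA_VISIBLE_DEVICES pinned to that device. A minimal sketch of the sharding (camera names are illustrative):

    subs = ['cam0', 'cam1', 'cam2', 'cam3', 'cam4']
    gpus = [0, 1]
    nproc = len(gpus)
    for i in range(nproc):
        shard = subs[i::nproc]  # every nproc-th entry, starting at offset i
        if len(shard) == 0:
            continue
        # each spawned worker only sees its assigned GPU
        print('CUDA_VISIBLE_DEVICES={} -> {}'.format(gpus[i], shard))
    # GPU 0 processes cam0, cam2, cam4; GPU 1 processes cam1, cam3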
@@ -116,7 +151,7 @@ if __name__ == "__main__":
tmp_root = join(args.path, mode, sub)
if os.path.exists(annot_root) and not args.force:
# check the number of annots and images
if len(os.listdir(image_root)) == len(os.listdir(annot_root)):
if len(os.listdir(image_root)) == len(os.listdir(annot_root)) and mode != 'feetcrop':
print('[Skip] detection {}'.format(annot_root))
continue
if mode == 'openpose':
@@ -132,10 +167,11 @@ if __name__ == "__main__":
config[mode]['openpose'] = args.openpose
estimator = FeetEstimator(openpose=args.openpose)
estimator.detect_foot(image_root, annot_root, args.ext)
elif mode == 'yolo':
elif mode == 'yolo' or mode == 'openposecrop':
from easymocap.estimator.yolohrnet_wrapper import extract_bbox
config[mode]['ext'] = args.ext
extract_bbox(image_root, annot_root, **config[mode])
config['yolo']['force'] = False # do not redetect
config['yolo']['ext'] = args.ext
extract_bbox(image_root, annot_root, **config['yolo'])
elif mode == 'hrnet':
from easymocap.estimator.yolohrnet_wrapper import extract_hrnet
config[mode]['ext'] = args.ext
@@ -147,10 +183,15 @@ if __name__ == "__main__":
from easymocap.estimator.mediapipe_wrapper import extract_2d
config[mode]['ext'] = args.ext
extract_2d(image_root, annot_root, config[mode], mode=mode.replace('mp-', ''))
if mode == 'feetcrop':
if mode == 'feetcrop' or mode == 'openposecrop':
from easymocap.estimator.openpose_wrapper import FeetEstimatorByCrop
config[mode]['openpose'] = args.openpose
estimator = FeetEstimatorByCrop(openpose=args.openpose)
estimator = FeetEstimatorByCrop(openpose=args.openpose,
tmpdir=join(args.tmp, '{}_{}'.format(os.path.basename(args.path), sub)),
fullbody=mode=='openposecrop',
hand=(mode=='openposecrop')or args.hand,
face=args.face)
estimator.detect_foot(image_root, annot_root, args.ext)
for task in global_tasks:
task.join()
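For reference, the wiring of the two crop modes above: openposecrop first runs the YOLO detector to get person boxes (with config['yolo']['force'] = False, so existing detections are reused) and then runs OpenPose on the crops with full body and hands enabled, while feetcrop only refines the feet. The flags passed to FeetEstimatorByCrop, restated from the call above:

    fullbody = (mode == 'openposecrop')
    hand = (mode == 'openposecrop') or args.hand
    face = args.face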

easymocap/estimator/SPIN/spin_api.py

@@ -2,8 +2,8 @@
@ Date: 2020-10-23 20:07:49
@ Author: Qing Shuai
@ LastEditors: Qing Shuai
@ LastEditTime: 2021-03-05 13:43:01
@ FilePath: /EasyMocap/code/estimator/SPIN/spin_api.py
@ LastEditTime: 2022-07-14 12:44:30
@ FilePath: /EasyMocapPublic/easymocap/estimator/SPIN/spin_api.py
'''
"""
Demo code
@@ -32,7 +32,6 @@ Running the previous command will save the results in ```examples/im1010_{shape,
"""
import torch
from torchvision.transforms import Normalize
import numpy as np
import cv2
@@ -46,6 +45,81 @@ class constants:
IMG_NORM_MEAN = [0.485, 0.456, 0.406]
IMG_NORM_STD = [0.229, 0.224, 0.225]
def normalize(tensor, mean, std, inplace: bool = False):
"""Normalize a tensor image with mean and standard deviation.
.. note::
This transform acts out of place by default, i.e., it does not mutate the input tensor.
See :class:`~torchvision.transforms.Normalize` for more details.
Args:
tensor (Tensor): Tensor image of size (C, H, W) or (B, C, H, W) to be normalized.
mean (sequence): Sequence of means for each channel.
std (sequence): Sequence of standard deviations for each channel.
inplace(bool,optional): Bool to make this operation inplace.
Returns:
Tensor: Normalized Tensor image.
"""
if not isinstance(tensor, torch.Tensor):
raise TypeError('Input tensor should be a torch tensor. Got {}.'.format(type(tensor)))
if tensor.ndim < 3:
raise ValueError('Expected tensor to be a tensor image of size (..., C, H, W). Got tensor.size() = '
'{}.'.format(tensor.size()))
if not inplace:
tensor = tensor.clone()
dtype = tensor.dtype
mean = torch.as_tensor(mean, dtype=dtype, device=tensor.device)
std = torch.as_tensor(std, dtype=dtype, device=tensor.device)
if (std == 0).any():
raise ValueError('std evaluated to zero after conversion to {}, leading to division by zero.'.format(dtype))
if mean.ndim == 1:
mean = mean.view(-1, 1, 1)
if std.ndim == 1:
std = std.view(-1, 1, 1)
tensor.sub_(mean).div_(std)
return tensor
class Normalize(torch.nn.Module):
"""Normalize a tensor image with mean and standard deviation.
Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],...,std[n])`` for ``n``
channels, this transform will normalize each channel of the input
``torch.*Tensor`` i.e.,
``output[channel] = (input[channel] - mean[channel]) / std[channel]``
.. note::
This transform acts out of place, i.e., it does not mutate the input tensor.
Args:
mean (sequence): Sequence of means for each channel.
std (sequence): Sequence of standard deviations for each channel.
inplace(bool,optional): Bool to make this operation in-place.
"""
def __init__(self, mean, std, inplace=False):
super().__init__()
self.mean = mean
self.std = std
self.inplace = inplace
def forward(self, tensor):
"""
Args:
tensor (Tensor): Tensor image to be normalized.
Returns:
Tensor: Normalized Tensor image.
"""
return normalize(tensor, self.mean, self.std, self.inplace)
def __repr__(self):
return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)
def get_transform(center, scale, res, rot=0):
"""Generate transformation matrix."""
@@ -134,6 +208,23 @@ def process_image(img, bbox, input_res=224):
norm_img = normalize_img(img.clone())[None]
return img, norm_img
def solve_translation(X, x, K):
A = np.zeros((2*X.shape[0], 3))
b = np.zeros((2*X.shape[0], 1))
fx, fy = K[0, 0], K[1, 1]
cx, cy = K[0, 2], K[1, 2]
for nj in range(X.shape[0]):
A[2*nj, 0] = 1
A[2*nj + 1, 1] = 1
A[2*nj, 2] = -(x[nj, 0] - cx)/fx
A[2*nj+1, 2] = -(x[nj, 1] - cy)/fy
b[2*nj, 0] = X[nj, 2]*(x[nj, 0] - cx)/fx - X[nj, 0]
b[2*nj+1, 0] = X[nj, 2]*(x[nj, 1] - cy)/fy - X[nj, 1]
A[2*nj:2*nj+2, :] *= x[nj, 2]
b[2*nj:2*nj+2, :] *= x[nj, 2]
trans = np.linalg.inv(A.T @ A) @ A.T @ b
return trans.T[0]
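solve_translation sets up a confidence-weighted linear system from the pinhole model: for each joint, (X_x + t_x)/(X_z + t_z) = (x - c_x)/f_x (and likewise for y) rearranges to the row t_x - ((x - c_x)/f_x) t_z = X_z (x - c_x)/f_x - X_x, with each row pair scaled by the keypoint confidence x[nj, 2]; the normal equations then give t in closed form. A synthetic round-trip check (illustrative only, assuming solve_translation above is in scope):

    import numpy as np
    K = np.array([[500., 0., 320.], [0., 500., 240.], [0., 0., 1.]])
    X = np.random.rand(15, 3) + np.array([0., 0., 2.])  # joints in front of the camera
    t_true = np.array([0.1, -0.2, 3.0])
    proj = (K @ (X + t_true).T).T
    x = np.hstack([proj[:, :2] / proj[:, 2:3], np.ones((15, 1))])  # pixels + conf = 1
    print(solve_translation(X, x, K))  # ~ [0.1, -0.2, 3.0] up to numerical error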
def estimate_translation_np(S, joints_2d, joints_conf, K):
"""Find camera translation that brings 3D joints S closest to 2D the corresponding joints_2d.
Input:
@@ -197,16 +288,46 @@ class SPIN:
p, _ = cv2.Rodrigues(rotmat[i])
poses[0, 3*i:3*i+3] = p[:, 0]
results['poses'] = poses
if use_rh_th:
body_params = {
'poses': results['poses'],
'shapes': results['shapes'],
'Rh': results['poses'][:, :3].copy(),
'Th': np.zeros((1, 3)),
}
body_params['Th'][0, 2] = 5
body_params['poses'][:, :3] = 0
results = body_params
body_params = {
'Rh': poses[:, :3],
'poses': poses[:, 3:],
'shapes': results['shapes'],
}
results = body_params
return results
def __call__(self, body_model, img, bbox, kpts, camera, ret_vertices=True):
body_params = self.forward(img.copy(), bbox)
# TODO: bug: this encode/decode round trip introduces errors in the keypoints
kpts1 = body_model.keypoints(body_params, return_tensor=False)[0]
body_params = body_model.encode(body_params)
# only use the body joints to estimate the translation
nJoints = 15
keypoints3d = body_model.keypoints(body_params, return_tensor=False)[0]
kpts_diff = np.linalg.norm(kpts1 - keypoints3d, axis=-1).max()
# print('Encode and decode error: {}'.format(kpts_diff))
trans = solve_translation(keypoints3d[:nJoints], kpts[:nJoints], camera['K'])
body_params['Th'] += trans[None, :]
if body_params['Th'][0, 2] < 0:
print(' [WARN in SPIN] solved a negative position of human {}'.format(body_params['Th'][0]))
body_params['Th'] = -body_params['Th']
Rhold = cv2.Rodrigues(body_params['Rh'])[0]
rotx = cv2.Rodrigues(np.pi*np.array([1., 0, 0]))[0]
Rhold = rotx @ Rhold
body_params['Rh'] = cv2.Rodrigues(Rhold)[0].reshape(1, 3)
# convert to world coordinates (currently disabled)
if False:
Rhold = cv2.Rodrigues(body_params['Rh'])[0]
Thold = body_params['Th']
Rh = camera['R'].T @ Rhold
Th = (camera['R'].T @ (Thold.T - camera['T'])).T
body_params['Th'] = Th
body_params['Rh'] = cv2.Rodrigues(Rh)[0].reshape(1, 3)
keypoints3d = body_model.keypoints(body_params, return_tensor=False)[0]
results = {'body_params': body_params, 'keypoints3d': keypoints3d}
if ret_vertices:
vertices = body_model(return_verts=True, return_tensor=False, **body_params)[0]
results['vertices'] = vertices
return results
def init_with_spin(body_model, spin_model, img, bbox, kpts, camera):
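For orientation, the new SPIN.__call__ above runs the network, re-encodes the result into EasyMocap's (Rh, Th, poses, shapes) layout, solves the weighted translation against the detected 2D keypoints, and, if the solved depth comes out negative, negates Th and rotates the body 180 degrees about the x-axis to compensate; the world-coordinate conversion is left disabled, so outputs stay in camera coordinates. A hedged usage sketch (the SPIN constructor arguments are not shown in this hunk):

    # assuming spin_model is an initialized SPIN instance
    results = spin_model(body_model, img, bbox, kpts, camera, ret_vertices=False)
    body_params = results['body_params']  # dict with Rh, Th, poses, shapes
    keypoints3d = results['keypoints3d']  # 3D joints in camera coordinates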