diff --git a/apps/preprocess/extract_keypoints.py b/apps/preprocess/extract_keypoints.py
index b315d20..b79a68b 100644
--- a/apps/preprocess/extract_keypoints.py
+++ b/apps/preprocess/extract_keypoints.py
@@ -2,8 +2,8 @@
   @ Date: 2021-08-19 22:06:22
   @ Author: Qing Shuai
   @ LastEditors: Qing Shuai
-  @ LastEditTime: 2021-12-02 21:19:41
-  @ FilePath: /EasyMocap/apps/preprocess/extract_keypoints.py
+  @ LastEditTime: 2022-05-23 22:58:57
+  @ FilePath: /EasyMocapPublic/apps/preprocess/extract_keypoints.py
 '''
 import os
 from os.path import join
@@ -27,6 +27,7 @@ config = {
         'vis': False,
         'ext': '.jpg'
     },
+    'openposecrop': {},
     'feet':{
         'root': '',
         'res': 1,
@@ -78,7 +79,7 @@ config = {
         'min_detection_confidence':0.3,
         'min_tracking_confidence': 0.1,
         'static_image_mode': False,
-    }
+    },
 }
 
 if __name__ == "__main__":
@@ -91,24 +92,58 @@ if __name__ == "__main__":
         help="sub directory name to store the generated annotation files, default to be annots")
     # Detection Control
     parser.add_argument('--mode', type=str, default='openpose', choices=[
-        'openpose', 'feet', 'feetcrop', 'yolo-hrnet', 'yolo', 'hrnet',
+        'openpose', 'feet', 'feetcrop', 'openposecrop',
+        'yolo-hrnet', 'yolo', 'hrnet',
         'mp-pose', 'mp-holistic', 'mp-handl', 'mp-handr', 'mp-face'],
         help="model to extract joints from image")
     # Openpose
     parser.add_argument('--openpose', type=str,
-        default='/media/qing/Project/openpose')
+        default=os.environ.get('openpose', 'openpose'))
     parser.add_argument('--openpose_res', type=float, default=1)
+    parser.add_argument('--gpus', type=int, default=[], nargs='+')
     parser.add_argument('--ext', type=str, default='.jpg')
+    parser.add_argument('--tmp', type=str, default=os.path.abspath('tmp'))
     parser.add_argument('--hand', action='store_true')
     parser.add_argument('--face', action='store_true')
     parser.add_argument('--wild', action='store_true',
         help='remove crowd class of yolo')
+    parser.add_argument('--reverse', action='store_true')
     parser.add_argument('--force', action='store_true')
     args = parser.parse_args()
     config['yolo']['isWild'] = args.wild
     mode = args.mode
+    if not os.path.exists(join(args.path, 'images')) and os.path.exists(join(args.path, 'videos')):
+        # extract images from the videos by default
+        cmd = f'''python3 apps/preprocess/extract_image.py {args.path}'''
+        os.system(cmd)
     subs = load_subs(args.path, args.subs)
+    if len(args.gpus) != 0:
+        # run multiple worker processes via run_cmd, one per GPU
+        from easymocap.mytools.debug_utils import run_cmd
+        nproc = len(args.gpus)
+        plist = []
+        for i in range(nproc):
+            if len(subs[i::nproc]) == 0:
+                continue
+            cmd = f'export CUDA_VISIBLE_DEVICES={args.gpus[i]} && python3 apps/preprocess/extract_keypoints.py {args.path} --mode {args.mode} --subs {" ".join(subs[i::nproc])}'
+            cmd += f' --annot {args.annot} --ext {args.ext}'
+            if args.hand:
+                cmd += ' --hand'
+            if args.face:
+                cmd += ' --face'
+            if args.force:
+                cmd += ' --force'
+            cmd += f' --tmp {args.tmp}'
+            cmd += ' &'
+            print(cmd)
+            p = run_cmd(cmd, bg=False)
+            plist.extend(p)
+        for p in plist:
+            p.join()
+        exit()
     global_tasks = []
+    if len(subs) == 0:
+        subs = ['']
     for sub in subs:
         config[mode]['force'] = args.force
         image_root = join(args.path, 'images', sub)
@@ -116,7 +151,7 @@ if __name__ == "__main__":
         tmp_root = join(args.path, mode, sub)
         if os.path.exists(annot_root) and not args.force:
             # check the number of annots and images
-            if len(os.listdir(image_root)) == len(os.listdir(annot_root)):
+            if len(os.listdir(image_root)) == len(os.listdir(annot_root)) and mode != 'feetcrop':
                 print('[Skip] detection {}'.format(annot_root))
                 continue
         if mode == 'openpose':
@@ -132,10 +167,11 @@ if __name__ == "__main__":
             config[mode]['openpose'] = args.openpose
             estimator = FeetEstimator(openpose=args.openpose)
             estimator.detect_foot(image_root, annot_root, args.ext)
-        elif mode == 'yolo':
+        elif mode == 'yolo' or mode == 'openposecrop':
             from easymocap.estimator.yolohrnet_wrapper import extract_bbox
-            config[mode]['ext'] = args.ext
-            extract_bbox(image_root, annot_root, **config[mode])
+            config['yolo']['force'] = False # do not re-detect
+            config['yolo']['ext'] = args.ext
+            extract_bbox(image_root, annot_root, **config['yolo'])
         elif mode == 'hrnet':
             from easymocap.estimator.yolohrnet_wrapper import extract_hrnet
             config[mode]['ext'] = args.ext
@@ -147,10 +183,15 @@ if __name__ == "__main__":
             from easymocap.estimator.mediapipe_wrapper import extract_2d
             config[mode]['ext'] = args.ext
             extract_2d(image_root, annot_root, config[mode], mode=mode.replace('mp-', ''))
-        if mode == 'feetcrop':
+        if mode == 'feetcrop' or mode == 'openposecrop':
             from easymocap.estimator.openpose_wrapper import FeetEstimatorByCrop
             config[mode]['openpose'] = args.openpose
-            estimator = FeetEstimatorByCrop(openpose=args.openpose)
+            estimator = FeetEstimatorByCrop(openpose=args.openpose,
+                tmpdir=join(args.tmp, '{}_{}'.format(os.path.basename(args.path), sub)),
+                fullbody=mode=='openposecrop',
+                hand=(mode=='openposecrop') or args.hand,
+                face=args.face)
             estimator.detect_foot(image_root, annot_root, args.ext)
+
     for task in global_tasks:
         task.join()
\ No newline at end of file
diff --git a/easymocap/estimator/SPIN/spin_api.py b/easymocap/estimator/SPIN/spin_api.py
index abde8d6..75d0c37 100644
--- a/easymocap/estimator/SPIN/spin_api.py
+++ b/easymocap/estimator/SPIN/spin_api.py
@@ -2,8 +2,8 @@
   @ Date: 2020-10-23 20:07:49
   @ Author: Qing Shuai
   @ LastEditors: Qing Shuai
-  @ LastEditTime: 2021-03-05 13:43:01
-  @ FilePath: /EasyMocap/code/estimator/SPIN/spin_api.py
+  @ LastEditTime: 2022-07-14 12:44:30
+  @ FilePath: /EasyMocapPublic/easymocap/estimator/SPIN/spin_api.py
 '''
 """
 Demo code
@@ -32,7 +32,6 @@ Running the previous command will save the results in ```examples/im1010_{shape,
 """
 
 import torch
-from torchvision.transforms import Normalize
 import numpy as np
 import cv2
 
@@ -46,6 +45,81 @@ class constants:
     IMG_NORM_MEAN = [0.485, 0.456, 0.406]
     IMG_NORM_STD = [0.229, 0.224, 0.225]
 
+def normalize(tensor, mean, std, inplace: bool = False):
+    """Normalize a tensor image with mean and standard deviation.
+
+    .. note::
+        This transform acts out of place by default, i.e., it does not mutate the input tensor.
+
+    See :class:`~torchvision.transforms.Normalize` for more details.
+
+    Args:
+        tensor (Tensor): Tensor image of size (C, H, W) or (B, C, H, W) to be normalized.
+        mean (sequence): Sequence of means for each channel.
+        std (sequence): Sequence of standard deviations for each channel.
+        inplace(bool, optional): Bool to make this operation inplace.
+
+    Returns:
+        Tensor: Normalized Tensor image.
+    """
+    if not isinstance(tensor, torch.Tensor):
+        raise TypeError('Input tensor should be a torch tensor. Got {}.'.format(type(tensor)))
+
+    if tensor.ndim < 3:
+        raise ValueError('Expected tensor to be a tensor image of size (..., C, H, W). Got tensor.size() = '
+                         '{}.'.format(tensor.size()))
+
+    if not inplace:
+        tensor = tensor.clone()
+
+    dtype = tensor.dtype
+    mean = torch.as_tensor(mean, dtype=dtype, device=tensor.device)
+    std = torch.as_tensor(std, dtype=dtype, device=tensor.device)
+    if (std == 0).any():
+        raise ValueError('std evaluated to zero after conversion to {}, leading to division by zero.'.format(dtype))
+    if mean.ndim == 1:
+        mean = mean.view(-1, 1, 1)
+    if std.ndim == 1:
+        std = std.view(-1, 1, 1)
+    tensor.sub_(mean).div_(std)
+    return tensor
+
+class Normalize(torch.nn.Module):
+    """Normalize a tensor image with mean and standard deviation.
+    Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
+    channels, this transform will normalize each channel of the input
+    ``torch.*Tensor`` i.e.,
+    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``
+
+    .. note::
+        This transform acts out of place, i.e., it does not mutate the input tensor.
+
+    Args:
+        mean (sequence): Sequence of means for each channel.
+        std (sequence): Sequence of standard deviations for each channel.
+        inplace(bool, optional): Bool to make this operation in-place.
+
+    """
+
+    def __init__(self, mean, std, inplace=False):
+        super().__init__()
+        self.mean = mean
+        self.std = std
+        self.inplace = inplace
+
+    def forward(self, tensor):
+        """
+        Args:
+            tensor (Tensor): Tensor image to be normalized.
+
+        Returns:
+            Tensor: Normalized Tensor image.
+        """
+        return normalize(tensor, self.mean, self.std, self.inplace)
+
+    def __repr__(self):
+        return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)
+
 
 def get_transform(center, scale, res, rot=0):
     """Generate transformation matrix."""
@@ -116,7 +190,7 @@ def crop(img, center, scale, res, rot=0, bias=0):
         new_img = new_img[pad:-pad, pad:-pad]
     new_img = cv2.resize(new_img, (res[0], res[1]))
     return new_img
-    
+
 def process_image(img, bbox, input_res=224):
     """Read image, do preprocessing and possibly crop it according to the bounding box.
     If there are bounding box annotations, use them to crop the image.
@@ -134,6 +208,23 @@ def process_image(img, bbox, input_res=224):
     norm_img = normalize_img(img.clone())[None]
     return img, norm_img
 
+def solve_translation(X, x, K):
+    A = np.zeros((2*X.shape[0], 3))
+    b = np.zeros((2*X.shape[0], 1))
+    fx, fy = K[0, 0], K[1, 1]
+    cx, cy = K[0, 2], K[1, 2]
+    for nj in range(X.shape[0]):
+        A[2*nj, 0] = 1
+        A[2*nj + 1, 1] = 1
+        A[2*nj, 2] = -(x[nj, 0] - cx)/fx
+        A[2*nj+1, 2] = -(x[nj, 1] - cy)/fy
+        b[2*nj, 0] = X[nj, 2]*(x[nj, 0] - cx)/fx - X[nj, 0]
+        b[2*nj+1, 0] = X[nj, 2]*(x[nj, 1] - cy)/fy - X[nj, 1]
+        A[2*nj:2*nj+2, :] *= x[nj, 2]
+        b[2*nj:2*nj+2, :] *= x[nj, 2]
+    trans = np.linalg.inv(A.T @ A) @ A.T @ b
+    return trans.T[0]
+
 def estimate_translation_np(S, joints_2d, joints_conf, K):
     """Find camera translation that brings 3D joints S closest to 2D the corresponding joints_2d.
     Input:
@@ -197,16 +288,46 @@ class SPIN:
             p, _ = cv2.Rodrigues(rotmat[i])
             poses[0, 3*i:3*i+3] = p[:, 0]
         results['poses'] = poses
-        if use_rh_th:
-            body_params = {
-                'poses': results['poses'],
-                'shapes': results['shapes'],
-                'Rh': results['poses'][:, :3].copy(),
-                'Th': np.zeros((1, 3)),
-            }
-            body_params['Th'][0, 2] = 5
-            body_params['poses'][:, :3] = 0
-            results = body_params
+        body_params = {
+            'Rh': poses[:, :3],
+            'poses': poses[:, 3:],
+            'shapes': results['shapes'],
+        }
+        results = body_params
         return results
 
+    def __call__(self, body_model, img, bbox, kpts, camera, ret_vertices=True):
+        body_params = self.forward(img.copy(), bbox)
+        # TODO: bug: this encode can introduce errors in the keypoints
+        kpts1 = body_model.keypoints(body_params, return_tensor=False)[0]
+        body_params = body_model.encode(body_params)
+        # only use body joints to estimate the translation
+        nJoints = 15
+        keypoints3d = body_model.keypoints(body_params, return_tensor=False)[0]
+        kpts_diff = np.linalg.norm(kpts1 - keypoints3d, axis=-1).max()
+        # print('Encode and decode error: {}'.format(kpts_diff))
+        trans = solve_translation(keypoints3d[:nJoints], kpts[:nJoints], camera['K'])
+        body_params['Th'] += trans[None, :]
+        if body_params['Th'][0, 2] < 0:
+            print(' [WARN in SPIN] solved a negative position of human {}'.format(body_params['Th'][0]))
+            body_params['Th'] = -body_params['Th']
+            Rhold = cv2.Rodrigues(body_params['Rh'])[0]
+            rotx = cv2.Rodrigues(np.pi*np.array([1., 0, 0]))[0]
+            Rhold = rotx @ Rhold
+            body_params['Rh'] = cv2.Rodrigues(Rhold)[0].reshape(1, 3)
+        # convert to world coordinate
+        if False:
+            Rhold = cv2.Rodrigues(body_params['Rh'])[0]
+            Thold = body_params['Th']
+            Rh = camera['R'].T @ Rhold
+            Th = (camera['R'].T @ (Thold.T - camera['T'])).T
+            body_params['Th'] = Th
+            body_params['Rh'] = cv2.Rodrigues(Rh)[0].reshape(1, 3)
+        keypoints3d = body_model.keypoints(body_params, return_tensor=False)[0]
+        results = {'body_params': body_params, 'keypoints3d': keypoints3d}
+        if ret_vertices:
+            vertices = body_model(return_verts=True, return_tensor=False, **body_params)[0]
+            results['vertices'] = vertices
         return results
 
 def init_with_spin(body_model, spin_model, img, bbox, kpts, camera):
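
A note on the --gpus option added to extract_keypoints.py: it shards the camera subdirectories round-robin across one worker process per GPU (subs[i::nproc]), re-invokes the script per shard with CUDA_VISIBLE_DEVICES set, and joins the workers at the end. A minimal sketch of the sharding, with hypothetical camera names:

    subs = ['cam0', 'cam1', 'cam2', 'cam3', 'cam4']
    gpus = [0, 1]
    nproc = len(gpus)
    for i in range(nproc):
        # worker i handles every nproc-th camera, starting at offset i
        print(gpus[i], subs[i::nproc])
    # prints: 0 ['cam0', 'cam2', 'cam4']
    #         1 ['cam1', 'cam3']

A typical invocation would then be: python3 apps/preprocess/extract_keypoints.py <data> --mode openposecrop --gpus 0 1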
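The normalize/Normalize pair vendored into spin_api.py mirrors torchvision.transforms.Normalize, so the torchvision import can be dropped while process_image keeps working unchanged. A sketch of the expected behavior on a dummy (C, H, W) tensor, the layout the ndim check requires:

    import torch
    norm = Normalize(mean=constants.IMG_NORM_MEAN, std=constants.IMG_NORM_STD)
    img = torch.rand(3, 224, 224)     # dummy image in [0, 1]
    out = norm(img)                   # per-channel (x - mean) / std
    assert not torch.equal(out, img)  # out of place: the input is left untouched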
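The new solve_translation recovers the camera-frame translation t by linearizing the pinhole projection u = fx*(X0+t0)/(X2+t2) + cx, v = fy*(X1+t1)/(X2+t2) + cy into two equations per joint,

    t0 - t2*(u - cx)/fx = X2*(u - cx)/fx - X0
    t1 - t2*(v - cy)/fy = X2*(v - cy)/fy - X1

weighting each row by the 2D confidence x[nj, 2] and solving the normal equations. On noise-free data it should therefore recover t exactly; a synthetic round-trip check under hypothetical intrinsics:

    import numpy as np
    rng = np.random.default_rng(0)
    K = np.array([[1000., 0., 500.], [0., 1000., 500.], [0., 0., 1.]])
    t_gt = np.array([0.1, -0.2, 4.0])       # hypothetical ground-truth translation
    X = 0.3 * rng.standard_normal((15, 3))  # 15 body joints in the root frame
    uvw = (X + t_gt) @ K.T                  # project after translating
    x = np.hstack([uvw[:, :2] / uvw[:, 2:3], np.ones((15, 1))])  # (u, v, conf=1)
    assert np.allclose(solve_translation(X, x, K), t_gt)

SPIN.__call__ adds this translation to Th; when the solved depth comes out negative, negating Th and pre-rotating Rh by pi about the x-axis appears intended to mirror the solution through the camera center so the person ends up in front of the camera.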