From e9d5f061a520fac50b5439470a654be739ba007c Mon Sep 17 00:00:00 2001
From: shuaiqing <s_q@zju.edu.cn>
Date: Mon, 19 Jun 2023 19:12:56 +0800
Subject: [PATCH] :rocket: add mediapipe

---
 myeasymocap/backbone/mediapipe/hand.py | 232 +++++++++++++++++++++++++
 1 file changed, 232 insertions(+)
 create mode 100644 myeasymocap/backbone/mediapipe/hand.py

diff --git a/myeasymocap/backbone/mediapipe/hand.py b/myeasymocap/backbone/mediapipe/hand.py
new file mode 100644
index 0000000..24585ae
--- /dev/null
+++ b/myeasymocap/backbone/mediapipe/hand.py
@@ -0,0 +1,232 @@
+# 2023.06.15
+# https://colab.research.google.com/github/googlesamples/mediapipe/blob/main/examples/hand_landmarker/python/hand_landmarker.ipynb#scrollTo=OMjuVQiDYJKF&uniqifier=1
+# pip install -q mediapipe==0.10.0
+"""Single-hand 2D keypoint detection built on the MediaPipe HandLandmarker task."""
+import os
+import shutil
+import urllib.request
+
+import numpy as np
+import cv2
+
+try:
+    import mediapipe as mp
+    from mediapipe.tasks import python
+    from mediapipe.tasks.python import vision
+except ImportError as err:
+    # Catch only import failures: a bare `except` would also intercept
+    # KeyboardInterrupt/SystemExit and hide the original cause.
+    print('Please install the mediapipe by\npip install -q mediapipe==0.10.0')
+    raise ModuleNotFoundError('mediapipe') from err
+
+VisionRunningMode = mp.tasks.vision.RunningMode
+
+
+def bbox_from_keypoints(keypoints, rescale=1.2, detection_thresh=0.05, MIN_PIXEL=5):
+    """Compute a padded bounding box around the confident keypoints.
+
+    Args:
+        keypoints: 2D array with one keypoint per row; the first two columns
+            are used as x/y and the last column as the confidence.
+        rescale: factor applied to the width and height of the tight box.
+        detection_thresh: keypoints whose confidence is not above this value
+            are ignored.
+        MIN_PIXEL: minimum width and height of the tight box.
+
+    Returns:
+        A list ``[x_min, y_min, x_max, y_max, conf]`` where ``conf`` is the
+        mean confidence of the keypoints that were used. The placeholder
+        ``[0, 0, 100, 100, 0]`` is returned when fewer than three keypoints
+        are confident or the tight box is narrower or shorter than
+        ``MIN_PIXEL``.
+    """
+    valid = keypoints[:, -1] > detection_thresh
+    if valid.sum() < 3:
+        return [0, 0, 100, 100, 0]
+    valid_keypoints = keypoints[valid][:, :-1]
+    lower = valid_keypoints.min(axis=0)
+    upper = valid_keypoints.max(axis=0)
+    center = (upper + lower) / 2
+    bbox_size = upper - lower
+    # Reject degenerate boxes before padding them.
+    if bbox_size[0] < MIN_PIXEL or bbox_size[1] < MIN_PIXEL:
+        return [0, 0, 100, 100, 0]
+    half = bbox_size * rescale / 2
+    return [
+        center[0] - half[0],
+        center[1] - half[1],
+        center[0] + half[0],
+        center[1] + half[1],
+        # Confidence lives in the last column, the one used to build `valid`.
+        keypoints[valid, -1].mean(),
+    ]
+
+
+class MediaPipe:
+    """Callable wrapper around the MediaPipe HandLandmarker.
+
+    A separate detector is created for every image sub-directory seen by
+    ``__call__`` and only the first detected hand of each image is returned.
+    """
+    NUM_HAND = 21
+    # Timestamp increment per call in milliseconds; assumes roughly 30 fps.
+    FRAME_INTERVAL_MS = 33
+    MODEL_URL = 'https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task'
+
+    def create_detector(self):
+        """Build a HandLandmarker in VIDEO mode from ``self.ckpt``."""
+        base_options = python.BaseOptions(model_asset_path=self.ckpt)
+        options = vision.HandLandmarkerOptions(
+            base_options=base_options,
+            num_hands=2,
+            running_mode=VisionRunningMode.VIDEO)
+        return vision.HandLandmarker.create_from_options(options)
+
+    def __init__(self, ckpt) -> None:
+        """Store the model path, downloading the model first if it is missing.
+
+        Args:
+            ckpt: path of the hand-landmarker ``.task`` model file.
+        """
+        if not os.path.exists(ckpt):
+            print('Cannot find {}, try to download it'.format(ckpt))
+            self._download_model(ckpt)
+        self.ckpt = ckpt
+        # Detectors are created lazily in __call__, keyed by sub-directory name.
+        self.detector = {}
+        # Timestamp in milliseconds passed to detect_for_video.
+        self.timestamp = 0
+
+    @classmethod
+    def _download_model(cls, ckpt):
+        """Download the model to exactly ``ckpt``.
+
+        The previous ``wget`` + ``mv`` shell commands broke on paths with
+        spaces, failed for a ``ckpt`` without a directory part and stored the
+        file under its upstream name even when ``ckpt`` used another one.
+        """
+        print(cls.MODEL_URL)
+        ckpt_dir = os.path.dirname(ckpt)
+        # dirname is '' for a bare filename and os.makedirs('') raises.
+        if ckpt_dir:
+            os.makedirs(ckpt_dir, exist_ok=True)
+        # Write to a temporary name so an interrupted transfer never leaves a
+        # truncated file that would pass the os.path.exists check next time.
+        tmpname = ckpt + '.part'
+        try:
+            with urllib.request.urlopen(cls.MODEL_URL) as response, \
+                    open(tmpname, 'wb') as fout:
+                shutil.copyfileobj(response, fout)
+            os.replace(tmpname, ckpt)
+        finally:
+            if os.path.exists(tmpname):
+                os.remove(tmpname)
+
+    @staticmethod
+    def to_array(pose, W, H):
+        """Convert normalized landmarks into pixel keypoints.
+
+        Args:
+            pose: sequence of hands, each a sequence of landmarks exposing
+                ``x``, ``y`` and ``visibility``.
+            W: image width in pixels, used to scale ``x``.
+            H: image height in pixels, used to scale ``y``.
+
+        Returns:
+            Array of shape ``(len(pose), 21, 3)`` holding x, y and visibility,
+            or an all-zero ``(1, 21, 3)`` array when ``pose`` is empty.
+        """
+        num_joints = MediaPipe.NUM_HAND
+        if len(pose) == 0:
+            return np.zeros((1, num_joints, 3))
+        res = np.zeros((len(pose), num_joints, 3))
+        for nper, hand in enumerate(pose):
+            for i, landmark in enumerate(hand):
+                res[nper, i, 0] = landmark.x * W
+                res[nper, i, 1] = landmark.y * H
+                res[nper, i, 2] = landmark.visibility
+        # __call__ runs the detector on a horizontally flipped image, so mirror
+        # x back into the coordinates of the original image.
+        res[..., 0] = W - res[..., 0] - 1
+        return res
+
+    def get_hand(self, pose, W, H):
+        """Return pixel keypoints with confidence 1, or zeros if no hand.
+
+        Args:
+            pose: detected hands as accepted by ``to_array``; may be ``None``
+                or empty.
+            W: image width in pixels.
+            H: image height in pixels.
+
+        Returns:
+            Array of shape ``(N, 21, 3)``; all zeros with ``N == 1`` when
+            nothing was detected.
+        """
+        # An empty result must count as a miss too: to_array maps it to zeros,
+        # and stamping confidence 1 on them would report a hand at the image
+        # origin with full confidence.
+        if pose is None or len(pose) == 0:
+            return np.zeros((1, self.NUM_HAND, 3))
+        poses = self.to_array(pose, W, H)
+        # The visibility column is replaced by a constant confidence.
+        poses[..., 2] = 1.
+        return poses
+
+    def __call__(self, imgnames, images):
+        """Detect the first hand in every view.
+
+        Args:
+            imgnames: image path, or list of paths with one entry per view;
+                the parent directory name selects the detector.
+            images: BGR image or image path, or a list of them matching
+                ``imgnames``. Paths inside a list are replaced in place by
+                the loaded images.
+
+        Returns:
+            Dict with ``'keypoints'`` of shape ``(nViews, 21, 3)`` and
+            ``'bbox'`` of shape ``(nViews, 5)``; the view axis is dropped
+            when a single, non-list input was given.
+
+        Raises:
+            FileNotFoundError: if an image path cannot be read.
+        """
+        squeeze = not isinstance(imgnames, list)
+        if squeeze:
+            imgnames = [imgnames]
+            images = [images]
+        keypoints = []
+        bboxes = []
+        for nv, image in enumerate(images):
+            if isinstance(image, str):
+                path = image
+                image = cv2.imread(path)
+                # cv2.imread returns None instead of raising on failure.
+                if image is None:
+                    raise FileNotFoundError('Cannot read image {}'.format(path))
+                images[nv] = image
+            sub = os.path.basename(os.path.dirname(imgnames[nv]))
+            if sub not in self.detector:
+                self.detector[sub] = self.create_detector()
+            image_ = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+            image_height, image_width, _ = image_.shape
+            # Detection runs on the mirrored image; to_array mirrors x back.
+            image_ = cv2.flip(image_, 1)
+            mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image_)
+            detection_result = self.detector[sub].detect_for_video(mp_image, self.timestamp)
+            handl2d = self.get_hand(detection_result.hand_landmarks, image_width, image_height)
+            # Keep only the first detected hand of each view.
+            keypoints.append(handl2d[:1])
+            bboxes.append(bbox_from_keypoints(handl2d[0]))
+
+        keypoints = np.vstack(keypoints)
+        bboxes = np.stack(bboxes)
+        if squeeze:
+            keypoints = keypoints[0]
+            bboxes = bboxes[0]
+        # All detectors share one timestamp, advanced once per call.
+        self.timestamp += self.FRAME_INTERVAL_MS
+        return {
+            'keypoints': keypoints,
+            'bbox': bboxes,
+        }
\ No newline at end of file