🚀 update to v0.3

This commit is contained in:
shuaiqing 2023-06-19 16:39:27 +08:00
parent b44fa3c90b
commit e7800a1356
39 changed files with 6218 additions and 5 deletions

6
.gitignore vendored
View File

@@ -112,5 +112,7 @@ data/**
 .DS*
 code_deprecate
 code
-neuralbody
-lightning_logs
+# neuralbody
+lightning_logs
+models
+yolov5m.pt

96
apps/mocap/run.py Normal file
View File

@@ -0,0 +1,96 @@
# This script provides the basic entry point for running mocap
import os
from easymocap.config import Config, load_object
from tqdm import tqdm
def process(dataset, model):
ret_all = []
print('[Run] dataset has {} samples'.format(len(dataset)))
for i in tqdm(range(len(dataset)), desc='[Run]'):
data = dataset[i]
ret = model.at_step(data, i)
ret_all.append(ret)
ret_all = model.at_final(ret_all)
def update_data_by_args(cfg_data, args):
if args.root is not None:
cfg_data.args.root = args.root
if args.subs is not None:
cfg_data.args.subs = args.subs
if args.subs_vis is not None:
cfg_data.args.subs_vis = args.subs_vis
if args.ranges is not None:
cfg_data.args.ranges = args.ranges
if args.cameras is not None:
cfg_data.args.reader.cameras.root = args.cameras
if args.skip_vis:
cfg_data.args.subs_vis = []
return cfg_data
def update_exp_by_args(cfg_exp, args):
opts_alias = []
if 'alias' in cfg_exp.keys():
for i in range(len(args.opt_exp)//2):
if args.opt_exp[i*2] in cfg_exp.alias.keys():
opts_alias.append(cfg_exp.alias[args.opt_exp[i*2]])
opts_alias.append(args.opt_exp[i*2+1])
cfg_exp.merge_from_list(opts_alias)
if args.skip_vis:
for key, val in cfg_exp.args.at_step.items():
if key.startswith('vis'):
val.skip = True
def load_cfg_from_file(cfg, args):
cfg = Config.load(cfg)
cfg_data = Config.load(cfg.data)
cfg_data.args.merge_from_other_cfg(cfg.data_opts)
cfg_data = update_data_by_args(cfg_data, args)
cfg_exp = Config.load(cfg.exp)
cfg_exp.args.merge_from_other_cfg(cfg.exp_opts)
update_exp_by_args(cfg_exp, args)
return cfg_data, cfg_exp
def load_cfg_from_cmd(args):
cfg_data = Config.load(args.data, args.opt_data)
cfg_data = update_data_by_args(cfg_data, args)
cfg_exp = Config.load(args.exp, args.opt_exp)
update_exp_by_args(cfg_exp, args)
return cfg_data, cfg_exp
def main_entrypoint():
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--cfg', type=str, default=None)
for name in ['data', 'exp']:
parser.add_argument('--{}'.format(name), type=str, required=False)
parser.add_argument('--opt_{}'.format(name), type=str, nargs='+', default=[])
parser.add_argument('--root', type=str, default=None)
parser.add_argument('--subs', type=str, default=None, nargs='+')
parser.add_argument('--subs_vis', type=str, default=None, nargs='+')
parser.add_argument('--ranges', type=int, default=None, nargs=3)
parser.add_argument('--cameras', type=str, default=None, help='Camera file path')
parser.add_argument('--out', type=str, default=None)
parser.add_argument('--skip_vis', action='store_true')
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()
if args.cfg is not None:
cfg_data, cfg_exp = load_cfg_from_file(args.cfg, args)
else:
cfg_data, cfg_exp = load_cfg_from_cmd(args)
if args.out is not None:
cfg_exp.args.output = args.out
out = cfg_exp.args.output
os.makedirs(out, exist_ok=True)
print(cfg_data, file=open(os.path.join(out, 'cfg_data.yml'), 'w'))
print(cfg_exp, file=open(os.path.join(out, 'cfg_exp.yml'), 'w'))
dataset = load_object(cfg_data.module, cfg_data.args)
print(dataset)
model = load_object(cfg_exp.module, cfg_exp.args)
process(dataset, model)
if __name__ == '__main__':
main_entrypoint()
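
The runner above hinges on load_object, which builds a dataset or pipeline object from a `module` path plus YAML `args`. A minimal sketch of the assumed behavior (the helper name load_object_sketch is illustrative, not part of the repo):

import importlib

def load_object_sketch(module_path, args):
    # split 'myeasymocap.stages.basestage.MultiStage' -> package path + class name
    pkg_name, cls_name = module_path.rsplit('.', 1)
    cls = getattr(importlib.import_module(pkg_name), cls_name)
    return cls(**args)  # construct with the config-provided arguments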

139
config/1v1p/fixhand.yml Normal file
View File

@@ -0,0 +1,139 @@
smooth: &smooth_keypoints
weight: 50.
module: myeasymocap.operations.loss.Smooth
key_from_output: [keypoints, Th]
key_from_infos: [] # TODO: compute the smoothing weights from the 2D confidences
args:
keys: [Th, keypoints]
smooth_type: [Linear, Linear] # the Depth type seems to need camera parameters for the conversion
norm: [l2, l2]
order: [2, 2]
weights: [1000., 1000.]
window_weight: [0.5, 0.3, 0.1, 0.1]
module: myeasymocap.stages.basestage.MultiStage
args:
output: output/sv1p_hand_fix
at_step:
detect_by_mediapipe:
module: myeasymocap.backbone.mediapipe.hand.MediaPipe
key_from_data: [images, imgnames]
args:
ckpt: models/mediapipe/hand_landmarker.task
hand2d:
module: myeasymocap.backbone.hand2d.hand2d.MyHand2D
key_from_data: [images, imgnames]
key_from_previous: [bbox]
args:
# ckpt: /nas/public/EasyMocapModels/hrnetv2_w18_coco_wholebody_hand_256x256-1c028db7_20210908.pth
ckpt: /nas/public/EasyMocapModels/hand/resnet_kp2d_clean.pt
mode: resnet
vis2d:
module: myeasymocap.io.vis.Vis2D
skip: False
key_from_data: [images]
key_from_previous: [keypoints, bbox]
args:
name: vis_keypoints2d
scale: 0.5
infer_mano: #
module: myeasymocap.backbone.hmr.hmr.MyHMR
key_from_data: [images, imgnames]
key_from_previous: [bbox]
key_keep: [meta, cameras, imgnames] # carry these through to the final output
args:
ckpt: models/manol_pca45_noflat.ckpt
# TODO: add visualize for Init MANO
at_final:
load_hand_model: # load the hand model
module: myeasymocap.io.model.MANOLoader
args:
cfg_path: config/model/mano.yml
model_path: models/manov1.2/MANO_LEFT.pkl #models/handmesh/data/MANO_RIGHT.pkl # load mano model
regressor_path: models/manov1.2/J_regressor_mano_LEFT.txt #models/handmesh/data/J_regressor_mano_RIGHT.txt
num_pca_comps: 45
use_pca: True
use_flat_mean: False
# this module returns two things, body_model and model; body_model is used for visualization
mean_param: # initialize the pose: average both poses and shapes over all frames
module: myeasymocap.operations.init.MeanShapes
key_from_data: [params]
args:
keys: ['poses', 'shapes']
init_T: # initialize the per-frame translation
module: myeasymocap.operations.optimizer.Optimizer
key_from_data: [keypoints, cameras, params]
key_from_previous: [model]
args:
optimizer_args: {optim_type: lbfgs}
optimize_keys: [Th]
loss:
repro:
weight: 100.
module: myeasymocap.operations.loss.Keypoints2D
key_from_output: [keypoints]
key_from_infos: [keypoints, cameras]
args:
norm: l2
smooth: *smooth_keypoints
init_R: # initialize the per-frame rotation
module: myeasymocap.operations.optimizer.Optimizer
key_from_data: [keypoints, cameras]
key_from_previous: [model, params]
args:
optimizer_args: {optim_type: lbfgs}
optimize_keys: [Rh]
loss:
repro:
weight: 100.
module: myeasymocap.operations.loss.Keypoints2D
key_from_output: [keypoints]
key_from_infos: [keypoints, cameras]
args:
norm: l2
smooth: *smooth_keypoints
refine_poses: # optimize the poses
repeat: 2
module: myeasymocap.operations.optimizer.Optimizer
key_from_data: [keypoints, cameras]
key_from_previous: [model, params]
args:
optimizer_args: {optim_type: lbfgs}
optimize_keys: [poses, shapes, Rh, Th]
loss:
repro:
weight: 100.
module: myeasymocap.operations.loss.Keypoints2D
key_from_output: [keypoints]
key_from_infos: [keypoints, cameras]
args:
norm: l1
reg:
weight: 0.001
module: myeasymocap.operations.loss.RegLoss
key_from_output: [poses]
key_from_infos: []
args:
key: poses
norm: l2
smooth: *smooth_keypoints
write:
module: myeasymocap.io.write.WriteSMPL
key_from_data: [meta]
key_from_previous: [params, model]
args:
name: smpl
render:
module: myeasymocap.io.vis3d.Render_multiview
key_from_data: [cameras, imgnames]
key_from_previous: [hand_model, params]
args:
model_name: hand_model
backend: pyrender
view_list: [0]
scale: 0.5
make_video:
module: myeasymocap.io.video.MakeVideo
args:
fps: 50
keep_image: False
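
Every named block under at_step and at_final follows the same contract: inputs come from the current dataset sample (key_from_data) or from earlier stages' outputs (key_from_previous), and the dict a stage returns is merged into the running state. A hedged sketch of the at_step dispatch this config implies, treating each configured block as an object exposing those key lists (all names illustrative):

def run_at_step_sketch(stages, sample):
    state = {}
    for name, stage in stages.items():
        inputs = {k: sample[k] for k in stage.key_from_data}
        inputs.update({k: state[k] for k in stage.key_from_previous})
        out = stage(**inputs) or {}
        state.update(out)  # later stages can consume these outputs
    return state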

View File

@@ -0,0 +1,139 @@
smooth: &smooth_keypoints
weight: 50.
module: myeasymocap.operations.loss.Smooth
key_from_output: [keypoints, Th]
key_from_infos: [] # TODO: compute the smoothing weights from the 2D confidences
args:
keys: [Th, keypoints]
smooth_type: [Linear, Linear] # the Depth type seems to need camera parameters for the conversion
norm: [l2, l2]
order: [2, 2]
weights: [1000., 1000.]
window_weight: [0.5, 0.3, 0.1, 0.1]
module: myeasymocap.stages.basestage.MultiStage
args:
output: output/sv1p_hand
at_step:
detect_by_mediapipe:
module: myeasymocap.backbone.mediapipe.hand.MediaPipe
key_from_data: [images, imgnames]
args:
ckpt: models/mediapipe/hand_landmarker.task
hand2d:
module: myeasymocap.backbone.hand2d.hand2d.MyHand2D
key_from_data: [images, imgnames]
key_from_previous: [bbox]
args:
# ckpt: /nas/public/EasyMocapModels/hrnetv2_w18_coco_wholebody_hand_256x256-1c028db7_20210908.pth
ckpt: /nas/public/EasyMocapModels/hand/resnet_kp2d_clean.pt
mode: resnet
vis2d:
module: myeasymocap.io.vis.Vis2D
skip: False
key_from_data: [images]
key_from_previous: [keypoints, bbox]
args:
name: vis_keypoints2d
scale: 0.5
infer_mano: #
module: myeasymocap.backbone.hmr.hmr.MyHMR
key_from_data: [images, imgnames]
key_from_previous: [bbox]
key_keep: [meta, cameras, imgnames] # carry these through to the final output
args:
ckpt: models/manol_pca45_noflat.ckpt
# TODO: add visualize for Init MANO
at_final:
load_hand_model: # load the hand model
module: myeasymocap.io.model.MANOLoader
args:
cfg_path: config/model/mano.yml
model_path: models/manov1.2/MANO_LEFT.pkl #models/handmesh/data/MANO_RIGHT.pkl # load mano model
regressor_path: models/manov1.2/J_regressor_mano_LEFT.txt #models/handmesh/data/J_regressor_mano_RIGHT.txt
num_pca_comps: 45
use_pca: True
use_flat_mean: False
# this module returns two things, body_model and model; body_model is used for visualization
mean_param: # initialization: average the shapes over all frames
module: myeasymocap.operations.init.MeanShapes
key_from_data: [params]
args:
keys: ['shapes']
init_T: # initialize the per-frame translation
module: myeasymocap.operations.optimizer.Optimizer
key_from_data: [keypoints, cameras, params]
key_from_previous: [model]
args:
optimizer_args: {optim_type: lbfgs}
optimize_keys: [Th]
loss:
repro:
weight: 100.
module: myeasymocap.operations.loss.Keypoints2D
key_from_output: [keypoints]
key_from_infos: [keypoints, cameras]
args:
norm: l2
smooth: *smooth_keypoints
init_R: # initialize the per-frame rotation
module: myeasymocap.operations.optimizer.Optimizer
key_from_data: [keypoints, cameras]
key_from_previous: [model, params]
args:
optimizer_args: {optim_type: lbfgs}
optimize_keys: [Rh]
loss:
repro:
weight: 100.
module: myeasymocap.operations.loss.Keypoints2D
key_from_output: [keypoints]
key_from_infos: [keypoints, cameras]
args:
norm: l2
smooth: *smooth_keypoints
refine_poses: # optimize the poses
repeat: 2
module: myeasymocap.operations.optimizer.Optimizer
key_from_data: [keypoints, cameras]
key_from_previous: [model, params]
args:
optimizer_args: {optim_type: lbfgs}
optimize_keys: [poses, shapes, Rh, Th]
loss:
repro:
weight: 100.
module: myeasymocap.operations.loss.Keypoints2D
key_from_output: [keypoints]
key_from_infos: [keypoints, cameras]
args:
norm: l1
reg:
weight: 0.001
module: myeasymocap.operations.loss.RegLoss
key_from_output: [poses]
key_from_infos: []
args:
key: poses
norm: l2
smooth: *smooth_keypoints
write:
module: myeasymocap.io.write.WriteSMPL
key_from_data: [meta]
key_from_previous: [params, model]
args:
name: smpl
render:
module: myeasymocap.io.vis3d.Render_multiview
key_from_data: [cameras, imgnames]
key_from_previous: [hand_model, params]
args:
model_name: hand_model
backend: pyrender
view_list: [0]
scale: 0.5
make_video:
module: myeasymocap.io.video.MakeVideo
args:
fps: 50
keep_image: False

View File

@@ -0,0 +1,147 @@
module: myeasymocap.stages.basestage.MultiStage
args:
output: output/sv1p # output directory
at_step:
detect:
module: myeasymocap.backbone.yolo.yolo.YoloWithTrack
key_from_data: [images, imgnames]
args:
model: yolov5m
name: person
keypoints2d:
module: myeasymocap.backbone.hrnet.myhrnet.MyHRNet
key_from_data: [images, imgnames]
key_from_previous: [bbox]
key_keep: []
args:
ckpt: /nas/home/shuaiqing/Code/EasyMocapPublic/data/models/pose_hrnet_w48_384x288.pth
vis2d:
module: myeasymocap.io.vis.Vis2D
skip: False
key_from_data: [images]
key_from_previous: [keypoints, bbox]
args:
name: vis_keypoints2d
scale: 0.5
infer: # given the image and the detected bbox, this module directly returns the body pose in the crop coordinate frame
module: myeasymocap.backbone.pare.pare.MyPARE
key_from_data: [images, imgnames] # bbox, images and image names read from the dataset; the image names are used for saving results
key_from_previous: [bbox]
key_keep: [cameras, imgnames] # carry these through to the final output
args:
ckpt: 3dpw # use the checkpoint pretrained on 3DPW
at_final:
load_body_model: # load the SMPL model
module: myeasymocap.io.model.SMPLLoader
args:
model_path: models/pare/data/body_models/smpl/SMPL_NEUTRAL.pkl
regressor_path: models/J_regressor_body25.npy
init_translation: # given the crop-frame pose, 2D keypoints and camera parameters, return the body pose in world coordinates
module: myeasymocap.operations.init.InitTranslation
key_from_data: [keypoints, cameras, params] # read the keypoints, camera parameters and SMPL parameters
key_from_previous: [body_model] # the SMPL model is used to compute keypoints
args:
solve_T: True
solve_R: False
smooth: # smooth the initialization results
module: myeasymocap.operations.smooth.SmoothPoses
key_from_data: [params]
args:
window_size: 2
mean_param: # Mean shapes
module: myeasymocap.operations.init.MeanShapes
key_from_data: [params]
args:
keys: ['shapes']
init_RT:
module: myeasymocap.operations.optimizer.Optimizer
key_from_data: [keypoints, cameras]
key_from_previous: [model, params]
args:
optimizer_args: {optim_type: lbfgs}
optimize_keys: [Th, Rh]
loss:
repro:
weight: 100.
module: myeasymocap.operations.loss.Keypoints2D
key_from_output: [keypoints]
key_from_infos: [keypoints, cameras]
args:
norm: l2
index_est: [2, 5, 9, 12]
index_gt: [2, 5, 9, 12]
smooth:
weight: 1.
module: myeasymocap.operations.loss.Smooth
key_from_output: [Rh, Th]
key_from_infos: [cameras] # TODO: compute the smoothing weights from the 2D confidences
args:
keys: [Th, Th]
smooth_type: [Linear, Depth] # the Depth type seems to need camera parameters for the conversion
norm: [l2, l2]
order: [2, 2]
weights: [100., 1000.]
window_weight: [0.5, 0.3, 0.1, 0.1]
refine_poses:
repeat: 2
module: myeasymocap.operations.optimizer.Optimizer
key_from_data: [keypoints, cameras]
key_from_previous: [model, params]
args:
optimizer_args: {optim_type: lbfgs}
optimize_keys: [poses, Rh, Th]
loss:
repro:
weight: 100.
module: myeasymocap.operations.loss.Keypoints2D
key_from_output: [keypoints]
key_from_infos: [keypoints, cameras]
args:
norm: gm
norm_info: 0.02
smooth:
weight: 1.
module: myeasymocap.operations.loss.Smooth
key_from_output: [poses, Rh, Th, keypoints]
key_from_infos: [cameras] # TODO: compute the smoothing weights from the 2D confidences
args:
keys: [Th, Th, poses, keypoints]
smooth_type: [Linear, Depth, Linear, Linear] # the Depth type seems to need camera parameters for the conversion
norm: [l2, l2, l2, l2]
order: [2, 2, 2, 2]
weights: [100., 1000., 50., 100.]
window_weight: [0.5, 0.3, 0.1, 0.1]
init:
weight: 1.
module: myeasymocap.operations.loss.Init
key_from_output: [poses]
key_from_infos: [init_poses]
args:
keys: [poses]
norm: l2
weights: [1.]
prior:
weight: 0.1
module: easymocap.multistage.gmm.GMMPrior
key_from_output: [poses]
key_from_infos: []
args:
start: 0
end: 69
write:
module: myeasymocap.io.write.WriteSMPL
key_from_data: [meta]
key_from_previous: [params, model]
args:
name: smpl
render:
module: myeasymocap.io.vis3d.Render
key_from_data: [cameras, imgnames]
key_from_previous: [params, body_model]
args:
backend: pyrender
make_video:
module: myeasymocap.io.video.MakeVideo
args:
fps: 30
keep_image: False
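
init_RT and refine_poses both drive myeasymocap.operations.optimizer.Optimizer with optim_type: lbfgs, minimizing a weighted sum of the configured losses over the listed optimize_keys. A minimal sketch of that loop, assuming params holds torch tensors and each loss entry is a weight plus a callable (all names illustrative, not the repo's API):

import torch

def lbfgs_stage_sketch(params, optimize_keys, losses, max_iter=20):
    leaves = [params[k].requires_grad_(True) for k in optimize_keys]
    optim = torch.optim.LBFGS(leaves, max_iter=max_iter)
    def closure():
        optim.zero_grad()
        total = sum(cfg['weight'] * cfg['fn'](params) for cfg in losses.values())
        total.backward()
        return total
    optim.step(closure)
    return params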

View File

@@ -0,0 +1,16 @@
module: myeasymocap.datasets.mv1p.MVDataset
args:
root: TO_BE_FILLED
subs: [] # views to use; empty means all views
subs_vis: ['01'] # visualized views
ranges: [0, 10000, 1]
read_image: True
reader:
images:
root: images
ext: .jpg
image_shape:
root: images
ext: .jpg
cameras:
root: ''

View File

@@ -0,0 +1,13 @@
module: myeasymocap.datasets.sv1p.SVDataset
args:
root: TO_BE_FILLED
subs: ['video'] # one of the folders under the root path
ranges: [0, 10000, 1] # range of frames to use
read_image: True # images are read because a CNN is used later for SMPL parameter estimation
reader:
images:
root: images
ext: .jpg
image_shape:
root: images
ext: .jpg
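
In both dataset configs, ranges follows Python range semantics, [start, stop, step]; the dataset base class (ImageDataBase.check_frames_length, later in this commit) clips the stop value to the real sequence length before expanding it:

ranges = [0, 10000, 1]         # [start, stop, step] from the config
frames = list(range(*ranges))  # after stop is clipped to the actual frame count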

View File

@@ -0,0 +1,50 @@
module: myeasymocap.stages.basestage.MultiStage
args:
output: output/detect_hand_triangulate
at_step:
detect:
module: myeasymocap.backbone.mediapipe.hand.MediaPipe
key_from_data: [images, imgnames]
args:
ckpt: models/mediapipe/hand_landmarker.task
vis2d:
module: myeasymocap.io.vis.Vis2D
skip: False
key_from_data: [images]
key_from_previous: [keypoints]
args:
name: vis_keypoints2d
scale: 0.5
triangulate:
module: myeasymocap.operations.triangulate.SimpleTriangulate
key_from_data: [cameras]
key_from_previous: [keypoints]
key_keep: [cameras] # used for the final joint optimization
args:
mode: iterative # [naive, iterative]
visualize:
module: myeasymocap.io.vis.Vis3D
key_from_data: [images, cameras]
key_from_previous: [keypoints3d] # used for the final joint optimization
args:
scale: 1.
mode: crop
mode_args:
- [0, 720, 100, 820]
- [0, 720, 100, 820]
- [0, 720, 400, 1120]
at_final:
smooth:
module: myeasymocap.operations.smooth.Smooth
key_from_data: [keypoints3d]
args:
window_size: 5
write:
module: myeasymocap.io.write.Write
key_from_data: [keypoints3d]
args: {}
make_video:
module: myeasymocap.io.video.MakeVideo
args:
fps: 60
keep_image: False
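
SimpleTriangulate's naive mode is presumably a confidence-weighted DLT per joint, with the iterative mode re-weighting views by reprojection error. A minimal DLT sketch under that assumption (not the repo's actual implementation):

import numpy as np

def triangulate_point_sketch(Ps, points2d):
    # Ps: list of (3, 4) projection matrices; points2d: (nViews, 3) rows of (u, v, conf)
    A = []
    for P, (u, v, conf) in zip(Ps, points2d):
        A.append(conf * (u * P[2] - P[0]))
        A.append(conf * (v * P[2] - P[1]))
    X = np.linalg.svd(np.asarray(A))[2][-1]  # right singular vector of the smallest singular value
    return X[:3] / X[3]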

View File

@@ -0,0 +1,166 @@
smooth: &smooth_keypoints
weight: 1.
module: myeasymocap.operations.loss.Smooth
key_from_output: [keypoints, poses]
key_from_infos: [] # TODO: compute the smoothing weights from the 2D confidences
args:
keys: [poses, keypoints]
smooth_type: [Linear, Linear] # the Depth type seems to need camera parameters for the conversion
norm: [l2, l2]
order: [2, 2]
weights: [10., 1000.]
window_weight: [0.5, 0.3, 0.1, 0.1]
k3dtorso: &k3dtorso
weight: 100.
module: myeasymocap.operations.loss.Keypoints3D
key_from_output: [keypoints]
key_from_infos: [keypoints3d]
args:
norm: l2
index_est: [0, 5, 9, 13, 17]
index_gt: [0, 5, 9, 13, 17]
module: myeasymocap.stages.basestage.MultiStage
args:
output: output/detect_hand_triangulate_fitMANO
at_step:
detect:
module: myeasymocap.backbone.mediapipe.hand.MediaPipe
key_from_data: [images, imgnames]
key_keep: [imgnames]
args:
ckpt: models/mediapipe/hand_landmarker.task
vis2d:
module: myeasymocap.io.vis.Vis2D
skip: False
key_from_data: [images]
key_from_previous: [keypoints]
args:
name: vis_keypoints2d
scale: 0.5
triangulate:
module: myeasymocap.operations.triangulate.SimpleTriangulate
key_from_data: [cameras]
key_from_previous: [keypoints]
key_keep: [cameras] # used for the final joint optimization
args:
mode: iterative # [naive, iterative]
visualize:
module: myeasymocap.io.vis.Vis3D
key_from_data: [images, cameras]
key_from_previous: [keypoints3d] # used for the final joint optimization
args:
scale: 0.5
mode: center
at_final:
load_hand_model: # load the hand model
module: myeasymocap.io.model.MANOLoader
args:
cfg_path: config/model/manol.yml
model_path: models/manov1.2/MANO_LEFT.pkl #models/handmesh/data/MANO_RIGHT.pkl # load mano model
regressor_path: models/manov1.2/J_regressor_mano_LEFT.txt #models/handmesh/data/J_regressor_mano_RIGHT.txt
num_pca_comps: 45
use_pca: True
use_flat_mean: False
init_params:
module: myeasymocap.operations.init.InitParams
key_from_data: [keypoints3d]
args:
num_poses: 45
num_shapes: 10
fitShape:
module: myeasymocap.operations.optimizer.Optimizer
key_from_data: [keypoints3d]
key_from_previous: [model, params]
args:
optimizer_args: {optim_type: lbfgs}
optimize_keys: [shapes]
loss:
k3d:
weight: 10000.
module: myeasymocap.operations.loss.LimbLength
key_from_output: [keypoints]
key_from_infos: [keypoints3d]
args:
kintree: [[ 1, 0], [ 2, 1], [ 3, 2], [ 4, 3], [ 5, 0], [ 6, 5], [ 7, 6], [ 8, 7], [ 9, 0], [10, 9], [11, 10], [12, 11], [13, 0], [14, 13], [15, 14], [16, 15], [17, 0], [18, 17], [19, 18], [20, 19]]
regshape:
weight: 0.1
module: myeasymocap.operations.loss.RegLoss
key_from_output: [shapes]
key_from_infos: []
args:
key: shapes
norm: l2
init_T:
module: myeasymocap.operations.optimizer.Optimizer
key_from_data: [keypoints3d]
key_from_previous: [model, params]
args:
optimizer_args: {optim_type: lbfgs}
optimize_keys: [Th]
loss:
k3d: *k3dtorso
smooth: *smooth_keypoints
init_R:
module: myeasymocap.operations.optimizer.Optimizer
key_from_data: [keypoints3d]
key_from_previous: [model, params]
args:
optimizer_args: {optim_type: lbfgs}
optimize_keys: [Rh]
loss:
k3d: *k3dtorso
smooth: *smooth_keypoints
refine_poses:
repeat: 2
module: myeasymocap.operations.optimizer.Optimizer
key_from_data: [keypoints3d]
key_from_previous: [model, params]
args:
optimizer_args: {optim_type: lbfgs}
optimize_keys: [poses, Rh, Th]
loss:
k3d:
weight: 1000000.
module: myeasymocap.operations.loss.Keypoints3D
key_from_output: [keypoints]
key_from_infos: [keypoints3d]
args:
norm: l2
norm_info: 0.02
smooth: *smooth_keypoints
regpose:
weight: 0.1
module: myeasymocap.operations.loss.RegLoss
key_from_output: [poses]
key_from_infos: []
args:
key: poses
norm: l2
write:
module: myeasymocap.io.write.WriteSMPL
key_from_data: [meta]
key_from_previous: [params, model]
args:
name: smpl
render:
module: myeasymocap.io.vis3d.Render_multiview
key_from_data: [cameras, imgnames]
key_from_previous: [params, hand_model]
args:
model_name: hand_model
backend: pyrender
view_list: [1, 0, 2]
scale: 1.
render_mode: image
mode: crop
mode_args:
- [0, 720, 100, 820]
- [0, 720, 100, 820]
- [0, 720, 400, 1120]
make_video:
module: myeasymocap.io.video.MakeVideo
args:
fps: 60
keep_image: False
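
The fitShape stage matches bone lengths rather than joint positions, so it can use noisy triangulated joints while staying invariant to global rotation and translation. A sketch of the LimbLength idea over the kintree pairs above (assumed form, illustrative names):

import numpy as np

def limb_length_sketch(kpts_est, kpts_gt, kintree):
    edges = np.asarray(kintree)
    len_est = np.linalg.norm(kpts_est[edges[:, 0], :3] - kpts_est[edges[:, 1], :3], axis=-1)
    len_gt = np.linalg.norm(kpts_gt[edges[:, 0], :3] - kpts_gt[edges[:, 1], :3], axis=-1)
    return float(((len_est - len_gt) ** 2).mean())  # squared bone-length residual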

View File

@@ -0,0 +1,54 @@
module: myeasymocap.stages.basestage.MultiStage
args:
output: output/detect_triangulate
at_step:
detect:
module: myeasymocap.backbone.yolo.yolo.BaseYOLOv5
key_from_data: [images, imgnames]
args:
model: yolov5m
name: person
keypoints2d:
module: myeasymocap.backbone.hrnet.myhrnet.MyHRNet
key_from_data: [images, imgnames]
key_from_previous: [bbox]
key_keep: []
args:
ckpt: data/models/pose_hrnet_w48_384x288.pth
vis2d:
module: myeasymocap.io.vis.Vis2D
skip: False
key_from_data: [images]
key_from_previous: [keypoints, bbox]
args:
name: vis_keypoints2d
scale: 0.5
triangulate:
module: myeasymocap.operations.triangulate.SimpleTriangulate
key_from_data: [cameras]
key_from_previous: [keypoints]
key_keep: [cameras, imgnames]
args:
mode: iterative # [naive, iterative]
visualize:
module: myeasymocap.io.vis.Vis3D
key_from_data: [images, cameras]
key_from_previous: [keypoints3d] # used for the final joint optimization
args:
scale: 0.5
mode: center
at_final:
smooth:
module: myeasymocap.operations.smooth.Smooth
key_from_data: [keypoints3d]
args:
window_size: 5
write:
module: myeasymocap.io.write.Write
key_from_data: [keypoints3d]
args: {}
make_video:
module: myeasymocap.io.video.MakeVideo
args:
fps: 50
keep_image: False

View File

@@ -0,0 +1,169 @@
module: myeasymocap.stages.basestage.MultiStage
args:
output: output/detect_triangulate_fitSMPL
at_step:
detect:
module: myeasymocap.backbone.yolo.yolo.BaseYOLOv5
key_from_data: [images, imgnames]
args:
model: yolov5m
name: person
keypoints2d:
module: myeasymocap.backbone.hrnet.myhrnet.MyHRNet
key_from_data: [images, imgnames]
key_from_previous: [bbox]
key_keep: []
args:
ckpt: data/models/pose_hrnet_w48_384x288.pth
vis2d:
module: myeasymocap.io.vis.Vis2D
skip: False
key_from_data: [images]
key_from_previous: [keypoints, bbox]
args:
name: vis_keypoints2d
scale: 0.5
triangulate:
module: myeasymocap.operations.triangulate.SimpleTriangulate
key_from_data: [cameras]
key_from_previous: [keypoints]
key_keep: [cameras, imgnames] # used for the final joint optimization
args:
mode: iterative # [naive, iterative]
visualize:
module: myeasymocap.io.vis.Vis3D
skip: False
key_from_data: [images, cameras]
key_from_previous: [keypoints3d] # used for the final joint optimization
args:
scale: 0.5
mode: center
at_final:
load_body_model:
module: myeasymocap.io.model.SMPLLoader
args:
model_path: models/pare/data/body_models/smpl/SMPL_NEUTRAL.pkl #
regressor_path: models/J_regressor_body25.npy
init_params:
module: myeasymocap.operations.init.InitParams
key_from_data: [keypoints3d]
args:
num_poses: 69
num_shapes: 10
fitShape:
module: myeasymocap.operations.optimizer.Optimizer
key_from_data: [keypoints3d]
key_from_previous: [model, params]
args:
optimizer_args: {optim_type: lbfgs}
optimize_keys: [shapes]
loss:
k3d:
weight: 100.
module: myeasymocap.operations.loss.LimbLength
key_from_output: [keypoints]
key_from_infos: [keypoints3d]
args:
kintree: [[8, 1], [2, 5], [2, 3], [5, 6], [3, 4], [6, 7], [2, 3], [5, 6], [3, 4], [6, 7], [2, 3], [5, 6], [3, 4], [6, 7], [1, 0], [9, 12], [9, 10], [10, 11], [12, 13],[13, 14]]
regshape:
weight: 0.1
module: myeasymocap.operations.loss.RegLoss
key_from_output: [shapes]
key_from_infos: []
args:
key: shapes
norm: l2
init_RT:
module: myeasymocap.operations.optimizer.Optimizer
key_from_data: [keypoints, keypoints3d]
key_from_previous: [model, params]
args:
optimizer_args: {optim_type: lbfgs}
optimize_keys: [Th, Rh]
loss:
k3d:
weight: 100.
module: myeasymocap.operations.loss.Keypoints3D
key_from_output: [keypoints]
key_from_infos: [keypoints3d]
args:
norm: l2
index_est: [2, 5, 9, 12]
index_gt: [2, 5, 9, 12]
smooth:
weight: 1.
module: myeasymocap.operations.loss.Smooth
key_from_output: [Th, keypoints]
key_from_infos: [] # TODO: compute the smoothing weights from the 2D confidences
args:
keys: [keypoints, Th]
smooth_type: [Linear, Linear] # the Depth type seems to need camera parameters for the conversion
norm: [l2, l2]
order: [2, 2]
weights: [10., 100.]
window_weight: [0.5, 0.3, 0.1, 0.1]
refine_poses:
repeat: 2
module: myeasymocap.operations.optimizer.Optimizer
key_from_data: [keypoints, keypoints3d]
key_from_previous: [model, params]
args:
optimizer_args: {optim_type: lbfgs}
optimize_keys: [poses, Rh, Th]
loss:
k3d:
weight: 1000.
module: myeasymocap.operations.loss.Keypoints3D
key_from_output: [keypoints]
key_from_infos: [keypoints3d]
args:
norm: l2
norm_info: 0.02
smooth:
weight: 1.
module: myeasymocap.operations.loss.Smooth
key_from_output: [poses, Th, keypoints]
key_from_infos: []
args:
keys: [Th, poses, keypoints]
smooth_type: [Linear, Linear, Linear]
norm: [l2, l2, l2]
order: [2, 2, 2]
weights: [100., 10., 10.,]
window_weight: [0.5, 0.3, 0.1, 0.1]
prior:
weight: 0.1
module: easymocap.multistage.gmm.GMMPrior
key_from_output: [poses]
key_from_infos: []
args:
start: 0
end: 69
write:
module: myeasymocap.io.write.WriteSMPL
key_from_data: [meta]
key_from_previous: [params, model]
args:
name: smpl
# render:
# module: myeasymocap.io.vis3d.Render_multiview
# key_from_data: [cameras, imgnames]
# key_from_previous: [params, body_model]
# args:
# backend: pyrender
# view_list: [0]
render_ground:
module: myeasymocap.io.vis3d.Render_multiview
key_from_data: [cameras, imgnames]
key_from_previous: [params, body_model]
args:
backend: pyrender
view_list: [3]
mode: ground
scale: 1.
shape: [1024, 1024]
make_video:
module: myeasymocap.io.video.MakeVideo
args:
fps: 50
keep_image: False

155
easymocap/multistage/gmm.py Normal file
View File

@@ -0,0 +1,155 @@
import pickle
import os
from os.path import join
import numpy as np
import torch
from .lossbase import LossBase
def create_prior_from_cmu(n_gaussians, epsilon=1e-15):
"""Load the gmm from the CMU motion database."""
from os.path import dirname
np_dtype = np.float32
with open(join(dirname(__file__), 'gmm_%02d.pkl'%(n_gaussians)), 'rb') as f:
gmm = pickle.load(f, encoding='latin1')
if True:
means = gmm['means'].astype(np_dtype)
covs = gmm['covars'].astype(np_dtype)
weights = gmm['weights'].astype(np_dtype)
precisions = [np.linalg.inv(cov) for cov in covs]
precisions = np.stack(precisions).astype(np_dtype)
sqrdets = np.array([(np.sqrt(np.linalg.det(c)))
for c in gmm['covars']])
const = (2 * np.pi)**(69 / 2.)
nll_weights = np.asarray(gmm['weights'] / (const * (sqrdets / sqrdets.min())))
cov_dets = [np.log(np.linalg.det(cov.astype(np_dtype)) + epsilon)
for cov in covs]
return {
'means': means,
'covs': covs,
'precisions': precisions,
'nll_weights': -np.log(nll_weights[None]),
'weights': weights,
'pi_term': np.log(2*np.pi),
'cov_dets': cov_dets
}
class MaxMixturePrior(LossBase):
def __init__(self, num_gaussians=8, epsilon=1e-16, use_merged=True,
start=3, end=72):
super(MaxMixturePrior, self).__init__()
np_dtype = np.float32
self.num_gaussians = num_gaussians
self.epsilon = epsilon
self.use_merged = use_merged
data = create_prior_from_cmu(num_gaussians)
self.start = start
self.end = end
for key, val in data.items():
self.register_buffer(key, torch.tensor(val, dtype=torch.float32))
self.random_var_dim = self.means.shape[1] # pose dimensionality; used by log_likelihood below
def get_mean(self):
''' Returns the mean of the mixture '''
mean_pose = torch.matmul(self.weights, self.means)
return mean_pose
def merged_log_likelihood(self, poses):
poses = poses[..., self.start:self.end]
diff_from_mean = poses.unsqueeze(dim=1) - self.means[None, :, :self.end-self.start]
prec_diff_prod = torch.einsum('mij,bmj->bmi',
[self.precisions, diff_from_mean])
diff_prec_quadratic = (prec_diff_prod * diff_from_mean).sum(dim=-1)
curr_loglikelihood = 0.5 * diff_prec_quadratic + self.nll_weights
min_likelihood, _ = torch.min(curr_loglikelihood, dim=1)
return min_likelihood
def log_likelihood(self, pose, betas, *args, **kwargs):
''' Create graph operation for negative log-likelihood calculation
'''
likelihoods = []
for idx in range(self.num_gaussians):
mean = self.means[idx]
prec = self.precisions[idx]
cov = self.covs[idx]
diff_from_mean = pose - mean
curr_loglikelihood = torch.einsum('bj,ji->bi',
[diff_from_mean, prec])
curr_loglikelihood = torch.einsum('bi,bi->b',
[curr_loglikelihood,
diff_from_mean])
cov_term = torch.log(torch.det(cov) + self.epsilon)
curr_loglikelihood += 0.5 * (cov_term +
self.random_var_dim *
self.pi_term)
likelihoods.append(curr_loglikelihood)
log_likelihoods = torch.stack(likelihoods, dim=1)
min_idx = torch.argmin(log_likelihoods, dim=1)
weight_component = self.nll_weights[:, min_idx]
return weight_component + log_likelihoods[:, min_idx]
def forward(self, poses, **kwargs):
if self.use_merged:
return self.merged_log_likelihood(poses).mean()
else:
return self.log_likelihood(poses).mean()
class MaxMixtureCompletePrior(object):
"""Prior density estimation."""
prior = None
mean_pose = None
def __init__(self, n_gaussians=8, start=3, end=72):
self.n_gaussians = n_gaussians
self.start = start
self.end = end
if self.prior is None:
self.prior = self.create_prior_from_cmu()
def create_prior_from_cmu(self):
"""Load the gmm from the CMU motion database."""
from os.path import dirname
np_dtype = np.float32
with open(join(dirname(__file__), 'gmm_%02d.pkl'%(self.n_gaussians)), 'rb') as f:
gmm = pickle.load(f, encoding='latin1')
if True:
means = gmm['means'].astype(np_dtype)
covs = gmm['covars'].astype(np_dtype)
weights = gmm['weights'].astype(np_dtype)
precisions = [np.linalg.inv(cov) for cov in covs]
precisions = np.stack(precisions).astype(np_dtype)
sqrdets = np.array([(np.sqrt(np.linalg.det(c)))
for c in gmm['covars']])
const = (2 * np.pi)**(69 / 2.)
nll_weights = np.asarray(gmm['weights'] / (const *
(sqrdets / sqrdets.min())))
self.means = means
self.weights = weights
self.mean_pose = weights.dot(means)
def __call__(self, body_model, body_params, info):
poses = body_params['poses']
for nf in range(poses.shape[0]):
poses[nf][self.start:self.end] = self.mean_pose[:self.end-self.start]
return body_params
def get_gmm_prior(self):
"""Getter implementation."""
return self.prior
class GMMPrior(MaxMixturePrior):
def __call__(self, pred, target):
poses = pred['poses']
poses = poses.reshape(-1, poses.shape[-1])
if self.use_merged:
return self.merged_log_likelihood(poses).mean()
else:
return self.log_likelihood(poses).mean()
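
GMMPrior is what the configs above reference as easymocap.multistage.gmm.GMMPrior; it scores only pred['poses'] and ignores the target. Hypothetical usage, assuming gmm_08.pkl is present next to the module as create_prior_from_cmu expects:

import torch  # already imported at the top of this file

prior = GMMPrior(num_gaussians=8, start=0, end=69)  # matches start/end in the configs
poses = torch.zeros(16, 69)  # (nFrames, 69) axis-angle body pose, illustrative
nll = prior({'poses': poses}, target=None)  # scalar negative log-likelihood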

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,241 @@
import os
from os.path import join
import numpy as np
import cv2
import torch
import torch.nn as nn
import pickle
import math
def rotate_2d(pt_2d, rot_rad):
x = pt_2d[0]
y = pt_2d[1]
sn, cs = np.sin(rot_rad), np.cos(rot_rad)
xx = x * cs - y * sn
yy = x * sn + y * cs
return np.array([xx, yy], dtype=np.float32)
def gen_trans_from_patch_cv(c_x, c_y, src_width, src_height, dst_width, dst_height, scale, rot, inv=False):
# augment size with scale
src_w = src_width * scale
src_h = src_height * scale
src_center = np.zeros(2)
src_center[0] = c_x
src_center[1] = c_y # np.array([c_x, c_y], dtype=np.float32)
# augment rotation
rot_rad = np.pi * rot / 180
src_downdir = rotate_2d(np.array([0, src_h * 0.5], dtype=np.float32), rot_rad)
src_rightdir = rotate_2d(np.array([src_w * 0.5, 0], dtype=np.float32), rot_rad)
dst_w = dst_width
dst_h = dst_height
dst_center = np.array([dst_w * 0.5, dst_h * 0.5], dtype=np.float32)
dst_downdir = np.array([0, dst_h * 0.5], dtype=np.float32)
dst_rightdir = np.array([dst_w * 0.5, 0], dtype=np.float32)
src = np.zeros((3, 2), dtype=np.float32)
src[0, :] = src_center
src[1, :] = src_center + src_downdir
src[2, :] = src_center + src_rightdir
dst = np.zeros((3, 2), dtype=np.float32)
dst[0, :] = dst_center
dst[1, :] = dst_center + dst_downdir
dst[2, :] = dst_center + dst_rightdir
inv_trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
return trans, inv_trans
def generate_patch_image_cv(cvimg, c_x, c_y, bb_width, bb_height, patch_width, patch_height, do_flip, scale, rot):
trans, inv_trans = gen_trans_from_patch_cv(c_x, c_y, bb_width, bb_height, patch_width, patch_height, scale, rot, inv=False)
img_patch = cv2.warpAffine(cvimg, trans, (int(patch_width), int(patch_height)),
flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_CONSTANT)
return img_patch, trans, inv_trans
def get_single_image_crop_demo(image, bbox, scale=1.2, crop_size=224,
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], fliplr=False):
crop_image, trans, inv_trans = generate_patch_image_cv(
cvimg=image.copy(),
c_x=bbox[0],
c_y=bbox[1],
bb_width=bbox[2],
bb_height=bbox[3],
patch_width=crop_size[0],
patch_height=crop_size[1],
do_flip=False,
scale=scale,
rot=0,
)
if fliplr:
crop_image = cv2.flip(crop_image, 1)
# cv2.imwrite('debug_crop.jpg', crop_image)
# import ipdb; ipdb.set_trace()
crop_image = crop_image.transpose(2,0,1)
mean1=np.array(mean, dtype=np.float32).reshape(3,1,1)
std1= np.array(std, dtype=np.float32).reshape(3,1,1)
crop_image = (crop_image.astype(np.float32))/255.
# _max = np.max(abs(crop_image))
# crop_image = np.divide(crop_image, _max)
crop_image = (crop_image - mean1)/std1
return crop_image, inv_trans
def xyxy2ccwh(bbox):
w = bbox[:, 2] - bbox[:, 0]
h = bbox[:, 3] - bbox[:, 1]
cx = (bbox[:, 2] + bbox[:, 0])/2
cy = (bbox[:, 3] + bbox[:, 1])/2
return np.stack([cx, cy, w, h], axis=1)
class BaseTopDownModel(nn.Module):
def __init__(self, bbox_scale, res_input,
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
super().__init__()
self.bbox_scale = bbox_scale
if not isinstance(res_input, list):
res_input = [res_input, res_input]
self.crop_size = res_input
self.mean = mean
self.std = std
def load_checkpoint(self, model, state_dict, prefix, strict):
state_dict_new = {}
for key, val in state_dict.items():
if key.startswith(prefix):
key_new = key.replace(prefix, '')
state_dict_new[key_new] = val
model.load_state_dict(state_dict_new, strict=strict)
def infer(self, image, bbox, to_numpy=False, flips=None):
if isinstance(image, str):
image = cv2.imread(image)
img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
squeeze = False
if len(bbox.shape) == 1:
bbox = bbox[None]
squeeze = True
# TODO: support batches of multiple images
bbox = xyxy2ccwh(bbox)
inputs = []
inv_trans_ = []
for i in range(bbox.shape[0]):
if flips is None:
fliplr=False
else:
fliplr=flips[i]
norm_img, inv_trans = get_single_image_crop_demo(
img,
bbox[i],
scale=self.bbox_scale,
crop_size=self.crop_size,
mean=self.mean,
std=self.std,
fliplr=fliplr
)
inputs.append(norm_img)
inv_trans_.append(inv_trans)
inputs = np.stack(inputs)
inv_trans_ = np.stack(inv_trans_)
inputs = torch.FloatTensor(inputs).to(self.device)
with torch.no_grad():
output = self.model(inputs)
if squeeze:
for key, val in output.items():
output[key] = val[0]
if to_numpy:
for key, val in output.items():
if torch.is_tensor(val):
output[key] = val.detach().cpu().numpy()
output['inv_trans'] = inv_trans_
return output
@staticmethod
def batch_affine_transform(points, trans):
# points: (Bn, J, 2), trans: (Bn, 2, 3)
points = np.dstack((points[..., :2], np.ones((*points.shape[:-1], 1))))
out = np.matmul(points, trans.swapaxes(-1, -2))
return out
class BaseTopDownModelCache(BaseTopDownModel):
def __init__(self, name, **kwargs):
super().__init__(**kwargs)
self.name = name
def __call__(self, bbox, images, imgname, flips=None):
basename = os.sep.join(imgname.split(os.sep)[-2:])
cachename = join(self.output, self.name, basename.replace('.jpg', '.pkl'))
os.makedirs(os.path.dirname(cachename), exist_ok=True)
if os.path.exists(cachename):
with open(cachename, 'rb') as f:
output = pickle.load(f)
else:
output = self.infer(images, bbox, to_numpy=True, flips=flips)
with open(cachename, 'wb') as f:
pickle.dump(output, f)
ret = {
'params': output
}
return ret
# post processing
def get_max_preds(batch_heatmaps):
'''
get predictions from score maps
heatmaps: numpy.ndarray([batch_size, num_joints, height, width])
'''
assert isinstance(batch_heatmaps, np.ndarray), \
'batch_heatmaps should be numpy.ndarray'
assert batch_heatmaps.ndim == 4, 'batch_images should be 4-ndim'
batch_size = batch_heatmaps.shape[0]
num_joints = batch_heatmaps.shape[1]
width = batch_heatmaps.shape[3]
heatmaps_reshaped = batch_heatmaps.reshape((batch_size, num_joints, -1))
idx = np.argmax(heatmaps_reshaped, 2)
maxvals = np.amax(heatmaps_reshaped, 2)
maxvals = maxvals.reshape((batch_size, num_joints, 1))
idx = idx.reshape((batch_size, num_joints, 1))
preds = np.tile(idx, (1, 1, 2)).astype(np.float32)
preds[:, :, 0] = (preds[:, :, 0]) % width
preds[:, :, 1] = np.floor((preds[:, :, 1]) / width)
pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2))
pred_mask = pred_mask.astype(np.float32)
preds *= pred_mask
return preds, maxvals
def get_preds_from_heatmaps(batch_heatmaps):
coords, maxvals = get_max_preds(batch_heatmaps)
heatmap_height = batch_heatmaps.shape[2]
heatmap_width = batch_heatmaps.shape[3]
# post-processing
if True:
for n in range(coords.shape[0]):
for p in range(coords.shape[1]):
hm = batch_heatmaps[n][p]
px = int(math.floor(coords[n][p][0] + 0.5))
py = int(math.floor(coords[n][p][1] + 0.5))
if 1 < px < heatmap_width-1 and 1 < py < heatmap_height-1:
diff = np.array(
[
hm[py][px+1] - hm[py][px-1],
hm[py+1][px]-hm[py-1][px]
]
)
coords[n][p] += np.sign(diff) * .25
coords = coords.astype(np.float32) * 4
pred = np.dstack((coords, maxvals))
return pred
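
get_preds_from_heatmaps takes the per-joint argmax, refines it by a quarter-pixel shift toward the local gradient, then scales by 4 (the assumed heatmap-to-input stride). Example with dummy shapes:

import numpy as np  # as at the top of this file

heatmaps = np.random.rand(1, 17, 96, 72).astype(np.float32)  # (batch, joints, H, W)
pred = get_preds_from_heatmaps(heatmaps)
print(pred.shape)  # (1, 17, 3): x, y in input-resolution pixels, plus confidence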

View File

View File

@@ -0,0 +1,218 @@
import torch
from torch import nn
from .modules import BasicBlock, Bottleneck
class StageModule(nn.Module):
def __init__(self, stage, output_branches, c, bn_momentum):
super(StageModule, self).__init__()
self.stage = stage
self.output_branches = output_branches
self.branches = nn.ModuleList()
for i in range(self.stage):
w = c * (2 ** i)
branch = nn.Sequential(
BasicBlock(w, w, bn_momentum=bn_momentum),
BasicBlock(w, w, bn_momentum=bn_momentum),
BasicBlock(w, w, bn_momentum=bn_momentum),
BasicBlock(w, w, bn_momentum=bn_momentum),
)
self.branches.append(branch)
self.fuse_layers = nn.ModuleList()
# for each output_branches (i.e. each branch in all cases but the very last one)
for i in range(self.output_branches):
self.fuse_layers.append(nn.ModuleList())
for j in range(self.stage): # for each branch
if i == j:
self.fuse_layers[-1].append(nn.Sequential()) # Used in place of "None" because it is callable
elif i < j:
self.fuse_layers[-1].append(nn.Sequential(
nn.Conv2d(c * (2 ** j), c * (2 ** i), kernel_size=(1, 1), stride=(1, 1), bias=False),
nn.BatchNorm2d(c * (2 ** i), eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
nn.Upsample(scale_factor=(2.0 ** (j - i)), mode='nearest'),
))
elif i > j:
ops = []
for k in range(i - j - 1):
ops.append(nn.Sequential(
nn.Conv2d(c * (2 ** j), c * (2 ** j), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1),
bias=False),
nn.BatchNorm2d(c * (2 ** j), eps=1e-05, momentum=0.1, affine=True,
track_running_stats=True),
nn.ReLU(inplace=True),
))
ops.append(nn.Sequential(
nn.Conv2d(c * (2 ** j), c * (2 ** i), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1),
bias=False),
nn.BatchNorm2d(c * (2 ** i), eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
))
self.fuse_layers[-1].append(nn.Sequential(*ops))
self.relu = nn.ReLU(inplace=True)
def forward(self, x):
assert len(self.branches) == len(x)
x = [branch(b) for branch, b in zip(self.branches, x)]
x_fused = []
for i in range(len(self.fuse_layers)):
for j in range(0, len(self.branches)):
if j == 0:
x_fused.append(self.fuse_layers[i][0](x[0]))
else:
x_fused[i] = x_fused[i] + self.fuse_layers[i][j](x[j])
for i in range(len(x_fused)):
x_fused[i] = self.relu(x_fused[i])
return x_fused
class HRNet(nn.Module):
def __init__(self, c=48, nof_joints=17, bn_momentum=0.1):
super(HRNet, self).__init__()
# Input (stem net)
self.conv1 = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
self.bn1 = nn.BatchNorm2d(64, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True)
self.conv2 = nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
self.bn2 = nn.BatchNorm2d(64, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True)
self.relu = nn.ReLU(inplace=True)
# Stage 1 (layer1) - First group of bottleneck (resnet) modules
downsample = nn.Sequential(
nn.Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False),
nn.BatchNorm2d(256, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
)
self.layer1 = nn.Sequential(
Bottleneck(64, 64, downsample=downsample),
Bottleneck(256, 64),
Bottleneck(256, 64),
Bottleneck(256, 64),
)
# Fusion layer 1 (transition1) - Creation of the first two branches (one full and one half resolution)
self.transition1 = nn.ModuleList([
nn.Sequential(
nn.Conv2d(256, c, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
nn.BatchNorm2d(c, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
nn.ReLU(inplace=True),
),
nn.Sequential(nn.Sequential( # Double Sequential to fit with official pretrained weights
nn.Conv2d(256, c * (2 ** 1), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False),
nn.BatchNorm2d(c * (2 ** 1), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
nn.ReLU(inplace=True),
)),
])
# Stage 2 (stage2) - Second module with 1 group of bottleneck (resnet) modules. This has 2 branches
self.stage2 = nn.Sequential(
StageModule(stage=2, output_branches=2, c=c, bn_momentum=bn_momentum),
)
# Fusion layer 2 (transition2) - Creation of the third branch (1/4 resolution)
self.transition2 = nn.ModuleList([
nn.Sequential(), # None, - Used in place of "None" because it is callable
nn.Sequential(), # None, - Used in place of "None" because it is callable
nn.Sequential(nn.Sequential( # Double Sequential to fit with official pretrained weights
nn.Conv2d(c * (2 ** 1), c * (2 ** 2), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False),
nn.BatchNorm2d(c * (2 ** 2), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
nn.ReLU(inplace=True),
)), # ToDo Why the new branch derives from the "upper" branch only?
])
# Stage 3 (stage3) - Third module with 4 groups of bottleneck (resnet) modules. This has 3 branches
self.stage3 = nn.Sequential(
StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum),
StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum),
StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum),
StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum),
)
# Fusion layer 3 (transition3) - Creation of the fourth branch (1/8 resolution)
self.transition3 = nn.ModuleList([
nn.Sequential(), # None, - Used in place of "None" because it is callable
nn.Sequential(), # None, - Used in place of "None" because it is callable
nn.Sequential(), # None, - Used in place of "None" because it is callable
nn.Sequential(nn.Sequential( # Double Sequential to fit with official pretrained weights
nn.Conv2d(c * (2 ** 2), c * (2 ** 3), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False),
nn.BatchNorm2d(c * (2 ** 3), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
nn.ReLU(inplace=True),
)), # ToDo Why the new branch derives from the "upper" branch only?
])
# Stage 4 (stage4) - Fourth module with 3 groups of bottleneck (resnet) modules. This has 4 branches
self.stage4 = nn.Sequential(
StageModule(stage=4, output_branches=4, c=c, bn_momentum=bn_momentum),
StageModule(stage=4, output_branches=4, c=c, bn_momentum=bn_momentum),
StageModule(stage=4, output_branches=1, c=c, bn_momentum=bn_momentum),
)
# Final layer (final_layer)
self.final_layer = nn.Conv2d(c, nof_joints, kernel_size=(1, 1), stride=(1, 1))
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.conv2(x)
x = self.bn2(x)
x = self.relu(x)
x = self.layer1(x)
x = [trans(x) for trans in self.transition1] # Since now, x is a list (# == nof branches)
x = self.stage2(x)
# x = [trans(x[-1]) for trans in self.transition2] # New branch derives from the "upper" branch only
x = [
self.transition2[0](x[0]),
self.transition2[1](x[1]),
self.transition2[2](x[-1])
] # New branch derives from the "upper" branch only
x = self.stage3(x)
# x = [trans(x) for trans in self.transition3] # New branch derives from the "upper" branch only
x = [
self.transition3[0](x[0]),
self.transition3[1](x[1]),
self.transition3[2](x[2]),
self.transition3[3](x[-1])
] # New branch derives from the "upper" branch only
x = self.stage4(x)
x = self.final_layer(x[0])
return {
'output': x
}
if __name__ == '__main__':
# model = HRNet(48, 17, 0.1)
model = HRNet(32, 17, 0.1)
# print(model)
model.load_state_dict(
# torch.load('./weights/pose_hrnet_w48_384x288.pth')
torch.load('./weights/pose_hrnet_w32_256x192.pth')
)
print('ok!!')
if torch.cuda.is_available() and False:
torch.backends.cudnn.deterministic = True
device = torch.device('cuda:0')
else:
device = torch.device('cpu')
print(device)
model = model.to(device)
y = model(torch.ones(1, 3, 384, 288).to(device))['output'] # forward returns a dict
print(y.shape)
print(torch.min(y).item(), torch.mean(y).item(), torch.max(y).item())

View File

@@ -0,0 +1,72 @@
import torch
from torch import nn
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None, bn_momentum=0.1):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes, momentum=bn_momentum)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes, momentum=bn_momentum)
self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion, momentum=bn_momentum)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None, bn_momentum=0.1):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes, momentum=bn_momentum)
self.relu = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes, momentum=bn_momentum)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
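
Both blocks are standard residual units: with stride 1 and matching channel counts the input and output shapes agree, which is what the StageModule branches in hrnet.py above rely on. A quick shape check (illustrative):

import torch  # as at the top of this file

block = BasicBlock(48, 48)  # inplanes == planes, stride 1, no downsample
x = torch.randn(1, 48, 96, 72)
assert block(x).shape == x.shape  # the residual add needs identical shapes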

View File

@@ -0,0 +1,130 @@
import os
import numpy as np
import math
import cv2
import torch
from ..basetopdown import BaseTopDownModelCache
from .hrnet import HRNet
def get_max_preds(batch_heatmaps):
'''
get predictions from score maps
heatmaps: numpy.ndarray([batch_size, num_joints, height, width])
'''
assert isinstance(batch_heatmaps, np.ndarray), \
'batch_heatmaps should be numpy.ndarray'
assert batch_heatmaps.ndim == 4, 'batch_images should be 4-ndim: {}'.format(batch_heatmaps.shape)
batch_size = batch_heatmaps.shape[0]
num_joints = batch_heatmaps.shape[1]
width = batch_heatmaps.shape[3]
heatmaps_reshaped = batch_heatmaps.reshape((batch_size, num_joints, -1))
idx = np.argmax(heatmaps_reshaped, 2)
maxvals = np.amax(heatmaps_reshaped, 2)
maxvals = maxvals.reshape((batch_size, num_joints, 1))
idx = idx.reshape((batch_size, num_joints, 1))
preds = np.tile(idx, (1, 1, 2)).astype(np.float32)
preds[:, :, 0] = (preds[:, :, 0]) % width
preds[:, :, 1] = np.floor((preds[:, :, 1]) / width)
pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2))
pred_mask = pred_mask.astype(np.float32)
preds *= pred_mask
return preds, maxvals
COCO17_IN_BODY25 = [0,16,15,18,17,5,2,6,3,7,4,12,9,13,10,14,11]
pairs = [[1, 8], [1, 2], [1, 5], [2, 3], [3, 4], [5, 6], [6, 7], [8, 9], [9, 10], [10, 11], [8, 12], [12, 13], [13, 14], [1, 0], [0,15], [15,17], [0,16], [16,18], [14,19], [19,20], [14,21], [11,22], [22,23], [11,24]]
def coco17tobody25(points2d):
kpts = np.zeros((points2d.shape[0], 25, 3))
kpts[:, COCO17_IN_BODY25, :2] = points2d[:, :, :2]
kpts[:, COCO17_IN_BODY25, 2:3] = points2d[:, :, 2:3]
kpts[:, 8, :2] = kpts[:, [9, 12], :2].mean(axis=1)
kpts[:, 8, 2] = kpts[:, [9, 12], 2].min(axis=1)
kpts[:, 1, :2] = kpts[:, [2, 5], :2].mean(axis=1)
kpts[:, 1, 2] = kpts[:, [2, 5], 2].min(axis=1)
# swap the coordinate order here if needed:
# kpts = kpts[:, :, [1,0,2]]
return kpts
class MyHRNet(BaseTopDownModelCache):
def __init__(self, ckpt):
super().__init__(name='hand2d', bbox_scale=1.25, res_input=[288, 384])
model = HRNet(48, 17, 0.1)
if not os.path.exists(ckpt) and ckpt.endswith('pose_hrnet_w48_384x288.pth'):
url = "11ezQ6a_MxIRtj26WqhH3V3-xPI3XqYAw"
text = '''Download `models/pytorch/pose_coco/pose_hrnet_w48_384x288.pth` from [OneDrive](https://1drv.ms/f/s!AhIXJn_J-blW231MH2krnmLq5kkQ),
and place it into {}'''.format(os.path.dirname(ckpt))
print(text)
os.makedirs(os.path.dirname(ckpt), exist_ok=True)
cmd = 'gdown "{}" -O {}'.format(url, ckpt)
print('\n', cmd, '\n')
os.system(cmd)
assert os.path.exists(ckpt), f'{ckpt} not exists'
checkpoint = torch.load(ckpt, map_location='cpu')
model.load_state_dict(checkpoint)
model.eval()
self.model = model
self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
self.model.to(self.device)
@staticmethod
def get_max_preds(batch_heatmaps):
coords, maxvals = get_max_preds(batch_heatmaps)
heatmap_height = batch_heatmaps.shape[2]
heatmap_width = batch_heatmaps.shape[3]
# post-processing
if True:
for n in range(coords.shape[0]):
for p in range(coords.shape[1]):
hm = batch_heatmaps[n][p]
px = int(math.floor(coords[n][p][0] + 0.5))
py = int(math.floor(coords[n][p][1] + 0.5))
if 1 < px < heatmap_width-1 and 1 < py < heatmap_height-1:
diff = np.array(
[
hm[py][px+1] - hm[py][px-1],
hm[py+1][px]-hm[py-1][px]
]
)
coords[n][p] += np.sign(diff) * .25
coords = coords.astype(np.float32) * 4
pred = np.dstack((coords, maxvals))
return pred
def __call__(self, bbox, images, imgnames):
squeeze = False
if not isinstance(images, list):
images = [images]
imgnames = [imgnames]
bbox = [bbox]
squeeze = True
nViews = len(images)
kpts_all = []
for nv in range(nViews):
_bbox = bbox[nv]
if _bbox.shape[0] == 0:
kpts_all.append(np.zeros((25, 3))) # body25 layout, matching the coco17tobody25 output below
continue
img = images[nv]
# TODO: add flip test
out = super().__call__(_bbox, img, imgnames[nv])
output = out['params']['output']
kpts = self.get_max_preds(output)
kpts_ori = self.batch_affine_transform(kpts, out['params']['inv_trans'])
kpts = np.concatenate([kpts_ori, kpts[..., -1:]], axis=-1)
kpts = coco17tobody25(kpts)
if len(kpts.shape) == 3:
kpts = kpts[0]
kpts_all.append(kpts)
kpts_all = np.stack(kpts_all)
if squeeze:
kpts_all = kpts_all[0]
return {
'keypoints': kpts_all
}
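
Hypothetical single-view call, assuming the checkpoint exists locally and noting that the cache directory attribute self.output is expected to be injected by the pipeline before use:

import numpy as np  # as at the top of this file

est = MyHRNet(ckpt='data/models/pose_hrnet_w48_384x288.pth')
est.output = 'output'  # cache root; assumed to be set by the calling pipeline
bbox = np.array([100, 100, 400, 500, 0.99], dtype=np.float32)[None]  # (1, 5) xyxy + score
out = est(bbox, 'images/00/000000.jpg', 'images/00/000000.jpg')
print(out['keypoints'].shape)  # (25, 3) body25 keypoints with confidence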

View File

@@ -0,0 +1,292 @@
import torch
import numpy as np
import os
import cv2
from os.path import join
import pickle
def check_modelpath(paths):
if isinstance(paths, str):
assert os.path.exists(paths), paths
return paths
elif isinstance(paths, list):
for path in paths:
if os.path.exists(path):
print(f'Found model in {path}')
break
else:
print(f'No model found in {paths}!')
raise FileNotFoundError(paths)
return path
else:
raise NotImplementedError
class BaseYOLOv5:
def __init__(self, ckpt=None, model='yolov5m', name='object2d', multiview=True) -> None:
if ckpt is not None:
ckpt = check_modelpath(ckpt)
self.model = torch.hub.load('ultralytics/yolov5', 'custom', ckpt)
else:
print('[{}] Not given ckpt, use default yolov5'.format(self.__class__.__name__))
self.model = torch.hub.load('ultralytics/yolov5', model)
self.multiview = multiview
self.name = name
def check_cache(self, imgname):
basename = os.path.basename(imgname)
imgext = '.' + basename.split('.')[-1]
nv = imgname.split(os.sep)[-2]
cachename = join(self.output, self.name, nv, basename.replace(imgext, '.npy'))
os.makedirs(os.path.dirname(cachename), exist_ok=True)
if os.path.exists(cachename):
output = np.load(cachename, allow_pickle=True)
return True, output, cachename
else:
return False, None, cachename
def check_image(self, img_or_name):
if isinstance(img_or_name, str):
images = cv2.imread(img_or_name)
else:
images = img_or_name
images = cv2.cvtColor(images, cv2.COLOR_BGR2RGB)
return images
@torch.no_grad()
def detect(self, image, imgname):
flag, cache, cachename = self.check_cache(imgname)
if flag:
return cache
image = self.check_image(imgname)
results = self.model(image) #RGB images[:,:,::-1]
arrays = np.array(results.pandas().xyxy[0])
np.save(cachename, arrays)
return arrays
@staticmethod
def select_class(results, name):
select = []
for i, res in enumerate(results):
classname = res[6]
if classname != name:
continue
box = res[:5]
select.append(box)
return select
def select_bbox(self, select, imgname):
if select.shape[0] == 0:
return select
# Naive: select the best
idx = np.argsort(select[:, -1])[::-1]
return select[idx[0:1]]
def __call__(self, images, imgnames): # multi-view seems to be the default here; a single-view variant should subclass this
squeeze = False
if not isinstance(images, list):
images = [images]
imgnames = [imgnames]
squeeze = True
detects = {'bbox': [[] for _ in range(len(images))]}
for nv in range(len(images)):
res = self.detect(images[nv], imgnames[nv])
select = self.select_class(res, self.name)
if len(select) == 0:
select = np.zeros((0,5), dtype=np.float32)
else:
select = np.stack(select).astype(np.float32)
# TODO: add track here
select = self.select_bbox(select, imgnames[nv])
detects['bbox'][nv] = select
if squeeze:
detects['bbox'] = detects['bbox'][0]
return detects
class YoloWithTrack(BaseYOLOv5):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.track_cache = {}
@staticmethod
def calculate_iou(bbox_pre, bbox_now):
area_now = (bbox_now[:, 2] - bbox_now[:, 0])*(bbox_now[:, 3]-bbox_now[:, 1])
area_pre = (bbox_pre[:, 2] - bbox_pre[:, 0])*(bbox_pre[:, 3]-bbox_pre[:, 1])
# compute IOU
# max of left
xx1 = np.maximum(bbox_now[:, 0], bbox_pre[:, 0])
yy1 = np.maximum(bbox_now[:, 1], bbox_pre[:, 1])
# min of right
xx2 = np.minimum(bbox_now[:, 0+2], bbox_pre[:, 0+2])
yy2 = np.minimum(bbox_now[:, 1+2], bbox_pre[:, 1+2])
# w h
w = np.maximum(0, xx2 - xx1)
h = np.maximum(0, yy2 - yy1)
over = (w*h)/(area_pre+area_now-w*h)
return over
def select_bbox(self, select, imgname):
if select.shape[0] == 0:
return select
sub = os.path.basename(os.path.dirname(imgname))
frame = int(os.path.basename(imgname).split('.')[0])
if sub not in self.track_cache:
# select the best
select = super().select_bbox(select, imgname)
self.track_cache[sub] = {
'frame': [frame],
'bbox': [select]
}
return select
bbox_pre = self.track_cache[sub]['bbox'][-1]
iou = self.calculate_iou(bbox_pre, select)
idx = iou.argmax()
select = select[idx:idx+1]
self.track_cache[sub]['frame'].append(frame)
self.track_cache[sub]['bbox'].append(select)
return select
class DetectToPelvis:
def __init__(self, key) -> None:
self.key = key
self.multiview = True
def __call__(self, **kwargs):
key = self.key
val = kwargs[key]
ret = {'pelvis': []}
for nv in range(len(val)):
bbox = val[nv]
center = np.stack([(bbox[:, 0] + bbox[:, 2])/2, (bbox[:, 1] + bbox[:, 3])/2, bbox[:, -1]], axis=-1)
ret['pelvis'].append(center)
return ret
class Yolo_model:
def __init__(self, mode, yolo_ckpt, multiview, repo_or_dir = 'ultralytics/yolov5', source='github') -> None:
yolo_ckpt = check_modelpath(yolo_ckpt)
self.model = torch.hub.load(repo_or_dir, 'custom', yolo_ckpt, source=source)
self.min_detect_thres = 0.3
self.mode = mode # 'fullimg' or 'bboxcrop'
self.output = 'output'
self.name = 'yolo'
self.multiview = multiview
@torch.no_grad()
def det_step(self, img_or_name, imgname, bbox=[]):
basename = os.path.basename(imgname)
if self.multiview:
nv = imgname.split('/')[-2]
cachename = join(self.output, self.name, nv, basename.replace('.jpg', '.pkl'))
else:
cachename = join(self.output, self.name, basename.replace('.jpg', '.pkl'))
os.makedirs(os.path.dirname(cachename), exist_ok=True)
if os.path.exists(cachename):
with open(cachename, 'rb') as f:
output = pickle.load(f)
return output
if isinstance(img_or_name,str):
images = cv2.imread(img_or_name)
else:
images = img_or_name
if self.mode == 'bboxcrop':
bbox[0] = max(0,bbox[0])
bbox[1] = max(0,bbox[1])
crop = images[int(bbox[1]):int(bbox[3]),int(bbox[0]):int(bbox[2]),::-1]
else:
crop = images[:,:,::-1]
# print("[yolo img shape] ",crop.shape)
results = self.model(crop) #RGB images[:,:,::-1]
# breakpoint()
arrays = np.array(results.pandas().xyxy[0])
bboxes = {
'bbox':[],
'bbox_handl':[],
'bbox_handr':[],
'pelvis':[],
'pelvis_l':[],
'pelvis_r':[]
}
for i, res in enumerate(arrays):
classid = res[5]
box = res[:5]
if self.mode == 'bboxcrop':
box[0]+=bbox[0]
box[2]+=bbox[0]
box[1]+=bbox[1]
box[3]+=bbox[1]
if box[4] < self.min_detect_thres:
continue
if classid==0:
bboxes['bbox'].append(box)
elif classid==1:
bboxes['bbox_handl'].append(box)
bboxes['pelvis_l'].append([(box[0]+box[2])/2,(box[1]+box[3])/2,box[-1]])
elif classid==2:
bboxes['bbox_handr'].append(box)
bboxes['pelvis_r'].append([(box[0]+box[2])/2,(box[1]+box[3])/2,box[-1]])
# pad empty detections with zeros so downstream code can always index them
if len(bboxes['bbox_handl']) == 0:
bboxes['bbox_handl'].append(np.zeros(5))
bboxes['pelvis_l'].append(np.zeros(3))
if len(bboxes['bbox_handr']) == 0:
bboxes['bbox_handr'].append(np.zeros(5))
bboxes['pelvis_r'].append(np.zeros(3))
if len(bboxes['bbox']) == 0:
bboxes['bbox'].append(np.zeros(5))
bboxes['bbox'] = np.array(bboxes['bbox'])
if isinstance(imgname,str):
with open(cachename, 'wb') as f:
pickle.dump(bboxes, f)
return bboxes
def __call__(self, images, imgname, bbox=[]):
return self.det_step(images, imgname, bbox)
class Yolo_model_hand_mvmp(Yolo_model):
@torch.no_grad()
def __call__(self, bbox, images, imgnames):
ret = {
'pelvis_l':[],
'pelvis_r':[],
# 'pelvis':[],
'bbox_handl':[],
'bbox_handr':[],
}
for nv in range(len(images)):
img = images[nv]
imgname = imgnames[nv]
if self.mode == 'bboxcrop':
bboxes = {
'bbox':[],
'bbox_handl':[],
'bbox_handr':[],
'pelvis_l':[],
'pelvis_r':[]
}
for pid in range(len(bbox[nv])):
bboxes_ = self.det_step(img, imgname, bbox[nv][pid])
for key in bboxes.keys():
bboxes[key].append(bboxes_[key])
else:
bboxes = self.det_step(img, imgname)
for k in ret.keys():
ret[k].append(np.array(bboxes[k]))
return ret


@ -0,0 +1,106 @@
import os
from os.path import join
import numpy as np
import cv2
from easymocap.mytools.debug_utils import log, myerror, mywarn
class ImageDataBase:
def __init__(self, root, subs, ranges, read_image) -> None:
assert root != 'TO_BE_FILLED', 'You must set the root of dataset'
assert os.path.exists(root), f'root {root} not exists'
self.root = root
self.subs = subs
self.ranges = ranges
self.flag_read_image = read_image
self.infos = {}
self.meta = {}
self.distortMap = {} # per-camera undistortion maps, filled lazily in read_image
def check_frames_length(self):
if len(self.ranges) == 0:
self.ranges = [0, self.length, 1]
if self.ranges[1] > self.length:
self.ranges[1] = self.length
self.frames = list(range(*self.ranges))
self.length = len(self.frames)
def try_to_extract_images(self, root, value):
if not os.path.exists(os.path.join(root, value['root'])) and os.path.exists(os.path.join(root, 'videos')):
print('[{}] Cannot find the images but find the videos, try to extract it'.format(self.__class__.__name__))
for videoname in os.listdir(os.path.join(root, 'videos')):
videoext = '.' + videoname.split('.')[-1]
outdir = join(root, value['root'], videoname.replace(videoext, ''))
os.makedirs(outdir, exist_ok=True)
cmd = 'ffmpeg -i {videoname} -q:v 1 -start_number 0 {outdir}/%06d.jpg'.format(
videoname=join(root, 'videos', videoname),
outdir=outdir
)
os.system(cmd)
def __str__(self) -> str:
return f''' [Dataset] {self.__class__.__name__}
root : {self.root}
subs : {self.subs}
ranges: {self.ranges}
'''
def __getitem__(self, index):
raise NotImplementedError
def __len__(self):
return self.length
def read_image(self, imgname, cameras=None):
assert os.path.exists(imgname), "image {} not exists".format(imgname)
sub = os.path.basename(os.path.dirname(imgname))
img = cv2.imread(imgname)
if cameras is None:
return img
K, D = self.cameras[sub]['K'], self.cameras[sub]['dist']
if np.linalg.norm(D) < 1e-3:
return img
if sub not in self.distortMap.keys():
h, w = img.shape[:2]
mapx, mapy = cv2.initUndistortRectifyMap(K, D, None, K, (w,h), 5)
self.distortMap[sub] = (mapx, mapy)
mapx, mapy = self.distortMap[sub]
img = cv2.remap(img, mapx, mapy, cv2.INTER_NEAREST)
return img
def read_mv_images(root, root_images, ext, subs):
assert os.path.exists(os.path.join(root, root_images)), f'root {root}/{root_images} not exists'
if len(subs) == 0:
subs = sorted(os.listdir(os.path.join(root, root_images)))
if subs[0].isdigit():
subs = sorted(subs, key=lambda x: int(x))
imagelists = []
log(f'Found {len(subs)} subjects in {root}/{root_images}')
for sub in subs:
images = sorted(os.listdir(os.path.join(root, root_images, sub)))
images = [os.path.join(root, root_images, sub, image) for image in images if image.endswith(ext)]
log(f' -> Found {len(images)} {root_images} in {sub}.')
imagelists.append(images)
min_length = min([len(image) for image in imagelists])
log(f' -> Min length: {min_length}')
imagenames = [[image[i] for image in imagelists] for i in range(min_length)]
return imagenames, {'subs': subs}
def FloatArray(x):
return np.array(x, dtype=np.float32)
def find_best_people(annots):
if len(annots) == 0:
return {}
# TODO: find the best
annot = annots[0]
bbox = FloatArray(annot['bbox'])
if 'keypoints' not in annot.keys():
return {}
keypoints = FloatArray(annot['keypoints'])
return {'bbox': bbox, 'keypoints': keypoints}
def find_all_people(annots):
if len(annots) == 0:
return {}
bbox = FloatArray([annot['bbox'] for annot in annots])
keypoints = FloatArray([annot['keypoints'] for annot in annots])
return {'bbox': bbox, 'keypoints': keypoints}
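# For reference (reconstructed from read_mv_images above; subject names are
# hypothetical): the expected layout is <root>/<root_images>/<sub>/<frame><ext>,
# e.g. data/images/01/000000.jpg. The function returns imagenames[i][nv], the
# path of frame i in view nv, plus {'subs': [...]}, truncating every view to
# the shortest sequence.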


@ -0,0 +1,332 @@
from easymocap.mytools.camera_utils import read_cameras
from easymocap.mytools.debug_utils import log, myerror, mywarn
from easymocap.mytools.file_utils import read_json
from .basedata import ImageDataBase, read_mv_images, find_best_people, find_all_people
import os
from os.path import join
import numpy as np
import cv2
from collections import defaultdict
panoptic15_in_body15 = [1,0,8,5,6,7,12,13,14,2,3,4,9,10,11]
def convert_body15_panoptic15(keypoints):
k3d_panoptic15 = keypoints[..., panoptic15_in_body15, :]
return k3d_panoptic15
def convert_panoptic15_body15(keypoints):
keypoints_b15 = np.zeros_like(keypoints)
keypoints_b15[..., panoptic15_in_body15, :] = keypoints
return keypoints_b15
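# Note: panoptic15_in_body15 is a permutation of 0..14, so the two conversions
# are exact inverses: convert_panoptic15_body15(convert_body15_panoptic15(x)) == x.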
def padding_and_stack(datas):
shapes = {}
for data in datas:
if len(data) == 0:
continue
for key, value in data.items():
if key not in shapes.keys():
shapes[key] = value.shape
collect = {key: np.zeros((len(datas), *shapes[key])) for key in shapes.keys()}
for i, data in enumerate(datas):
for key, value in data.items():
collect[key][i] = value
return collect
def padding_empty(datas):
shapes = {}
for data in datas:
if len(data) == 0:
continue
for key, value in data.items():
if key not in shapes.keys():
shapes[key] = value.shape[1:]
collect = {key: [None for data in datas] for key in shapes.keys()}
for i, data in enumerate(datas):
for key, shape in shapes.items():
if key not in data.keys():
print('[Dataset] padding empty view {} of {}'.format(i, key))
collect[key][i] = np.zeros((0, *shape), dtype=np.float32)
else:
collect[key][i] = data[key]
return collect
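# Illustrative sketch of the two padding helpers (hypothetical shapes):
# padding_and_stack zero-fills views without detections into one dense array,
# while padding_empty keeps a per-view list with (0, ...) placeholders.
def _demo_padding():
    datas = [{'bbox': np.zeros((2, 5), dtype=np.float32)}, {}]
    dense = padding_and_stack(datas)  # dense['bbox'].shape == (2, 2, 5)
    ragged = padding_empty(datas)     # shapes: (2, 5) and (0, 5)
    return dense, ragged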
def parse_frames(pafs_frame, H, W):
# parse a single frame
res = {
'joints': [],
'pafs': {}
}
joints = pafs_frame[1:1+3*25]
for i in range(25):
value = np.fromstring(joints[3*i+2], sep=' ').reshape(3, -1).T
value[:, 0] = value[:, 0] * W
value[:, 1] = value[:, 1] * H
res['joints'].append(value.astype(np.float32))
# parse pafs
pafs = pafs_frame[1+3*25+1:]
for npart in range(26):
label = pafs[3*npart+0].split(' ')[2:]
label = (int(label[0]), int(label[1]))
shape = pafs[3*npart+1].split(' ')[2:]
w, h = int(shape[0]), int(shape[1])
value = np.fromstring(pafs[3*npart+2], sep=' ').reshape(w, h).astype(np.float32)
res['pafs'][label] = value
return res
def read_4dassociation(pafs, H, W):
outputs = []
# parse the PAF file
with open(pafs, 'r') as f:
pafs = f.readlines()
indices = []
for i, line in enumerate(pafs):
if line.startswith('# newframes:'):
indices.append([i])
elif line.startswith('# end frames:'):
indices[-1].append(i)
print('[Read OpenPose] Totally {} frames'.format(len(indices)))
for (start, end) in indices:
pafs_frame = pafs[start+1:end]
pafs_frame = list(map(lambda x:x.strip(), pafs_frame))
frames = parse_frames(pafs_frame, H, W)
outputs.append(frames)
return outputs
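# Layout of one frame block, as implied by the parser above (reconstructed
# from the code, not from a format spec):
#   line 0: per-frame header
#   lines 1..75: 25 joint blocks of 3 lines; the 3rd line of each block holds
#     flattened (x, y, conf) rows with normalized x, y, rescaled by W, H
#   one separator line, then 26 PAF blocks of 3 lines each:
#     '<..> <..> src dst', '<..> <..> M N', and M*N scores -> an (M, N) matrix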
class MVDataset(ImageDataBase):
def __init__(self, root, subs, subs_vis, ranges, read_image=False, reader={}, filter={}) -> None:
super().__init__(root, subs, ranges, read_image)
self.subs_vis = subs_vis
self.length = 0
for key, value in reader.items():
if key == 'images':
self.try_to_extract_images(root, value)
data, meta = read_mv_images(root, value['root'], value['ext'], subs)
self.length = len(data)
elif key == 'image_shape':
imgnames = self.infos['images'][0]
shapes = []
for imgname in imgnames:
img = cv2.imread(imgname)
height, width, _ = img.shape
log('[{}] sub {} shape {}'.format(self.__class__.__name__, imgname, img.shape))
shapes.append([height, width])
data = [shapes]
meta = {}
elif key == 'annots':
data, meta = read_mv_images(root, value['root'], value['ext'], subs)
if self.length > 0:
if self.length != len(data):
myerror('annots length {} not equal to images length {}.'.format(len(data), self.length))
data = data[:self.length]
else:
self.length = len(data)
elif key == 'openpose':
# read the OpenPose detections
if len(subs) == 0:
pafs = sorted(os.listdir(join(root, value['root'])))
else:
pafs = [f'{sub}.txt' for sub in subs]
results = []
for nv, paf in enumerate(pafs):
pafname = join(root, value['root'], paf)
infos = read_4dassociation(pafname, H=self.infos['image_shape'][0][nv][0], W=self.infos['image_shape'][0][nv][1])
results.append(infos)
data = [[d[i] for d in results] for i in range(self.length)]
meta = {}
elif key == 'cameras':
if 'with_sub' in value.keys():
raise NotImplementedError
else:
cameras = read_cameras(os.path.join(root, value['root']))
if 'remove_k3' in value.keys():
for cam, camera in cameras.items():
camera['dist'][:, 4] = 0.
data = [cameras]
meta = {}
elif key in ['pelvis']:
continue
elif key == 'keypoints3d':
k3droot = value['root']
filenames = sorted(os.listdir(k3droot))[:self.length]
res_key = value.get('key', 'pred')
data = []
for filename in filenames:
results = read_json(join(k3droot, filename))
if 'pids' not in results.keys():
# fill in pids when the file does not provide them
results['pids'] = list(range(len(results[res_key])))
data.append({
'pids': results['pids'],
'keypoints3d': np.array(results[res_key], dtype=np.float32)
})
if data[-1]['keypoints3d'].shape[-1] == 3:
mywarn("The input keypoints don't have confidence; padding with ones")
data[-1]['keypoints3d'] = np.concatenate([data[-1]['keypoints3d'], np.ones_like(data[-1]['keypoints3d'][..., :1])], axis=-1)
if 'conversion' in value.keys():
if value['conversion'] == 'panoptic15_to_body15':
data[-1]['keypoints3d'] = convert_panoptic15_body15(data[-1]['keypoints3d'])
else:
raise ValueError(f'Unknown reader: {key}')
self.infos[key] = data
self.meta.update(meta)
self.reader = reader
self.filter = filter
if len(self.subs) == 0:
self.subs = self.meta['subs']
self.check_frames_length()
@staticmethod
def read_annots(annotnames):
val = []
for annname in annotnames:
annots = read_json(annname)['annots']
# select the best people
annots = find_best_people(annots)
val.append(annots)
val = padding_and_stack(val)
return val
def filter_openpose(self, candidates, pafs):
for nv, candview in enumerate(candidates):
H=self.infos['image_shape'][0][nv][0]
W=self.infos['image_shape'][0][nv][1]
for cand in candview:
if 'border' in self.filter.keys():
border = self.filter['border'] * max(H, W)
flag = (cand[:, 0] > border) & (cand[:, 0] < W - border) & (cand[:, 1] > border) & (cand[:, 1] < H - border)
cand[~flag] = 0
return candidates, pafs
def __getitem__(self, index):
frame = self.frames[index]
ret = {}
for key, value in self.infos.items():
if len(value) == 1:
ret[key] = value[0]
elif frame >= len(value):
myerror(f'[{self.__class__.__name__}] {key}: index {frame} out of range {len(value)}')
else:
ret[key] = value[frame]
ret_list = defaultdict(list)
for key, val in ret.items():
if key == 'annots':
ret_list[key] = self.read_annots(val)
elif key == 'cameras':
for sub in self.subs:
select = {k: val[sub][k] for k in ['K', 'R', 'T', 'dist', 'P']}
ret_list[key].append(select)
ret_list[key] = padding_and_stack(ret_list[key])
elif key == 'images':
if self.flag_read_image:
for i, sub in enumerate(self.subs):
imgname = val[i]
if sub in self.subs_vis or self.subs_vis == 'all':
img = self.read_image(imgname)
else:
img = imgname
ret_list[key].append(img)
ret_list['imgnames'].append(imgname)
else:
ret_list[key] = val
ret_list['imgnames'] = val
elif key == 'openpose':
ret_list[key] = [v['joints'] for v in val]
# also return the PAFs
ret_list[key+'_paf'] = [v['pafs'] for v in val]
# fix the PAF key orientation: the stored (1, 8) table becomes (8, 1) transposed
for nv in range(len(ret_list[key])):
ret_list[key+'_paf'][nv][(8, 1)] = ret_list[key+'_paf'][nv].pop((1, 8)).T
ret_list[key], ret_list[key+'_paf'] = self.filter_openpose(ret_list[key], ret_list[key+'_paf'])
elif key == 'keypoints3d':
ret_list['keypoints3d'] = val['keypoints3d']
if 'pids' in val.keys():
ret_list['pids'] = val['pids']
else:
ret_list['pids'] = list(range(len(val['keypoints3d'])))
elif key in ['image_shape']:
pass
else:
print('[Dataset] Unknown key: {}'.format(key))
ret_list.update(ret_list.pop('annots', {}))
for key, val in self.reader.items():
if key == 'pelvis' and 'annots' in self.reader.keys(): # load pelvis from annots.keypoints
ret_list[key] = [d[:, val.root_id] for d in ret_list['keypoints']]
elif key == 'pelvis' and 'openpose' in self.reader.keys():
ret_list[key] = [d[val.root_id] for d in ret_list['openpose']]
ret_list['meta'] = {
'subs': self.subs,
'index': index,
'frame': frame,
'image_shape': ret['image_shape'],
'imgnames': ret_list['imgnames'],
}
return ret_list
def check(self, index):
raise NotImplementedError
class MVMP(MVDataset):
def read_annots(self, annotnames):
val = []
for annname in annotnames:
annots = read_json(annname)['annots']
# filter out unwanted 2D detections here
annots_valid = []
for annot in annots:
flag = True
if 'bbox_size' in self.filter.keys():
bbox_size = self.filter['bbox_size']
bbox = annot['bbox']
area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
if area < bbox_size:
flag = False
if flag:
annots_valid.append(annot)
annots = annots_valid
# select the best people
annots = find_all_people(annots)
val.append(annots)
val = padding_empty(val)
return val
def check(self, index):
data = self.__getitem__(index)
from easymocap.mytools.vis_base import plot_bbox, merge, plot_keypoints_auto
# check the subs vis
vis = []
for nv, sub in enumerate(self.subs):
if sub not in self.subs_vis: continue
img = data['images'][nv].copy()
bbox = data['bbox'][nv]
kpts = data['keypoints'][nv]
for i in range(bbox.shape[0]):
plot_bbox(img, bbox[i], pid=i)
plot_keypoints_auto(img, kpts[i], pid=i, use_limb_color=False)
vis.append(img)
vis = merge(vis)
cv2.imwrite('debug/{}_{:06d}.jpg'.format(self.__class__.__name__, index), vis)
if __name__ == '__main__':
config = '''
args:
root: /nas/ZJUMoCap/Part0/313
subs: []
subs_vis: ['01', '07', '13', '19']
ranges: [0, 100, 1]
read_image: False
reader:
images:
root: images
ext: .jpg
annots:
root: annots
ext: .json
cameras: # also supports per-frame camera parameters
root: ''
'''
import yaml
config = yaml.load(config, Loader=yaml.FullLoader)
dataset = MVDataset(**config['args'])
for i in range(len(dataset)):
data = dataset[i]


@ -0,0 +1,136 @@
from .basedata import ImageDataBase, read_mv_images, find_best_people
from easymocap.mytools.debug_utils import log, myerror, mywarn
from easymocap.mytools.camera_utils import read_cameras
from easymocap.mytools.file_utils import read_json
import os
import numpy as np
import cv2
class SVDataset(ImageDataBase):
'''
This dataset returns a single video sequence only; it is not used for multi-sequence data.
'''
def __init__(self, root, subs, ranges, read_image=False, reader={}) -> None:
super().__init__(root, subs, ranges, read_image)
assert len(subs) == 1, 'SVDataset only support one subject'
for key, value in reader.items():
if key == 'images':
self.try_to_extract_images(root, value)
data, meta = read_mv_images(root, value['root'], value['ext'], subs)
data = [d[0] for d in data]
self.length = len(data)
elif key == 'image_shape':
imgname = self.infos['images'][0]
shapes = []
assert os.path.exists(imgname), "image {} not exists".format(imgname)
img = cv2.imread(imgname)
assert img is not None, "image {} read failed".format(imgname)
height, width, _ = img.shape
log('[{}] sub {} shape {}'.format(self.__class__.__name__, imgname, img.shape))
shapes.append([height, width])
data = shapes
elif key == 'annots':
data, meta = read_mv_images(root, value['root'], value['ext'], subs)
data = [d[0] for d in data]
if self.length > 0:
assert self.length == len(data), \
myerror('annots length {} not equal to images length {}.'.format(len(data), self.length))
else:
self.length = len(data)
elif key == 'cameras':
myerror('Camera parameters are not implemented yet')
raise NotImplementedError
else:
raise ValueError(f'Unknown reader: {key}')
self.infos[key] = data
self.meta.update(meta)
# check cameras:
if 'cameras' not in self.infos:
mywarn('[{}] No camera info, use default camera'.format(self.__class__.__name__))
imgname0 = self.infos['images'][0]
img = self.read_image(imgname0)
height, width = img.shape[:2]
log('[{}] Read shape {} from image {}'.format(self.__class__.__name__, img.shape, imgname0))
focal = 1.2*min(height, width) # the same default focal heuristic as COLMAP
log('[{}] Set a fix focal length {}'.format(self.__class__.__name__, focal))
K = np.array([focal, 0., width/2, 0., focal, height/2, 0. ,0., 1.]).reshape(3, 3)
camera = {'K':K ,'R': np.eye(3), 'T': np.zeros((3, 1)), 'dist': np.zeros((1, 5))}
for key, val in camera.items():
camera[key] = val.astype(np.float32)
self.infos['cameras'] = [camera]
self.check_frames_length()
self.find_best_people = find_best_people
def __getitem__(self, index):
frame = self.frames[index]
ret = {}
for key, value in self.infos.items():
if len(value) == 1:
ret[key] = value[0]
elif index >= len(value):
myerror(f'[{self.__class__.__name__}] {key}: index {frame} out of range {len(value)}')
else:
ret[key] = value[frame]
ret_new = {}
for key, val in ret.items():
if key == 'annots':
annots = read_json(val)['annots']
# select the best people
annots = self.find_best_people(annots)
ret_new.update(annots)
elif key == 'cameras':
ret_new[key] = val
elif key == 'images':
ret_new['imgnames'] = val
if self.flag_read_image:
img = self.read_image(val)
ret_new[key] = img
else:
ret_new[key] = val
elif key == 'image_shape':
ret_new['image_shape'] = val
ret_new['meta'] = {
'subs': self.subs,
'index': index,
'frame': self.frames[index],
'image_shape': ret_new['image_shape'],
'imgnames': ret_new['imgnames'],
}
return ret_new
class SVHandL(SVDataset):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.find_best_people = self._find_best_hand
def _find_best_hand(self, annots):
assert len(annots) == 1, 'SVHandL only support one person'
annot = annots[0]
ret = {
'bbox': np.array(annot['bbox_handl2d'], dtype=np.float32),
'keypoints': np.array(annot['handl2d'], dtype=np.float32),
}
return ret
if __name__ == '__main__':
cfg = '''
module: myeasymocap.datasets.1v1p.MonoDataset
args:
root: /nas/home/shuaiqing/EasyMocapDoc/demo/1v1p
subs: ['0+000553+000965']
ranges: [0, 99999, 1]
read_image: True
reader:
images:
root: images
ext: .jpg
annots:
root: annots
ext: .json
'''
import yaml
cfg = yaml.load(cfg, Loader=yaml.FullLoader)
dataset = SVDataset(**cfg['args'])
print(dataset)
for i in range(len(dataset)):
data = dataset[i]

123
myeasymocap/io/model.py Normal file

@ -0,0 +1,123 @@
import os
import torch
import numpy as np
from easymocap.bodymodel.smpl import SMPLModel
from easymocap.mytools.debug_utils import log
def try_to_download_SMPL(model_dir):
cmd = 'wget https://www.dropbox.com/s/aeulffqzb3zmh8x/pare-github-data.zip'
os.system(cmd)
os.makedirs(model_dir, exist_ok=True)
cmd = 'unzip pare-github-data.zip -d {}'.format(model_dir)
print('[RUN] {}'.format(cmd))
os.system(cmd)
class SMPLLoader:
def __init__(self, model_path, regressor_path, return_keypoints=True):
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if not os.path.exists(model_path):
log('[SMPL] Model not found in `{}`'.format(model_path))
log('[SMPL] Downloading model to `{}`'.format(model_path))
try_to_download_SMPL('models/pare')
assert os.path.exists(model_path), f'{model_path} not exists'
if not os.path.exists(regressor_path):
if regressor_path.endswith('J_regressor_body25.npy'):
url = 'https://github.com/zju3dv/EasyMocap/raw/master/data/smplx/J_regressor_body25.npy'
os.makedirs(os.path.dirname(regressor_path), exist_ok=True)
cmd = 'wget {} -O {}'.format(url, regressor_path)
os.system(cmd)
assert os.path.exists(regressor_path), f'{regressor_path} not exists'
log('[SMPL] Loading model in `{}`'.format(model_path))
log('[SMPL] Using keypoints regressor `{}`'.format(regressor_path))
smplmodel = SMPLModel(model_path=model_path,
model_type='smpl', device=device,
regressor_path=regressor_path,
NUM_SHAPES=10,
)
self.smplmodel = smplmodel
self.return_keypoints = return_keypoints
def __call__(self,):
return {
'body_model': self.smplmodel,
'model': self.forward}
def forward(self, params):
keypoints = self.smplmodel.keypoints(params, return_tensor=True)
ret = {
'keypoints': keypoints
}
ret.update(params)
return ret
class MANOLoader:
def __init__(self, cfg_path, model_path, regressor_path, num_pca_comps=45, use_pca=False, use_flat_mean=False):
log('[MANO] Loading model in `{}`'.format(model_path))
log('[MANO] Using keypoints regressor `{}`'.format(regressor_path))
assert os.path.exists(model_path), f'{model_path} not exists, Please download it from `mano.is.tue.mpg.de`'
if not os.path.exists(regressor_path) and regressor_path.endswith('J_regressor_mano_LEFT.txt'):
url = 'https://raw.githubusercontent.com/zju3dv/EasyMocap/master/data/smplx/J_regressor_mano_LEFT.txt'
os.makedirs(os.path.dirname(regressor_path), exist_ok=True)
cmd = 'wget {} -O {}'.format(url, regressor_path)
os.system(cmd)
assert os.path.exists(regressor_path), f'{regressor_path} not exists'
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
from easymocap.config import Config, load_object
cfg_data = Config.load(cfg_path)
cfg_data['args']['model_path'] = model_path
cfg_data['args']['regressor_path'] = regressor_path
cfg_data['args']['cfg_hand']['num_pca_comps'] = num_pca_comps
cfg_data['args']['cfg_hand']['use_pca'] = use_pca
cfg_data['args']['cfg_hand']['use_flat_mean'] = use_flat_mean
model = load_object(cfg_data.module, cfg_data.args)
self.manomodel = model
def __call__(self,):
return {
'hand_model': self.manomodel,
'model': self.forward}
def forward(self, params):
keypoints = self.manomodel.keypoints(params, return_tensor=True)
ret = {
'keypoints': keypoints
}
ret.update(params)
return ret
class MANOLoader_lr:
def __init__(self, cfg_path, model_path, regressor_path, num_pca_comps=45, use_pca=False):
self.Model_l = MANOLoader(cfg_path, model_path, regressor_path, num_pca_comps, use_pca)
self.Model_r = MANOLoader(cfg_path, model_path.replace('LEFT','RIGHT'), regressor_path.replace('LEFT','RIGHT'), num_pca_comps, use_pca)
def __call__(self,):
ret={}
out1 = self.Model_l()
for key in out1.keys():
ret[key+'_l'] = out1[key]
out2 = self.Model_r()
for key in out2.keys():
ret[key+'_r'] = out2[key]
return ret
class SMPLHLoader:
def __init__(self, path):
from easymocap.config import Config, load_object
cfg_data = Config.load(path)
self.model = load_object(cfg_data.module, cfg_data.args)
def __call__(self,):
return {
'smplh_model': self.model,
'model': self.forward}
def forward(self, params):
keypoints = self.model(**params, return_verts=False, return_tensor=True)
ret = {
'keypoints': keypoints.clone(),
'keypoints_body': keypoints[...,:25,:].clone(),
'keypoints_handlr': keypoints[...,25:,:].clone()
}
ret.update(params)
return ret
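For orientation, the loaders in this file act as factories: calling a loader returns the wrapped model together with a forward function. A minimal usage sketch (paths and the params dict are placeholders, not verified defaults of this repository):

loader = SMPLLoader(model_path='models/pare/smpl/SMPL_NEUTRAL.pkl',
                    regressor_path='data/smplx/J_regressor_body25.npy')
out = loader()              # {'body_model': SMPLModel, 'model': loader.forward}
# params: dict with 'Rh', 'Th', 'poses', 'shapes' arrays
ret = out['model'](params)  # -> {'keypoints': regressed joints, **params}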

42
myeasymocap/io/video.py Normal file

@ -0,0 +1,42 @@
import os
import shutil
from os.path import join
from glob import glob
from easymocap.mytools.debug_utils import log, mywarn, myerror, run_cmd
class MakeVideo:
def __init__(self, fps, keep_image, output='tmp') -> None:
self.output = output
self.fps = fps
self.debug = False
self.keep_image = keep_image
def __call__(self):
restart = ' -y '
fps_in = fps_out = self.fps
fps_in = ' -r {}'.format(fps_in)
path = self.output
ext = '.jpg'
cmd = ' -pix_fmt yuv420p -vcodec libx264'
cmd += ' -r {}'.format(fps_out)
if ext == '.png':
cmd += ' -profile:v main'
pathlist = sorted(os.listdir(path))
pathlist = [join(path, p) for p in pathlist if os.path.isdir(join(path, p))]
for path in pathlist:
imgnames = glob(join(path, '*{}'.format(ext)))
if len(imgnames) == 0:
continue
shell = f'ffmpeg{restart}{fps_in} -i "{path}/%06d{ext}" -vf scale="2*ceil(iw/2):2*ceil(ih/2)"{cmd} "{path}.mp4"'
if not self.debug:
shell += ' -loglevel quiet'
print(shell)
os.system(shell)
# make sure the video file was actually generated
if not os.path.exists(path+'.mp4'):
mywarn('Video {} is not generated'.format(path+'.mp4'))
shell = shell.replace(' -loglevel quiet', '')
run_cmd(shell)
else:
if not self.keep_image:
shutil.rmtree(path)

260
myeasymocap/io/vis.py Normal file

@ -0,0 +1,260 @@
import os
from typing import Any
import numpy as np
import cv2
from os.path import join
from easymocap.mytools.vis_base import plot_keypoints_auto, merge, plot_bbox, get_rgb, plot_cross
from easymocap.datasets.base import add_logo
from easymocap.mytools.camera_utils import Undistort
def projectPoints(k3d, camera):
k3d0 = np.ascontiguousarray(k3d[:, :3])
k3d_rt = np.dot(k3d0, camera['R'].T) + camera['T'].T
depth = k3d_rt[:, -1:]
k2d, _ = cv2.projectPoints(k3d0, camera['R'], camera['T'], camera['K'], camera['dist'])
k2d = np.hstack([k2d[:, 0], k3d[:, -1:]])
return k2d, depth
class VisBase:
def __init__(self, scale=1, lw_factor=1, name='vis', mode='none', mode_args={}):
self.scale = scale
self.output = '/tmp'
self.name = name
self.lw = lw_factor
self.count = 0
self.mode = mode
self.mode_args = mode_args
def merge_and_write(self, vis):
vis = [v for v in vis if not isinstance(v, str)]
if self.mode == 'center':
for i, v in enumerate(vis):
# crop the center region
left = int(v.shape[1] - v.shape[0]) // 2
v = v[:, left:left+v.shape[0], :]
vis[i] = v
elif self.mode == 'crop':
for i, v in enumerate(vis):
t, b, l, r = self.mode_args[i]
v = v[t:b, l:r]
vis[i] = v
if len(vis) == 0:
return 0
if len(vis) == 3: # with exactly 3 views, keep the first full size and stack the other two at half size on the right
vis_0 = vis[0]
vis_1 = cv2.resize(vis[1], None, fx=0.5, fy=0.5)
vis_2 = cv2.resize(vis[2], None, fx=0.5, fy=0.5)
vis_12 = np.vstack([vis_1, vis_2])
vis = np.hstack([vis_0, vis_12])
else:
vis = merge(vis)
vis = cv2.resize(vis, None, fx=self.scale, fy=self.scale)
vis = add_logo(vis)
# TODO: read the image name from the input meta
outname = join(self.output, self.name, '{:06d}.jpg'.format(self.count))
os.makedirs(os.path.dirname(outname), exist_ok=True)
cv2.imwrite(outname, vis)
self.count += 1
class Vis3D(VisBase):
def __init__(self, scale, lw_factor=1, name='repro', **kwargs) -> None:
super().__init__(scale, lw_factor, name, **kwargs)
def __call__(self, images, cameras, keypoints3d=None, results=None):
# keypoints3d: (nJoints, 4)
undist = False
cameras['dist'] = np.zeros_like(cameras['dist'])
vis_all = []
for nv in range(len(images)):
if isinstance(images[nv], str): continue
camera = {key:cameras[key][nv] for key in ['R', 'T', 'K', 'dist']}
if undist:
vis = Undistort.image(images[nv], cameras['K'][nv], cameras['dist'][nv])
camera['dist'] = np.zeros_like(camera['dist'])
else:
vis = images[nv].copy()
if results is None:
if len(keypoints3d.shape) == 2:
keypoints_repro, depth = projectPoints(keypoints3d, {key:cameras[key][nv] for key in ['R', 'T', 'K', 'dist']})
plot_keypoints_auto(vis, keypoints_repro, pid=0, use_limb_color=False)
else:
for pid in range(keypoints3d.shape[0]):
keypoints_repro, depth = projectPoints(keypoints3d[pid], {key:cameras[key][nv] for key in ['R', 'T', 'K', 'dist']})
plot_keypoints_auto(vis, keypoints_repro, pid=pid, use_limb_color=False)
else:
for res in results:
k3d = res['keypoints3d']
keypoints_repro, depth = projectPoints(k3d, camera)
if k3d.shape[0] == 1:
x, y = keypoints_repro[0,0], keypoints_repro[0,1]
plot_cross(vis, x, y, col=get_rgb(res['id']), lw=self.lw, width=self.lw * 5)
elif k3d.shape[0] == 2: # limb
x1, y1 = keypoints_repro[0,0], keypoints_repro[0,1]
x2, y2 = keypoints_repro[1,0], keypoints_repro[1,1]
cv2.line(vis, (int(x1), int(y1)), (int(x2), int(y2)), get_rgb(res['id']), self.lw)
else:
plot_keypoints_auto(vis, keypoints_repro, pid=res['id'], use_limb_color=False, lw_factor=self.lw)
cv2.putText(vis, '{}'.format(res['id']), (int(keypoints_repro[0,0]), int(keypoints_repro[0,1])),
cv2.FONT_HERSHEY_SIMPLEX, 2, get_rgb(res['id']), self.lw)
vis_all.append(vis)
self.merge_and_write(vis_all)
class VisRoot(VisBase):
def __call__(self, images, pelvis):
vis = []
for nv in range(len(images)):
if isinstance(images[nv], str): continue
v = images[nv].copy()
for i in range(pelvis[nv].shape[0]):
color = get_rgb(i)
x, y = pelvis[nv][i][0], pelvis[nv][i][1]
x, y = int(x), int(y)
plot_cross(v, x, y , col=color, lw=self.lw, width=self.lw * 10)
cv2.putText(v, '{}'.format(i), (int(x), int(y)),
cv2.FONT_HERSHEY_SIMPLEX, 2, color, self.lw)
vis.append(v)
self.merge_and_write(vis)
class VisPAF(VisBase):
def __call__(self, images, openpose, openpose_paf):
# openpose [nViews, nJoints, 3]
# openpose_paf [nViews, dict, MxN]
vis_limb = [(8, 1)]
vis = []
nViews = len(images)
for nv in range(nViews):
if isinstance(images[nv], str): continue
v = images[nv].copy()
k2d = openpose[nv]
paf = openpose_paf[nv]
for (src, dst) in vis_limb:
# (M, N)
paf_ = paf[(src, dst)]
for i in range(paf_.shape[0]):
for j in range(paf_.shape[1]):
if paf_[i, j] < 0.1:
continue
x1, y1 = k2d[src][i, :2]
x2, y2 = k2d[dst][j, :2]
lw = int(paf_[i, j] * 10)
cv2.line(v, (int(x1), int(y1)), (int(x2), int(y2)), get_rgb(src), lw)
vis.append(v)
self.merge_and_write(vis)
class VisBirdEye(VisBase):
def __init__(self, xranges, yranges, resolution=1024, name='bird', **kwargs):
super().__init__(name=name, **kwargs)
self.xranges = xranges
self.yranges = yranges
self.resolution = resolution
self.blank = np.zeros((resolution, resolution, 3), dtype=np.uint8) + 255
x0, y0 = self.map_x_y(0, 0)
cv2.line(self.blank, (x0, 0), (x0, resolution), (0, 0, 0), 1)
cv2.line(self.blank, (0, y0), (resolution, y0), (0, 0, 0), 1)
def map_x_y(self, x, y):
x = (x - self.xranges[0]) / (self.xranges[1] - self.xranges[0]) * self.resolution
y = (y - self.yranges[0]) / (self.yranges[1] - self.yranges[0]) * self.resolution
y = self.resolution - y
x, y = int(x), int(y)
return x, y
def __call__(self, results, cameras):
vis = self.blank.copy()
R = cameras['R']
T = cameras['T']
# the cameras may move in future captures, so the camera markers cannot be drawn in advance
center = - np.einsum('bmn,bnj->bmj', R.swapaxes(1, 2), T)
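# (from x_cam = R @ x_world + T, the camera center solves x_cam = 0,
# i.e. center = -R^T @ T, computed batched with the einsum above)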
for nv in range(center.shape[0]):
x, y = center[nv, 0], center[nv, 1]
x, y = self.map_x_y(x, y)
plot_cross(vis, x, y, col=(0,0,255), lw=self.lw, width=20)
cv2.putText(vis, 'cam{}'.format(nv), (int(x), int(y)),
cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), self.lw//4)
for res in results:
pid = res['id']
color = get_rgb(pid)
x, y, z = res['pelvis'][0, 0], res['pelvis'][0, 1], res['pelvis'][0, 2]
length = 0.5 * (np.clip(z - 1., 0, 1) + 1)
length = int(length/(self.xranges[1] - self.xranges[0]) * self.resolution)
x, y = self.map_x_y(x, y)
plot_cross(vis, x, y, col=color, lw=self.lw, width=self.lw * 5)
cv2.rectangle(vis, (x - length, y - length), (x + length, y + length), color, self.lw)
cv2.putText(vis, '{}'.format(pid), (int(x), int(y)),
cv2.FONT_HERSHEY_SIMPLEX, 2, color, self.lw)
self.merge_and_write([vis])
class VisMatch(VisBase):
def __call__(self, images, pelvis, results):
vis = []
for nv in range(len(images)):
if isinstance(images[nv], str):
vis.append(images[nv])
continue
else:
vis.append(images[nv].copy())
for res in results:
pid = res['id']
for nv, ind in zip(res['views'], res['indices']):
v = vis[nv]
if isinstance(v, str): continue
x, y = pelvis[nv][ind][0], pelvis[nv][ind][1]
plot_cross(v, pelvis[nv][ind][0], pelvis[nv][ind][1], col=get_rgb(pid), lw=self.lw, width=self.lw * 5)
cv2.putText(v, '{}'.format(pid), (int(x), int(y)),
cv2.FONT_HERSHEY_SIMPLEX, 2, get_rgb(pid), self.lw)
self.merge_and_write(vis)
class Vis_det(VisBase):
def __call__(self, images, **kwargs):
vis = []
for nv in range(len(images)):
if isinstance(images[nv], str):
vis.append(images[nv])
continue
else:
v = images[nv].copy()
for key, bbox in kwargs.items():
_bbox = bbox[nv]
for idet in range(_bbox.shape[0]):
plot_bbox(v, _bbox[idet], idet)
vis.append(v)
self.merge_and_write(vis)
class Vis2D(VisBase):
def __call__(self, images, **kwargs):
if 'keypoints' in kwargs:
keypoints = kwargs['keypoints']
else:
if len(kwargs.keys()) == 1:
keypoints = list(kwargs.values())[0]
else:
raise NotImplementedError
if 'bbox' in kwargs:
bbox = kwargs['bbox']
else:
bbox = None
if not isinstance(images, list):
images = [images]
keypoints = [keypoints]
bbox = [bbox]
vis = []
for nv in range(len(images)):
if isinstance(images[nv], str): continue
k2d = keypoints[nv]
vis_ = images[nv].copy()
if len(k2d.shape) == 2:
plot_keypoints_auto(vis_, k2d, pid=0, use_limb_color=False)
if bbox is not None:
if len(bbox[nv].shape) == 2:
plot_bbox(vis_, bbox[nv][0], 0)
else:
plot_bbox(vis_, bbox[nv], 0)
else:
for pid in range(k2d.shape[0]):
plot_keypoints_auto(vis_, k2d[pid], pid=pid, use_limb_color=False)
vis.append(vis_)
self.merge_and_write(vis)

389
myeasymocap/io/vis3d.py Normal file

@ -0,0 +1,389 @@
from tqdm import tqdm
import cv2
import os
from easymocap.visualize.pyrender_wrapper import plot_meshes
from os.path import join
import numpy as np
from easymocap.datasets.base import add_logo
from easymocap.mytools.vis_base import merge, plot_bbox
from .vis import VisBase
class Render(VisBase):
def __init__(self, name='render', scale=0.5, backend='pyrender', **kwargs) -> None:
super().__init__(name=name, scale=1., **kwargs)
self.scale3d = scale
def __call__(self, body_model, params, cameras, imgnames):
vertices = body_model.vertices(params, return_tensor=False)
faces = body_model.faces
for nf, img in enumerate(tqdm(imgnames, desc=self.name)):
basename = os.path.basename(img)
# re-read the image from disk
assert os.path.exists(img), img
vis = cv2.imread(img)
vis = cv2.resize(vis, None, fx=self.scale3d, fy=self.scale3d)
vert = vertices[nf]
meshes = {}
meshes[0] = {
'vertices': vert,
'faces': faces,
'id': 0,
'name': 'human_{}'.format(0)
}
K = cameras['K'][nf].copy()
K[:2, :] *= self.scale3d
R = cameras['R'][nf]
T = cameras['T'][nf]
ret = plot_meshes(vis, meshes, K, R, T, mode='image')
self.merge_and_write([ret])
class Render_multiview(VisBase):
def __init__(self, view_list=[], name='render', model_name='body_model', render_mode='image', backend='pyrender', shape=[-1,-1], scale=1., **kwargs):
self.scale3d = scale
super().__init__(name=name, scale=1., **kwargs)
self.view_list = view_list
self.render_mode = render_mode
self.model_name = model_name
self.shape = shape
def render_(self, vertices, faces, cameras, imgnames):
for nf, img in enumerate(tqdm(imgnames, desc=self.name)):
mv_ret = []
if not isinstance(img, list):
img = [img]
for nv in self.view_list:
basename = os.path.basename(img[nv])
assert os.path.exists(img[nv]), img[nv]
vis = cv2.imread(img[nv])
vis = cv2.resize(vis, None, fx=self.scale3d, fy=self.scale3d)
vert = vertices[nf]
meshes = {}
if vert.ndim == 2:
meshes[0] = {
'vertices': vert,
'faces': faces,
'id': 0,
'name': 'human_{}'.format(0)
}
elif vert.ndim == 3:
for pid in range(vert.shape[0]):
meshes[pid] = {
'vertices': vert[pid],
'faces': faces,
'id': pid,
'name': 'human_{}'.format(pid)
}
if cameras['K'].ndim == 4:
K = cameras['K'][nf][nv].copy()
K[:2, :] *= self.scale3d # the image above was resized by scale3d
R = cameras['R'][nf][nv]
T = cameras['T'][nf][nv]
else:
K = cameras['K'][nv].copy()
K[:2, :] *= self.scale3d
R = cameras['R'][nv]
T = cameras['T'][nv]
# add ground
if self.render_mode == 'ground':
from easymocap.visualize.geometry import create_ground
ground = create_ground(
center=[0, 0, -0.05], xdir=[1, 0, 0], ydir=[0, 1, 0], # placement
step=1, xrange=10, yrange=10, # extent
white=[1., 1., 1.], black=[0.5,0.5,0.5], # checker colors
two_sides=True
)
meshes[1001] = ground
vis = np.zeros((self.shape[0], self.shape[1], 3), dtype=np.uint8) + 255
focal = min(self.shape) * 1.2
K = np.array([
[focal,0,vis.shape[1]/2],
[0,focal,vis.shape[0]/2],
[0,0,1]]) # principal point at (width/2, height/2)
ret = plot_meshes(vis, meshes, K, R, T, mode='rgb')
else:
ret = plot_meshes(vis, meshes, K, R, T, mode=self.render_mode)
ret = add_logo(ret)
mv_ret.append(ret)
self.merge_and_write(mv_ret)
def __call__(self, params, cameras, imgnames, **kwargs):
body_model = kwargs[self.model_name]
vertices = body_model.vertices(params, return_tensor=False)
faces = body_model.faces
self.render_(vertices, faces, cameras, imgnames)
class Render_nocam:
def __init__(self, scale=0.5, backend='pyrender',view_list=[0]) -> None:
self.name = 'render'
self.scale = scale
self.view_list = view_list
def __call__(self, hand_model, params, images):
vertices = hand_model(**params, return_verts=True, return_tensor=False)
faces = hand_model.faces
for nf, img in enumerate(tqdm(images, desc=self.name)):
for nv in self.view_list:
if isinstance(img, np.ndarray):
vis = img.copy()
basename = '{:06}.jpg'.format(nf)
else:
basename = os.path.basename(img[nv])
# re-read the image from disk
assert os.path.exists(img[nv]), img[nv]
vis = cv2.imread(img[nv])
vis = cv2.resize(vis, None, fx=self.scale, fy=self.scale)
vert = vertices[nf]
meshes = {}
meshes[0] = {
'vertices': vert,
'faces': faces,
'id': 0,
'name': 'human_{}'.format(0)
}
K = np.array([[vis.shape[0],0,vis.shape[1]/2],[0,vis.shape[1],vis.shape[0]/2],[0,0,1]]) # principal point at (width/2, height/2)
K[:2, :] *= self.scale
R = np.eye(3)
T = np.array([0,0,0.3])
ret = plot_meshes(vis, meshes, K, R, T, mode='image')
outname = join(self.output, self.name, basename)
os.makedirs(os.path.dirname(outname), exist_ok=True)
cv2.imwrite(outname, ret)
class Render_multiview_hand(Render_multiview):
def __call__(self, hand_model_l, params_l, cameras, imgnames):
vertices = hand_model_l(**params_l, return_verts=True, return_tensor=False)
faces = hand_model_l.faces
self.render_(vertices, faces, cameras, imgnames)
class Render_smplh(Render_multiview):
def __init__(self, path, at_step, scale=0.5, mode='image', backend='pyrender', view_list=[0]) -> None:
super().__init__(view_list=view_list, render_mode=mode, backend=backend, scale=scale) # map onto Render_multiview's keyword signature
from easymocap.config import Config, load_object
cfg_data = Config.load(path)
self.model = load_object(cfg_data.module, cfg_data.args)
self.at_step = at_step
def __call__(self, params_smplh, cameras, imgnames):
vertices = self.model(return_verts=True, return_tensor=False, **params_smplh)
faces = self.model.faces
if self.at_step:
self.render_([vertices], faces, cameras, [imgnames])
else:
self.render_(vertices, faces, cameras, imgnames)
class Render_smplh2(Render_smplh):
def __call__(self, params, cameras, imgnames):
super().__call__(params, cameras, imgnames)
def projectPoints(X, K, R, t, Kd):
x = R @ X + t
x[0:2,:] = x[0:2,:]/x[2,:] # normalize onto the image plane
r = x[0,:]*x[0,:] + x[1,:]*x[1,:]
radial = 1 + Kd[0]*r + Kd[1]*r*r + Kd[4]*r*r*r
# use temporaries so that the distorted x does not leak into the y update
x0 = x[0,:]*radial + 2*Kd[2]*x[0,:]*x[1,:] + Kd[3]*(r + 2*x[0,:]*x[0,:])
x1 = x[1,:]*radial + 2*Kd[3]*x[0,:]*x[1,:] + Kd[2]*(r + 2*x[1,:]*x[1,:])
x[0,:] = K[0,0]*x0 + K[0,1]*x1 + K[0,2]
x[1,:] = K[1,0]*x0 + K[1,1]*x1 + K[1,2]
return x
class Render_multiview_handbyk3d(Render_multiview):
def __call__(self, hand_model_l, params_l, hand_model_r, params_r, cameras, imgnames, keypoints3d):
joint_regressor_r = np.load('models/handmesh/data/joint_regressor_r.npy') # right hand
joint_regressor_l = np.load('models/handmesh/data/joint_regressor_l.npy') # left hand
facesl = hand_model_l.faces
facesr = hand_model_r.faces
# do not show the results for person 0
keypoints3d[0] = 0
img = imgnames
k3d = keypoints3d
vertices_l = hand_model_l(**params_l, return_verts=True, return_tensor=False) #[nf]
vertices_r = hand_model_r(**params_r, return_verts=True, return_tensor=False) #[nf]
joint_l = np.repeat(joint_regressor_l[None, :, :],vertices_l.shape[0],0) @ vertices_l
joint_r = np.repeat(joint_regressor_r[None, :, :],vertices_r.shape[0],0) @ vertices_r
params_l['Th'] += k3d[:,7,:3] - joint_l[:,0,:] # wrist joint 7 = left hand #[nf]
params_r['Th'] += k3d[:,4,:3] - joint_r[:,0,:] # wrist joint 4 = right hand #[nf]
vertices_l = hand_model_l(**params_l, return_verts=True, return_tensor=False) #[nf]
vertices_r = hand_model_r(**params_r, return_verts=True, return_tensor=False) #[nf]
faces = []
vert = []
pids = []
for i in range(k3d.shape[0]):
if k3d[i,7,-1]==0:
continue
vv = vertices_l[i].copy()
vert.append(vv)
faces.append(facesl)
pids.append(i)
for i in range(k3d.shape[0]):
if k3d[i,4,-1]==0:
continue
vv = vertices_r[i].copy()
vert.append(vv)
faces.append(facesr)
pids.append(i)
faces = np.stack(faces)
vert = np.stack(vert)
for nv in self.view_list:
basename = os.path.basename(img[nv])
# re-read the image from disk
assert os.path.exists(img[nv]), img[nv]
vis = cv2.imread(img[nv])
vis = cv2.resize(vis, None, fx=self.scale, fy=self.scale)
# vert = vertices
meshes = {}
if vert.ndim == 2:
meshes[0] = {
'vertices': vert,
'faces': faces,
'id': 0,
'name': 'human_{}'.format(0)
}
elif vert.ndim == 3:
for pid in range(vert.shape[0]):
meshes[pid] = {
'vertices': vert[pid],
'faces': faces[pid],
'vid': pids[pid],
'name': 'human_{}'.format(pid)
}
K = cameras['K'][nv].copy()
K[:2, :] *= self.scale
R = cameras['R'][nv]
T = cameras['T'][nv]
from easymocap.mytools.vis_base import plot_keypoints_auto
for pid in range(keypoints3d.shape[0]):
keypoints_repro = projectPoints(keypoints3d[pid].T[:3,:], K, R, T, cameras['dist'][nv].reshape(5)).T
keypoints_repro[:,-1] = keypoints3d[pid,:,-1]
plot_keypoints_auto(vis, keypoints_repro, pid=pid, use_limb_color=False)
ret = plot_meshes(vis, meshes, K, R, T, mode=self.render_mode)
outname = join(self.output, self.name, basename)
os.makedirs(os.path.dirname(outname), exist_ok=True)
cv2.imwrite(outname, ret)
class Render_selectview:
def __init__(self, scale=0.5, backend='pyrender', output='output',mode = 'image') -> None:
self.name = 'render_debug'
self.scale = scale
self.view_list = [5]
self.output = output
self.mode = mode
def __call__(self, hand_model_l, posel, match3d_l, cameras, imgnames, keypoints3d,bbox_handl, joint_regressor, wristid):
img = imgnames
k3d = keypoints3d
joint_regressor_l = joint_regressor
facesl = hand_model_l.faces
hand_list=[]
for pid in range(len(match3d_l)):
dt = match3d_l[pid]
if isinstance(dt, int):
# TODO: handle -1, i.e. no suitable matched hand was found
hand_list.append(np.zeros((1, 48)))
continue
out_img = []
for cid in range(len(dt['views'])):
nv = dt['views'][cid]
poseid = dt['indices'][cid]
pose = posel[nv][poseid].copy()
Rh = pose[:,:3].copy()
invR = np.linalg.inv(cameras['R'][nv])
Rh_m_old = np.matrix(cv2.Rodrigues(Rh)[0])
Rh_m_new = invR @ Rh_m_old
Rh = cv2.Rodrigues(Rh_m_new)[0]
pose_ = np.hstack((Rh.reshape(3),pose[:,3:].reshape(-1))).reshape(1,-1)
Rh = pose_[:,:3].copy()
pose_[:,:3] = 0
params_l={
'Rh':Rh,
'Th':np.zeros_like(Rh),
'poses':pose_,
'shapes':np.zeros((Rh.shape[0],10)),
}
vertices_l = hand_model_l(**params_l, return_verts=True, return_tensor=False)
joint_l = np.repeat(joint_regressor_l[None, :, :],vertices_l.shape[0],0) @ vertices_l
params_l['Th']+=k3d[pid,wristid,:3] - joint_l[0,0,:]
vertices_l = hand_model_l(**params_l, return_verts=True, return_tensor=False)
vert = vertices_l[0]
faces = facesl
basename = os.path.basename(img[nv])
# re-read the image from disk
assert os.path.exists(img[nv]), img[nv]
vis = cv2.imread(img[nv])
plot_bbox(vis,bbox_handl[nv][poseid],0)
vis = cv2.resize(vis, None, fx=self.scale, fy=self.scale)
meshes = {}
if vert.ndim == 2:
meshes[0] = {
'vertices': vert,
'faces': faces,
'id': 0,
'name': 'human_{}'.format(0)
}
elif vert.ndim == 3:
for pid in range(vert.shape[0]):
meshes[pid] = {
'vertices': vert[pid],
'faces': faces[pid],
'id': pid,
'name': 'human_{}'.format(pid)
}
K = cameras['K'][nv].copy()
K[:2, :] *= self.scale
R = cameras['R'][nv]
T = cameras['T'][nv]
ret = plot_meshes(vis, meshes, K, R, T, mode=self.mode)
out_img.append(ret)
out_img = merge(out_img)
outname = join(self.output, self.name, '{}-{:02d}.jpg'.format(basename.split('.jpg')[0],pid))
os.makedirs(os.path.dirname(outname), exist_ok=True)
cv2.imwrite(outname, out_img)
class Render_selectview_lr:
def __init__(self, scale=0.5, backend='pyrender', output='output',mode = 'image') -> None:
self.output = output
self.model_l = Render_selectview(scale=scale, backend=backend, output=self.output, mode=mode)
self.model_r = Render_selectview(scale=scale, backend=backend, output=self.output, mode=mode)
self.model_l.name+='_l'
self.model_r.name+='_r'
def __call__(self, hand_model_l, posel, poser, match3d_l, match3d_r, hand_model_r, cameras, imgnames, keypoints3d,bbox_handl,bbox_handr):
joint_regressor_r = np.load('models/handmesh/data/joint_regressor_r.npy') # right hand
joint_regressor_l = np.load('models/handmesh/data/joint_regressor_l.npy') # left hand
self.model_l(hand_model_l, posel, match3d_l, cameras, imgnames, keypoints3d,bbox_handl, joint_regressor_l, 7)
self.model_r(hand_model_r, poser, match3d_r, cameras, imgnames, keypoints3d,bbox_handr, joint_regressor_r, 4)
class Render_mv(Render):
def __call__(self, body_model, params, cameras, imgnames):
super().__call__(body_model, params, cameras, [imgnames[0],imgnames[1]])

97
myeasymocap/io/write.py Normal file

@ -0,0 +1,97 @@
import os
from easymocap.mytools.file_utils import write_keypoints3d, write_smpl
from easymocap.annotator.file_utils import save_annot
from os.path import join
from tqdm import tqdm
class Write:
def __init__(self, output='/tmp', name='keypoints3d') -> None:
self.output = output
self.name = name
def __call__(self, keypoints3d):
for nf in tqdm(range(keypoints3d.shape[0]), desc='writing to {}/{}'.format(self.output, self.name)):
res = [{
'id': 0,
'keypoints3d': keypoints3d[nf]
}]
dumpname = join(self.output, self.name, '{:06d}.json'.format(nf))
write_keypoints3d(dumpname, res)
return {}
class WriteAll:
def __init__(self, name, output='/tmp') -> None:
self.output = output
self.name = name
def __call__(self, results, meta):
for nf in tqdm(range(len(results)), desc='writing to {}/{}'.format(self.output, self.name)):
res = [{'id': r['id'], 'keypoints3d': r['keypoints3d']} for r in results[nf]]
res.sort(key=lambda x: x['id'])
imgnames = meta['imgnames'][nf]
if len(imgnames) > 0:
name = os.path.basename(imgnames[0])
name = name.replace('.jpg', '')
else:
name = '{:06d}'.format(nf)
dumpname = join(self.output, self.name, '{}.json'.format(name))
write_keypoints3d(dumpname, res)
class Write2D:
def __init__(self, name, output='/tmp') -> None:
self.output = output
self.name = name
def __call__(self, results, meta):
for nf in tqdm(range(len(results)), desc='writing to {}/{}'.format(self.output, self.name)):
subs = meta['subs'][nf]
result = results[nf]
annots_all = {sub: [] for sub in subs}
for res in result:
for nv, v in enumerate(res['views']):
annots_all[subs[v]].append({
'personID': res['id'],
'bbox': res['bbox'][nv],
'keypoints': res['keypoints2d'][nv],
})
for nv, sub in enumerate(subs):
annots = {
'filename': f'{sub}/{nf:06d}.jpg',
'height': meta['image_shape'][nf][nv][0],
'width': meta['image_shape'][nf][nv][1],
'annots': annots_all[sub],
'isKeyframe': False
}
dumpname = join(self.output, self.name, sub, '{:06d}.json'.format(nf))
save_annot(dumpname, annots)
class WriteSMPL:
def __init__(self, name='smpl') -> None:
self.name = name
def __call__(self, params=None, results=None, meta=None, model=None):
results_all = []
if results is None and params is not None:
# copy params to results
results = {0: {'params': params, 'keypoints3d': None, 'frames': list(range(len(params['Rh'])))}}
for index in tqdm(meta['index'], desc=self.name):
results_frame = []
for pid, result in results.items():
if index >= result['frames'][0] and index <= result['frames'][-1]:
frame_rel = result['frames'].index(index)
results_frame.append({
'id': pid,
# 'keypoints3d': result['keypoints3d'][frame_rel]
})
for key in ['Rh', 'Th', 'poses', 'shapes']:
if result['params'][key].shape[0] == 1:
results_frame[-1][key] = result['params'][key]
else:
results_frame[-1][key] = result['params'][key][frame_rel:frame_rel+1]
param = results_frame[-1]
pred = model(param)['keypoints'][0]
results_frame[-1]['keypoints3d'] = pred
write_smpl(join(self.output, self.name, '{:06d}.json'.format(meta['frame'][index])), results_frame)
write_keypoints3d(join(self.output, 'keypoints3d', '{:06d}.json'.format(meta['frame'][index])), results_frame)
results_all.append(results_frame)
return {'results_perframe': results_all}


@ -0,0 +1,101 @@
from typing import Any
import numpy as np
from easymocap.mytools.debug_utils import mywarn, log
def solve_translation(X, x, K):
A = np.zeros((2*X.shape[0], 3))
b = np.zeros((2*X.shape[0], 1))
fx, fy = K[0, 0], K[1, 1]
cx, cy = K[0, 2], K[1, 2]
for nj in range(X.shape[0]):
A[2*nj, 0] = 1
A[2*nj + 1, 1] = 1
A[2*nj, 2] = -(x[nj, 0] - cx)/fx
A[2*nj+1, 2] = -(x[nj, 1] - cy)/fy
b[2*nj, 0] = X[nj, 2]*(x[nj, 0] - cx)/fx - X[nj, 0]
b[2*nj+1, 0] = X[nj, 2]*(x[nj, 1] - cy)/fy - X[nj, 1]
A[2*nj:2*nj+2, :] *= x[nj, 2]
b[2*nj:2*nj+2, :] *= x[nj, 2]
trans = np.linalg.inv(A.T @ A) @ A.T @ b
return trans.T[0]
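# Derivation note: each observed joint contributes two linear constraints on
# the translation t = (tx, ty, tz). With u = (x - cx)/fx and v = (y - cy)/fy,
# requiring that X + t projects to (u, v) gives
#   tx - u*tz = u*Xz - Xx
#   ty - v*tz = v*Xz - Xy
# The rows are weighted by the keypoint confidence x[:, 2] and solved in
# closed form via the normal equations t = (A^T A)^{-1} A^T b.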
class MeanShapes:
def __init__(self, keys, dim=0) -> None:
self.keys = keys
self.dim = dim
def __call__(self, params):
for key in self.keys:
log('[{}] Mean {}: {}'.format(self.__class__.__name__, key, params[key].shape))
params[key] = params[key].mean(axis=self.dim, keepdims=True)
log('[{}] Mean {}: {}'.format(self.__class__.__name__, key, params[key].shape))
class InitTranslation:
def __init__(self, solve_T=True, solve_R=False) -> None:
self.solve_T = solve_T
self.solve_R = solve_R
def __call__(self, body_model, params, cameras, keypoints):
nJoints = 15 # only use the 15 main body joints
params['Th'] = np.zeros_like(params['Th'])
kpts1 = body_model.keypoints(params, return_tensor=False)
for i in range(kpts1.shape[0]):
k2d = keypoints[i, :nJoints]
if k2d[:, -1].sum() < nJoints / 2:
mywarn('[{}] No valid keypoints in frame {}'.format(self.__class__.__name__, i))
params['Th'][i] = params['Th'][i-1]
continue
trans = solve_translation(kpts1[i, :nJoints], k2d, cameras['K'][i])
params['Th'][i] += trans
# params['shapes'] = params['shapes'].mean(0, keepdims=True)
return {'params': params}
class InitParams:
def __init__(self, num_poses=69, num_shapes=10, rootid=8, share_shape=True, init_trans=0.) -> None:
self.num_poses = num_poses
self.num_shapes = num_shapes
self.rootid = rootid
self.share_shape = share_shape
self.init_trans = init_trans
def __call__(self, **kwargs):
"""
keypoints3d: (nFrames, nJoints, 4) or (nFrames, nPerson, nJoints, 4)
"""
key = list(kwargs.keys())[0]
keypoints3d = kwargs[key]
if keypoints3d.ndim == 4:
shape = (keypoints3d.shape[:2])
elif keypoints3d.ndim == 3:
shape = (keypoints3d.shape[0],)
else:
raise ValueError('keypoints3d must be 3 or 4 dim')
params={
'Rh': np.zeros((*shape, 3),dtype=np.float32),
'Th': np.zeros((*shape, 3),dtype=np.float32),
'poses': np.zeros((*shape, self.num_poses),dtype=np.float32),
'shapes': np.zeros((*shape, self.num_shapes),dtype=np.float32)
}
# TODO: check the root confidence and interpolate
# initialization
if key == 'keypoints3d':
params['Th'] = keypoints3d[..., self.rootid, :3]
else:
mywarn('[{}] keypoints3d not used; initial depth set to {}'.format(self.__class__.__name__, self.init_trans))
params['Th'][:, 2] = self.init_trans
if self.share_shape:
params['shapes'] = params['shapes'].mean(0, keepdims=True)
return {'params': params}
class Init_params_and_target_poses(InitParams):
def __call__(self, params_smplh, model):
"""
keypoints3d: (nFrames, nJoints, 4) or (nFrames, nPerson, nJoints, 4)
"""
out = model(params_smplh)
keypoints3d = out['keypoints'].cpu().detach().numpy()
ret = super().__call__(keypoints3d=keypoints3d) # InitParams.__call__ accepts keyword arguments only
for key in params_smplh.keys():
ret['params'][key] = params_smplh[key]
ret['target_'+key] = params_smplh[key]
return ret


@ -0,0 +1,246 @@
import torch
import torch.nn as nn
import numpy as np
class GMoF(nn.Module):
def __init__(self, rho=1):
super(GMoF, self).__init__()
self.rho = rho # kept so that extra_repr below works
self.rho2 = rho * rho
def extra_repr(self):
return 'rho = {}'.format(self.rho)
def forward(self, est, gt=None, conf=None):
if gt is not None:
square_diff = torch.sum((est - gt)**2, dim=-1)
else:
square_diff = torch.sum(est**2, dim=-1)
diff = torch.div(square_diff, square_diff + self.rho2)
if conf is not None:
res = torch.sum(diff * conf)/(1e-5 + conf.sum())
else:
res = diff.sum()/diff.numel()
return res
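# GMoF is the Geman-McClure robust penalty rho(e) = e^2 / (e^2 + rho^2): close
# to a scaled squared error for small residuals, saturating at 1 for outliers,
# so bad detections cannot dominate the objective.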
class BaseLoss(nn.Module):
def __init__(self, norm='l2', norm_info={}, reduce='sum') -> None:
super().__init__()
self.loss = self.make_loss(norm, norm_info, reduce)
def make_loss(self, norm='l2', norm_info={}, reduce='sum'):
reduce = torch.sum if reduce=='sum' else torch.mean
if norm == 'l2':
def loss(est, gt=None, conf=None):
if gt is not None:
square_diff = reduce((est - gt)**2, dim=-1)
else:
square_diff = reduce(est**2, dim=-1)
if conf is not None:
res = torch.sum(square_diff * conf)/(1e-5 + conf.sum())
else:
res = square_diff.sum()/square_diff.numel()
return res
elif norm == 'l1':
def loss(est, gt=None, conf=None):
if gt is not None:
square_diff = reduce(torch.abs(est - gt), dim=-1)
else:
square_diff = reduce(torch.abs(est), dim=-1)
if conf is not None:
res = torch.sum(square_diff * conf)/(1e-5 + conf.sum())
else:
res = square_diff.sum()/square_diff.numel()
return res
elif norm == 'gm':
loss = GMoF(**norm_info) if isinstance(norm_info, dict) else GMoF(norm_info) # accept either kwargs or a bare rho
else:
loss = None
return loss
def forward(self, pred, target):
pass
class BaseKeypoints(BaseLoss):
@staticmethod
def select(keypoints, index, ranges):
if len(index) > 0:
keypoints = keypoints[..., index, :]
elif len(ranges) > 0:
if ranges[1] == -1:
keypoints = keypoints[..., ranges[0]:, :]
else:
keypoints = keypoints[..., ranges[0]:ranges[1], :]
return keypoints
def __init__(self, index_est=[], index_gt=[],
ranges_est=[], ranges_gt=[], **kwargs):
super().__init__(**kwargs)
self.index_est = index_est
self.index_gt = index_gt
self.ranges_est = ranges_est
self.ranges_gt = ranges_gt
def forward(self, pred, target):
return super().forward(pred, target)
def loss_keypoints(self, pred, target, conf):
# pred: (..., dim)
# target: (..., dim)
# conf: (..., 1)
dist = torch.sum((pred - target)**2, dim=-1, keepdim=True)
loss = torch.sum(dist * conf) / torch.sum(conf)
return loss
class Keypoints2D(BaseKeypoints):
def forward(self, pred, target):
# (nFrames, nJoints, 3)
pred_kpts3d = self.select(pred['keypoints'] , self.index_est, self.ranges_est)
target_kpts2d = self.select(target['keypoints'], self.index_gt, self.ranges_gt)
cameras = target['cameras']
P = torch.cat([cameras['R'], cameras['T']], dim=-1)
invKtrans = torch.inverse(cameras['K']).transpose(-1, -2)
homo = torch.cat([target_kpts2d[..., :2], torch.ones_like(target_kpts2d[..., 2:])], dim=-1)
target_points = torch.matmul(homo, invKtrans)[..., :2]
pred_homo = torch.cat([pred_kpts3d, torch.ones_like(pred_kpts3d[..., :1])], dim=-1)
self.einsum = 'fab,fjb->fja'
point_cam = torch.einsum(self.einsum, P, pred_homo)
img_points = point_cam[..., :2]/point_cam[..., 2:]
loss = self.loss(est=img_points, gt=target_points, conf=target_kpts2d[..., -1])
return loss
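# Note: instead of projecting predictions with K, the target 2D points are
# mapped to normalized camera coordinates with K^{-1}; the residual is then
# measured on the normalized image plane, so its scale does not depend on the
# focal length.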
class Keypoints3D(BaseKeypoints):
def forward(self, pred, target):
# (nFrames, nJoints, 3)
# breakpoint()
pred_kpts3d = self.select(pred['keypoints'] , self.index_est, self.ranges_est)
target_kpts3d = self.select(target['keypoints3d'], self.index_gt, self.ranges_gt)
assert target_kpts3d.shape[-1] == 4, 'Target keypoints {} must have confidence '.format(target_kpts3d.shape)
loss = self.loss(est=pred_kpts3d, gt=target_kpts3d[...,:3], conf=target_kpts3d[..., -1])
return loss
class LimbLength(BaseKeypoints):
def __init__(self, kintree, key='keypoints3d', **kwargs):
self.kintree = np.array(kintree)
super().__init__(**kwargs)
def __str__(self):
return "Limb of: {}".format(','.join(['[{},{}]'.format(i,j) for (i,j) in self.kintree]))
def forward(self, pred, target):
pred_kpts3d = pred['keypoints']
target_kpts3d = target['keypoints3d']
# select limb endpoints via the kinematic tree
pred = torch.norm(pred_kpts3d[..., self.kintree[:, 1], :] - pred_kpts3d[..., self.kintree[:, 0], :], dim=-1, keepdim=True)
target = torch.norm(target_kpts3d[..., self.kintree[:, 1], :] - target_kpts3d[..., self.kintree[:, 0], :], dim=-1, keepdim=True)
target_conf = torch.minimum(target_kpts3d[..., self.kintree[:, 1], -1], target_kpts3d[..., self.kintree[:, 0], -1])
loss = self.loss(est=pred, gt=target, conf=target_conf)
return loss
class Smooth(BaseLoss):
def __init__(self, keys, smooth_type, order, norm, weights, window_weight) -> None:
super().__init__(norm)
self.loss = {}
for i in range(len(keys)):
new_key = keys[i] + '_' + smooth_type[i]
self.loss[new_key] = {
'func': self.make_loss(norm='l2', norm_info={}, reduce='sum'),
'key': keys[i],
'weight': weights[i],
'norm': norm[i],
'order': order[i],
'type': smooth_type[i],
}
self.window_weight = window_weight
    def convert_Rh_to_R(self, Rh):
        from ..bodymodels.geometry import batch_rodrigues
        # Rh: (..., nRot x 3) axis-angle vectors
        nRot = Rh.shape[-1] // 3
        Rh_flat = Rh.reshape(-1, 3)
        Rot = batch_rodrigues(Rh_flat)
        # flatten each 3x3 rotation and keep all nRot rotations per sample
        Rot = Rot.reshape(*Rh.shape[:-1], nRot * 9)
        return Rot
def forward(self, pred, target):
ret = {}
for key, cfg in self.loss.items():
value = pred[cfg['key']]
loss = 0
for width, weight in enumerate(self.window_weight, start=1):
if cfg['type'] == 'Linear':
vel = value[width:] - value[:-width]
elif cfg['type'] == 'Rot':
_value = self.convert_Rh_to_R(value)
vel = _value[width:] - _value[:-width]
elif cfg['type'] == 'Depth':
                # TODO: take the camera R and T into account
if 'cameras' in target.keys():
R = target['cameras']['R']
_value = torch.bmm(value[..., None, :], R.transpose(-1, -2))
_value = _value[..., 0, :]
                    _value = _value[..., [2]] # keep only the depth component
vel = _value[width:] - _value[:-width]
if cfg['order'] == 2:
vel = vel[1:] - vel[:-1]
loss += weight * cfg['func'](est=vel)
ret[key] = loss * cfg['weight']
return ret
class AnySmooth(BaseLoss):
def __init__(self, key, weight, norm, norm_info={}, dim=-1, order=1):
super().__init__()
self.dim = dim
self.weight = weight
self.loss = self.make_loss(norm, norm_info)
self.norm_name = norm
self.key = key
self.order = order
def forward(self, pred, target):
loss = 0
value = pred[self.key]
if value.shape[0] <= len(self.weight):
return torch.FloatTensor([0.]).to(value.device)
for width, weight in enumerate(self.weight, start=1):
vel = value[width:] - value[:-width]
if self.order == 2:
vel = vel[1:] - vel[:-1]
loss += weight * self.loss(vel)
return loss
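
# --- Illustrative sketch (not part of the module): the windowed velocity
# penalty behind AnySmooth. Differences are taken at widths 1..len(weights)
# and accumulated with per-width weights; the trajectory is toy data.
import torch

example_traj = torch.cumsum(torch.rand(10, 3), dim=0)   # fake (nFrames, dim) trajectory
example_weights = [0.5, 0.3, 0.2]
example_loss = 0.
for width, weight in enumerate(example_weights, start=1):
    vel = example_traj[width:] - example_traj[:-width]   # finite difference at this width
    example_loss = example_loss + weight * (vel**2).sum()
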
class Init(BaseLoss):
def __init__(self, keys, weights, norm) -> None:
super().__init__(norm)
self.keys = keys
self.weights = weights
def forward(self, pred, target):
ret = {}
for key in self.keys:
ret[key] = torch.mean((pred[key] - target['init_'+key])**2)
return ret
from easymocap.multistage.lossbase import AnyReg
class RegLoss(AnyReg):
def __init__(self, key, norm) -> None:
super().__init__(key, norm)
def __call__(self, pred, target):
return self.forward(**{self.key: pred[self.key]})
class Init_pose(Init):
def __init__(self, keys, weights, norm) -> None:
super().__init__(keys, weights, norm)
self.norm = norm
def forward(self, pred, target):
ret = {}
for key in self.keys:
if self.norm == 'l2':
ret[key] = torch.sum((pred[key] - target['target_'+key])**2)
elif self.norm == 'l1':
ret[key] = torch.sum(torch.abs(pred[key] - target['target_'+key]))
return ret

File diff suppressed because it is too large

View File

@ -0,0 +1,193 @@
import numpy as np
import cv2
import scipy
import torch
class MultilView_Merge:
    def __init__(self) -> None:
        pass
    def forward(self, data, ax=0):
        '''
        data - dict
            data[key]: (nViews, ...); each entry is averaged over axis `ax`
        '''
        results = {}
        for key in data.keys():
            results[key] = data[key].mean(axis=ax)
        return results
class Merge_hand(MultilView_Merge):
    def __init__(self, camtoworld) -> None:
        self.camtoworld = camtoworld
    def __call__(self, posel, cameras, match3d_l):
        hand_list = []
        for pid in range(len(match3d_l)):
            dt = match3d_l[pid]
            if isinstance(dt, int):
                # TODO: handle -1, i.e. no suitable matched hand was found
                # NOTE: a (48,) zero pose keeps shapes consistent with the merged
                # poses below; the break leaves any later pids unprocessed
                hand_list.append(np.zeros(48))
                break
Merge_list=[]
for cid in range(len(dt['views'])):
nv = dt['views'][cid]
poseid = dt['indices'][cid]
pose = posel[nv][poseid].copy()
if self.camtoworld:
Rh = pose[:,:3].copy()
invR = np.linalg.inv(cameras['R'][nv])
Rh_m_old = np.matrix(cv2.Rodrigues(Rh)[0])
Rh_m_new = invR @ Rh_m_old
Rh = cv2.Rodrigues(Rh_m_new)[0]
Merge_list.append(np.hstack((Rh.reshape(3),pose[:,3:].reshape(-1))))
                else:
                    # flatten to (48,) so both branches append the same shape
                    Merge_list.append(pose.reshape(-1))
out = self.forward({'pose':np.stack(Merge_list)},0)
hand_list.append(out['pose'])
pose_ = np.stack(hand_list)
Rh = pose_[:,:3].copy()
pose_[:,:3] = 0
params={
'Rh':Rh,
'Th':np.zeros_like(Rh),
'poses':pose_,
'shapes':np.zeros((Rh.shape[0],10)),
}
return {'params': params}
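
# --- Illustrative sketch (not part of the module): the camera-to-world
# rotation applied per view above. The axis-angle Rh is converted to a matrix,
# left-multiplied by R^{-1}, and converted back; R_cam and Rh_cam are invented.
import cv2
import numpy as np

R_cam = cv2.Rodrigues(np.array([0., 0.3, 0.]))[0]   # assumed world-to-camera rotation
Rh_cam = np.array([0.1, 0.2, 0.3])                  # hand orientation in the camera frame
R_world = np.linalg.inv(R_cam) @ cv2.Rodrigues(Rh_cam)[0]
Rh_world = cv2.Rodrigues(R_world)[0].reshape(3)
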
class Merge_handlr(Merge_hand):
def __call__(self, posel, poser, cameras, match3d_l, match3d_r):
params_l = super().__call__(posel, cameras, match3d_l)
params_r = super().__call__(poser, cameras, match3d_r)
return {'params_l':params_l['params'], 'params_r':params_r['params']}
class Merge_bodyandhand:
def __init__(self, tmp) -> None:
pass
def get_R(self, poses, cfg, st):
res = st.copy()
for i in cfg:
res = res @ cv2.Rodrigues(poses[i,:])[0]
return res
def process_poses_mano(self, poses, hand_Rh, flag):
if sum(flag) == 0:
return poses
poses = poses.reshape((-1,3))
cfg={'rt': [0,3,6,9],
'r': [14,17,19],
'l': [13,16,18]
}
RA = self.get_R(poses, cfg['rt'],np.eye(3))
if flag[0] :
RL = self.get_R(poses, cfg['l'],RA)
tmppose = np.matrix(RL).I @ cv2.Rodrigues(np.array(hand_Rh[0]))[0]
tmppose = cv2.Rodrigues(tmppose)[0]
poses[20,:] = tmppose.reshape(3)
e20 = scipy.spatial.transform.Rotation.from_rotvec(torch.from_numpy(poses[20,:]).reshape(-1,3))
e20 = e20.as_euler('ZYX', degrees=True)
dt = scipy.spatial.transform.Rotation.from_euler('ZYX', np.array([0,0,e20[0,2]/2]), degrees=True)
rot_dt = dt.as_matrix()
rot18 = cv2.Rodrigues(poses[18,:])[0]
rot18 = rot18@rot_dt
vec18 = cv2.Rodrigues(rot18)[0].reshape((1,3))
rot20 = cv2.Rodrigues(poses[20,:])[0]
rot20 = np.linalg.inv(rot_dt) @ rot20
vec20 = cv2.Rodrigues(rot20)[0].reshape((1,3))
poses[20,:] = vec20
poses[18,:] = vec18
        if flag[1]:
RR = self.get_R(poses, cfg['r'],RA)
tmppose = np.matrix(RR).I @ cv2.Rodrigues(np.array(hand_Rh[1]))[0]
tmppose = cv2.Rodrigues(tmppose)[0]
poses[21,:] = tmppose.reshape(3)
e21 = scipy.spatial.transform.Rotation.from_rotvec(torch.from_numpy(poses[21,:]).reshape(-1,3))
e21 = e21.as_euler('ZYX', degrees=True)
dt = scipy.spatial.transform.Rotation.from_euler('ZYX', np.array([0,0,e21[0,2]/2]), degrees=True)
rot_dt = dt.as_matrix()
rot19 = cv2.Rodrigues(poses[19,:])[0]
rot19 = rot19@rot_dt
vec19 = cv2.Rodrigues(rot19)[0].reshape((1,3))
rot21 = cv2.Rodrigues(poses[21,:])[0]
rot21 = np.linalg.inv(rot_dt) @ rot21
vec21 = cv2.Rodrigues(rot21)[0].reshape((1,3))
poses[21,:] = vec21
poses[19,:] = vec19
return poses.reshape((1,-1))
    def merge_pose(self, bodypose, handlpose, handrpose):
        flag = [True, True]
        if abs(handlpose).sum() == 0:
            flag[0] = False
        if abs(handrpose).sum() == 0:
            flag[1] = False
        pose = np.hstack((bodypose, handlpose[:, 3:], handrpose[:, 3:])) # (1, 156)
        # if a hand was not found its pose is all zeros, so its flag is set to False
        out_pose = self.process_poses_mano(pose, [handlpose[0, :3], handrpose[0, :3]], flag)
        return out_pose
def __call__(self, params_l, params_r, params):
bz = params['Rh'].shape[0]
ret = {
'Rh': np.zeros((bz,3),dtype=np.float32),
'Th': params['Th'],
'poses': np.zeros((bz,156),dtype=np.float32),
'shapes':np.zeros((bz,16),dtype=np.float32)
}
ret['shapes'][:,:10] = params['shapes']
        # TODO: support multiple frames and persons
for i in range(bz):
inpose = np.zeros((1,66))
inpose[:,3:] = params['poses'][i][:63].copy()
            inpose[:,:3] = params['Rh'][i].copy() # poses[0:3] carries the global rotation; Rh may need to be merged in
handlpose = params_l['poses'][i].reshape((1,-1)).copy()
handrpose = params_r['poses'][i].reshape((1,-1)).copy()
handlpose[:,:3] = params_l['Rh'][i]
handrpose[:,:3] = params_r['Rh'][i]
out = self.merge_pose(inpose.reshape((1,-1)), handlpose, handrpose)
ret['Rh'][i] = out[:,:3]
ret['poses'][i,3:] = out[:,3:]
return {'params_smplh': ret}
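
# --- Illustrative sketch (not part of the module): the SMPL-H packing used
# above: 66 body values (3 global + 63 articulated) followed by 45 values per
# hand, with the wrist orientations merged in separately. Values are dummies.
import numpy as np

example_body = np.zeros((1, 66))             # body[:, :3] holds the global Rh
example_handl = np.random.rand(1, 48)        # 3 wrist Rh + 45 finger parameters
example_handr = np.random.rand(1, 48)
example_pose = np.hstack((example_body, example_handl[:, 3:], example_handr[:, 3:]))
assert example_pose.shape == (1, 156)
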

View File

@ -0,0 +1,167 @@
import torch
import torch.nn as nn
from easymocap.config import Config, load_object
from easymocap.mytools.debug_utils import log
def dict_of_numpy_to_tensor(body_params, device):
params_ = {}
for key, val in body_params.items():
if isinstance(val, dict):
params_[key] = dict_of_numpy_to_tensor(val, device)
else:
params_[key] = torch.Tensor(val).to(device)
return params_
def dict_of_tensor_to_numpy(body_params):
params_ = {}
for key, val in body_params.items():
if isinstance(val, dict):
params_[key] = dict_of_tensor_to_numpy(val)
else:
params_[key] = val.cpu().numpy()
return params_
def make_optimizer(opt_params, optim_type='lbfgs', max_iter=20,
lr=1e-3, betas=(0.9, 0.999), weight_decay=0.0, **kwargs):
if isinstance(opt_params, dict):
        # LBFGS does not accept a dict of parameters
opt_params = list(opt_params.values())
if optim_type == 'lbfgs':
from easymocap.pyfitting.lbfgs import LBFGS
        optimizer = LBFGS(opt_params, line_search_fn='strong_wolfe', max_iter=max_iter,
            tolerance_grad=1e-7, # float32 has about 7 significant decimal digits
            tolerance_change=1e-7,
            **kwargs)
elif optim_type == 'adam':
optimizer = torch.optim.Adam(opt_params, lr=lr, betas=betas, weight_decay=weight_decay)
else:
raise NotImplementedError
return optimizer
def grad_require(params, flag=False):
if isinstance(params, list):
for par in params:
par.requires_grad = flag
elif isinstance(params, dict):
for key, par in params.items():
par.requires_grad = flag
def make_closure(optimizer, model, params, infos, loss, device):
loss_func = {}
for key, val in loss.items():
loss_func[key] = load_object(val['module'], val['args'])
if isinstance(loss_func[key], nn.Module):
loss_func[key].to(device)
def closure(debug=False):
optimizer.zero_grad()
new_params = params.copy()
output = model(new_params)
loss_dict = {}
loss_weight = {key:loss[key].weight for key in loss_func.keys()}
for key, func in loss_func.items():
output_ = {k: output[k] for k in loss[key].key_from_output}
infos_ = {k: infos[k] for k in loss[key].key_from_infos}
loss_now = func(output_, infos_)
if isinstance(loss_now, dict):
for k, _loss in loss_now.items():
loss_dict[key+'_'+k] = _loss
loss_weight[key+'_'+k] = loss_weight[key]
loss_weight.pop(key)
else:
loss_dict[key] = loss_now
loss_sum = sum([loss_dict[key]*loss_weight[key]
for key in loss_dict.keys()])
if debug:
return loss_dict, loss_weight
loss_sum.backward()
return loss_sum
return closure
def rel_change(prev_val, curr_val):
return (prev_val - curr_val) / max([1e-5, abs(prev_val), abs(curr_val)])
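
# --- Illustrative example (not part of the module): rel_change as the
# stopping test used in Optimizer.optimizer_step; the loss values are invented.
assert rel_change(0.50000001, 0.5) <= 0.0000001   # converged: stop iterating
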
class Optimizer:
def __init__(self, optimize_keys, optimizer_args, loss) -> None:
self.optimize_keys = optimize_keys
self.optimizer_args = optimizer_args
self.loss = loss
self.used_infos = []
for key, val in loss.items():
self.used_infos.extend(val.key_from_infos)
self.used_infos = list(set(self.used_infos))
def log_loss(self, iter_, closure, print_loss=False):
if iter_ % 10 == 0 or print_loss:
with torch.no_grad():
loss_dict, loss_weight = closure(debug=True)
print('{:-6d}: '.format(iter_) + ' '.join([key + ' %7.4f'%(loss_dict[key].item()*loss_weight[key]) for key in loss_dict.keys()]))
def optimizer_step(self, optimizer, closure):
prev_loss = None
self.log_loss(0, closure, True)
for iter_ in range(1, 1000):
loss = optimizer.step(closure)
# check the loss
if torch.isnan(loss).sum() > 0:
print('[optimize] NaN loss value, stopping!')
break
if torch.isinf(loss).sum() > 0:
print('[optimize] Infinite loss value, stopping!')
break
# check the delta
if iter_ > 0 and prev_loss is not None:
loss_rel_change = rel_change(prev_loss, loss.item())
if loss_rel_change <= 0.0000001:
break
self.log_loss(iter_, closure)
prev_loss = loss.item()
self.log_loss(iter_, closure, True)
return True
def __call__(self, params, model, **infos):
"""
待优化变量一定要在params中但params中不一定会被优化
infos中的变量不一定会被优化
"""
        # TODO: use the model's device; since the model may be a plain function, determine the device here for now
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
params = dict_of_numpy_to_tensor(params, device=device)
infos_used = {key: infos[key] for key in self.used_infos if key in infos.keys()}
infos_used = dict_of_numpy_to_tensor(infos_used, device=device)
log('[{}] Optimize {}'.format(self.__class__.__name__, self.optimize_keys))
log('[{}] Loading {}'.format(self.__class__.__name__, self.used_infos))
opt_params = {}
for key in self.optimize_keys:
            if key in infos.keys(): # the parameters to optimize
opt_params[key] = infos_used[key]
elif key in params.keys():
opt_params[key] = params[key]
else:
raise ValueError('{} is not in infos or body_params'.format(key))
for key, val in opt_params.items():
infos_used['init_'+key] = val.clone()
optimizer = make_optimizer(opt_params, **self.optimizer_args)
closure = make_closure(optimizer, model, params, infos_used, self.loss, device)
        # ready to start the optimization
grad_require(opt_params, True)
self.optimizer_step(optimizer, closure)
grad_require(opt_params, False)
        # return the results directly
ret = {
'params': params
}
for key in self.optimize_keys:
if key in infos.keys():
ret[key] = opt_params[key]
ret = dict_of_tensor_to_numpy(ret)
return ret
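
# --- Illustrative sketch (not part of the module): the helpers above on a toy
# quadratic instead of a body model; the 'adam' path avoids the line search.
import torch

toy_params = {'Th': torch.zeros(1, 3)}
grad_require(toy_params, True)
toy_opt = make_optimizer(toy_params, optim_type='adam', lr=1e-1)
toy_target = torch.tensor([[1., 2., 3.]])
for _ in range(200):
    toy_opt.zero_grad()
    toy_loss = ((toy_params['Th'] - toy_target)**2).sum()
    toy_loss.backward()
    toy_opt.step()
grad_require(toy_params, False)   # toy_params['Th'] is now close to toy_target
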

View File

@ -0,0 +1,295 @@
from typing import Any
import numpy as np
import cv2
LOG_FILE = 'log_hand_select.txt'
LOG_LEVEL = 2 # one of 0, 1, 2
FULL_LOG = (lambda x: print(x, file=open(LOG_FILE, 'a'))) if LOG_LEVEL > 1 else (lambda x: None)
LOG = (lambda x: print(x, file=open(LOG_FILE, 'a'))) if LOG_LEVEL > 0 else (lambda x: None)
def views_from_dimGroups(dimGroups):
    views = np.zeros(dimGroups[-1], dtype=int)
for nv in range(len(dimGroups) - 1):
views[dimGroups[nv]:dimGroups[nv+1]] = nv
return views
class Select_Views:
def __init__(self, camtoworld, handtype) -> None:
self.camtoworld = camtoworld
self.results = []
self.DIST_MAX = 50
self.threshold = 2
self.handtype = handtype
self.threshold2 = 0.3
self.count = 0
        self.mode = 0 # 0: sum of squared distances; 1: max & sum
    def cvt_Rh_Rot(self, Rh):
        RotList = []
        for i in range(Rh.shape[0]):
            RotList.append(cv2.Rodrigues(Rh[i])[0])
        return np.stack(RotList)
def get_dis_Rh(self, Rh1, Rh2):
rh_dis = (self.cvt_Rh_Rot(Rh1) - self.cvt_Rh_Rot(Rh2))**2
return rh_dis.sum(axis=(1,2))
def match_with_lastframe(self, lastpose, new_poses):
if self.mode==0:
rh_dis = self.get_dis_Rh(np.array(new_poses)[:,:3], lastpose[None][:,:3])
dis = ((np.array(new_poses)[:,3:] - lastpose[None][:,3:])**2).sum(axis=1)
dis+=rh_dis
minid = np.argmin(dis)
return new_poses[minid], dis[minid], minid, dis
        else:
            dis1 = ((np.array(new_poses) - lastpose[None])**2).sum(axis=1)
            dis2 = ((np.array(new_poses) - lastpose[None])**2).max(axis=1)
            dis = np.stack([dis2, dis1]).T
            val_idx = dis[:, 0] < self.threshold2
            dis = dis[val_idx, :]
            if len(dis) == 0:
                # no candidate passes the max-distance gate: fall back to the sum distance
                dis = ((np.array(new_poses) - lastpose[None])**2).sum(axis=1)
                minid = np.argmin(dis1)
                mindis = dis[minid]
                return new_poses[minid], mindis, minid, dis
            else:
                minid = np.argmin(dis[:, 1])
                mindis = dis[minid, 1]
                return np.array(new_poses)[val_idx, :][minid], mindis, minid, dis
def calculate_aff(self, poseslist, DIST_MAX):
        # TODO: Rh distances should not be computed this way; better to convert to rotation matrices first and compare those
M = len(poseslist)
distance = np.zeros((M, M), dtype=np.float32)
for id0 in range(M):
for id1 in range(id0+1,M):
p0 = poseslist[id0]
p1 = poseslist[id1]
dis = ((p0-p1)**2).sum()
distance[id0,id1]=dis
distance[id1,id0]=dis
DIST_MAX = max(DIST_MAX, distance.max())
for nv in range(M):
distance[nv,nv]=DIST_MAX
distance -= np.eye(M) * DIST_MAX
aff = (DIST_MAX - distance)/DIST_MAX
aff = np.clip(aff, 0, 1)
return aff
    def Hierarchical_Cluster(self, data, threshold=2):
        if len(data) == 1:
            return [[0]]
        import scipy.cluster.hierarchy as sch
        out = sch.linkage(data, method='ward')
ret=[]
vis=[]
for i in range(len(data)):
ret.append([i])
vis.append(0)
for i in range(out.shape[0]):
            if out[i][2] > threshold:
break
id1 = int(out[i][0])
id2 = int(out[i][1])
vis[id1]=1
vis[id2]=1
vis.append(0)
ret.append(ret[id1]+ret[id2])
groups = []
for i in range(len(ret)):
if vis[i]==1:
continue
groups.append(ret[i])
return groups
    def aff_to_groups(self, data, affinity, dimGroups, prev_id):
        # NOTE: this greedy affinity-based assignment was left unfinished;
        # the caller uses Hierarchical_Cluster instead
        sum1 = np.zeros((affinity.shape[0]))
        for i in range(len(dimGroups)-1):
            start, end = dimGroups[i], dimGroups[i+1]
            if end == start:
                continue
            sum1 += affinity[:, start:end].max(axis=-1)
        n2d = affinity.shape[0]
        nViews = len(dimGroups) - 1
        idx_zero = np.zeros(nViews, dtype=int) - 1
        views = views_from_dimGroups(dimGroups)
        # the assigned results of each person
        p2dAssigned = np.zeros(n2d, dtype=int) - 1
        visited = np.zeros(n2d, dtype=int)
        sortidx = np.argsort(-sum1)
        pid = 0
        k3dresults = []
        return k3dresults
    def __call__(self, posel, cameras, match3d_l):
        hand_list = []
        for pid in range(len(match3d_l)):
            dt = match3d_l[pid]
            Merge_list = []
            Merge_list_rot = []
            if isinstance(dt, int):
                # TODO: handle -1, i.e. no suitable matched hand was found
                Merge_list_rot.append(np.zeros((54,)))
else:
for cid in range(len(dt['views'])):
nv = dt['views'][cid]
poseid = dt['indices'][cid]
pose = posel[nv][poseid].copy()
if self.camtoworld:
Rh = pose[:,:3].copy()
invR = np.linalg.inv(cameras['R'][nv])
Rh_m_old = np.matrix(cv2.Rodrigues(Rh)[0])
Rh_m_new = invR @ Rh_m_old
Rh = cv2.Rodrigues(Rh_m_new)[0]
Merge_list.append(np.hstack((Rh.reshape(3),pose[:,3:].reshape(-1))))
Merge_list_rot.append(np.hstack((np.array(Rh_m_new).reshape(-1),pose[:,3:].reshape(-1))))
else:
Merge_list.append(pose.reshape(-1))
Rh = pose[:,:3].copy()
Rh_m_old = np.matrix(cv2.Rodrigues(Rh)[0])
Merge_list_rot.append(np.hstack((np.array(Rh_m_old).reshape(-1),pose[:,3:].reshape(-1))))
            # gather the coordinate-transformed, view-selected poses into a new set,
            # then select views via hierarchical clustering on the rotation-matrix form
            groups = self.Hierarchical_Cluster(Merge_list_rot, self.threshold)
            # Alternative: build an affinity matrix (the pairwise distance between
            # any two poses) and group by affinity, merging the grouped results:
            # affinity = self.calculate_aff(Merge_list, self.DIST_MAX)
            # N2D = affinity.shape[0]
            # prev_id = np.zeros(N2D) - 1
            # dimGroups = np.cumsum([0] + [1] * N2D)
            # groups = self.aff_to_groups(Merge_list, affinity, dimGroups, prev_id)
FULL_LOG('[select views] frame:{}, pid:{}, handtype:{}'.format(self.count, pid, self.handtype))
FULL_LOG('[groups] groups:{}'.format(str(groups)))
            # merge the poses within each group
new_poses = []
            for gp in groups:
                merge_pose = np.array(Merge_list_rot)[gp].mean(axis=0)
                Rot = merge_pose[:9].reshape((3,3))
                Rh = cv2.Rodrigues(Rot)[0]
                merge_pose = np.hstack((Rh.reshape(3), merge_pose[9:].reshape(-1)))
                new_poses.append(merge_pose)
            # With multiple groups, compute the distance between each group and the
            # last frame's result (matching last frame's hand to this frame's),
            # then pick the group with the smaller distance.
            if len(self.results) > pid:
                # TODO: if the distance to the previous frame is too large, try to restart tracking, i.e. pick the group seen by the most views
pose_, dis, minid, dis_ = self.match_with_lastframe(self.results[pid],new_poses)
FULL_LOG('[select 0 ] minid:{}'.format(minid))
FULL_LOG('[select 0 ] dis:{}'.format(str(dis_.tolist())))
                if isinstance(dt, int) or dis_.min() > 10: # no view detected the hand, or all views differ too much from the last frame
                    FULL_LOG('[select 0 ] last pose')
pose_ = self.results[pid].copy()
else:
threshold_=0.3
if self.mode==1:
threshold_=1
                    if dis > threshold_: # above this threshold, assume the last frame was unreliable and reselect for this frame
array_len = np.array([len(gp) for gp in groups])
a_max = array_len.max()
d_max = 500
idx=0
                        for gid in range(array_len.shape[0]):
                            if array_len[gid] == a_max and dis_[gid] < d_max:
                                d_max = dis_[gid]
                                idx = gid
                        pose_ = new_poses[idx].copy()
FULL_LOG('[select 0 ] max len(groups):{}\n'.format(idx))
self.results[pid] = pose_.copy()
            else:
                # TODO: without a previous frame to supervise the choice, either
                # merge all groups' results or use the group with the most members
                # TODO: if several groups tie in size, break the tie further,
                # e.g. by the largest affinity sum
idx=np.argmax([len(gp) for gp in groups])
pose_ = new_poses[idx].copy()
self.results.append(pose_.copy())
FULL_LOG('[select 1 ] max len(groups):{}\n'.format(idx))
            # Collect and return the results: left/right hand poses in world
            # coordinates aligned with the body ids; could also return params
            # (decide whether params should be a list or a dict)
hand_list.append(pose_)
poses_ = np.stack(hand_list)
Rh = poses_[:,:3].copy()
poses_[:,:3] = 0
params={
'Rh':Rh,
'Th':np.zeros_like(Rh),
'poses':poses_,
'shapes':np.zeros((Rh.shape[0],10)),
}
self.count+=1
return {'params': params}
class Select_Views_handlr:
def __init__(self, camtoworld) -> None:
self.camtoworld = camtoworld
self.model_l = Select_Views(camtoworld, 'handl')
self.model_r = Select_Views(camtoworld, 'handr')
def __call__(self, posel, poser, match3d_l, match3d_r, cameras) -> Any:
params_l = self.model_l(posel, cameras, match3d_l)
params_r = self.model_r(poser, cameras, match3d_r)
return {'params_l':params_l['params'], 'params_r':params_r['params']}
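
# --- Illustrative sketch (not part of the module): the ward-linkage grouping
# behind Select_Views.Hierarchical_Cluster, on toy 2D points; scipy's fcluster
# gives an equivalent threshold-based grouping.
import numpy as np
import scipy.cluster.hierarchy as sch

pts = np.array([[0., 0.], [0.1, 0.], [5., 5.], [5.1, 5.]])
Z = sch.linkage(pts, method='ward')                    # rows: [id1, id2, dist, count]
labels = sch.fcluster(Z, t=2.0, criterion='distance')  # -> two groups: [1 1 2 2]
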

View File

@ -0,0 +1,138 @@
from typing import Any
import numpy as np
class SmoothAny:
def __init__(self, window_size) -> None:
self.w = window_size
def __call__(self, value, with_conf=True):
wsize = self.w
value = value.copy()
if with_conf:
pos_sum = np.zeros_like(value[:-wsize, ..., :-1])
conf_sum = np.zeros_like(value[:-wsize, ..., -1:])
else:
pos_sum = np.zeros_like(value[:-wsize])
for w in range(wsize):
if with_conf:
pos_sum += value[w:w-wsize, ..., :-1] * value[w:w-wsize, ..., -1:]
conf_sum += value[w:w-wsize, ..., -1:]
else:
pos_sum += value[w:w-wsize]
if with_conf:
pos_smooth = pos_sum / (1e-5 + conf_sum)
value[wsize//2:-wsize//2] = np.dstack([pos_smooth, conf_sum])
else:
pos_smooth = pos_sum / (wsize)
value[wsize//2:-wsize//2] = pos_smooth
return value
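
# --- Illustrative usage (not part of the module): the confidence-weighted
# moving average above on a toy (nFrames, nJoints, xyz + conf) track; only the
# middle frames are replaced by the windowed average.
import numpy as np

example_track = np.random.rand(8, 1, 4).astype(np.float32)
example_out = SmoothAny(window_size=4)(example_track, with_conf=True)   # shape stays (8, 1, 4)
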
class Smooth(SmoothAny):
def __call__(self, keypoints3d):
return {'keypoints3d': super().__call__(keypoints3d, with_conf=True)}
class SmoothPoses:
def __init__(self, window_size) -> None:
self.W = window_size
def __call__(self, params):
        # NOTE: this version pads the sequence at both ends
poses = params['poses']
padding_before = poses[:1].copy().repeat(self.W, 0)
padding_after = poses[-1:].copy().repeat(self.W, 0)
mean = poses.copy()
nFrames = mean.shape[0]
poses_full = np.vstack([padding_before, poses, padding_after])
for w in range(1, self.W+1):
mean += poses_full[self.W-w:self.W-w+nFrames]
mean += poses_full[self.W+w:self.W+w+nFrames]
mean /= 2*self.W + 1
params['poses'] = mean
return {'params': params}
class SmoothRealtime:
def __init__(self, opt_name, win_sizes) -> None:
self.size = {}
self.opt_name = opt_name
self.smdata={}
for idx, name in enumerate(opt_name):
self.smdata[name] = []
self.size[name] = win_sizes[idx]
def cvt_Rh_Rot(self, Rh):
import cv2
RotList = []
Rh = Rh.reshape((-1,3))
for i in range(Rh.shape[0]):
RotList.append(cv2.Rodrigues(Rh[i])[0])
return np.stack(RotList)
def cvt_Rot_Rh(self, Rot):
import cv2
RhList = []
for i in range(Rot.shape[0]):
RhList.append(cv2.Rodrigues(Rot[i])[0].reshape(3))
return np.stack(RhList).reshape((1,-1))
def now_smplh(self):
data={}
for name in self.opt_name:
if name in ['Rh','poses']:
out = (sum(self.smdata[name])/len(self.smdata[name]))
data[name] = self.cvt_Rot_Rh(out)
else:
data[name] = (sum(self.smdata[name])/len(self.smdata[name]))
return data
def __call__(self, data):
for name in self.opt_name:
if name in ['Rh','poses']:
self.smdata[name].append(self.cvt_Rh_Rot(data[name].copy()))
if len(self.smdata[name])>self.size[name]:
self.smdata[name].pop(0)
out = (sum(self.smdata[name])/len(self.smdata[name]))
                data[name] = self.cvt_Rot_Rh(out)
else:
self.smdata[name].append(data[name].copy())
if len(self.smdata[name])>self.size[name]:
self.smdata[name].pop(0)
                data[name] = (sum(self.smdata[name])/len(self.smdata[name]))
return data
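
# --- Illustrative usage (not part of the module): the streaming smoother
# above. Axis-angle keys are averaged as rotation matrices over a rolling
# window, then converted back; the inputs are synthetic.
import numpy as np

sm = SmoothRealtime(opt_name=['Rh'], win_sizes=[3])
for t in range(5):
    frame = sm({'Rh': np.array([[0.0, 0.0, 0.1 * t]])})
# frame['Rh'] now lags the raw value because of the window average
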
class SmoothHandlr:
def __init__(self, opt_name, win_sizes):
self.smooth_handl = SmoothRealtime(opt_name, win_sizes)
self.smooth_handr = SmoothRealtime(opt_name, win_sizes)
def __call__(self, params_l, params_r) -> Any:
params_l = self.smooth_handl(params_l)
params_r = self.smooth_handr(params_r)
return {'params_l': params_l, 'params_r': params_r}
class SmoothSmplh(SmoothRealtime):
def __init__(self, opt_name, win_sizes):
self.opt_name = opt_name
self.win_sizes = win_sizes
self.smooth_lists=[]
    def __call__(self, params_smplh):
        # TODO: route each person by id into its own smoother and clear entries
        # absent for too long; take the ids as input and turn smooth_lists into a map
bz = params_smplh['Rh'].shape[0]
while (len(self.smooth_lists)<bz):
self.smooth_lists.append(SmoothRealtime(self.opt_name, self.win_sizes))
for i in range(bz):
param={}
for key in params_smplh.keys():
param[key] = params_smplh[key][i].reshape(1,-1)
out = self.smooth_lists[i](param)
for key in params_smplh.keys():
params_smplh[key][i] = out[key]
return {'params_smplh': params_smplh}
class Smoothkeypoints3d(SmoothRealtime):
def __init__(self, opt_name, win_sizes):
self.smooth_smplh = SmoothRealtime(opt_name, win_sizes)
def __call__(self, keypoints3d):
ret = self.smooth_smplh({'keypoints3d':keypoints3d})
return ret

View File

@ -0,0 +1,151 @@
import numpy as np
from itertools import combinations
from easymocap.mytools.camera_utils import Undistort
from easymocap.mytools.triangulator import iterative_triangulate
def batch_triangulate(keypoints_, Pall, min_view=2):
""" triangulate the keypoints of whole body
Args:
keypoints_ (nViews, nJoints, 3): 2D detections
Pall (nViews, 3, 4): projection matrix of each view
min_view (int, optional): min view for visible points. Defaults to 2.
Returns:
keypoints3d: (nJoints, 4)
"""
# keypoints: (nViews, nJoints, 3)
# Pall: (nViews, 3, 4)
# A: (nJoints, nViewsx2, 4), x: (nJoints, 4, 1); b: (nJoints, nViewsx2, 1)
v = (keypoints_[:, :, -1]>0).sum(axis=0)
valid_joint = np.where(v >= min_view)[0]
keypoints = keypoints_[:, valid_joint]
conf3d = keypoints[:, :, -1].sum(axis=0)/v[valid_joint]
    # P0/P1/P2: the three rows of each projection matrix, each (1, nViews, 4)
P0 = Pall[None, :, 0, :]
P1 = Pall[None, :, 1, :]
P2 = Pall[None, :, 2, :]
    # uP2: the x coordinate times P2: (nJoints, nViews, 4)
uP2 = keypoints[:, :, 0].T[:, :, None] * P2
vP2 = keypoints[:, :, 1].T[:, :, None] * P2
conf = keypoints[:, :, 2].T[:, :, None]
Au = conf * (uP2 - P0)
Av = conf * (vP2 - P1)
A = np.hstack([Au, Av])
u, s, v = np.linalg.svd(A)
X = v[:, -1, :]
X = X / X[:, 3:]
# out: (nJoints, 4)
result = np.zeros((keypoints_.shape[1], 4))
result[valid_joint, :3] = X[:, :3]
    result[valid_joint, 3] = conf3d
return result
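
# --- Illustrative check (not part of the module): batch_triangulate on two
# synthetic pinhole views; a point at (0, 0, 3) should be recovered. K and the
# baseline are invented for the example.
import numpy as np

example_K = np.array([[500., 0., 320.], [0., 500., 240.], [0., 0., 1.]])
example_P0 = example_K @ np.hstack([np.eye(3), np.zeros((3, 1))])                 # reference camera
example_P1 = example_K @ np.hstack([np.eye(3), np.array([[-0.2], [0.], [0.]])])   # shifted baseline
example_X = np.array([0., 0., 3., 1.])
example_kpts = np.zeros((2, 1, 3))
for i, P in enumerate([example_P0, example_P1]):
    x = P @ example_X
    example_kpts[i, 0] = [x[0] / x[2], x[1] / x[2], 1.0]                          # (u, v, conf)
example_k3d = batch_triangulate(example_kpts, np.stack([example_P0, example_P1]))
# example_k3d[0, :3] is approximately (0, 0, 3)
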
def project_wo_dist(keypoints, RT, einsum='vab,kb->vka'):
homo = np.concatenate([keypoints[..., :3], np.ones_like(keypoints[..., :1])], axis=-1)
kpts2d = np.einsum(einsum, RT, homo)
depth = kpts2d[..., 2]
kpts2d[..., :2] /= kpts2d[..., 2:]
return kpts2d, depth
class SimpleTriangulate:
def __init__(self, mode):
self.mode = mode
@staticmethod
def undistort(points, cameras):
nViews = len(points)
pelvis_undis = []
for nv in range(nViews):
camera = {key:cameras[key][nv] for key in ['R', 'T', 'K', 'dist']}
if points[nv].shape[0] > 0:
pelvis = Undistort.points(points[nv], camera['K'], camera['dist'])
else:
pelvis = points[nv].copy()
pelvis_undis.append(pelvis)
return pelvis_undis
def __call__(self, keypoints, cameras):
'''
keypoints: [nViews, nJoints, 3]
output:
keypoints3d: (nJoints, 4)
'''
keypoints = self.undistort(keypoints, cameras)
keypoints = np.stack(keypoints)
if self.mode == 'naive':
keypoints3d = batch_triangulate(keypoints, cameras['P'])
else:
keypoints3d, k2d = iterative_triangulate(keypoints, cameras['P'], dist_max=25)
return {'keypoints3d': keypoints3d}
class RobustTriangulate(SimpleTriangulate):
def __init__(self, mode, cfg):
super().__init__(mode)
self.cache_view = {}
self.cfg = cfg
def try_to_triangulate_and_project(self, index, keypoints, cameras):
        # triangulate the selected views and compute the mean reprojection error
P = cameras['P'][index]
kpts = keypoints[index][:, None]
k3d = batch_triangulate(kpts, P)
k2d, depth = project_wo_dist(k3d, P)
dist_repro = np.linalg.norm(k2d[..., :2] - kpts[..., :2], axis=-1).mean(axis=-1)
return k3d, dist_repro
def robust_triangulate(self, keypoints, cameras):
        # pick the best initial set of views
        # TODO: remove implausible views
nViews = keypoints.shape[0]
if nViews not in self.cache_view:
views = list(range(nViews))
combs = list(combinations(views, self.cfg.triangulate.init_views))
combs = np.array(combs)
self.cache_view[nViews] = combs
combs = self.cache_view[nViews]
keypoints_comb = keypoints[combs]
conf_sum = keypoints_comb[..., 2].mean(axis=1) * (keypoints_comb[..., 2]>0.05).all(axis=1)
comb_sort_id = (-conf_sum).argsort()
flag_find_init = False
for comb_id in comb_sort_id:
if conf_sum[comb_id] < 0.1:
break
comb = combs[comb_id]
k3d, dist_repro = self.try_to_triangulate_and_project(comb, keypoints, cameras)
if (dist_repro < self.cfg.triangulate.repro_init).all():
flag_find_init = True
init = comb.tolist()
break
        if not flag_find_init:
            raise RuntimeError('Cannot find a good initial view pair')
view_idxs = (-keypoints[:, -1]).argsort()
for view_idx in view_idxs:
if view_idx in init:
continue
if keypoints[view_idx, 2] < 0.1:
continue
k3d, dist_repro = self.try_to_triangulate_and_project(init+[view_idx], keypoints, cameras)
if (dist_repro < self.cfg.triangulate.repro_2d).all():
init.append(view_idx)
return k3d, init
def __call__(self, keypoints, cameras):
"""
keypoints: (nViews, nJoints, 3)
cameras: (nViews, 3, 4)
"""
nViews, nJoints, _ = keypoints.shape
keypoints_undis = np.stack(self.undistort(keypoints, cameras))
# for each points, find good initial pairs
points_all = np.zeros((nJoints, 4))
keypoints_copy = keypoints.copy()
for nj in range(nJoints):
point, select_views = self.robust_triangulate(keypoints_undis[:, nj], cameras)
points_all[nj:nj+1] = point
keypoints_copy[select_views, nj, 2] += 10
keypoints_copy[:, nj, 2] = np.clip(keypoints_copy[:, nj, 2]-10, 0, 1)
return {'keypoints3d': points_all, 'keypoints_select': keypoints_copy}

View File

@ -0,0 +1,149 @@
from typing import Any
from easymocap.config import Config, load_object
from easymocap.mytools.debug_utils import mywarn, log
import numpy as np
import time
from tabulate import tabulate
class Timer:
def __init__(self, record, verbose) -> None:
self.keys = list(record.keys())
self.header = self.keys
self.verbose = verbose
def update(self, timer):
if not self.verbose:
return
contents = []
for key in self.keys:
if key not in timer:
contents.append('skip')
else:
contents.append('{:.3f}s'.format(timer[key]))
print(tabulate(headers=self.header, tabular_data=[contents], tablefmt='fancy_grid'))
class MultiStage:
def load_final(self):
at_finals = {}
for key, val in self._at_final.items():
if val['module'] == 'skip':
mywarn('Stage {} is not used'.format(key))
continue
log('[{}] loading {}'.format(self.__class__.__name__, key))
model = load_object(val['module'], val['args'])
model.output = self.output
at_finals[key] = model
self.model_finals = at_finals
def __init__(self, output, at_step, at_final) -> None:
log('[{}] writing the results to {}'.format(self.__class__.__name__, output))
at_steps = {}
for key, val in at_step.items():
if val['module'] == 'skip':
mywarn('Stage {} is not used'.format(key))
continue
log('[{}] loading module {}'.format(self.__class__.__name__, key))
model = load_object(val['module'], val['args'])
model.output = output
at_steps[key] = model
self.output = output
self.model_steps = at_steps
self._at_step = at_step
self._at_final = at_final
self.timer = Timer(at_steps, verbose=False)
def at_step(self, data, index):
ret = {}
if 'meta' in data:
ret['meta'] = data['meta']
timer = {}
for key, model in self.model_steps.items():
for k in self._at_step[key].get('key_keep', []):
ret[k] = data[k]
if self._at_step[key].get('skip', False):
continue
inputs = {}
for k in self._at_step[key].get('key_from_data', []):
inputs[k] = data[k]
for k in self._at_step[key].get('key_from_previous', []):
inputs[k] = ret[k]
            start = time.time()
            try:
                output = model(**inputs)
            except Exception:
                print('[{}] Error in {}'.format('Stages', key))
                raise
timer[key] = time.time() - start
if output is not None:
ret.update(output)
self.timer.update(timer)
return ret
@staticmethod
def merge_data(infos_all):
info0 = infos_all[0]
data = {}
for key, val in info0.items():
data[key] = [info[key] for info in infos_all]
if isinstance(val, np.ndarray):
try:
data[key] = np.stack(data[key])
except ValueError:
                    print('[{}] Skip merge {}'.format('Stages', key))
elif isinstance(val, dict):
data[key] = MultiStage.merge_data(data[key])
return data
def at_final(self, infos_all):
self.load_final()
data = self.merge_data(infos_all)
log('Keep keys: {}'.format(list(data.keys())))
ret = {}
for key, model in self.model_finals.items():
for iter_ in range(self._at_final[key].get('repeat', 1)):
inputs = {}
for k in self._at_final[key].get('key_from_data', []):
inputs[k] = data[k]
for k in self._at_final[key].get('key_from_previous', []):
inputs[k] = ret[k]
                try:
                    output = model(**inputs)
                except Exception:
                    print('[{}] Error in {}'.format('Stages', key))
                    raise
if output is not None:
ret.update(output)
return ret
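
# --- Illustrative sketch (not part of the module): the shape of an `at_step`
# config consumed above. Each stage names a module to load plus the keys it
# reads from the dataset item or from earlier stages; the module path and key
# names here are hypothetical.
example_at_step = {
    'triangulate': {
        'module': 'myeasymocap.operations.triangulate.SimpleTriangulate',  # assumed path
        'args': {'mode': 'naive'},
        'key_from_data': ['keypoints', 'cameras'],  # pulled from the dataset item
        'key_from_previous': [],                    # outputs of earlier stages
    },
}
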
class StageForFittingEach:
def __init__(self, stages, keys_keep) -> None:
stages_ = {}
for key, val in stages.items():
if val['module'] == 'skip':
mywarn('Stage {} is not used'.format(key))
continue
model = load_object(val['module'], val['args'])
stages_[key] = model
self.stages = stages_
self.stages_args = stages
self.keys_keep = keys_keep
def __call__(self, results, **ret):
for pid, result in results.items():
ret0 = {}
ret0.update(ret)
for key, stage in self.stages.items():
for iter_ in range(self.stages_args[key].get('repeat', 1)):
inputs = {}
for k in self.stages_args[key].get('key_from_data', []):
inputs[k] = result[k]
for k in self.stages_args[key].get('key_from_previous', []):
inputs[k] = ret0[k]
output = stage(**inputs)
if output is not None:
ret0.update(output)
for key in self.keys_keep:
result[key] = ret0[key]
return {'results': results}

View File

@ -0,0 +1,46 @@
import numpy as np
from tqdm import tqdm
class CheckFramePerson:
def __init__(self, key) -> None:
self.key = key
self.pids = []
self.frames = 0
def __call__(self, keypoints3d, pids):
k3d_, pid_ = [], []
for i, pid in enumerate(pids):
if pid not in self.pids:
if self.frames == 0:
print('[{}]/{:06d} Add person {}'.format(self.__class__.__name__, self.frames, pid))
self.pids.append(pid)
else:
continue
k3d_.append(keypoints3d[i])
pid_.append(pid)
self.frames += 1
k3d_ = np.stack(k3d_)
return {
'keypoints3d': k3d_,
'pids': pid_
}
class CollectMultiPersonMultiFrame:
def __init__(self, key) -> None:
self.key = key
def __call__(self, keypoints3d, pids):
records = {}
for frame in tqdm(range(len(pids)), desc='Reading'):
pid_frame = pids[frame]
for i, pid in enumerate(pid_frame):
if pid not in records:
records[pid] = {
'frames': [],
'keypoints3d': []
}
records[pid]['frames'].append(frame)
records[pid]['keypoints3d'].append(keypoints3d[frame][i])
for pid, record in records.items():
record['keypoints3d'] = np.stack(record['keypoints3d']).astype(np.float32)
return {'results': records}
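
# --- Illustrative usage (not part of the module): two frames with person ids
# [0] and [0, 1] are regrouped into per-person tracks with frame indices.
import numpy as np

collect = CollectMultiPersonMultiFrame(key='keypoints3d')
example_k3d = [np.random.rand(1, 25, 4), np.random.rand(2, 25, 4)]
example_out = collect(example_k3d, pids=[[0], [0, 1]])['results']
# example_out[0]['frames'] == [0, 1]; example_out[0]['keypoints3d'].shape == (2, 25, 4)
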

View File

@ -6,5 +6,7 @@ yacs
tabulate
termcolor
chumpy
mediapipe
func_timeout
mediapipe==0.10.0
func_timeout
ultralytics
gdown

View File

@ -21,8 +21,19 @@ setup(
'easymocap.pyfitting',
'easymocap.mytools',
'easymocap.annotator',
'easymocap.estimator'
'easymocap.estimator',
'myeasymocap'
],
entry_points={
'console_scripts': [
'emc=apps.mocap.run:main_entrypoint',
# 'easymocap_calib=easymocap.mytools.entry:calib',
# 'easymocap_tools=easymocap.mytools.entry:main',
# 'extract_keypoints=easymocap.mytools.cmdtools.extract_keypoints:main'
],
},
install_requires=[],
data_files = []
)