diff --git a/easymocap/estimator/HRNet/__init__.py b/easymocap/estimator/HRNet/__init__.py new file mode 100644 index 0000000..3e91d07 --- /dev/null +++ b/easymocap/estimator/HRNet/__init__.py @@ -0,0 +1,8 @@ +''' + @ Date: 2020-06-04 12:48:29 + @ LastEditors: Qing Shuai + @ LastEditTime: 2020-11-17 15:52:23 + @ Author: Qing Shuai + @ Mail: s_q@zju.edu.cn +''' +from .hrnet_api import SimpleHRNet \ No newline at end of file diff --git a/easymocap/estimator/HRNet/hrnet.py b/easymocap/estimator/HRNet/hrnet.py new file mode 100644 index 0000000..d3e27f4 --- /dev/null +++ b/easymocap/estimator/HRNet/hrnet.py @@ -0,0 +1,216 @@ +import torch +from torch import nn +from .modules import BasicBlock, Bottleneck + + +class StageModule(nn.Module): + def __init__(self, stage, output_branches, c, bn_momentum): + super(StageModule, self).__init__() + self.stage = stage + self.output_branches = output_branches + + self.branches = nn.ModuleList() + for i in range(self.stage): + w = c * (2 ** i) + branch = nn.Sequential( + BasicBlock(w, w, bn_momentum=bn_momentum), + BasicBlock(w, w, bn_momentum=bn_momentum), + BasicBlock(w, w, bn_momentum=bn_momentum), + BasicBlock(w, w, bn_momentum=bn_momentum), + ) + self.branches.append(branch) + + self.fuse_layers = nn.ModuleList() + # for each output_branches (i.e. each branch in all cases but the very last one) + for i in range(self.output_branches): + self.fuse_layers.append(nn.ModuleList()) + for j in range(self.stage): # for each branch + if i == j: + self.fuse_layers[-1].append(nn.Sequential()) # Used in place of "None" because it is callable + elif i < j: + self.fuse_layers[-1].append(nn.Sequential( + nn.Conv2d(c * (2 ** j), c * (2 ** i), kernel_size=(1, 1), stride=(1, 1), bias=False), + nn.BatchNorm2d(c * (2 ** i), eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), + nn.Upsample(scale_factor=(2.0 ** (j - i)), mode='nearest'), + )) + elif i > j: + ops = [] + for k in range(i - j - 1): + ops.append(nn.Sequential( + nn.Conv2d(c * (2 ** j), c * (2 ** j), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), + bias=False), + nn.BatchNorm2d(c * (2 ** j), eps=1e-05, momentum=0.1, affine=True, + track_running_stats=True), + nn.ReLU(inplace=True), + )) + ops.append(nn.Sequential( + nn.Conv2d(c * (2 ** j), c * (2 ** i), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), + bias=False), + nn.BatchNorm2d(c * (2 ** i), eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), + )) + self.fuse_layers[-1].append(nn.Sequential(*ops)) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + assert len(self.branches) == len(x) + + x = [branch(b) for branch, b in zip(self.branches, x)] + + x_fused = [] + for i in range(len(self.fuse_layers)): + for j in range(0, len(self.branches)): + if j == 0: + x_fused.append(self.fuse_layers[i][0](x[0])) + else: + x_fused[i] = x_fused[i] + self.fuse_layers[i][j](x[j]) + + for i in range(len(x_fused)): + x_fused[i] = self.relu(x_fused[i]) + + return x_fused + + +class HRNet(nn.Module): + def __init__(self, c=48, nof_joints=17, bn_momentum=0.1): + super(HRNet, self).__init__() + + # Input (stem net) + self.conv1 = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) + self.bn1 = nn.BatchNorm2d(64, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True) + self.conv2 = nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) + self.bn2 = nn.BatchNorm2d(64, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True) + self.relu = 
nn.ReLU(inplace=True) + + # Stage 1 (layer1) - First group of bottleneck (resnet) modules + downsample = nn.Sequential( + nn.Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False), + nn.BatchNorm2d(256, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True), + ) + self.layer1 = nn.Sequential( + Bottleneck(64, 64, downsample=downsample), + Bottleneck(256, 64), + Bottleneck(256, 64), + Bottleneck(256, 64), + ) + + # Fusion layer 1 (transition1) - Creation of the first two branches (one full and one half resolution) + self.transition1 = nn.ModuleList([ + nn.Sequential( + nn.Conv2d(256, c, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False), + nn.BatchNorm2d(c, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True), + nn.ReLU(inplace=True), + ), + nn.Sequential(nn.Sequential( # Double Sequential to fit with official pretrained weights + nn.Conv2d(256, c * (2 ** 1), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False), + nn.BatchNorm2d(c * (2 ** 1), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True), + nn.ReLU(inplace=True), + )), + ]) + + # Stage 2 (stage2) - Second module with 1 group of bottleneck (resnet) modules. This has 2 branches + self.stage2 = nn.Sequential( + StageModule(stage=2, output_branches=2, c=c, bn_momentum=bn_momentum), + ) + + # Fusion layer 2 (transition2) - Creation of the third branch (1/4 resolution) + self.transition2 = nn.ModuleList([ + nn.Sequential(), # None, - Used in place of "None" because it is callable + nn.Sequential(), # None, - Used in place of "None" because it is callable + nn.Sequential(nn.Sequential( # Double Sequential to fit with official pretrained weights + nn.Conv2d(c * (2 ** 1), c * (2 ** 2), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False), + nn.BatchNorm2d(c * (2 ** 2), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True), + nn.ReLU(inplace=True), + )), # ToDo Why the new branch derives from the "upper" branch only? + ]) + + # Stage 3 (stage3) - Third module with 4 groups of bottleneck (resnet) modules. This has 3 branches + self.stage3 = nn.Sequential( + StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum), + StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum), + StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum), + StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum), + ) + + # Fusion layer 3 (transition3) - Creation of the fourth branch (1/8 resolution) + self.transition3 = nn.ModuleList([ + nn.Sequential(), # None, - Used in place of "None" because it is callable + nn.Sequential(), # None, - Used in place of "None" because it is callable + nn.Sequential(), # None, - Used in place of "None" because it is callable + nn.Sequential(nn.Sequential( # Double Sequential to fit with official pretrained weights + nn.Conv2d(c * (2 ** 2), c * (2 ** 3), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False), + nn.BatchNorm2d(c * (2 ** 3), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True), + nn.ReLU(inplace=True), + )), # ToDo Why the new branch derives from the "upper" branch only? + ]) + + # Stage 4 (stage4) - Fourth module with 3 groups of bottleneck (resnet) modules. 
This has 4 branches + self.stage4 = nn.Sequential( + StageModule(stage=4, output_branches=4, c=c, bn_momentum=bn_momentum), + StageModule(stage=4, output_branches=4, c=c, bn_momentum=bn_momentum), + StageModule(stage=4, output_branches=1, c=c, bn_momentum=bn_momentum), + ) + + # Final layer (final_layer) + self.final_layer = nn.Conv2d(c, nof_joints, kernel_size=(1, 1), stride=(1, 1)) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + + x = self.layer1(x) + x = [trans(x) for trans in self.transition1] # Since now, x is a list (# == nof branches) + + x = self.stage2(x) + # x = [trans(x[-1]) for trans in self.transition2] # New branch derives from the "upper" branch only + x = [ + self.transition2[0](x[0]), + self.transition2[1](x[1]), + self.transition2[2](x[-1]) + ] # New branch derives from the "upper" branch only + + x = self.stage3(x) + # x = [trans(x) for trans in self.transition3] # New branch derives from the "upper" branch only + x = [ + self.transition3[0](x[0]), + self.transition3[1](x[1]), + self.transition3[2](x[2]), + self.transition3[3](x[-1]) + ] # New branch derives from the "upper" branch only + + x = self.stage4(x) + + x = self.final_layer(x[0]) + + return x + + +if __name__ == '__main__': + # model = HRNet(48, 17, 0.1) + model = HRNet(32, 17, 0.1) + + # print(model) + + model.load_state_dict( + # torch.load('./weights/pose_hrnet_w48_384x288.pth') + torch.load('./weights/pose_hrnet_w32_256x192.pth') + ) + print('ok!!') + + if torch.cuda.is_available() and False: + torch.backends.cudnn.deterministic = True + device = torch.device('cuda:0') + else: + device = torch.device('cpu') + + print(device) + + model = model.to(device) + + y = model(torch.ones(1, 3, 384, 288).to(device)) + print(y.shape) + print(torch.min(y).item(), torch.mean(y).item(), torch.max(y).item()) diff --git a/easymocap/estimator/HRNet/hrnet_api.py b/easymocap/estimator/HRNet/hrnet_api.py new file mode 100644 index 0000000..3084dee --- /dev/null +++ b/easymocap/estimator/HRNet/hrnet_api.py @@ -0,0 +1,527 @@ +''' + @ Date: 2020-06-04 12:47:04 + @ LastEditors: Qing Shuai + @ LastEditTime: 2022-04-19 17:02:57 + @ Author: Qing Shuai + @ Mail: s_q@zju.edu.cn +''' +from os.path import join +import cv2 +import numpy as np +import torch +from torchvision.transforms import transforms + +from .hrnet import HRNet + +COCO17_IN_BODY25 = [0,16,15,18,17,5,2,6,3,7,4,12,9,13,10,14,11] +pairs = [[1, 8], [1, 2], [1, 5], [2, 3], [3, 4], [5, 6], [6, 7], [8, 9], [9, 10], [10, 11], [8, 12], [12, 13], [13, 14], [1, 0], [0,15], [15,17], [0,16], [16,18], [14,19], [19,20], [14,21], [11,22], [22,23], [11,24]] +def coco17tobody25(points2d): + kpts = np.zeros((points2d.shape[0], 25, 3)) + kpts[:, COCO17_IN_BODY25, :2] = points2d[:, :, :2] + kpts[:, COCO17_IN_BODY25, 2:3] = points2d[:, :, 2:3] + kpts[:, 8, :2] = kpts[:, [9, 12], :2].mean(axis=1) + kpts[:, 8, 2] = kpts[:, [9, 12], 2].min(axis=1) + kpts[:, 1, :2] = kpts[:, [2, 5], :2].mean(axis=1) + kpts[:, 1, 2] = kpts[:, [2, 5], 2].min(axis=1) + # 需要交换一下 + # kpts = kpts[:, :, [1,0,2]] + return kpts + +# 生成高斯核 +def generate_gauss(sigma): + tmp_size = sigma * 3 + size = 2 * tmp_size + 1 + x = np.arange(0, size, 1, np.float32) + y = x[:, np.newaxis] + x0 = y0 = size // 2 + # The gaussian is not normalized, we want the center value to equal 1 + g = np.exp(- ((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2)) + return g, tmp_size + +gauss = {} +for SIGMA in range(1, 5): + gauss_kernel, gauss_radius = 
generate_gauss(SIGMA) + gauss[SIGMA] = { + 'kernel': gauss_kernel, + 'radius': gauss_radius + } + +def box_to_center_scale(box, model_image_width, model_image_height, scale_factor=1.25): + """convert a box to center,scale information required for pose transformation + Parameters + ---------- + box : list of tuple + list of length 2 with two tuples of floats representing + bottom left and top right corner of a box + model_image_width : int + model_image_height : int + + Returns + ------- + (numpy array, numpy array) + Two numpy arrays, coordinates for the center of the box and the scale of the box + """ + center = np.zeros((2), dtype=np.float32) + + bottom_left_corner = (box[0], box[1]) + top_right_corner = (box[2], box[3]) + box_width = top_right_corner[0]-bottom_left_corner[0] + box_height = top_right_corner[1]-bottom_left_corner[1] + bottom_left_x = bottom_left_corner[0] + bottom_left_y = bottom_left_corner[1] + center[0] = bottom_left_x + box_width * 0.5 + center[1] = bottom_left_y + box_height * 0.5 + + aspect_ratio = model_image_width * 1.0 / model_image_height + pixel_std = 200 + + if box_width > aspect_ratio * box_height: + box_height = box_width * 1.0 / aspect_ratio + elif box_width < aspect_ratio * box_height: + box_width = box_height * aspect_ratio + scale = np.array( + [box_width * 1.0 / pixel_std, box_height * 1.0 / pixel_std], + dtype=np.float32) + scale = scale * scale_factor + return center, scale + +def get_dir(src_point, rot_rad): + sn, cs = np.sin(rot_rad), np.cos(rot_rad) + + src_result = [0, 0] + src_result[0] = src_point[0] * cs - src_point[1] * sn + src_result[1] = src_point[0] * sn + src_point[1] * cs + + return src_result + +def get_3rd_point(a, b): + direct = a - b + return b + np.array([-direct[1], direct[0]], dtype=np.float32) + + +def get_affine_transform( + center, scale, rot, output_size, + shift=np.array([0, 0], dtype=np.float32), inv=0 +): + if not isinstance(scale, np.ndarray) and not isinstance(scale, list): + print(scale) + scale = np.array([scale, scale]) + + scale_tmp = scale * 200.0 + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = get_dir([0, src_w * -0.5], rot_rad) + dst_dir = np.array([0, dst_w * -0.5], np.float32) + + src = np.zeros((3, 2), dtype=np.float32) + dst = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + + src[2:, :] = get_3rd_point(src[0, :], src[1, :]) + dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return trans + + +def get_max_preds(batch_heatmaps): + ''' + get predictions from score maps + heatmaps: numpy.ndarray([batch_size, num_joints, height, width]) + ''' + assert isinstance(batch_heatmaps, np.ndarray), \ + 'batch_heatmaps should be numpy.ndarray' + assert batch_heatmaps.ndim == 4, 'batch_images should be 4-ndim' + + batch_size = batch_heatmaps.shape[0] + num_joints = batch_heatmaps.shape[1] + width = batch_heatmaps.shape[3] + heatmaps_reshaped = batch_heatmaps.reshape((batch_size, num_joints, -1)) + idx = np.argmax(heatmaps_reshaped, 2) + maxvals = np.amax(heatmaps_reshaped, 2) + + maxvals = maxvals.reshape((batch_size, num_joints, 1)) + idx = idx.reshape((batch_size, num_joints, 1)) + + preds = np.tile(idx, (1, 1, 
2)).astype(np.float32) + + preds[:, :, 0] = (preds[:, :, 0]) % width + preds[:, :, 1] = np.floor((preds[:, :, 1]) / width) + + pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2)) + pred_mask = pred_mask.astype(np.float32) + + preds *= pred_mask + return preds, maxvals + +def affine_transform(pt, t): + new_pt = np.array([pt[0], pt[1], 1.]).T + new_pt = np.dot(t, new_pt) + return new_pt[:2] + +def batch_affine_transform(points, trans): + points = np.hstack((points[:, :2], np.ones((points.shape[0], 1)))) + out = points @ trans.T + return out + +def transform_preds(coords, center, scale, rot, output_size): + target_coords = np.zeros(coords.shape) + trans = get_affine_transform(center, scale, rot, output_size, inv=1) + target_coords[:, :2] = batch_affine_transform(coords, trans) + return target_coords + +config_ = {'kintree': [[1, 0], [2, 0], [3, 1], [4, 2], [5, 0], [6, 0], [7, 5], [8, 6], [9, 7], [10, 8], [11, 5], [12, 6], [13, 11], [ + 14, 12], [15, 13], [16, 14], [6, 5], [12, 11]], 'color': ['g', 'r', 'g', 'r', 'g', 'r', 'g', 'r', 'g', 'r', 'g', 'r', 'g', 'r', 'g', 'r', 'k', 'k']} +colors_table = { + # colorblind/print/copy safe: + '_blue': [0.65098039, 0.74117647, 0.85882353], + '_pink': [.9, .7, .7], + '_mint': [ 166/255., 229/255., 204/255.], + '_mint2': [ 202/255., 229/255., 223/255.], + '_green': [ 153/255., 216/255., 201/255.], + '_green2': [ 171/255., 221/255., 164/255.], + '_red': [ 251/255., 128/255., 114/255.], + '_orange': [ 253/255., 174/255., 97/255.], + '_yellow': [ 250/255., 230/255., 154/255.], + 'r':[255/255,0,0], + 'g':[0,255/255,0], + 'b':[0,0,255/255], + 'k':[0,0,0], + 'y':[255/255,255/255,0], + 'purple':[128/255,0,128/255] +} +for key, val in colors_table.items(): + colors_table[key] = tuple([int(val[2]*255), int(val[1]*255), int(val[0]*255)]) + +def save_batch_heatmaps(batch_image, batch_heatmaps, file_name, + normalize=True): + ''' + batch_image: [batch_size, channel, height, width] + batch_heatmaps: ['batch_size, num_joints, height, width] + file_name: saved file name + ''' + if normalize: + batch_image = batch_image.clone() + min = float(batch_image.min()) + max = float(batch_image.max()) + + batch_image.add_(-min).div_(max - min + 1e-5) + + batch_size = batch_heatmaps.size(0) + num_joints = batch_heatmaps.size(1) + heatmap_height = batch_heatmaps.size(2) + heatmap_width = batch_heatmaps.size(3) + + grid_image = np.zeros((batch_size*heatmap_height, + (num_joints+2)*heatmap_width, + 3), + dtype=np.uint8) + + preds, maxvals = get_max_preds(batch_heatmaps.detach().cpu().numpy()) + + for i in range(batch_size): + image = batch_image[i].mul(255)\ + .clamp(0, 255)\ + .byte()\ + .permute(1, 2, 0)\ + .cpu().numpy() + heatmaps = batch_heatmaps[i].mul(255)\ + .clamp(0, 255)\ + .byte()\ + .cpu().numpy() + + resized_image = cv2.resize(image, + (int(heatmap_width), int(heatmap_height))) + resized_image_copy = resized_image.copy() + height_begin = heatmap_height * i + height_end = heatmap_height * (i + 1) + for ip in range(len(config_['kintree'])): + src, dst = config_['kintree'][ip] + c = config_['color'][ip] + if maxvals[i][src] < 0.1 or maxvals[i][dst] < 0.1: + continue + plot_line(resized_image_copy, preds[i][src], preds[i][dst], colors_table[c], 1) + for j in range(num_joints): + cv2.circle(resized_image, + (int(preds[i][j][0]), int(preds[i][j][1])), + 1, [0, 0, 255], 1) + heatmap = heatmaps[j, :, :] + mask = (heatmap > 0.1)[:,:,None] + colored_heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET) + masked_image = (colored_heatmap*0.7 + resized_image*0.3)*mask + 
resized_image*(1-mask) + cv2.circle(masked_image, + (int(preds[i][j][0]), int(preds[i][j][1])), + 1, [0, 0, 255], 1) + + width_begin = heatmap_width * (j+2) + width_end = heatmap_width * (j+2+1) + grid_image[height_begin:height_end, width_begin:width_end, :] = \ + masked_image + # grid_image[height_begin:height_end, width_begin:width_end, :] = \ + # colored_heatmap*0.7 + resized_image*0.3 + + grid_image[height_begin:height_end, 0:heatmap_width, :] = resized_image + grid_image[height_begin:height_end, heatmap_width:heatmap_width+heatmap_width, :] = resized_image_copy + cv2.imwrite(file_name, grid_image) + +import math + +def get_final_preds(batch_heatmaps, center, scale, rot=None, flip=None): + coords, maxvals = get_max_preds(batch_heatmaps) + + heatmap_height = batch_heatmaps.shape[2] + heatmap_width = batch_heatmaps.shape[3] + + # post-processing + if True: + for n in range(coords.shape[0]): + for p in range(coords.shape[1]): + hm = batch_heatmaps[n][p] + px = int(math.floor(coords[n][p][0] + 0.5)) + py = int(math.floor(coords[n][p][1] + 0.5)) + if 1 < px < heatmap_width-1 and 1 < py < heatmap_height-1: + diff = np.array( + [ + hm[py][px+1] - hm[py][px-1], + hm[py+1][px]-hm[py-1][px] + ] + ) + coords[n][p] += np.sign(diff) * .25 + + preds = coords.copy() + + # Transform back + for i in range(coords.shape[0]): + if flip is not None: + if flip[i]: + coords[i, :, 0] = heatmap_width - 1 - coords[i, :, 0] + if rot is None: + _rot = 0 + else: + _rot = rot[i] + preds[i] = transform_preds( + coords[i], center[i], scale[i], _rot, [heatmap_width, heatmap_height] + ) + return preds, maxvals + +def get_gaussian_maps(net_out, keypoints, sigma): + radius, kernel = gauss[sigma]['radius'], gauss[sigma]['kernel'] + weights = np.ones(net_out.shape, dtype=np.float32) + for i in range(weights.shape[0]): + for nj in range(weights.shape[1]): + if keypoints[i][nj][2] < 0: + weights[i][nj] = 0 + continue + elif keypoints[i][nj][2] < 0.01: + weights[i][nj] = 0 + continue + weights[i][nj] = 0 + mu_x, mu_y = keypoints[i][nj][:2] + mu_x, mu_y = int(mu_x + 0.5), int(mu_y + 0.5) + # Usable gaussian range + ul = [mu_x - radius, mu_y - radius] + br = [mu_x + radius + 1, mu_y + radius + 1] + # Usable gaussian range + g_x = max(0, -ul[0]), min(br[0], weights.shape[3]) - ul[0] + g_y = max(0, -ul[1]), min(br[1], weights.shape[2]) - ul[1] + # Image range + img_x = max(0, ul[0]), min(br[0], weights.shape[3]) + img_y = max(0, ul[1]), min(br[1], weights.shape[2]) + weights[i][nj][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \ + kernel[g_y[0]:g_y[1], g_x[0]:g_x[1]] + return weights + +humanId = 0 + +class SimpleHRNet: + def __init__(self, c, nof_joints, checkpoint_path, device, resolution=(288, 384),): + self.device = device + self.c = c + self.nof_joints = nof_joints + self.checkpoint_path = checkpoint_path + self.max_batch_size = 64 + self.resolution = resolution # in the form (height, width) as in the original implementation + self.transform = transforms.Compose([ + # transforms.ToPILImage(), + # transforms.Resize((self.resolution[0], self.resolution[1])), # (height, width) + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + self.model = HRNet(c=c, nof_joints=nof_joints).to(device) + self.model.load_state_dict(torch.load(checkpoint_path, map_location=self.device)) + self.model.eval() + + def __call__(self, image, bboxes, rot=0, net_out=False): + # image: + images = torch.zeros((len(bboxes), 3, self.resolution[1], self.resolution[0]), device=self.device) # (height, width) + 
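# each bbox is consumed as a flat (x1, y1, x2, y2) box in image pixels; box_to_center_scale below turns it into the center/scale used for the affine crop +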
if len(bboxes) > 0: + # pose estimation : for multiple people + centers, scales, trans_all = [], [], [] + for box in bboxes: + center, scale = box_to_center_scale(box, self.resolution[0], self.resolution[1]) + centers.append(center) + scales.append(scale) + trans = get_affine_transform(center, scale, rot=rot, output_size=self.resolution) + trans_all.append(trans) + for i, trans in enumerate(trans_all): + # Crop smaller image of people + model_input = cv2.warpAffine( + image, trans, + (int(self.resolution[0]), int(self.resolution[1])), + flags=cv2.INTER_LINEAR) + # cv2.imshow('input', model_input) + # cv2.waitKey(0) + # hwc -> 1chw + model_input = self.transform(model_input)#.unsqueeze(0) + images[i] = model_input + images = images.to(self.device) + with torch.no_grad(): + out = self.model(images) + out = out.cpu().detach().numpy() + if net_out: + return out, trans_all, centers, scales, rot + coords, max_val = get_final_preds( + out, + np.asarray(centers), + np.asarray(scales), + [rot for _ in range(out.shape[0])]) + pts = np.concatenate((coords, max_val), axis=2) + return coco17tobody25(pts) + else: + return np.empty(0, 25, 3) + + def predict_with_previous(self, image, bboxes, keypoints, sigma): + # (batch, nJoints, height, width) + net_out, trans_all, centers, scales, rot = self.__call__(image, bboxes, net_out=True) + keypoints = keypoints[:, COCO17_IN_BODY25] + keypoints_rescale = keypoints.copy() + for i in range(keypoints.shape[0]): + keypoints_rescale[..., :2] = batch_affine_transform(keypoints[i], trans_all[i])/4 + weights = get_gaussian_maps(net_out, keypoints_rescale, sigma) + out = net_out * weights + coords, max_val = get_final_preds( + out, + np.asarray(centers), + np.asarray(scales), + rot) + pts = np.concatenate((coords, max_val), axis=2) + return coco17tobody25(pts) + + def predict(self, image, detections, keypoints=None, ret_crop=False): + if keypoints is not None: + keypoints = keypoints[:, COCO17_IN_BODY25] + kpts_rescale = [None for _ in range(len(keypoints))] + boxes = [] + rotation = 0 + image_pose = image + # image_pose = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + if detections is not None: + images = torch.zeros((len(detections), 3, self.resolution[1], self.resolution[0]), device=self.device) # (height, width) + # pose estimation : for multiple people + centers = [] + scales = [] + for box in detections: + center, scale = box_to_center_scale(box, self.resolution[0], self.resolution[1]) + centers.append(center) + scales.append(scale) + model_inputs = [] + for i, (center, scale) in enumerate(zip(centers, scales)): + trans = get_affine_transform(center, scale, rotation, self.resolution) + # Crop smaller image of people + model_input = cv2.warpAffine( + image_pose, + trans, + (int(self.resolution[0]), int(self.resolution[1])), + flags=cv2.INTER_LINEAR) + if keypoints is not None: + kpts_homo = keypoints[i].copy() + kpts_homo[:, 2] = 1 + kpts_rescale[i] = (kpts_homo @ trans.T)/4 + # global humanId + # cv2.imwrite('../output/debughrnet/person_{}.jpg'.format(humanId), model_input[:,:,[2,1,0]]) + # humanId += 1 + # hwc -> 1chw + model_input = self.transform(model_input)#.unsqueeze(0) + images[i] = model_input + # torch.cuda.synchronize(self.device) + + # print(' - spending {:.2f}ms in preprocess.'.format(1000*(time.time() - start))) + if images.shape[0] == 0: + return np.empty((0, 25, 3)) + else: + # start = time.time() + images = images.to(self.device) + # torch.cuda.synchronize(self.device) + + # print(' - spending {:.2f}ms in copy to cuda.'.format(1000*(time.time() - 
start))) + # start = time.time() + with torch.no_grad(): + if len(images) <= self.max_batch_size: + out = self.model(images) + else: + out = torch.empty( + (images.shape[0], self.nof_joints, self.resolution[1] // 4, self.resolution[0] // 4) + ).to(self.device) + for i in range(0, len(images), self.max_batch_size): + out[i:i + self.max_batch_size] = self.model(images[i:i + self.max_batch_size]) + # torch.cuda.synchronize(self.device) + global humanId + if keypoints is not None: + filename = join('../output/debughrnet', '{:06d}.jpg'.format(humanId)) + humanId += 1 + # save_batch_heatmaps(images, out, filename) + # 制造高斯核,默认为1 + weights = np.ones(out.shape, dtype=np.float32) + for i in range(weights.shape[0]): + for nj in range(weights.shape[1]): + if keypoints[i][nj][2] < 0: + weights[i][nj] = 0 + continue + elif keypoints[i][nj][2] < 0.01: + continue + weights[i][nj] = 0 + mu_x, mu_y = kpts_rescale[i][nj] + mu_x, mu_y = int(mu_x + 0.5), int(mu_y + 0.5) + # Usable gaussian range + ul = [mu_x - gauss_radius, mu_y - gauss_radius] + br = [mu_x + gauss_radius + 1, mu_y + gauss_radius + 1] + # Usable gaussian range + g_x = max(0, -ul[0]), min(br[0], weights.shape[3]) - ul[0] + g_y = max(0, -ul[1]), min(br[1], weights.shape[2]) - ul[1] + # Image range + img_x = max(0, ul[0]), min(br[0], weights.shape[3]) + img_y = max(0, ul[1]), min(br[1], weights.shape[2]) + weights[i][nj][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \ + gauss_kernel[g_y[0]:g_y[1], g_x[0]:g_x[1]] + filename = join('../output/debughrnet', '{:06d}.jpg'.format(humanId)) + humanId += 1 + # save_batch_heatmaps(images, torch.Tensor(weights), filename) + out = out.cpu().detach().numpy() + out = out * weights + filename = join('../output/debughrnet', '{:06d}.jpg'.format(humanId)) + humanId += 1 + # save_batch_heatmaps(images, torch.Tensor(out), filename) + else: + out = out.cpu().detach().numpy() + coords, max_val = get_final_preds( + out, + np.asarray(centers), + np.asarray(scales)) + pts = np.concatenate((coords, max_val), axis=2) + # torch.cuda.synchronize(self.device) + # print(' - spending {:.2f}ms in postprocess.'.format(1000*(time.time() - start))) + # print('') + if ret_crop: + return coco17tobody25(pts), images + else: + return coco17tobody25(pts) \ No newline at end of file diff --git a/easymocap/estimator/HRNet/modules.py b/easymocap/estimator/HRNet/modules.py new file mode 100644 index 0000000..733fedd --- /dev/null +++ b/easymocap/estimator/HRNet/modules.py @@ -0,0 +1,72 @@ +import torch +from torch import nn + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, bn_momentum=0.1): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=bn_momentum) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=bn_momentum) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, momentum=bn_momentum) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = 
self.relu(out) + + return out + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, bn_momentum=0.1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=bn_momentum) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=bn_momentum) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out diff --git a/easymocap/estimator/YOLOv4/__init__.py b/easymocap/estimator/YOLOv4/__init__.py new file mode 100644 index 0000000..25b4f96 --- /dev/null +++ b/easymocap/estimator/YOLOv4/__init__.py @@ -0,0 +1,8 @@ +''' + @ Date: 2020-12-10 16:37:04 + @ Author: Qing Shuai + @ LastEditors: Qing Shuai + @ LastEditTime: 2020-12-10 16:52:06 + @ FilePath: /mvpose/code/estimator/YOLOv4/__init__.py +''' +from .yolo import YOLOv4 \ No newline at end of file diff --git a/easymocap/estimator/YOLOv4/coco.names b/easymocap/estimator/YOLOv4/coco.names new file mode 100644 index 0000000..ca76c80 --- /dev/null +++ b/easymocap/estimator/YOLOv4/coco.names @@ -0,0 +1,80 @@ +person +bicycle +car +motorbike +aeroplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +sofa +pottedplant +bed +diningtable +toilet +tvmonitor +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/easymocap/estimator/YOLOv4/config.py b/easymocap/estimator/YOLOv4/config.py new file mode 100644 index 0000000..04b60b8 --- /dev/null +++ b/easymocap/estimator/YOLOv4/config.py @@ -0,0 +1,257 @@ +import torch +from .torch_utils import convert2cpu + +def parse_cfg(cfgfile): + blocks = [] + fp = open(cfgfile, 'r') + block = None + line = fp.readline() + while line != '': + line = line.rstrip() + if line == '' or line[0] == '#': + line = fp.readline() + continue + elif line[0] == '[': + if block: + blocks.append(block) + block = dict() + block['type'] = line.lstrip('[').rstrip(']') + # set default value + if block['type'] == 'convolutional': + block['batch_normalize'] = 0 + else: + key, value = line.split('=') + key = key.strip() + if key == 'type': + key = '_type' + value = value.strip() + block[key] = value + line = fp.readline() + + if block: + blocks.append(block) + fp.close() + return blocks + + +def print_cfg(blocks): + print('layer filters size input output'); + prev_width = 416 + prev_height = 416 + prev_filters = 3 + out_filters = [] + out_widths = [] + out_heights = [] + ind = -2 + for block in blocks: + ind = ind + 1 + if block['type'] == 'net': + prev_width = int(block['width']) + prev_height = int(block['height']) + continue + elif block['type'] == 'convolutional': 
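+ # spatial size of a conv output follows (in + 2*pad - kernel_size) // stride + 1, which is what the width/height updates below compute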
+ filters = int(block['filters']) + kernel_size = int(block['size']) + stride = int(block['stride']) + is_pad = int(block['pad']) + pad = (kernel_size - 1) // 2 if is_pad else 0 + width = (prev_width + 2 * pad - kernel_size) // stride + 1 + height = (prev_height + 2 * pad - kernel_size) // stride + 1 + print('%5d %-6s %4d %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( + ind, 'conv', filters, kernel_size, kernel_size, stride, prev_width, prev_height, prev_filters, width, + height, filters)) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'maxpool': + pool_size = int(block['size']) + stride = int(block['stride']) + width = prev_width // stride + height = prev_height // stride + print('%5d %-6s %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( + ind, 'max', pool_size, pool_size, stride, prev_width, prev_height, prev_filters, width, height, + filters)) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'avgpool': + width = 1 + height = 1 + print('%5d %-6s %3d x %3d x%4d -> %3d' % ( + ind, 'avg', prev_width, prev_height, prev_filters, prev_filters)) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'softmax': + print('%5d %-6s -> %3d' % (ind, 'softmax', prev_filters)) + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'cost': + print('%5d %-6s -> %3d' % (ind, 'cost', prev_filters)) + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'reorg': + stride = int(block['stride']) + filters = stride * stride * prev_filters + width = prev_width // stride + height = prev_height // stride + print('%5d %-6s / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( + ind, 'reorg', stride, prev_width, prev_height, prev_filters, width, height, filters)) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'upsample': + stride = int(block['stride']) + filters = prev_filters + width = prev_width * stride + height = prev_height * stride + print('%5d %-6s * %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( + ind, 'upsample', stride, prev_width, prev_height, prev_filters, width, height, filters)) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'route': + layers = block['layers'].split(',') + layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] + if len(layers) == 1: + print('%5d %-6s %d' % (ind, 'route', layers[0])) + prev_width = out_widths[layers[0]] + prev_height = out_heights[layers[0]] + prev_filters = out_filters[layers[0]] + elif len(layers) == 2: + print('%5d %-6s %d %d' % (ind, 'route', layers[0], layers[1])) + prev_width = out_widths[layers[0]] + prev_height = out_heights[layers[0]] + assert (prev_width == out_widths[layers[1]]) + assert (prev_height == out_heights[layers[1]]) + prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + elif len(layers) 
== 4: + print('%5d %-6s %d %d %d %d' % (ind, 'route', layers[0], layers[1], layers[2], layers[3])) + prev_width = out_widths[layers[0]] + prev_height = out_heights[layers[0]] + assert (prev_width == out_widths[layers[1]] == out_widths[layers[2]] == out_widths[layers[3]]) + assert (prev_height == out_heights[layers[1]] == out_heights[layers[2]] == out_heights[layers[3]]) + prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + out_filters[layers[2]] + out_filters[ + layers[3]] + else: + print("route error !!! {} {} {}".format(sys._getframe().f_code.co_filename, + sys._getframe().f_code.co_name, sys._getframe().f_lineno)) + + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] in ['region', 'yolo']: + print('%5d %-6s' % (ind, 'detection')) + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'shortcut': + from_id = int(block['from']) + from_id = from_id if from_id > 0 else from_id + ind + print('%5d %-6s %d' % (ind, 'shortcut', from_id)) + prev_width = out_widths[from_id] + prev_height = out_heights[from_id] + prev_filters = out_filters[from_id] + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'connected': + filters = int(block['output']) + print('%5d %-6s %d -> %3d' % (ind, 'connected', prev_filters, filters)) + prev_filters = filters + out_widths.append(1) + out_heights.append(1) + out_filters.append(prev_filters) + else: + print('unknown type %s' % (block['type'])) + + +def load_conv(buf, start, conv_model): + num_w = conv_model.weight.numel() + num_b = conv_model.bias.numel() + conv_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape)); + start = start + num_w + return start + + +def save_conv(fp, conv_model): + if conv_model.bias.is_cuda: + convert2cpu(conv_model.bias.data).numpy().tofile(fp) + convert2cpu(conv_model.weight.data).numpy().tofile(fp) + else: + conv_model.bias.data.numpy().tofile(fp) + conv_model.weight.data.numpy().tofile(fp) + + +def load_conv_bn(buf, start, conv_model, bn_model): + num_w = conv_model.weight.numel() + num_b = bn_model.bias.numel() + bn_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + bn_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + bn_model.running_mean.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + bn_model.running_var.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape)); + start = start + num_w + return start + + +def save_conv_bn(fp, conv_model, bn_model): + if bn_model.bias.is_cuda: + convert2cpu(bn_model.bias.data).numpy().tofile(fp) + convert2cpu(bn_model.weight.data).numpy().tofile(fp) + convert2cpu(bn_model.running_mean).numpy().tofile(fp) + convert2cpu(bn_model.running_var).numpy().tofile(fp) + convert2cpu(conv_model.weight.data).numpy().tofile(fp) + else: + bn_model.bias.data.numpy().tofile(fp) + bn_model.weight.data.numpy().tofile(fp) + bn_model.running_mean.numpy().tofile(fp) + bn_model.running_var.numpy().tofile(fp) + conv_model.weight.data.numpy().tofile(fp) + + +def load_fc(buf, start, fc_model): + num_w = 
fc_model.weight.numel() + num_b = fc_model.bias.numel() + fc_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + fc_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w])); + start = start + num_w + return start + + +def save_fc(fp, fc_model): + fc_model.bias.data.numpy().tofile(fp) + fc_model.weight.data.numpy().tofile(fp) + + +if __name__ == '__main__': + import sys + + blocks = parse_cfg('cfg/yolo.cfg') + if len(sys.argv) == 2: + blocks = parse_cfg(sys.argv[1]) + print_cfg(blocks) diff --git a/easymocap/estimator/YOLOv4/darknet2pytorch.py b/easymocap/estimator/YOLOv4/darknet2pytorch.py new file mode 100644 index 0000000..dfbfc45 --- /dev/null +++ b/easymocap/estimator/YOLOv4/darknet2pytorch.py @@ -0,0 +1,515 @@ +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from .region_loss import RegionLoss +from .yolo_layer import YoloLayer +from .config import * +from .torch_utils import * + + +class Mish(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + x = x * (torch.tanh(torch.nn.functional.softplus(x))) + return x + + +class MaxPoolDark(nn.Module): + def __init__(self, size=2, stride=1): + super(MaxPoolDark, self).__init__() + self.size = size + self.stride = stride + + def forward(self, x): + ''' + darknet output_size = (input_size + p - k) / s +1 + p : padding = k - 1 + k : size + s : stride + torch output_size = (input_size + 2*p -k) / s +1 + p : padding = k//2 + ''' + p = self.size // 2 + if ((x.shape[2] - 1) // self.stride) != ((x.shape[2] + 2 * p - self.size) // self.stride): + padding1 = (self.size - 1) // 2 + padding2 = padding1 + 1 + else: + padding1 = (self.size - 1) // 2 + padding2 = padding1 + if ((x.shape[3] - 1) // self.stride) != ((x.shape[3] + 2 * p - self.size) // self.stride): + padding3 = (self.size - 1) // 2 + padding4 = padding3 + 1 + else: + padding3 = (self.size - 1) // 2 + padding4 = padding3 + x = F.max_pool2d(F.pad(x, (padding3, padding4, padding1, padding2), mode='replicate'), + self.size, stride=self.stride) + return x + + +class Upsample_expand(nn.Module): + def __init__(self, stride=2): + super(Upsample_expand, self).__init__() + self.stride = stride + + def forward(self, x): + assert (x.data.dim() == 4) + + x = x.view(x.size(0), x.size(1), x.size(2), 1, x.size(3), 1).\ + expand(x.size(0), x.size(1), x.size(2), self.stride, x.size(3), self.stride).contiguous().\ + view(x.size(0), x.size(1), x.size(2) * self.stride, x.size(3) * self.stride) + + return x + + +class Upsample_interpolate(nn.Module): + def __init__(self, stride): + super(Upsample_interpolate, self).__init__() + self.stride = stride + + def forward(self, x): + assert (x.data.dim() == 4) + + out = F.interpolate(x, size=(x.size(2) * self.stride, x.size(3) * self.stride), mode='nearest') + return out + + +class Reorg(nn.Module): + def __init__(self, stride=2): + super(Reorg, self).__init__() + self.stride = stride + + def forward(self, x): + stride = self.stride + assert (x.data.dim() == 4) + B = x.data.size(0) + C = x.data.size(1) + H = x.data.size(2) + W = x.data.size(3) + assert (H % stride == 0) + assert (W % stride == 0) + ws = stride + hs = stride + x = x.view(B, C, H / hs, hs, W / ws, ws).transpose(3, 4).contiguous() + x = x.view(B, C, H / hs * W / ws, hs * ws).transpose(2, 3).contiguous() + x = x.view(B, C, hs * ws, H / hs, W / ws).transpose(1, 2).contiguous() + x = x.view(B, hs * ws * C, H / hs, W / ws) + return x + + +class GlobalAvgPool2d(nn.Module): + def __init__(self): + 
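# global average pooling: the forward pass below averages each HxW feature map down to a single value per channel and returns an (N, C) tensor +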
super(GlobalAvgPool2d, self).__init__() + + def forward(self, x): + N = x.data.size(0) + C = x.data.size(1) + H = x.data.size(2) + W = x.data.size(3) + x = F.avg_pool2d(x, (H, W)) + x = x.view(N, C) + return x + + +# for route and shortcut +class EmptyModule(nn.Module): + def __init__(self): + super(EmptyModule, self).__init__() + + def forward(self, x): + return x + + +# support route shortcut and reorg +class Darknet(nn.Module): + def __init__(self, cfgfile, inference=False): + super(Darknet, self).__init__() + self.inference = inference + self.training = not self.inference + + self.blocks = parse_cfg(cfgfile) + self.width = int(self.blocks[0]['width']) + self.height = int(self.blocks[0]['height']) + + self.models = self.create_network(self.blocks) # merge conv, bn,leaky + self.loss = self.models[len(self.models) - 1] + + if self.blocks[(len(self.blocks) - 1)]['type'] == 'region': + self.anchors = self.loss.anchors + self.num_anchors = self.loss.num_anchors + self.anchor_step = self.loss.anchor_step + self.num_classes = self.loss.num_classes + + self.header = torch.IntTensor([0, 0, 0, 0]) + self.seen = 0 + + def forward(self, x): + ind = -2 + self.loss = None + outputs = dict() + out_boxes = [] + for block in self.blocks: + ind = ind + 1 + # if ind > 0: + # return x + + if block['type'] == 'net': + continue + elif block['type'] in ['convolutional', 'maxpool', 'reorg', 'upsample', 'avgpool', 'softmax', 'connected']: + x = self.models[ind](x) + outputs[ind] = x + elif block['type'] == 'route': + layers = block['layers'].split(',') + layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] + if len(layers) == 1: + if 'groups' not in block.keys() or int(block['groups']) == 1: + x = outputs[layers[0]] + outputs[ind] = x + else: + groups = int(block['groups']) + group_id = int(block['group_id']) + _, b, _, _ = outputs[layers[0]].shape + x = outputs[layers[0]][:, b // groups * group_id:b // groups * (group_id + 1)] + outputs[ind] = x + elif len(layers) == 2: + x1 = outputs[layers[0]] + x2 = outputs[layers[1]] + x = torch.cat((x1, x2), 1) + outputs[ind] = x + elif len(layers) == 4: + x1 = outputs[layers[0]] + x2 = outputs[layers[1]] + x3 = outputs[layers[2]] + x4 = outputs[layers[3]] + x = torch.cat((x1, x2, x3, x4), 1) + outputs[ind] = x + else: + print("rounte number > 2 ,is {}".format(len(layers))) + + elif block['type'] == 'shortcut': + from_layer = int(block['from']) + activation = block['activation'] + from_layer = from_layer if from_layer > 0 else from_layer + ind + x1 = outputs[from_layer] + x2 = outputs[ind - 1] + x = x1 + x2 + if activation == 'leaky': + x = F.leaky_relu(x, 0.1, inplace=True) + elif activation == 'relu': + x = F.relu(x, inplace=True) + outputs[ind] = x + elif block['type'] == 'region': + continue + if self.loss: + self.loss = self.loss + self.models[ind](x) + else: + self.loss = self.models[ind](x) + outputs[ind] = None + elif block['type'] == 'yolo': + # if self.training: + # pass + # else: + # boxes = self.models[ind](x) + # out_boxes.append(boxes) + boxes = self.models[ind](x) + out_boxes.append(boxes) + elif block['type'] == 'cost': + continue + else: + print('unknown type %s' % (block['type'])) + + if self.training: + return out_boxes + else: + return get_region_boxes(out_boxes) + + def print_network(self): + print_cfg(self.blocks) + + def create_network(self, blocks): + models = nn.ModuleList() + + prev_filters = 3 + out_filters = [] + prev_stride = 1 + out_strides = [] + conv_id = 0 + for block in blocks: + if block['type'] == 'net': + prev_filters = 
int(block['channels']) + continue + elif block['type'] == 'convolutional': + conv_id = conv_id + 1 + batch_normalize = int(block['batch_normalize']) + filters = int(block['filters']) + kernel_size = int(block['size']) + stride = int(block['stride']) + is_pad = int(block['pad']) + pad = (kernel_size - 1) // 2 if is_pad else 0 + activation = block['activation'] + model = nn.Sequential() + if batch_normalize: + model.add_module('conv{0}'.format(conv_id), + nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias=False)) + model.add_module('bn{0}'.format(conv_id), nn.BatchNorm2d(filters)) + # model.add_module('bn{0}'.format(conv_id), BN2d(filters)) + else: + model.add_module('conv{0}'.format(conv_id), + nn.Conv2d(prev_filters, filters, kernel_size, stride, pad)) + if activation == 'leaky': + model.add_module('leaky{0}'.format(conv_id), nn.LeakyReLU(0.1, inplace=True)) + elif activation == 'relu': + model.add_module('relu{0}'.format(conv_id), nn.ReLU(inplace=True)) + elif activation == 'mish': + model.add_module('mish{0}'.format(conv_id), Mish()) + else: + pass + # print("convalution havn't activate {}".format(activation)) + + prev_filters = filters + out_filters.append(prev_filters) + prev_stride = stride * prev_stride + out_strides.append(prev_stride) + models.append(model) + elif block['type'] == 'maxpool': + pool_size = int(block['size']) + stride = int(block['stride']) + if stride == 1 and pool_size % 2: + # You can use Maxpooldark instead, here is convenient to convert onnx. + # Example: [maxpool] size=3 stride=1 + model = nn.MaxPool2d(kernel_size=pool_size, stride=stride, padding=pool_size // 2) + elif stride == pool_size: + # You can use Maxpooldark instead, here is convenient to convert onnx. + # Example: [maxpool] size=2 stride=2 + model = nn.MaxPool2d(kernel_size=pool_size, stride=stride, padding=0) + else: + model = MaxPoolDark(pool_size, stride) + out_filters.append(prev_filters) + prev_stride = stride * prev_stride + out_strides.append(prev_stride) + models.append(model) + elif block['type'] == 'avgpool': + model = GlobalAvgPool2d() + out_filters.append(prev_filters) + models.append(model) + elif block['type'] == 'softmax': + model = nn.Softmax() + out_strides.append(prev_stride) + out_filters.append(prev_filters) + models.append(model) + elif block['type'] == 'cost': + if block['_type'] == 'sse': + model = nn.MSELoss(reduction='mean') + elif block['_type'] == 'L1': + model = nn.L1Loss(reduction='mean') + elif block['_type'] == 'smooth': + model = nn.SmoothL1Loss(reduction='mean') + out_filters.append(1) + out_strides.append(prev_stride) + models.append(model) + elif block['type'] == 'reorg': + stride = int(block['stride']) + prev_filters = stride * stride * prev_filters + out_filters.append(prev_filters) + prev_stride = prev_stride * stride + out_strides.append(prev_stride) + models.append(Reorg(stride)) + elif block['type'] == 'upsample': + stride = int(block['stride']) + out_filters.append(prev_filters) + prev_stride = prev_stride // stride + out_strides.append(prev_stride) + + models.append(Upsample_expand(stride)) + # models.append(Upsample_interpolate(stride)) + + elif block['type'] == 'route': + layers = block['layers'].split(',') + ind = len(models) + layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] + if len(layers) == 1: + if 'groups' not in block.keys() or int(block['groups']) == 1: + prev_filters = out_filters[layers[0]] + prev_stride = out_strides[layers[0]] + else: + prev_filters = out_filters[layers[0]] // int(block['groups']) + prev_stride = 
out_strides[layers[0]] // int(block['groups']) + elif len(layers) == 2: + assert (layers[0] == ind - 1 or layers[1] == ind - 1) + prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + prev_stride = out_strides[layers[0]] + elif len(layers) == 4: + assert (layers[0] == ind - 1) + prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + out_filters[layers[2]] + \ + out_filters[layers[3]] + prev_stride = out_strides[layers[0]] + else: + print("route error!!!") + + out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(EmptyModule()) + elif block['type'] == 'shortcut': + ind = len(models) + prev_filters = out_filters[ind - 1] + out_filters.append(prev_filters) + prev_stride = out_strides[ind - 1] + out_strides.append(prev_stride) + models.append(EmptyModule()) + elif block['type'] == 'connected': + filters = int(block['output']) + if block['activation'] == 'linear': + model = nn.Linear(prev_filters, filters) + elif block['activation'] == 'leaky': + model = nn.Sequential( + nn.Linear(prev_filters, filters), + nn.LeakyReLU(0.1, inplace=True)) + elif block['activation'] == 'relu': + model = nn.Sequential( + nn.Linear(prev_filters, filters), + nn.ReLU(inplace=True)) + prev_filters = filters + out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(model) + elif block['type'] == 'region': + loss = RegionLoss() + anchors = block['anchors'].split(',') + loss.anchors = [float(i) for i in anchors] + loss.num_classes = int(block['classes']) + loss.num_anchors = int(block['num']) + loss.anchor_step = len(loss.anchors) // loss.num_anchors + loss.object_scale = float(block['object_scale']) + loss.noobject_scale = float(block['noobject_scale']) + loss.class_scale = float(block['class_scale']) + loss.coord_scale = float(block['coord_scale']) + out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(loss) + elif block['type'] == 'yolo': + yolo_layer = YoloLayer() + anchors = block['anchors'].split(',') + anchor_mask = block['mask'].split(',') + yolo_layer.anchor_mask = [int(i) for i in anchor_mask] + yolo_layer.anchors = [float(i) for i in anchors] + yolo_layer.num_classes = int(block['classes']) + self.num_classes = yolo_layer.num_classes + yolo_layer.num_anchors = int(block['num']) + yolo_layer.anchor_step = len(yolo_layer.anchors) // yolo_layer.num_anchors + yolo_layer.stride = prev_stride + yolo_layer.scale_x_y = float(block['scale_x_y']) + # yolo_layer.object_scale = float(block['object_scale']) + # yolo_layer.noobject_scale = float(block['noobject_scale']) + # yolo_layer.class_scale = float(block['class_scale']) + # yolo_layer.coord_scale = float(block['coord_scale']) + out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(yolo_layer) + else: + print('unknown type %s' % (block['type'])) + + return models + + def load_weights(self, weightfile): + fp = open(weightfile, 'rb') + header = np.fromfile(fp, count=5, dtype=np.int32) + self.header = torch.from_numpy(header) + self.seen = self.header[3] + buf = np.fromfile(fp, dtype=np.float32) + fp.close() + + start = 0 + ind = -2 + for block in self.blocks: + if start >= buf.size: + break + ind = ind + 1 + if block['type'] == 'net': + continue + elif block['type'] == 'convolutional': + model = self.models[ind] + batch_normalize = int(block['batch_normalize']) + if batch_normalize: + start = load_conv_bn(buf, start, model[0], model[1]) + else: + start = load_conv(buf, start, model[0]) + elif block['type'] == 'connected': + model = 
self.models[ind] + if block['activation'] != 'linear': + start = load_fc(buf, start, model[0]) + else: + start = load_fc(buf, start, model) + elif block['type'] == 'maxpool': + pass + elif block['type'] == 'reorg': + pass + elif block['type'] == 'upsample': + pass + elif block['type'] == 'route': + pass + elif block['type'] == 'shortcut': + pass + elif block['type'] == 'region': + pass + elif block['type'] == 'yolo': + pass + elif block['type'] == 'avgpool': + pass + elif block['type'] == 'softmax': + pass + elif block['type'] == 'cost': + pass + else: + print('unknown type %s' % (block['type'])) + + # def save_weights(self, outfile, cutoff=0): + # if cutoff <= 0: + # cutoff = len(self.blocks) - 1 + # + # fp = open(outfile, 'wb') + # self.header[3] = self.seen + # header = self.header + # header.numpy().tofile(fp) + # + # ind = -1 + # for blockId in range(1, cutoff + 1): + # ind = ind + 1 + # block = self.blocks[blockId] + # if block['type'] == 'convolutional': + # model = self.models[ind] + # batch_normalize = int(block['batch_normalize']) + # if batch_normalize: + # save_conv_bn(fp, model[0], model[1]) + # else: + # save_conv(fp, model[0]) + # elif block['type'] == 'connected': + # model = self.models[ind] + # if block['activation'] != 'linear': + # save_fc(fc, model) + # else: + # save_fc(fc, model[0]) + # elif block['type'] == 'maxpool': + # pass + # elif block['type'] == 'reorg': + # pass + # elif block['type'] == 'upsample': + # pass + # elif block['type'] == 'route': + # pass + # elif block['type'] == 'shortcut': + # pass + # elif block['type'] == 'region': + # pass + # elif block['type'] == 'yolo': + # pass + # elif block['type'] == 'avgpool': + # pass + # elif block['type'] == 'softmax': + # pass + # elif block['type'] == 'cost': + # pass + # else: + # print('unknown type %s' % (block['type'])) + # fp.close() diff --git a/easymocap/estimator/YOLOv4/region_loss.py b/easymocap/estimator/YOLOv4/region_loss.py new file mode 100644 index 0000000..1aa7f18 --- /dev/null +++ b/easymocap/estimator/YOLOv4/region_loss.py @@ -0,0 +1,195 @@ +import torch.nn as nn +import torch.nn.functional as F +from .torch_utils import * + + +def build_targets(pred_boxes, target, anchors, num_anchors, num_classes, nH, nW, noobject_scale, object_scale, + sil_thresh, seen): + nB = target.size(0) + nA = num_anchors + nC = num_classes + anchor_step = len(anchors) / num_anchors + conf_mask = torch.ones(nB, nA, nH, nW) * noobject_scale + coord_mask = torch.zeros(nB, nA, nH, nW) + cls_mask = torch.zeros(nB, nA, nH, nW) + tx = torch.zeros(nB, nA, nH, nW) + ty = torch.zeros(nB, nA, nH, nW) + tw = torch.zeros(nB, nA, nH, nW) + th = torch.zeros(nB, nA, nH, nW) + tconf = torch.zeros(nB, nA, nH, nW) + tcls = torch.zeros(nB, nA, nH, nW) + + nAnchors = nA * nH * nW + nPixels = nH * nW + for b in range(nB): + cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t() + cur_ious = torch.zeros(nAnchors) + for t in range(50): + if target[b][t * 5 + 1] == 0: + break + gx = target[b][t * 5 + 1] * nW + gy = target[b][t * 5 + 2] * nH + gw = target[b][t * 5 + 3] * nW + gh = target[b][t * 5 + 4] * nH + cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors, 1).t() + cur_ious = torch.max(cur_ious, bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False)) + conf_mask[b][cur_ious > sil_thresh] = 0 + if seen < 12800: + if anchor_step == 4: + tx = torch.FloatTensor(anchors).view(nA, anchor_step).index_select(1, torch.LongTensor([2])).view(1, nA, 1, + 1).repeat( + nB, 1, nH, nW) + ty = 
torch.FloatTensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([2])).view( + 1, nA, 1, 1).repeat(nB, 1, nH, nW) + else: + tx.fill_(0.5) + ty.fill_(0.5) + tw.zero_() + th.zero_() + coord_mask.fill_(1) + + nGT = 0 + nCorrect = 0 + for b in range(nB): + for t in range(50): + if target[b][t * 5 + 1] == 0: + break + nGT = nGT + 1 + best_iou = 0.0 + best_n = -1 + min_dist = 10000 + gx = target[b][t * 5 + 1] * nW + gy = target[b][t * 5 + 2] * nH + gi = int(gx) + gj = int(gy) + gw = target[b][t * 5 + 3] * nW + gh = target[b][t * 5 + 4] * nH + gt_box = [0, 0, gw, gh] + for n in range(nA): + aw = anchors[anchor_step * n] + ah = anchors[anchor_step * n + 1] + anchor_box = [0, 0, aw, ah] + iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False) + if anchor_step == 4: + ax = anchors[anchor_step * n + 2] + ay = anchors[anchor_step * n + 3] + dist = pow(((gi + ax) - gx), 2) + pow(((gj + ay) - gy), 2) + if iou > best_iou: + best_iou = iou + best_n = n + elif anchor_step == 4 and iou == best_iou and dist < min_dist: + best_iou = iou + best_n = n + min_dist = dist + + gt_box = [gx, gy, gw, gh] + pred_box = pred_boxes[b * nAnchors + best_n * nPixels + gj * nW + gi] + + coord_mask[b][best_n][gj][gi] = 1 + cls_mask[b][best_n][gj][gi] = 1 + conf_mask[b][best_n][gj][gi] = object_scale + tx[b][best_n][gj][gi] = target[b][t * 5 + 1] * nW - gi + ty[b][best_n][gj][gi] = target[b][t * 5 + 2] * nH - gj + tw[b][best_n][gj][gi] = math.log(gw / anchors[anchor_step * best_n]) + th[b][best_n][gj][gi] = math.log(gh / anchors[anchor_step * best_n + 1]) + iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) # best_iou + tconf[b][best_n][gj][gi] = iou + tcls[b][best_n][gj][gi] = target[b][t * 5] + if iou > 0.5: + nCorrect = nCorrect + 1 + + return nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls + + +class RegionLoss(nn.Module): + def __init__(self, num_classes=0, anchors=[], num_anchors=1): + super(RegionLoss, self).__init__() + self.num_classes = num_classes + self.anchors = anchors + self.num_anchors = num_anchors + self.anchor_step = len(anchors) / num_anchors + self.coord_scale = 1 + self.noobject_scale = 1 + self.object_scale = 5 + self.class_scale = 1 + self.thresh = 0.6 + self.seen = 0 + + def forward(self, output, target): + # output : BxAs*(4+1+num_classes)*H*W + t0 = time.time() + nB = output.data.size(0) + nA = self.num_anchors + nC = self.num_classes + nH = output.data.size(2) + nW = output.data.size(3) + + output = output.view(nB, nA, (5 + nC), nH, nW) + x = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([0]))).view(nB, nA, nH, nW)) + y = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([1]))).view(nB, nA, nH, nW)) + w = output.index_select(2, Variable(torch.cuda.LongTensor([2]))).view(nB, nA, nH, nW) + h = output.index_select(2, Variable(torch.cuda.LongTensor([3]))).view(nB, nA, nH, nW) + conf = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([4]))).view(nB, nA, nH, nW)) + cls = output.index_select(2, Variable(torch.linspace(5, 5 + nC - 1, nC).long().cuda())) + cls = cls.view(nB * nA, nC, nH * nW).transpose(1, 2).contiguous().view(nB * nA * nH * nW, nC) + t1 = time.time() + + pred_boxes = torch.cuda.FloatTensor(4, nB * nA * nH * nW) + grid_x = torch.linspace(0, nW - 1, nW).repeat(nH, 1).repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda() + grid_y = torch.linspace(0, nH - 1, nH).repeat(nW, 1).t().repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda() + anchor_w = torch.Tensor(self.anchors).view(nA, 
self.anchor_step).index_select(1, torch.LongTensor([0])).cuda() + anchor_h = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([1])).cuda() + anchor_w = anchor_w.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW) + anchor_h = anchor_h.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW) + pred_boxes[0] = x.data + grid_x + pred_boxes[1] = y.data + grid_y + pred_boxes[2] = torch.exp(w.data) * anchor_w + pred_boxes[3] = torch.exp(h.data) * anchor_h + pred_boxes = convert2cpu(pred_boxes.transpose(0, 1).contiguous().view(-1, 4)) + t2 = time.time() + + nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes, + target.data, + self.anchors, nA, + nC, \ + nH, nW, + self.noobject_scale, + self.object_scale, + self.thresh, + self.seen) + cls_mask = (cls_mask == 1) + nProposals = int((conf > 0.25).sum().data[0]) + + tx = Variable(tx.cuda()) + ty = Variable(ty.cuda()) + tw = Variable(tw.cuda()) + th = Variable(th.cuda()) + tconf = Variable(tconf.cuda()) + tcls = Variable(tcls.view(-1)[cls_mask].long().cuda()) + + coord_mask = Variable(coord_mask.cuda()) + conf_mask = Variable(conf_mask.cuda().sqrt()) + cls_mask = Variable(cls_mask.view(-1, 1).repeat(1, nC).cuda()) + cls = cls[cls_mask].view(-1, nC) + + t3 = time.time() + + loss_x = self.coord_scale * nn.MSELoss(reduction='sum')(x * coord_mask, tx * coord_mask) / 2.0 + loss_y = self.coord_scale * nn.MSELoss(reduction='sum')(y * coord_mask, ty * coord_mask) / 2.0 + loss_w = self.coord_scale * nn.MSELoss(reduction='sum')(w * coord_mask, tw * coord_mask) / 2.0 + loss_h = self.coord_scale * nn.MSELoss(reduction='sum')(h * coord_mask, th * coord_mask) / 2.0 + loss_conf = nn.MSELoss(reduction='sum')(conf * conf_mask, tconf * conf_mask) / 2.0 + loss_cls = self.class_scale * nn.CrossEntropyLoss(reduction='sum')(cls, tcls) + loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls + t4 = time.time() + if False: + print('-----------------------------------') + print(' activation : %f' % (t1 - t0)) + print(' create pred_boxes : %f' % (t2 - t1)) + print(' build targets : %f' % (t3 - t2)) + print(' create loss : %f' % (t4 - t3)) + print(' total : %f' % (t4 - t0)) + print('%d: nGT %d, recall %d, proposals %d, loss: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f' % ( + self.seen, nGT, nCorrect, nProposals, loss_x.data[0], loss_y.data[0], loss_w.data[0], loss_h.data[0], + loss_conf.data[0], loss_cls.data[0], loss.data[0])) + return loss diff --git a/easymocap/estimator/YOLOv4/torch_utils.py b/easymocap/estimator/YOLOv4/torch_utils.py new file mode 100644 index 0000000..453bc1c --- /dev/null +++ b/easymocap/estimator/YOLOv4/torch_utils.py @@ -0,0 +1,98 @@ +import sys +import os +import time +import math +import torch +import numpy as np +from torch.autograd import Variable + + +def bbox_ious(boxes1, boxes2, x1y1x2y2=True): + if x1y1x2y2: + mx = torch.min(boxes1[0], boxes2[0]) + Mx = torch.max(boxes1[2], boxes2[2]) + my = torch.min(boxes1[1], boxes2[1]) + My = torch.max(boxes1[3], boxes2[3]) + w1 = boxes1[2] - boxes1[0] + h1 = boxes1[3] - boxes1[1] + w2 = boxes2[2] - boxes2[0] + h2 = boxes2[3] - boxes2[1] + else: + mx = torch.min(boxes1[0] - boxes1[2] / 2.0, boxes2[0] - boxes2[2] / 2.0) + Mx = torch.max(boxes1[0] + boxes1[2] / 2.0, boxes2[0] + boxes2[2] / 2.0) + my = torch.min(boxes1[1] - boxes1[3] / 2.0, boxes2[1] - boxes2[3] / 2.0) + My = torch.max(boxes1[1] + boxes1[3] / 2.0, boxes2[1] + boxes2[3] / 2.0) + w1 = boxes1[2] + h1 = boxes1[3] + w2 = boxes2[2] + h2 
= boxes2[3] + uw = Mx - mx + uh = My - my + cw = w1 + w2 - uw + ch = h1 + h2 - uh + mask = ((cw <= 0) + (ch <= 0) > 0) + area1 = w1 * h1 + area2 = w2 * h2 + carea = cw * ch + carea[mask] = 0 + uarea = area1 + area2 - carea + return carea / uarea + + +def get_region_boxes(boxes_and_confs): + + # print('Getting boxes from boxes and confs ...') + + boxes_list = [] + confs_list = [] + + for item in boxes_and_confs: + boxes_list.append(item[0]) + confs_list.append(item[1]) + + # boxes: [batch, num1 + num2 + num3, 1, 4] + # confs: [batch, num1 + num2 + num3, num_classes] + boxes = torch.cat(boxes_list, dim=1) + confs = torch.cat(confs_list, dim=1) + + return [boxes, confs] + + +def convert2cpu(gpu_matrix): + return torch.FloatTensor(gpu_matrix.size()).copy_(gpu_matrix) + + +def convert2cpu_long(gpu_matrix): + return torch.LongTensor(gpu_matrix.size()).copy_(gpu_matrix) + + + +def do_detect(model, img, conf_thresh, nms_thresh, use_cuda=1): + model.eval() + t0 = time.time() + + if type(img) == np.ndarray and len(img.shape) == 3: # cv2 image + img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0) + elif type(img) == np.ndarray and len(img.shape) == 4: + img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0) + else: + print("unknow image type") + exit(-1) + + if use_cuda: + img = img.cuda() + img = torch.autograd.Variable(img) + + t1 = time.time() + + output = model(img) + + t2 = time.time() + + print('-----------------------------------') + print(' Preprocess : %f' % (t1 - t0)) + print(' Model Inference : %f' % (t2 - t1)) + print('-----------------------------------') + + return utils.post_processing(img, conf_thresh, nms_thresh, output) + diff --git a/easymocap/estimator/YOLOv4/utils.py b/easymocap/estimator/YOLOv4/utils.py new file mode 100644 index 0000000..9b69d3c --- /dev/null +++ b/easymocap/estimator/YOLOv4/utils.py @@ -0,0 +1,221 @@ +import sys +import os +import time +import math +import numpy as np + +import itertools +import struct # get_image_size +import imghdr # get_image_size + + +def sigmoid(x): + return 1.0 / (np.exp(-x) + 1.) 
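+# The softmax below subtracts the per-row maximum before exponentiating; this is
+# the standard numerically stable formulation and leaves the result unchanged,
+# since the constant shift cancels in the normalisation,
+# e.g. softmax(np.array([[1., 2., 3.]])) -> [[0.0900, 0.2447, 0.6652]].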
+ + +def softmax(x): + x = np.exp(x - np.expand_dims(np.max(x, axis=1), axis=1)) + x = x / np.expand_dims(x.sum(axis=1), axis=1) + return x + + +def bbox_iou(box1, box2, x1y1x2y2=True): + + # print('iou box1:', box1) + # print('iou box2:', box2) + + if x1y1x2y2: + mx = min(box1[0], box2[0]) + Mx = max(box1[2], box2[2]) + my = min(box1[1], box2[1]) + My = max(box1[3], box2[3]) + w1 = box1[2] - box1[0] + h1 = box1[3] - box1[1] + w2 = box2[2] - box2[0] + h2 = box2[3] - box2[1] + else: + w1 = box1[2] + h1 = box1[3] + w2 = box2[2] + h2 = box2[3] + + mx = min(box1[0], box2[0]) + Mx = max(box1[0] + w1, box2[0] + w2) + my = min(box1[1], box2[1]) + My = max(box1[1] + h1, box2[1] + h2) + uw = Mx - mx + uh = My - my + cw = w1 + w2 - uw + ch = h1 + h2 - uh + carea = 0 + if cw <= 0 or ch <= 0: + return 0.0 + + area1 = w1 * h1 + area2 = w2 * h2 + carea = cw * ch + uarea = area1 + area2 - carea + return carea / uarea + + +def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False): + # print(boxes.shape) + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + + areas = (x2 - x1) * (y2 - y1) + order = confs.argsort()[::-1] + + keep = [] + while order.size > 0: + idx_self = order[0] + idx_other = order[1:] + + keep.append(idx_self) + + xx1 = np.maximum(x1[idx_self], x1[idx_other]) + yy1 = np.maximum(y1[idx_self], y1[idx_other]) + xx2 = np.minimum(x2[idx_self], x2[idx_other]) + yy2 = np.minimum(y2[idx_self], y2[idx_other]) + + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + inter = w * h + + if min_mode: + over = inter / np.minimum(areas[order[0]], areas[order[1:]]) + else: + over = inter / (areas[order[0]] + areas[order[1:]] - inter) + + inds = np.where(over <= nms_thresh)[0] + order = order[inds + 1] + + return np.array(keep) + + + +def plot_boxes_cv2(img, boxes, savename=None, class_names=None, color=None): + import cv2 + img = np.copy(img) + colors = np.array([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]], dtype=np.float32) + + def get_color(c, x, max_val): + ratio = float(x) / max_val * 5 + i = int(math.floor(ratio)) + j = int(math.ceil(ratio)) + ratio = ratio - i + r = (1 - ratio) * colors[i][c] + ratio * colors[j][c] + return int(r * 255) + + width = img.shape[1] + height = img.shape[0] + for i in range(len(boxes)): + box = boxes[i] + x1 = int(box[0] * width) + y1 = int(box[1] * height) + x2 = int(box[2] * width) + y2 = int(box[3] * height) + + if color: + rgb = color + else: + rgb = (255, 0, 0) + if len(box) >= 7 and class_names: + cls_conf = box[5] + cls_id = box[6] + print('%s: %f' % (class_names[cls_id], cls_conf)) + classes = len(class_names) + offset = cls_id * 123457 % classes + red = get_color(2, offset, classes) + green = get_color(1, offset, classes) + blue = get_color(0, offset, classes) + if color is None: + rgb = (red, green, blue) + img = cv2.putText(img, class_names[cls_id], (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1.2, rgb, 1) + img = cv2.rectangle(img, (x1, y1), (x2, y2), rgb, 1) + if savename: + print("save plot results to %s" % savename) + cv2.imwrite(savename, img) + return img + + +def read_truths(lab_path): + if not os.path.exists(lab_path): + return np.array([]) + if os.path.getsize(lab_path): + truths = np.loadtxt(lab_path) + truths = truths.reshape(truths.size / 5, 5) # to avoid single truth problem + return truths + else: + return np.array([]) + +def post_processing(img, conf_thresh, nms_thresh, output): + + # anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401] + # num_anchors = 9 + # 
anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]] + # strides = [8, 16, 32] + # anchor_step = len(anchors) // num_anchors + + # [batch, num, 1, 4] + box_array = output[0] + # [batch, num, num_classes] + confs = output[1] + + t1 = time.time() + + if type(box_array).__name__ != 'ndarray': + box_array = box_array.cpu().detach().numpy() + confs = confs.cpu().detach().numpy() + + num_classes = confs.shape[2] + + # [batch, num, 4] + box_array = box_array[:, :, 0] + + # [batch, num, num_classes] --> [batch, num] + max_conf = np.max(confs, axis=2) + max_id = np.argmax(confs, axis=2) + + t2 = time.time() + + bboxes_batch = [] + for i in range(box_array.shape[0]): + + argwhere = max_conf[i] > conf_thresh + l_box_array = box_array[i, argwhere, :] + l_max_conf = max_conf[i, argwhere] + l_max_id = max_id[i, argwhere] + + bboxes = [] + # nms for each class + for j in range(num_classes): + + cls_argwhere = l_max_id == j + ll_box_array = l_box_array[cls_argwhere, :] + ll_max_conf = l_max_conf[cls_argwhere] + ll_max_id = l_max_id[cls_argwhere] + + keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh) + + if (keep.size > 0): + ll_box_array = ll_box_array[keep, :] + ll_max_conf = ll_max_conf[keep] + ll_max_id = ll_max_id[keep] + + for k in range(ll_box_array.shape[0]): + bboxes.append([ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2], ll_box_array[k, 3], ll_max_conf[k], ll_max_conf[k], ll_max_id[k]]) + + bboxes_batch.append(bboxes) + + t3 = time.time() + + print('-----------------------------------') + print(' max and argmax : %f' % (t2 - t1)) + print(' nms : %f' % (t3 - t2)) + print('Post processing total : %f' % (t3 - t1)) + print('-----------------------------------') + + return bboxes_batch diff --git a/easymocap/estimator/YOLOv4/yolo.py b/easymocap/estimator/YOLOv4/yolo.py new file mode 100644 index 0000000..e3b7fad --- /dev/null +++ b/easymocap/estimator/YOLOv4/yolo.py @@ -0,0 +1,161 @@ +''' + @ Date: 2020-12-10 16:39:51 + @ Author: Qing Shuai + @ LastEditors: Qing Shuai + @ LastEditTime: 2022-04-21 23:53:40 + @ FilePath: /EasyMocapPublic/easymocap/estimator/YOLOv4/yolo.py +''' +from .darknet2pytorch import Darknet +import cv2 +import torch +from os.path import join +import os +import numpy as np + +def load_class_names(namesfile): + class_names = [] + with open(namesfile, 'r') as fp: + lines = fp.readlines() + for line in lines: + line = line.rstrip() + class_names.append(line) + return class_names + +def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False): + # print(boxes.shape) + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + + areas = (x2 - x1) * (y2 - y1) + order = confs.argsort()[::-1] + + keep = [] + while order.size > 0: + idx_self = order[0] + idx_other = order[1:] + + keep.append(idx_self) + + xx1 = np.maximum(x1[idx_self], x1[idx_other]) + yy1 = np.maximum(y1[idx_self], y1[idx_other]) + xx2 = np.minimum(x2[idx_self], x2[idx_other]) + yy2 = np.minimum(y2[idx_self], y2[idx_other]) + + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + inter = w * h + + if min_mode: + over = inter / np.minimum(areas[order[0]], areas[order[1:]]) + else: + over = inter / (areas[order[0]] + areas[order[1:]] - inter) + + inds = np.where(over <= nms_thresh)[0] + order = order[inds + 1] + return np.array(keep) + +def post_processing(conf_thresh, nms_thresh, output): + # [batch, num, 1, 4] + box_array = output[0] + # [batch, num, num_classes] + confs = output[1] + + if type(box_array).__name__ != 'ndarray': + box_array = box_array.cpu().detach().numpy() + confs = 
confs.cpu().detach().numpy() + + num_classes = confs.shape[2] + + # [batch, num, 4] + box_array = box_array[:, :, 0] + + # [batch, num, num_classes] --> [batch, num] + max_conf = np.max(confs, axis=2) + max_id = np.argmax(confs, axis=2) + + bboxes_batch = [] + for i in range(box_array.shape[0]): + argwhere = max_conf[i] > conf_thresh + l_box_array = box_array[i, argwhere, :] + l_max_conf = max_conf[i, argwhere] + l_max_id = max_id[i, argwhere] + + bboxes = [] + # nms for class person + j = 0 + cls_argwhere = l_max_id == j + ll_box_array = l_box_array[cls_argwhere, :] + ll_max_conf = l_max_conf[cls_argwhere] + ll_max_id = l_max_id[cls_argwhere] + + keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh) + + if (keep.size > 0): + ll_box_array = ll_box_array[keep, :] + ll_max_conf = ll_max_conf[keep] + ll_max_id = ll_max_id[keep] + bboxes = np.hstack([ll_box_array, ll_max_conf[:, None]]) + + bboxes_batch.append(bboxes) + + return bboxes_batch + +class YOLOv4: + def __init__(self, device, ckpt_path, box_nms_thres, conf_thres, + isWild=False) -> None: + dirname = os.path.dirname(__file__) + cfgfile = join(dirname, 'yolov4.cfg') + namesfile = join(dirname, 'coco.names') + self.model = Darknet(cfgfile) + self.model.load_weights(ckpt_path) + self.model.to(device) + self.model.eval() + class_names = load_class_names(namesfile) + self.device = device + self.box_nms_thres = box_nms_thres + self.conf_thres = conf_thres + self.isWild = isWild + + def predict_single(self, image): + width = image.shape[1] + height = image.shape[0] + tgt_width = self.model.width + # 先缩小,再padding + if width > height: + tgt_shape = (tgt_width, int(height/width*tgt_width)) + resize = cv2.resize(image, tgt_shape) + sized = np.zeros((tgt_width, tgt_width, 3), dtype=np.uint8) + start = (sized.shape[0] - resize.shape[0])//2 + sized[start:start+resize.shape[0], :, :] = resize + # pad_to_square + elif width == height: + sized = cv2.resize(image, (tgt_width, tgt_width)) + start = 0 + else: + tgt_shape = (int(width/height*tgt_width), tgt_width) + resize = cv2.resize(image, tgt_shape) + sized = np.zeros((tgt_width, tgt_width, 3), dtype=np.uint8) + start = (sized.shape[1] - resize.shape[1]) // 2 + sized[:, start:start+resize.shape[1], :] = resize + img = torch.from_numpy(sized.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0) + img = img.to(self.device) + with torch.no_grad(): + output = self.model(img) + bboxes = post_processing(self.conf_thres, self.box_nms_thres, output)[0] + if len(bboxes) == 0: + return bboxes + if self.isWild: + flag = ((bboxes[:, 2] - bboxes[:, 0]) < 0.8)&(((bboxes[:, 2] - bboxes[:, 0]) > 0.1)|((bboxes[:, 3] - bboxes[:, 1]) > 0.1)) + bboxes = bboxes[flag] + if width >= height: + bboxes[:, :4] *= width + bboxes[:, 1] -= start*width/tgt_width + bboxes[:, 3] -= start*width/tgt_width + else: + bboxes[:, :4] *= height + bboxes[:, 0] -= start*height/tgt_width + bboxes[:, 2] -= start*height/tgt_width + # return bounding box + return bboxes \ No newline at end of file diff --git a/easymocap/estimator/YOLOv4/yolo_layer.py b/easymocap/estimator/YOLOv4/yolo_layer.py new file mode 100644 index 0000000..3d3ae42 --- /dev/null +++ b/easymocap/estimator/YOLOv4/yolo_layer.py @@ -0,0 +1,322 @@ +import torch.nn as nn +import torch.nn.functional as F +from .torch_utils import * + +def yolo_forward(output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y, only_objectness=1, + validation=False): + # Output would be invalid if it does not satisfy this assert + # assert (output.size(1) == (5 + num_classes) * num_anchors) + + 
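+    # Each anchor occupies (5 + num_classes) channels laid out as
+    # (tx, ty, tw, th, objectness, class scores); the loop below slices the
+    # channel dimension into these per-anchor groups before decoding.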
# print(output.size()) + + # Slice the second dimension (channel) of output into: + # [ 2, 2, 1, num_classes, 2, 2, 1, num_classes, 2, 2, 1, num_classes ] + # And then into + # bxy = [ 6 ] bwh = [ 6 ] det_conf = [ 3 ] cls_conf = [ num_classes * 3 ] + batch = output.size(0) + H = output.size(2) + W = output.size(3) + + bxy_list = [] + bwh_list = [] + det_confs_list = [] + cls_confs_list = [] + + for i in range(num_anchors): + begin = i * (5 + num_classes) + end = (i + 1) * (5 + num_classes) + + bxy_list.append(output[:, begin : begin + 2]) + bwh_list.append(output[:, begin + 2 : begin + 4]) + det_confs_list.append(output[:, begin + 4 : begin + 5]) + cls_confs_list.append(output[:, begin + 5 : end]) + + # Shape: [batch, num_anchors * 2, H, W] + bxy = torch.cat(bxy_list, dim=1) + # Shape: [batch, num_anchors * 2, H, W] + bwh = torch.cat(bwh_list, dim=1) + + # Shape: [batch, num_anchors, H, W] + det_confs = torch.cat(det_confs_list, dim=1) + # Shape: [batch, num_anchors * H * W] + det_confs = det_confs.view(batch, num_anchors * H * W) + + # Shape: [batch, num_anchors * num_classes, H, W] + cls_confs = torch.cat(cls_confs_list, dim=1) + # Shape: [batch, num_anchors, num_classes, H * W] + cls_confs = cls_confs.view(batch, num_anchors, num_classes, H * W) + # Shape: [batch, num_anchors, num_classes, H * W] --> [batch, num_anchors * H * W, num_classes] + cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(batch, num_anchors * H * W, num_classes) + + # Apply sigmoid(), exp() and softmax() to slices + # + bxy = torch.sigmoid(bxy) * scale_x_y - 0.5 * (scale_x_y - 1) + bwh = torch.exp(bwh) + det_confs = torch.sigmoid(det_confs) + cls_confs = torch.sigmoid(cls_confs) + + # Prepare C-x, C-y, P-w, P-h (None of them are torch related) + grid_x = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, W - 1, W), axis=0).repeat(H, 0), axis=0), axis=0) + grid_y = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, H - 1, H), axis=1).repeat(W, 1), axis=0), axis=0) + # grid_x = torch.linspace(0, W - 1, W).reshape(1, 1, 1, W).repeat(1, 1, H, 1) + # grid_y = torch.linspace(0, H - 1, H).reshape(1, 1, H, 1).repeat(1, 1, 1, W) + + anchor_w = [] + anchor_h = [] + for i in range(num_anchors): + anchor_w.append(anchors[i * 2]) + anchor_h.append(anchors[i * 2 + 1]) + + device = None + cuda_check = output.is_cuda + if cuda_check: + device = output.get_device() + + bx_list = [] + by_list = [] + bw_list = [] + bh_list = [] + + # Apply C-x, C-y, P-w, P-h + for i in range(num_anchors): + ii = i * 2 + # Shape: [batch, 1, H, W] + bx = bxy[:, ii : ii + 1] + torch.tensor(grid_x, device=device, dtype=torch.float32) # grid_x.to(device=device, dtype=torch.float32) + # Shape: [batch, 1, H, W] + by = bxy[:, ii + 1 : ii + 2] + torch.tensor(grid_y, device=device, dtype=torch.float32) # grid_y.to(device=device, dtype=torch.float32) + # Shape: [batch, 1, H, W] + bw = bwh[:, ii : ii + 1] * anchor_w[i] + # Shape: [batch, 1, H, W] + bh = bwh[:, ii + 1 : ii + 2] * anchor_h[i] + + bx_list.append(bx) + by_list.append(by) + bw_list.append(bw) + bh_list.append(bh) + + + ######################################## + # Figure out bboxes from slices # + ######################################## + + # Shape: [batch, num_anchors, H, W] + bx = torch.cat(bx_list, dim=1) + # Shape: [batch, num_anchors, H, W] + by = torch.cat(by_list, dim=1) + # Shape: [batch, num_anchors, H, W] + bw = torch.cat(bw_list, dim=1) + # Shape: [batch, num_anchors, H, W] + bh = torch.cat(bh_list, dim=1) + + # Shape: [batch, 2 * num_anchors, H, W] + bx_bw = 
torch.cat((bx, bw), dim=1) + # Shape: [batch, 2 * num_anchors, H, W] + by_bh = torch.cat((by, bh), dim=1) + + # normalize coordinates to [0, 1] + bx_bw /= W + by_bh /= H + + # Shape: [batch, num_anchors * H * W, 1] + bx = bx_bw[:, :num_anchors].view(batch, num_anchors * H * W, 1) + by = by_bh[:, :num_anchors].view(batch, num_anchors * H * W, 1) + bw = bx_bw[:, num_anchors:].view(batch, num_anchors * H * W, 1) + bh = by_bh[:, num_anchors:].view(batch, num_anchors * H * W, 1) + + bx1 = bx - bw * 0.5 + by1 = by - bh * 0.5 + bx2 = bx1 + bw + by2 = by1 + bh + + # Shape: [batch, num_anchors * h * w, 4] -> [batch, num_anchors * h * w, 1, 4] + boxes = torch.cat((bx1, by1, bx2, by2), dim=2).view(batch, num_anchors * H * W, 1, 4) + # boxes = boxes.repeat(1, 1, num_classes, 1) + + # boxes: [batch, num_anchors * H * W, 1, 4] + # cls_confs: [batch, num_anchors * H * W, num_classes] + # det_confs: [batch, num_anchors * H * W] + + det_confs = det_confs.view(batch, num_anchors * H * W, 1) + confs = cls_confs * det_confs + + # boxes: [batch, num_anchors * H * W, 1, 4] + # confs: [batch, num_anchors * H * W, num_classes] + + return boxes, confs + + +def yolo_forward_dynamic(output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y, only_objectness=1, + validation=False): + # Output would be invalid if it does not satisfy this assert + # assert (output.size(1) == (5 + num_classes) * num_anchors) + + # print(output.size()) + + # Slice the second dimension (channel) of output into: + # [ 2, 2, 1, num_classes, 2, 2, 1, num_classes, 2, 2, 1, num_classes ] + # And then into + # bxy = [ 6 ] bwh = [ 6 ] det_conf = [ 3 ] cls_conf = [ num_classes * 3 ] + # batch = output.size(0) + # H = output.size(2) + # W = output.size(3) + + bxy_list = [] + bwh_list = [] + det_confs_list = [] + cls_confs_list = [] + + for i in range(num_anchors): + begin = i * (5 + num_classes) + end = (i + 1) * (5 + num_classes) + + bxy_list.append(output[:, begin : begin + 2]) + bwh_list.append(output[:, begin + 2 : begin + 4]) + det_confs_list.append(output[:, begin + 4 : begin + 5]) + cls_confs_list.append(output[:, begin + 5 : end]) + + # Shape: [batch, num_anchors * 2, H, W] + bxy = torch.cat(bxy_list, dim=1) + # Shape: [batch, num_anchors * 2, H, W] + bwh = torch.cat(bwh_list, dim=1) + + # Shape: [batch, num_anchors, H, W] + det_confs = torch.cat(det_confs_list, dim=1) + # Shape: [batch, num_anchors * H * W] + det_confs = det_confs.view(output.size(0), num_anchors * output.size(2) * output.size(3)) + + # Shape: [batch, num_anchors * num_classes, H, W] + cls_confs = torch.cat(cls_confs_list, dim=1) + # Shape: [batch, num_anchors, num_classes, H * W] + cls_confs = cls_confs.view(output.size(0), num_anchors, num_classes, output.size(2) * output.size(3)) + # Shape: [batch, num_anchors, num_classes, H * W] --> [batch, num_anchors * H * W, num_classes] + cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(output.size(0), num_anchors * output.size(2) * output.size(3), num_classes) + + # Apply sigmoid(), exp() and softmax() to slices + # + bxy = torch.sigmoid(bxy) * scale_x_y - 0.5 * (scale_x_y - 1) + bwh = torch.exp(bwh) + det_confs = torch.sigmoid(det_confs) + cls_confs = torch.sigmoid(cls_confs) + + # Prepare C-x, C-y, P-w, P-h (None of them are torch related) + grid_x = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, output.size(3) - 1, output.size(3)), axis=0).repeat(output.size(2), 0), axis=0), axis=0) + grid_y = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, output.size(2) - 1, output.size(2)), 
axis=1).repeat(output.size(3), 1), axis=0), axis=0) + # grid_x = torch.linspace(0, W - 1, W).reshape(1, 1, 1, W).repeat(1, 1, H, 1) + # grid_y = torch.linspace(0, H - 1, H).reshape(1, 1, H, 1).repeat(1, 1, 1, W) + + anchor_w = [] + anchor_h = [] + for i in range(num_anchors): + anchor_w.append(anchors[i * 2]) + anchor_h.append(anchors[i * 2 + 1]) + + device = None + cuda_check = output.is_cuda + if cuda_check: + device = output.get_device() + + bx_list = [] + by_list = [] + bw_list = [] + bh_list = [] + + # Apply C-x, C-y, P-w, P-h + for i in range(num_anchors): + ii = i * 2 + # Shape: [batch, 1, H, W] + bx = bxy[:, ii : ii + 1] + torch.tensor(grid_x, device=device, dtype=torch.float32) # grid_x.to(device=device, dtype=torch.float32) + # Shape: [batch, 1, H, W] + by = bxy[:, ii + 1 : ii + 2] + torch.tensor(grid_y, device=device, dtype=torch.float32) # grid_y.to(device=device, dtype=torch.float32) + # Shape: [batch, 1, H, W] + bw = bwh[:, ii : ii + 1] * anchor_w[i] + # Shape: [batch, 1, H, W] + bh = bwh[:, ii + 1 : ii + 2] * anchor_h[i] + + bx_list.append(bx) + by_list.append(by) + bw_list.append(bw) + bh_list.append(bh) + + + ######################################## + # Figure out bboxes from slices # + ######################################## + + # Shape: [batch, num_anchors, H, W] + bx = torch.cat(bx_list, dim=1) + # Shape: [batch, num_anchors, H, W] + by = torch.cat(by_list, dim=1) + # Shape: [batch, num_anchors, H, W] + bw = torch.cat(bw_list, dim=1) + # Shape: [batch, num_anchors, H, W] + bh = torch.cat(bh_list, dim=1) + + # Shape: [batch, 2 * num_anchors, H, W] + bx_bw = torch.cat((bx, bw), dim=1) + # Shape: [batch, 2 * num_anchors, H, W] + by_bh = torch.cat((by, bh), dim=1) + + # normalize coordinates to [0, 1] + bx_bw /= output.size(3) + by_bh /= output.size(2) + + # Shape: [batch, num_anchors * H * W, 1] + bx = bx_bw[:, :num_anchors].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + by = by_bh[:, :num_anchors].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + bw = bx_bw[:, num_anchors:].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + bh = by_bh[:, num_anchors:].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + + bx1 = bx - bw * 0.5 + by1 = by - bh * 0.5 + bx2 = bx1 + bw + by2 = by1 + bh + + # Shape: [batch, num_anchors * h * w, 4] -> [batch, num_anchors * h * w, 1, 4] + boxes = torch.cat((bx1, by1, bx2, by2), dim=2).view(output.size(0), num_anchors * output.size(2) * output.size(3), 1, 4) + # boxes = boxes.repeat(1, 1, num_classes, 1) + + # boxes: [batch, num_anchors * H * W, 1, 4] + # cls_confs: [batch, num_anchors * H * W, num_classes] + # det_confs: [batch, num_anchors * H * W] + + det_confs = det_confs.view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + confs = cls_confs * det_confs + + # boxes: [batch, num_anchors * H * W, 1, 4] + # confs: [batch, num_anchors * H * W, num_classes] + + return boxes, confs + +class YoloLayer(nn.Module): + ''' Yolo layer + model_out: while inference,is post-processing inside or outside the model + true:outside + ''' + def __init__(self, anchor_mask=[], num_classes=0, anchors=[], num_anchors=1, stride=32, model_out=False): + super(YoloLayer, self).__init__() + self.anchor_mask = anchor_mask + self.num_classes = num_classes + self.anchors = anchors + self.num_anchors = num_anchors + self.anchor_step = len(anchors) // num_anchors + self.coord_scale = 1 + self.noobject_scale = 1 + self.object_scale = 5 + self.class_scale = 1 + 
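+        # loss-weight and threshold defaults mirror RegionLoss in region_loss.py;
+        # stride and scale_x_y below are consumed by the box decoding in forward()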
self.thresh = 0.6 + self.stride = stride + self.seen = 0 + self.scale_x_y = 1 + + self.model_out = model_out + + def forward(self, output, target=None): + if self.training: + return output + masked_anchors = [] + for m in self.anchor_mask: + masked_anchors += self.anchors[m * self.anchor_step:(m + 1) * self.anchor_step] + masked_anchors = [anchor / self.stride for anchor in masked_anchors] + + return yolo_forward_dynamic(output, self.thresh, self.num_classes, masked_anchors, len(self.anchor_mask),scale_x_y=self.scale_x_y) + diff --git a/easymocap/estimator/YOLOv4/yolov4.cfg b/easymocap/estimator/YOLOv4/yolov4.cfg new file mode 100644 index 0000000..2985a31 --- /dev/null +++ b/easymocap/estimator/YOLOv4/yolov4.cfg @@ -0,0 +1,1157 @@ +[net] +batch=64 +subdivisions=8 +# Training +#width=512 +#height=512 +width=608 +height=608 +channels=3 +momentum=0.949 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.0013 +burn_in=1000 +max_batches = 500500 +policy=steps +steps=400000,450000 +scales=.1,.1 + +#cutmix=1 +mosaic=1 + +#:104x104 54:52x52 85:26x26 104:13x13 for 416 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-7 + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-10 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 
+pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-28 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] 
+batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-28 + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-16 + +[convolutional] +batch_normalize=1 +filters=1024 +size=1 +stride=1 +pad=1 +activation=mish + +########################## + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +### SPP ### +[maxpool] +stride=1 +size=5 + +[route] +layers=-2 + +[maxpool] +stride=1 +size=9 + +[route] +layers=-4 + +[maxpool] +stride=1 +size=13 + +[route] +layers=-1,-3,-5,-6 +### End SPP ### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = 85 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1, -3 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] 
+batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = 54 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1, -3 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +########################## + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 0,1,2 +anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +scale_x_y = 1.2 +iou_thresh=0.213 +cls_normalizer=1.0 +iou_normalizer=0.07 +iou_loss=ciou +nms_kind=greedynms +beta_nms=0.6 +max_delta=5 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +size=3 +stride=2 +pad=1 +filters=256 +activation=leaky + +[route] +layers = -1, -16 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 3,4,5 +anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +scale_x_y = 1.1 +iou_thresh=0.213 +cls_normalizer=1.0 +iou_normalizer=0.07 +iou_loss=ciou +nms_kind=greedynms +beta_nms=0.6 +max_delta=5 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +size=3 +stride=2 +pad=1 +filters=512 +activation=leaky + +[route] +layers = -1, -37 + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 6,7,8 +anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 +scale_x_y = 1.05 +iou_thresh=0.213 +cls_normalizer=1.0 +iou_normalizer=0.07 +iou_loss=ciou +nms_kind=greedynms +beta_nms=0.6 +max_delta=5 diff --git a/easymocap/estimator/yolohrnet_wrapper.py 
b/easymocap/estimator/yolohrnet_wrapper.py new file mode 100644 index 0000000..de5e1e6 --- /dev/null +++ b/easymocap/estimator/yolohrnet_wrapper.py @@ -0,0 +1,122 @@ +from ..annotator.file_utils import read_json +from .wrapper_base import check_result, create_annot_file, save_annot +from glob import glob +from os.path import join +from tqdm import tqdm +import os +import cv2 +import numpy as np + +def detect_frame(detector, img, pid=0, only_bbox=False): + lDetections = detector.detect([img], only_bbox=only_bbox)[0] + annots = [] + for i in range(len(lDetections)): + annot = { + 'bbox': [float(d) for d in lDetections[i]['bbox']], + 'personID': pid + i, + 'isKeyframe': False + } + if not only_bbox: + annot['keypoints'] = lDetections[i]['keypoints'].tolist() + annots.append(annot) + return annots + +def extract_bbox(image_root, annot_root, ext, **config): + force = config.pop('force') + if check_result(image_root, annot_root) and not force: + return 0 + import torch + from .YOLOv4 import YOLOv4 + device = torch.device('cuda') \ + if torch.cuda.is_available() else torch.device('cpu') + detector = YOLOv4(device=device, **config) + imgnames = sorted(glob(join(image_root, '*'+ext))) + if len(imgnames) == 0: + ext = '.png' + imgnames = sorted(glob(join(image_root, '*'+ext))) + # run_yolo(image_root, ) + for imgname in tqdm(imgnames, desc='{:10s}'.format(os.path.basename(annot_root))): + base = os.path.basename(imgname).replace(ext, '') + annotname = join(annot_root, base+'.json') + annot = create_annot_file(annotname, imgname) + image = cv2.imread(imgname) + image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + detections = detector.predict_single(image_rgb) + annots = [] + pid = 0 + for i in range(len(detections)): + annot_ = { + 'bbox': [float(d) for d in detections[i]], + 'isKeyframe': False + } + annot_['area'] = max(annot_['bbox'][2] - annot_['bbox'][0], annot_['bbox'][3] - annot_['bbox'][1])**2 + annots.append(annot_) + annots.sort(key=lambda x:-x['area']) + # re-assign the person ID + for i in range(len(annots)): + annots[i]['personID'] = i + pid + annot['annots'] = annots + save_annot(annotname, annot) + +def extract_hrnet(image_root, annot_root, ext, **config): + config.pop('force') + import torch + imgnames = sorted(glob(join(image_root, '*'+ext))) + import torch + device = torch.device('cuda') \ + if torch.cuda.is_available() else torch.device('cpu') + from .HRNet import SimpleHRNet + estimator = SimpleHRNet(device=device, **config) + + for imgname in tqdm(imgnames, desc='{:10s}'.format(os.path.basename(annot_root))): + base = os.path.basename(imgname).replace(ext, '') + annotname = join(annot_root, base+'.json') + annots = read_json(annotname) + detections = np.array([data['bbox'] for data in annots['annots']]) + image = cv2.imread(imgname) + image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + points2d = estimator.predict(image_rgb, detections) + for i in range(detections.shape[0]): + annot_ = annots['annots'][i] + annot_['keypoints'] = points2d[i] + save_annot(annotname, annots) + +def extract_yolo_hrnet(image_root, annot_root, ext, config_yolo, config_hrnet): + config_yolo.pop('ext', None) + imgnames = sorted(glob(join(image_root, '*{}'.format(ext)))) + import torch + device = torch.device('cuda') + from .YOLOv4 import YOLOv4 + device = torch.device('cuda') \ + if torch.cuda.is_available() else torch.device('cpu') + detector = YOLOv4(device=device, **config_yolo) + from .HRNet import SimpleHRNet + estimator = SimpleHRNet(device=device, **config_hrnet) + + for nf, imgname in 
enumerate(tqdm(imgnames, desc=os.path.basename(image_root))): + base = os.path.basename(imgname).replace(ext, '') + annotname = join(annot_root, base+'.json') + annot = create_annot_file(annotname, imgname) + img0 = cv2.imread(imgname) + annot = create_annot_file(annotname, imgname) + image = cv2.imread(imgname) + image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + detections = detector.predict_single(image_rgb) + # forward_hrnet + points2d = estimator.predict(image_rgb, detections) + annots = [] + pid = 0 + for i in range(len(detections)): + annot_ = { + 'bbox': [float(d) for d in detections[i]], + 'keypoints': points2d[i], + 'isKeyframe': False + } + annot_['area'] = max(annot_['bbox'][2] - annot_['bbox'][0], annot_['bbox'][3] - annot_['bbox'][1])**2 + annots.append(annot_) + annots.sort(key=lambda x:-x['area']) + # re-assign the person ID + for i in range(len(annots)): + annots[i]['personID'] = i + pid + annot['annots'] = annots + save_annot(annotname, annot) \ No newline at end of file diff --git a/scripts/preprocess/copy_dataset.py b/scripts/preprocess/copy_dataset.py new file mode 100644 index 0000000..dadb93e --- /dev/null +++ b/scripts/preprocess/copy_dataset.py @@ -0,0 +1,184 @@ +''' + @ Date: 2021-06-14 15:39:26 + @ Author: Qing Shuai + @ LastEditors: Qing Shuai + @ LastEditTime: 2022-08-02 21:50:40 + @ FilePath: /EasyMocapPublic/scripts/preprocess/copy_dataset.py +''' +import os +from os.path import join +import shutil +from tqdm import tqdm +from glob import glob +import cv2 + +from easymocap.mytools.debug_utils import myerror, mywarn + +mkdir = lambda x:os.makedirs(x, exist_ok=True) + +import json + +def save_json(file, data): + if not os.path.exists(os.path.dirname(file)): + os.makedirs(os.path.dirname(file)) + with open(file, 'w') as f: + json.dump(data, f, indent=4) + +def read_json(path): + with open(path) as f: + data = json.load(f) + return data + +def copy_dataset(inp, out, start, end, step, keys, args): + copy_keys = { + 'images': args.ext, + 'annots': '.json', + 'mask-schp': '.png', + } + copy_share_keys = { + 'output-keypoints3d/keypoints3d': '.json' + } + mkdir(out) + if os.path.exists(join(inp, 'intri.yml')): + shutil.copyfile(join(inp, 'intri.yml'), join(out, 'intri.yml')) + shutil.copyfile(join(inp, 'extri.yml'), join(out, 'extri.yml')) + if os.path.exists(join(inp, 'match_name.json')): + names = read_json(join(inp, 'match_name.json')) + names = names[start:end:step] + save_json(join(out, 'match_name.json'), names) + if os.path.exists(join(inp, 'sync_time.txt')): + import numpy as np + times = np.loadtxt(join(inp, 'sync_time.txt')) + times = times.reshape(times.shape[0], -1) + times = times[:, start:end:step] + np.savetxt(join(out, 'sync_time.txt'), times, fmt='%10d') + os.system('touch ' + join(out, '{}-{}-{}'.format(start, end, step))) + for copy, ext in copy_share_keys.items(): + if not os.path.exists(join(inp, copy)): + continue + if len(args.frames) == 0: + ranges = [i for i in range(start, end, step)] + else: + ranges = args.frames + outdir = join(out, copy) + if os.path.exists(outdir) and len(os.listdir(outdir)) == len(ranges): + pass + os.makedirs(outdir, exist_ok=True) + for nnf, nf in enumerate(tqdm(ranges, desc='{}'.format(copy))): + oldname = join(inp, copy, '{:06d}{}'.format(nf, ext)) + if not os.path.exists(oldname): + mywarn('{} not exists'.format(oldname)) + continue + newname = join(outdir, '{:06d}{}'.format(nnf, ext)) + shutil.copyfile(oldname, newname) + + for copy in keys: + ext = copy_keys.get(copy, '.json') + if not os.path.exists(join(inp, 
copy)): + continue + if len(args.subs) == 0: + subs = sorted(os.listdir(join(inp, copy))) + subs = [s for s in subs if os.path.isdir(join(inp, copy, s))] + else: + subs = args.subs + for sub in subs: + if not os.path.exists(join(inp, copy)): + continue + outdir = join(out, copy, sub.replace(args.strip, '')) + os.makedirs(outdir, exist_ok=True) + if args.end == -1: + oldnames = sorted(glob(join(inp, copy, sub, '*{}'.format(ext)))) + end = len(oldnames) + print('{} has {} frames'.format(sub, end)) + if args.sample == -1: + if len(args.frames) == 0: + ranges = [i for i in range(start, end, step)] + else: + ranges = args.frames + else: + ranges = [(i/args.sample)*(end-start-2*args.strip_frame)+start+args.strip_frame for i in range(args.sample)] + ranges = [int(i+0.5) for i in ranges] + if os.path.exists(outdir) and len(os.listdir(outdir)) == len(ranges): + mywarn('[copy] Skip {}'.format(outdir)) + continue + for nnf, nf in enumerate(tqdm(ranges, desc='{}:{}'.format(sub, copy))): + oldname = join(inp, copy, sub, '{:06d}{}'.format(nf, ext)) + if not os.path.exists(oldname): + oldnames = sorted(glob(join(inp, copy, sub, '{:06d}_*{}'.format(nf, ext)))) + if len(oldnames) == 0: + myerror('{} not exists'.format(oldname)) + import ipdb;ipdb.set_trace() + else: + for oldname in oldnames: + newname = join(outdir, os.path.basename(oldname).replace('{:06d}'.format(nf), '{:06d}'.format(nnf))) + shutil.copyfile(oldname, newname) + else: + newname = join(outdir, '{:06d}{}'.format(nnf, ext)) + if copy == 'images' and args.scale != 1: + img = cv2.imread(oldname) + img = cv2.resize(img, None, fx=args.scale, fy=args.scale) + cv2.imwrite(newname, img) + else: + shutil.copyfile(oldname, newname) + # make videos + if copy == 'images' and args.make_video: + os.makedirs(join(out, 'videos'), exist_ok=True) + for sub in subs: + shell = '{} -y -i {}/images/{}/%06d{} -vcodec libx264 {}/videos/{}.mp4 -loglevel quiet'.format( + args.ffmpeg, out, sub, ext, out, sub + ) + print(shell) + os.system(shell) + +def export(root, out, keys): + mkdir(out) + for key in keys: + src = join(root, key) + dst = join(out, key) + if key == 'videos': + if os.path.exists(src): + shutil.copytree(src, dst) + else: + mkdir(dst) + subs = sorted(os.listdir(join(root, 'images'))) + for sub in subs: + cmd = '{ffmpeg} -r {fps} -i {inp}/%06d.jpg -vcodec libx264 {out}'.format( + ffmpeg=args.ffmpeg, fps=50, inp=join(root, 'images', sub), + out=join(dst, sub+'.mp4') + ) + os.system(cmd) + if not os.path.exists(src): + print(src) + continue + shutil.copytree(src, dst) + for name in ['intri.yml', 'extri.yml']: + if os.path.exists(join(root, name)): + shutil.copyfile(join(root, name), join(out, name)) + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('path', type=str) + parser.add_argument('out', type=str) + parser.add_argument('--strip', type=str, default='') + parser.add_argument('--keys', type=str, nargs='+', default=['images', 'annots', 'chessboard']) + parser.add_argument('--subs', type=str, nargs='+', default=[]) + parser.add_argument('--start', type=int, default=0) + parser.add_argument('--step', type=int, default=1) + parser.add_argument('--end', type=int, default=-1) + parser.add_argument('--scale', type=float, default=1) + parser.add_argument('--strip_frame', type=int, default=0, + help='remove the start frames and end frames') + parser.add_argument('--ffmpeg', type=str, default='ffmpeg') + parser.add_argument('--ext', type=str, default='.jpg') + parser.add_argument('--sample', type=int, 
default=-1, + help='use this flag to sample a fixed number of frames') + parser.add_argument('--frames', type=int, default=[], nargs='+') + parser.add_argument('--debug', action='store_true') + parser.add_argument('--make_video', action='store_true') + parser.add_argument('--export', action='store_true') + args = parser.parse_args() + if args.export: + export(args.path, args.out, args.keys) + else: + copy_dataset(args.path, args.out, start=args.start, end=args.end, step=args.step, keys=args.keys, args=args)
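+# Example invocation (paths are placeholders): copy every second frame of the
+# first 100 frames, keep images and annots, and render per-camera videos:
+#   python scripts/preprocess/copy_dataset.py <input_root> <output_root> \
+#       --keys images annots --start 0 --end 100 --step 2 --ext .jpg --make_video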