support YOLOv4 + HRNet
This commit is contained in:
parent af452c6949
commit 0175f07290

8 easymocap/estimator/HRNet/__init__.py Normal file
@@ -0,0 +1,8 @@
'''
@ Date: 2020-06-04 12:48:29
@ LastEditors: Qing Shuai
@ LastEditTime: 2020-11-17 15:52:23
@ Author: Qing Shuai
@ Mail: s_q@zju.edu.cn
'''
from .hrnet_api import SimpleHRNet
216 easymocap/estimator/HRNet/hrnet.py Normal file
@@ -0,0 +1,216 @@
import torch
from torch import nn
from .modules import BasicBlock, Bottleneck


class StageModule(nn.Module):
    def __init__(self, stage, output_branches, c, bn_momentum):
        super(StageModule, self).__init__()
        self.stage = stage
        self.output_branches = output_branches

        self.branches = nn.ModuleList()
        for i in range(self.stage):
            w = c * (2 ** i)
            branch = nn.Sequential(
                BasicBlock(w, w, bn_momentum=bn_momentum),
                BasicBlock(w, w, bn_momentum=bn_momentum),
                BasicBlock(w, w, bn_momentum=bn_momentum),
                BasicBlock(w, w, bn_momentum=bn_momentum),
            )
            self.branches.append(branch)

        self.fuse_layers = nn.ModuleList()
        # for each output branch (i.e. each branch, in all cases but the very last module)
        for i in range(self.output_branches):
            self.fuse_layers.append(nn.ModuleList())
            for j in range(self.stage):  # for each input branch
                if i == j:
                    self.fuse_layers[-1].append(nn.Sequential())  # Used in place of "None" because it is callable
                elif i < j:
                    self.fuse_layers[-1].append(nn.Sequential(
                        nn.Conv2d(c * (2 ** j), c * (2 ** i), kernel_size=(1, 1), stride=(1, 1), bias=False),
                        nn.BatchNorm2d(c * (2 ** i), eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
                        nn.Upsample(scale_factor=(2.0 ** (j - i)), mode='nearest'),
                    ))
                elif i > j:
                    ops = []
                    for k in range(i - j - 1):
                        ops.append(nn.Sequential(
                            nn.Conv2d(c * (2 ** j), c * (2 ** j), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1),
                                      bias=False),
                            nn.BatchNorm2d(c * (2 ** j), eps=1e-05, momentum=0.1, affine=True,
                                           track_running_stats=True),
                            nn.ReLU(inplace=True),
                        ))
                    ops.append(nn.Sequential(
                        nn.Conv2d(c * (2 ** j), c * (2 ** i), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1),
                                  bias=False),
                        nn.BatchNorm2d(c * (2 ** i), eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
                    ))
                    self.fuse_layers[-1].append(nn.Sequential(*ops))

        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        assert len(self.branches) == len(x)

        x = [branch(b) for branch, b in zip(self.branches, x)]

        x_fused = []
        for i in range(len(self.fuse_layers)):
            for j in range(0, len(self.branches)):
                if j == 0:
                    x_fused.append(self.fuse_layers[i][0](x[0]))
                else:
                    x_fused[i] = x_fused[i] + self.fuse_layers[i][j](x[j])

        for i in range(len(x_fused)):
            x_fused[i] = self.relu(x_fused[i])

        return x_fused

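# --- Illustrative note (not part of the original commit) ---
# StageModule keeps one branch per resolution and fuses them at the end.
# A minimal sketch, assuming stage=3 and c=32 (per-branch shapes are preserved):
#   m = StageModule(stage=3, output_branches=3, c=32, bn_momentum=0.1)
#   feats = [torch.zeros(1, 32, 96, 72),
#            torch.zeros(1, 64, 48, 36),
#            torch.zeros(1, 128, 24, 18)]
#   outs = m(feats)  # three tensors with the same shapes as the inputs
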
class HRNet(nn.Module):
    def __init__(self, c=48, nof_joints=17, bn_momentum=0.1):
        super(HRNet, self).__init__()

        # Input (stem net)
        self.conv1 = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        self.bn1 = nn.BatchNorm2d(64, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        self.bn2 = nn.BatchNorm2d(64, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True)
        self.relu = nn.ReLU(inplace=True)

        # Stage 1 (layer1) - First group of bottleneck (resnet) modules
        downsample = nn.Sequential(
            nn.Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False),
            nn.BatchNorm2d(256, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
        )
        self.layer1 = nn.Sequential(
            Bottleneck(64, 64, downsample=downsample),
            Bottleneck(256, 64),
            Bottleneck(256, 64),
            Bottleneck(256, 64),
        )

        # Fusion layer 1 (transition1) - Creation of the first two branches (one full and one half resolution)
        self.transition1 = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(256, c, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
                nn.BatchNorm2d(c, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
                nn.ReLU(inplace=True),
            ),
            nn.Sequential(nn.Sequential(  # Double Sequential to fit with official pretrained weights
                nn.Conv2d(256, c * (2 ** 1), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False),
                nn.BatchNorm2d(c * (2 ** 1), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
                nn.ReLU(inplace=True),
            )),
        ])

        # Stage 2 (stage2) - Second module with 1 group of bottleneck (resnet) modules. This has 2 branches
        self.stage2 = nn.Sequential(
            StageModule(stage=2, output_branches=2, c=c, bn_momentum=bn_momentum),
        )

        # Fusion layer 2 (transition2) - Creation of the third branch (1/4 resolution)
        self.transition2 = nn.ModuleList([
            nn.Sequential(),  # None,   - Used in place of "None" because it is callable
            nn.Sequential(),  # None,   - Used in place of "None" because it is callable
            nn.Sequential(nn.Sequential(  # Double Sequential to fit with official pretrained weights
                nn.Conv2d(c * (2 ** 1), c * (2 ** 2), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False),
                nn.BatchNorm2d(c * (2 ** 2), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
                nn.ReLU(inplace=True),
            )),  # ToDo: why does the new branch derive from the "upper" branch only?
        ])

        # Stage 3 (stage3) - Third module with 4 groups of bottleneck (resnet) modules. This has 3 branches
        self.stage3 = nn.Sequential(
            StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum),
            StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum),
            StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum),
            StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum),
        )

        # Fusion layer 3 (transition3) - Creation of the fourth branch (1/8 resolution)
        self.transition3 = nn.ModuleList([
            nn.Sequential(),  # None,   - Used in place of "None" because it is callable
            nn.Sequential(),  # None,   - Used in place of "None" because it is callable
            nn.Sequential(),  # None,   - Used in place of "None" because it is callable
            nn.Sequential(nn.Sequential(  # Double Sequential to fit with official pretrained weights
                nn.Conv2d(c * (2 ** 2), c * (2 ** 3), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False),
                nn.BatchNorm2d(c * (2 ** 3), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
                nn.ReLU(inplace=True),
            )),  # ToDo: why does the new branch derive from the "upper" branch only?
        ])

        # Stage 4 (stage4) - Fourth module with 3 groups of bottleneck (resnet) modules. This has 4 branches
        self.stage4 = nn.Sequential(
            StageModule(stage=4, output_branches=4, c=c, bn_momentum=bn_momentum),
            StageModule(stage=4, output_branches=4, c=c, bn_momentum=bn_momentum),
            StageModule(stage=4, output_branches=1, c=c, bn_momentum=bn_momentum),
        )

        # Final layer (final_layer)
        self.final_layer = nn.Conv2d(c, nof_joints, kernel_size=(1, 1), stride=(1, 1))

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)

        x = self.layer1(x)
        x = [trans(x) for trans in self.transition1]  # From here on, x is a list (length == number of branches)

        x = self.stage2(x)
        # x = [trans(x[-1]) for trans in self.transition2]    # New branch derives from the "upper" branch only
        x = [
            self.transition2[0](x[0]),
            self.transition2[1](x[1]),
            self.transition2[2](x[-1])
        ]  # New branch derives from the "upper" branch only

        x = self.stage3(x)
        # x = [trans(x) for trans in self.transition3]    # New branch derives from the "upper" branch only
        x = [
            self.transition3[0](x[0]),
            self.transition3[1](x[1]),
            self.transition3[2](x[2]),
            self.transition3[3](x[-1])
        ]  # New branch derives from the "upper" branch only

        x = self.stage4(x)

        x = self.final_layer(x[0])

        return x

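# --- Illustrative note (not part of the original commit) ---
# A minimal sketch of the end-to-end shape contract, assuming HRNet-W32:
#   model = HRNet(c=32, nof_joints=17)
#   y = model(torch.zeros(1, 3, 384, 288))
#   print(y.shape)  # torch.Size([1, 17, 96, 72]): heatmaps at 1/4 input resolution
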
if __name__ == '__main__':
    # model = HRNet(48, 17, 0.1)
    model = HRNet(32, 17, 0.1)

    # print(model)

    model.load_state_dict(
        # torch.load('./weights/pose_hrnet_w48_384x288.pth')
        torch.load('./weights/pose_hrnet_w32_256x192.pth')
    )
    print('ok!!')

    # CUDA branch intentionally disabled with `and False`
    if torch.cuda.is_available() and False:
        torch.backends.cudnn.deterministic = True
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    print(device)

    model = model.to(device)

    y = model(torch.ones(1, 3, 384, 288).to(device))
    print(y.shape)
    print(torch.min(y).item(), torch.mean(y).item(), torch.max(y).item())
527 easymocap/estimator/HRNet/hrnet_api.py Normal file
@@ -0,0 +1,527 @@
'''
@ Date: 2020-06-04 12:47:04
@ LastEditors: Qing Shuai
@ LastEditTime: 2022-04-19 17:02:57
@ Author: Qing Shuai
@ Mail: s_q@zju.edu.cn
'''
from os.path import join
import cv2
import numpy as np
import torch
from torchvision.transforms import transforms

from .hrnet import HRNet

COCO17_IN_BODY25 = [0,16,15,18,17,5,2,6,3,7,4,12,9,13,10,14,11]
pairs = [[1, 8], [1, 2], [1, 5], [2, 3], [3, 4], [5, 6], [6, 7], [8, 9], [9, 10], [10, 11], [8, 12], [12, 13], [13, 14], [1, 0], [0,15], [15,17], [0,16], [16,18], [14,19], [19,20], [14,21], [11,22], [22,23], [11,24]]

def coco17tobody25(points2d):
    kpts = np.zeros((points2d.shape[0], 25, 3))
    kpts[:, COCO17_IN_BODY25, :2] = points2d[:, :, :2]
    kpts[:, COCO17_IN_BODY25, 2:3] = points2d[:, :, 2:3]
    # mid-hip (8) and neck (1) are interpolated from the hips and shoulders
    kpts[:, 8, :2] = kpts[:, [9, 12], :2].mean(axis=1)
    kpts[:, 8, 2] = kpts[:, [9, 12], 2].min(axis=1)
    kpts[:, 1, :2] = kpts[:, [2, 5], :2].mean(axis=1)
    kpts[:, 1, 2] = kpts[:, [2, 5], 2].min(axis=1)
    # need to swap (x, y) here in some cases
    # kpts = kpts[:, :, [1,0,2]]
    return kpts

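# --- Illustrative usage (not part of the original commit) ---
# coco17tobody25 maps COCO-17 keypoints into the BODY_25 layout used by OpenPose;
# a minimal sketch, assuming two detected people:
#   pts17 = np.random.rand(2, 17, 3)   # (nPersons, 17, [x, y, conf])
#   pts25 = coco17tobody25(pts17)      # -> (2, 25, 3); joints absent from COCO
#                                      #    (e.g. the feet) stay zero
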
# generate the Gaussian kernel
def generate_gauss(sigma):
    tmp_size = sigma * 3
    size = 2 * tmp_size + 1
    x = np.arange(0, size, 1, np.float32)
    y = x[:, np.newaxis]
    x0 = y0 = size // 2
    # The gaussian is not normalized, we want the center value to equal 1
    g = np.exp(- ((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2))
    return g, tmp_size

gauss = {}
for SIGMA in range(1, 5):
    gauss_kernel, gauss_radius = generate_gauss(SIGMA)
    gauss[SIGMA] = {
        'kernel': gauss_kernel,
        'radius': gauss_radius
    }

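# --- Illustrative note (not part of the original commit) ---
# Each kernel is a square of side 6*sigma + 1 with value 1 at its center, e.g.:
#   g, radius = generate_gauss(2)
#   g.shape            # (13, 13), radius == 6
#   g[radius, radius]  # 1.0
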
def box_to_center_scale(box, model_image_width, model_image_height, scale_factor=1.25):
    """convert a box to the center/scale information required for the pose transformation

    Parameters
    ----------
    box : array-like of 4 floats
        (x1, y1, x2, y2), the top-left and bottom-right corners of the box
    model_image_width : int
    model_image_height : int

    Returns
    -------
    (numpy array, numpy array)
        Two numpy arrays, coordinates for the center of the box and the scale of the box
    """
    center = np.zeros((2), dtype=np.float32)

    top_left_corner = (box[0], box[1])
    bottom_right_corner = (box[2], box[3])
    box_width = bottom_right_corner[0] - top_left_corner[0]
    box_height = bottom_right_corner[1] - top_left_corner[1]
    center[0] = top_left_corner[0] + box_width * 0.5
    center[1] = top_left_corner[1] + box_height * 0.5

    aspect_ratio = model_image_width * 1.0 / model_image_height
    pixel_std = 200

    # pad the box to the network aspect ratio before computing the scale
    if box_width > aspect_ratio * box_height:
        box_height = box_width * 1.0 / aspect_ratio
    elif box_width < aspect_ratio * box_height:
        box_width = box_height * aspect_ratio
    scale = np.array(
        [box_width * 1.0 / pixel_std, box_height * 1.0 / pixel_std],
        dtype=np.float32)
    scale = scale * scale_factor
    return center, scale

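# --- Worked example (not part of the original commit) ---
# For a 100x300 box and a 288x384 network input:
#   center, scale = box_to_center_scale([100, 100, 200, 400], 288, 384)
#   center  # array([150., 250.], dtype=float32)
#   # the box is padded to the 0.75 aspect ratio (width -> 225), divided by
#   # pixel_std=200 and enlarged by scale_factor=1.25: scale == [1.40625, 1.875]
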
def get_dir(src_point, rot_rad):
    sn, cs = np.sin(rot_rad), np.cos(rot_rad)

    src_result = [0, 0]
    src_result[0] = src_point[0] * cs - src_point[1] * sn
    src_result[1] = src_point[0] * sn + src_point[1] * cs

    return src_result

def get_3rd_point(a, b):
    direct = a - b
    return b + np.array([-direct[1], direct[0]], dtype=np.float32)


def get_affine_transform(
        center, scale, rot, output_size,
        shift=np.array([0, 0], dtype=np.float32), inv=0
):
    if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
        print(scale)
        scale = np.array([scale, scale])

    scale_tmp = scale * 200.0
    src_w = scale_tmp[0]
    dst_w = output_size[0]
    dst_h = output_size[1]

    rot_rad = np.pi * rot / 180
    src_dir = get_dir([0, src_w * -0.5], rot_rad)
    dst_dir = np.array([0, dst_w * -0.5], np.float32)

    src = np.zeros((3, 2), dtype=np.float32)
    dst = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + scale_tmp * shift
    src[1, :] = center + src_dir + scale_tmp * shift
    dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
    dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir

    src[2:, :] = get_3rd_point(src[0, :], src[1, :])
    dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])

    if inv:
        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
    else:
        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))

    return trans

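# --- Illustrative usage (not part of the original commit) ---
# The 2x3 matrix maps the (possibly rotated) person box onto the network input;
# inv=1 returns the reverse mapping used by transform_preds below:
#   trans = get_affine_transform(center, scale, rot=0, output_size=(288, 384))
#   crop = cv2.warpAffine(image, trans, (288, 384), flags=cv2.INTER_LINEAR)
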
def get_max_preds(batch_heatmaps):
    '''
    get predictions from score maps
    heatmaps: numpy.ndarray([batch_size, num_joints, height, width])
    '''
    assert isinstance(batch_heatmaps, np.ndarray), \
        'batch_heatmaps should be numpy.ndarray'
    assert batch_heatmaps.ndim == 4, 'batch_heatmaps should be 4-ndim'

    batch_size = batch_heatmaps.shape[0]
    num_joints = batch_heatmaps.shape[1]
    width = batch_heatmaps.shape[3]
    heatmaps_reshaped = batch_heatmaps.reshape((batch_size, num_joints, -1))
    idx = np.argmax(heatmaps_reshaped, 2)
    maxvals = np.amax(heatmaps_reshaped, 2)

    maxvals = maxvals.reshape((batch_size, num_joints, 1))
    idx = idx.reshape((batch_size, num_joints, 1))

    preds = np.tile(idx, (1, 1, 2)).astype(np.float32)

    preds[:, :, 0] = (preds[:, :, 0]) % width
    preds[:, :, 1] = np.floor((preds[:, :, 1]) / width)

    pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2))
    pred_mask = pred_mask.astype(np.float32)

    preds *= pred_mask
    return preds, maxvals

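# --- Illustrative note (not part of the original commit) ---
# The argmax over the flattened heatmap is decoded back into (x, y):
#   hm = np.zeros((1, 1, 4, 5)); hm[0, 0, 2, 3] = 1.0
#   preds, maxvals = get_max_preds(hm)
#   preds[0, 0]  # array([3., 2.]): x = idx % width, y = idx // width
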
def affine_transform(pt, t):
    new_pt = np.array([pt[0], pt[1], 1.]).T
    new_pt = np.dot(t, new_pt)
    return new_pt[:2]

def batch_affine_transform(points, trans):
    # append the homogeneous coordinate, then apply the 2x3 affine matrix
    points = np.hstack((points[:, :2], np.ones((points.shape[0], 1))))
    out = points @ trans.T
    return out

def transform_preds(coords, center, scale, rot, output_size):
    target_coords = np.zeros(coords.shape)
    trans = get_affine_transform(center, scale, rot, output_size, inv=1)
    target_coords[:, :2] = batch_affine_transform(coords, trans)
    return target_coords

config_ = {'kintree': [[1, 0], [2, 0], [3, 1], [4, 2], [5, 0], [6, 0], [7, 5], [8, 6], [9, 7], [10, 8], [11, 5], [12, 6], [13, 11], [
    14, 12], [15, 13], [16, 14], [6, 5], [12, 11]], 'color': ['g', 'r', 'g', 'r', 'g', 'r', 'g', 'r', 'g', 'r', 'g', 'r', 'g', 'r', 'g', 'r', 'k', 'k']}
colors_table = {
    # colorblind/print/copy safe:
    '_blue': [0.65098039, 0.74117647, 0.85882353],
    '_pink': [.9, .7, .7],
    '_mint': [166/255., 229/255., 204/255.],
    '_mint2': [202/255., 229/255., 223/255.],
    '_green': [153/255., 216/255., 201/255.],
    '_green2': [171/255., 221/255., 164/255.],
    '_red': [251/255., 128/255., 114/255.],
    '_orange': [253/255., 174/255., 97/255.],
    '_yellow': [250/255., 230/255., 154/255.],
    'r': [255/255, 0, 0],
    'g': [0, 255/255, 0],
    'b': [0, 0, 255/255],
    'k': [0, 0, 0],
    'y': [255/255, 255/255, 0],
    'purple': [128/255, 0, 128/255]
}
# convert RGB in [0, 1] to BGR tuples in [0, 255] for OpenCV
for key, val in colors_table.items():
    colors_table[key] = tuple([int(val[2]*255), int(val[1]*255), int(val[0]*255)])

def save_batch_heatmaps(batch_image, batch_heatmaps, file_name,
                        normalize=True):
    '''
    batch_image: [batch_size, channel, height, width]
    batch_heatmaps: [batch_size, num_joints, height, width]
    file_name: saved file name
    '''
    if normalize:
        batch_image = batch_image.clone()
        min = float(batch_image.min())
        max = float(batch_image.max())

        batch_image.add_(-min).div_(max - min + 1e-5)

    batch_size = batch_heatmaps.size(0)
    num_joints = batch_heatmaps.size(1)
    heatmap_height = batch_heatmaps.size(2)
    heatmap_width = batch_heatmaps.size(3)

    grid_image = np.zeros((batch_size*heatmap_height,
                           (num_joints+2)*heatmap_width,
                           3),
                          dtype=np.uint8)

    preds, maxvals = get_max_preds(batch_heatmaps.detach().cpu().numpy())

    for i in range(batch_size):
        image = batch_image[i].mul(255)\
                              .clamp(0, 255)\
                              .byte()\
                              .permute(1, 2, 0)\
                              .cpu().numpy()
        heatmaps = batch_heatmaps[i].mul(255)\
                                    .clamp(0, 255)\
                                    .byte()\
                                    .cpu().numpy()

        resized_image = cv2.resize(image,
                                   (int(heatmap_width), int(heatmap_height)))
        resized_image_copy = resized_image.copy()
        height_begin = heatmap_height * i
        height_end = heatmap_height * (i + 1)
        for ip in range(len(config_['kintree'])):
            src, dst = config_['kintree'][ip]
            c = config_['color'][ip]
            if maxvals[i][src] < 0.1 or maxvals[i][dst] < 0.1:
                continue
            # fixed: the original called an undefined plot_line helper; cv2.line is used instead
            cv2.line(resized_image_copy,
                     (int(preds[i][src][0]), int(preds[i][src][1])),
                     (int(preds[i][dst][0]), int(preds[i][dst][1])),
                     colors_table[c], 1)
        for j in range(num_joints):
            cv2.circle(resized_image,
                       (int(preds[i][j][0]), int(preds[i][j][1])),
                       1, [0, 0, 255], 1)
            heatmap = heatmaps[j, :, :]
            mask = (heatmap > 0.1)[:, :, None]
            colored_heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
            masked_image = (colored_heatmap*0.7 + resized_image*0.3)*mask + resized_image*(1-mask)
            cv2.circle(masked_image,
                       (int(preds[i][j][0]), int(preds[i][j][1])),
                       1, [0, 0, 255], 1)

            width_begin = heatmap_width * (j+2)
            width_end = heatmap_width * (j+2+1)
            grid_image[height_begin:height_end, width_begin:width_end, :] = \
                masked_image
            # grid_image[height_begin:height_end, width_begin:width_end, :] = \
            #     colored_heatmap*0.7 + resized_image*0.3

        grid_image[height_begin:height_end, 0:heatmap_width, :] = resized_image
        grid_image[height_begin:height_end, heatmap_width:heatmap_width+heatmap_width, :] = resized_image_copy
    cv2.imwrite(file_name, grid_image)

import math

def get_final_preds(batch_heatmaps, center, scale, rot=None, flip=None):
    coords, maxvals = get_max_preds(batch_heatmaps)

    heatmap_height = batch_heatmaps.shape[2]
    heatmap_width = batch_heatmaps.shape[3]

    # post-processing: shift each maximum by 0.25 pixel towards the higher neighbour
    if True:
        for n in range(coords.shape[0]):
            for p in range(coords.shape[1]):
                hm = batch_heatmaps[n][p]
                px = int(math.floor(coords[n][p][0] + 0.5))
                py = int(math.floor(coords[n][p][1] + 0.5))
                if 1 < px < heatmap_width-1 and 1 < py < heatmap_height-1:
                    diff = np.array(
                        [
                            hm[py][px+1] - hm[py][px-1],
                            hm[py+1][px] - hm[py-1][px]
                        ]
                    )
                    coords[n][p] += np.sign(diff) * .25

    preds = coords.copy()

    # Transform back to the original image coordinates
    for i in range(coords.shape[0]):
        if flip is not None:
            if flip[i]:
                coords[i, :, 0] = heatmap_width - 1 - coords[i, :, 0]
        if rot is None:
            _rot = 0
        else:
            _rot = rot[i]
        preds[i] = transform_preds(
            coords[i], center[i], scale[i], _rot, [heatmap_width, heatmap_height]
        )
    return preds, maxvals

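# --- Illustrative note (not part of the original commit) ---
# get_final_preds returns joints in the ORIGINAL image frame: the quarter-pixel
# refinement runs on the 1/4-resolution heatmaps, then transform_preds applies
# the inverse of the crop transform built from each person's (center, scale).
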
def get_gaussian_maps(net_out, keypoints, sigma):
    radius, kernel = gauss[sigma]['radius'], gauss[sigma]['kernel']
    weights = np.ones(net_out.shape, dtype=np.float32)
    for i in range(weights.shape[0]):
        for nj in range(weights.shape[1]):
            if keypoints[i][nj][2] < 0:
                weights[i][nj] = 0
                continue
            elif keypoints[i][nj][2] < 0.01:
                weights[i][nj] = 0
                continue
            weights[i][nj] = 0
            mu_x, mu_y = keypoints[i][nj][:2]
            mu_x, mu_y = int(mu_x + 0.5), int(mu_y + 0.5)
            # Usable gaussian range
            ul = [mu_x - radius, mu_y - radius]
            br = [mu_x + radius + 1, mu_y + radius + 1]
            g_x = max(0, -ul[0]), min(br[0], weights.shape[3]) - ul[0]
            g_y = max(0, -ul[1]), min(br[1], weights.shape[2]) - ul[1]
            # Image range
            img_x = max(0, ul[0]), min(br[0], weights.shape[3])
            img_y = max(0, ul[1]), min(br[1], weights.shape[2])
            weights[i][nj][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \
                kernel[g_y[0]:g_y[1], g_x[0]:g_x[1]]
    return weights


humanId = 0

class SimpleHRNet:
    def __init__(self, c, nof_joints, checkpoint_path, device, resolution=(288, 384)):
        self.device = device
        self.c = c
        self.nof_joints = nof_joints
        self.checkpoint_path = checkpoint_path
        self.max_batch_size = 64
        self.resolution = resolution  # (width, height); note: the original implementation stores (height, width)
        self.transform = transforms.Compose([
            # transforms.ToPILImage(),
            # transforms.Resize((self.resolution[0], self.resolution[1])),  # (height, width)
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
        self.model = HRNet(c=c, nof_joints=nof_joints).to(device)
        self.model.load_state_dict(torch.load(checkpoint_path, map_location=self.device))
        self.model.eval()

    def __call__(self, image, bboxes, rot=0, net_out=False):
        # image: full frame; bboxes: one (x1, y1, x2, y2) box per person
        images = torch.zeros((len(bboxes), 3, self.resolution[1], self.resolution[0]), device=self.device)  # (height, width)
        if len(bboxes) > 0:
            # pose estimation : for multiple people
            centers, scales, trans_all = [], [], []
            for box in bboxes:
                center, scale = box_to_center_scale(box, self.resolution[0], self.resolution[1])
                centers.append(center)
                scales.append(scale)
                trans = get_affine_transform(center, scale, rot=rot, output_size=self.resolution)
                trans_all.append(trans)
            for i, trans in enumerate(trans_all):
                # Crop a smaller image around each person
                model_input = cv2.warpAffine(
                    image, trans,
                    (int(self.resolution[0]), int(self.resolution[1])),
                    flags=cv2.INTER_LINEAR)
                # cv2.imshow('input', model_input)
                # cv2.waitKey(0)
                # hwc -> 1chw
                model_input = self.transform(model_input)  # .unsqueeze(0)
                images[i] = model_input
            images = images.to(self.device)
            with torch.no_grad():
                out = self.model(images)
            out = out.cpu().detach().numpy()
            if net_out:
                return out, trans_all, centers, scales, rot
            coords, max_val = get_final_preds(
                out,
                np.asarray(centers),
                np.asarray(scales),
                [rot for _ in range(out.shape[0])])
            pts = np.concatenate((coords, max_val), axis=2)
            return coco17tobody25(pts)
        else:
            return np.empty((0, 25, 3))  # fixed: the shape must be passed as a tuple

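    # --- Illustrative usage (not part of the original commit) ---
    # A minimal sketch, assuming an HRNet-W48 checkpoint; the weight file path
    # below is hypothetical:
    #   est = SimpleHRNet(48, 17, 'data/models/pose_hrnet_w48_384x288.pth', torch.device('cpu'))
    #   frame = cv2.imread('frame.jpg')
    #   bboxes = [[100, 100, 300, 500]]   # one person, (x1, y1, x2, y2)
    #   kpts = est(frame, bboxes)         # -> (1, 25, 3) BODY_25 keypoints
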
    def predict_with_previous(self, image, bboxes, keypoints, sigma):
        # net_out: (batch, nJoints, height, width)
        net_out, trans_all, centers, scales, rot = self.__call__(image, bboxes, net_out=True)
        keypoints = keypoints[:, COCO17_IN_BODY25]
        keypoints_rescale = keypoints.copy()
        for i in range(keypoints.shape[0]):
            # fixed: write into row i instead of overwriting the whole array each iteration
            keypoints_rescale[i, :, :2] = batch_affine_transform(keypoints[i], trans_all[i])/4
        weights = get_gaussian_maps(net_out, keypoints_rescale, sigma)
        out = net_out * weights
        coords, max_val = get_final_preds(
            out,
            np.asarray(centers),
            np.asarray(scales),
            # fixed: get_final_preds indexes rot per person, so expand the scalar
            [rot for _ in range(out.shape[0])])
        pts = np.concatenate((coords, max_val), axis=2)
        return coco17tobody25(pts)

    def predict(self, image, detections, keypoints=None, ret_crop=False):
        if keypoints is not None:
            keypoints = keypoints[:, COCO17_IN_BODY25]
            kpts_rescale = [None for _ in range(len(keypoints))]
        boxes = []
        rotation = 0
        image_pose = image
        # image_pose = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if detections is None:
            # fixed: `images` below would otherwise be undefined
            return np.empty((0, 25, 3))
        images = torch.zeros((len(detections), 3, self.resolution[1], self.resolution[0]), device=self.device)  # (height, width)
        # pose estimation : for multiple people
        centers = []
        scales = []
        for box in detections:
            center, scale = box_to_center_scale(box, self.resolution[0], self.resolution[1])
            centers.append(center)
            scales.append(scale)
        model_inputs = []
        for i, (center, scale) in enumerate(zip(centers, scales)):
            trans = get_affine_transform(center, scale, rotation, self.resolution)
            # Crop a smaller image around each person
            model_input = cv2.warpAffine(
                image_pose,
                trans,
                (int(self.resolution[0]), int(self.resolution[1])),
                flags=cv2.INTER_LINEAR)
            if keypoints is not None:
                kpts_homo = keypoints[i].copy()
                kpts_homo[:, 2] = 1
                kpts_rescale[i] = (kpts_homo @ trans.T)/4
            # global humanId
            # cv2.imwrite('../output/debughrnet/person_{}.jpg'.format(humanId), model_input[:,:,[2,1,0]])
            # humanId += 1
            # hwc -> 1chw
            model_input = self.transform(model_input)  # .unsqueeze(0)
            images[i] = model_input
        # torch.cuda.synchronize(self.device)

        # print(' - spending {:.2f}ms in preprocess.'.format(1000*(time.time() - start)))
        if images.shape[0] == 0:
            return np.empty((0, 25, 3))
        else:
            # start = time.time()
            images = images.to(self.device)
            # torch.cuda.synchronize(self.device)

            # print(' - spending {:.2f}ms in copy to cuda.'.format(1000*(time.time() - start)))
            # start = time.time()
            with torch.no_grad():
                if len(images) <= self.max_batch_size:
                    out = self.model(images)
                else:
                    out = torch.empty(
                        (images.shape[0], self.nof_joints, self.resolution[1] // 4, self.resolution[0] // 4)
                    ).to(self.device)
                    for i in range(0, len(images), self.max_batch_size):
                        out[i:i + self.max_batch_size] = self.model(images[i:i + self.max_batch_size])
            # torch.cuda.synchronize(self.device)
            global humanId
            if keypoints is not None:
                filename = join('../output/debughrnet', '{:06d}.jpg'.format(humanId))
                humanId += 1
                # save_batch_heatmaps(images, out, filename)
                # build the Gaussian weight maps (the default weight is 1)
                weights = np.ones(out.shape, dtype=np.float32)
                for i in range(weights.shape[0]):
                    for nj in range(weights.shape[1]):
                        if keypoints[i][nj][2] < 0:
                            weights[i][nj] = 0
                            continue
                        elif keypoints[i][nj][2] < 0.01:
                            continue
                        weights[i][nj] = 0
                        mu_x, mu_y = kpts_rescale[i][nj]
                        mu_x, mu_y = int(mu_x + 0.5), int(mu_y + 0.5)
                        # Usable gaussian range
                        ul = [mu_x - gauss_radius, mu_y - gauss_radius]
                        br = [mu_x + gauss_radius + 1, mu_y + gauss_radius + 1]
                        g_x = max(0, -ul[0]), min(br[0], weights.shape[3]) - ul[0]
                        g_y = max(0, -ul[1]), min(br[1], weights.shape[2]) - ul[1]
                        # Image range
                        img_x = max(0, ul[0]), min(br[0], weights.shape[3])
                        img_y = max(0, ul[1]), min(br[1], weights.shape[2])
                        weights[i][nj][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \
                            gauss_kernel[g_y[0]:g_y[1], g_x[0]:g_x[1]]
                filename = join('../output/debughrnet', '{:06d}.jpg'.format(humanId))
                humanId += 1
                # save_batch_heatmaps(images, torch.Tensor(weights), filename)
                out = out.cpu().detach().numpy()
                out = out * weights
                filename = join('../output/debughrnet', '{:06d}.jpg'.format(humanId))
                humanId += 1
                # save_batch_heatmaps(images, torch.Tensor(out), filename)
            else:
                out = out.cpu().detach().numpy()
            coords, max_val = get_final_preds(
                out,
                np.asarray(centers),
                np.asarray(scales))
            pts = np.concatenate((coords, max_val), axis=2)
            # torch.cuda.synchronize(self.device)
            # print(' - spending {:.2f}ms in postprocess.'.format(1000*(time.time() - start)))
            # print('')
            if ret_crop:
                return coco17tobody25(pts), images
            else:
                return coco17tobody25(pts)
72 easymocap/estimator/HRNet/modules.py Normal file
@@ -0,0 +1,72 @@
import torch
from torch import nn


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, bn_momentum=0.1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes, momentum=bn_momentum)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes, momentum=bn_momentum)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion, momentum=bn_momentum)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, bn_momentum=0.1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes, momentum=bn_momentum)
        self.relu = nn.ReLU(inplace=True)
        # fixed: conv2 consumes the output of conv1, so its input has `planes` channels
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes, momentum=bn_momentum)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out
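# --- Illustrative note (not part of the original commit) ---
# Both blocks preserve spatial size when stride=1; Bottleneck expands channels 4x:
#   blk = Bottleneck(64, 64, downsample=nn.Sequential(
#       nn.Conv2d(64, 256, kernel_size=1, bias=False), nn.BatchNorm2d(256)))
#   blk(torch.zeros(1, 64, 32, 32)).shape  # torch.Size([1, 256, 32, 32])
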
8 easymocap/estimator/YOLOv4/__init__.py Normal file
@@ -0,0 +1,8 @@
'''
@ Date: 2020-12-10 16:37:04
@ Author: Qing Shuai
@ LastEditors: Qing Shuai
@ LastEditTime: 2020-12-10 16:52:06
@ FilePath: /mvpose/code/estimator/YOLOv4/__init__.py
'''
from .yolo import YOLOv4
80 easymocap/estimator/YOLOv4/coco.names Normal file
@@ -0,0 +1,80 @@
person
bicycle
car
motorbike
aeroplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
sofa
pottedplant
bed
diningtable
toilet
tvmonitor
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
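# --- Illustrative note (not part of the original commit) ---
# The detector reports class indices into this list; "person" is index 0, which
# is typically the only class a mocap pipeline keeps. A minimal loader sketch:
#   with open('easymocap/estimator/YOLOv4/coco.names') as f:
#       class_names = [line.strip() for line in f if line.strip()]
#   class_names[0]  # 'person'
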
257 easymocap/estimator/YOLOv4/config.py Normal file
@@ -0,0 +1,257 @@
import sys  # used by the route-error message in print_cfg below
import torch
from .torch_utils import convert2cpu


def parse_cfg(cfgfile):
    blocks = []
    fp = open(cfgfile, 'r')
    block = None
    line = fp.readline()
    while line != '':
        line = line.rstrip()
        if line == '' or line[0] == '#':
            line = fp.readline()
            continue
        elif line[0] == '[':
            if block:
                blocks.append(block)
            block = dict()
            block['type'] = line.lstrip('[').rstrip(']')
            # set default value
            if block['type'] == 'convolutional':
                block['batch_normalize'] = 0
        else:
            key, value = line.split('=')
            key = key.strip()
            if key == 'type':
                key = '_type'
            value = value.strip()
            block[key] = value
        line = fp.readline()

    if block:
        blocks.append(block)
    fp.close()
    return blocks

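# --- Illustrative usage (not part of the original commit) ---
# parse_cfg turns a darknet .cfg into a list of dicts, one per section; e.g. for
#   [net]
#   width=416
#   height=416
#   [convolutional]
#   filters=32
# it returns:
#   [{'type': 'net', 'width': '416', 'height': '416'},
#    {'type': 'convolutional', 'batch_normalize': 0, 'filters': '32'}]
# (values are kept as strings; callers int() them as needed)
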
def print_cfg(blocks):
    print('layer     filters    size              input                output')
    prev_width = 416
    prev_height = 416
    prev_filters = 3
    out_filters = []
    out_widths = []
    out_heights = []
    ind = -2
    for block in blocks:
        ind = ind + 1
        if block['type'] == 'net':
            prev_width = int(block['width'])
            prev_height = int(block['height'])
            continue
        elif block['type'] == 'convolutional':
            filters = int(block['filters'])
            kernel_size = int(block['size'])
            stride = int(block['stride'])
            is_pad = int(block['pad'])
            pad = (kernel_size - 1) // 2 if is_pad else 0
            width = (prev_width + 2 * pad - kernel_size) // stride + 1
            height = (prev_height + 2 * pad - kernel_size) // stride + 1
            print('%5d %-6s %4d  %d x %d / %d   %3d x %3d x%4d   ->   %3d x %3d x%4d' % (
                ind, 'conv', filters, kernel_size, kernel_size, stride, prev_width, prev_height, prev_filters, width,
                height, filters))
            prev_width = width
            prev_height = height
            prev_filters = filters
            out_widths.append(prev_width)
            out_heights.append(prev_height)
            out_filters.append(prev_filters)
        elif block['type'] == 'maxpool':
            pool_size = int(block['size'])
            stride = int(block['stride'])
            width = prev_width // stride
            height = prev_height // stride
            # fixed: pooling keeps the channel count, so print prev_filters
            # (the original referenced an undefined `filters` here)
            print('%5d %-6s       %d x %d / %d   %3d x %3d x%4d   ->   %3d x %3d x%4d' % (
                ind, 'max', pool_size, pool_size, stride, prev_width, prev_height, prev_filters, width, height,
                prev_filters))
            prev_width = width
            prev_height = height
            out_widths.append(prev_width)
            out_heights.append(prev_height)
            out_filters.append(prev_filters)
        elif block['type'] == 'avgpool':
            width = 1
            height = 1
            print('%5d %-6s                   %3d x %3d x%4d   ->  %3d' % (
                ind, 'avg', prev_width, prev_height, prev_filters, prev_filters))
            prev_width = width
            prev_height = height
            # channel count is unchanged by global average pooling
            out_widths.append(prev_width)
            out_heights.append(prev_height)
            out_filters.append(prev_filters)
        elif block['type'] == 'softmax':
            print('%5d %-6s                                    ->  %3d' % (ind, 'softmax', prev_filters))
            out_widths.append(prev_width)
            out_heights.append(prev_height)
            out_filters.append(prev_filters)
        elif block['type'] == 'cost':
            print('%5d %-6s                                    ->  %3d' % (ind, 'cost', prev_filters))
            out_widths.append(prev_width)
            out_heights.append(prev_height)
            out_filters.append(prev_filters)
        elif block['type'] == 'reorg':
            stride = int(block['stride'])
            filters = stride * stride * prev_filters
            width = prev_width // stride
            height = prev_height // stride
            print('%5d %-6s             / %d   %3d x %3d x%4d   ->   %3d x %3d x%4d' % (
                ind, 'reorg', stride, prev_width, prev_height, prev_filters, width, height, filters))
            prev_width = width
            prev_height = height
            prev_filters = filters
            out_widths.append(prev_width)
            out_heights.append(prev_height)
            out_filters.append(prev_filters)
        elif block['type'] == 'upsample':
            stride = int(block['stride'])
            filters = prev_filters
            width = prev_width * stride
            height = prev_height * stride
            print('%5d %-6s             * %d   %3d x %3d x%4d   ->   %3d x %3d x%4d' % (
                ind, 'upsample', stride, prev_width, prev_height, prev_filters, width, height, filters))
            prev_width = width
            prev_height = height
            prev_filters = filters
            out_widths.append(prev_width)
            out_heights.append(prev_height)
            out_filters.append(prev_filters)
        elif block['type'] == 'route':
            layers = block['layers'].split(',')
            layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers]
            if len(layers) == 1:
                print('%5d %-6s %d' % (ind, 'route', layers[0]))
                prev_width = out_widths[layers[0]]
                prev_height = out_heights[layers[0]]
                prev_filters = out_filters[layers[0]]
            elif len(layers) == 2:
                print('%5d %-6s %d %d' % (ind, 'route', layers[0], layers[1]))
                prev_width = out_widths[layers[0]]
                prev_height = out_heights[layers[0]]
                assert (prev_width == out_widths[layers[1]])
                assert (prev_height == out_heights[layers[1]])
                prev_filters = out_filters[layers[0]] + out_filters[layers[1]]
            elif len(layers) == 4:
                print('%5d %-6s %d %d %d %d' % (ind, 'route', layers[0], layers[1], layers[2], layers[3]))
                prev_width = out_widths[layers[0]]
                prev_height = out_heights[layers[0]]
                assert (prev_width == out_widths[layers[1]] == out_widths[layers[2]] == out_widths[layers[3]])
                assert (prev_height == out_heights[layers[1]] == out_heights[layers[2]] == out_heights[layers[3]])
                prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + out_filters[layers[2]] + out_filters[
                    layers[3]]
            else:
                print("route error !!! {} {} {}".format(sys._getframe().f_code.co_filename,
                                                        sys._getframe().f_code.co_name, sys._getframe().f_lineno))

            out_widths.append(prev_width)
            out_heights.append(prev_height)
            out_filters.append(prev_filters)
        elif block['type'] in ['region', 'yolo']:
            print('%5d %-6s' % (ind, 'detection'))
            out_widths.append(prev_width)
            out_heights.append(prev_height)
            out_filters.append(prev_filters)
        elif block['type'] == 'shortcut':
            from_id = int(block['from'])
            from_id = from_id if from_id > 0 else from_id + ind
            print('%5d %-6s %d' % (ind, 'shortcut', from_id))
            prev_width = out_widths[from_id]
            prev_height = out_heights[from_id]
            prev_filters = out_filters[from_id]
            out_widths.append(prev_width)
            out_heights.append(prev_height)
            out_filters.append(prev_filters)
        elif block['type'] == 'connected':
            filters = int(block['output'])
            print('%5d %-6s                            %d  ->  %3d' % (ind, 'connected', prev_filters, filters))
            prev_filters = filters
            out_widths.append(1)
            out_heights.append(1)
            out_filters.append(prev_filters)
        else:
            print('unknown type %s' % (block['type']))

def load_conv(buf, start, conv_model):
    num_w = conv_model.weight.numel()
    num_b = conv_model.bias.numel()
    conv_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b]))
    start = start + num_b
    conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape))
    start = start + num_w
    return start


def save_conv(fp, conv_model):
    if conv_model.bias.is_cuda:
        convert2cpu(conv_model.bias.data).numpy().tofile(fp)
        convert2cpu(conv_model.weight.data).numpy().tofile(fp)
    else:
        conv_model.bias.data.numpy().tofile(fp)
        conv_model.weight.data.numpy().tofile(fp)


def load_conv_bn(buf, start, conv_model, bn_model):
    num_w = conv_model.weight.numel()
    num_b = bn_model.bias.numel()
    bn_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b]))
    start = start + num_b
    bn_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_b]))
    start = start + num_b
    bn_model.running_mean.copy_(torch.from_numpy(buf[start:start + num_b]))
    start = start + num_b
    bn_model.running_var.copy_(torch.from_numpy(buf[start:start + num_b]))
    start = start + num_b
    conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape))
    start = start + num_w
    return start
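# --- Illustrative note (not part of the original commit) ---
# Darknet .weights files are flat float32 buffers; for a conv followed by batch
# norm the layout is: bn bias, bn weight, bn running_mean, bn running_var, then
# the conv weights, which is exactly the order load_conv_bn consumes:
#   buf = np.fromfile('yolov4.weights', dtype=np.float32)  # after skipping the header ints
#   start = load_conv_bn(buf, start, conv, bn)             # returns the new offset
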

def save_conv_bn(fp, conv_model, bn_model):
    if bn_model.bias.is_cuda:
        convert2cpu(bn_model.bias.data).numpy().tofile(fp)
        convert2cpu(bn_model.weight.data).numpy().tofile(fp)
        convert2cpu(bn_model.running_mean).numpy().tofile(fp)
        convert2cpu(bn_model.running_var).numpy().tofile(fp)
        convert2cpu(conv_model.weight.data).numpy().tofile(fp)
    else:
        bn_model.bias.data.numpy().tofile(fp)
        bn_model.weight.data.numpy().tofile(fp)
        bn_model.running_mean.numpy().tofile(fp)
        bn_model.running_var.numpy().tofile(fp)
        conv_model.weight.data.numpy().tofile(fp)


def load_fc(buf, start, fc_model):
    num_w = fc_model.weight.numel()
    num_b = fc_model.bias.numel()
    fc_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b]))
    start = start + num_b
    fc_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]))
    start = start + num_w
    return start


def save_fc(fp, fc_model):
    fc_model.bias.data.numpy().tofile(fp)
    fc_model.weight.data.numpy().tofile(fp)

if __name__ == '__main__':
    # sys is imported at module level above
    blocks = parse_cfg('cfg/yolo.cfg')
    if len(sys.argv) == 2:
        blocks = parse_cfg(sys.argv[1])
    print_cfg(blocks)
515 easymocap/estimator/YOLOv4/darknet2pytorch.py Normal file
@@ -0,0 +1,515 @@
import torch  # fixed: used directly below (torch.tanh, torch.cat, ...)
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from .region_loss import RegionLoss
from .yolo_layer import YoloLayer
from .config import *
from .torch_utils import *


class Mish(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        x = x * (torch.tanh(torch.nn.functional.softplus(x)))
        return x

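# --- Illustrative note (not part of the original commit) ---
# Mish is the smooth activation mish(x) = x * tanh(softplus(x)); e.g.
#   Mish()(torch.tensor([0.0, 1.0]))  # tensor([0.0000, 0.8651])
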
class MaxPoolDark(nn.Module):
    def __init__(self, size=2, stride=1):
        super(MaxPoolDark, self).__init__()
        self.size = size
        self.stride = stride

    def forward(self, x):
        '''
        darknet output_size = (input_size + p - k) / s + 1
        p : padding = k - 1
        k : size
        s : stride
        torch output_size = (input_size + 2*p - k) / s + 1
        p : padding = k//2
        '''
        p = self.size // 2
        if ((x.shape[2] - 1) // self.stride) != ((x.shape[2] + 2 * p - self.size) // self.stride):
            padding1 = (self.size - 1) // 2
            padding2 = padding1 + 1
        else:
            padding1 = (self.size - 1) // 2
            padding2 = padding1
        if ((x.shape[3] - 1) // self.stride) != ((x.shape[3] + 2 * p - self.size) // self.stride):
            padding3 = (self.size - 1) // 2
            padding4 = padding3 + 1
        else:
            padding3 = (self.size - 1) // 2
            padding4 = padding3
        x = F.max_pool2d(F.pad(x, (padding3, padding4, padding1, padding2), mode='replicate'),
                         self.size, stride=self.stride)
        return x


class Upsample_expand(nn.Module):
    def __init__(self, stride=2):
        super(Upsample_expand, self).__init__()
        self.stride = stride

    def forward(self, x):
        assert (x.data.dim() == 4)

        x = x.view(x.size(0), x.size(1), x.size(2), 1, x.size(3), 1).\
            expand(x.size(0), x.size(1), x.size(2), self.stride, x.size(3), self.stride).contiguous().\
            view(x.size(0), x.size(1), x.size(2) * self.stride, x.size(3) * self.stride)

        return x


class Upsample_interpolate(nn.Module):
    def __init__(self, stride):
        super(Upsample_interpolate, self).__init__()
        self.stride = stride

    def forward(self, x):
        assert (x.data.dim() == 4)

        out = F.interpolate(x, size=(x.size(2) * self.stride, x.size(3) * self.stride), mode='nearest')
        return out

class Reorg(nn.Module):
    def __init__(self, stride=2):
        super(Reorg, self).__init__()
        self.stride = stride

    def forward(self, x):
        stride = self.stride
        assert (x.data.dim() == 4)
        B = x.data.size(0)
        C = x.data.size(1)
        H = x.data.size(2)
        W = x.data.size(3)
        assert (H % stride == 0)
        assert (W % stride == 0)
        ws = stride
        hs = stride
        # fixed: use integer division; float sizes raise a TypeError in view()
        x = x.view(B, C, H // hs, hs, W // ws, ws).transpose(3, 4).contiguous()
        x = x.view(B, C, H // hs * W // ws, hs * ws).transpose(2, 3).contiguous()
        x = x.view(B, C, hs * ws, H // hs, W // ws).transpose(1, 2).contiguous()
        x = x.view(B, hs * ws * C, H // hs, W // ws)
        return x

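# --- Illustrative note (not part of the original commit) ---
# Reorg trades resolution for channels, e.g. with the default stride of 2:
#   Reorg()(torch.zeros(1, 64, 26, 26)).shape  # torch.Size([1, 256, 13, 13])
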
class GlobalAvgPool2d(nn.Module):
    def __init__(self):
        super(GlobalAvgPool2d, self).__init__()

    def forward(self, x):
        N = x.data.size(0)
        C = x.data.size(1)
        H = x.data.size(2)
        W = x.data.size(3)
        x = F.avg_pool2d(x, (H, W))
        x = x.view(N, C)
        return x


# for route and shortcut
class EmptyModule(nn.Module):
    def __init__(self):
        super(EmptyModule, self).__init__()

    def forward(self, x):
        return x

# supports route, shortcut and reorg
class Darknet(nn.Module):
    def __init__(self, cfgfile, inference=False):
        super(Darknet, self).__init__()
        self.inference = inference
        self.training = not self.inference

        self.blocks = parse_cfg(cfgfile)
        self.width = int(self.blocks[0]['width'])
        self.height = int(self.blocks[0]['height'])

        self.models = self.create_network(self.blocks)  # merge conv, bn, leaky
        self.loss = self.models[len(self.models) - 1]

        if self.blocks[(len(self.blocks) - 1)]['type'] == 'region':
            self.anchors = self.loss.anchors
            self.num_anchors = self.loss.num_anchors
            self.anchor_step = self.loss.anchor_step
            self.num_classes = self.loss.num_classes

        self.header = torch.IntTensor([0, 0, 0, 0])
        self.seen = 0

    def forward(self, x):
        ind = -2
        self.loss = None
        outputs = dict()
        out_boxes = []
        for block in self.blocks:
            ind = ind + 1
            # if ind > 0:
            #    return x

            if block['type'] == 'net':
                continue
            elif block['type'] in ['convolutional', 'maxpool', 'reorg', 'upsample', 'avgpool', 'softmax', 'connected']:
                x = self.models[ind](x)
                outputs[ind] = x
            elif block['type'] == 'route':
                layers = block['layers'].split(',')
                layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers]
                if len(layers) == 1:
                    if 'groups' not in block.keys() or int(block['groups']) == 1:
                        x = outputs[layers[0]]
                        outputs[ind] = x
                    else:
                        groups = int(block['groups'])
                        group_id = int(block['group_id'])
                        _, b, _, _ = outputs[layers[0]].shape
                        x = outputs[layers[0]][:, b // groups * group_id:b // groups * (group_id + 1)]
                        outputs[ind] = x
                elif len(layers) == 2:
                    x1 = outputs[layers[0]]
                    x2 = outputs[layers[1]]
                    x = torch.cat((x1, x2), 1)
                    outputs[ind] = x
                elif len(layers) == 4:
                    x1 = outputs[layers[0]]
                    x2 = outputs[layers[1]]
                    x3 = outputs[layers[2]]
                    x4 = outputs[layers[3]]
                    x = torch.cat((x1, x2, x3, x4), 1)
                    outputs[ind] = x
                else:
                    print("route number > 2, is {}".format(len(layers)))

            elif block['type'] == 'shortcut':
                from_layer = int(block['from'])
                activation = block['activation']
                from_layer = from_layer if from_layer > 0 else from_layer + ind
                x1 = outputs[from_layer]
                x2 = outputs[ind - 1]
                x = x1 + x2
                if activation == 'leaky':
                    x = F.leaky_relu(x, 0.1, inplace=True)
                elif activation == 'relu':
                    x = F.relu(x, inplace=True)
                outputs[ind] = x
            elif block['type'] == 'region':
                continue
                # NOTE: unreachable after `continue`; kept from the original region-loss path
                if self.loss:
                    self.loss = self.loss + self.models[ind](x)
                else:
                    self.loss = self.models[ind](x)
                outputs[ind] = None
            elif block['type'] == 'yolo':
                # if self.training:
                #     pass
                # else:
                #     boxes = self.models[ind](x)
                #     out_boxes.append(boxes)
                boxes = self.models[ind](x)
                out_boxes.append(boxes)
            elif block['type'] == 'cost':
                continue
            else:
                print('unknown type %s' % (block['type']))

        if self.training:
            return out_boxes
        else:
            return get_region_boxes(out_boxes)

    def print_network(self):
        print_cfg(self.blocks)

def create_network(self, blocks):
|
||||||
|
models = nn.ModuleList()
|
||||||
|
|
||||||
|
prev_filters = 3
|
||||||
|
out_filters = []
|
||||||
|
prev_stride = 1
|
||||||
|
out_strides = []
|
||||||
|
conv_id = 0
|
||||||
|
for block in blocks:
|
||||||
|
if block['type'] == 'net':
|
||||||
|
prev_filters = int(block['channels'])
|
||||||
|
continue
|
||||||
|
elif block['type'] == 'convolutional':
|
||||||
|
conv_id = conv_id + 1
|
||||||
|
batch_normalize = int(block['batch_normalize'])
|
||||||
|
filters = int(block['filters'])
|
||||||
|
kernel_size = int(block['size'])
|
||||||
|
stride = int(block['stride'])
|
||||||
|
is_pad = int(block['pad'])
|
||||||
|
pad = (kernel_size - 1) // 2 if is_pad else 0
|
||||||
|
activation = block['activation']
|
||||||
|
model = nn.Sequential()
|
||||||
|
if batch_normalize:
|
||||||
|
model.add_module('conv{0}'.format(conv_id),
|
||||||
|
nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias=False))
|
||||||
|
model.add_module('bn{0}'.format(conv_id), nn.BatchNorm2d(filters))
|
||||||
|
# model.add_module('bn{0}'.format(conv_id), BN2d(filters))
|
||||||
|
else:
|
||||||
|
model.add_module('conv{0}'.format(conv_id),
|
||||||
|
nn.Conv2d(prev_filters, filters, kernel_size, stride, pad))
|
||||||
|
if activation == 'leaky':
|
||||||
|
model.add_module('leaky{0}'.format(conv_id), nn.LeakyReLU(0.1, inplace=True))
|
||||||
|
elif activation == 'relu':
|
||||||
|
model.add_module('relu{0}'.format(conv_id), nn.ReLU(inplace=True))
|
||||||
|
elif activation == 'mish':
|
||||||
|
model.add_module('mish{0}'.format(conv_id), Mish())
|
||||||
|
else:
|
||||||
|
pass
|
||||||
|
# print("convalution havn't activate {}".format(activation))
|
||||||
|
|
||||||
|
prev_filters = filters
|
||||||
|
out_filters.append(prev_filters)
|
||||||
|
prev_stride = stride * prev_stride
|
||||||
|
out_strides.append(prev_stride)
|
||||||
|
models.append(model)
|
||||||
|
elif block['type'] == 'maxpool':
|
||||||
|
pool_size = int(block['size'])
|
||||||
|
stride = int(block['stride'])
|
||||||
|
if stride == 1 and pool_size % 2:
|
||||||
|
# You can use Maxpooldark instead, here is convenient to convert onnx.
|
||||||
|
# Example: [maxpool] size=3 stride=1
|
||||||
|
model = nn.MaxPool2d(kernel_size=pool_size, stride=stride, padding=pool_size // 2)
|
||||||
|
elif stride == pool_size:
|
||||||
|
# You can use Maxpooldark instead, here is convenient to convert onnx.
|
||||||
|
# Example: [maxpool] size=2 stride=2
|
||||||
|
model = nn.MaxPool2d(kernel_size=pool_size, stride=stride, padding=0)
|
||||||
|
else:
|
||||||
|
model = MaxPoolDark(pool_size, stride)
|
||||||
|
out_filters.append(prev_filters)
|
||||||
|
prev_stride = stride * prev_stride
|
||||||
|
out_strides.append(prev_stride)
|
||||||
|
models.append(model)
|
||||||
|
elif block['type'] == 'avgpool':
|
||||||
|
model = GlobalAvgPool2d()
|
||||||
|
out_filters.append(prev_filters)
|
||||||
|
models.append(model)
|
||||||
|
elif block['type'] == 'softmax':
|
||||||
|
model = nn.Softmax()
|
||||||
|
out_strides.append(prev_stride)
|
||||||
|
out_filters.append(prev_filters)
|
||||||
|
models.append(model)
|
||||||
|
elif block['type'] == 'cost':
|
||||||
|
if block['_type'] == 'sse':
|
||||||
|
model = nn.MSELoss(reduction='mean')
|
||||||
|
elif block['_type'] == 'L1':
|
||||||
|
model = nn.L1Loss(reduction='mean')
|
||||||
|
elif block['_type'] == 'smooth':
|
||||||
|
model = nn.SmoothL1Loss(reduction='mean')
|
||||||
|
out_filters.append(1)
|
||||||
|
out_strides.append(prev_stride)
|
||||||
|
models.append(model)
|
||||||
|
elif block['type'] == 'reorg':
|
||||||
|
stride = int(block['stride'])
|
||||||
|
prev_filters = stride * stride * prev_filters
|
||||||
|
out_filters.append(prev_filters)
|
||||||
|
prev_stride = prev_stride * stride
|
||||||
|
out_strides.append(prev_stride)
|
||||||
|
models.append(Reorg(stride))
|
||||||
|
elif block['type'] == 'upsample':
|
||||||
|
stride = int(block['stride'])
|
||||||
|
out_filters.append(prev_filters)
|
||||||
|
prev_stride = prev_stride // stride
|
||||||
|
out_strides.append(prev_stride)
|
||||||
|
|
||||||
|
models.append(Upsample_expand(stride))
|
||||||
|
# models.append(Upsample_interpolate(stride))
|
||||||
|
|
||||||
|
elif block['type'] == 'route':
|
||||||
|
layers = block['layers'].split(',')
|
||||||
|
ind = len(models)
|
||||||
|
layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers]
|
||||||
|
if len(layers) == 1:
|
||||||
|
if 'groups' not in block.keys() or int(block['groups']) == 1:
|
||||||
|
prev_filters = out_filters[layers[0]]
|
||||||
|
prev_stride = out_strides[layers[0]]
|
||||||
|
else:
|
||||||
|
prev_filters = out_filters[layers[0]] // int(block['groups'])
|
||||||
|
prev_stride = out_strides[layers[0]] // int(block['groups'])
|
||||||
|
elif len(layers) == 2:
|
||||||
|
assert (layers[0] == ind - 1 or layers[1] == ind - 1)
|
||||||
|
prev_filters = out_filters[layers[0]] + out_filters[layers[1]]
|
||||||
|
prev_stride = out_strides[layers[0]]
|
||||||
|
elif len(layers) == 4:
|
||||||
|
assert (layers[0] == ind - 1)
|
||||||
|
prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + out_filters[layers[2]] + \
|
||||||
|
out_filters[layers[3]]
|
||||||
|
prev_stride = out_strides[layers[0]]
|
||||||
|
else:
|
||||||
|
print("route error!!!")
|
||||||
|
|
||||||
|
out_filters.append(prev_filters)
|
||||||
|
out_strides.append(prev_stride)
|
||||||
|
models.append(EmptyModule())
|
||||||
|
elif block['type'] == 'shortcut':
|
||||||
|
ind = len(models)
|
||||||
|
prev_filters = out_filters[ind - 1]
|
||||||
|
out_filters.append(prev_filters)
|
||||||
|
prev_stride = out_strides[ind - 1]
|
||||||
|
out_strides.append(prev_stride)
|
||||||
|
models.append(EmptyModule())
|
||||||
|
elif block['type'] == 'connected':
|
||||||
|
filters = int(block['output'])
|
||||||
|
if block['activation'] == 'linear':
|
||||||
|
model = nn.Linear(prev_filters, filters)
|
||||||
|
elif block['activation'] == 'leaky':
|
||||||
|
model = nn.Sequential(
|
||||||
|
nn.Linear(prev_filters, filters),
|
||||||
|
nn.LeakyReLU(0.1, inplace=True))
|
||||||
|
elif block['activation'] == 'relu':
|
||||||
|
model = nn.Sequential(
|
||||||
|
nn.Linear(prev_filters, filters),
|
||||||
|
nn.ReLU(inplace=True))
|
||||||
|
prev_filters = filters
|
||||||
|
out_filters.append(prev_filters)
|
||||||
|
out_strides.append(prev_stride)
|
||||||
|
models.append(model)
|
||||||
|
elif block['type'] == 'region':
|
||||||
|
loss = RegionLoss()
|
||||||
|
anchors = block['anchors'].split(',')
|
||||||
|
loss.anchors = [float(i) for i in anchors]
|
||||||
|
loss.num_classes = int(block['classes'])
|
||||||
|
loss.num_anchors = int(block['num'])
|
||||||
|
loss.anchor_step = len(loss.anchors) // loss.num_anchors
|
||||||
|
loss.object_scale = float(block['object_scale'])
|
||||||
|
loss.noobject_scale = float(block['noobject_scale'])
|
||||||
|
loss.class_scale = float(block['class_scale'])
|
||||||
|
loss.coord_scale = float(block['coord_scale'])
|
||||||
|
out_filters.append(prev_filters)
|
||||||
|
out_strides.append(prev_stride)
|
||||||
|
models.append(loss)
|
||||||
|
elif block['type'] == 'yolo':
|
||||||
|
yolo_layer = YoloLayer()
|
||||||
|
anchors = block['anchors'].split(',')
|
||||||
|
anchor_mask = block['mask'].split(',')
|
||||||
|
yolo_layer.anchor_mask = [int(i) for i in anchor_mask]
|
||||||
|
yolo_layer.anchors = [float(i) for i in anchors]
|
||||||
|
yolo_layer.num_classes = int(block['classes'])
|
||||||
|
self.num_classes = yolo_layer.num_classes
|
||||||
|
yolo_layer.num_anchors = int(block['num'])
|
||||||
|
yolo_layer.anchor_step = len(yolo_layer.anchors) // yolo_layer.num_anchors
|
||||||
|
yolo_layer.stride = prev_stride
|
||||||
|
yolo_layer.scale_x_y = float(block['scale_x_y'])
|
||||||
|
# yolo_layer.object_scale = float(block['object_scale'])
|
||||||
|
# yolo_layer.noobject_scale = float(block['noobject_scale'])
|
||||||
|
# yolo_layer.class_scale = float(block['class_scale'])
|
||||||
|
# yolo_layer.coord_scale = float(block['coord_scale'])
|
||||||
|
out_filters.append(prev_filters)
|
||||||
|
out_strides.append(prev_stride)
|
||||||
|
models.append(yolo_layer)
|
||||||
|
else:
|
||||||
|
print('unknown type %s' % (block['type']))
|
||||||
|
|
||||||
|
return models
|
||||||
|
|
||||||
|
def load_weights(self, weightfile):
|
||||||
|
fp = open(weightfile, 'rb')
|
||||||
|
header = np.fromfile(fp, count=5, dtype=np.int32)
|
||||||
|
self.header = torch.from_numpy(header)
|
||||||
|
self.seen = self.header[3]
|
||||||
|
buf = np.fromfile(fp, dtype=np.float32)
|
||||||
|
fp.close()
|
||||||
|
|
||||||
|
start = 0
|
||||||
|
ind = -2
|
||||||
|
for block in self.blocks:
|
||||||
|
if start >= buf.size:
|
||||||
|
break
|
||||||
|
ind = ind + 1
|
||||||
|
if block['type'] == 'net':
|
||||||
|
continue
|
||||||
|
elif block['type'] == 'convolutional':
|
||||||
|
model = self.models[ind]
|
||||||
|
batch_normalize = int(block['batch_normalize'])
|
||||||
|
if batch_normalize:
|
||||||
|
start = load_conv_bn(buf, start, model[0], model[1])
|
||||||
|
else:
|
||||||
|
start = load_conv(buf, start, model[0])
|
||||||
|
elif block['type'] == 'connected':
|
||||||
|
model = self.models[ind]
|
||||||
|
if block['activation'] != 'linear':
|
||||||
|
start = load_fc(buf, start, model[0])
|
||||||
|
else:
|
||||||
|
start = load_fc(buf, start, model)
|
||||||
|
elif block['type'] == 'maxpool':
|
||||||
|
pass
|
||||||
|
elif block['type'] == 'reorg':
|
||||||
|
pass
|
||||||
|
elif block['type'] == 'upsample':
|
||||||
|
pass
|
||||||
|
elif block['type'] == 'route':
|
||||||
|
pass
|
||||||
|
elif block['type'] == 'shortcut':
|
||||||
|
pass
|
||||||
|
elif block['type'] == 'region':
|
||||||
|
pass
|
||||||
|
elif block['type'] == 'yolo':
|
||||||
|
pass
|
||||||
|
elif block['type'] == 'avgpool':
|
||||||
|
pass
|
||||||
|
elif block['type'] == 'softmax':
|
||||||
|
pass
|
||||||
|
elif block['type'] == 'cost':
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
print('unknown type %s' % (block['type']))
|
||||||
|
|
||||||
|
# def save_weights(self, outfile, cutoff=0):
|
||||||
|
# if cutoff <= 0:
|
||||||
|
# cutoff = len(self.blocks) - 1
|
||||||
|
#
|
||||||
|
# fp = open(outfile, 'wb')
|
||||||
|
# self.header[3] = self.seen
|
||||||
|
# header = self.header
|
||||||
|
# header.numpy().tofile(fp)
|
||||||
|
#
|
||||||
|
# ind = -1
|
||||||
|
# for blockId in range(1, cutoff + 1):
|
||||||
|
# ind = ind + 1
|
||||||
|
# block = self.blocks[blockId]
|
||||||
|
# if block['type'] == 'convolutional':
|
||||||
|
# model = self.models[ind]
|
||||||
|
# batch_normalize = int(block['batch_normalize'])
|
||||||
|
# if batch_normalize:
|
||||||
|
# save_conv_bn(fp, model[0], model[1])
|
||||||
|
# else:
|
||||||
|
# save_conv(fp, model[0])
|
||||||
|
# elif block['type'] == 'connected':
|
||||||
|
# model = self.models[ind]
|
||||||
|
# if block['activation'] != 'linear':
|
||||||
|
# save_fc(fc, model)
|
||||||
|
# else:
|
||||||
|
# save_fc(fc, model[0])
|
||||||
|
# elif block['type'] == 'maxpool':
|
||||||
|
# pass
|
||||||
|
# elif block['type'] == 'reorg':
|
||||||
|
# pass
|
||||||
|
# elif block['type'] == 'upsample':
|
||||||
|
# pass
|
||||||
|
# elif block['type'] == 'route':
|
||||||
|
# pass
|
||||||
|
# elif block['type'] == 'shortcut':
|
||||||
|
# pass
|
||||||
|
# elif block['type'] == 'region':
|
||||||
|
# pass
|
||||||
|
# elif block['type'] == 'yolo':
|
||||||
|
# pass
|
||||||
|
# elif block['type'] == 'avgpool':
|
||||||
|
# pass
|
||||||
|
# elif block['type'] == 'softmax':
|
||||||
|
# pass
|
||||||
|
# elif block['type'] == 'cost':
|
||||||
|
# pass
|
||||||
|
# else:
|
||||||
|
# print('unknown type %s' % (block['type']))
|
||||||
|
# fp.close()
|
195
easymocap/estimator/YOLOv4/region_loss.py
Normal file
@ -0,0 +1,195 @@
import torch.nn as nn
import torch.nn.functional as F
from .torch_utils import *


def build_targets(pred_boxes, target, anchors, num_anchors, num_classes, nH, nW, noobject_scale, object_scale,
                  sil_thresh, seen):
    nB = target.size(0)
    nA = num_anchors
    nC = num_classes
    anchor_step = len(anchors) // num_anchors  # integer division: anchor_step is used as an index below
    conf_mask = torch.ones(nB, nA, nH, nW) * noobject_scale
    coord_mask = torch.zeros(nB, nA, nH, nW)
    cls_mask = torch.zeros(nB, nA, nH, nW)
    tx = torch.zeros(nB, nA, nH, nW)
    ty = torch.zeros(nB, nA, nH, nW)
    tw = torch.zeros(nB, nA, nH, nW)
    th = torch.zeros(nB, nA, nH, nW)
    tconf = torch.zeros(nB, nA, nH, nW)
    tcls = torch.zeros(nB, nA, nH, nW)

    nAnchors = nA * nH * nW
    nPixels = nH * nW
    for b in range(nB):
        cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t()
        cur_ious = torch.zeros(nAnchors)
        for t in range(50):
            if target[b][t * 5 + 1] == 0:
                break
            gx = target[b][t * 5 + 1] * nW
            gy = target[b][t * 5 + 2] * nH
            gw = target[b][t * 5 + 3] * nW
            gh = target[b][t * 5 + 4] * nH
            cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors, 1).t()
            cur_ious = torch.max(cur_ious, bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False))
        conf_mask[b][cur_ious > sil_thresh] = 0
    if seen < 12800:
        if anchor_step == 4:
            tx = torch.FloatTensor(anchors).view(nA, anchor_step).index_select(1, torch.LongTensor([2])).view(
                1, nA, 1, 1).repeat(nB, 1, nH, nW)
            ty = torch.FloatTensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([2])).view(
                1, nA, 1, 1).repeat(nB, 1, nH, nW)
        else:
            tx.fill_(0.5)
            ty.fill_(0.5)
        tw.zero_()
        th.zero_()
        coord_mask.fill_(1)

    nGT = 0
    nCorrect = 0
    for b in range(nB):
        for t in range(50):
            if target[b][t * 5 + 1] == 0:
                break
            nGT = nGT + 1
            best_iou = 0.0
            best_n = -1
            min_dist = 10000
            gx = target[b][t * 5 + 1] * nW
            gy = target[b][t * 5 + 2] * nH
            gi = int(gx)
            gj = int(gy)
            gw = target[b][t * 5 + 3] * nW
            gh = target[b][t * 5 + 4] * nH
            gt_box = [0, 0, gw, gh]
            for n in range(nA):
                aw = anchors[anchor_step * n]
                ah = anchors[anchor_step * n + 1]
                anchor_box = [0, 0, aw, ah]
                iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False)
                if anchor_step == 4:
                    ax = anchors[anchor_step * n + 2]
                    ay = anchors[anchor_step * n + 3]
                    dist = pow(((gi + ax) - gx), 2) + pow(((gj + ay) - gy), 2)
                if iou > best_iou:
                    best_iou = iou
                    best_n = n
                elif anchor_step == 4 and iou == best_iou and dist < min_dist:
                    best_iou = iou
                    best_n = n
                    min_dist = dist

            gt_box = [gx, gy, gw, gh]
            pred_box = pred_boxes[b * nAnchors + best_n * nPixels + gj * nW + gi]

            coord_mask[b][best_n][gj][gi] = 1
            cls_mask[b][best_n][gj][gi] = 1
            conf_mask[b][best_n][gj][gi] = object_scale
            tx[b][best_n][gj][gi] = target[b][t * 5 + 1] * nW - gi
            ty[b][best_n][gj][gi] = target[b][t * 5 + 2] * nH - gj
            tw[b][best_n][gj][gi] = math.log(gw / anchors[anchor_step * best_n])
            th[b][best_n][gj][gi] = math.log(gh / anchors[anchor_step * best_n + 1])
            iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False)  # best_iou
            tconf[b][best_n][gj][gi] = iou
            tcls[b][best_n][gj][gi] = target[b][t * 5]
            if iou > 0.5:
                nCorrect = nCorrect + 1

    return nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls


class RegionLoss(nn.Module):
    def __init__(self, num_classes=0, anchors=[], num_anchors=1):
        super(RegionLoss, self).__init__()
        self.num_classes = num_classes
        self.anchors = anchors
        self.num_anchors = num_anchors
        self.anchor_step = len(anchors) // num_anchors
        self.coord_scale = 1
        self.noobject_scale = 1
        self.object_scale = 5
        self.class_scale = 1
        self.thresh = 0.6
        self.seen = 0

    def forward(self, output, target):
        # output : BxAs*(4+1+num_classes)*H*W
        t0 = time.time()
        nB = output.data.size(0)
        nA = self.num_anchors
        nC = self.num_classes
        nH = output.data.size(2)
        nW = output.data.size(3)

        output = output.view(nB, nA, (5 + nC), nH, nW)
        x = torch.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([0]))).view(nB, nA, nH, nW))
        y = torch.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([1]))).view(nB, nA, nH, nW))
        w = output.index_select(2, Variable(torch.cuda.LongTensor([2]))).view(nB, nA, nH, nW)
        h = output.index_select(2, Variable(torch.cuda.LongTensor([3]))).view(nB, nA, nH, nW)
        conf = torch.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([4]))).view(nB, nA, nH, nW))
        cls = output.index_select(2, Variable(torch.linspace(5, 5 + nC - 1, nC).long().cuda()))
        cls = cls.view(nB * nA, nC, nH * nW).transpose(1, 2).contiguous().view(nB * nA * nH * nW, nC)
        t1 = time.time()

        pred_boxes = torch.cuda.FloatTensor(4, nB * nA * nH * nW)
        grid_x = torch.linspace(0, nW - 1, nW).repeat(nH, 1).repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda()
        grid_y = torch.linspace(0, nH - 1, nH).repeat(nW, 1).t().repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda()
        anchor_w = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([0])).cuda()
        anchor_h = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([1])).cuda()
        anchor_w = anchor_w.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW)
        anchor_h = anchor_h.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW)
        pred_boxes[0] = x.data + grid_x
        pred_boxes[1] = y.data + grid_y
        pred_boxes[2] = torch.exp(w.data) * anchor_w
        pred_boxes[3] = torch.exp(h.data) * anchor_h
        pred_boxes = convert2cpu(pred_boxes.transpose(0, 1).contiguous().view(-1, 4))
        t2 = time.time()

        nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(
            pred_boxes, target.data, self.anchors, nA, nC,
            nH, nW, self.noobject_scale, self.object_scale, self.thresh, self.seen)
        cls_mask = (cls_mask == 1)
        nProposals = int((conf > 0.25).sum().item())

        tx = Variable(tx.cuda())
        ty = Variable(ty.cuda())
        tw = Variable(tw.cuda())
        th = Variable(th.cuda())
        tconf = Variable(tconf.cuda())
        tcls = Variable(tcls.view(-1)[cls_mask].long().cuda())

        coord_mask = Variable(coord_mask.cuda())
        conf_mask = Variable(conf_mask.cuda().sqrt())
        cls_mask = Variable(cls_mask.view(-1, 1).repeat(1, nC).cuda())
        cls = cls[cls_mask].view(-1, nC)

        t3 = time.time()

        loss_x = self.coord_scale * nn.MSELoss(reduction='sum')(x * coord_mask, tx * coord_mask) / 2.0
        loss_y = self.coord_scale * nn.MSELoss(reduction='sum')(y * coord_mask, ty * coord_mask) / 2.0
        loss_w = self.coord_scale * nn.MSELoss(reduction='sum')(w * coord_mask, tw * coord_mask) / 2.0
        loss_h = self.coord_scale * nn.MSELoss(reduction='sum')(h * coord_mask, th * coord_mask) / 2.0
        loss_conf = nn.MSELoss(reduction='sum')(conf * conf_mask, tconf * conf_mask) / 2.0
        loss_cls = self.class_scale * nn.CrossEntropyLoss(reduction='sum')(cls, tcls)
        loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls
        t4 = time.time()
        if False:
            print('-----------------------------------')
            print('        activation : %f' % (t1 - t0))
            print(' create pred_boxes : %f' % (t2 - t1))
            print('     build targets : %f' % (t3 - t2))
            print('       create loss : %f' % (t4 - t3))
            print('             total : %f' % (t4 - t0))
        print('%d: nGT %d, recall %d, proposals %d, loss: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f' % (
            self.seen, nGT, nCorrect, nProposals, loss_x.item(), loss_y.item(), loss_w.item(), loss_h.item(),
            loss_conf.item(), loss_cls.item(), loss.item()))
        return loss
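The target layout that build_targets indexes above (an inference from the code, not documented in the commit): each image contributes up to 50 ground-truth slots of 5 values, [class_id, cx, cy, w, h], with coordinates normalized to [0, 1]; an empty slot reads 0 and terminates the per-image loop. A minimal sketch:

import torch

target = torch.zeros(1, 50 * 5)  # batch of one image, 50 box slots
# one ground-truth person: class 0, centered, 0.2 x 0.4 of the image
target[0, 0:5] = torch.tensor([0.0, 0.5, 0.5, 0.2, 0.4])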
98
easymocap/estimator/YOLOv4/torch_utils.py
Normal file
@ -0,0 +1,98 @@
import sys
import os
import time
import math
import torch
import numpy as np
from torch.autograd import Variable
from . import utils  # provides post_processing (see utils.py in this package)


def bbox_ious(boxes1, boxes2, x1y1x2y2=True):
    if x1y1x2y2:
        mx = torch.min(boxes1[0], boxes2[0])
        Mx = torch.max(boxes1[2], boxes2[2])
        my = torch.min(boxes1[1], boxes2[1])
        My = torch.max(boxes1[3], boxes2[3])
        w1 = boxes1[2] - boxes1[0]
        h1 = boxes1[3] - boxes1[1]
        w2 = boxes2[2] - boxes2[0]
        h2 = boxes2[3] - boxes2[1]
    else:
        mx = torch.min(boxes1[0] - boxes1[2] / 2.0, boxes2[0] - boxes2[2] / 2.0)
        Mx = torch.max(boxes1[0] + boxes1[2] / 2.0, boxes2[0] + boxes2[2] / 2.0)
        my = torch.min(boxes1[1] - boxes1[3] / 2.0, boxes2[1] - boxes2[3] / 2.0)
        My = torch.max(boxes1[1] + boxes1[3] / 2.0, boxes2[1] + boxes2[3] / 2.0)
        w1 = boxes1[2]
        h1 = boxes1[3]
        w2 = boxes2[2]
        h2 = boxes2[3]
    uw = Mx - mx
    uh = My - my
    cw = w1 + w2 - uw
    ch = h1 + h2 - uh
    mask = ((cw <= 0) + (ch <= 0) > 0)
    area1 = w1 * h1
    area2 = w2 * h2
    carea = cw * ch
    carea[mask] = 0
    uarea = area1 + area2 - carea
    return carea / uarea


def get_region_boxes(boxes_and_confs):
    # print('Getting boxes from boxes and confs ...')
    boxes_list = []
    confs_list = []

    for item in boxes_and_confs:
        boxes_list.append(item[0])
        confs_list.append(item[1])

    # boxes: [batch, num1 + num2 + num3, 1, 4]
    # confs: [batch, num1 + num2 + num3, num_classes]
    boxes = torch.cat(boxes_list, dim=1)
    confs = torch.cat(confs_list, dim=1)

    return [boxes, confs]


def convert2cpu(gpu_matrix):
    return torch.FloatTensor(gpu_matrix.size()).copy_(gpu_matrix)


def convert2cpu_long(gpu_matrix):
    return torch.LongTensor(gpu_matrix.size()).copy_(gpu_matrix)


def do_detect(model, img, conf_thresh, nms_thresh, use_cuda=1):
    model.eval()
    t0 = time.time()

    if type(img) == np.ndarray and len(img.shape) == 3:  # cv2 image
        img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0)
    elif type(img) == np.ndarray and len(img.shape) == 4:
        img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0)
    else:
        print("unknown image type")
        exit(-1)

    if use_cuda:
        img = img.cuda()
    img = torch.autograd.Variable(img)

    t1 = time.time()

    output = model(img)

    t2 = time.time()

    print('-----------------------------------')
    print('           Preprocess : %f' % (t1 - t0))
    print('      Model Inference : %f' % (t2 - t1))
    print('-----------------------------------')

    return utils.post_processing(img, conf_thresh, nms_thresh, output)
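A small sanity check for bbox_ious (not in the commit): it operates column-wise on 4xN tensors, here in (cx, cy, w, h) form. The first column pairs two identical boxes, the second two disjoint ones.

import torch

boxes1 = torch.tensor([[0.5, 0.5], [0.5, 0.5], [0.2, 0.2], [0.2, 0.2]])  # rows: cx, cy, w, h
boxes2 = torch.tensor([[0.5, 0.9], [0.5, 0.9], [0.2, 0.2], [0.2, 0.2]])
print(bbox_ious(boxes1, boxes2, x1y1x2y2=False))  # tensor([1., 0.])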
221
easymocap/estimator/YOLOv4/utils.py
Normal file
@ -0,0 +1,221 @@
import sys
import os
import time
import math
import numpy as np

import itertools
import struct  # get_image_size
import imghdr  # get_image_size


def sigmoid(x):
    return 1.0 / (np.exp(-x) + 1.)


def softmax(x):
    x = np.exp(x - np.expand_dims(np.max(x, axis=1), axis=1))
    x = x / np.expand_dims(x.sum(axis=1), axis=1)
    return x


def bbox_iou(box1, box2, x1y1x2y2=True):
    # print('iou box1:', box1)
    # print('iou box2:', box2)

    if x1y1x2y2:
        mx = min(box1[0], box2[0])
        Mx = max(box1[2], box2[2])
        my = min(box1[1], box2[1])
        My = max(box1[3], box2[3])
        w1 = box1[2] - box1[0]
        h1 = box1[3] - box1[1]
        w2 = box2[2] - box2[0]
        h2 = box2[3] - box2[1]
    else:
        w1 = box1[2]
        h1 = box1[3]
        w2 = box2[2]
        h2 = box2[3]

        mx = min(box1[0], box2[0])
        Mx = max(box1[0] + w1, box2[0] + w2)
        my = min(box1[1], box2[1])
        My = max(box1[1] + h1, box2[1] + h2)
    uw = Mx - mx
    uh = My - my
    cw = w1 + w2 - uw
    ch = h1 + h2 - uh
    carea = 0
    if cw <= 0 or ch <= 0:
        return 0.0

    area1 = w1 * h1
    area2 = w2 * h2
    carea = cw * ch
    uarea = area1 + area2 - carea
    return carea / uarea


def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False):
    # print(boxes.shape)
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]

    areas = (x2 - x1) * (y2 - y1)
    order = confs.argsort()[::-1]

    keep = []
    while order.size > 0:
        idx_self = order[0]
        idx_other = order[1:]

        keep.append(idx_self)

        xx1 = np.maximum(x1[idx_self], x1[idx_other])
        yy1 = np.maximum(y1[idx_self], y1[idx_other])
        xx2 = np.minimum(x2[idx_self], x2[idx_other])
        yy2 = np.minimum(y2[idx_self], y2[idx_other])

        w = np.maximum(0.0, xx2 - xx1)
        h = np.maximum(0.0, yy2 - yy1)
        inter = w * h

        if min_mode:
            over = inter / np.minimum(areas[order[0]], areas[order[1:]])
        else:
            over = inter / (areas[order[0]] + areas[order[1:]] - inter)

        inds = np.where(over <= nms_thresh)[0]
        order = order[inds + 1]

    return np.array(keep)


def plot_boxes_cv2(img, boxes, savename=None, class_names=None, color=None):
    import cv2
    img = np.copy(img)
    colors = np.array([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]], dtype=np.float32)

    def get_color(c, x, max_val):
        ratio = float(x) / max_val * 5
        i = int(math.floor(ratio))
        j = int(math.ceil(ratio))
        ratio = ratio - i
        r = (1 - ratio) * colors[i][c] + ratio * colors[j][c]
        return int(r * 255)

    width = img.shape[1]
    height = img.shape[0]
    for i in range(len(boxes)):
        box = boxes[i]
        x1 = int(box[0] * width)
        y1 = int(box[1] * height)
        x2 = int(box[2] * width)
        y2 = int(box[3] * height)

        if color:
            rgb = color
        else:
            rgb = (255, 0, 0)
        if len(box) >= 7 and class_names:
            cls_conf = box[5]
            cls_id = box[6]
            print('%s: %f' % (class_names[cls_id], cls_conf))
            classes = len(class_names)
            offset = cls_id * 123457 % classes
            red = get_color(2, offset, classes)
            green = get_color(1, offset, classes)
            blue = get_color(0, offset, classes)
            if color is None:
                rgb = (red, green, blue)
            img = cv2.putText(img, class_names[cls_id], (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1.2, rgb, 1)
        img = cv2.rectangle(img, (x1, y1), (x2, y2), rgb, 1)
    if savename:
        print("save plot results to %s" % savename)
        cv2.imwrite(savename, img)
    return img


def read_truths(lab_path):
    if not os.path.exists(lab_path):
        return np.array([])
    if os.path.getsize(lab_path):
        truths = np.loadtxt(lab_path)
        truths = truths.reshape(truths.size // 5, 5)  # to avoid single truth problem
        return truths
    else:
        return np.array([])


def post_processing(img, conf_thresh, nms_thresh, output):
    # anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401]
    # num_anchors = 9
    # anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
    # strides = [8, 16, 32]
    # anchor_step = len(anchors) // num_anchors

    # [batch, num, 1, 4]
    box_array = output[0]
    # [batch, num, num_classes]
    confs = output[1]

    t1 = time.time()

    if type(box_array).__name__ != 'ndarray':
        box_array = box_array.cpu().detach().numpy()
        confs = confs.cpu().detach().numpy()

    num_classes = confs.shape[2]

    # [batch, num, 4]
    box_array = box_array[:, :, 0]

    # [batch, num, num_classes] --> [batch, num]
    max_conf = np.max(confs, axis=2)
    max_id = np.argmax(confs, axis=2)

    t2 = time.time()

    bboxes_batch = []
    for i in range(box_array.shape[0]):

        argwhere = max_conf[i] > conf_thresh
        l_box_array = box_array[i, argwhere, :]
        l_max_conf = max_conf[i, argwhere]
        l_max_id = max_id[i, argwhere]

        bboxes = []
        # nms for each class
        for j in range(num_classes):

            cls_argwhere = l_max_id == j
            ll_box_array = l_box_array[cls_argwhere, :]
            ll_max_conf = l_max_conf[cls_argwhere]
            ll_max_id = l_max_id[cls_argwhere]

            keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh)

            if (keep.size > 0):
                ll_box_array = ll_box_array[keep, :]
                ll_max_conf = ll_max_conf[keep]
                ll_max_id = ll_max_id[keep]

                for k in range(ll_box_array.shape[0]):
                    bboxes.append([ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2], ll_box_array[k, 3],
                                   ll_max_conf[k], ll_max_conf[k], ll_max_id[k]])

        bboxes_batch.append(bboxes)

    t3 = time.time()

    print('-----------------------------------')
    print('       max and argmax : %f' % (t2 - t1))
    print('                  nms : %f' % (t3 - t2))
    print('Post processing total : %f' % (t3 - t1))
    print('-----------------------------------')

    return bboxes_batch
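A toy check for nms_cpu (not in the commit): two heavily overlapping boxes plus one separate box; the lower-confidence member of the overlapping pair is suppressed.

import numpy as np

boxes = np.array([[0.10, 0.1, 0.40, 0.4],
                  [0.12, 0.1, 0.42, 0.4],
                  [0.60, 0.6, 0.90, 0.9]])
confs = np.array([0.9, 0.8, 0.7])
print(nms_cpu(boxes, confs, nms_thresh=0.5))  # array([0, 2])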
161
easymocap/estimator/YOLOv4/yolo.py
Normal file
@ -0,0 +1,161 @@
'''
@ Date: 2020-12-10 16:39:51
@ Author: Qing Shuai
@ LastEditors: Qing Shuai
@ LastEditTime: 2022-04-21 23:53:40
@ FilePath: /EasyMocapPublic/easymocap/estimator/YOLOv4/yolo.py
'''
from .darknet2pytorch import Darknet
import cv2
import torch
from os.path import join
import os
import numpy as np


def load_class_names(namesfile):
    class_names = []
    with open(namesfile, 'r') as fp:
        lines = fp.readlines()
    for line in lines:
        line = line.rstrip()
        class_names.append(line)
    return class_names


def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False):
    # print(boxes.shape)
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]

    areas = (x2 - x1) * (y2 - y1)
    order = confs.argsort()[::-1]

    keep = []
    while order.size > 0:
        idx_self = order[0]
        idx_other = order[1:]

        keep.append(idx_self)

        xx1 = np.maximum(x1[idx_self], x1[idx_other])
        yy1 = np.maximum(y1[idx_self], y1[idx_other])
        xx2 = np.minimum(x2[idx_self], x2[idx_other])
        yy2 = np.minimum(y2[idx_self], y2[idx_other])

        w = np.maximum(0.0, xx2 - xx1)
        h = np.maximum(0.0, yy2 - yy1)
        inter = w * h

        if min_mode:
            over = inter / np.minimum(areas[order[0]], areas[order[1:]])
        else:
            over = inter / (areas[order[0]] + areas[order[1:]] - inter)

        inds = np.where(over <= nms_thresh)[0]
        order = order[inds + 1]
    return np.array(keep)


def post_processing(conf_thresh, nms_thresh, output):
    # [batch, num, 1, 4]
    box_array = output[0]
    # [batch, num, num_classes]
    confs = output[1]

    if type(box_array).__name__ != 'ndarray':
        box_array = box_array.cpu().detach().numpy()
        confs = confs.cpu().detach().numpy()

    num_classes = confs.shape[2]

    # [batch, num, 4]
    box_array = box_array[:, :, 0]

    # [batch, num, num_classes] --> [batch, num]
    max_conf = np.max(confs, axis=2)
    max_id = np.argmax(confs, axis=2)

    bboxes_batch = []
    for i in range(box_array.shape[0]):
        argwhere = max_conf[i] > conf_thresh
        l_box_array = box_array[i, argwhere, :]
        l_max_conf = max_conf[i, argwhere]
        l_max_id = max_id[i, argwhere]

        bboxes = []
        # nms for class person
        j = 0
        cls_argwhere = l_max_id == j
        ll_box_array = l_box_array[cls_argwhere, :]
        ll_max_conf = l_max_conf[cls_argwhere]
        ll_max_id = l_max_id[cls_argwhere]

        keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh)

        if (keep.size > 0):
            ll_box_array = ll_box_array[keep, :]
            ll_max_conf = ll_max_conf[keep]
            ll_max_id = ll_max_id[keep]
            bboxes = np.hstack([ll_box_array, ll_max_conf[:, None]])

        bboxes_batch.append(bboxes)

    return bboxes_batch


class YOLOv4:
    def __init__(self, device, ckpt_path, box_nms_thres, conf_thres,
                 isWild=False) -> None:
        dirname = os.path.dirname(__file__)
        cfgfile = join(dirname, 'yolov4.cfg')
        namesfile = join(dirname, 'coco.names')
        self.model = Darknet(cfgfile)
        self.model.load_weights(ckpt_path)
        self.model.to(device)
        self.model.eval()
        class_names = load_class_names(namesfile)
        self.device = device
        self.box_nms_thres = box_nms_thres
        self.conf_thres = conf_thres
        self.isWild = isWild

    def predict_single(self, image):
        width = image.shape[1]
        height = image.shape[0]
        tgt_width = self.model.width
        # shrink first, then pad to a square input
        if width > height:
            tgt_shape = (tgt_width, int(height/width*tgt_width))
            resize = cv2.resize(image, tgt_shape)
            sized = np.zeros((tgt_width, tgt_width, 3), dtype=np.uint8)
            start = (sized.shape[0] - resize.shape[0])//2
            sized[start:start+resize.shape[0], :, :] = resize
        # pad_to_square
        elif width == height:
            sized = cv2.resize(image, (tgt_width, tgt_width))
            start = 0
        else:
            tgt_shape = (int(width/height*tgt_width), tgt_width)
            resize = cv2.resize(image, tgt_shape)
            sized = np.zeros((tgt_width, tgt_width, 3), dtype=np.uint8)
            start = (sized.shape[1] - resize.shape[1]) // 2
            sized[:, start:start+resize.shape[1], :] = resize
        img = torch.from_numpy(sized.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0)
        img = img.to(self.device)
        with torch.no_grad():
            output = self.model(img)
        bboxes = post_processing(self.conf_thres, self.box_nms_thres, output)[0]
        if len(bboxes) == 0:
            return bboxes
        if self.isWild:
            flag = ((bboxes[:, 2] - bboxes[:, 0]) < 0.8) & (((bboxes[:, 2] - bboxes[:, 0]) > 0.1) | ((bboxes[:, 3] - bboxes[:, 1]) > 0.1))
            bboxes = bboxes[flag]
        if width >= height:
            bboxes[:, :4] *= width
            bboxes[:, 1] -= start*width/tgt_width
            bboxes[:, 3] -= start*width/tgt_width
        else:
            bboxes[:, :4] *= height
            bboxes[:, 0] -= start*height/tgt_width
            bboxes[:, 2] -= start*height/tgt_width
        # return bounding box
        return bboxes
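A usage sketch for the YOLOv4 wrapper above (not part of the commit); ckpt_path and the threshold values are assumptions. predict_single expects an RGB image and returns an (N, 5) array of pixel-space x1, y1, x2, y2 plus confidence.

import cv2
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
detector = YOLOv4(device=device, ckpt_path='data/models/yolov4.weights',  # assumed weights path
                  box_nms_thres=0.45, conf_thres=0.3)                     # example thresholds
image = cv2.cvtColor(cv2.imread('demo.jpg'), cv2.COLOR_BGR2RGB)           # the wrapper expects RGB
bboxes = detector.predict_single(image)  # (N, 5): x1, y1, x2, y2, conf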
322
easymocap/estimator/YOLOv4/yolo_layer.py
Normal file
@ -0,0 +1,322 @@
import torch.nn as nn
import torch.nn.functional as F
from .torch_utils import *


def yolo_forward(output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y, only_objectness=1,
                 validation=False):
    # Output would be invalid if it does not satisfy this assert
    # assert (output.size(1) == (5 + num_classes) * num_anchors)

    # print(output.size())

    # Slice the second dimension (channel) of output into:
    # [ 2, 2, 1, num_classes, 2, 2, 1, num_classes, 2, 2, 1, num_classes ]
    # And then into
    # bxy = [ 6 ] bwh = [ 6 ] det_conf = [ 3 ] cls_conf = [ num_classes * 3 ]
    batch = output.size(0)
    H = output.size(2)
    W = output.size(3)

    bxy_list = []
    bwh_list = []
    det_confs_list = []
    cls_confs_list = []

    for i in range(num_anchors):
        begin = i * (5 + num_classes)
        end = (i + 1) * (5 + num_classes)

        bxy_list.append(output[:, begin : begin + 2])
        bwh_list.append(output[:, begin + 2 : begin + 4])
        det_confs_list.append(output[:, begin + 4 : begin + 5])
        cls_confs_list.append(output[:, begin + 5 : end])

    # Shape: [batch, num_anchors * 2, H, W]
    bxy = torch.cat(bxy_list, dim=1)
    # Shape: [batch, num_anchors * 2, H, W]
    bwh = torch.cat(bwh_list, dim=1)

    # Shape: [batch, num_anchors, H, W]
    det_confs = torch.cat(det_confs_list, dim=1)
    # Shape: [batch, num_anchors * H * W]
    det_confs = det_confs.view(batch, num_anchors * H * W)

    # Shape: [batch, num_anchors * num_classes, H, W]
    cls_confs = torch.cat(cls_confs_list, dim=1)
    # Shape: [batch, num_anchors, num_classes, H * W]
    cls_confs = cls_confs.view(batch, num_anchors, num_classes, H * W)
    # Shape: [batch, num_anchors, num_classes, H * W] --> [batch, num_anchors * H * W, num_classes]
    cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(batch, num_anchors * H * W, num_classes)

    # Apply sigmoid(), exp() and softmax() to slices
    bxy = torch.sigmoid(bxy) * scale_x_y - 0.5 * (scale_x_y - 1)
    bwh = torch.exp(bwh)
    det_confs = torch.sigmoid(det_confs)
    cls_confs = torch.sigmoid(cls_confs)

    # Prepare C-x, C-y, P-w, P-h (None of them are torch related)
    grid_x = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, W - 1, W), axis=0).repeat(H, 0), axis=0), axis=0)
    grid_y = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, H - 1, H), axis=1).repeat(W, 1), axis=0), axis=0)
    # grid_x = torch.linspace(0, W - 1, W).reshape(1, 1, 1, W).repeat(1, 1, H, 1)
    # grid_y = torch.linspace(0, H - 1, H).reshape(1, 1, H, 1).repeat(1, 1, 1, W)

    anchor_w = []
    anchor_h = []
    for i in range(num_anchors):
        anchor_w.append(anchors[i * 2])
        anchor_h.append(anchors[i * 2 + 1])

    device = None
    cuda_check = output.is_cuda
    if cuda_check:
        device = output.get_device()

    bx_list = []
    by_list = []
    bw_list = []
    bh_list = []

    # Apply C-x, C-y, P-w, P-h
    for i in range(num_anchors):
        ii = i * 2
        # Shape: [batch, 1, H, W]
        bx = bxy[:, ii : ii + 1] + torch.tensor(grid_x, device=device, dtype=torch.float32)  # grid_x.to(device=device, dtype=torch.float32)
        # Shape: [batch, 1, H, W]
        by = bxy[:, ii + 1 : ii + 2] + torch.tensor(grid_y, device=device, dtype=torch.float32)  # grid_y.to(device=device, dtype=torch.float32)
        # Shape: [batch, 1, H, W]
        bw = bwh[:, ii : ii + 1] * anchor_w[i]
        # Shape: [batch, 1, H, W]
        bh = bwh[:, ii + 1 : ii + 2] * anchor_h[i]

        bx_list.append(bx)
        by_list.append(by)
        bw_list.append(bw)
        bh_list.append(bh)

    ########################################
    #   Figure out bboxes from slices     #
    ########################################

    # Shape: [batch, num_anchors, H, W]
    bx = torch.cat(bx_list, dim=1)
    by = torch.cat(by_list, dim=1)
    bw = torch.cat(bw_list, dim=1)
    bh = torch.cat(bh_list, dim=1)

    # Shape: [batch, 2 * num_anchors, H, W]
    bx_bw = torch.cat((bx, bw), dim=1)
    by_bh = torch.cat((by, bh), dim=1)

    # normalize coordinates to [0, 1]
    bx_bw /= W
    by_bh /= H

    # Shape: [batch, num_anchors * H * W, 1]
    bx = bx_bw[:, :num_anchors].view(batch, num_anchors * H * W, 1)
    by = by_bh[:, :num_anchors].view(batch, num_anchors * H * W, 1)
    bw = bx_bw[:, num_anchors:].view(batch, num_anchors * H * W, 1)
    bh = by_bh[:, num_anchors:].view(batch, num_anchors * H * W, 1)

    bx1 = bx - bw * 0.5
    by1 = by - bh * 0.5
    bx2 = bx1 + bw
    by2 = by1 + bh

    # Shape: [batch, num_anchors * h * w, 4] -> [batch, num_anchors * h * w, 1, 4]
    boxes = torch.cat((bx1, by1, bx2, by2), dim=2).view(batch, num_anchors * H * W, 1, 4)
    # boxes = boxes.repeat(1, 1, num_classes, 1)

    # boxes:     [batch, num_anchors * H * W, 1, 4]
    # cls_confs: [batch, num_anchors * H * W, num_classes]
    # det_confs: [batch, num_anchors * H * W]

    det_confs = det_confs.view(batch, num_anchors * H * W, 1)
    confs = cls_confs * det_confs

    # boxes: [batch, num_anchors * H * W, 1, 4]
    # confs: [batch, num_anchors * H * W, num_classes]

    return boxes, confs


def yolo_forward_dynamic(output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y, only_objectness=1,
                         validation=False):
    # Output would be invalid if it does not satisfy this assert
    # assert (output.size(1) == (5 + num_classes) * num_anchors)

    # print(output.size())

    # Slice the second dimension (channel) of output into:
    # [ 2, 2, 1, num_classes, 2, 2, 1, num_classes, 2, 2, 1, num_classes ]
    # And then into
    # bxy = [ 6 ] bwh = [ 6 ] det_conf = [ 3 ] cls_conf = [ num_classes * 3 ]
    # batch = output.size(0)
    # H = output.size(2)
    # W = output.size(3)

    bxy_list = []
    bwh_list = []
    det_confs_list = []
    cls_confs_list = []

    for i in range(num_anchors):
        begin = i * (5 + num_classes)
        end = (i + 1) * (5 + num_classes)

        bxy_list.append(output[:, begin : begin + 2])
        bwh_list.append(output[:, begin + 2 : begin + 4])
        det_confs_list.append(output[:, begin + 4 : begin + 5])
        cls_confs_list.append(output[:, begin + 5 : end])

    # Shape: [batch, num_anchors * 2, H, W]
    bxy = torch.cat(bxy_list, dim=1)
    # Shape: [batch, num_anchors * 2, H, W]
    bwh = torch.cat(bwh_list, dim=1)

    # Shape: [batch, num_anchors, H, W]
    det_confs = torch.cat(det_confs_list, dim=1)
    # Shape: [batch, num_anchors * H * W]
    det_confs = det_confs.view(output.size(0), num_anchors * output.size(2) * output.size(3))

    # Shape: [batch, num_anchors * num_classes, H, W]
    cls_confs = torch.cat(cls_confs_list, dim=1)
    # Shape: [batch, num_anchors, num_classes, H * W]
    cls_confs = cls_confs.view(output.size(0), num_anchors, num_classes, output.size(2) * output.size(3))
    # Shape: [batch, num_anchors, num_classes, H * W] --> [batch, num_anchors * H * W, num_classes]
    cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(output.size(0), num_anchors * output.size(2) * output.size(3), num_classes)

    # Apply sigmoid(), exp() and softmax() to slices
    bxy = torch.sigmoid(bxy) * scale_x_y - 0.5 * (scale_x_y - 1)
    bwh = torch.exp(bwh)
    det_confs = torch.sigmoid(det_confs)
    cls_confs = torch.sigmoid(cls_confs)

    # Prepare C-x, C-y, P-w, P-h (None of them are torch related)
    grid_x = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, output.size(3) - 1, output.size(3)), axis=0).repeat(output.size(2), 0), axis=0), axis=0)
    grid_y = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, output.size(2) - 1, output.size(2)), axis=1).repeat(output.size(3), 1), axis=0), axis=0)
    # grid_x = torch.linspace(0, W - 1, W).reshape(1, 1, 1, W).repeat(1, 1, H, 1)
    # grid_y = torch.linspace(0, H - 1, H).reshape(1, 1, H, 1).repeat(1, 1, 1, W)

    anchor_w = []
    anchor_h = []
    for i in range(num_anchors):
        anchor_w.append(anchors[i * 2])
        anchor_h.append(anchors[i * 2 + 1])

    device = None
    cuda_check = output.is_cuda
    if cuda_check:
        device = output.get_device()

    bx_list = []
    by_list = []
    bw_list = []
    bh_list = []

    # Apply C-x, C-y, P-w, P-h
    for i in range(num_anchors):
        ii = i * 2
        # Shape: [batch, 1, H, W]
        bx = bxy[:, ii : ii + 1] + torch.tensor(grid_x, device=device, dtype=torch.float32)  # grid_x.to(device=device, dtype=torch.float32)
        # Shape: [batch, 1, H, W]
        by = bxy[:, ii + 1 : ii + 2] + torch.tensor(grid_y, device=device, dtype=torch.float32)  # grid_y.to(device=device, dtype=torch.float32)
        # Shape: [batch, 1, H, W]
        bw = bwh[:, ii : ii + 1] * anchor_w[i]
        # Shape: [batch, 1, H, W]
        bh = bwh[:, ii + 1 : ii + 2] * anchor_h[i]

        bx_list.append(bx)
        by_list.append(by)
        bw_list.append(bw)
        bh_list.append(bh)

    ########################################
    #   Figure out bboxes from slices     #
    ########################################

    # Shape: [batch, num_anchors, H, W]
    bx = torch.cat(bx_list, dim=1)
    by = torch.cat(by_list, dim=1)
    bw = torch.cat(bw_list, dim=1)
    bh = torch.cat(bh_list, dim=1)

    # Shape: [batch, 2 * num_anchors, H, W]
    bx_bw = torch.cat((bx, bw), dim=1)
    by_bh = torch.cat((by, bh), dim=1)

    # normalize coordinates to [0, 1]
    bx_bw /= output.size(3)
    by_bh /= output.size(2)

    # Shape: [batch, num_anchors * H * W, 1]
    bx = bx_bw[:, :num_anchors].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)
    by = by_bh[:, :num_anchors].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)
    bw = bx_bw[:, num_anchors:].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)
    bh = by_bh[:, num_anchors:].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)

    bx1 = bx - bw * 0.5
    by1 = by - bh * 0.5
    bx2 = bx1 + bw
    by2 = by1 + bh

    # Shape: [batch, num_anchors * h * w, 4] -> [batch, num_anchors * h * w, 1, 4]
    boxes = torch.cat((bx1, by1, bx2, by2), dim=2).view(output.size(0), num_anchors * output.size(2) * output.size(3), 1, 4)
    # boxes = boxes.repeat(1, 1, num_classes, 1)

    # boxes:     [batch, num_anchors * H * W, 1, 4]
    # cls_confs: [batch, num_anchors * H * W, num_classes]
    # det_confs: [batch, num_anchors * H * W]

    det_confs = det_confs.view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)
    confs = cls_confs * det_confs

    # boxes: [batch, num_anchors * H * W, 1, 4]
    # confs: [batch, num_anchors * H * W, num_classes]

    return boxes, confs


class YoloLayer(nn.Module):
    ''' Yolo layer
    model_out: during inference, whether post-processing happens inside or outside the model
        true: outside
    '''
    def __init__(self, anchor_mask=[], num_classes=0, anchors=[], num_anchors=1, stride=32, model_out=False):
        super(YoloLayer, self).__init__()
        self.anchor_mask = anchor_mask
        self.num_classes = num_classes
        self.anchors = anchors
        self.num_anchors = num_anchors
        self.anchor_step = len(anchors) // num_anchors
        self.coord_scale = 1
        self.noobject_scale = 1
        self.object_scale = 5
        self.class_scale = 1
        self.thresh = 0.6
        self.stride = stride
        self.seen = 0
        self.scale_x_y = 1

        self.model_out = model_out

    def forward(self, output, target=None):
        if self.training:
            return output
        masked_anchors = []
        for m in self.anchor_mask:
            masked_anchors += self.anchors[m * self.anchor_step:(m + 1) * self.anchor_step]
        masked_anchors = [anchor / self.stride for anchor in masked_anchors]

        return yolo_forward_dynamic(output, self.thresh, self.num_classes, masked_anchors, len(self.anchor_mask), scale_x_y=self.scale_x_y)
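A worked example of the masked-anchor scaling in YoloLayer.forward, using the first three COCO anchors listed in the comment of utils.post_processing: dividing by the stride expresses anchor sizes in feature-grid cells, matching the grid-relative bw/bh decoded above.

anchors = [12, 16, 19, 36, 40, 28]  # pixel-space (w, h) pairs
anchor_mask = [0, 1, 2]
anchor_step = 2
stride = 8
masked_anchors = []
for m in anchor_mask:
    masked_anchors += anchors[m * anchor_step:(m + 1) * anchor_step]
masked_anchors = [a / stride for a in masked_anchors]
print(masked_anchors)  # [1.5, 2.0, 2.375, 4.5, 5.0, 3.5]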
1157
easymocap/estimator/YOLOv4/yolov4.cfg
Normal file
File diff suppressed because it is too large
122
easymocap/estimator/yolohrnet_wrapper.py
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
from ..annotator.file_utils import read_json
|
||||||
|
from .wrapper_base import check_result, create_annot_file, save_annot
|
||||||
|
from glob import glob
|
||||||
|
from os.path import join
|
||||||
|
from tqdm import tqdm
|
||||||
|
import os
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
def detect_frame(detector, img, pid=0, only_bbox=False):
|
||||||
|
lDetections = detector.detect([img], only_bbox=only_bbox)[0]
|
||||||
|
annots = []
|
||||||
|
for i in range(len(lDetections)):
|
||||||
|
annot = {
|
||||||
|
'bbox': [float(d) for d in lDetections[i]['bbox']],
|
||||||
|
'personID': pid + i,
|
||||||
|
'isKeyframe': False
|
||||||
|
}
|
||||||
|
if not only_bbox:
|
||||||
|
annot['keypoints'] = lDetections[i]['keypoints'].tolist()
|
||||||
|
annots.append(annot)
|
||||||
|
return annots
|
||||||
|
|
||||||
|
def extract_bbox(image_root, annot_root, ext, **config):
|
||||||
|
force = config.pop('force')
|
||||||
|
if check_result(image_root, annot_root) and not force:
|
||||||
|
return 0
|
||||||
|
import torch
|
||||||
|
from .YOLOv4 import YOLOv4
|
||||||
|
device = torch.device('cuda') \
|
||||||
|
if torch.cuda.is_available() else torch.device('cpu')
|
||||||
|
detector = YOLOv4(device=device, **config)
|
||||||
|
imgnames = sorted(glob(join(image_root, '*'+ext)))
|
||||||
|
if len(imgnames) == 0:
|
||||||
|
ext = '.png'
|
||||||
|
imgnames = sorted(glob(join(image_root, '*'+ext)))
|
||||||
|
# run_yolo(image_root, )
|
||||||
|
for imgname in tqdm(imgnames, desc='{:10s}'.format(os.path.basename(annot_root))):
|
||||||
|
base = os.path.basename(imgname).replace(ext, '')
|
||||||
|
annotname = join(annot_root, base+'.json')
|
||||||
|
annot = create_annot_file(annotname, imgname)
|
||||||
|
image = cv2.imread(imgname)
|
||||||
|
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
||||||
|
detections = detector.predict_single(image_rgb)
|
||||||
|
annots = []
|
||||||
|
pid = 0
|
||||||
|
for i in range(len(detections)):
|
||||||
|
annot_ = {
|
||||||
|
'bbox': [float(d) for d in detections[i]],
|
||||||
|
'isKeyframe': False
|
||||||
|
}
|
||||||
|
annot_['area'] = max(annot_['bbox'][2] - annot_['bbox'][0], annot_['bbox'][3] - annot_['bbox'][1])**2
|
||||||
|
annots.append(annot_)
|
||||||
|
annots.sort(key=lambda x:-x['area'])
|
||||||
|
# re-assign the person ID
|
||||||
|
for i in range(len(annots)):
|
||||||
|
annots[i]['personID'] = i + pid
|
||||||
|
annot['annots'] = annots
|
||||||
|
save_annot(annotname, annot)
|
||||||
|
|
||||||
|
def extract_hrnet(image_root, annot_root, ext, **config):
    config.pop('force')
    import torch
    imgnames = sorted(glob(join(image_root, '*'+ext)))
    device = torch.device('cuda') \
        if torch.cuda.is_available() else torch.device('cpu')
    from .HRNet import SimpleHRNet
    estimator = SimpleHRNet(device=device, **config)

    for imgname in tqdm(imgnames, desc='{:10s}'.format(os.path.basename(annot_root))):
        base = os.path.basename(imgname).replace(ext, '')
        annotname = join(annot_root, base+'.json')
        # keypoints are estimated inside the bboxes stored by extract_bbox
        annots = read_json(annotname)
        detections = np.array([data['bbox'] for data in annots['annots']])
        image = cv2.imread(imgname)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        points2d = estimator.predict(image_rgb, detections)
        for i in range(detections.shape[0]):
            annot_ = annots['annots'][i]
            annot_['keypoints'] = points2d[i]
        save_annot(annotname, annots)

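# Shape contract assumed by the loop above: detections is an (N, >=4) array of
# person boxes, and estimator.predict returns one keypoint set per box, so
# points2d[i] (one [x, y, confidence] triplet per joint) is attached back onto
# the i-th stored bbox.
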
def extract_yolo_hrnet(image_root, annot_root, ext, config_yolo, config_hrnet):
    config_yolo.pop('ext', None)
    imgnames = sorted(glob(join(image_root, '*{}'.format(ext))))
    import torch
    device = torch.device('cuda') \
        if torch.cuda.is_available() else torch.device('cpu')
    from .YOLOv4 import YOLOv4
    from .HRNet import SimpleHRNet
    detector = YOLOv4(device=device, **config_yolo)
    estimator = SimpleHRNet(device=device, **config_hrnet)

    for nf, imgname in enumerate(tqdm(imgnames, desc=os.path.basename(image_root))):
        base = os.path.basename(imgname).replace(ext, '')
        annotname = join(annot_root, base+'.json')
        annot = create_annot_file(annotname, imgname)
        image = cv2.imread(imgname)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # stage 1: detect person bboxes
        detections = detector.predict_single(image_rgb)
        # stage 2: estimate keypoints inside each bbox
        points2d = estimator.predict(image_rgb, detections)
        annots = []
        pid = 0
        for i in range(len(detections)):
            annot_ = {
                'bbox': [float(d) for d in detections[i]],
                'keypoints': points2d[i],
                'isKeyframe': False
            }
            annot_['area'] = max(annot_['bbox'][2] - annot_['bbox'][0],
                                 annot_['bbox'][3] - annot_['bbox'][1])**2
            annots.append(annot_)
        annots.sort(key=lambda x: -x['area'])
        # re-assign the person ID
        for i in range(len(annots)):
            annots[i]['personID'] = i + pid
        annot['annots'] = annots
        save_annot(annotname, annot)
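
# A minimal end-to-end sketch (config keys are assumptions; the real ones come
# from the calling script's configuration):
#   config_yolo = {'ckpt_path': 'data/models/yolov4.weights'}
#   config_hrnet = {'ckpt_path': 'data/models/pose_hrnet_w48.pth'}
#   extract_yolo_hrnet('data/examples/images/0', 'data/examples/annots/0',
#                      '.jpg', config_yolo, config_hrnet)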
184
scripts/preprocess/copy_dataset.py
Normal file
@ -0,0 +1,184 @@
'''
@ Date: 2021-06-14 15:39:26
@ Author: Qing Shuai
@ LastEditors: Qing Shuai
@ LastEditTime: 2022-08-02 21:50:40
@ FilePath: /EasyMocapPublic/scripts/preprocess/copy_dataset.py
'''
import os
import json
import shutil
from os.path import join
from glob import glob
from tqdm import tqdm
import cv2

from easymocap.mytools.debug_utils import myerror, mywarn

mkdir = lambda x: os.makedirs(x, exist_ok=True)

def save_json(file, data):
    dirname = os.path.dirname(file)
    # guard against plain filenames whose dirname is ''
    if dirname != '' and not os.path.exists(dirname):
        os.makedirs(dirname)
    with open(file, 'w') as f:
        json.dump(data, f, indent=4)

def read_json(path):
    with open(path) as f:
        data = json.load(f)
    return data

def copy_dataset(inp, out, start, end, step, keys, args):
    # per-camera data and the extension each kind is stored with
    copy_keys = {
        'images': args.ext,
        'annots': '.json',
        'mask-schp': '.png',
    }
    # data shared by all cameras, copied without a per-camera sub-folder
    copy_share_keys = {
        'output-keypoints3d/keypoints3d': '.json'
    }
    mkdir(out)
    if os.path.exists(join(inp, 'intri.yml')):
        shutil.copyfile(join(inp, 'intri.yml'), join(out, 'intri.yml'))
        shutil.copyfile(join(inp, 'extri.yml'), join(out, 'extri.yml'))
    if os.path.exists(join(inp, 'match_name.json')):
        names = read_json(join(inp, 'match_name.json'))
        names = names[start:end:step]
        save_json(join(out, 'match_name.json'), names)
    if os.path.exists(join(inp, 'sync_time.txt')):
        import numpy as np
        times = np.loadtxt(join(inp, 'sync_time.txt'))
        times = times.reshape(times.shape[0], -1)
        times = times[:, start:end:step]
        np.savetxt(join(out, 'sync_time.txt'), times, fmt='%10d')
    # record the selected range in an empty marker file
    os.system('touch ' + join(out, '{}-{}-{}'.format(start, end, step)))
    for copy, ext in copy_share_keys.items():
        if not os.path.exists(join(inp, copy)):
            continue
        if len(args.frames) == 0:
            ranges = [i for i in range(start, end, step)]
        else:
            ranges = args.frames
        outdir = join(out, copy)
        if os.path.exists(outdir) and len(os.listdir(outdir)) == len(ranges):
            # already copied; skip (mirrors the per-camera loop below)
            continue
        os.makedirs(outdir, exist_ok=True)
        for nnf, nf in enumerate(tqdm(ranges, desc='{}'.format(copy))):
            oldname = join(inp, copy, '{:06d}{}'.format(nf, ext))
            if not os.path.exists(oldname):
                mywarn('{} does not exist'.format(oldname))
                continue
            newname = join(outdir, '{:06d}{}'.format(nnf, ext))
            shutil.copyfile(oldname, newname)

    for copy in keys:
        ext = copy_keys.get(copy, '.json')
        if not os.path.exists(join(inp, copy)):
            continue
        if len(args.subs) == 0:
            subs = sorted(os.listdir(join(inp, copy)))
            subs = [s for s in subs if os.path.isdir(join(inp, copy, s))]
        else:
            subs = args.subs
        for sub in subs:
            outdir = join(out, copy, sub.replace(args.strip, ''))
            os.makedirs(outdir, exist_ok=True)
            if args.end == -1:
                # use all available frames of this camera
                oldnames = sorted(glob(join(inp, copy, sub, '*{}'.format(ext))))
                end = len(oldnames)
                print('{} has {} frames'.format(sub, end))
            if args.sample == -1:
                if len(args.frames) == 0:
                    ranges = [i for i in range(start, end, step)]
                else:
                    ranges = args.frames
            else:
                # sample a fixed number of frames, skipping strip_frame at both ends
                ranges = [(i/args.sample)*(end-start-2*args.strip_frame)+start+args.strip_frame for i in range(args.sample)]
                ranges = [int(i+0.5) for i in ranges]
            if os.path.exists(outdir) and len(os.listdir(outdir)) == len(ranges):
                mywarn('[copy] Skip {}'.format(outdir))
                continue
            for nnf, nf in enumerate(tqdm(ranges, desc='{}:{}'.format(sub, copy))):
                oldname = join(inp, copy, sub, '{:06d}{}'.format(nf, ext))
                if not os.path.exists(oldname):
                    # some data is stored as multiple files per frame: 000000_*.ext
                    oldnames = sorted(glob(join(inp, copy, sub, '{:06d}_*{}'.format(nf, ext))))
                    if len(oldnames) == 0:
                        myerror('{} does not exist'.format(oldname))
                        import ipdb; ipdb.set_trace()
                    else:
                        for oldname in oldnames:
                            newname = join(outdir, os.path.basename(oldname).replace('{:06d}'.format(nf), '{:06d}'.format(nnf)))
                            shutil.copyfile(oldname, newname)
                else:
                    newname = join(outdir, '{:06d}{}'.format(nnf, ext))
                    if copy == 'images' and args.scale != 1:
                        img = cv2.imread(oldname)
                        img = cv2.resize(img, None, fx=args.scale, fy=args.scale)
                        cv2.imwrite(newname, img)
                    else:
                        shutil.copyfile(oldname, newname)
        # make videos
        if copy == 'images' and args.make_video:
            os.makedirs(join(out, 'videos'), exist_ok=True)
            for sub in subs:
                shell = '{} -y -i {}/images/{}/%06d{} -vcodec libx264 {}/videos/{}.mp4 -loglevel quiet'.format(
                    args.ffmpeg, out, sub, ext, out, sub
                )
                print(shell)
                os.system(shell)

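# To make the --sample arithmetic above concrete: with start=0, end=1000,
# strip_frame=50 and sample=4, the usable span is 1000 - 0 - 2*50 = 900 frames:
#   ranges = [(i/4)*900 + 0 + 50 for i in range(4)]  # [50.0, 275.0, 500.0, 725.0]
#   ranges = [int(i+0.5) for i in ranges]            # [50, 275, 500, 725]
# i.e. evenly spaced frames, keeping 50 frames clear of each end.
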
def export(root, out, keys):
    mkdir(out)
    for key in keys:
        src = join(root, key)
        dst = join(out, key)
        if key == 'videos':
            if os.path.exists(src):
                shutil.copytree(src, dst)
            else:
                # no video folder yet: encode one video per camera from the images
                mkdir(dst)
                subs = sorted(os.listdir(join(root, 'images')))
                for sub in subs:
                    # note: relies on the module-level `args` set in __main__
                    cmd = '{ffmpeg} -r {fps} -i {inp}/%06d.jpg -vcodec libx264 {out}'.format(
                        ffmpeg=args.ffmpeg, fps=50, inp=join(root, 'images', sub),
                        out=join(dst, sub+'.mp4')
                    )
                    os.system(cmd)
            continue  # videos are handled above; don't copy the tree twice
        if not os.path.exists(src):
            print(src)
            continue
        shutil.copytree(src, dst)
    for name in ['intri.yml', 'extri.yml']:
        if os.path.exists(join(root, name)):
            shutil.copyfile(join(root, name), join(out, name))

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('path', type=str)
    parser.add_argument('out', type=str)
    parser.add_argument('--strip', type=str, default='')
    parser.add_argument('--keys', type=str, nargs='+', default=['images', 'annots', 'chessboard'])
    parser.add_argument('--subs', type=str, nargs='+', default=[])
    parser.add_argument('--start', type=int, default=0)
    parser.add_argument('--step', type=int, default=1)
    parser.add_argument('--end', type=int, default=-1)
    parser.add_argument('--scale', type=float, default=1)
    parser.add_argument('--strip_frame', type=int, default=0,
        help='remove the start frames and end frames')
    parser.add_argument('--ffmpeg', type=str, default='ffmpeg')
    parser.add_argument('--ext', type=str, default='.jpg')
    parser.add_argument('--sample', type=int, default=-1,
        help='use this flag to sample a fixed number of frames')
    parser.add_argument('--frames', type=int, default=[], nargs='+')
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--make_video', action='store_true')
    parser.add_argument('--export', action='store_true')
    args = parser.parse_args()
    if args.export:
        export(args.path, args.out, args.keys)
    else:
        copy_dataset(args.path, args.out, start=args.start, end=args.end, step=args.step, keys=args.keys, args=args)
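
Typical invocations of this script (paths are placeholders): copy every 5th frame of the first 1000, downscaling the images by half,

    python3 scripts/preprocess/copy_dataset.py /path/to/src /path/to/dst --start 0 --end 1000 --step 5 --scale 0.5 --keys images annots

or export a finished dataset, copying (or re-encoding) the per-camera videos:

    python3 scripts/preprocess/copy_dataset.py /path/to/src /path/to/dst --export --keys videos annots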