From 0175f07290f32f4c06c5212983fc215dcc9d788b Mon Sep 17 00:00:00 2001 From: Qing Shuai Date: Mon, 22 Aug 2022 00:07:46 +0800 Subject: [PATCH] support YOLOv4 + HRNet --- easymocap/estimator/HRNet/__init__.py | 8 + easymocap/estimator/HRNet/hrnet.py | 216 +++ easymocap/estimator/HRNet/hrnet_api.py | 527 ++++++++ easymocap/estimator/HRNet/modules.py | 72 + easymocap/estimator/YOLOv4/__init__.py | 8 + easymocap/estimator/YOLOv4/coco.names | 80 ++ easymocap/estimator/YOLOv4/config.py | 257 ++++ easymocap/estimator/YOLOv4/darknet2pytorch.py | 515 ++++++++ easymocap/estimator/YOLOv4/region_loss.py | 195 +++ easymocap/estimator/YOLOv4/torch_utils.py | 98 ++ easymocap/estimator/YOLOv4/utils.py | 221 ++++ easymocap/estimator/YOLOv4/yolo.py | 161 +++ easymocap/estimator/YOLOv4/yolo_layer.py | 322 +++++ easymocap/estimator/YOLOv4/yolov4.cfg | 1157 +++++++++++++++++ easymocap/estimator/yolohrnet_wrapper.py | 122 ++ scripts/preprocess/copy_dataset.py | 184 +++ 16 files changed, 4143 insertions(+) create mode 100644 easymocap/estimator/HRNet/__init__.py create mode 100644 easymocap/estimator/HRNet/hrnet.py create mode 100644 easymocap/estimator/HRNet/hrnet_api.py create mode 100644 easymocap/estimator/HRNet/modules.py create mode 100644 easymocap/estimator/YOLOv4/__init__.py create mode 100644 easymocap/estimator/YOLOv4/coco.names create mode 100644 easymocap/estimator/YOLOv4/config.py create mode 100644 easymocap/estimator/YOLOv4/darknet2pytorch.py create mode 100644 easymocap/estimator/YOLOv4/region_loss.py create mode 100644 easymocap/estimator/YOLOv4/torch_utils.py create mode 100644 easymocap/estimator/YOLOv4/utils.py create mode 100644 easymocap/estimator/YOLOv4/yolo.py create mode 100644 easymocap/estimator/YOLOv4/yolo_layer.py create mode 100644 easymocap/estimator/YOLOv4/yolov4.cfg create mode 100644 easymocap/estimator/yolohrnet_wrapper.py create mode 100644 scripts/preprocess/copy_dataset.py diff --git a/easymocap/estimator/HRNet/__init__.py b/easymocap/estimator/HRNet/__init__.py new file mode 100644 index 0000000..3e91d07 --- /dev/null +++ b/easymocap/estimator/HRNet/__init__.py @@ -0,0 +1,8 @@ +''' + @ Date: 2020-06-04 12:48:29 + @ LastEditors: Qing Shuai + @ LastEditTime: 2020-11-17 15:52:23 + @ Author: Qing Shuai + @ Mail: s_q@zju.edu.cn +''' +from .hrnet_api import SimpleHRNet \ No newline at end of file diff --git a/easymocap/estimator/HRNet/hrnet.py b/easymocap/estimator/HRNet/hrnet.py new file mode 100644 index 0000000..d3e27f4 --- /dev/null +++ b/easymocap/estimator/HRNet/hrnet.py @@ -0,0 +1,216 @@ +import torch +from torch import nn +from .modules import BasicBlock, Bottleneck + + +class StageModule(nn.Module): + def __init__(self, stage, output_branches, c, bn_momentum): + super(StageModule, self).__init__() + self.stage = stage + self.output_branches = output_branches + + self.branches = nn.ModuleList() + for i in range(self.stage): + w = c * (2 ** i) + branch = nn.Sequential( + BasicBlock(w, w, bn_momentum=bn_momentum), + BasicBlock(w, w, bn_momentum=bn_momentum), + BasicBlock(w, w, bn_momentum=bn_momentum), + BasicBlock(w, w, bn_momentum=bn_momentum), + ) + self.branches.append(branch) + + self.fuse_layers = nn.ModuleList() + # for each output_branches (i.e. 
each branch in all cases but the very last one) + for i in range(self.output_branches): + self.fuse_layers.append(nn.ModuleList()) + for j in range(self.stage): # for each branch + if i == j: + self.fuse_layers[-1].append(nn.Sequential()) # Used in place of "None" because it is callable + elif i < j: + self.fuse_layers[-1].append(nn.Sequential( + nn.Conv2d(c * (2 ** j), c * (2 ** i), kernel_size=(1, 1), stride=(1, 1), bias=False), + nn.BatchNorm2d(c * (2 ** i), eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), + nn.Upsample(scale_factor=(2.0 ** (j - i)), mode='nearest'), + )) + elif i > j: + ops = [] + for k in range(i - j - 1): + ops.append(nn.Sequential( + nn.Conv2d(c * (2 ** j), c * (2 ** j), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), + bias=False), + nn.BatchNorm2d(c * (2 ** j), eps=1e-05, momentum=0.1, affine=True, + track_running_stats=True), + nn.ReLU(inplace=True), + )) + ops.append(nn.Sequential( + nn.Conv2d(c * (2 ** j), c * (2 ** i), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), + bias=False), + nn.BatchNorm2d(c * (2 ** i), eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), + )) + self.fuse_layers[-1].append(nn.Sequential(*ops)) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + assert len(self.branches) == len(x) + + x = [branch(b) for branch, b in zip(self.branches, x)] + + x_fused = [] + for i in range(len(self.fuse_layers)): + for j in range(0, len(self.branches)): + if j == 0: + x_fused.append(self.fuse_layers[i][0](x[0])) + else: + x_fused[i] = x_fused[i] + self.fuse_layers[i][j](x[j]) + + for i in range(len(x_fused)): + x_fused[i] = self.relu(x_fused[i]) + + return x_fused + + +class HRNet(nn.Module): + def __init__(self, c=48, nof_joints=17, bn_momentum=0.1): + super(HRNet, self).__init__() + + # Input (stem net) + self.conv1 = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) + self.bn1 = nn.BatchNorm2d(64, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True) + self.conv2 = nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) + self.bn2 = nn.BatchNorm2d(64, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True) + self.relu = nn.ReLU(inplace=True) + + # Stage 1 (layer1) - First group of bottleneck (resnet) modules + downsample = nn.Sequential( + nn.Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False), + nn.BatchNorm2d(256, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True), + ) + self.layer1 = nn.Sequential( + Bottleneck(64, 64, downsample=downsample), + Bottleneck(256, 64), + Bottleneck(256, 64), + Bottleneck(256, 64), + ) + + # Fusion layer 1 (transition1) - Creation of the first two branches (one full and one half resolution) + self.transition1 = nn.ModuleList([ + nn.Sequential( + nn.Conv2d(256, c, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False), + nn.BatchNorm2d(c, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True), + nn.ReLU(inplace=True), + ), + nn.Sequential(nn.Sequential( # Double Sequential to fit with official pretrained weights + nn.Conv2d(256, c * (2 ** 1), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False), + nn.BatchNorm2d(c * (2 ** 1), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True), + nn.ReLU(inplace=True), + )), + ]) + + # Stage 2 (stage2) - Second module with 1 group of bottleneck (resnet) modules. 
This has 2 branches + self.stage2 = nn.Sequential( + StageModule(stage=2, output_branches=2, c=c, bn_momentum=bn_momentum), + ) + + # Fusion layer 2 (transition2) - Creation of the third branch (1/4 resolution) + self.transition2 = nn.ModuleList([ + nn.Sequential(), # None, - Used in place of "None" because it is callable + nn.Sequential(), # None, - Used in place of "None" because it is callable + nn.Sequential(nn.Sequential( # Double Sequential to fit with official pretrained weights + nn.Conv2d(c * (2 ** 1), c * (2 ** 2), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False), + nn.BatchNorm2d(c * (2 ** 2), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True), + nn.ReLU(inplace=True), + )), # ToDo Why the new branch derives from the "upper" branch only? + ]) + + # Stage 3 (stage3) - Third module with 4 groups of bottleneck (resnet) modules. This has 3 branches + self.stage3 = nn.Sequential( + StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum), + StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum), + StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum), + StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum), + ) + + # Fusion layer 3 (transition3) - Creation of the fourth branch (1/8 resolution) + self.transition3 = nn.ModuleList([ + nn.Sequential(), # None, - Used in place of "None" because it is callable + nn.Sequential(), # None, - Used in place of "None" because it is callable + nn.Sequential(), # None, - Used in place of "None" because it is callable + nn.Sequential(nn.Sequential( # Double Sequential to fit with official pretrained weights + nn.Conv2d(c * (2 ** 2), c * (2 ** 3), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False), + nn.BatchNorm2d(c * (2 ** 3), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True), + nn.ReLU(inplace=True), + )), # ToDo Why the new branch derives from the "upper" branch only? + ]) + + # Stage 4 (stage4) - Fourth module with 3 groups of bottleneck (resnet) modules. 
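# --- Illustrative note (not part of the original patch): branch i of this network
# carries c * 2**i channels at 1/(4 * 2**i) of the input resolution (the stem's two
# stride-2 convolutions already divide the input by 4). Each transition adds one new
# half-resolution branch via a stride-2 3x3 conv, and StageModule fuses branches with
# a 1x1 conv + nearest upsampling (low -> high resolution) or chained stride-2 3x3
# convs (high -> low). Only the highest-resolution branch reaches final_layer, a 1x1
# conv that outputs nof_joints heatmaps.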
This has 4 branches + self.stage4 = nn.Sequential( + StageModule(stage=4, output_branches=4, c=c, bn_momentum=bn_momentum), + StageModule(stage=4, output_branches=4, c=c, bn_momentum=bn_momentum), + StageModule(stage=4, output_branches=1, c=c, bn_momentum=bn_momentum), + ) + + # Final layer (final_layer) + self.final_layer = nn.Conv2d(c, nof_joints, kernel_size=(1, 1), stride=(1, 1)) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + + x = self.layer1(x) + x = [trans(x) for trans in self.transition1] # Since now, x is a list (# == nof branches) + + x = self.stage2(x) + # x = [trans(x[-1]) for trans in self.transition2] # New branch derives from the "upper" branch only + x = [ + self.transition2[0](x[0]), + self.transition2[1](x[1]), + self.transition2[2](x[-1]) + ] # New branch derives from the "upper" branch only + + x = self.stage3(x) + # x = [trans(x) for trans in self.transition3] # New branch derives from the "upper" branch only + x = [ + self.transition3[0](x[0]), + self.transition3[1](x[1]), + self.transition3[2](x[2]), + self.transition3[3](x[-1]) + ] # New branch derives from the "upper" branch only + + x = self.stage4(x) + + x = self.final_layer(x[0]) + + return x + + +if __name__ == '__main__': + # model = HRNet(48, 17, 0.1) + model = HRNet(32, 17, 0.1) + + # print(model) + + model.load_state_dict( + # torch.load('./weights/pose_hrnet_w48_384x288.pth') + torch.load('./weights/pose_hrnet_w32_256x192.pth') + ) + print('ok!!') + + if torch.cuda.is_available() and False: + torch.backends.cudnn.deterministic = True + device = torch.device('cuda:0') + else: + device = torch.device('cpu') + + print(device) + + model = model.to(device) + + y = model(torch.ones(1, 3, 384, 288).to(device)) + print(y.shape) + print(torch.min(y).item(), torch.mean(y).item(), torch.max(y).item()) diff --git a/easymocap/estimator/HRNet/hrnet_api.py b/easymocap/estimator/HRNet/hrnet_api.py new file mode 100644 index 0000000..3084dee --- /dev/null +++ b/easymocap/estimator/HRNet/hrnet_api.py @@ -0,0 +1,527 @@ +''' + @ Date: 2020-06-04 12:47:04 + @ LastEditors: Qing Shuai + @ LastEditTime: 2022-04-19 17:02:57 + @ Author: Qing Shuai + @ Mail: s_q@zju.edu.cn +''' +from os.path import join +import cv2 +import numpy as np +import torch +from torchvision.transforms import transforms + +from .hrnet import HRNet + +COCO17_IN_BODY25 = [0,16,15,18,17,5,2,6,3,7,4,12,9,13,10,14,11] +pairs = [[1, 8], [1, 2], [1, 5], [2, 3], [3, 4], [5, 6], [6, 7], [8, 9], [9, 10], [10, 11], [8, 12], [12, 13], [13, 14], [1, 0], [0,15], [15,17], [0,16], [16,18], [14,19], [19,20], [14,21], [11,22], [22,23], [11,24]] +def coco17tobody25(points2d): + kpts = np.zeros((points2d.shape[0], 25, 3)) + kpts[:, COCO17_IN_BODY25, :2] = points2d[:, :, :2] + kpts[:, COCO17_IN_BODY25, 2:3] = points2d[:, :, 2:3] + kpts[:, 8, :2] = kpts[:, [9, 12], :2].mean(axis=1) + kpts[:, 8, 2] = kpts[:, [9, 12], 2].min(axis=1) + kpts[:, 1, :2] = kpts[:, [2, 5], :2].mean(axis=1) + kpts[:, 1, 2] = kpts[:, [2, 5], 2].min(axis=1) + # 需要交换一下 + # kpts = kpts[:, :, [1,0,2]] + return kpts + +# 生成高斯核 +def generate_gauss(sigma): + tmp_size = sigma * 3 + size = 2 * tmp_size + 1 + x = np.arange(0, size, 1, np.float32) + y = x[:, np.newaxis] + x0 = y0 = size // 2 + # The gaussian is not normalized, we want the center value to equal 1 + g = np.exp(- ((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2)) + return g, tmp_size + +gauss = {} +for SIGMA in range(1, 5): + gauss_kernel, gauss_radius = 
generate_gauss(SIGMA) + gauss[SIGMA] = { + 'kernel': gauss_kernel, + 'radius': gauss_radius + } + +def box_to_center_scale(box, model_image_width, model_image_height, scale_factor=1.25): + """convert a box to center,scale information required for pose transformation + Parameters + ---------- + box : list of tuple + list of length 2 with two tuples of floats representing + bottom left and top right corner of a box + model_image_width : int + model_image_height : int + + Returns + ------- + (numpy array, numpy array) + Two numpy arrays, coordinates for the center of the box and the scale of the box + """ + center = np.zeros((2), dtype=np.float32) + + bottom_left_corner = (box[0], box[1]) + top_right_corner = (box[2], box[3]) + box_width = top_right_corner[0]-bottom_left_corner[0] + box_height = top_right_corner[1]-bottom_left_corner[1] + bottom_left_x = bottom_left_corner[0] + bottom_left_y = bottom_left_corner[1] + center[0] = bottom_left_x + box_width * 0.5 + center[1] = bottom_left_y + box_height * 0.5 + + aspect_ratio = model_image_width * 1.0 / model_image_height + pixel_std = 200 + + if box_width > aspect_ratio * box_height: + box_height = box_width * 1.0 / aspect_ratio + elif box_width < aspect_ratio * box_height: + box_width = box_height * aspect_ratio + scale = np.array( + [box_width * 1.0 / pixel_std, box_height * 1.0 / pixel_std], + dtype=np.float32) + scale = scale * scale_factor + return center, scale + +def get_dir(src_point, rot_rad): + sn, cs = np.sin(rot_rad), np.cos(rot_rad) + + src_result = [0, 0] + src_result[0] = src_point[0] * cs - src_point[1] * sn + src_result[1] = src_point[0] * sn + src_point[1] * cs + + return src_result + +def get_3rd_point(a, b): + direct = a - b + return b + np.array([-direct[1], direct[0]], dtype=np.float32) + + +def get_affine_transform( + center, scale, rot, output_size, + shift=np.array([0, 0], dtype=np.float32), inv=0 +): + if not isinstance(scale, np.ndarray) and not isinstance(scale, list): + print(scale) + scale = np.array([scale, scale]) + + scale_tmp = scale * 200.0 + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = get_dir([0, src_w * -0.5], rot_rad) + dst_dir = np.array([0, dst_w * -0.5], np.float32) + + src = np.zeros((3, 2), dtype=np.float32) + dst = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + + src[2:, :] = get_3rd_point(src[0, :], src[1, :]) + dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return trans + + +def get_max_preds(batch_heatmaps): + ''' + get predictions from score maps + heatmaps: numpy.ndarray([batch_size, num_joints, height, width]) + ''' + assert isinstance(batch_heatmaps, np.ndarray), \ + 'batch_heatmaps should be numpy.ndarray' + assert batch_heatmaps.ndim == 4, 'batch_images should be 4-ndim' + + batch_size = batch_heatmaps.shape[0] + num_joints = batch_heatmaps.shape[1] + width = batch_heatmaps.shape[3] + heatmaps_reshaped = batch_heatmaps.reshape((batch_size, num_joints, -1)) + idx = np.argmax(heatmaps_reshaped, 2) + maxvals = np.amax(heatmaps_reshaped, 2) + + maxvals = maxvals.reshape((batch_size, num_joints, 1)) + idx = idx.reshape((batch_size, num_joints, 1)) + + preds = np.tile(idx, (1, 1, 
2)).astype(np.float32) + + preds[:, :, 0] = (preds[:, :, 0]) % width + preds[:, :, 1] = np.floor((preds[:, :, 1]) / width) + + pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2)) + pred_mask = pred_mask.astype(np.float32) + + preds *= pred_mask + return preds, maxvals + +def affine_transform(pt, t): + new_pt = np.array([pt[0], pt[1], 1.]).T + new_pt = np.dot(t, new_pt) + return new_pt[:2] + +def batch_affine_transform(points, trans): + points = np.hstack((points[:, :2], np.ones((points.shape[0], 1)))) + out = points @ trans.T + return out + +def transform_preds(coords, center, scale, rot, output_size): + target_coords = np.zeros(coords.shape) + trans = get_affine_transform(center, scale, rot, output_size, inv=1) + target_coords[:, :2] = batch_affine_transform(coords, trans) + return target_coords + +config_ = {'kintree': [[1, 0], [2, 0], [3, 1], [4, 2], [5, 0], [6, 0], [7, 5], [8, 6], [9, 7], [10, 8], [11, 5], [12, 6], [13, 11], [ + 14, 12], [15, 13], [16, 14], [6, 5], [12, 11]], 'color': ['g', 'r', 'g', 'r', 'g', 'r', 'g', 'r', 'g', 'r', 'g', 'r', 'g', 'r', 'g', 'r', 'k', 'k']} +colors_table = { + # colorblind/print/copy safe: + '_blue': [0.65098039, 0.74117647, 0.85882353], + '_pink': [.9, .7, .7], + '_mint': [ 166/255., 229/255., 204/255.], + '_mint2': [ 202/255., 229/255., 223/255.], + '_green': [ 153/255., 216/255., 201/255.], + '_green2': [ 171/255., 221/255., 164/255.], + '_red': [ 251/255., 128/255., 114/255.], + '_orange': [ 253/255., 174/255., 97/255.], + '_yellow': [ 250/255., 230/255., 154/255.], + 'r':[255/255,0,0], + 'g':[0,255/255,0], + 'b':[0,0,255/255], + 'k':[0,0,0], + 'y':[255/255,255/255,0], + 'purple':[128/255,0,128/255] +} +for key, val in colors_table.items(): + colors_table[key] = tuple([int(val[2]*255), int(val[1]*255), int(val[0]*255)]) + +def save_batch_heatmaps(batch_image, batch_heatmaps, file_name, + normalize=True): + ''' + batch_image: [batch_size, channel, height, width] + batch_heatmaps: ['batch_size, num_joints, height, width] + file_name: saved file name + ''' + if normalize: + batch_image = batch_image.clone() + min = float(batch_image.min()) + max = float(batch_image.max()) + + batch_image.add_(-min).div_(max - min + 1e-5) + + batch_size = batch_heatmaps.size(0) + num_joints = batch_heatmaps.size(1) + heatmap_height = batch_heatmaps.size(2) + heatmap_width = batch_heatmaps.size(3) + + grid_image = np.zeros((batch_size*heatmap_height, + (num_joints+2)*heatmap_width, + 3), + dtype=np.uint8) + + preds, maxvals = get_max_preds(batch_heatmaps.detach().cpu().numpy()) + + for i in range(batch_size): + image = batch_image[i].mul(255)\ + .clamp(0, 255)\ + .byte()\ + .permute(1, 2, 0)\ + .cpu().numpy() + heatmaps = batch_heatmaps[i].mul(255)\ + .clamp(0, 255)\ + .byte()\ + .cpu().numpy() + + resized_image = cv2.resize(image, + (int(heatmap_width), int(heatmap_height))) + resized_image_copy = resized_image.copy() + height_begin = heatmap_height * i + height_end = heatmap_height * (i + 1) + for ip in range(len(config_['kintree'])): + src, dst = config_['kintree'][ip] + c = config_['color'][ip] + if maxvals[i][src] < 0.1 or maxvals[i][dst] < 0.1: + continue + plot_line(resized_image_copy, preds[i][src], preds[i][dst], colors_table[c], 1) + for j in range(num_joints): + cv2.circle(resized_image, + (int(preds[i][j][0]), int(preds[i][j][1])), + 1, [0, 0, 255], 1) + heatmap = heatmaps[j, :, :] + mask = (heatmap > 0.1)[:,:,None] + colored_heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET) + masked_image = (colored_heatmap*0.7 + resized_image*0.3)*mask + 
resized_image*(1-mask) + cv2.circle(masked_image, + (int(preds[i][j][0]), int(preds[i][j][1])), + 1, [0, 0, 255], 1) + + width_begin = heatmap_width * (j+2) + width_end = heatmap_width * (j+2+1) + grid_image[height_begin:height_end, width_begin:width_end, :] = \ + masked_image + # grid_image[height_begin:height_end, width_begin:width_end, :] = \ + # colored_heatmap*0.7 + resized_image*0.3 + + grid_image[height_begin:height_end, 0:heatmap_width, :] = resized_image + grid_image[height_begin:height_end, heatmap_width:heatmap_width+heatmap_width, :] = resized_image_copy + cv2.imwrite(file_name, grid_image) + +import math + +def get_final_preds(batch_heatmaps, center, scale, rot=None, flip=None): + coords, maxvals = get_max_preds(batch_heatmaps) + + heatmap_height = batch_heatmaps.shape[2] + heatmap_width = batch_heatmaps.shape[3] + + # post-processing + if True: + for n in range(coords.shape[0]): + for p in range(coords.shape[1]): + hm = batch_heatmaps[n][p] + px = int(math.floor(coords[n][p][0] + 0.5)) + py = int(math.floor(coords[n][p][1] + 0.5)) + if 1 < px < heatmap_width-1 and 1 < py < heatmap_height-1: + diff = np.array( + [ + hm[py][px+1] - hm[py][px-1], + hm[py+1][px]-hm[py-1][px] + ] + ) + coords[n][p] += np.sign(diff) * .25 + + preds = coords.copy() + + # Transform back + for i in range(coords.shape[0]): + if flip is not None: + if flip[i]: + coords[i, :, 0] = heatmap_width - 1 - coords[i, :, 0] + if rot is None: + _rot = 0 + else: + _rot = rot[i] + preds[i] = transform_preds( + coords[i], center[i], scale[i], _rot, [heatmap_width, heatmap_height] + ) + return preds, maxvals + +def get_gaussian_maps(net_out, keypoints, sigma): + radius, kernel = gauss[sigma]['radius'], gauss[sigma]['kernel'] + weights = np.ones(net_out.shape, dtype=np.float32) + for i in range(weights.shape[0]): + for nj in range(weights.shape[1]): + if keypoints[i][nj][2] < 0: + weights[i][nj] = 0 + continue + elif keypoints[i][nj][2] < 0.01: + weights[i][nj] = 0 + continue + weights[i][nj] = 0 + mu_x, mu_y = keypoints[i][nj][:2] + mu_x, mu_y = int(mu_x + 0.5), int(mu_y + 0.5) + # Usable gaussian range + ul = [mu_x - radius, mu_y - radius] + br = [mu_x + radius + 1, mu_y + radius + 1] + # Usable gaussian range + g_x = max(0, -ul[0]), min(br[0], weights.shape[3]) - ul[0] + g_y = max(0, -ul[1]), min(br[1], weights.shape[2]) - ul[1] + # Image range + img_x = max(0, ul[0]), min(br[0], weights.shape[3]) + img_y = max(0, ul[1]), min(br[1], weights.shape[2]) + weights[i][nj][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \ + kernel[g_y[0]:g_y[1], g_x[0]:g_x[1]] + return weights + +humanId = 0 + +class SimpleHRNet: + def __init__(self, c, nof_joints, checkpoint_path, device, resolution=(288, 384),): + self.device = device + self.c = c + self.nof_joints = nof_joints + self.checkpoint_path = checkpoint_path + self.max_batch_size = 64 + self.resolution = resolution # in the form (height, width) as in the original implementation + self.transform = transforms.Compose([ + # transforms.ToPILImage(), + # transforms.Resize((self.resolution[0], self.resolution[1])), # (height, width) + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + self.model = HRNet(c=c, nof_joints=nof_joints).to(device) + self.model.load_state_dict(torch.load(checkpoint_path, map_location=self.device)) + self.model.eval() + + def __call__(self, image, bboxes, rot=0, net_out=False): + # image: + images = torch.zeros((len(bboxes), 3, self.resolution[1], self.resolution[0]), device=self.device) # (height, width) + 
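# --- Illustrative sketch (not part of the original patch): the top-down flow that
# __call__ implements below, using only helpers defined earlier in this file. The
# box values are made up for illustration.
#
#   box = [100., 50., 300., 450.]                      # x_min, y_min, x_max, y_max
#   center, scale = box_to_center_scale(box, self.resolution[0], self.resolution[1])
#   # scale is expressed in units of 200 px and already padded by scale_factor=1.25
#   trans = get_affine_transform(center, scale, rot=0, output_size=self.resolution)
#   crop = cv2.warpAffine(image, trans,
#                         (int(self.resolution[0]), int(self.resolution[1])),
#                         flags=cv2.INTER_LINEAR)
#   inp = self.transform(crop)                          # HWC uint8 -> normalized CHW
#   heatmaps = self.model(inp[None].to(self.device))    # nof_joints maps at 1/4 crop size
#   # get_final_preds() then inverts the affine transform to return joints in original
#   # image pixels, and coco17tobody25() lifts COCO-17 to BODY-25, synthesising the
#   # neck (1) and mid-hip (8) joints from the shoulders and hips.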
if len(bboxes) > 0: + # pose estimation : for multiple people + centers, scales, trans_all = [], [], [] + for box in bboxes: + center, scale = box_to_center_scale(box, self.resolution[0], self.resolution[1]) + centers.append(center) + scales.append(scale) + trans = get_affine_transform(center, scale, rot=rot, output_size=self.resolution) + trans_all.append(trans) + for i, trans in enumerate(trans_all): + # Crop smaller image of people + model_input = cv2.warpAffine( + image, trans, + (int(self.resolution[0]), int(self.resolution[1])), + flags=cv2.INTER_LINEAR) + # cv2.imshow('input', model_input) + # cv2.waitKey(0) + # hwc -> 1chw + model_input = self.transform(model_input)#.unsqueeze(0) + images[i] = model_input + images = images.to(self.device) + with torch.no_grad(): + out = self.model(images) + out = out.cpu().detach().numpy() + if net_out: + return out, trans_all, centers, scales, rot + coords, max_val = get_final_preds( + out, + np.asarray(centers), + np.asarray(scales), + [rot for _ in range(out.shape[0])]) + pts = np.concatenate((coords, max_val), axis=2) + return coco17tobody25(pts) + else: + return np.empty(0, 25, 3) + + def predict_with_previous(self, image, bboxes, keypoints, sigma): + # (batch, nJoints, height, width) + net_out, trans_all, centers, scales, rot = self.__call__(image, bboxes, net_out=True) + keypoints = keypoints[:, COCO17_IN_BODY25] + keypoints_rescale = keypoints.copy() + for i in range(keypoints.shape[0]): + keypoints_rescale[..., :2] = batch_affine_transform(keypoints[i], trans_all[i])/4 + weights = get_gaussian_maps(net_out, keypoints_rescale, sigma) + out = net_out * weights + coords, max_val = get_final_preds( + out, + np.asarray(centers), + np.asarray(scales), + rot) + pts = np.concatenate((coords, max_val), axis=2) + return coco17tobody25(pts) + + def predict(self, image, detections, keypoints=None, ret_crop=False): + if keypoints is not None: + keypoints = keypoints[:, COCO17_IN_BODY25] + kpts_rescale = [None for _ in range(len(keypoints))] + boxes = [] + rotation = 0 + image_pose = image + # image_pose = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + if detections is not None: + images = torch.zeros((len(detections), 3, self.resolution[1], self.resolution[0]), device=self.device) # (height, width) + # pose estimation : for multiple people + centers = [] + scales = [] + for box in detections: + center, scale = box_to_center_scale(box, self.resolution[0], self.resolution[1]) + centers.append(center) + scales.append(scale) + model_inputs = [] + for i, (center, scale) in enumerate(zip(centers, scales)): + trans = get_affine_transform(center, scale, rotation, self.resolution) + # Crop smaller image of people + model_input = cv2.warpAffine( + image_pose, + trans, + (int(self.resolution[0]), int(self.resolution[1])), + flags=cv2.INTER_LINEAR) + if keypoints is not None: + kpts_homo = keypoints[i].copy() + kpts_homo[:, 2] = 1 + kpts_rescale[i] = (kpts_homo @ trans.T)/4 + # global humanId + # cv2.imwrite('../output/debughrnet/person_{}.jpg'.format(humanId), model_input[:,:,[2,1,0]]) + # humanId += 1 + # hwc -> 1chw + model_input = self.transform(model_input)#.unsqueeze(0) + images[i] = model_input + # torch.cuda.synchronize(self.device) + + # print(' - spending {:.2f}ms in preprocess.'.format(1000*(time.time() - start))) + if images.shape[0] == 0: + return np.empty((0, 25, 3)) + else: + # start = time.time() + images = images.to(self.device) + # torch.cuda.synchronize(self.device) + + # print(' - spending {:.2f}ms in copy to cuda.'.format(1000*(time.time() - 
start))) + # start = time.time() + with torch.no_grad(): + if len(images) <= self.max_batch_size: + out = self.model(images) + else: + out = torch.empty( + (images.shape[0], self.nof_joints, self.resolution[1] // 4, self.resolution[0] // 4) + ).to(self.device) + for i in range(0, len(images), self.max_batch_size): + out[i:i + self.max_batch_size] = self.model(images[i:i + self.max_batch_size]) + # torch.cuda.synchronize(self.device) + global humanId + if keypoints is not None: + filename = join('../output/debughrnet', '{:06d}.jpg'.format(humanId)) + humanId += 1 + # save_batch_heatmaps(images, out, filename) + # 制造高斯核,默认为1 + weights = np.ones(out.shape, dtype=np.float32) + for i in range(weights.shape[0]): + for nj in range(weights.shape[1]): + if keypoints[i][nj][2] < 0: + weights[i][nj] = 0 + continue + elif keypoints[i][nj][2] < 0.01: + continue + weights[i][nj] = 0 + mu_x, mu_y = kpts_rescale[i][nj] + mu_x, mu_y = int(mu_x + 0.5), int(mu_y + 0.5) + # Usable gaussian range + ul = [mu_x - gauss_radius, mu_y - gauss_radius] + br = [mu_x + gauss_radius + 1, mu_y + gauss_radius + 1] + # Usable gaussian range + g_x = max(0, -ul[0]), min(br[0], weights.shape[3]) - ul[0] + g_y = max(0, -ul[1]), min(br[1], weights.shape[2]) - ul[1] + # Image range + img_x = max(0, ul[0]), min(br[0], weights.shape[3]) + img_y = max(0, ul[1]), min(br[1], weights.shape[2]) + weights[i][nj][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \ + gauss_kernel[g_y[0]:g_y[1], g_x[0]:g_x[1]] + filename = join('../output/debughrnet', '{:06d}.jpg'.format(humanId)) + humanId += 1 + # save_batch_heatmaps(images, torch.Tensor(weights), filename) + out = out.cpu().detach().numpy() + out = out * weights + filename = join('../output/debughrnet', '{:06d}.jpg'.format(humanId)) + humanId += 1 + # save_batch_heatmaps(images, torch.Tensor(out), filename) + else: + out = out.cpu().detach().numpy() + coords, max_val = get_final_preds( + out, + np.asarray(centers), + np.asarray(scales)) + pts = np.concatenate((coords, max_val), axis=2) + # torch.cuda.synchronize(self.device) + # print(' - spending {:.2f}ms in postprocess.'.format(1000*(time.time() - start))) + # print('') + if ret_crop: + return coco17tobody25(pts), images + else: + return coco17tobody25(pts) \ No newline at end of file diff --git a/easymocap/estimator/HRNet/modules.py b/easymocap/estimator/HRNet/modules.py new file mode 100644 index 0000000..733fedd --- /dev/null +++ b/easymocap/estimator/HRNet/modules.py @@ -0,0 +1,72 @@ +import torch +from torch import nn + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, bn_momentum=0.1): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=bn_momentum) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=bn_momentum) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, momentum=bn_momentum) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = 
self.relu(out) + + return out + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, bn_momentum=0.1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=bn_momentum) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=bn_momentum) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out diff --git a/easymocap/estimator/YOLOv4/__init__.py b/easymocap/estimator/YOLOv4/__init__.py new file mode 100644 index 0000000..25b4f96 --- /dev/null +++ b/easymocap/estimator/YOLOv4/__init__.py @@ -0,0 +1,8 @@ +''' + @ Date: 2020-12-10 16:37:04 + @ Author: Qing Shuai + @ LastEditors: Qing Shuai + @ LastEditTime: 2020-12-10 16:52:06 + @ FilePath: /mvpose/code/estimator/YOLOv4/__init__.py +''' +from .yolo import YOLOv4 \ No newline at end of file diff --git a/easymocap/estimator/YOLOv4/coco.names b/easymocap/estimator/YOLOv4/coco.names new file mode 100644 index 0000000..ca76c80 --- /dev/null +++ b/easymocap/estimator/YOLOv4/coco.names @@ -0,0 +1,80 @@ +person +bicycle +car +motorbike +aeroplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +sofa +pottedplant +bed +diningtable +toilet +tvmonitor +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/easymocap/estimator/YOLOv4/config.py b/easymocap/estimator/YOLOv4/config.py new file mode 100644 index 0000000..04b60b8 --- /dev/null +++ b/easymocap/estimator/YOLOv4/config.py @@ -0,0 +1,257 @@ +import torch +from .torch_utils import convert2cpu + +def parse_cfg(cfgfile): + blocks = [] + fp = open(cfgfile, 'r') + block = None + line = fp.readline() + while line != '': + line = line.rstrip() + if line == '' or line[0] == '#': + line = fp.readline() + continue + elif line[0] == '[': + if block: + blocks.append(block) + block = dict() + block['type'] = line.lstrip('[').rstrip(']') + # set default value + if block['type'] == 'convolutional': + block['batch_normalize'] = 0 + else: + key, value = line.split('=') + key = key.strip() + if key == 'type': + key = '_type' + value = value.strip() + block[key] = value + line = fp.readline() + + if block: + blocks.append(block) + fp.close() + return blocks + + +def print_cfg(blocks): + print('layer filters size input output'); + prev_width = 416 + prev_height = 416 + prev_filters = 3 + out_filters = [] + out_widths = [] + out_heights = [] + ind = -2 + for block in blocks: + ind = ind + 1 + if block['type'] == 'net': + prev_width = int(block['width']) + prev_height = int(block['height']) + continue + elif block['type'] == 'convolutional': 
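# --- Illustrative sketch (not part of the original patch): the structure parse_cfg()
# returns and that print_cfg()/create_network() consume. Every [section] of the cfg
# becomes one dict; values read from the file stay strings until the consumers
# convert them.
#
#   blocks = parse_cfg('easymocap/estimator/YOLOv4/yolov4.cfg')
#   blocks[0]['type']                 # 'net' -- holds width/height/channels, etc.
#   blocks[1]['type']                 # first real layer, e.g. 'convolutional'
#   blocks[1].get('batch_normalize')  # convolutional sections default this to 0
#   # a key literally named "type" inside a section is stored as '_type'
#   # (used by [cost] sections to select the loss).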
+ filters = int(block['filters']) + kernel_size = int(block['size']) + stride = int(block['stride']) + is_pad = int(block['pad']) + pad = (kernel_size - 1) // 2 if is_pad else 0 + width = (prev_width + 2 * pad - kernel_size) // stride + 1 + height = (prev_height + 2 * pad - kernel_size) // stride + 1 + print('%5d %-6s %4d %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( + ind, 'conv', filters, kernel_size, kernel_size, stride, prev_width, prev_height, prev_filters, width, + height, filters)) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'maxpool': + pool_size = int(block['size']) + stride = int(block['stride']) + width = prev_width // stride + height = prev_height // stride + print('%5d %-6s %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( + ind, 'max', pool_size, pool_size, stride, prev_width, prev_height, prev_filters, width, height, + filters)) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'avgpool': + width = 1 + height = 1 + print('%5d %-6s %3d x %3d x%4d -> %3d' % ( + ind, 'avg', prev_width, prev_height, prev_filters, prev_filters)) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'softmax': + print('%5d %-6s -> %3d' % (ind, 'softmax', prev_filters)) + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'cost': + print('%5d %-6s -> %3d' % (ind, 'cost', prev_filters)) + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'reorg': + stride = int(block['stride']) + filters = stride * stride * prev_filters + width = prev_width // stride + height = prev_height // stride + print('%5d %-6s / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( + ind, 'reorg', stride, prev_width, prev_height, prev_filters, width, height, filters)) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'upsample': + stride = int(block['stride']) + filters = prev_filters + width = prev_width * stride + height = prev_height * stride + print('%5d %-6s * %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( + ind, 'upsample', stride, prev_width, prev_height, prev_filters, width, height, filters)) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'route': + layers = block['layers'].split(',') + layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] + if len(layers) == 1: + print('%5d %-6s %d' % (ind, 'route', layers[0])) + prev_width = out_widths[layers[0]] + prev_height = out_heights[layers[0]] + prev_filters = out_filters[layers[0]] + elif len(layers) == 2: + print('%5d %-6s %d %d' % (ind, 'route', layers[0], layers[1])) + prev_width = out_widths[layers[0]] + prev_height = out_heights[layers[0]] + assert (prev_width == out_widths[layers[1]]) + assert (prev_height == out_heights[layers[1]]) + prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + elif len(layers) 
== 4: + print('%5d %-6s %d %d %d %d' % (ind, 'route', layers[0], layers[1], layers[2], layers[3])) + prev_width = out_widths[layers[0]] + prev_height = out_heights[layers[0]] + assert (prev_width == out_widths[layers[1]] == out_widths[layers[2]] == out_widths[layers[3]]) + assert (prev_height == out_heights[layers[1]] == out_heights[layers[2]] == out_heights[layers[3]]) + prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + out_filters[layers[2]] + out_filters[ + layers[3]] + else: + print("route error !!! {} {} {}".format(sys._getframe().f_code.co_filename, + sys._getframe().f_code.co_name, sys._getframe().f_lineno)) + + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] in ['region', 'yolo']: + print('%5d %-6s' % (ind, 'detection')) + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'shortcut': + from_id = int(block['from']) + from_id = from_id if from_id > 0 else from_id + ind + print('%5d %-6s %d' % (ind, 'shortcut', from_id)) + prev_width = out_widths[from_id] + prev_height = out_heights[from_id] + prev_filters = out_filters[from_id] + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'connected': + filters = int(block['output']) + print('%5d %-6s %d -> %3d' % (ind, 'connected', prev_filters, filters)) + prev_filters = filters + out_widths.append(1) + out_heights.append(1) + out_filters.append(prev_filters) + else: + print('unknown type %s' % (block['type'])) + + +def load_conv(buf, start, conv_model): + num_w = conv_model.weight.numel() + num_b = conv_model.bias.numel() + conv_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape)); + start = start + num_w + return start + + +def save_conv(fp, conv_model): + if conv_model.bias.is_cuda: + convert2cpu(conv_model.bias.data).numpy().tofile(fp) + convert2cpu(conv_model.weight.data).numpy().tofile(fp) + else: + conv_model.bias.data.numpy().tofile(fp) + conv_model.weight.data.numpy().tofile(fp) + + +def load_conv_bn(buf, start, conv_model, bn_model): + num_w = conv_model.weight.numel() + num_b = bn_model.bias.numel() + bn_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + bn_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + bn_model.running_mean.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + bn_model.running_var.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape)); + start = start + num_w + return start + + +def save_conv_bn(fp, conv_model, bn_model): + if bn_model.bias.is_cuda: + convert2cpu(bn_model.bias.data).numpy().tofile(fp) + convert2cpu(bn_model.weight.data).numpy().tofile(fp) + convert2cpu(bn_model.running_mean).numpy().tofile(fp) + convert2cpu(bn_model.running_var).numpy().tofile(fp) + convert2cpu(conv_model.weight.data).numpy().tofile(fp) + else: + bn_model.bias.data.numpy().tofile(fp) + bn_model.weight.data.numpy().tofile(fp) + bn_model.running_mean.numpy().tofile(fp) + bn_model.running_var.numpy().tofile(fp) + conv_model.weight.data.numpy().tofile(fp) + + +def load_fc(buf, start, fc_model): + num_w = 
fc_model.weight.numel() + num_b = fc_model.bias.numel() + fc_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + fc_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w])); + start = start + num_w + return start + + +def save_fc(fp, fc_model): + fc_model.bias.data.numpy().tofile(fp) + fc_model.weight.data.numpy().tofile(fp) + + +if __name__ == '__main__': + import sys + + blocks = parse_cfg('cfg/yolo.cfg') + if len(sys.argv) == 2: + blocks = parse_cfg(sys.argv[1]) + print_cfg(blocks) diff --git a/easymocap/estimator/YOLOv4/darknet2pytorch.py b/easymocap/estimator/YOLOv4/darknet2pytorch.py new file mode 100644 index 0000000..dfbfc45 --- /dev/null +++ b/easymocap/estimator/YOLOv4/darknet2pytorch.py @@ -0,0 +1,515 @@ +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from .region_loss import RegionLoss +from .yolo_layer import YoloLayer +from .config import * +from .torch_utils import * + + +class Mish(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + x = x * (torch.tanh(torch.nn.functional.softplus(x))) + return x + + +class MaxPoolDark(nn.Module): + def __init__(self, size=2, stride=1): + super(MaxPoolDark, self).__init__() + self.size = size + self.stride = stride + + def forward(self, x): + ''' + darknet output_size = (input_size + p - k) / s +1 + p : padding = k - 1 + k : size + s : stride + torch output_size = (input_size + 2*p -k) / s +1 + p : padding = k//2 + ''' + p = self.size // 2 + if ((x.shape[2] - 1) // self.stride) != ((x.shape[2] + 2 * p - self.size) // self.stride): + padding1 = (self.size - 1) // 2 + padding2 = padding1 + 1 + else: + padding1 = (self.size - 1) // 2 + padding2 = padding1 + if ((x.shape[3] - 1) // self.stride) != ((x.shape[3] + 2 * p - self.size) // self.stride): + padding3 = (self.size - 1) // 2 + padding4 = padding3 + 1 + else: + padding3 = (self.size - 1) // 2 + padding4 = padding3 + x = F.max_pool2d(F.pad(x, (padding3, padding4, padding1, padding2), mode='replicate'), + self.size, stride=self.stride) + return x + + +class Upsample_expand(nn.Module): + def __init__(self, stride=2): + super(Upsample_expand, self).__init__() + self.stride = stride + + def forward(self, x): + assert (x.data.dim() == 4) + + x = x.view(x.size(0), x.size(1), x.size(2), 1, x.size(3), 1).\ + expand(x.size(0), x.size(1), x.size(2), self.stride, x.size(3), self.stride).contiguous().\ + view(x.size(0), x.size(1), x.size(2) * self.stride, x.size(3) * self.stride) + + return x + + +class Upsample_interpolate(nn.Module): + def __init__(self, stride): + super(Upsample_interpolate, self).__init__() + self.stride = stride + + def forward(self, x): + assert (x.data.dim() == 4) + + out = F.interpolate(x, size=(x.size(2) * self.stride, x.size(3) * self.stride), mode='nearest') + return out + + +class Reorg(nn.Module): + def __init__(self, stride=2): + super(Reorg, self).__init__() + self.stride = stride + + def forward(self, x): + stride = self.stride + assert (x.data.dim() == 4) + B = x.data.size(0) + C = x.data.size(1) + H = x.data.size(2) + W = x.data.size(3) + assert (H % stride == 0) + assert (W % stride == 0) + ws = stride + hs = stride + x = x.view(B, C, H / hs, hs, W / ws, ws).transpose(3, 4).contiguous() + x = x.view(B, C, H / hs * W / ws, hs * ws).transpose(2, 3).contiguous() + x = x.view(B, C, hs * ws, H / hs, W / ws).transpose(1, 2).contiguous() + x = x.view(B, hs * ws * C, H / hs, W / ws) + return x + + +class GlobalAvgPool2d(nn.Module): + def __init__(self): + 
super(GlobalAvgPool2d, self).__init__() + + def forward(self, x): + N = x.data.size(0) + C = x.data.size(1) + H = x.data.size(2) + W = x.data.size(3) + x = F.avg_pool2d(x, (H, W)) + x = x.view(N, C) + return x + + +# for route and shortcut +class EmptyModule(nn.Module): + def __init__(self): + super(EmptyModule, self).__init__() + + def forward(self, x): + return x + + +# support route shortcut and reorg +class Darknet(nn.Module): + def __init__(self, cfgfile, inference=False): + super(Darknet, self).__init__() + self.inference = inference + self.training = not self.inference + + self.blocks = parse_cfg(cfgfile) + self.width = int(self.blocks[0]['width']) + self.height = int(self.blocks[0]['height']) + + self.models = self.create_network(self.blocks) # merge conv, bn,leaky + self.loss = self.models[len(self.models) - 1] + + if self.blocks[(len(self.blocks) - 1)]['type'] == 'region': + self.anchors = self.loss.anchors + self.num_anchors = self.loss.num_anchors + self.anchor_step = self.loss.anchor_step + self.num_classes = self.loss.num_classes + + self.header = torch.IntTensor([0, 0, 0, 0]) + self.seen = 0 + + def forward(self, x): + ind = -2 + self.loss = None + outputs = dict() + out_boxes = [] + for block in self.blocks: + ind = ind + 1 + # if ind > 0: + # return x + + if block['type'] == 'net': + continue + elif block['type'] in ['convolutional', 'maxpool', 'reorg', 'upsample', 'avgpool', 'softmax', 'connected']: + x = self.models[ind](x) + outputs[ind] = x + elif block['type'] == 'route': + layers = block['layers'].split(',') + layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] + if len(layers) == 1: + if 'groups' not in block.keys() or int(block['groups']) == 1: + x = outputs[layers[0]] + outputs[ind] = x + else: + groups = int(block['groups']) + group_id = int(block['group_id']) + _, b, _, _ = outputs[layers[0]].shape + x = outputs[layers[0]][:, b // groups * group_id:b // groups * (group_id + 1)] + outputs[ind] = x + elif len(layers) == 2: + x1 = outputs[layers[0]] + x2 = outputs[layers[1]] + x = torch.cat((x1, x2), 1) + outputs[ind] = x + elif len(layers) == 4: + x1 = outputs[layers[0]] + x2 = outputs[layers[1]] + x3 = outputs[layers[2]] + x4 = outputs[layers[3]] + x = torch.cat((x1, x2, x3, x4), 1) + outputs[ind] = x + else: + print("rounte number > 2 ,is {}".format(len(layers))) + + elif block['type'] == 'shortcut': + from_layer = int(block['from']) + activation = block['activation'] + from_layer = from_layer if from_layer > 0 else from_layer + ind + x1 = outputs[from_layer] + x2 = outputs[ind - 1] + x = x1 + x2 + if activation == 'leaky': + x = F.leaky_relu(x, 0.1, inplace=True) + elif activation == 'relu': + x = F.relu(x, inplace=True) + outputs[ind] = x + elif block['type'] == 'region': + continue + if self.loss: + self.loss = self.loss + self.models[ind](x) + else: + self.loss = self.models[ind](x) + outputs[ind] = None + elif block['type'] == 'yolo': + # if self.training: + # pass + # else: + # boxes = self.models[ind](x) + # out_boxes.append(boxes) + boxes = self.models[ind](x) + out_boxes.append(boxes) + elif block['type'] == 'cost': + continue + else: + print('unknown type %s' % (block['type'])) + + if self.training: + return out_boxes + else: + return get_region_boxes(out_boxes) + + def print_network(self): + print_cfg(self.blocks) + + def create_network(self, blocks): + models = nn.ModuleList() + + prev_filters = 3 + out_filters = [] + prev_stride = 1 + out_strides = [] + conv_id = 0 + for block in blocks: + if block['type'] == 'net': + prev_filters = 
int(block['channels']) + continue + elif block['type'] == 'convolutional': + conv_id = conv_id + 1 + batch_normalize = int(block['batch_normalize']) + filters = int(block['filters']) + kernel_size = int(block['size']) + stride = int(block['stride']) + is_pad = int(block['pad']) + pad = (kernel_size - 1) // 2 if is_pad else 0 + activation = block['activation'] + model = nn.Sequential() + if batch_normalize: + model.add_module('conv{0}'.format(conv_id), + nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias=False)) + model.add_module('bn{0}'.format(conv_id), nn.BatchNorm2d(filters)) + # model.add_module('bn{0}'.format(conv_id), BN2d(filters)) + else: + model.add_module('conv{0}'.format(conv_id), + nn.Conv2d(prev_filters, filters, kernel_size, stride, pad)) + if activation == 'leaky': + model.add_module('leaky{0}'.format(conv_id), nn.LeakyReLU(0.1, inplace=True)) + elif activation == 'relu': + model.add_module('relu{0}'.format(conv_id), nn.ReLU(inplace=True)) + elif activation == 'mish': + model.add_module('mish{0}'.format(conv_id), Mish()) + else: + pass + # print("convalution havn't activate {}".format(activation)) + + prev_filters = filters + out_filters.append(prev_filters) + prev_stride = stride * prev_stride + out_strides.append(prev_stride) + models.append(model) + elif block['type'] == 'maxpool': + pool_size = int(block['size']) + stride = int(block['stride']) + if stride == 1 and pool_size % 2: + # You can use Maxpooldark instead, here is convenient to convert onnx. + # Example: [maxpool] size=3 stride=1 + model = nn.MaxPool2d(kernel_size=pool_size, stride=stride, padding=pool_size // 2) + elif stride == pool_size: + # You can use Maxpooldark instead, here is convenient to convert onnx. + # Example: [maxpool] size=2 stride=2 + model = nn.MaxPool2d(kernel_size=pool_size, stride=stride, padding=0) + else: + model = MaxPoolDark(pool_size, stride) + out_filters.append(prev_filters) + prev_stride = stride * prev_stride + out_strides.append(prev_stride) + models.append(model) + elif block['type'] == 'avgpool': + model = GlobalAvgPool2d() + out_filters.append(prev_filters) + models.append(model) + elif block['type'] == 'softmax': + model = nn.Softmax() + out_strides.append(prev_stride) + out_filters.append(prev_filters) + models.append(model) + elif block['type'] == 'cost': + if block['_type'] == 'sse': + model = nn.MSELoss(reduction='mean') + elif block['_type'] == 'L1': + model = nn.L1Loss(reduction='mean') + elif block['_type'] == 'smooth': + model = nn.SmoothL1Loss(reduction='mean') + out_filters.append(1) + out_strides.append(prev_stride) + models.append(model) + elif block['type'] == 'reorg': + stride = int(block['stride']) + prev_filters = stride * stride * prev_filters + out_filters.append(prev_filters) + prev_stride = prev_stride * stride + out_strides.append(prev_stride) + models.append(Reorg(stride)) + elif block['type'] == 'upsample': + stride = int(block['stride']) + out_filters.append(prev_filters) + prev_stride = prev_stride // stride + out_strides.append(prev_stride) + + models.append(Upsample_expand(stride)) + # models.append(Upsample_interpolate(stride)) + + elif block['type'] == 'route': + layers = block['layers'].split(',') + ind = len(models) + layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] + if len(layers) == 1: + if 'groups' not in block.keys() or int(block['groups']) == 1: + prev_filters = out_filters[layers[0]] + prev_stride = out_strides[layers[0]] + else: + prev_filters = out_filters[layers[0]] // int(block['groups']) + prev_stride = 
out_strides[layers[0]] // int(block['groups']) + elif len(layers) == 2: + assert (layers[0] == ind - 1 or layers[1] == ind - 1) + prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + prev_stride = out_strides[layers[0]] + elif len(layers) == 4: + assert (layers[0] == ind - 1) + prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + out_filters[layers[2]] + \ + out_filters[layers[3]] + prev_stride = out_strides[layers[0]] + else: + print("route error!!!") + + out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(EmptyModule()) + elif block['type'] == 'shortcut': + ind = len(models) + prev_filters = out_filters[ind - 1] + out_filters.append(prev_filters) + prev_stride = out_strides[ind - 1] + out_strides.append(prev_stride) + models.append(EmptyModule()) + elif block['type'] == 'connected': + filters = int(block['output']) + if block['activation'] == 'linear': + model = nn.Linear(prev_filters, filters) + elif block['activation'] == 'leaky': + model = nn.Sequential( + nn.Linear(prev_filters, filters), + nn.LeakyReLU(0.1, inplace=True)) + elif block['activation'] == 'relu': + model = nn.Sequential( + nn.Linear(prev_filters, filters), + nn.ReLU(inplace=True)) + prev_filters = filters + out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(model) + elif block['type'] == 'region': + loss = RegionLoss() + anchors = block['anchors'].split(',') + loss.anchors = [float(i) for i in anchors] + loss.num_classes = int(block['classes']) + loss.num_anchors = int(block['num']) + loss.anchor_step = len(loss.anchors) // loss.num_anchors + loss.object_scale = float(block['object_scale']) + loss.noobject_scale = float(block['noobject_scale']) + loss.class_scale = float(block['class_scale']) + loss.coord_scale = float(block['coord_scale']) + out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(loss) + elif block['type'] == 'yolo': + yolo_layer = YoloLayer() + anchors = block['anchors'].split(',') + anchor_mask = block['mask'].split(',') + yolo_layer.anchor_mask = [int(i) for i in anchor_mask] + yolo_layer.anchors = [float(i) for i in anchors] + yolo_layer.num_classes = int(block['classes']) + self.num_classes = yolo_layer.num_classes + yolo_layer.num_anchors = int(block['num']) + yolo_layer.anchor_step = len(yolo_layer.anchors) // yolo_layer.num_anchors + yolo_layer.stride = prev_stride + yolo_layer.scale_x_y = float(block['scale_x_y']) + # yolo_layer.object_scale = float(block['object_scale']) + # yolo_layer.noobject_scale = float(block['noobject_scale']) + # yolo_layer.class_scale = float(block['class_scale']) + # yolo_layer.coord_scale = float(block['coord_scale']) + out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(yolo_layer) + else: + print('unknown type %s' % (block['type'])) + + return models + + def load_weights(self, weightfile): + fp = open(weightfile, 'rb') + header = np.fromfile(fp, count=5, dtype=np.int32) + self.header = torch.from_numpy(header) + self.seen = self.header[3] + buf = np.fromfile(fp, dtype=np.float32) + fp.close() + + start = 0 + ind = -2 + for block in self.blocks: + if start >= buf.size: + break + ind = ind + 1 + if block['type'] == 'net': + continue + elif block['type'] == 'convolutional': + model = self.models[ind] + batch_normalize = int(block['batch_normalize']) + if batch_normalize: + start = load_conv_bn(buf, start, model[0], model[1]) + else: + start = load_conv(buf, start, model[0]) + elif block['type'] == 'connected': + model = 
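# --- Illustrative note (not part of the original patch): the .weights layout that
# load_weights() assumes, together with the load_* helpers from config.py:
#   * a 5-value int32 header (the fourth entry is stored as self.seen), then
#   * one flat float32 buffer with every layer's parameters in network order:
#       conv + batch_norm : bn.bias, bn.weight, bn.running_mean, bn.running_var, conv.weight
#       conv without bn   : conv.bias, conv.weight
#       connected (fc)    : fc.bias, fc.weight
#     conv weights are read flat and reshaped to the destination module's weight shape.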
self.models[ind] + if block['activation'] != 'linear': + start = load_fc(buf, start, model[0]) + else: + start = load_fc(buf, start, model) + elif block['type'] == 'maxpool': + pass + elif block['type'] == 'reorg': + pass + elif block['type'] == 'upsample': + pass + elif block['type'] == 'route': + pass + elif block['type'] == 'shortcut': + pass + elif block['type'] == 'region': + pass + elif block['type'] == 'yolo': + pass + elif block['type'] == 'avgpool': + pass + elif block['type'] == 'softmax': + pass + elif block['type'] == 'cost': + pass + else: + print('unknown type %s' % (block['type'])) + + # def save_weights(self, outfile, cutoff=0): + # if cutoff <= 0: + # cutoff = len(self.blocks) - 1 + # + # fp = open(outfile, 'wb') + # self.header[3] = self.seen + # header = self.header + # header.numpy().tofile(fp) + # + # ind = -1 + # for blockId in range(1, cutoff + 1): + # ind = ind + 1 + # block = self.blocks[blockId] + # if block['type'] == 'convolutional': + # model = self.models[ind] + # batch_normalize = int(block['batch_normalize']) + # if batch_normalize: + # save_conv_bn(fp, model[0], model[1]) + # else: + # save_conv(fp, model[0]) + # elif block['type'] == 'connected': + # model = self.models[ind] + # if block['activation'] != 'linear': + # save_fc(fc, model) + # else: + # save_fc(fc, model[0]) + # elif block['type'] == 'maxpool': + # pass + # elif block['type'] == 'reorg': + # pass + # elif block['type'] == 'upsample': + # pass + # elif block['type'] == 'route': + # pass + # elif block['type'] == 'shortcut': + # pass + # elif block['type'] == 'region': + # pass + # elif block['type'] == 'yolo': + # pass + # elif block['type'] == 'avgpool': + # pass + # elif block['type'] == 'softmax': + # pass + # elif block['type'] == 'cost': + # pass + # else: + # print('unknown type %s' % (block['type'])) + # fp.close() diff --git a/easymocap/estimator/YOLOv4/region_loss.py b/easymocap/estimator/YOLOv4/region_loss.py new file mode 100644 index 0000000..1aa7f18 --- /dev/null +++ b/easymocap/estimator/YOLOv4/region_loss.py @@ -0,0 +1,195 @@ +import torch.nn as nn +import torch.nn.functional as F +from .torch_utils import * + + +def build_targets(pred_boxes, target, anchors, num_anchors, num_classes, nH, nW, noobject_scale, object_scale, + sil_thresh, seen): + nB = target.size(0) + nA = num_anchors + nC = num_classes + anchor_step = len(anchors) / num_anchors + conf_mask = torch.ones(nB, nA, nH, nW) * noobject_scale + coord_mask = torch.zeros(nB, nA, nH, nW) + cls_mask = torch.zeros(nB, nA, nH, nW) + tx = torch.zeros(nB, nA, nH, nW) + ty = torch.zeros(nB, nA, nH, nW) + tw = torch.zeros(nB, nA, nH, nW) + th = torch.zeros(nB, nA, nH, nW) + tconf = torch.zeros(nB, nA, nH, nW) + tcls = torch.zeros(nB, nA, nH, nW) + + nAnchors = nA * nH * nW + nPixels = nH * nW + for b in range(nB): + cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t() + cur_ious = torch.zeros(nAnchors) + for t in range(50): + if target[b][t * 5 + 1] == 0: + break + gx = target[b][t * 5 + 1] * nW + gy = target[b][t * 5 + 2] * nH + gw = target[b][t * 5 + 3] * nW + gh = target[b][t * 5 + 4] * nH + cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors, 1).t() + cur_ious = torch.max(cur_ious, bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False)) + conf_mask[b][cur_ious > sil_thresh] = 0 + if seen < 12800: + if anchor_step == 4: + tx = torch.FloatTensor(anchors).view(nA, anchor_step).index_select(1, torch.LongTensor([2])).view(1, nA, 1, + 1).repeat( + nB, 1, nH, nW) + ty = 
torch.FloatTensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([2])).view( + 1, nA, 1, 1).repeat(nB, 1, nH, nW) + else: + tx.fill_(0.5) + ty.fill_(0.5) + tw.zero_() + th.zero_() + coord_mask.fill_(1) + + nGT = 0 + nCorrect = 0 + for b in range(nB): + for t in range(50): + if target[b][t * 5 + 1] == 0: + break + nGT = nGT + 1 + best_iou = 0.0 + best_n = -1 + min_dist = 10000 + gx = target[b][t * 5 + 1] * nW + gy = target[b][t * 5 + 2] * nH + gi = int(gx) + gj = int(gy) + gw = target[b][t * 5 + 3] * nW + gh = target[b][t * 5 + 4] * nH + gt_box = [0, 0, gw, gh] + for n in range(nA): + aw = anchors[anchor_step * n] + ah = anchors[anchor_step * n + 1] + anchor_box = [0, 0, aw, ah] + iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False) + if anchor_step == 4: + ax = anchors[anchor_step * n + 2] + ay = anchors[anchor_step * n + 3] + dist = pow(((gi + ax) - gx), 2) + pow(((gj + ay) - gy), 2) + if iou > best_iou: + best_iou = iou + best_n = n + elif anchor_step == 4 and iou == best_iou and dist < min_dist: + best_iou = iou + best_n = n + min_dist = dist + + gt_box = [gx, gy, gw, gh] + pred_box = pred_boxes[b * nAnchors + best_n * nPixels + gj * nW + gi] + + coord_mask[b][best_n][gj][gi] = 1 + cls_mask[b][best_n][gj][gi] = 1 + conf_mask[b][best_n][gj][gi] = object_scale + tx[b][best_n][gj][gi] = target[b][t * 5 + 1] * nW - gi + ty[b][best_n][gj][gi] = target[b][t * 5 + 2] * nH - gj + tw[b][best_n][gj][gi] = math.log(gw / anchors[anchor_step * best_n]) + th[b][best_n][gj][gi] = math.log(gh / anchors[anchor_step * best_n + 1]) + iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) # best_iou + tconf[b][best_n][gj][gi] = iou + tcls[b][best_n][gj][gi] = target[b][t * 5] + if iou > 0.5: + nCorrect = nCorrect + 1 + + return nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls + + +class RegionLoss(nn.Module): + def __init__(self, num_classes=0, anchors=[], num_anchors=1): + super(RegionLoss, self).__init__() + self.num_classes = num_classes + self.anchors = anchors + self.num_anchors = num_anchors + self.anchor_step = len(anchors) / num_anchors + self.coord_scale = 1 + self.noobject_scale = 1 + self.object_scale = 5 + self.class_scale = 1 + self.thresh = 0.6 + self.seen = 0 + + def forward(self, output, target): + # output : BxAs*(4+1+num_classes)*H*W + t0 = time.time() + nB = output.data.size(0) + nA = self.num_anchors + nC = self.num_classes + nH = output.data.size(2) + nW = output.data.size(3) + + output = output.view(nB, nA, (5 + nC), nH, nW) + x = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([0]))).view(nB, nA, nH, nW)) + y = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([1]))).view(nB, nA, nH, nW)) + w = output.index_select(2, Variable(torch.cuda.LongTensor([2]))).view(nB, nA, nH, nW) + h = output.index_select(2, Variable(torch.cuda.LongTensor([3]))).view(nB, nA, nH, nW) + conf = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([4]))).view(nB, nA, nH, nW)) + cls = output.index_select(2, Variable(torch.linspace(5, 5 + nC - 1, nC).long().cuda())) + cls = cls.view(nB * nA, nC, nH * nW).transpose(1, 2).contiguous().view(nB * nA * nH * nW, nC) + t1 = time.time() + + pred_boxes = torch.cuda.FloatTensor(4, nB * nA * nH * nW) + grid_x = torch.linspace(0, nW - 1, nW).repeat(nH, 1).repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda() + grid_y = torch.linspace(0, nH - 1, nH).repeat(nW, 1).t().repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda() + anchor_w = torch.Tensor(self.anchors).view(nA, 
self.anchor_step).index_select(1, torch.LongTensor([0])).cuda() + anchor_h = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([1])).cuda() + anchor_w = anchor_w.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW) + anchor_h = anchor_h.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW) + pred_boxes[0] = x.data + grid_x + pred_boxes[1] = y.data + grid_y + pred_boxes[2] = torch.exp(w.data) * anchor_w + pred_boxes[3] = torch.exp(h.data) * anchor_h + pred_boxes = convert2cpu(pred_boxes.transpose(0, 1).contiguous().view(-1, 4)) + t2 = time.time() + + nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes, + target.data, + self.anchors, nA, + nC, \ + nH, nW, + self.noobject_scale, + self.object_scale, + self.thresh, + self.seen) + cls_mask = (cls_mask == 1) + nProposals = int((conf > 0.25).sum().data[0]) + + tx = Variable(tx.cuda()) + ty = Variable(ty.cuda()) + tw = Variable(tw.cuda()) + th = Variable(th.cuda()) + tconf = Variable(tconf.cuda()) + tcls = Variable(tcls.view(-1)[cls_mask].long().cuda()) + + coord_mask = Variable(coord_mask.cuda()) + conf_mask = Variable(conf_mask.cuda().sqrt()) + cls_mask = Variable(cls_mask.view(-1, 1).repeat(1, nC).cuda()) + cls = cls[cls_mask].view(-1, nC) + + t3 = time.time() + + loss_x = self.coord_scale * nn.MSELoss(reduction='sum')(x * coord_mask, tx * coord_mask) / 2.0 + loss_y = self.coord_scale * nn.MSELoss(reduction='sum')(y * coord_mask, ty * coord_mask) / 2.0 + loss_w = self.coord_scale * nn.MSELoss(reduction='sum')(w * coord_mask, tw * coord_mask) / 2.0 + loss_h = self.coord_scale * nn.MSELoss(reduction='sum')(h * coord_mask, th * coord_mask) / 2.0 + loss_conf = nn.MSELoss(reduction='sum')(conf * conf_mask, tconf * conf_mask) / 2.0 + loss_cls = self.class_scale * nn.CrossEntropyLoss(reduction='sum')(cls, tcls) + loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls + t4 = time.time() + if False: + print('-----------------------------------') + print(' activation : %f' % (t1 - t0)) + print(' create pred_boxes : %f' % (t2 - t1)) + print(' build targets : %f' % (t3 - t2)) + print(' create loss : %f' % (t4 - t3)) + print(' total : %f' % (t4 - t0)) + print('%d: nGT %d, recall %d, proposals %d, loss: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f' % ( + self.seen, nGT, nCorrect, nProposals, loss_x.data[0], loss_y.data[0], loss_w.data[0], loss_h.data[0], + loss_conf.data[0], loss_cls.data[0], loss.data[0])) + return loss diff --git a/easymocap/estimator/YOLOv4/torch_utils.py b/easymocap/estimator/YOLOv4/torch_utils.py new file mode 100644 index 0000000..453bc1c --- /dev/null +++ b/easymocap/estimator/YOLOv4/torch_utils.py @@ -0,0 +1,98 @@ +import sys +import os +import time +import math +import torch +import numpy as np +from torch.autograd import Variable + + +def bbox_ious(boxes1, boxes2, x1y1x2y2=True): + if x1y1x2y2: + mx = torch.min(boxes1[0], boxes2[0]) + Mx = torch.max(boxes1[2], boxes2[2]) + my = torch.min(boxes1[1], boxes2[1]) + My = torch.max(boxes1[3], boxes2[3]) + w1 = boxes1[2] - boxes1[0] + h1 = boxes1[3] - boxes1[1] + w2 = boxes2[2] - boxes2[0] + h2 = boxes2[3] - boxes2[1] + else: + mx = torch.min(boxes1[0] - boxes1[2] / 2.0, boxes2[0] - boxes2[2] / 2.0) + Mx = torch.max(boxes1[0] + boxes1[2] / 2.0, boxes2[0] + boxes2[2] / 2.0) + my = torch.min(boxes1[1] - boxes1[3] / 2.0, boxes2[1] - boxes2[3] / 2.0) + My = torch.max(boxes1[1] + boxes1[3] / 2.0, boxes2[1] + boxes2[3] / 2.0) + w1 = boxes1[2] + h1 = boxes1[3] + w2 = boxes2[2] + h2 
= boxes2[3] + uw = Mx - mx + uh = My - my + cw = w1 + w2 - uw + ch = h1 + h2 - uh + mask = ((cw <= 0) + (ch <= 0) > 0) + area1 = w1 * h1 + area2 = w2 * h2 + carea = cw * ch + carea[mask] = 0 + uarea = area1 + area2 - carea + return carea / uarea + + +def get_region_boxes(boxes_and_confs): + + # print('Getting boxes from boxes and confs ...') + + boxes_list = [] + confs_list = [] + + for item in boxes_and_confs: + boxes_list.append(item[0]) + confs_list.append(item[1]) + + # boxes: [batch, num1 + num2 + num3, 1, 4] + # confs: [batch, num1 + num2 + num3, num_classes] + boxes = torch.cat(boxes_list, dim=1) + confs = torch.cat(confs_list, dim=1) + + return [boxes, confs] + + +def convert2cpu(gpu_matrix): + return torch.FloatTensor(gpu_matrix.size()).copy_(gpu_matrix) + + +def convert2cpu_long(gpu_matrix): + return torch.LongTensor(gpu_matrix.size()).copy_(gpu_matrix) + + + +def do_detect(model, img, conf_thresh, nms_thresh, use_cuda=1): + model.eval() + t0 = time.time() + + if type(img) == np.ndarray and len(img.shape) == 3: # cv2 image + img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0) + elif type(img) == np.ndarray and len(img.shape) == 4: + img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0) + else: + print("unknow image type") + exit(-1) + + if use_cuda: + img = img.cuda() + img = torch.autograd.Variable(img) + + t1 = time.time() + + output = model(img) + + t2 = time.time() + + print('-----------------------------------') + print(' Preprocess : %f' % (t1 - t0)) + print(' Model Inference : %f' % (t2 - t1)) + print('-----------------------------------') + + return utils.post_processing(img, conf_thresh, nms_thresh, output) + diff --git a/easymocap/estimator/YOLOv4/utils.py b/easymocap/estimator/YOLOv4/utils.py new file mode 100644 index 0000000..9b69d3c --- /dev/null +++ b/easymocap/estimator/YOLOv4/utils.py @@ -0,0 +1,221 @@ +import sys +import os +import time +import math +import numpy as np + +import itertools +import struct # get_image_size +import imghdr # get_image_size + + +def sigmoid(x): + return 1.0 / (np.exp(-x) + 1.) 
+ + +def softmax(x): + x = np.exp(x - np.expand_dims(np.max(x, axis=1), axis=1)) + x = x / np.expand_dims(x.sum(axis=1), axis=1) + return x + + +def bbox_iou(box1, box2, x1y1x2y2=True): + + # print('iou box1:', box1) + # print('iou box2:', box2) + + if x1y1x2y2: + mx = min(box1[0], box2[0]) + Mx = max(box1[2], box2[2]) + my = min(box1[1], box2[1]) + My = max(box1[3], box2[3]) + w1 = box1[2] - box1[0] + h1 = box1[3] - box1[1] + w2 = box2[2] - box2[0] + h2 = box2[3] - box2[1] + else: + w1 = box1[2] + h1 = box1[3] + w2 = box2[2] + h2 = box2[3] + + mx = min(box1[0], box2[0]) + Mx = max(box1[0] + w1, box2[0] + w2) + my = min(box1[1], box2[1]) + My = max(box1[1] + h1, box2[1] + h2) + uw = Mx - mx + uh = My - my + cw = w1 + w2 - uw + ch = h1 + h2 - uh + carea = 0 + if cw <= 0 or ch <= 0: + return 0.0 + + area1 = w1 * h1 + area2 = w2 * h2 + carea = cw * ch + uarea = area1 + area2 - carea + return carea / uarea + + +def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False): + # print(boxes.shape) + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + + areas = (x2 - x1) * (y2 - y1) + order = confs.argsort()[::-1] + + keep = [] + while order.size > 0: + idx_self = order[0] + idx_other = order[1:] + + keep.append(idx_self) + + xx1 = np.maximum(x1[idx_self], x1[idx_other]) + yy1 = np.maximum(y1[idx_self], y1[idx_other]) + xx2 = np.minimum(x2[idx_self], x2[idx_other]) + yy2 = np.minimum(y2[idx_self], y2[idx_other]) + + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + inter = w * h + + if min_mode: + over = inter / np.minimum(areas[order[0]], areas[order[1:]]) + else: + over = inter / (areas[order[0]] + areas[order[1:]] - inter) + + inds = np.where(over <= nms_thresh)[0] + order = order[inds + 1] + + return np.array(keep) + + + +def plot_boxes_cv2(img, boxes, savename=None, class_names=None, color=None): + import cv2 + img = np.copy(img) + colors = np.array([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]], dtype=np.float32) + + def get_color(c, x, max_val): + ratio = float(x) / max_val * 5 + i = int(math.floor(ratio)) + j = int(math.ceil(ratio)) + ratio = ratio - i + r = (1 - ratio) * colors[i][c] + ratio * colors[j][c] + return int(r * 255) + + width = img.shape[1] + height = img.shape[0] + for i in range(len(boxes)): + box = boxes[i] + x1 = int(box[0] * width) + y1 = int(box[1] * height) + x2 = int(box[2] * width) + y2 = int(box[3] * height) + + if color: + rgb = color + else: + rgb = (255, 0, 0) + if len(box) >= 7 and class_names: + cls_conf = box[5] + cls_id = box[6] + print('%s: %f' % (class_names[cls_id], cls_conf)) + classes = len(class_names) + offset = cls_id * 123457 % classes + red = get_color(2, offset, classes) + green = get_color(1, offset, classes) + blue = get_color(0, offset, classes) + if color is None: + rgb = (red, green, blue) + img = cv2.putText(img, class_names[cls_id], (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1.2, rgb, 1) + img = cv2.rectangle(img, (x1, y1), (x2, y2), rgb, 1) + if savename: + print("save plot results to %s" % savename) + cv2.imwrite(savename, img) + return img + + +def read_truths(lab_path): + if not os.path.exists(lab_path): + return np.array([]) + if os.path.getsize(lab_path): + truths = np.loadtxt(lab_path) + truths = truths.reshape(truths.size / 5, 5) # to avoid single truth problem + return truths + else: + return np.array([]) + +def post_processing(img, conf_thresh, nms_thresh, output): + + # anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401] + # num_anchors = 9 + # 
anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]] + # strides = [8, 16, 32] + # anchor_step = len(anchors) // num_anchors + + # [batch, num, 1, 4] + box_array = output[0] + # [batch, num, num_classes] + confs = output[1] + + t1 = time.time() + + if type(box_array).__name__ != 'ndarray': + box_array = box_array.cpu().detach().numpy() + confs = confs.cpu().detach().numpy() + + num_classes = confs.shape[2] + + # [batch, num, 4] + box_array = box_array[:, :, 0] + + # [batch, num, num_classes] --> [batch, num] + max_conf = np.max(confs, axis=2) + max_id = np.argmax(confs, axis=2) + + t2 = time.time() + + bboxes_batch = [] + for i in range(box_array.shape[0]): + + argwhere = max_conf[i] > conf_thresh + l_box_array = box_array[i, argwhere, :] + l_max_conf = max_conf[i, argwhere] + l_max_id = max_id[i, argwhere] + + bboxes = [] + # nms for each class + for j in range(num_classes): + + cls_argwhere = l_max_id == j + ll_box_array = l_box_array[cls_argwhere, :] + ll_max_conf = l_max_conf[cls_argwhere] + ll_max_id = l_max_id[cls_argwhere] + + keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh) + + if (keep.size > 0): + ll_box_array = ll_box_array[keep, :] + ll_max_conf = ll_max_conf[keep] + ll_max_id = ll_max_id[keep] + + for k in range(ll_box_array.shape[0]): + bboxes.append([ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2], ll_box_array[k, 3], ll_max_conf[k], ll_max_conf[k], ll_max_id[k]]) + + bboxes_batch.append(bboxes) + + t3 = time.time() + + print('-----------------------------------') + print(' max and argmax : %f' % (t2 - t1)) + print(' nms : %f' % (t3 - t2)) + print('Post processing total : %f' % (t3 - t1)) + print('-----------------------------------') + + return bboxes_batch diff --git a/easymocap/estimator/YOLOv4/yolo.py b/easymocap/estimator/YOLOv4/yolo.py new file mode 100644 index 0000000..e3b7fad --- /dev/null +++ b/easymocap/estimator/YOLOv4/yolo.py @@ -0,0 +1,161 @@ +''' + @ Date: 2020-12-10 16:39:51 + @ Author: Qing Shuai + @ LastEditors: Qing Shuai + @ LastEditTime: 2022-04-21 23:53:40 + @ FilePath: /EasyMocapPublic/easymocap/estimator/YOLOv4/yolo.py +''' +from .darknet2pytorch import Darknet +import cv2 +import torch +from os.path import join +import os +import numpy as np + +def load_class_names(namesfile): + class_names = [] + with open(namesfile, 'r') as fp: + lines = fp.readlines() + for line in lines: + line = line.rstrip() + class_names.append(line) + return class_names + +def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False): + # print(boxes.shape) + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + + areas = (x2 - x1) * (y2 - y1) + order = confs.argsort()[::-1] + + keep = [] + while order.size > 0: + idx_self = order[0] + idx_other = order[1:] + + keep.append(idx_self) + + xx1 = np.maximum(x1[idx_self], x1[idx_other]) + yy1 = np.maximum(y1[idx_self], y1[idx_other]) + xx2 = np.minimum(x2[idx_self], x2[idx_other]) + yy2 = np.minimum(y2[idx_self], y2[idx_other]) + + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + inter = w * h + + if min_mode: + over = inter / np.minimum(areas[order[0]], areas[order[1:]]) + else: + over = inter / (areas[order[0]] + areas[order[1:]] - inter) + + inds = np.where(over <= nms_thresh)[0] + order = order[inds + 1] + return np.array(keep) + +def post_processing(conf_thresh, nms_thresh, output): + # [batch, num, 1, 4] + box_array = output[0] + # [batch, num, num_classes] + confs = output[1] + + if type(box_array).__name__ != 'ndarray': + box_array = box_array.cpu().detach().numpy() + confs = 
confs.cpu().detach().numpy() + + num_classes = confs.shape[2] + + # [batch, num, 4] + box_array = box_array[:, :, 0] + + # [batch, num, num_classes] --> [batch, num] + max_conf = np.max(confs, axis=2) + max_id = np.argmax(confs, axis=2) + + bboxes_batch = [] + for i in range(box_array.shape[0]): + argwhere = max_conf[i] > conf_thresh + l_box_array = box_array[i, argwhere, :] + l_max_conf = max_conf[i, argwhere] + l_max_id = max_id[i, argwhere] + + bboxes = [] + # nms for class person + j = 0 + cls_argwhere = l_max_id == j + ll_box_array = l_box_array[cls_argwhere, :] + ll_max_conf = l_max_conf[cls_argwhere] + ll_max_id = l_max_id[cls_argwhere] + + keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh) + + if (keep.size > 0): + ll_box_array = ll_box_array[keep, :] + ll_max_conf = ll_max_conf[keep] + ll_max_id = ll_max_id[keep] + bboxes = np.hstack([ll_box_array, ll_max_conf[:, None]]) + + bboxes_batch.append(bboxes) + + return bboxes_batch + +class YOLOv4: + def __init__(self, device, ckpt_path, box_nms_thres, conf_thres, + isWild=False) -> None: + dirname = os.path.dirname(__file__) + cfgfile = join(dirname, 'yolov4.cfg') + namesfile = join(dirname, 'coco.names') + self.model = Darknet(cfgfile) + self.model.load_weights(ckpt_path) + self.model.to(device) + self.model.eval() + class_names = load_class_names(namesfile) + self.device = device + self.box_nms_thres = box_nms_thres + self.conf_thres = conf_thres + self.isWild = isWild + + def predict_single(self, image): + width = image.shape[1] + height = image.shape[0] + tgt_width = self.model.width + # 先缩小,再padding + if width > height: + tgt_shape = (tgt_width, int(height/width*tgt_width)) + resize = cv2.resize(image, tgt_shape) + sized = np.zeros((tgt_width, tgt_width, 3), dtype=np.uint8) + start = (sized.shape[0] - resize.shape[0])//2 + sized[start:start+resize.shape[0], :, :] = resize + # pad_to_square + elif width == height: + sized = cv2.resize(image, (tgt_width, tgt_width)) + start = 0 + else: + tgt_shape = (int(width/height*tgt_width), tgt_width) + resize = cv2.resize(image, tgt_shape) + sized = np.zeros((tgt_width, tgt_width, 3), dtype=np.uint8) + start = (sized.shape[1] - resize.shape[1]) // 2 + sized[:, start:start+resize.shape[1], :] = resize + img = torch.from_numpy(sized.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0) + img = img.to(self.device) + with torch.no_grad(): + output = self.model(img) + bboxes = post_processing(self.conf_thres, self.box_nms_thres, output)[0] + if len(bboxes) == 0: + return bboxes + if self.isWild: + flag = ((bboxes[:, 2] - bboxes[:, 0]) < 0.8)&(((bboxes[:, 2] - bboxes[:, 0]) > 0.1)|((bboxes[:, 3] - bboxes[:, 1]) > 0.1)) + bboxes = bboxes[flag] + if width >= height: + bboxes[:, :4] *= width + bboxes[:, 1] -= start*width/tgt_width + bboxes[:, 3] -= start*width/tgt_width + else: + bboxes[:, :4] *= height + bboxes[:, 0] -= start*height/tgt_width + bboxes[:, 2] -= start*height/tgt_width + # return bounding box + return bboxes \ No newline at end of file diff --git a/easymocap/estimator/YOLOv4/yolo_layer.py b/easymocap/estimator/YOLOv4/yolo_layer.py new file mode 100644 index 0000000..3d3ae42 --- /dev/null +++ b/easymocap/estimator/YOLOv4/yolo_layer.py @@ -0,0 +1,322 @@ +import torch.nn as nn +import torch.nn.functional as F +from .torch_utils import * + +def yolo_forward(output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y, only_objectness=1, + validation=False): + # Output would be invalid if it does not satisfy this assert + # assert (output.size(1) == (5 + num_classes) * num_anchors) + + 
# print(output.size()) + + # Slice the second dimension (channel) of output into: + # [ 2, 2, 1, num_classes, 2, 2, 1, num_classes, 2, 2, 1, num_classes ] + # And then into + # bxy = [ 6 ] bwh = [ 6 ] det_conf = [ 3 ] cls_conf = [ num_classes * 3 ] + batch = output.size(0) + H = output.size(2) + W = output.size(3) + + bxy_list = [] + bwh_list = [] + det_confs_list = [] + cls_confs_list = [] + + for i in range(num_anchors): + begin = i * (5 + num_classes) + end = (i + 1) * (5 + num_classes) + + bxy_list.append(output[:, begin : begin + 2]) + bwh_list.append(output[:, begin + 2 : begin + 4]) + det_confs_list.append(output[:, begin + 4 : begin + 5]) + cls_confs_list.append(output[:, begin + 5 : end]) + + # Shape: [batch, num_anchors * 2, H, W] + bxy = torch.cat(bxy_list, dim=1) + # Shape: [batch, num_anchors * 2, H, W] + bwh = torch.cat(bwh_list, dim=1) + + # Shape: [batch, num_anchors, H, W] + det_confs = torch.cat(det_confs_list, dim=1) + # Shape: [batch, num_anchors * H * W] + det_confs = det_confs.view(batch, num_anchors * H * W) + + # Shape: [batch, num_anchors * num_classes, H, W] + cls_confs = torch.cat(cls_confs_list, dim=1) + # Shape: [batch, num_anchors, num_classes, H * W] + cls_confs = cls_confs.view(batch, num_anchors, num_classes, H * W) + # Shape: [batch, num_anchors, num_classes, H * W] --> [batch, num_anchors * H * W, num_classes] + cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(batch, num_anchors * H * W, num_classes) + + # Apply sigmoid(), exp() and softmax() to slices + # + bxy = torch.sigmoid(bxy) * scale_x_y - 0.5 * (scale_x_y - 1) + bwh = torch.exp(bwh) + det_confs = torch.sigmoid(det_confs) + cls_confs = torch.sigmoid(cls_confs) + + # Prepare C-x, C-y, P-w, P-h (None of them are torch related) + grid_x = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, W - 1, W), axis=0).repeat(H, 0), axis=0), axis=0) + grid_y = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, H - 1, H), axis=1).repeat(W, 1), axis=0), axis=0) + # grid_x = torch.linspace(0, W - 1, W).reshape(1, 1, 1, W).repeat(1, 1, H, 1) + # grid_y = torch.linspace(0, H - 1, H).reshape(1, 1, H, 1).repeat(1, 1, 1, W) + + anchor_w = [] + anchor_h = [] + for i in range(num_anchors): + anchor_w.append(anchors[i * 2]) + anchor_h.append(anchors[i * 2 + 1]) + + device = None + cuda_check = output.is_cuda + if cuda_check: + device = output.get_device() + + bx_list = [] + by_list = [] + bw_list = [] + bh_list = [] + + # Apply C-x, C-y, P-w, P-h + for i in range(num_anchors): + ii = i * 2 + # Shape: [batch, 1, H, W] + bx = bxy[:, ii : ii + 1] + torch.tensor(grid_x, device=device, dtype=torch.float32) # grid_x.to(device=device, dtype=torch.float32) + # Shape: [batch, 1, H, W] + by = bxy[:, ii + 1 : ii + 2] + torch.tensor(grid_y, device=device, dtype=torch.float32) # grid_y.to(device=device, dtype=torch.float32) + # Shape: [batch, 1, H, W] + bw = bwh[:, ii : ii + 1] * anchor_w[i] + # Shape: [batch, 1, H, W] + bh = bwh[:, ii + 1 : ii + 2] * anchor_h[i] + + bx_list.append(bx) + by_list.append(by) + bw_list.append(bw) + bh_list.append(bh) + + + ######################################## + # Figure out bboxes from slices # + ######################################## + + # Shape: [batch, num_anchors, H, W] + bx = torch.cat(bx_list, dim=1) + # Shape: [batch, num_anchors, H, W] + by = torch.cat(by_list, dim=1) + # Shape: [batch, num_anchors, H, W] + bw = torch.cat(bw_list, dim=1) + # Shape: [batch, num_anchors, H, W] + bh = torch.cat(bh_list, dim=1) + + # Shape: [batch, 2 * num_anchors, H, W] + bx_bw = 
torch.cat((bx, bw), dim=1) + # Shape: [batch, 2 * num_anchors, H, W] + by_bh = torch.cat((by, bh), dim=1) + + # normalize coordinates to [0, 1] + bx_bw /= W + by_bh /= H + + # Shape: [batch, num_anchors * H * W, 1] + bx = bx_bw[:, :num_anchors].view(batch, num_anchors * H * W, 1) + by = by_bh[:, :num_anchors].view(batch, num_anchors * H * W, 1) + bw = bx_bw[:, num_anchors:].view(batch, num_anchors * H * W, 1) + bh = by_bh[:, num_anchors:].view(batch, num_anchors * H * W, 1) + + bx1 = bx - bw * 0.5 + by1 = by - bh * 0.5 + bx2 = bx1 + bw + by2 = by1 + bh + + # Shape: [batch, num_anchors * h * w, 4] -> [batch, num_anchors * h * w, 1, 4] + boxes = torch.cat((bx1, by1, bx2, by2), dim=2).view(batch, num_anchors * H * W, 1, 4) + # boxes = boxes.repeat(1, 1, num_classes, 1) + + # boxes: [batch, num_anchors * H * W, 1, 4] + # cls_confs: [batch, num_anchors * H * W, num_classes] + # det_confs: [batch, num_anchors * H * W] + + det_confs = det_confs.view(batch, num_anchors * H * W, 1) + confs = cls_confs * det_confs + + # boxes: [batch, num_anchors * H * W, 1, 4] + # confs: [batch, num_anchors * H * W, num_classes] + + return boxes, confs + + +def yolo_forward_dynamic(output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y, only_objectness=1, + validation=False): + # Output would be invalid if it does not satisfy this assert + # assert (output.size(1) == (5 + num_classes) * num_anchors) + + # print(output.size()) + + # Slice the second dimension (channel) of output into: + # [ 2, 2, 1, num_classes, 2, 2, 1, num_classes, 2, 2, 1, num_classes ] + # And then into + # bxy = [ 6 ] bwh = [ 6 ] det_conf = [ 3 ] cls_conf = [ num_classes * 3 ] + # batch = output.size(0) + # H = output.size(2) + # W = output.size(3) + + bxy_list = [] + bwh_list = [] + det_confs_list = [] + cls_confs_list = [] + + for i in range(num_anchors): + begin = i * (5 + num_classes) + end = (i + 1) * (5 + num_classes) + + bxy_list.append(output[:, begin : begin + 2]) + bwh_list.append(output[:, begin + 2 : begin + 4]) + det_confs_list.append(output[:, begin + 4 : begin + 5]) + cls_confs_list.append(output[:, begin + 5 : end]) + + # Shape: [batch, num_anchors * 2, H, W] + bxy = torch.cat(bxy_list, dim=1) + # Shape: [batch, num_anchors * 2, H, W] + bwh = torch.cat(bwh_list, dim=1) + + # Shape: [batch, num_anchors, H, W] + det_confs = torch.cat(det_confs_list, dim=1) + # Shape: [batch, num_anchors * H * W] + det_confs = det_confs.view(output.size(0), num_anchors * output.size(2) * output.size(3)) + + # Shape: [batch, num_anchors * num_classes, H, W] + cls_confs = torch.cat(cls_confs_list, dim=1) + # Shape: [batch, num_anchors, num_classes, H * W] + cls_confs = cls_confs.view(output.size(0), num_anchors, num_classes, output.size(2) * output.size(3)) + # Shape: [batch, num_anchors, num_classes, H * W] --> [batch, num_anchors * H * W, num_classes] + cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(output.size(0), num_anchors * output.size(2) * output.size(3), num_classes) + + # Apply sigmoid(), exp() and softmax() to slices + # + bxy = torch.sigmoid(bxy) * scale_x_y - 0.5 * (scale_x_y - 1) + bwh = torch.exp(bwh) + det_confs = torch.sigmoid(det_confs) + cls_confs = torch.sigmoid(cls_confs) + + # Prepare C-x, C-y, P-w, P-h (None of them are torch related) + grid_x = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, output.size(3) - 1, output.size(3)), axis=0).repeat(output.size(2), 0), axis=0), axis=0) + grid_y = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, output.size(2) - 1, output.size(2)), 
axis=1).repeat(output.size(3), 1), axis=0), axis=0) + # grid_x = torch.linspace(0, W - 1, W).reshape(1, 1, 1, W).repeat(1, 1, H, 1) + # grid_y = torch.linspace(0, H - 1, H).reshape(1, 1, H, 1).repeat(1, 1, 1, W) + + anchor_w = [] + anchor_h = [] + for i in range(num_anchors): + anchor_w.append(anchors[i * 2]) + anchor_h.append(anchors[i * 2 + 1]) + + device = None + cuda_check = output.is_cuda + if cuda_check: + device = output.get_device() + + bx_list = [] + by_list = [] + bw_list = [] + bh_list = [] + + # Apply C-x, C-y, P-w, P-h + for i in range(num_anchors): + ii = i * 2 + # Shape: [batch, 1, H, W] + bx = bxy[:, ii : ii + 1] + torch.tensor(grid_x, device=device, dtype=torch.float32) # grid_x.to(device=device, dtype=torch.float32) + # Shape: [batch, 1, H, W] + by = bxy[:, ii + 1 : ii + 2] + torch.tensor(grid_y, device=device, dtype=torch.float32) # grid_y.to(device=device, dtype=torch.float32) + # Shape: [batch, 1, H, W] + bw = bwh[:, ii : ii + 1] * anchor_w[i] + # Shape: [batch, 1, H, W] + bh = bwh[:, ii + 1 : ii + 2] * anchor_h[i] + + bx_list.append(bx) + by_list.append(by) + bw_list.append(bw) + bh_list.append(bh) + + + ######################################## + # Figure out bboxes from slices # + ######################################## + + # Shape: [batch, num_anchors, H, W] + bx = torch.cat(bx_list, dim=1) + # Shape: [batch, num_anchors, H, W] + by = torch.cat(by_list, dim=1) + # Shape: [batch, num_anchors, H, W] + bw = torch.cat(bw_list, dim=1) + # Shape: [batch, num_anchors, H, W] + bh = torch.cat(bh_list, dim=1) + + # Shape: [batch, 2 * num_anchors, H, W] + bx_bw = torch.cat((bx, bw), dim=1) + # Shape: [batch, 2 * num_anchors, H, W] + by_bh = torch.cat((by, bh), dim=1) + + # normalize coordinates to [0, 1] + bx_bw /= output.size(3) + by_bh /= output.size(2) + + # Shape: [batch, num_anchors * H * W, 1] + bx = bx_bw[:, :num_anchors].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + by = by_bh[:, :num_anchors].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + bw = bx_bw[:, num_anchors:].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + bh = by_bh[:, num_anchors:].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + + bx1 = bx - bw * 0.5 + by1 = by - bh * 0.5 + bx2 = bx1 + bw + by2 = by1 + bh + + # Shape: [batch, num_anchors * h * w, 4] -> [batch, num_anchors * h * w, 1, 4] + boxes = torch.cat((bx1, by1, bx2, by2), dim=2).view(output.size(0), num_anchors * output.size(2) * output.size(3), 1, 4) + # boxes = boxes.repeat(1, 1, num_classes, 1) + + # boxes: [batch, num_anchors * H * W, 1, 4] + # cls_confs: [batch, num_anchors * H * W, num_classes] + # det_confs: [batch, num_anchors * H * W] + + det_confs = det_confs.view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + confs = cls_confs * det_confs + + # boxes: [batch, num_anchors * H * W, 1, 4] + # confs: [batch, num_anchors * H * W, num_classes] + + return boxes, confs + +class YoloLayer(nn.Module): + ''' Yolo layer + model_out: while inference,is post-processing inside or outside the model + true:outside + ''' + def __init__(self, anchor_mask=[], num_classes=0, anchors=[], num_anchors=1, stride=32, model_out=False): + super(YoloLayer, self).__init__() + self.anchor_mask = anchor_mask + self.num_classes = num_classes + self.anchors = anchors + self.num_anchors = num_anchors + self.anchor_step = len(anchors) // num_anchors + self.coord_scale = 1 + self.noobject_scale = 1 + self.object_scale = 5 + self.class_scale = 1 + 
self.thresh = 0.6 + self.stride = stride + self.seen = 0 + self.scale_x_y = 1 + + self.model_out = model_out + + def forward(self, output, target=None): + if self.training: + return output + masked_anchors = [] + for m in self.anchor_mask: + masked_anchors += self.anchors[m * self.anchor_step:(m + 1) * self.anchor_step] + masked_anchors = [anchor / self.stride for anchor in masked_anchors] + + return yolo_forward_dynamic(output, self.thresh, self.num_classes, masked_anchors, len(self.anchor_mask),scale_x_y=self.scale_x_y) + diff --git a/easymocap/estimator/YOLOv4/yolov4.cfg b/easymocap/estimator/YOLOv4/yolov4.cfg new file mode 100644 index 0000000..2985a31 --- /dev/null +++ b/easymocap/estimator/YOLOv4/yolov4.cfg @@ -0,0 +1,1157 @@ +[net] +batch=64 +subdivisions=8 +# Training +#width=512 +#height=512 +width=608 +height=608 +channels=3 +momentum=0.949 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.0013 +burn_in=1000 +max_batches = 500500 +policy=steps +steps=400000,450000 +scales=.1,.1 + +#cutmix=1 +mosaic=1 + +#:104x104 54:52x52 85:26x26 104:13x13 for 416 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-7 + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-10 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 
+pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-28 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] 
+batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-28 + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-16 + +[convolutional] +batch_normalize=1 +filters=1024 +size=1 +stride=1 +pad=1 +activation=mish + +########################## + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +### SPP ### +[maxpool] +stride=1 +size=5 + +[route] +layers=-2 + +[maxpool] +stride=1 +size=9 + +[route] +layers=-4 + +[maxpool] +stride=1 +size=13 + +[route] +layers=-1,-3,-5,-6 +### End SPP ### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = 85 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1, -3 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] 
+batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = 54 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1, -3 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +########################## + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 0,1,2 +anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +scale_x_y = 1.2 +iou_thresh=0.213 +cls_normalizer=1.0 +iou_normalizer=0.07 +iou_loss=ciou +nms_kind=greedynms +beta_nms=0.6 +max_delta=5 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +size=3 +stride=2 +pad=1 +filters=256 +activation=leaky + +[route] +layers = -1, -16 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 3,4,5 +anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +scale_x_y = 1.1 +iou_thresh=0.213 +cls_normalizer=1.0 +iou_normalizer=0.07 +iou_loss=ciou +nms_kind=greedynms +beta_nms=0.6 +max_delta=5 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +size=3 +stride=2 +pad=1 +filters=512 +activation=leaky + +[route] +layers = -1, -37 + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 6,7,8 +anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 +scale_x_y = 1.05 +iou_thresh=0.213 +cls_normalizer=1.0 +iou_normalizer=0.07 +iou_loss=ciou +nms_kind=greedynms +beta_nms=0.6 +max_delta=5 diff --git a/easymocap/estimator/yolohrnet_wrapper.py 
b/easymocap/estimator/yolohrnet_wrapper.py new file mode 100644 index 0000000..de5e1e6 --- /dev/null +++ b/easymocap/estimator/yolohrnet_wrapper.py @@ -0,0 +1,122 @@ +from ..annotator.file_utils import read_json +from .wrapper_base import check_result, create_annot_file, save_annot +from glob import glob +from os.path import join +from tqdm import tqdm +import os +import cv2 +import numpy as np + +def detect_frame(detector, img, pid=0, only_bbox=False): + lDetections = detector.detect([img], only_bbox=only_bbox)[0] + annots = [] + for i in range(len(lDetections)): + annot = { + 'bbox': [float(d) for d in lDetections[i]['bbox']], + 'personID': pid + i, + 'isKeyframe': False + } + if not only_bbox: + annot['keypoints'] = lDetections[i]['keypoints'].tolist() + annots.append(annot) + return annots + +def extract_bbox(image_root, annot_root, ext, **config): + force = config.pop('force') + if check_result(image_root, annot_root) and not force: + return 0 + import torch + from .YOLOv4 import YOLOv4 + device = torch.device('cuda') \ + if torch.cuda.is_available() else torch.device('cpu') + detector = YOLOv4(device=device, **config) + imgnames = sorted(glob(join(image_root, '*'+ext))) + if len(imgnames) == 0: + ext = '.png' + imgnames = sorted(glob(join(image_root, '*'+ext))) + # run_yolo(image_root, ) + for imgname in tqdm(imgnames, desc='{:10s}'.format(os.path.basename(annot_root))): + base = os.path.basename(imgname).replace(ext, '') + annotname = join(annot_root, base+'.json') + annot = create_annot_file(annotname, imgname) + image = cv2.imread(imgname) + image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + detections = detector.predict_single(image_rgb) + annots = [] + pid = 0 + for i in range(len(detections)): + annot_ = { + 'bbox': [float(d) for d in detections[i]], + 'isKeyframe': False + } + annot_['area'] = max(annot_['bbox'][2] - annot_['bbox'][0], annot_['bbox'][3] - annot_['bbox'][1])**2 + annots.append(annot_) + annots.sort(key=lambda x:-x['area']) + # re-assign the person ID + for i in range(len(annots)): + annots[i]['personID'] = i + pid + annot['annots'] = annots + save_annot(annotname, annot) + +def extract_hrnet(image_root, annot_root, ext, **config): + config.pop('force') + import torch + imgnames = sorted(glob(join(image_root, '*'+ext))) + import torch + device = torch.device('cuda') \ + if torch.cuda.is_available() else torch.device('cpu') + from .HRNet import SimpleHRNet + estimator = SimpleHRNet(device=device, **config) + + for imgname in tqdm(imgnames, desc='{:10s}'.format(os.path.basename(annot_root))): + base = os.path.basename(imgname).replace(ext, '') + annotname = join(annot_root, base+'.json') + annots = read_json(annotname) + detections = np.array([data['bbox'] for data in annots['annots']]) + image = cv2.imread(imgname) + image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + points2d = estimator.predict(image_rgb, detections) + for i in range(detections.shape[0]): + annot_ = annots['annots'][i] + annot_['keypoints'] = points2d[i] + save_annot(annotname, annots) + +def extract_yolo_hrnet(image_root, annot_root, ext, config_yolo, config_hrnet): + config_yolo.pop('ext', None) + imgnames = sorted(glob(join(image_root, '*{}'.format(ext)))) + import torch + device = torch.device('cuda') + from .YOLOv4 import YOLOv4 + device = torch.device('cuda') \ + if torch.cuda.is_available() else torch.device('cpu') + detector = YOLOv4(device=device, **config_yolo) + from .HRNet import SimpleHRNet + estimator = SimpleHRNet(device=device, **config_hrnet) + + for nf, imgname in 
enumerate(tqdm(imgnames, desc=os.path.basename(image_root))): + base = os.path.basename(imgname).replace(ext, '') + annotname = join(annot_root, base+'.json') + annot = create_annot_file(annotname, imgname) + img0 = cv2.imread(imgname) + annot = create_annot_file(annotname, imgname) + image = cv2.imread(imgname) + image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + detections = detector.predict_single(image_rgb) + # forward_hrnet + points2d = estimator.predict(image_rgb, detections) + annots = [] + pid = 0 + for i in range(len(detections)): + annot_ = { + 'bbox': [float(d) for d in detections[i]], + 'keypoints': points2d[i], + 'isKeyframe': False + } + annot_['area'] = max(annot_['bbox'][2] - annot_['bbox'][0], annot_['bbox'][3] - annot_['bbox'][1])**2 + annots.append(annot_) + annots.sort(key=lambda x:-x['area']) + # re-assign the person ID + for i in range(len(annots)): + annots[i]['personID'] = i + pid + annot['annots'] = annots + save_annot(annotname, annot) \ No newline at end of file diff --git a/scripts/preprocess/copy_dataset.py b/scripts/preprocess/copy_dataset.py new file mode 100644 index 0000000..dadb93e --- /dev/null +++ b/scripts/preprocess/copy_dataset.py @@ -0,0 +1,184 @@ +''' + @ Date: 2021-06-14 15:39:26 + @ Author: Qing Shuai + @ LastEditors: Qing Shuai + @ LastEditTime: 2022-08-02 21:50:40 + @ FilePath: /EasyMocapPublic/scripts/preprocess/copy_dataset.py +''' +import os +from os.path import join +import shutil +from tqdm import tqdm +from glob import glob +import cv2 + +from easymocap.mytools.debug_utils import myerror, mywarn + +mkdir = lambda x:os.makedirs(x, exist_ok=True) + +import json + +def save_json(file, data): + if not os.path.exists(os.path.dirname(file)): + os.makedirs(os.path.dirname(file)) + with open(file, 'w') as f: + json.dump(data, f, indent=4) + +def read_json(path): + with open(path) as f: + data = json.load(f) + return data + +def copy_dataset(inp, out, start, end, step, keys, args): + copy_keys = { + 'images': args.ext, + 'annots': '.json', + 'mask-schp': '.png', + } + copy_share_keys = { + 'output-keypoints3d/keypoints3d': '.json' + } + mkdir(out) + if os.path.exists(join(inp, 'intri.yml')): + shutil.copyfile(join(inp, 'intri.yml'), join(out, 'intri.yml')) + shutil.copyfile(join(inp, 'extri.yml'), join(out, 'extri.yml')) + if os.path.exists(join(inp, 'match_name.json')): + names = read_json(join(inp, 'match_name.json')) + names = names[start:end:step] + save_json(join(out, 'match_name.json'), names) + if os.path.exists(join(inp, 'sync_time.txt')): + import numpy as np + times = np.loadtxt(join(inp, 'sync_time.txt')) + times = times.reshape(times.shape[0], -1) + times = times[:, start:end:step] + np.savetxt(join(out, 'sync_time.txt'), times, fmt='%10d') + os.system('touch ' + join(out, '{}-{}-{}'.format(start, end, step))) + for copy, ext in copy_share_keys.items(): + if not os.path.exists(join(inp, copy)): + continue + if len(args.frames) == 0: + ranges = [i for i in range(start, end, step)] + else: + ranges = args.frames + outdir = join(out, copy) + if os.path.exists(outdir) and len(os.listdir(outdir)) == len(ranges): + pass + os.makedirs(outdir, exist_ok=True) + for nnf, nf in enumerate(tqdm(ranges, desc='{}'.format(copy))): + oldname = join(inp, copy, '{:06d}{}'.format(nf, ext)) + if not os.path.exists(oldname): + mywarn('{} not exists'.format(oldname)) + continue + newname = join(outdir, '{:06d}{}'.format(nnf, ext)) + shutil.copyfile(oldname, newname) + + for copy in keys: + ext = copy_keys.get(copy, '.json') + if not os.path.exists(join(inp, 
copy)): + continue + if len(args.subs) == 0: + subs = sorted(os.listdir(join(inp, copy))) + subs = [s for s in subs if os.path.isdir(join(inp, copy, s))] + else: + subs = args.subs + for sub in subs: + if not os.path.exists(join(inp, copy)): + continue + outdir = join(out, copy, sub.replace(args.strip, '')) + os.makedirs(outdir, exist_ok=True) + if args.end == -1: + oldnames = sorted(glob(join(inp, copy, sub, '*{}'.format(ext)))) + end = len(oldnames) + print('{} has {} frames'.format(sub, end)) + if args.sample == -1: + if len(args.frames) == 0: + ranges = [i for i in range(start, end, step)] + else: + ranges = args.frames + else: + ranges = [(i/args.sample)*(end-start-2*args.strip_frame)+start+args.strip_frame for i in range(args.sample)] + ranges = [int(i+0.5) for i in ranges] + if os.path.exists(outdir) and len(os.listdir(outdir)) == len(ranges): + mywarn('[copy] Skip {}'.format(outdir)) + continue + for nnf, nf in enumerate(tqdm(ranges, desc='{}:{}'.format(sub, copy))): + oldname = join(inp, copy, sub, '{:06d}{}'.format(nf, ext)) + if not os.path.exists(oldname): + oldnames = sorted(glob(join(inp, copy, sub, '{:06d}_*{}'.format(nf, ext)))) + if len(oldnames) == 0: + myerror('{} not exists'.format(oldname)) + import ipdb;ipdb.set_trace() + else: + for oldname in oldnames: + newname = join(outdir, os.path.basename(oldname).replace('{:06d}'.format(nf), '{:06d}'.format(nnf))) + shutil.copyfile(oldname, newname) + else: + newname = join(outdir, '{:06d}{}'.format(nnf, ext)) + if copy == 'images' and args.scale != 1: + img = cv2.imread(oldname) + img = cv2.resize(img, None, fx=args.scale, fy=args.scale) + cv2.imwrite(newname, img) + else: + shutil.copyfile(oldname, newname) + # make videos + if copy == 'images' and args.make_video: + os.makedirs(join(out, 'videos'), exist_ok=True) + for sub in subs: + shell = '{} -y -i {}/images/{}/%06d{} -vcodec libx264 {}/videos/{}.mp4 -loglevel quiet'.format( + args.ffmpeg, out, sub, ext, out, sub + ) + print(shell) + os.system(shell) + +def export(root, out, keys): + mkdir(out) + for key in keys: + src = join(root, key) + dst = join(out, key) + if key == 'videos': + if os.path.exists(src): + shutil.copytree(src, dst) + else: + mkdir(dst) + subs = sorted(os.listdir(join(root, 'images'))) + for sub in subs: + cmd = '{ffmpeg} -r {fps} -i {inp}/%06d.jpg -vcodec libx264 {out}'.format( + ffmpeg=args.ffmpeg, fps=50, inp=join(root, 'images', sub), + out=join(dst, sub+'.mp4') + ) + os.system(cmd) + if not os.path.exists(src): + print(src) + continue + shutil.copytree(src, dst) + for name in ['intri.yml', 'extri.yml']: + if os.path.exists(join(root, name)): + shutil.copyfile(join(root, name), join(out, name)) + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('path', type=str) + parser.add_argument('out', type=str) + parser.add_argument('--strip', type=str, default='') + parser.add_argument('--keys', type=str, nargs='+', default=['images', 'annots', 'chessboard']) + parser.add_argument('--subs', type=str, nargs='+', default=[]) + parser.add_argument('--start', type=int, default=0) + parser.add_argument('--step', type=int, default=1) + parser.add_argument('--end', type=int, default=-1) + parser.add_argument('--scale', type=float, default=1) + parser.add_argument('--strip_frame', type=int, default=0, + help='remove the start frames and end frames') + parser.add_argument('--ffmpeg', type=str, default='ffmpeg') + parser.add_argument('--ext', type=str, default='.jpg') + parser.add_argument('--sample', type=int, 
default=-1,
+        help='use this flag to sample a fixed number of frames')
+    parser.add_argument('--frames', type=int, default=[], nargs='+')
+    parser.add_argument('--debug', action='store_true')
+    parser.add_argument('--make_video', action='store_true')
+    parser.add_argument('--export', action='store_true')
+    args = parser.parse_args()
+    if args.export:
+        export(args.path, args.out, args.keys)
+    else:
+        copy_dataset(args.path, args.out, start=args.start, end=args.end, step=args.step, keys=args.keys, args=args)
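
Usage sketch (not part of the patch): a minimal single-image driver that mirrors extract_yolo_hrnet() in yolohrnet_wrapper.py. The checkpoint paths and threshold values are illustrative placeholders, and the SimpleHRNet constructor arguments other than `device` are assumptions (its signature lives in hrnet_api.py, not shown here); adjust them to your local setup.

    import cv2
    import torch
    from easymocap.estimator.YOLOv4 import YOLOv4
    from easymocap.estimator.HRNet import SimpleHRNet

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # ckpt_path, box_nms_thres and conf_thres are real constructor parameters;
    # the concrete values/paths below are placeholders, not taken from the patch.
    detector = YOLOv4(device=device,
                      ckpt_path='data/models/yolov4.weights',
                      box_nms_thres=0.45, conf_thres=0.3)

    # SimpleHRNet keyword arguments besides `device` are assumed here.
    estimator = SimpleHRNet(device=device,
                            ckpt_path='data/models/pose_hrnet_w48_384x288.pth')

    image = cv2.cvtColor(cv2.imread('example.jpg'), cv2.COLOR_BGR2RGB)
    bboxes = detector.predict_single(image)       # (N, 5): x1, y1, x2, y2, score in pixels
    keypoints = estimator.predict(image, bboxes)  # 2D keypoints for each detected person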