'''
Date: 2021-10-25 11:51:37 am
Author: dihuangdh
Descriptions:
-----
LastEditTime: 2021-10-25 1:50:40 pm
LastEditors: dihuangdh
'''

import torch
from torchvision.transforms import Normalize
import numpy as np
import cv2

from .models import hmr


class constants:
    FOCAL_LENGTH = 5000.
    IMG_RES = 224

    # Mean and standard deviation for normalizing input image
    IMG_NORM_MEAN = [0.485, 0.456, 0.406]
    IMG_NORM_STD = [0.229, 0.224, 0.225]


def get_transform(center, scale, res, rot=0):
    """Generate transformation matrix."""
    h = 200 * scale
    t = np.zeros((3, 3))
    t[0, 0] = float(res[1]) / h
    t[1, 1] = float(res[0]) / h
    t[0, 2] = res[1] * (-float(center[0]) / h + .5)
    t[1, 2] = res[0] * (-float(center[1]) / h + .5)
    t[2, 2] = 1
    if not rot == 0:
        rot = -rot  # To match direction of rotation from cropping
        rot_mat = np.zeros((3, 3))
        rot_rad = rot * np.pi / 180
        sn, cs = np.sin(rot_rad), np.cos(rot_rad)
        rot_mat[0, :2] = [cs, -sn]
        rot_mat[1, :2] = [sn, cs]
        rot_mat[2, 2] = 1
        # Need to rotate around center
        t_mat = np.eye(3)
        t_mat[0, 2] = -res[1] / 2
        t_mat[1, 2] = -res[0] / 2
        t_inv = t_mat.copy()
        t_inv[:2, 2] *= -1
        t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t)))
    return t
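
# A minimal usage sketch (numbers are illustrative only): the matrix maps
# homogeneous pixel coordinates of the original image into the res-sized
# crop frame, so the bbox centre lands at the centre of the crop.
#
#   T = get_transform(center=[320, 240], scale=1.0, res=(224, 224))
#   np.dot(T, [320, 240, 1.0])   # -> approximately [112, 112, 1]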


def transform(pt, center, scale, res, invert=0, rot=0):
    """Transform pixel location to different reference."""
    t = get_transform(center, scale, res, rot=rot)
    if invert:
        t = np.linalg.inv(t)
    new_pt = np.array([pt[0] - 1, pt[1] - 1, 1.]).T
    new_pt = np.dot(t, new_pt)
    return new_pt[:2].astype(int) + 1
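
# Illustrative example (hypothetical values): with invert=1 the mapping runs
# from crop coordinates back to original-image coordinates, which is how
# crop() recovers the bounding-box corners below.
#
#   transform([1, 1], center=[320, 240], scale=1.0, res=(224, 224), invert=1)
#   # -> approximately [221, 141], the upper-left corner of the 200 px box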


def crop(img, center, scale, res, rot=0, bias=0):
    """Crop image according to the supplied bounding box."""
    # Upper left point
    ul = np.array(transform([1, 1], center, scale, res, invert=1)) - 1
    # Bottom right point
    br = np.array(transform([res[0] + 1,
                             res[1] + 1], center, scale, res, invert=1)) - 1

    # Padding so that when rotated proper amount of context is included
    pad = int(np.linalg.norm(br - ul) / 2 - float(br[1] - ul[1]) / 2)
    if not rot == 0:
        ul -= pad
        br += pad

    new_shape = [br[1] - ul[1], br[0] - ul[0]]
    if len(img.shape) > 2:
        new_shape += [img.shape[2]]
    new_img = np.zeros(new_shape) + bias

    # Range to fill new array
    new_x = max(0, -ul[0]), min(br[0], len(img[0])) - ul[0]
    new_y = max(0, -ul[1]), min(br[1], len(img)) - ul[1]
    # Range to sample from original image
    old_x = max(0, ul[0]), min(len(img[0]), br[0])
    old_y = max(0, ul[1]), min(len(img), br[1])
    new_img[new_y[0]:new_y[1], new_x[0]:new_x[1]] = img[old_y[0]:old_y[1],
                                                        old_x[0]:old_x[1]]

    if not rot == 0:
        # Rotate around the crop centre, then remove the padding
        # (scipy.misc.imrotate is gone from modern SciPy; OpenCV rotates
        # counter-clockwise for positive angles, matching the old behaviour)
        (h, w) = new_img.shape[:2]
        rot_mat = cv2.getRotationMatrix2D((w / 2, h / 2), rot, 1.0)
        new_img = cv2.warpAffine(new_img.astype(np.float32), rot_mat, (w, h))
        new_img = new_img[pad:-pad, pad:-pad]

    new_img = cv2.resize(new_img, (res[0], res[1]))
    return new_img
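
# Illustrative call (hypothetical frame and box values): cut a 200 px person
# box centred at (320, 240) out of `frame` and resample it to 224 x 224; any
# region falling outside the frame is filled with `bias`.
#
#   patch = crop(frame, center=[320, 240], scale=1.0, res=(224, 224))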


def process_image(img, bbox, input_res=224):
    """Preprocess the image and crop it using the supplied bounding box.

    The bounding box is given as (left, top, right, bottom) pixel
    coordinates; the crop is resized to input_res x input_res and
    normalized with the ImageNet statistics expected by the network.
    """
    img = img[:, :, ::-1].copy()  # BGR (OpenCV) -> RGB
    normalize_img = Normalize(mean=constants.IMG_NORM_MEAN, std=constants.IMG_NORM_STD)
    l, t, r, b = bbox[:4]
    center = [(l + r) / 2, (t + b) / 2]
    width = max(r - l, b - t)
    scale = width / 200.0  # crop() expects the box size in units of 200 px
    img = crop(img, center, scale, (input_res, input_res))
    img = img.astype(np.float32) / 255.
    img = torch.from_numpy(img).permute(2, 0, 1)
    norm_img = normalize_img(img.clone())[None]
    return img, norm_img
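
# Typical use (sketch; `frame` and the box values are hypothetical):
#
#   img, norm_img = process_image(frame, bbox=[100, 80, 300, 400],
#                                 input_res=constants.IMG_RES)
#   # img: (3, 224, 224) float tensor in [0, 1]
#   # norm_img: (1, 3, 224, 224), ImageNet-normalized, ready for the network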


def solve_translation(X, x, K):
    """Solve for the camera translation that projects the 3D joints X onto
    the 2D detections x (with per-joint confidences in x[:, 2]) under the
    intrinsics K.

    Each joint contributes two linear equations in t = (tx, ty, tz) from
    the pinhole model:
        X + tx = (u - cx) / fx * (Z + tz)
        Y + ty = (v - cy) / fy * (Z + tz)
    which are stacked into A @ t = b and solved by weighted least squares.
    """
    A = np.zeros((2 * X.shape[0], 3))
    b = np.zeros((2 * X.shape[0], 1))
    fx, fy = K[0, 0], K[1, 1]
    cx, cy = K[0, 2], K[1, 2]
    for nj in range(X.shape[0]):
        A[2 * nj, 0] = 1
        A[2 * nj + 1, 1] = 1
        A[2 * nj, 2] = -(x[nj, 0] - cx) / fx
        A[2 * nj + 1, 2] = -(x[nj, 1] - cy) / fy
        b[2 * nj, 0] = X[nj, 2] * (x[nj, 0] - cx) / fx - X[nj, 0]
        b[2 * nj + 1, 0] = X[nj, 2] * (x[nj, 1] - cy) / fy - X[nj, 1]
        # Weight both equations of this joint by its detection confidence
        A[2 * nj:2 * nj + 2, :] *= x[nj, 2]
        b[2 * nj:2 * nj + 2, :] *= x[nj, 2]
    trans = np.linalg.inv(A.T @ A) @ A.T @ b
    return trans.T[0]
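
# Quick sanity check (made-up numbers, not part of the original pipeline):
# synthetic detections generated from a known translation are recovered
# exactly, since the resulting system is consistent and over-determined.
#
#   K = np.array([[1000., 0., 320.], [0., 1000., 240.], [0., 0., 1.]])
#   X = np.array([[0., 0., 0.], [0.3, 0., 0.1], [0., 0.4, 0.2]])
#   t_gt = np.array([0.05, -0.02, 5.0])
#   uvw = (X + t_gt) @ K.T
#   uv = uvw[:, :2] / uvw[:, 2:]
#   x = np.hstack([uv, np.ones((3, 1))])      # confidence 1 for every joint
#   solve_translation(X, x, K)                # -> approx. [0.05, -0.02, 5.0]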


def estimate_translation_np(S, joints_2d, joints_conf, K):
    """Find the camera translation that brings the 3D joints S closest to
    their corresponding 2D detections joints_2d.

    Input:
        S: (25, 3) 3D joint locations
        joints_2d: (25, 2) 2D joint locations
        joints_conf: (25,) detection confidences
        K: (3, 3) camera intrinsics
    Returns:
        (3,) camera translation vector
    """
    num_joints = S.shape[0]
    # focal length
    f = np.array([K[0, 0], K[1, 1]])
    # optical center
    center = np.array([K[0, 2], K[1, 2]])

    # transformations
    Z = np.reshape(np.tile(S[:, 2], (2, 1)).T, -1)
    XY = np.reshape(S[:, 0:2], -1)
    O = np.tile(center, num_joints)
    F = np.tile(f, num_joints)
    weight2 = np.reshape(np.tile(np.sqrt(joints_conf), (2, 1)).T, -1)

    # least squares
    Q = np.array([F * np.tile(np.array([1, 0]), num_joints),
                  F * np.tile(np.array([0, 1]), num_joints),
                  O - np.reshape(joints_2d, -1)]).T
    c = (np.reshape(joints_2d, -1) - O) * Z - F * XY

    # weighted least squares
    W = np.diagflat(weight2)
    Q = np.dot(W, Q)
    c = np.dot(W, c)

    # square matrix
    A = np.dot(Q.T, Q)
    b = np.dot(Q.T, c)

    # solution
    trans = np.linalg.solve(A, b)

    return trans
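
# Note: this solves the same pinhole system as solve_translation above, but
# vectorized, and the weights enter as sqrt(joints_conf) before forming the
# normal equations, so each joint ends up weighted by its confidence. With
# unit confidences the two solvers reduce to identical normal equations.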


class HMR:
    def __init__(self, checkpoint, device) -> None:
        model = hmr().to(device)
        checkpoint = torch.load(checkpoint, map_location=device)
        state_dict = checkpoint['state_dict']
        # update state_dict: strip the leading 'model.' (6 chars) from keys
        for key in list(state_dict.keys()):
            state_dict[key[6:]] = state_dict.pop(key)
        model.load_state_dict(state_dict, strict=False)
        model.eval()
        self.model = model
        self.device = device

    def forward(self, img, bbox, use_rh_th=True):
        # Preprocess input image and generate predictions
        img, norm_img = process_image(img, bbox, input_res=constants.IMG_RES)
        with torch.no_grad():
            pred_rotmat, pred_betas, pred_camera = self.model(norm_img.to(self.device))
        results = {
            'shapes': pred_betas.detach().cpu().numpy()
        }
        results['poses'] = pred_rotmat.detach().cpu().numpy()
        if use_rh_th:
            # Split the global orientation out of the pose: Rh keeps the root
            # rotation, Th starts at a default depth of 5 along the camera z-axis
            body_params = {
                'poses': results['poses'],
                'shapes': results['shapes'],
                'Rh': results['poses'][:, :3].copy(),
                'Th': np.zeros((1, 3)),
            }
            body_params['Th'][0, 2] = 5
            body_params['poses'][:, :3] = 0
            results = body_params
        return results

    def __call__(self, body_model, img, bbox, kpts, camera, ret_vertices=True):
        body_params = self.forward(img.copy(), bbox)
        body_params = body_model.check_params(body_params)
        # only use body joints to estimate translation
        nJoints = 21
        keypoints3d = body_model(return_verts=False, return_tensor=False, **body_params)[0]
        trans = solve_translation(keypoints3d[:nJoints], kpts[:nJoints], camera['K'])
        body_params['Th'] += trans[None, :]
        if body_params['Th'][0, 2] < 0:
            # The solver can place the subject behind the camera; mirror the
            # translation and rotate the model 180 degrees about the x-axis
            body_params['Th'] = -body_params['Th']
            Rhold = cv2.Rodrigues(body_params['Rh'])[0]
            rotx = cv2.Rodrigues(np.pi * np.array([1., 0, 0]))[0]
            Rhold = rotx @ Rhold
            body_params['Rh'] = cv2.Rodrigues(Rhold)[0].reshape(1, 3)
        # convert to world coordinate
        Rhold = cv2.Rodrigues(body_params['Rh'])[0]
        Thold = body_params['Th']
        Rh = camera['R'].T @ Rhold
        Th = (camera['R'].T @ (Thold.T - camera['T'])).T
        body_params['Th'] = Th
        body_params['Rh'] = cv2.Rodrigues(Rh)[0].reshape(1, 3)
        keypoints3d = body_model(return_verts=False, return_tensor=False, **body_params)[0]
        results = {'body_params': body_params, 'keypoints3d': keypoints3d}
        if ret_vertices:
            vertices = body_model(return_verts=True, return_tensor=False, **body_params)[0]
            results['vertices'] = vertices
        return results
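
# Usage sketch (the checkpoint path and the surrounding objects are
# assumptions inferred from the call signatures, not verified against the
# rest of the repo):
#
#   hmr_model = HMR('data/hmr_checkpoint.pt', device='cuda')
#   out = hmr_model(body_model, frame, bbox, kpts, camera)
#   out['body_params'], out['keypoints3d'], out['vertices']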


def init_with_hmr(body_model, spin_model, img, bbox, kpts, camera):
    body_params = spin_model.forward(img.copy(), bbox)
    body_params = body_model.check_params(body_params)
    # only use body joints to estimate translation
    nJoints = 15
    keypoints3d = body_model(return_verts=False, return_tensor=False, **body_params)[0]
    trans = estimate_translation_np(keypoints3d[:nJoints], kpts[:nJoints, :2], kpts[:nJoints, 2], camera['K'])
    body_params['Th'] += trans[None, :]
    # convert to world coordinate
    Rhold = cv2.Rodrigues(body_params['Rh'])[0]
    Thold = body_params['Th']
    Rh = camera['R'].T @ Rhold
    Th = (camera['R'].T @ (Thold.T - camera['T'])).T
    body_params['Th'] = Th
    body_params['Rh'] = cv2.Rodrigues(Rh)[0].reshape(1, 3)
    vertices = body_model(return_verts=True, return_tensor=False, **body_params)[0]
    keypoints3d = body_model(return_verts=False, return_tensor=False, **body_params)[0]
    results = {'body_params': body_params, 'vertices': vertices, 'keypoints3d': keypoints3d}
    return results
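
# The world-coordinate conversion above inverts the camera extrinsics: with
# X_cam = R @ X_world + T, a pose (Rh, Th) estimated in the camera frame
# becomes Rh_world = R.T @ Rh and Th_world = R.T @ (Th - T).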


if __name__ == '__main__':
    pass