import os

import cv2
import numpy as np
import torch
import torchvision


class VGGPerceptualLoss(torch.nn.Module):
    """Perceptual loss over the first four stages of an ImageNet-pretrained
    VGG16 (feature maps after relu1_2, relu2_2, relu3_3, relu4_3)."""

    def __init__(self, resize=False):
        super().__init__()
        # Instantiate VGG16 once and slice its feature stack, rather than
        # constructing the full model four times. (pretrained=True is
        # deprecated in newer torchvision; weights=
        # torchvision.models.VGG16_Weights.IMAGENET1K_V1 is the modern form.)
        features = torchvision.models.vgg16(pretrained=True).features
        blocks = [
            features[:4].eval(),     # conv1_1 .. relu1_2
            features[4:9].eval(),    # pool1   .. relu2_2
            features[9:16].eval(),   # pool2   .. relu3_3
            features[16:23].eval(),  # pool3   .. relu4_3
        ]
        # The loss network is fixed; freeze all VGG parameters.
        for bl in blocks:
            for p in bl.parameters():
                p.requires_grad = False
        self.blocks = torch.nn.ModuleList(blocks)
        self.transform = torch.nn.functional.interpolate
        self.resize = resize
        # ImageNet normalization constants, registered as buffers so they
        # follow the module across devices and dtypes.
        self.register_buffer("mean", torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1))
        self.register_buffer("std", torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1))

    def forward(self, input, target, feature_layers=(0, 1, 2, 3), style_layers=()):
        # Tile grayscale inputs to three channels so VGG can consume them.
        if input.shape[1] != 3:
            input = input.repeat(1, 3, 1, 1)
            target = target.repeat(1, 3, 1, 1)
        input = (input - self.mean) / self.std
        target = (target - self.mean) / self.std
        if self.resize:
            input = self.transform(input, mode='bilinear', size=(224, 224), align_corners=False)
            target = self.transform(target, mode='bilinear', size=(224, 224), align_corners=False)
        loss = 0.0
        x = input
        y = target
        for i, block in enumerate(self.blocks):
            x = block(x)
            y = block(y)
            if i in feature_layers:
                # Content term: L1 distance between feature maps.
                loss += torch.nn.functional.l1_loss(x, y)
            if i in style_layers:
                # Style term: L1 distance between Gram matrices of the
                # flattened activations.
                act_x = x.reshape(x.shape[0], x.shape[1], -1)
                act_y = y.reshape(y.shape[0], y.shape[1], -1)
                gram_x = act_x @ act_x.permute(0, 2, 1)
                gram_y = act_y @ act_y.permute(0, 2, 1)
                loss += torch.nn.functional.l1_loss(gram_x, gram_y)
        return loss


class LossLPIPS(VGGPerceptualLoss):
    """Patch-based wrapper used during training. Despite the name, this is
    the VGG16 perceptual loss above, not the learned LPIPS metric."""

    def forward(self, inp, out):
        # Rays are assumed to arrive as flattened 32x32 patches.
        W, H = 32, 32
        target = inp['rgb'].reshape(-1, W, H, 3)
        inputs = out['rgb_map'].reshape(-1, W, H, 3)
        if inp['step'] % 100 == 0:
            # Periodically dump side-by-side (target | prediction) patches.
            vis_all = []
            for i in range(inputs.shape[0]):
                target_ = target[i].detach().cpu().numpy()
                inputs_ = inputs[i].detach().cpu().numpy()
                vis = np.hstack([target_, inputs_])
                # Clip before the uint8 cast so out-of-range predictions
                # do not wrap around.
                vis = (np.clip(vis, 0.0, 1.0) * 255).astype(np.uint8)
                vis_all.append(vis)
            vis_all = np.vstack(vis_all)
            vis_all = vis_all[..., ::-1]  # RGB -> BGR for OpenCV
            os.makedirs('debug', exist_ok=True)
            cv2.imwrite('debug/vis_lpips_{:08d}.jpg'.format(inp['step']), vis_all)
        # NHWC -> NCHW for the VGG feature extractor.
        target = target.permute(0, 3, 1, 2)
        inputs = inputs.permute(0, 3, 1, 2)
        return super().forward(inputs, target)


if __name__ == '__main__':
    lpips = VGGPerceptualLoss()
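    # Illustrative smoke test; the tensor shapes and values below are
    # arbitrary assumptions, not requirements of the loss.
    pred = torch.rand(2, 3, 64, 64)  # fake predictions in [0, 1]
    gt = torch.rand(2, 3, 64, 64)    # fake targets in [0, 1]
    content = lpips(pred, gt)  # content loss over the default feature layers
    style = lpips(pred, gt, feature_layers=(), style_layers=(0, 1, 2, 3))
    print('content loss:', content.item(), 'style loss:', style.item())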