# Portrait-Animation/src/dataset/test_preprocess.py
import os
import numpy as np
from PIL import Image
import torch
import torchvision.transforms as transforms
from transformers import CLIPImageProcessor
import librosa


def process_bbox(bbox, expand_radio, height, width):
    """
    bbox: face box in (x1, y1, x2, y2) format
    expand_radio: expansion ratio applied to the bbox size
    height, width: source image height and width
    """
    def expand(bbox, ratio, height, width):
        # Grow the box by `ratio` of its own width/height, clamped to the image bounds.
        bbox_h = bbox[3] - bbox[1]
        bbox_w = bbox[2] - bbox[0]
        expand_x1 = max(bbox[0] - ratio * bbox_w, 0)
        expand_y1 = max(bbox[1] - ratio * bbox_h, 0)
        expand_x2 = min(bbox[2] + ratio * bbox_w, width)
        expand_y2 = min(bbox[3] + ratio * bbox_h, height)
        return [expand_x1, expand_y1, expand_x2, expand_y2]
    def to_square(bbox_src, bbox_expend, height, width):
        # Shrink the expanded box to a square whose side is its shorter edge,
        # sliding the square along the longer axis towards the source box center.
        h = bbox_expend[3] - bbox_expend[1]
        w = bbox_expend[2] - bbox_expend[0]
        c_h = (bbox_expend[1] + bbox_expend[3]) / 2
        c_w = (bbox_expend[0] + bbox_expend[2]) / 2
        c = min(h, w) / 2
        c_src_h = (bbox_src[1] + bbox_src[3]) / 2
        c_src_w = (bbox_src[0] + bbox_src[2]) / 2

        s_h, s_w = 0, 0
        if w < h:
            # Taller than wide: shift vertically towards the source center.
            d = abs((h - w) / 2)
            s_h = min(d, abs(c_src_h - c_h))
            s_h = s_h if c_src_h > c_h else s_h * (-1)
        else:
            # Wider than tall: shift horizontally towards the source center.
            d = abs((h - w) / 2)
            s_w = min(d, abs(c_src_w - c_w))
            s_w = s_w if c_src_w > c_w else s_w * (-1)

        c_h = (bbox_expend[1] + bbox_expend[3]) / 2 + s_h
        c_w = (bbox_expend[0] + bbox_expend[2]) / 2 + s_w

        square_x1 = c_w - c
        square_y1 = c_h - c
        square_x2 = c_w + c
        square_y2 = c_h + c

        x1, y1, x2, y2 = square_x1, square_y1, square_x2, square_y2
        ww = x2 - x1
        hh = y2 - y1
        cc_x = (x1 + x2) / 2
        cc_y = (y1 + y2) / 2
        # Re-center and round so the final crop is exactly 1:1.
        ww = hh = min(ww, hh)
        x1, x2 = round(cc_x - ww / 2), round(cc_x + ww / 2)
        y1, y2 = round(cc_y - hh / 2), round(cc_y + hh / 2)
        return [round(x1), round(y1), round(x2), round(y2)]

    bbox_expend = expand(bbox, expand_radio, height=height, width=width)
    processed_bbox = to_square(bbox, bbox_expend, height=height, width=width)
    return processed_bbox
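

# Illustrative usage sketch (not part of the original file): the box and image size
# below are made-up example values. process_bbox first expands the detected face box
# by `expand_radio` on each side, then squares it to the shorter edge while keeping
# the crop inside the image.
def _example_process_bbox():
    face_box = [100.0, 120.0, 300.0, 360.0]  # x1, y1, x2, y2 in pixels (example values)
    square = process_bbox(face_box, expand_radio=0.5, height=720, width=1280)
    # `square` is [x1, y1, x2, y2] with equal width and height (a 1:1 crop).
    return square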


def get_audio_feature(audio_path, feature_extractor):
    # Load mono audio at 16 kHz and extract features chunk by chunk
    # (750 * 640 = 480000 samples, i.e. 30 s windows at 16 kHz).
    audio_input, sampling_rate = librosa.load(audio_path, sr=16000)
    assert sampling_rate == 16000

    audio_features = []
    window = 750 * 640
    for i in range(0, len(audio_input), window):
        audio_feature = feature_extractor(
            audio_input[i:i + window],
            sampling_rate=sampling_rate,
            return_tensors="pt",
        ).input_features
        audio_features.append(audio_feature)

    audio_features = torch.cat(audio_features, dim=-1)
    # Second return value: number of 640-sample (40 ms) audio frames.
    return audio_features, len(audio_input) // 640
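

# Illustrative sketch (not part of the original file): the call above reads
# `.input_features`, so `feature_extractor` is assumed to be a Whisper-style
# extractor from transformers; the checkpoint name below is a placeholder.
def _example_extract_audio_features(audio_path):
    from transformers import WhisperFeatureExtractor  # assumed extractor type
    extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
    features, num_frames = get_audio_feature(audio_path, extractor)
    # `features` concatenates the features of 30 s windows along the last
    # dimension; `num_frames` counts 640-sample (40 ms, i.e. 25 per second) frames.
    return features, num_frames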


def image_audio_to_tensor(align_instance, feature_extractor, image_path, audio_path, limit=100, image_size=512, area=1.25):
    clip_processor = CLIPImageProcessor()
    to_tensor = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])
    mask_to_tensor = transforms.Compose([
        transforms.ToTensor(),
    ])

    imSrc_ = Image.open(image_path).convert('RGB')
    w, h = imSrc_.size

    # Detect the largest face; the aligner expects a BGR array.
    _, _, bboxes_list = align_instance(np.array(imSrc_)[:, :, [2, 1, 0]], maxface=True)
    if len(bboxes_list) == 0:
        return None
    bboxSrc = bboxes_list[0]

    # Convert (x, y, w, h) to corner coordinates.
    x1, y1, ww, hh = bboxSrc
    x2, y2 = x1 + ww, y1 + hh

    # Build a binary face mask from the bbox enlarged by `area`, clamped to the image.
    mask_img = np.zeros_like(np.array(imSrc_))
    ww, hh = (x2 - x1) * area, (y2 - y1) * area
    center = [(x2 + x1) // 2, (y2 + y1) // 2]
    x1 = max(center[0] - ww // 2, 0)
    y1 = max(center[1] - hh // 2, 0)
    x2 = min(center[0] + ww // 2, w)
    y2 = min(center[1] + hh // 2, h)
    mask_img[int(y1):int(y2), int(x1):int(x2)] = 255
    mask_img = Image.fromarray(mask_img)

    # Resize so the shorter side matches image_size, rounded to multiples of 64.
    w, h = imSrc_.size
    scale = image_size / min(w, h)
    new_w = round(w * scale / 64) * 64
    new_h = round(h * scale / 64) * 64
    if new_h != h or new_w != w:
        imSrc = imSrc_.resize((new_w, new_h), Image.LANCZOS)
        mask_img = mask_img.resize((new_w, new_h), Image.LANCZOS)
    else:
        imSrc = imSrc_

    # 224x224 input for the CLIP image encoder.
    clip_image = clip_processor(
        images=imSrc.resize((224, 224), Image.LANCZOS), return_tensors="pt"
    ).pixel_values[0]

    audio_input, audio_len = get_audio_feature(audio_path, feature_extractor)
    audio_len = min(limit, audio_len)

    sample = dict(
        face_mask=mask_to_tensor(mask_img),
        ref_img=to_tensor(imSrc),
        clip_images=clip_image,
        audio_feature=audio_input[0],
        audio_len=audio_len,
    )
    return sample
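

# Illustrative sketch (not part of the original file): one way this preprocessing
# could be driven end to end. The stub aligner and the file paths are placeholder
# assumptions; substitute the project's own face detector and real inputs.
if __name__ == "__main__":
    from transformers import WhisperFeatureExtractor  # assumed extractor type

    def _stub_aligner(image_bgr, maxface=True):
        # Placeholder detector: return one (x, y, w, h) box over the image center.
        h, w = image_bgr.shape[:2]
        return None, None, [[w // 4, h // 4, w // 2, h // 2]]

    feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
    sample = image_audio_to_tensor(
        _stub_aligner,
        feature_extractor,
        image_path="examples/face.png",    # placeholder path
        audio_path="examples/speech.wav",  # placeholder path
        limit=100,
        image_size=512,
        area=1.25,
    )
    if sample is not None:
        print({k: getattr(v, "shape", v) for k, v in sample.items()})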