import os
import numpy as np
from PIL import Image
import torch
import torchvision.transforms as transforms
from transformers import CLIPImageProcessor
import librosa
def process_bbox(bbox, expand_radio, height, width):
    """
    bbox: face bounding box in (x1, y1, x2, y2) pixel format
    expand_radio: expansion ratio applied to the bbox size
    height, width: source image height and width
    """
    def expand(bbox, ratio, height, width):
        # Grow the bbox by `ratio` of its own size on every side,
        # clamped to the image borders.
        bbox_h = bbox[3] - bbox[1]
        bbox_w = bbox[2] - bbox[0]
        expand_x1 = max(bbox[0] - ratio * bbox_w, 0)
        expand_y1 = max(bbox[1] - ratio * bbox_h, 0)
        expand_x2 = min(bbox[2] + ratio * bbox_w, width)
        expand_y2 = min(bbox[3] + ratio * bbox_h, height)
        return [expand_x1, expand_y1, expand_x2, expand_y2]
    def to_square(bbox_src, bbox_expend, height, width):
        # Turn the expanded bbox into a square whose side equals its shorter
        # edge, shifting the centre towards the original (source) bbox centre.
        h = bbox_expend[3] - bbox_expend[1]
        w = bbox_expend[2] - bbox_expend[0]
        c_h = (bbox_expend[1] + bbox_expend[3]) / 2
        c_w = (bbox_expend[0] + bbox_expend[2]) / 2

        c = min(h, w) / 2

        c_src_h = (bbox_src[1] + bbox_src[3]) / 2
        c_src_w = (bbox_src[0] + bbox_src[2]) / 2

        s_h, s_w = 0, 0
        if w < h:
            d = abs((h - w) / 2)
            s_h = min(d, abs(c_src_h - c_h))
            s_h = s_h if c_src_h > c_h else s_h * (-1)
        else:
            d = abs((h - w) / 2)
            s_w = min(d, abs(c_src_w - c_w))
            s_w = s_w if c_src_w > c_w else s_w * (-1)

        c_h = (bbox_expend[1] + bbox_expend[3]) / 2 + s_h
        c_w = (bbox_expend[0] + bbox_expend[2]) / 2 + s_w

        square_x1 = c_w - c
        square_y1 = c_h - c
        square_x2 = c_w + c
        square_y2 = c_h + c

        x1, y1, x2, y2 = square_x1, square_y1, square_x2, square_y2
        ww = x2 - x1
        hh = y2 - y1
        cc_x = (x1 + x2) / 2
        cc_y = (y1 + y2) / 2
        # Force a 1:1 aspect ratio.
        ww = hh = min(ww, hh)
        x1, x2 = round(cc_x - ww / 2), round(cc_x + ww / 2)
        y1, y2 = round(cc_y - hh / 2), round(cc_y + hh / 2)

        return [round(x1), round(y1), round(x2), round(y2)]
    bbox_expend = expand(bbox, expand_radio, height=height, width=width)
    processed_bbox = to_square(bbox, bbox_expend, height=height, width=width)

    return processed_bbox
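
# Usage sketch for process_bbox: it takes a pixel-space (x1, y1, x2, y2) box,
# grows it by expand_radio on every side, and returns an integer square crop
# kept inside the frame. The bbox values and frame size below are illustrative
# placeholders, not values used by this repository.
def _demo_process_bbox():
    example_bbox = [220.0, 150.0, 420.0, 400.0]  # hypothetical face box (x1, y1, x2, y2)
    square_bbox = process_bbox(example_bbox, expand_radio=0.5, height=720, width=1280)
    # square_bbox is an integer [x1, y1, x2, y2] with equal side lengths,
    # clipped to the 1280x720 frame.
    return square_bbox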
def get_audio_feature(audio_path, feature_extractor):
    # Load the audio at 16 kHz and run the feature extractor over chunks of
    # 750 * 640 samples (30 s); 640 samples correspond to one 25 fps frame.
    audio_input, sampling_rate = librosa.load(audio_path, sr=16000)
    assert sampling_rate == 16000

    audio_features = []
    window = 750 * 640
    for i in range(0, len(audio_input), window):
        audio_feature = feature_extractor(audio_input[i:i + window],
                                          sampling_rate=sampling_rate,
                                          return_tensors="pt",
                                          ).input_features
        audio_features.append(audio_feature)

    audio_features = torch.cat(audio_features, dim=-1)
    # Return the concatenated features and the audio length in 25 fps frames.
    return audio_features, len(audio_input) // 640
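
# Usage sketch for get_audio_feature. It assumes a Whisper-style feature
# extractor from transformers; the checkpoint name and audio path are
# placeholders, not values taken from this repository.
def _demo_get_audio_feature():
    from transformers import WhisperFeatureExtractor  # assumed extractor type
    extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
    features, num_frames = get_audio_feature("speech.wav", extractor)  # hypothetical path
    # features: mel features concatenated along the time axis;
    # num_frames: audio length expressed in 25 fps video frames.
    return features, num_frames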
def image_audio_to_tensor(align_instance, feature_extractor, image_path, audio_path, limit=100, image_size=512, area=1.25):
    clip_processor = CLIPImageProcessor()
    to_tensor = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])
    mask_to_tensor = transforms.Compose([
        transforms.ToTensor(),
    ])
    imSrc_ = Image.open(image_path).convert('RGB')
    w, h = imSrc_.size

    # Flip RGB to BGR for the face detector; maxface=True keeps only the
    # largest detected face.
    _, _, bboxes_list = align_instance(np.array(imSrc_)[:, :, [2, 1, 0]], maxface=True)
    if len(bboxes_list) == 0:
        return None
    bboxSrc = bboxes_list[0]

    x1, y1, ww, hh = bboxSrc
    x2, y2 = x1 + ww, y1 + hh

    # Build a binary face mask from the detected box, enlarged by `area`
    # and clipped to the image bounds.
    mask_img = np.zeros_like(np.array(imSrc_))
    ww, hh = (x2 - x1) * area, (y2 - y1) * area
    center = [(x2 + x1) // 2, (y2 + y1) // 2]
    x1 = max(center[0] - ww // 2, 0)
    y1 = max(center[1] - hh // 2, 0)
    x2 = min(center[0] + ww // 2, w)
    y2 = min(center[1] + hh // 2, h)
    mask_img[int(y1):int(y2), int(x1):int(x2)] = 255
    mask_img = Image.fromarray(mask_img)
    # Scale the short side to image_size and snap both sides to multiples of 64.
    w, h = imSrc_.size
    scale = image_size / min(w, h)
    new_w = round(w * scale / 64) * 64
    new_h = round(h * scale / 64) * 64
    if new_h != h or new_w != w:
        imSrc = imSrc_.resize((new_w, new_h), Image.LANCZOS)
        mask_img = mask_img.resize((new_w, new_h), Image.LANCZOS)
    else:
        imSrc = imSrc_

    # 224x224 conditioning image for the CLIP image encoder.
    clip_image = clip_processor(
        images=imSrc.resize((224, 224), Image.LANCZOS), return_tensors="pt"
    ).pixel_values[0]
    audio_input, audio_len = get_audio_feature(audio_path, feature_extractor)
    audio_len = min(limit, audio_len)

    sample = dict(
        face_mask=mask_to_tensor(mask_img),
        ref_img=to_tensor(imSrc),
        clip_images=clip_image,
        audio_feature=audio_input[0],
        audio_len=audio_len,
    )
    return sample
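
# End-to-end usage sketch. `align_instance` is whatever face-detection
# callable the surrounding project constructs elsewhere (it must return bbox
# lists when called on a BGR array with maxface=True); it is simply passed
# through here. The extractor checkpoint and file paths are placeholders.
def _demo_image_audio_to_tensor(align_instance):
    from transformers import WhisperFeatureExtractor  # assumed extractor type
    extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
    sample = image_audio_to_tensor(
        align_instance, extractor,
        image_path="portrait.png",   # hypothetical inputs
        audio_path="speech.wav",
        limit=100, image_size=512, area=1.25,
    )
    if sample is not None:
        # face_mask / ref_img are CHW tensors sized to multiples of 64,
        # clip_images is the 224x224 CLIP pixel tensor, audio_len is capped by `limit`.
        print(sample["ref_img"].shape, sample["audio_len"])
    return sample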