Spaces:

fffiloni
/

echomimic-v2

Running

App Files Files Community

echomimic-v2 / EMTD_dataset /preprocess.py

fffiloni

Migrated from GitHub

6f199b8 verified 12 months ago

raw

history blame contribute delete

12.1 kB

	import sys

	from src.utils.img_utils import pil_to_cv2, cv2_to_pil, center_crop_cv2, pils_from_video, save_videos_from_pils, save_video_from_cv2_list
	from PIL import Image
	import cv2
	from IPython import embed
	import numpy as np
	import copy
	from src.utils.motion_utils import motion_sync
	import pathlib
	import torch
	import pickle
	from glob import glob
	import os
	from src.models.dwpose.dwpose_detector import dwpose_detector as dwprocessor
	from src.models.dwpose.util import draw_pose
	import decord
	from tqdm import tqdm
	from moviepy.editor import AudioFileClip, VideoFileClip
	from multiprocessing.pool import ThreadPool

	##################################
	base_dir = "root"
	tasks = ["emtd"]

	process_num = 800 #1266

	start = 0
	end = process_num + start
	#################################
	MAX_SIZE = 768


	def convert_fps(src_path, tgt_path, tgt_fps=24, tgt_sr=16000):
	clip = VideoFileClip(src_path)
	new_clip = clip.set_fps(tgt_fps)
	if tgt_fps is not None:
	audio = new_clip.audio
	audio = audio.set_fps(tgt_sr)
	new_clip = new_clip.set_audio(audio)

	new_clip.write_videofile(tgt_path, codec='libx264', audio_codec='aac')

	def get_video_pose(
	video_path: str,
	sample_stride: int=1,
	max_frame=None):

	# read input video
	vr = decord.VideoReader(video_path, ctx=decord.cpu(0))
	sample_stride *= max(1, int(vr.get_avg_fps() / 24))

	frames = vr.get_batch(list(range(0, len(vr), sample_stride))).asnumpy()
	if max_frame is not None:
	frames = frames[0:max_frame,:,:]
	height, width, _ = frames[0].shape
	# detected_poses = [dwprocessor(frm) for frm in tqdm(frames, desc="DWPose")]
	detected_poses = [dwprocessor(frm) for frm in frames]
	dwprocessor.release_memory()

	return detected_poses, height, width, frames

	def resize_and_pad(img, max_size):
	img_new = np.zeros((max_size, max_size, 3)).astype('uint8')
	imh, imw = img.shape[0], img.shape[1]
	half = max_size // 2
	if imh > imw:
	imh_new = max_size
	imw_new = int(round(imw/imh * imh_new))
	half_w = imw_new // 2
	rb, re = 0, max_size
	cb = half-half_w
	ce = cb + imw_new
	else:
	imw_new = max_size
	imh_new = int(round(imh/imw * imw_new))
	half_h = imh_new // 2
	cb, ce = 0, max_size
	rb = half-half_h
	re = rb + imh_new

	img_resize = cv2.resize(img, (imw_new, imh_new))
	img_new[rb:re,cb:ce,:] = img_resize
	return img_new

	def resize_and_pad_param(imh, imw, max_size):
	half = max_size // 2
	if imh > imw:
	imh_new = max_size
	imw_new = int(round(imw/imh * imh_new))
	half_w = imw_new // 2
	rb, re = 0, max_size
	cb = half-half_w
	ce = cb + imw_new
	else:
	imw_new = max_size
	imh_new = int(round(imh/imw * imw_new))
	imh_new = max_size

	half_h = imh_new // 2
	cb, ce = 0, max_size
	rb = half-half_h
	re = rb + imh_new

	return imh_new, imw_new, rb, re, cb, ce

	def get_pose_params(detected_poses, max_size):
	print('get_pose_params...')
	# pose rescale
	w_min_all, w_max_all, h_min_all, h_max_all = [], [], [], []
	mid_all = []
	for num, detected_pose in enumerate(detected_poses):
	detected_poses[num]['num'] = num
	candidate_body = detected_pose['bodies']['candidate']
	score_body = detected_pose['bodies']['score']
	candidate_face = detected_pose['faces']
	score_face = detected_pose['faces_score']
	candidate_hand = detected_pose['hands']
	score_hand = detected_pose['hands_score']

	# 选取置信度最高的face
	if candidate_face.shape[0] > 1:
	index = 0
	candidate_face = candidate_face[index]
	score_face = score_face[index]
	detected_poses[num]['faces'] = candidate_face.reshape(1, candidate_face.shape[0], candidate_face.shape[1])
	detected_poses[num]['faces_score'] = score_face.reshape(1, score_face.shape[0])
	else:
	candidate_face = candidate_face[0]
	score_face = score_face[0]

	# 选取置信度最高的body
	if score_body.shape[0] > 1:
	tmp_score = []
	for k in range(0, score_body.shape[0]):
	tmp_score.append(score_body[k].mean())
	index = np.argmax(tmp_score)
	candidate_body = candidate_body[index18:(index+1)18,:]
	score_body = score_body[index]
	score_hand = score_hand[(index2):(index2+2),:]
	candidate_hand = candidate_hand[(index2):(index2+2),:,:]
	else:
	score_body = score_body[0]
	all_pose = np.concatenate((candidate_body, candidate_face))
	all_score = np.concatenate((score_body, score_face))
	all_pose = all_pose[all_score>0.8]


	body_pose = np.concatenate((candidate_body,))
	mid_ = body_pose[1, 0]


	face_pose = candidate_face
	hand_pose = candidate_hand


	h_min, h_max = np.min(face_pose[:,1]), np.max(body_pose[:7,1])

	h_ = h_max - h_min

	mid_w = mid_
	w_min = mid_w - h_ // 2
	w_max = mid_w + h_ // 2

	w_min_all.append(w_min)
	w_max_all.append(w_max)
	h_min_all.append(h_min)
	h_max_all.append(h_max)
	mid_all.append(mid_w)

	w_min = np.min(w_min_all)
	w_max = np.max(w_max_all)
	h_min = np.min(h_min_all)
	h_max = np.max(h_max_all)
	mid = np.mean(mid_all)
	print(mid)

	margin_ratio = 0.25
	h_margin = (h_max-h_min)*margin_ratio

	h_min = max(h_min-h_margin*0.65, 0)
	h_max = min(h_max+h_margin*0.5, 1)

	h_new = h_max - h_min

	h_min_real = int(h_min*height)
	h_max_real = int(h_max*height)
	mid_real = int(mid*width)


	height_new = h_max_real-h_min_real+1
	width_new = height_new
	w_min_real = mid_real - height_new // 2

	w_max_real = w_min_real + width_new
	w_min = w_min_real / width
	w_max = w_max_real / width

	print(width_new, height_new)

	imh_new, imw_new, rb, re, cb, ce = resize_and_pad_param(height_new, width_new, max_size)
	res = {'draw_pose_params': [imh_new, imw_new, rb, re, cb, ce],
	'pose_params': [w_min, w_max, h_min, h_max],
	'video_params': [h_min_real, h_max_real, w_min_real, w_max_real],
	}
	return res

	def save_pose_params_item(input_items):
	detected_pose, pose_params, draw_pose_params, save_dir = input_items
	w_min, w_max, h_min, h_max = pose_params
	num = detected_pose['num']
	candidate_body = detected_pose['bodies']['candidate']
	candidate_face = detected_pose['faces'][0]
	candidate_hand = detected_pose['hands']
	candidate_body[:,0] = (candidate_body[:,0]-w_min)/(w_max-w_min)
	candidate_body[:,1] = (candidate_body[:,1]-h_min)/(h_max-h_min)
	candidate_face[:,0] = (candidate_face[:,0]-w_min)/(w_max-w_min)
	candidate_face[:,1] = (candidate_face[:,1]-h_min)/(h_max-h_min)
	candidate_hand[:,:,0] = (candidate_hand[:,:,0]-w_min)/(w_max-w_min)
	candidate_hand[:,:,1] = (candidate_hand[:,:,1]-h_min)/(h_max-h_min)
	detected_pose['bodies']['candidate'] = candidate_body
	detected_pose['faces'] = candidate_face.reshape(1, candidate_face.shape[0], candidate_face.shape[1])
	detected_pose['hands'] = candidate_hand
	detected_pose['draw_pose_params'] = draw_pose_params
	np.save(save_dir+'/'+str(num)+'.npy', detected_pose)

	def save_pose_params(detected_poses, pose_params, draw_pose_params, ori_video_path):
	save_dir = ori_video_path.replace('original_videos', 'image_audio_features/pose/')
	if not os.path.exists(save_dir):
	os.makedirs(save_dir)

	input_list = []
	for i, detected_pose in enumerate(detected_poses):
	input_list.append([detected_pose, pose_params, draw_pose_params, save_dir])

	pool = ThreadPool(8)
	pool.map(save_pose_params_item, input_list)
	pool.close()
	pool.join()

	def save_processed_video(ori_frames, video_params, ori_video_path, max_size):
	save_path = ori_video_path.replace('original_videos', 'processed/video/')
	save_dir = os.path.dirname(save_path)
	if not os.path.exists(save_dir):
	os.makedirs(save_dir)
	h_min_real, h_max_real, w_min_real, w_max_real = video_params
	video_frame_crop = []
	for img in ori_frames:
	img = img[h_min_real:h_max_real,w_min_real:w_max_real,:]
	img = resize_and_pad(img, max_size=max_size)
	video_frame_crop.append(img)
	save_video_from_cv2_list(video_frame_crop, save_path, fps=24.0, rgb2bgr=True)
	return video_frame_crop

	def save_audio(ori_video_path, sub_task):
	save_path = ori_video_path.replace('original_videos', 'processed/audio/')
	save_dir = os.path.dirname(save_path)
	save_path = save_path + '.wav'
	if not os.path.exists(save_dir):
	os.makedirs(save_dir)
	ori_video_path = ori_video_path.replace(sub_task, sub_task+'_24fps')
	audio_clip = AudioFileClip(ori_video_path)
	audio_clip.write_audiofile(save_path)

	def draw_pose_video(pose_params_path, save_path, max_size, ori_frames=None):
	pose_files = os.listdir(pose_params_path)
	# 生成Pose图cd pro
	output_pose_img = []
	for i in range(0, len(pose_files)):
	pose_params_path_tmp = pose_params_path + '/' + str(i) + '.npy'
	detected_pose = np.load(pose_params_path_tmp, allow_pickle=True).tolist()
	imh_new, imw_new, rb, re, cb, ce = detected_pose['draw_pose_params']
	im = draw_pose(detected_pose, imh_new, imw_new, ref_w=800)
	im = np.transpose(np.array(im),(1,2,0))
	img_new = np.zeros((max_size, max_size, 3)).astype('uint8')
	img_new[rb:re,cb:ce,:] = im
	if ori_frames is not None:
	img_new = img_new * 0.6 + ori_frames[i] * 0.4
	img_new = img_new.astype('uint8')
	output_pose_img.append(img_new)

	output_pose_img = np.stack(output_pose_img)
	save_video_from_cv2_list(output_pose_img, save_path, fps=24.0, rgb2bgr=True)
	print('save to ' + save_path)

	visualization = False
	for sub_task in tasks:

	ori_list = os.listdir(base_dir+sub_task)[start:end]

	mp4_list = ori_list

	new_dir = base_dir+sub_task+'_24fps'
	if not os.path.exists(new_dir):
	os.makedirs(new_dir)
	index = 1
	for i, mp4_file in enumerate(mp4_list):
	ori_video_path = base_dir+sub_task+'/'+mp4_file
	if ori_video_path[-3:]=='mp4' or ori_video_path[-3:] =='MOV':
	try:
	# 转换祯率
	ori_video_path_new = ori_video_path.replace(sub_task, sub_task+'_24fps')
	if '.MOV' in ori_video_path_new:
	ori_video_path_new.replace('.MOV', '.mp4')
	convert_fps(ori_video_path, ori_video_path_new)
	print([index+start, ori_video_path, start, end])
	# 提取Pose
	detected_poses, height, width, ori_frames = get_video_pose(ori_video_path_new, max_frame=None)
	print(height, width)
	# 提取相关参数
	res_params = get_pose_params(detected_poses, MAX_SIZE)
	# 存储Pose参数
	save_pose_params(detected_poses, res_params['pose_params'], res_params['draw_pose_params'], ori_video_path)
	# 存储截取视频
	video_frame_crop = save_processed_video(ori_frames, res_params['video_params'], ori_video_path, MAX_SIZE)
	# 存储音频
	save_audio(ori_video_path, sub_task)
	index += 1
	if visualization:
	# 绘制pose图
	pose_params_path = ori_video_path.replace('original_videos', 'image_audio_features/pose')
	save_path = "./vis_pose_results/" + os.path.basename(ori_video_path)
	draw_pose_video(pose_params_path, save_path, ori_frames=video_frame_crop)
	except:
	print(["extract crash!", index+start, ori_video_path, start, end])
	continue

	print(["All Finished", sub_task, start, end])