File size: 5,828 Bytes

517d380

import os

from skimage import io,img_as_float32
import cv2
import torch
import numpy as np
import subprocess
import pandas
from models.audio2pose import audio2poseLSTM
from scipy.io import wavfile
import python_speech_features
import pyworld
import config
import json
from scipy.interpolate import interp1d

def inter_pitch(y,y_flag):
    frame_num = y.shape[0]
    i = 0
    last = -1
    while(i<frame_num):
        if y_flag[i] == 0:
            while True:
                if y_flag[i]==0:
                    if i == frame_num-1:
                        if last !=-1:
                            y[last+1:] = y[last]
                        i+=1
                        break
                    i+=1
                else:
                    break
            if i >= frame_num:
                break
            elif last == -1:
                y[:i] = y[i]
            else:
                inter_num = i-last+1
                fy = np.array([y[last],y[i]])
                fx = np.linspace(0, 1, num=2)
                f = interp1d(fx,fy)
                fx_new = np.linspace(0,1,inter_num)
                fy_new = f(fx_new)
                y[last+1:i] = fy_new[1:-1]
                last = i
                i+=1

        else:
            last = i
            i+=1
    return y


def load_ckpt(checkpoint_path, generator = None, kp_detector = None, ph2kp = None):
    checkpoint = torch.load(checkpoint_path)
    if ph2kp is not None:
        ph2kp.load_state_dict(checkpoint['ph2kp'])
    if generator is not None:
        generator.load_state_dict(checkpoint['generator'])
    if kp_detector is not None:
        kp_detector.load_state_dict(checkpoint['kp_detector'])

def get_img_pose(img_path):
    processor = config.OPENFACE_POSE_EXTRACTOR_PATH

    tmp_dir = "samples/tmp_dir"
    os.makedirs((tmp_dir),exist_ok=True)
    subprocess.call([processor, "-f", img_path, "-out_dir", tmp_dir, "-pose"])

    img_file = os.path.basename(img_path)[:-4]+".csv"
    csv_file = os.path.join(tmp_dir,img_file)
    pos_data = pandas.read_csv(csv_file)
    i = 0
    pose = [pos_data["pose_Rx"][i], pos_data["pose_Ry"][i], pos_data["pose_Rz"][i],pos_data["pose_Tx"][i], pos_data["pose_Ty"][i], pos_data["pose_Tz"][i]]
    # pose = [pose]
    pose = np.array(pose,dtype=np.float32)
    return pose

def read_img(path):
    img = io.imread(path)[:,:,:3]
    img = cv2.resize(img, (256, 256))
    # img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    img = np.array(img_as_float32(img))
    img = img.transpose((2, 0, 1))
    img = torch.from_numpy(img).unsqueeze(0)
    return img


def parse_phoneme_file(phoneme_path,use_index = True):
    with open(phoneme_path,'r') as f:
        result_text = json.load(f)
    frame_num = int(result_text[-1]['phones'][-1]['ed']/100*25)
    phoneset_list = []
    index = 0

    word_len = len(result_text)
    word_index = 0
    phone_index = 0
    cur_phone_list = result_text[0]["phones"]
    phone_len = len(cur_phone_list)
    cur_end = cur_phone_list[0]["ed"]

    phone_list = []

    phoneset_list.append(cur_phone_list[0]["ph"])
    i = 0
    while i < frame_num:
        if i * 4 < cur_end:
            phone_list.append(cur_phone_list[phone_index]["ph"])
            i += 1
        else:
            phone_index += 1
            if phone_index >= phone_len:
                word_index += 1
                if word_index >= word_len:
                    phone_list.append(cur_phone_list[-1]["ph"])
                    i += 1
                else:
                    phone_index = 0
                    cur_phone_list = result_text[word_index]["phones"]
                    phone_len = len(cur_phone_list)
                    cur_end = cur_phone_list[phone_index]["ed"]
                    phoneset_list.append(cur_phone_list[phone_index]["ph"])
                    index += 1
            else:
                # print(word_index,phone_index)
                cur_end = cur_phone_list[phone_index]["ed"]
                phoneset_list.append(cur_phone_list[phone_index]["ph"])
                index += 1

    with open("phindex.json") as f:
        ph2index = json.load(f)
    if use_index:
        phone_list = [ph2index[p] for p in phone_list]
    saves = {"phone_list": phone_list}

    return saves

def get_audio_feature_from_audio(audio_path):
    sample_rate, audio = wavfile.read(audio_path)
    if len(audio.shape) == 2:
        if np.min(audio[:, 0]) <= 0:
            audio = audio[:, 1]
        else:
            audio = audio[:, 0]

    audio = audio - np.mean(audio)
    audio = audio / np.max(np.abs(audio))
    a = python_speech_features.mfcc(audio, sample_rate)
    b = python_speech_features.logfbank(audio, sample_rate)
    c, _ = pyworld.harvest(audio, sample_rate, frame_period=10)
    c_flag = (c == 0.0) ^ 1
    c = inter_pitch(c, c_flag)
    c = np.expand_dims(c, axis=1)
    c_flag = np.expand_dims(c_flag, axis=1)
    frame_num = np.min([a.shape[0], b.shape[0], c.shape[0]])

    cat = np.concatenate([a[:frame_num], b[:frame_num], c[:frame_num], c_flag[:frame_num]], axis=1)
    return cat

def get_pose_from_audio(img,audio,audio2pose):

    num_frame = len(audio) // 4

    minv = np.array([-0.6, -0.6, -0.6, -128.0, -128.0, 128.0], dtype=np.float32)
    maxv = np.array([0.6, 0.6, 0.6, 128.0, 128.0, 384.0], dtype=np.float32)
    generator = audio2poseLSTM().cuda().eval()

    ckpt_para = torch.load(audio2pose)

    generator.load_state_dict(ckpt_para["generator"])
    generator.eval()


    audio_seq = []
    for i in range(num_frame):
        audio_seq.append(audio[i*4:i*4+4])

    audio = torch.from_numpy(np.array(audio_seq,dtype=np.float32)).unsqueeze(0).cuda()

    x = {}
    x ["img"] = img
    x["audio"] = audio
    poses = generator(x)

    poses = poses.cpu().data.numpy()[0]
    poses = (poses+1)/2*(maxv-minv)+minv

    return poses