wedyanessam's picture
Update app.py
97e8796 verified
raw
history blame
2.8 kB
import gradio as gr
from pathlib import Path
import argparse
from STT.sst import speech_to_text
from LLM.llm import generate_reply
from TTS_X.tts import generate_voice
from FantasyTalking.infer import load_models, main
# ุซุงุจุชุงุช ุชุญู…ูŠู„ ุงู„ู†ู…ูˆุฐุฌ
args_template = argparse.Namespace(
wan_model_dir="./models/Wan2.1-I2V-14B-720P",
fantasytalking_model_path="./models/fantasytalking_model.ckpt",
wav2vec_model_dir="./models/wav2vec2-base-960h",
image_path="",
audio_path="",
prompt="",
output_dir="./output",
image_size=512,
audio_scale=1.0,
prompt_cfg_scale=5.0,
audio_cfg_scale=5.0,
max_num_frames=81,
inference_steps=20,
fps=23,
num_persistent_param_in_dit=None,
seed=1111
)
# ุชุญู…ูŠู„ ุงู„ู†ู…ุงุฐุฌ ู…ุฑุฉ ูˆุญุฏุฉ ูู‚ุท
pipe, fantasytalking, wav2vec_processor, wav2vec = load_models(args_template)
def generate_video(image_path, audio_path, prompt, output_dir="./output"):
    """Run FantasyTalking inference for one (image, audio, prompt) request.

    Copies the module-level ``args_template`` and overrides the per-request
    fields, then delegates to ``main`` with the models preloaded at import.

    Args:
        image_path: Path to the speaker image.
        audio_path: Path to the driving audio clip.
        prompt: Text prompt guiding the generation.
        output_dir: Directory where the resulting video is written.

    Returns:
        Whatever ``main`` returns (presumably the output video path —
        confirm against FantasyTalking.infer).
    """
    # Merge the overrides into a copy of the template's dict BEFORE building
    # the Namespace. The original code did
    #   argparse.Namespace(**vars(args_template), image_path=..., ...)
    # which passes image_path/audio_path/prompt/output_dir twice (they are
    # already keys of vars(args_template)) and raises
    # "TypeError: got multiple values for keyword argument 'image_path'".
    overrides = {
        "image_path": image_path,
        "audio_path": audio_path,
        "prompt": prompt,
        "output_dir": output_dir,
    }
    args = argparse.Namespace(**{**vars(args_template), **overrides})
    return main(args, pipe, fantasytalking, wav2vec_processor, wav2vec)
def full_pipeline(user_audio, user_image):
    """Voice-in, talking-video-out pipeline.

    Steps: (1) transcribe the user's audio, (2) generate an assistant reply
    with the LLM, (3) synthesize the reply as speech, (4) animate the
    uploaded image with that speech.

    Returns:
        Tuple of (transcript, reply text, reply audio path, video path).
    """
    # 1. Speech to text.
    transcript = speech_to_text(user_audio)

    # 2. LLM reply.
    assistant_reply = generate_reply(transcript)

    # 3. Reply text to speech.
    spoken_reply_path = generate_voice(assistant_reply)

    # 4. Image + speech to talking-head video.
    Path("./output").mkdir(parents=True, exist_ok=True)
    result_video = generate_video(
        image_path=user_image,
        audio_path=spoken_reply_path,
        prompt=assistant_reply,
    )

    return transcript, assistant_reply, spoken_reply_path, result_video
# ูˆุงุฌู‡ุฉ Gradio
with gr.Blocks(title="๐Ÿง  ุตูˆุชูƒ ูŠุญุฑูƒ ุตูˆุฑุฉ!") as demo:
gr.Markdown("## ๐ŸŽคโžก๏ธ๐Ÿ’ฌโžก๏ธ๐Ÿ”Šโžก๏ธ๐Ÿ“ฝ๏ธ ู…ู† ุตูˆุชูƒ ุฅู„ู‰ ููŠุฏูŠูˆ ู…ุชูƒู„ู…!")
with gr.Row():
with gr.Column():
audio_input = gr.Audio(label="๐ŸŽ™๏ธ ุงุฑูุน ุตูˆุชูƒ", type="filepath")
image_input = gr.Image(label="๐Ÿ–ผ๏ธ ุตูˆุฑุฉ ุงู„ู…ุชุญุฏุซ", type="filepath")
btn = gr.Button("๐ŸŽฌ ุดุบู„")
with gr.Column():
user_text = gr.Textbox(label="๐Ÿ“ ุงู„ู†ุต ุงู„ู…ุณู…ูˆุน")
reply_text = gr.Textbox(label="๐Ÿค– ุฑุฏ ุงู„ู…ุณุงุนุฏ")
reply_audio = gr.Audio(label="๐Ÿ”Š ุงู„ุฑุฏ ุงู„ู…ู†ุทูˆู‚")
video_output = gr.Video(label="๐Ÿ“ฝ๏ธ ุงู„ููŠุฏูŠูˆ ุงู„ู†ุงุชุฌ")
btn.click(fn=full_pipeline,
inputs=[audio_input, image_input],
outputs=[user_text, reply_text, reply_audio, video_output])
demo.launch(inbrowser=True, share=True)