File size: 2,800 Bytes
d9d25fe
97e8796
 
 
3ec929e
769c7b4
c79816c
aa3c3a8
d9d25fe
97e8796
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3402d0b
568d66f
97e8796
 
 
 
3402d0b
97e8796
3402d0b
 
 
97e8796
3402d0b
 
 
97e8796
3402d0b
97e8796
3402d0b
97e8796
 
568d66f
3402d0b
97e8796
 
3402d0b
97e8796
 
3402d0b
 
 
97e8796
3402d0b
 
 
 
d9d25fe
97e8796
3402d0b
 
e682a2e
3402d0b
 
 
 
 
e682a2e
3402d0b
 
 
 
 
e682a2e
97e8796
 
3402d0b
9dd7b34
3402d0b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import gradio as gr
from pathlib import Path
import argparse

from STT.sst import speech_to_text
from LLM.llm import generate_reply
from TTS_X.tts import generate_voice
from FantasyTalking.infer import load_models, main

# Default inference settings, used both to load the models once and as the
# base configuration for every per-request video generation.
_DEFAULTS = {
    "wan_model_dir": "./models/Wan2.1-I2V-14B-720P",
    "fantasytalking_model_path": "./models/fantasytalking_model.ckpt",
    "wav2vec_model_dir": "./models/wav2vec2-base-960h",
    # Per-call fields; overwritten for each generation request.
    "image_path": "",
    "audio_path": "",
    "prompt": "",
    "output_dir": "./output",
    # Sampling / rendering knobs.
    "image_size": 512,
    "audio_scale": 1.0,
    "prompt_cfg_scale": 5.0,
    "audio_cfg_scale": 5.0,
    "max_num_frames": 81,
    "inference_steps": 20,
    "fps": 23,
    "num_persistent_param_in_dit": None,
    "seed": 1111,
}
args_template = argparse.Namespace(**_DEFAULTS)

# Load the heavyweight models exactly once at import time; every request
# reuses these handles instead of reloading from disk.
pipe, fantasytalking, wav2vec_processor, wav2vec = load_models(args_template)

def generate_video(image_path, audio_path, prompt, output_dir="./output"):
    """Render a talking-head video from an image and a driving audio file.

    Overlays the per-call values (image/audio paths, prompt, output dir)
    on top of ``args_template`` and delegates to ``FantasyTalking``'s
    ``main`` with the models that were loaded at import time.

    Returns whatever ``main`` returns (the generated video's path).
    """
    # BUG FIX: the previous `argparse.Namespace(**vars(args_template),
    # image_path=..., ...)` raised TypeError ("got multiple values for
    # keyword argument") because the template already contains these keys.
    # Merge dicts first so the per-call values cleanly override the defaults.
    overrides = {
        "image_path": image_path,
        "audio_path": audio_path,
        "prompt": prompt,
        "output_dir": output_dir,
    }
    args = argparse.Namespace(**{**vars(args_template), **overrides})
    return main(args, pipe, fantasytalking, wav2vec_processor, wav2vec)


def full_pipeline(user_audio, user_image):
    """End-to-end pipeline: user voice + portrait -> lip-synced video.

    Steps: transcribe the uploaded audio, generate an LLM reply, speak the
    reply with TTS, then animate the portrait with the spoken reply.

    Returns a 4-tuple: (transcript, reply text, reply audio path, video path).
    """
    # 1. Speech-to-text on the user's recording.
    transcript = speech_to_text(user_audio)

    # 2. LLM generates the assistant's textual reply.
    assistant_reply = generate_reply(transcript)

    # 3. Text-to-speech renders the reply as audio.
    spoken_reply_path = generate_voice(assistant_reply)

    # 4. Animate the portrait with the reply audio (ensure output dir exists).
    Path("./output").mkdir(parents=True, exist_ok=True)
    rendered_video_path = generate_video(
        image_path=user_image,
        audio_path=spoken_reply_path,
        prompt=assistant_reply,
    )

    return transcript, assistant_reply, spoken_reply_path, rendered_video_path


# Gradio interface: the user uploads a voice clip and a portrait image and
# receives the transcript, the assistant's reply (text + audio), and the
# generated talking-head video.
with gr.Blocks(title="๐Ÿง  ุตูˆุชูƒ ูŠุญุฑูƒ ุตูˆุฑุฉ!") as demo:
    gr.Markdown("## ๐ŸŽคโžก๏ธ๐Ÿ’ฌโžก๏ธ๐Ÿ”Šโžก๏ธ๐Ÿ“ฝ๏ธ ู…ู† ุตูˆุชูƒ ุฅู„ู‰ ููŠุฏูŠูˆ ู…ุชูƒู„ู…!")

    with gr.Row():
        with gr.Column():
            # Inputs: type="filepath" hands full_pipeline file paths, not arrays.
            audio_input = gr.Audio(label="๐ŸŽ™๏ธ ุงุฑูุน ุตูˆุชูƒ", type="filepath")
            image_input = gr.Image(label="๐Ÿ–ผ๏ธ ุตูˆุฑุฉ ุงู„ู…ุชุญุฏุซ", type="filepath")
            btn = gr.Button("๐ŸŽฌ ุดุบู„")

        with gr.Column():
            # Outputs: one widget per element of full_pipeline's 4-tuple,
            # in the same order as its return value.
            user_text = gr.Textbox(label="๐Ÿ“ ุงู„ู†ุต ุงู„ู…ุณู…ูˆุน")
            reply_text = gr.Textbox(label="๐Ÿค– ุฑุฏ ุงู„ู…ุณุงุนุฏ")
            reply_audio = gr.Audio(label="๐Ÿ”Š ุงู„ุฑุฏ ุงู„ู…ู†ุทูˆู‚")
            video_output = gr.Video(label="๐Ÿ“ฝ๏ธ ุงู„ููŠุฏูŠูˆ ุงู„ู†ุงุชุฌ")

    btn.click(fn=full_pipeline,
              inputs=[audio_input, image_input],
              outputs=[user_text, reply_text, reply_audio, video_output])

# share=True publishes a temporary public URL; inbrowser opens a local tab.
demo.launch(inbrowser=True, share=True)