File size: 4,090 Bytes
7cedd07
e75e1c6
5223b6a
fa13218
97e8796
 
5223b6a
7cedd07
fa13218
 
 
 
 
 
 
 
 
 
 
 
 
da8f7f9
 
 
7cedd07
f71a8b3
5223b6a
1ed541f
da8f7f9
3ec929e
769c7b4
c79816c
aa3c3a8
d9d25fe
fa13218
97e8796
da8f7f9
 
fa13218
97e8796
 
 
 
fa13218
97e8796
fa13218
 
 
 
 
97e8796
 
3402d0b
568d66f
da8f7f9
ce79c62
97e8796
ce79c62
97e8796
fa13218
97e8796
93d986f
 
 
 
 
 
 
 
 
 
 
 
 
 
fa13218
3402d0b
fa13218
3402d0b
5223b6a
3402d0b
97e8796
5223b6a
568d66f
3402d0b
5223b6a
97e8796
3402d0b
5223b6a
97e8796
3402d0b
 
 
97e8796
3402d0b
 
 
 
da8f7f9
3402d0b
 
e682a2e
3402d0b
 
 
 
 
e682a2e
3402d0b
 
 
 
 
e682a2e
97e8796
 
3402d0b
9dd7b34
3402d0b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import subprocess
import os
import sys
import shutil
from pathlib import Path
import argparse
import gradio as gr

# Cleanup first: only the temporary/scratch folders are removed at startup.
folders_to_delete = ["./output", "./__pycache__", "./.cache", "./temp"]
for folder in folders_to_delete:
    # Nothing to remove for paths that are not present.
    if not os.path.exists(folder):
        continue
    print(f"๐Ÿ—‘๏ธ ุญุฐู {folder}")
    shutil.rmtree(folder)

# Print memory status: log system RAM usage once at startup.
import psutil
mem = psutil.virtual_memory()
# used / total are divided by 1e9 and labelled GB in the log line.
print(f"๐Ÿ” RAM ุงู„ู…ุณุชุฎุฏู…ุฉ: {mem.used / 1e9:.2f} GB / {mem.total / 1e9:.2f} GB")

# Download the models if the FantasyTalking checkpoint is not present yet.
# The helper script is run with sys.executable so it uses the same
# interpreter (and installed packages) as this process; a bare "python" may
# resolve to a different interpreter or be missing from PATH.
# check=True makes a failed download raise CalledProcessError right here
# instead of silently continuing without the model files.
if not os.path.exists("./models/fantasytalking_model.ckpt"):
    print("๐Ÿ› ๏ธ ุฌุงุฑูŠ ุชุญู…ูŠู„ ุงู„ู†ู…ุงุฐุฌ ุนุจุฑ download_models.py ...")
    subprocess.run([sys.executable, "download_models.py"], check=True)

# Set up paths: add the absolute path of the current working directory to
# sys.path before the imports below (presumably so the STT / LLM / TTS_X /
# FantasyTalking packages resolve when launched from the project root —
# TODO confirm).
sys.path.append(os.path.abspath("."))

# Import the pipeline components used by full_pipeline / generate_video.
from STT.sst import speech_to_text
from LLM.llm import generate_reply
from TTS_X.tts import generate_voice
from FantasyTalking.infer import load_models, main

# Model constants: the default inference settings, stored as an
# argparse.Namespace. It is passed to load_models() at startup and copied by
# generate_video(), which overrides image_path / audio_path / prompt /
# output_dir per request.
args_template = argparse.Namespace(
    **{
        # Model locations on disk.
        "fantasytalking_model_path": "./models/fantasytalking_model.ckpt",
        "wav2vec_model_dir": "./models/wav2vec2-base-960h",
        "wan_model_dir": "./models/Wan2.1-I2V-14B-720P",
        # Per-request fields, left empty in the template.
        "image_path": "",
        "audio_path": "",
        "prompt": "",
        "output_dir": "./output",
        # Generation settings forwarded unchanged to the inference code.
        "image_size": 512,
        "audio_scale": 1.0,
        "prompt_cfg_scale": 5.0,
        "audio_cfg_scale": 5.0,
        "max_num_frames": 81,
        "inference_steps": 20,
        "fps": 23,
        "num_persistent_param_in_dit": None,
        "seed": 1111,
    }
)

# Load the models once at import time. The four returned objects are kept as
# module-level globals and handed to main() by generate_video on every call.
print("๐Ÿš€ ุฌุงุฑูŠ ุชุญู…ูŠู„ FantasyTalking ูˆ Wav2Vec...")
pipe, fantasytalking, wav2vec_processor, wav2vec = load_models(args_template)
print("โœ… ุชู… ุงู„ุชุญู…ูŠู„!")

# Generate a video
def generate_video(image_path, audio_path, prompt, output_dir="./output"):
    """Run the FantasyTalking ``main`` entry point for one request.

    A new ``argparse.Namespace`` is built from the module-level
    ``args_template`` with the four per-request fields replaced; the template
    itself is not modified.

    Args:
        image_path: Value stored as ``args.image_path``.
        audio_path: Value stored as ``args.audio_path``.
        prompt: Value stored as ``args.prompt``.
        output_dir: Value stored as ``args.output_dir``.

    Returns:
        Whatever ``main`` returns (full_pipeline forwards it to the video
        output component).
    """
    # Per-request values that take precedence over the template defaults.
    overrides = {
        "image_path": image_path,
        "audio_path": audio_path,
        "prompt": prompt,
        "output_dir": output_dir,
    }

    # Merge into a fresh dict so args_template stays untouched, then turn the
    # result back into a Namespace.
    args = argparse.Namespace(**{**vars(args_template), **overrides})

    return main(args, pipe, fantasytalking, wav2vec_processor, wav2vec)

# Full pipeline
def full_pipeline(user_audio, user_image):
    """Run speech -> text -> reply -> speech -> video for one button click.

    Args:
        user_audio: File path from the ``gr.Audio(type="filepath")`` input;
            empty/None when the user has not provided audio.
        user_image: File path from the ``gr.Image(type="filepath")`` input;
            empty/None when the user has not provided an image.

    Returns:
        Tuple ``(user_text, reply, reply_audio_path, video_path)``, in the
        same order as the output components wired to the button.

    Raises:
        gr.Error: If the audio or the image input is missing, so the UI shows
            a readable message instead of a failure from deep inside the
            speech-to-text or video code.
    """
    # Fail early on missing inputs before any model is invoked.
    if not user_audio:
        raise gr.Error("Please provide an audio recording before running.")
    if not user_image:
        raise gr.Error("Please provide a speaker image before running.")

    print("๐ŸŽค ุชุญูˆูŠู„ ุงู„ุตูˆุช ุฅู„ู‰ ู†ุต...")
    user_text = speech_to_text(user_audio)

    print("๐Ÿ’ฌ ุชูˆู„ูŠุฏ ุงู„ุฑุฏ...")
    reply = generate_reply(user_text)

    print("๐Ÿ”Š ุชุญูˆูŠู„ ุงู„ุฑุฏ ุฅู„ู‰ ุตูˆุช...")
    reply_audio_path = generate_voice(reply)

    print("๐Ÿ“ฝ๏ธ ุชูˆู„ูŠุฏ ุงู„ููŠุฏูŠูˆ...")
    # ./output is deleted by the startup cleanup, so make sure it exists
    # again before the video is generated into it.
    Path("./output").mkdir(parents=True, exist_ok=True)
    video_path = generate_video(
        image_path=user_image,
        audio_path=reply_audio_path,
        prompt=reply
    )

    return user_text, reply, reply_audio_path, video_path

# Gradio interface: one row with an input column (audio, image, run button)
# and an output column (transcript, reply text, reply audio, video).
# Components are created inside nested context managers, so their creation
# order here defines the page layout.
with gr.Blocks(title="๐Ÿง  ุตูˆุชูƒ ูŠุญุฑูƒ ุตูˆุฑุฉ!") as demo:
    gr.Markdown("## ๐ŸŽคโžก๏ธ๐Ÿ’ฌโžก๏ธ๐Ÿ”Šโžก๏ธ๐Ÿ“ฝ๏ธ ู…ู† ุตูˆุชูƒ ุฅู„ู‰ ููŠุฏูŠูˆ ู…ุชูƒู„ู…!")

    with gr.Row():
        # Left column: user inputs. Both are delivered to the callback as
        # file paths (type="filepath").
        with gr.Column():
            audio_input = gr.Audio(label="๐ŸŽ™๏ธ ุงุฑูุน ุตูˆุชูƒ", type="filepath")
            image_input = gr.Image(label="๐Ÿ–ผ๏ธ ุตูˆุฑุฉ ุงู„ู…ุชุญุฏุซ", type="filepath")
            btn = gr.Button("๐ŸŽฌ ุดุบู„")

        # Right column: results, filled from full_pipeline's return tuple.
        with gr.Column():
            user_text = gr.Textbox(label="๐Ÿ“ ุงู„ู†ุต ุงู„ู…ุณู…ูˆุน")
            reply_text = gr.Textbox(label="๐Ÿค– ุฑุฏ ุงู„ู…ุณุงุนุฏ")
            reply_audio = gr.Audio(label="๐Ÿ”Š ุงู„ุฑุฏ ุงู„ู…ู†ุทูˆู‚")
            video_output = gr.Video(label="๐Ÿ“ฝ๏ธ ุงู„ููŠุฏูŠูˆ ุงู„ู†ุงุชุฌ")

    # The outputs list order matches the tuple returned by full_pipeline:
    # (user_text, reply, reply_audio_path, video_path).
    btn.click(fn=full_pipeline,
              inputs=[audio_input, image_input],
              outputs=[user_text, reply_text, reply_audio, video_output])

# Start the app at import time. Per Gradio's launch() parameters, inbrowser
# asks for a browser tab to be opened and share asks for a public share link.
demo.launch(inbrowser=True, share=True)