mainmainminavoiceclone

Running

App Files Files Community

Uniaff committed on Oct 7, 2024

Commit

486b9c4

verified ·

1 Parent(s): cbaf094

Update app.py

Browse files

Files changed (1) hide show

app.py +182 -46

app.py CHANGED Viewed

@@ -1,46 +1,141 @@
-import gradio as gr
-import subprocess
 import os
-def generate(video, audio, checkpoint, no_smooth, resize_factor, pad_top, pad_bottom, pad_left, pad_right, save_as_video):
-    if video is None or audio is None or checkpoint is None:
-        return "Пожалуйста, загрузите видео/изображение и аудио файл, а также выберите чекпойнт."
-    print(f"Текущая рабочая директория: {os.getcwd()}")
-    print(f"Содержимое текущей директории: {os.listdir('.')}")
-    print(f"Проверка наличия 'inference.py': {os.path.exists('inference.py')}")
-    video_path = video
-    audio_path = audio
-    print(f"Путь к видео: {video_path}")
-    print(f"Путь к аудио: {audio_path}")
     output_dir = "outputs"
     os.makedirs(output_dir, exist_ok=True)
     output_path = os.path.join(output_dir, "output.mp4")
-    print(f"Путь к выходному файлу: {output_path}")
     args = [
-    "--checkpoint_path", f"checkpoints/{checkpoint}.pth",
-    "--segmentation_path", "checkpoints/face_segmentation.pth",
-    "--no_seg",
-    "--no_sr",
-    "--face", video_path,
-    "--audio", audio_path,
-    "--outfile", output_path,
-    "--resize_factor", "2",
-    "--face_det_batch_size", "4",
-    "--wav2lip_batch_size", "64",
-    "--fps", "30",
-    "--pads", str(pad_top), str(pad_bottom), str(pad_left), str(pad_right)
     ]
     if no_smooth:
         args.append("--nosmooth")
     if save_as_video:
         args.append("--save_as_video")
@@ -59,31 +154,72 @@ def generate(video, audio, checkpoint, no_smooth, resize_factor, pad_top, pad_bo
     print(f"Выходной файл создан по пути: {output_path}")
     return output_path
-with gr.Blocks() as ui:
-    gr.Markdown("## Lypsinc")
     with gr.Row():
-        video = gr.File(label="Видео или Изображение", type="filepath")
-        audio = gr.File(label="Аудио", type="filepath")
         with gr.Column():
-            checkpoint = gr.Radio(["wav2lip", "wav2lip_gan"], label="Чекпойнт", value="wav2lip_gan", visible=False)
-            no_smooth = gr.Checkbox(label="Без сглаживания", value=False)
-            resize_factor = gr.Slider(minimum=1, maximum=4, step=1, label="Фактор изменения размера", value=2)
-    with gr.Row():
         with gr.Column():
             pad_top = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Отступ сверху")
             pad_bottom = gr.Slider(minimum=0, maximum=50, step=1, value=10, label="Отступ снизу")
             pad_left = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Отступ слева")
             pad_right = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Отступ справа")
-            save_as_video = gr.Checkbox(label="Сохранять как видео", value=True)
-            generate_btn = gr.Button("Сгенерировать")
-        with gr.Column():
-            result = gr.Video(label="Результат")
-    generate_btn.click(
-        generate,
-        inputs=[video, audio, checkpoint, no_smooth, resize_factor, pad_top, pad_bottom, pad_left, pad_right, save_as_video],
-        outputs=result,
-        concurrency_limit=30
     )
-ui.launch(debug=True)

 import os
+import subprocess
+import sys
+import uuid
+import gradio as gr
+from pydub import AudioSegment
+from TTS.api import TTS
+# Initialize the TTS model once at module import; this multilingual XTTS v2
+# instance is the one synthesize_speech() calls for every request.
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", )
+# tts.to("cuda")
+# Language options: UI label -> language code. process_speech() passes these
+# codes to synthesize_speech(), which forwards them as `language=` to `tts`.
+language_options = {
+    "English (en)": "en",
+    "Spanish (es)": "es",
+    "French (fr)": "fr",
+    "German (de)": "de",
+    "Italian (it)": "it",
+    "Portuguese (pt)": "pt",
+    "Polish (pl)": "pl",
+    "Turkish (tr)": "tr",
+    "Russian (ru)": "ru",
+    "Dutch (nl)": "nl",
+    "Czech (cs)": "cs",
+    "Arabic (ar)": "ar",
+    "Chinese (zh-cn)": "zh-cn",
+    "Japanese (ja)": "ja",
+    "Hungarian (hu)": "hu",
+    "Korean (ko)": "ko",
+    "Hindi (hi)": "hi"
+}
+# Additional languages: UI label -> code that synthesize_and_convert_voice()
+# interpolates into the "tts_models/<code>/fairseq/vits" model name.
+other_language = {
+    "Vietnamese": "vie",
+    "Serbian": "srp",
+    "Romanian": "ron",
+    "Indonesian": "ind",
+    "Philippine": "tgl"
+}
+def clean_audio(audio_path):
+    """Filter and trim a speaker recording with ffmpeg (best effort).
+
+    Runs ffmpeg with a low-pass (8000) / high-pass (75) filter followed by
+    silence removal applied to both ends of the clip (via ``areverse``), and
+    writes the result to a uniquely named WAV file under ``output/``.
+
+    Args:
+        audio_path: Path of the audio file to clean.
+
+    Returns:
+        The path of the cleaned file, or ``audio_path`` unchanged when the
+        output directory cannot be created, ffmpeg cannot be started, or
+        ffmpeg exits with a non-zero status.
+    """
+    out_filename = f"output/cleaned_{uuid.uuid4()}.wav"
+    lowpass_highpass = "lowpass=8000,highpass=75,"
+    trim_silence = "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
+    # Build argv as a list rather than splitting a formatted string, so an
+    # input path that contains whitespace stays a single argument.
+    command = [
+        "ffmpeg", "-y",
+        "-i", str(audio_path),
+        "-af", f"{lowpass_highpass}{trim_silence}",
+        out_filename,
+    ]
+    try:
+        # ffmpeg does not create missing directories; without this the very
+        # first call fails and the function silently skips the cleaning.
+        os.makedirs("output", exist_ok=True)
+        subprocess.run(command, capture_output=True, check=True)
+    except (subprocess.CalledProcessError, OSError) as e:
+        # OSError covers a missing ffmpeg binary; keep the best-effort
+        # contract and hand back the original recording.
+        print(f"Error during audio cleaning: {e}")
+        return audio_path
+    print(f"Audio cleaned and saved to {out_filename}")
+    return out_filename
+def check_audio_length(audio_path, max_duration=120):
+    """Report whether an audio file is no longer than ``max_duration`` seconds.
+
+    Args:
+        audio_path: Path of the audio file to inspect.
+        max_duration: Maximum accepted duration in seconds.
+
+    Returns:
+        True when the duration does not exceed ``max_duration``; False when
+        it does, or when the file could not be loaded or measured (the error
+        is printed, not raised).
+    """
+    try:
+        seconds = AudioSegment.from_file(audio_path).duration_seconds
+        too_long = seconds > max_duration
+        if too_long:
+            print(f"Audio is too long: {seconds} seconds. Max allowed is {max_duration} seconds.")
+        return not too_long
+    except Exception as e:
+        print(f"Error while checking audio length: {e}")
+        return False
+def synthesize_and_convert_voice(text, language_iso, voice_audio_path, speed):
+    """Synthesize ``text`` with a per-language VITS model, then convert the voice.
+
+    Loads ``tts_models/<language_iso>/fairseq/vits`` to generate speech and
+    the FreeVC voice-conversion model to map it onto the voice found in
+    ``voice_audio_path``. Both models are loaded on every call.
+
+    Args:
+        text: Text to speak.
+        language_iso: Language code inserted into the VITS model name.
+        voice_audio_path: Path of the target-voice recording.
+        speed: Speed value forwarded to the synthesis call.
+
+    Returns:
+        The fixed path ``output/docout.wav``; each call overwrites it.
+    """
+    vits_model = TTS(model_name=f"tts_models/{language_iso}/fairseq/vits", )
+    synthesized_wav = vits_model.tts(text, speed=speed)
+    voice_converter = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False)
+    converted_path = "output/docout.wav"
+    os.makedirs("output", exist_ok=True)
+    voice_converter.voice_conversion_to_file(
+        synthesized_wav,
+        target_wav=voice_audio_path,
+        file_path=converted_path,
+    )
+    # Hand back the path of the generated audio.
+    return converted_path
+def synthesize_speech(text, speaker_wav_path, language_iso, speed):
+    """Synthesize ``text`` in the speaker's voice, then refine it with FreeVC.
+
+    The module-level ``tts`` model writes an intermediate file
+    (``output/undocout.wav``); the FreeVC voice-conversion model then
+    converts that file towards ``speaker_wav_path`` and writes the final
+    result. The conversion model is loaded on every call.
+
+    Args:
+        text: Text to speak.
+        speaker_wav_path: Path of the reference speaker recording.
+        language_iso: Language code forwarded to the model as ``language``.
+        speed: Speed value forwarded to the synthesis call.
+
+    Returns:
+        The fixed path ``output/docout.wav``; each call overwrites it.
+    """
+    # Create the output directory before the first write. It used to be
+    # created only after the intermediate file had already been written
+    # into it, which fails when "output/" does not exist yet.
+    os.makedirs("output", exist_ok=True)
+    output_file_xtts = "output/undocout.wav"
+    tts.tts_to_file(text=text, file_path=output_file_xtts, speed=speed, speaker_wav=speaker_wav_path,
+                    language=language_iso)
+    tts_conversion = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False)
+    output_file = "output/docout.wav"
+    tts_conversion.voice_conversion_to_file(output_file_xtts, target_wav=speaker_wav_path,
+                                            file_path=output_file)
+    return output_file  # Path of the generated audio
+def get_language_code(selected_language):
+    """Map a UI language label to its language code.
+
+    ``language_options`` is consulted first, then ``other_language``.
+
+    Args:
+        selected_language: Label chosen in the language dropdown.
+
+    Returns:
+        The matching code, or None when the label is in neither table.
+    """
+    for table in (language_options, other_language):
+        if selected_language in table:
+            return table[selected_language]
+    return None
+def process_speech(text, speaker_wav, selected_language, speed):
+    """Validate the inputs and generate speech in the cloned voice.
+
+    Languages listed in ``other_language`` go through
+    synthesize_and_convert_voice(); every other supported language goes
+    through synthesize_speech(). The speaker recording is passed through
+    clean_audio() first.
+
+    Args:
+        text: Text to speak.
+        speaker_wav: Path of the uploaded speaker recording.
+        selected_language: Label chosen in the language dropdown.
+        speed: Synthesis speed.
+
+    Returns:
+        Path of the generated audio file.
+
+    Raises:
+        ValueError: If the selected language is in neither language table.
+        gr.Error: If the text or the speaker recording is missing, or if
+            check_audio_length() rejects the recording.
+    """
+    language_code = get_language_code(selected_language)
+    if language_code is None:
+        raise ValueError("Выбранный язык не поддерживается.")
+    # Reject missing inputs explicitly. Without these guards a missing
+    # recording made check_audio_length() return False, so the user was told
+    # the audio was too long, and empty text reached the model unchecked.
+    if not text or not text.strip():
+        raise gr.Error("Введите текст для генерации.", duration=5)
+    if not speaker_wav:
+        raise gr.Error("Загрузите аудио говорящего.", duration=5)
+    # Audio length check
+    if not check_audio_length(speaker_wav):
+        error_message = "Длина аудио превышает допустимый лимит в 2 минуты."
+        raise gr.Error(error_message, duration=5)
+    cleaned_wav_path = clean_audio(speaker_wav)
+    if selected_language in other_language:
+        return synthesize_and_convert_voice(text, language_code, cleaned_wav_path, speed)
+    return synthesize_speech(text, cleaned_wav_path, language_code, speed)
+def generate_lipsync(video_path, audio_path, pad_top, pad_bottom, pad_left, pad_right, no_smooth, save_as_video):
     output_dir = "outputs"
     os.makedirs(output_dir, exist_ok=True)
     output_path = os.path.join(output_dir, "output.mp4")
     args = [
+        "--checkpoint_path", "checkpoints/wav2lip_gan.pth",
+        "--segmentation_path", "checkpoints/face_segmentation.pth",
+        "--no_seg",
+        "--no_sr",
+        "--face", video_path,
+        "--audio", audio_path,
+        "--outfile", output_path,
+        "--resize_factor", "2",
+        "--face_det_batch_size", "4",
+        "--wav2lip_batch_size", "64",
+        "--fps", "30",
+        "--pads", str(pad_top), str(pad_bottom), str(pad_left), str(pad_right)
     ]
     if no_smooth:
         args.append("--nosmooth")
     if save_as_video:
         args.append("--save_as_video")
     print(f"Выходной файл создан по пути: {output_path}")
     return output_path
+def process_all(text, speaker_wav, selected_language, speed, video, pad_top, pad_bottom, pad_left, pad_right, no_smooth, save_as_video):
+    """Run the full pipeline: cloned-voice audio first, then lip-sync video.
+
+    Args:
+        text, speaker_wav, selected_language, speed: Forwarded to
+            process_speech().
+        video: Passed to generate_lipsync() as the video path
+            (assumed to be a file path, as in the original comment).
+        pad_top, pad_bottom, pad_left, pad_right, no_smooth, save_as_video:
+            Forwarded to generate_lipsync().
+
+    Returns:
+        Whatever generate_lipsync() returns.
+    """
+    # Step 1: generate the audio with the cloned voice.
+    cloned_audio = process_speech(text, speaker_wav, selected_language, speed)
+    # Step 2: generate the lip-synced video from that audio.
+    return generate_lipsync(
+        video,
+        cloned_audio,
+        pad_top,
+        pad_bottom,
+        pad_left,
+        pad_right,
+        no_smooth,
+        save_as_video,
+    )
+# Build the Gradio UI: one row with two columns (speech settings and lip-sync
+# settings), followed by the output video and a row with the action buttons.
+with gr.Blocks() as demo:
+    gr.Markdown("# Объединение Voice Clone и Lipsync")
     with gr.Row():
         with gr.Column():
+            gr.Markdown("### Шаг 1: Настройки синтеза речи")
+            text_input = gr.Textbox(label="Введите текст для генерации", placeholder="Введите ваш текст здесь...")
+            speaker_wav_input = gr.Audio(label="Загрузите аудио говорящего (WAV формат)", type="filepath")
+            # The dropdown offers the labels of both language tables.
+            all_languages = list(language_options.keys()) + list(other_language.keys())
+            language_input = gr.Dropdown(
+                label="Язык",
+                choices=all_languages,
+                value="English (en)"
+            )
+            speed_input = gr.Slider(
+                label="Скорость синтеза",
+                minimum=0.1,
+                maximum=10,
+                step=0.1,
+                value=1.0,
+                info="Выберите скорость"
+            )
         with gr.Column():
+            gr.Markdown("### Шаг 2: Настройки липсинка")
+            video_input = gr.File(label="Видео или Изображение", type="filepath")
             pad_top = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Отступ сверху")
             pad_bottom = gr.Slider(minimum=0, maximum=50, step=1, value=10, label="Отступ снизу")
             pad_left = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Отступ слева")
             pad_right = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Отступ справа")
+            no_smooth = gr.Checkbox(label="Без сглаживания", value=False)
+            save_as_video = gr.Checkbox(label="Сохранять как видео", value=True)
+    output_video = gr.Video(label="Сгенерированное видео")
+    with gr.Row():
+        generate_button = gr.Button("Сгенерировать")
+        # Fixed-width empty div placed between the two buttons.
+        gr.HTML("<div style='width:300px;'></div>")
+        reload_button = gr.Button("Перезапустить")
+    # The inputs are listed in the same order as process_all()'s parameters.
+    generate_button.click(
+        fn=process_all,
+        inputs=[text_input, speaker_wav_input, language_input, speed_input, video_input, pad_top, pad_bottom, pad_left, pad_right, no_smooth, save_as_video],
+        outputs=output_video
+    )
+    # os._exit(0) terminates the Python process immediately.
+    # NOTE(review): presumably the hosting environment restarts the app
+    # afterwards; any visitor can trigger this button — confirm that is intended.
+    reload_button.click(fn=lambda: os._exit(0), inputs=None, outputs=None)
+def launch_gradio():
+    """Launch the ``demo`` Gradio app on host 0.0.0.0, port 8600, with debug enabled."""
+    demo.launch(
+        debug=True,
+        server_port=8600,
+        server_name="0.0.0.0",
     )
+# Start the server only when this file is executed as a script.
+if __name__ == "__main__":
+    launch_gradio()