TTS_MMS_VITS-VOICECLONE

Running

App Files Community

VIZINTZOR committed on Jan 21

Commit

9bb2fdd

verified ·

1 Parent(s): c99090c

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -22

app.py CHANGED Viewed

@@ -11,15 +11,10 @@ from pathlib import Path
 output_dir = './openvoice_outputs'
 os.makedirs(output_dir, exist_ok=True)
-# Function to get model names from a directory
-def get_model_names(model_dir):
-    model_paths = Path(model_dir).glob('*')
-    return [model_path.name for model_path in model_paths if model_path.is_dir()]
-def generate_speech(text, model_path):
-    synthesiser = pipeline("text-to-speech", model_path, device=0 if torch.cuda.is_available() else -1)
     speech = synthesiser(text)
     # Resample to 48kHz if needed
     if speech["sampling_rate"] != 48000:
         resampled_audio = scipy.signal.resample(speech["audio"][0], int(len(speech["audio"][0]) * 48000 / speech["sampling_rate"]))
@@ -27,7 +22,7 @@ def generate_speech(text, model_path):
     else:
         resampled_audio = speech["audio"][0]
         sampling_rate = speech["sampling_rate"]
     return sampling_rate, resampled_audio
 def save_audio(sampling_rate, audio_data, filename="output.wav"):
@@ -40,7 +35,7 @@ def voice_cloning(base_speaker, reference_speaker, model_version, device_choice,
         ckpt_converter = f'./OPENVOICE_MODELS/{model_version}'
         device = "cuda:0" if device_choice == "GPU" and torch.cuda.is_available() else "cpu"
         print(f"Device: {device}")
         # Load the ToneColorConverter
         tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
         tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
@@ -48,10 +43,10 @@ def voice_cloning(base_speaker, reference_speaker, model_version, device_choice,
         # Extract speaker embeddings
         source_se, _ = se_extractor.get_se(base_speaker, tone_color_converter, vad=vad_select)
         target_se, _ = se_extractor.get_se(reference_speaker, tone_color_converter, vad=vad_select)
         # Define output file paths
         save_path = f'{output_dir}/output_cloned.wav'
         # Perform tone color conversion
         tone_color_converter.convert(
             audio_src_path=base_speaker,
@@ -63,11 +58,10 @@ def voice_cloning(base_speaker, reference_speaker, model_version, device_choice,
     except Exception as e:
         return None, f"Error: {str(e)}"
-def ui_fn(text, model_dir, model_name, clone, reference_speaker, model_version, device_choice, vad_select):
-    model_path = os.path.join(model_dir, model_name)
-    sampling_rate, audio_data = generate_speech(text, model_path)
     audio_file = save_audio(sampling_rate, audio_data)
     if clone:
         cloned_audio_file, status = voice_cloning(audio_file, reference_speaker, model_version, device_choice, vad_select)
         return cloned_audio_file, status
@@ -75,15 +69,11 @@ def ui_fn(text, model_dir, model_name, clone, reference_speaker, model_version,
         return audio_file, "Speech generation successful!"
 if __name__ == "__main__":
-    #model_dir = "./models_mms"
-    #model_names = get_model_names(model_dir)
     iface = gr.Interface(
         fn=ui_fn,
         inputs=[
             gr.Textbox(label="Text to Synthesize"),
-            gr.Textbox(label="Model Path or Id", value="VIZINTZOR/MMS-TTS-THAI-MALE-NARRATOR"),
-            #gr.Dropdown(model_names, label="Model"),
             gr.Checkbox(label="Clone Voice", value=False),
             gr.Audio(label="Reference Speaker (Target Voice)", type="filepath"),
             gr.Dropdown(["v1", "v2"], value="v2", label="Model Version"),
@@ -95,6 +85,6 @@ if __name__ == "__main__":
             gr.Textbox(label="Status", interactive=False)
         ],
         title="Text-to-Speech Synthesizer with Voice Cloning",
-        description="Enter text and model path to generate speech. Optionally, clone the voice using a reference speaker."
     )
     iface.launch()

 output_dir = './openvoice_outputs'
 os.makedirs(output_dir, exist_ok=True)
+def generate_speech(text, model_id):
+    synthesiser = pipeline("text-to-speech", model=model_id, device=0 if torch.cuda.is_available() else -1)
     speech = synthesiser(text)
     # Resample to 48kHz if needed
     if speech["sampling_rate"] != 48000:
         resampled_audio = scipy.signal.resample(speech["audio"][0], int(len(speech["audio"][0]) * 48000 / speech["sampling_rate"]))
     else:
         resampled_audio = speech["audio"][0]
         sampling_rate = speech["sampling_rate"]
     return sampling_rate, resampled_audio
 def save_audio(sampling_rate, audio_data, filename="output.wav"):
         ckpt_converter = f'./OPENVOICE_MODELS/{model_version}'
         device = "cuda:0" if device_choice == "GPU" and torch.cuda.is_available() else "cpu"
         print(f"Device: {device}")
         # Load the ToneColorConverter
         tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
         tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
         # Extract speaker embeddings
         source_se, _ = se_extractor.get_se(base_speaker, tone_color_converter, vad=vad_select)
         target_se, _ = se_extractor.get_se(reference_speaker, tone_color_converter, vad=vad_select)
         # Define output file paths
         save_path = f'{output_dir}/output_cloned.wav'
         # Perform tone color conversion
         tone_color_converter.convert(
             audio_src_path=base_speaker,
     except Exception as e:
         return None, f"Error: {str(e)}"
+def ui_fn(text, model_id, clone, reference_speaker, model_version, device_choice, vad_select):
+    sampling_rate, audio_data = generate_speech(text, model_id)
     audio_file = save_audio(sampling_rate, audio_data)
     if clone:
         cloned_audio_file, status = voice_cloning(audio_file, reference_speaker, model_version, device_choice, vad_select)
         return cloned_audio_file, status
         return audio_file, "Speech generation successful!"
 if __name__ == "__main__":
     iface = gr.Interface(
         fn=ui_fn,
         inputs=[
             gr.Textbox(label="Text to Synthesize"),
+            gr.Textbox(label="Model ID", value="VIZINTZOR/MMS-TTS-THAI-MALE-NARRATOR"),
             gr.Checkbox(label="Clone Voice", value=False),
             gr.Audio(label="Reference Speaker (Target Voice)", type="filepath"),
             gr.Dropdown(["v1", "v2"], value="v2", label="Model Version"),
             gr.Textbox(label="Status", interactive=False)
         ],
         title="Text-to-Speech Synthesizer with Voice Cloning",
+        description="Enter text and model ID to generate speech. Optionally, clone the voice using a reference speaker."
     )
     iface.launch()