VICTORZGITHUP committed on
Commit
37835db
·
1 Parent(s): b75dd68

Add application file

Browse files
Files changed (1) hide show
  1. app.py +100 -0
app.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import gradio as gr
4
+ from openvoice import se_extractor
5
+ from openvoice.api import ToneColorConverter
6
+ from transformers import pipeline
7
+ import scipy
8
+ from pathlib import Path
9
+
10
# Voice-cloning results are written into this folder; create it up front so
# later writes never fail on a missing directory.
output_dir = './openvoice_outputs'
Path(output_dir).mkdir(parents=True, exist_ok=True)
13
+
14
+ # Function to get model names from a directory
15
def get_model_names(model_dir):
    """Return the names of the immediate sub-directories of ``model_dir``.

    Every sub-directory is treated as one model; plain files are ignored.
    The names are sorted so the result does not depend on the order in
    which the filesystem happens to enumerate entries.

    Args:
        model_dir: Directory to scan (``str`` or ``pathlib.Path``).

    Returns:
        A sorted list of sub-directory names (empty when there are none).
    """
    return sorted(
        entry.name for entry in Path(model_dir).glob('*') if entry.is_dir()
    )
18
+
19
def generate_speech(text, model_path):
    """Synthesize ``text`` with a text-to-speech pipeline and return 48 kHz audio.

    A ``transformers`` "text-to-speech" pipeline is built for ``model_path``
    (on GPU 0 when CUDA is available, otherwise on CPU) and run on ``text``.
    If the pipeline's sampling rate is not 48000 Hz the waveform is resampled
    to 48000 Hz.

    Args:
        text: Text to synthesize.
        model_path: Local path or model id handed to ``pipeline``.

    Returns:
        A ``(sampling_rate, audio)`` tuple; ``sampling_rate`` is 48000.
    """
    target_rate = 48000

    device = 0 if torch.cuda.is_available() else -1
    synthesiser = pipeline("text-to-speech", model_path, device=device)
    speech = synthesiser(text)

    # Only the first entry of the pipeline's audio output is used
    # (presumably the single channel of a (channels, samples) array -- TODO confirm).
    waveform = speech["audio"][0]
    source_rate = speech["sampling_rate"]

    if source_rate == target_rate:
        return source_rate, waveform

    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee ``scipy.signal`` is loaded on every SciPy version.
    from scipy.signal import resample

    num_samples = int(len(waveform) * target_rate / source_rate)
    return target_rate, resample(waveform, num_samples)
32
+
33
def save_audio(sampling_rate, audio_data, filename="output.wav"):
    """Write ``audio_data`` to a WAV file and return the file's path.

    Args:
        sampling_rate: Sample rate in Hz stored in the WAV header.
        audio_data: Array of samples accepted by ``scipy.io.wavfile.write``.
        filename: Destination path; defaults to ``output.wav`` in the
            current working directory.

    Returns:
        ``filename``, unchanged, so the caller can pass it on directly.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee ``scipy.io.wavfile`` is loaded on every SciPy version.
    from scipy.io import wavfile

    wavfile.write(filename, rate=sampling_rate, data=audio_data)
    return filename
36
+
37
def voice_cloning(base_speaker, reference_speaker, model_version, device_choice, vad_select):
    """Convert the tone colour of ``base_speaker`` to match ``reference_speaker``.

    Loads the ``ToneColorConverter`` checkpoint for ``model_version`` from
    ``./OPENVOICE_MODELS/<model_version>``, extracts a speaker embedding from
    both audio files and writes the converted audio to
    ``<output_dir>/output_cloned.wav``.

    Args:
        base_speaker: Path of the audio file whose voice is converted.
        reference_speaker: Path of the audio file providing the target voice.
        model_version: Sub-directory name under ``./OPENVOICE_MODELS``.
        device_choice: ``"GPU"`` to use ``cuda:0`` when CUDA is available;
            any other value selects the CPU.
        vad_select: Forwarded as ``vad=`` to ``se_extractor.get_se``.

    Returns:
        ``(output_path, status_message)`` on success, or
        ``(None, "Error: ...")`` when an input is missing or any step fails.
        Exceptions are reported through the status message instead of being
        raised.
    """
    # Fail fast with a readable message when an audio input is missing,
    # e.g. when cloning is requested without a reference file.
    if not base_speaker or not reference_speaker:
        return None, "Error: both a source audio file and a reference speaker audio file are required."

    try:
        # Determine paths and device
        ckpt_converter = f'./OPENVOICE_MODELS/{model_version}'
        use_gpu = device_choice == "GPU" and torch.cuda.is_available()
        device = "cuda:0" if use_gpu else "cpu"
        print(f"Device: {device}")

        # Load the ToneColorConverter
        tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
        tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

        # Extract speaker embeddings (the second return value is not needed)
        source_se, _ = se_extractor.get_se(base_speaker, tone_color_converter, vad=vad_select)
        target_se, _ = se_extractor.get_se(reference_speaker, tone_color_converter, vad=vad_select)

        # Define output file path
        save_path = f'{output_dir}/output_cloned.wav'

        # Perform tone color conversion
        tone_color_converter.convert(
            audio_src_path=base_speaker,
            src_se=source_se,
            tgt_se=target_se,
            output_path=save_path,
        )
        return save_path, "Voice cloning successful!"
    except Exception as e:
        # UI boundary: surface the failure as a status string.
        return None, f"Error: {str(e)}"
65
+
66
def ui_fn(text, model_dir, model_name, clone, reference_speaker, model_version, device_choice, vad_select):
    """Synthesize speech for ``text`` and optionally clone a reference voice.

    Args:
        text: Text to synthesize.
        model_dir: Directory containing the model; when empty or ``None``,
            ``model_name`` is used as the complete model path or id.
        model_name: Model name inside ``model_dir`` (or the full path/id).
        clone: When true, the synthesized audio is passed to ``voice_cloning``.
        reference_speaker: Path of the target-voice audio used for cloning.
        model_version: Forwarded to ``voice_cloning``.
        device_choice: Forwarded to ``voice_cloning``.
        vad_select: Forwarded to ``voice_cloning``.

    Returns:
        ``(audio_path, status_message)``; ``audio_path`` is ``None`` when
        synthesis or cloning failed and the message then starts with
        ``"Error:"``.
    """
    # An empty model_dir means model_name is already a complete path or id.
    model_path = os.path.join(model_dir, model_name) if model_dir else model_name

    # Report synthesis failures through the status output, the same way
    # voice_cloning reports its own failures.
    try:
        sampling_rate, audio_data = generate_speech(text, model_path)
        audio_file = save_audio(sampling_rate, audio_data)
    except Exception as e:
        return None, f"Error: {str(e)}"

    if not clone:
        return audio_file, "Speech generation successful!"

    cloned_audio_file, status = voice_cloning(
        audio_file, reference_speaker, model_version, device_choice, vad_select
    )
    return cloned_audio_file, status
76
+
77
if __name__ == "__main__":
    def _run_from_ui(text, model_path, clone, reference_speaker, model_version, device_choice, vad_select):
        """Adapt the seven UI inputs to the eight-parameter ``ui_fn``.

        The interface exposes a single "Model Path or Id" field, while
        ``ui_fn`` expects a directory plus a model name. Passing an empty
        directory makes ``ui_fn`` use ``model_path`` as given.
        """
        return ui_fn(text, "", model_path, clone, reference_speaker, model_version, device_choice, vad_select)

    # NOTE: for a local model folder, get_model_names(model_dir) could feed a
    # gr.Dropdown here instead of the free-text model field.
    iface = gr.Interface(
        fn=_run_from_ui,
        # Order must match the parameters of _run_from_ui.
        inputs=[
            gr.Textbox(label="Text to Synthesize"),
            gr.Textbox(label="Model Path or Id", value="VIZINTZOR/MMS-TTS-THAI-MALE-NARRATOR"),
            gr.Checkbox(label="Clone Voice", value=False),
            gr.Audio(label="Reference Speaker (Target Voice)", type="filepath"),
            gr.Dropdown(["v1", "v2"], value="v2", label="Model Version"),
            gr.Dropdown(["CPU", "GPU"], value="GPU" if torch.cuda.is_available() else "CPU", label="Device"),
            gr.Checkbox(value=False, label="VAD", interactive=True),
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Textbox(label="Status", interactive=False),
        ],
        title="Text-to-Speech Synthesizer with Voice Cloning",
        description="Enter text and model path to generate speech. Optionally, clone the voice using a reference speaker.",
    )
    iface.launch()