VIZINTZOR committed on
Commit
9bb2fdd
·
verified ·
1 Parent(s): c99090c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -22
app.py CHANGED
@@ -11,15 +11,10 @@ from pathlib import Path
11
  output_dir = './openvoice_outputs'
12
  os.makedirs(output_dir, exist_ok=True)
13
 
14
- # Function to get model names from a directory
15
- def get_model_names(model_dir):
16
- model_paths = Path(model_dir).glob('*')
17
- return [model_path.name for model_path in model_paths if model_path.is_dir()]
18
-
19
- def generate_speech(text, model_path):
20
- synthesiser = pipeline("text-to-speech", model_path, device=0 if torch.cuda.is_available() else -1)
21
  speech = synthesiser(text)
22
-
23
  # Resample to 48kHz if needed
24
  if speech["sampling_rate"] != 48000:
25
  resampled_audio = scipy.signal.resample(speech["audio"][0], int(len(speech["audio"][0]) * 48000 / speech["sampling_rate"]))
@@ -27,7 +22,7 @@ def generate_speech(text, model_path):
27
  else:
28
  resampled_audio = speech["audio"][0]
29
  sampling_rate = speech["sampling_rate"]
30
-
31
  return sampling_rate, resampled_audio
32
 
33
  def save_audio(sampling_rate, audio_data, filename="output.wav"):
@@ -40,7 +35,7 @@ def voice_cloning(base_speaker, reference_speaker, model_version, device_choice,
40
  ckpt_converter = f'./OPENVOICE_MODELS/{model_version}'
41
  device = "cuda:0" if device_choice == "GPU" and torch.cuda.is_available() else "cpu"
42
  print(f"Device: {device}")
43
-
44
  # Load the ToneColorConverter
45
  tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
46
  tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
@@ -48,10 +43,10 @@ def voice_cloning(base_speaker, reference_speaker, model_version, device_choice,
48
  # Extract speaker embeddings
49
  source_se, _ = se_extractor.get_se(base_speaker, tone_color_converter, vad=vad_select)
50
  target_se, _ = se_extractor.get_se(reference_speaker, tone_color_converter, vad=vad_select)
51
-
52
  # Define output file paths
53
  save_path = f'{output_dir}/output_cloned.wav'
54
-
55
  # Perform tone color conversion
56
  tone_color_converter.convert(
57
  audio_src_path=base_speaker,
@@ -63,11 +58,10 @@ def voice_cloning(base_speaker, reference_speaker, model_version, device_choice,
63
  except Exception as e:
64
  return None, f"Error: {str(e)}"
65
 
66
- def ui_fn(text, model_dir, model_name, clone, reference_speaker, model_version, device_choice, vad_select):
67
- model_path = os.path.join(model_dir, model_name)
68
- sampling_rate, audio_data = generate_speech(text, model_path)
69
  audio_file = save_audio(sampling_rate, audio_data)
70
-
71
  if clone:
72
  cloned_audio_file, status = voice_cloning(audio_file, reference_speaker, model_version, device_choice, vad_select)
73
  return cloned_audio_file, status
@@ -75,15 +69,11 @@ def ui_fn(text, model_dir, model_name, clone, reference_speaker, model_version,
75
  return audio_file, "Speech generation successful!"
76
 
77
  if __name__ == "__main__":
78
- #model_dir = "./models_mms"
79
- #model_names = get_model_names(model_dir)
80
-
81
  iface = gr.Interface(
82
  fn=ui_fn,
83
  inputs=[
84
  gr.Textbox(label="Text to Synthesize"),
85
- gr.Textbox(label="Model Path or Id", value="VIZINTZOR/MMS-TTS-THAI-MALE-NARRATOR"),
86
- #gr.Dropdown(model_names, label="Model"),
87
  gr.Checkbox(label="Clone Voice", value=False),
88
  gr.Audio(label="Reference Speaker (Target Voice)", type="filepath"),
89
  gr.Dropdown(["v1", "v2"], value="v2", label="Model Version"),
@@ -95,6 +85,6 @@ if __name__ == "__main__":
95
  gr.Textbox(label="Status", interactive=False)
96
  ],
97
  title="Text-to-Speech Synthesizer with Voice Cloning",
98
- description="Enter text and model path to generate speech. Optionally, clone the voice using a reference speaker."
99
  )
100
  iface.launch()
 
11
  output_dir = './openvoice_outputs'
12
  os.makedirs(output_dir, exist_ok=True)
13
 
14
+ def generate_speech(text, model_id):
15
+ synthesiser = pipeline("text-to-speech", model=model_id, device=0 if torch.cuda.is_available() else -1)
 
 
 
 
 
16
  speech = synthesiser(text)
17
+
18
  # Resample to 48kHz if needed
19
  if speech["sampling_rate"] != 48000:
20
  resampled_audio = scipy.signal.resample(speech["audio"][0], int(len(speech["audio"][0]) * 48000 / speech["sampling_rate"]))
 
22
  else:
23
  resampled_audio = speech["audio"][0]
24
  sampling_rate = speech["sampling_rate"]
25
+
26
  return sampling_rate, resampled_audio
27
 
28
  def save_audio(sampling_rate, audio_data, filename="output.wav"):
 
35
  ckpt_converter = f'./OPENVOICE_MODELS/{model_version}'
36
  device = "cuda:0" if device_choice == "GPU" and torch.cuda.is_available() else "cpu"
37
  print(f"Device: {device}")
38
+
39
  # Load the ToneColorConverter
40
  tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
41
  tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
 
43
  # Extract speaker embeddings
44
  source_se, _ = se_extractor.get_se(base_speaker, tone_color_converter, vad=vad_select)
45
  target_se, _ = se_extractor.get_se(reference_speaker, tone_color_converter, vad=vad_select)
46
+
47
  # Define output file paths
48
  save_path = f'{output_dir}/output_cloned.wav'
49
+
50
  # Perform tone color conversion
51
  tone_color_converter.convert(
52
  audio_src_path=base_speaker,
 
58
  except Exception as e:
59
  return None, f"Error: {str(e)}"
60
 
61
+ def ui_fn(text, model_id, clone, reference_speaker, model_version, device_choice, vad_select):
62
+ sampling_rate, audio_data = generate_speech(text, model_id)
 
63
  audio_file = save_audio(sampling_rate, audio_data)
64
+
65
  if clone:
66
  cloned_audio_file, status = voice_cloning(audio_file, reference_speaker, model_version, device_choice, vad_select)
67
  return cloned_audio_file, status
 
69
  return audio_file, "Speech generation successful!"
70
 
71
  if __name__ == "__main__":
 
 
 
72
  iface = gr.Interface(
73
  fn=ui_fn,
74
  inputs=[
75
  gr.Textbox(label="Text to Synthesize"),
76
+ gr.Textbox(label="Model ID", value="VIZINTZOR/MMS-TTS-THAI-MALE-NARRATOR"),
 
77
  gr.Checkbox(label="Clone Voice", value=False),
78
  gr.Audio(label="Reference Speaker (Target Voice)", type="filepath"),
79
  gr.Dropdown(["v1", "v2"], value="v2", label="Model Version"),
 
85
  gr.Textbox(label="Status", interactive=False)
86
  ],
87
  title="Text-to-Speech Synthesizer with Voice Cloning",
88
+ description="Enter text and model ID to generate speech. Optionally, clone the voice using a reference speaker."
89
  )
90
  iface.launch()