emirhanbilgic committed
Commit 1610722 · verified · 1 Parent(s): 29a7123

Update app.py

Files changed (1)
  1. app.py +52 -52
app.py CHANGED
@@ -1,25 +1,42 @@
- import os
- import re
- import torch
  import gradio as gr
+ import torch
  from datasets import load_dataset
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
  import soundfile as sf
- from speechbrain.pretrained import EncoderClassifier
  import spaces
+ import os
+ from speechbrain.pretrained import EncoderClassifier
+ import re

  device = "cuda" if torch.cuda.is_available() else "cpu"

+ def load_models_and_data():
+     model_name = "microsoft/speecht5_tts"
+     processor = SpeechT5Processor.from_pretrained(model_name)
+     model = SpeechT5ForTextToSpeech.from_pretrained("emirhanbilgic/speecht5_finetuned_emirhan_tr").to(device)
+     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+
+     spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
+     speaker_model = EncoderClassifier.from_hparams(
+         source=spk_model_name,
+         run_opts={"device": device},
+         savedir=os.path.join("/tmp", spk_model_name),
+     )
+
+     return model, processor, vocoder, speaker_model
+
+ model, processor, vocoder, speaker_model = load_models_and_data()
+
+ def create_speaker_embedding(waveform):
+     with torch.no_grad():
+         speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform).unsqueeze(0).to(device))
+         speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+         speaker_embeddings = speaker_embeddings.squeeze()
+     return speaker_embeddings
+
  replacements = [
-     ("â", "a"),
-     ("ç", "ch"),
-     ("ğ", "gh"),
-     ("ı", "i"),
-     ("î", "i"),
-     ("ö", "oe"),
-     ("ş", "sh"),
-     ("ü", "ue"),
-     ("û", "u"),
+     ("â", "a"), ("ç", "ch"), ("ğ", "gh"), ("ı", "i"), ("î", "i"),
+     ("ö", "oe"), ("ş", "sh"), ("ü", "ue"), ("û", "u"),
  ]

  number_words = {
@@ -54,61 +71,44 @@ def replace_numbers_with_words(text):
      def replace(match):
          number = int(match.group())
          return number_to_words(number)
-     return re.sub(r'\b\d+\b', replace, text)

- def cleanup_text(text):
-     for old, new in replacements:
-         text = text.replace(old, new)
-     return text
+     return re.sub(r'\b\d+\b', replace, text)

  def normalize_text(text):
+     text = text.lower()
      text = replace_numbers_with_words(text)
-     text = cleanup_text(text)
+     for old, new in replacements:
+         text = text.replace(old, new)
      return text

- def load_models_and_data():
-     model_name = "microsoft/speecht5_tts"
-     processor = SpeechT5Processor.from_pretrained(model_name)
-     model = SpeechT5ForTextToSpeech.from_pretrained("emirhanbilgic/speecht5_finetuned_emirhan_tr").to(device)
-     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
-
-     speaker_model = EncoderClassifier.from_hparams(
-         source="speechbrain/spkrec-xvect-voxceleb",
-         run_opts={"device": device},
-         savedir=os.path.join("/tmp", "speechbrain/spkrec-xvect-voxceleb"),
-     )
-
-     return model, processor, vocoder, speaker_model
-
- model, processor, vocoder, speaker_model = load_models_and_data()
-
- def create_speaker_embedding(waveform):
-     with torch.no_grad():
-         speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform).unsqueeze(0))
-         speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
-         speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
-     return speaker_embeddings
-
  @spaces.GPU(duration = 60)
- def text_to_speech(text, waveform):
-     final_text = normalize_text(text)
+ def text_to_speech(text, audio_file):
+     normalized_text = normalize_text(text)
+     inputs = processor(text=normalized_text, return_tensors="pt").to(device)
+
+     waveform, sample_rate = sf.read(audio_file)
+     if len(waveform.shape) > 1:
+         waveform = waveform[:, 0] # Take the first channel if stereo
+     if sample_rate != 16000:
+         print("Warning: The model expects 16kHz sampling rate")
      speaker_embeddings = create_speaker_embedding(waveform)
-     speaker_embeddings = torch.tensor(speaker_embeddings).unsqueeze(0).to(device)

-     inputs = processor(text=final_text, return_tensors="pt").to(device)
      speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
      sf.write("output.wav", speech.cpu().numpy(), samplerate=16000)
-     return "output.wav"
+     return "output.wav", normalized_text

  iface = gr.Interface(
      fn=text_to_speech,
      inputs=[
          gr.Textbox(label="Enter Turkish text to convert to speech"),
-         gr.Audio(type="numpy", label="Upload Speaker Audio"), # Updated this line
+         gr.Audio(label="Upload a short audio file of the target speaker", type="filepath")
+     ],
+     outputs=[
+         gr.Audio(label="Generated Speech"),
+         gr.Textbox(label="Normalized Text")
      ],
-     outputs=gr.Audio(label="Generated Speech"),
-     title="Turkish SpeechT5 Text-to-Speech Demo with Custom Speaker Embeddings",
-     description="Enter Turkish text and upload an audio file to generate speech using the fine-tuned SpeechT5 model with custom speaker embeddings. The text is normalized with custom replacements and number-to-word conversions."
+     title="Turkish SpeechT5 Text-to-Speech Demo with Custom Speaker",
+     description="Enter Turkish text, upload a short audio file of the target speaker, and listen to the generated speech using the fine-tuned SpeechT5 model. The text will be normalized for better pronunciation."
  )

- iface.launch()
+ iface.launch()
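
The new text_to_speech(text, audio_file) takes a file path instead of a numpy tuple, keeps only the first channel of stereo input, and only prints a warning when the clip is not sampled at 16 kHz. Below is a minimal local smoke-test sketch, not part of this commit: it assumes the code runs in the same process as app.py (so model, processor, vocoder and speaker_model are already loaded), that torchaudio is available for resampling, and that the file names speaker.wav and speaker_16k.wav are hypothetical placeholders.

import soundfile as sf
import torch
import torchaudio

ref_path = "speaker.wav"  # hypothetical reference recording of the target speaker
wav, sr = sf.read(ref_path)
if wav.ndim > 1:
    wav = wav[:, 0]  # same first-channel choice as text_to_speech()
if sr != 16000:
    # app.py only prints a warning here; resampling up front keeps the speaker encoder at 16 kHz
    wav = torchaudio.functional.resample(torch.from_numpy(wav).float(), sr, 16000).numpy()
    ref_path = "speaker_16k.wav"  # hypothetical temporary file
    sf.write(ref_path, wav, 16000)

# Returns the generated wav path plus the normalized text shown in the second output box
out_path, normalized = text_to_speech("Merhaba, bugün hava 23 derece.", ref_path)
print(normalized)  # lowercased, digits spelled out, Turkish characters mapped via `replacements`
print(out_path)    # "output.wav", written at 16 kHz

Since the committed code flags a sampling-rate mismatch but does not correct it, resampling before the call (or uploading a 16 kHz recording in the Gradio UI) is the conservative way to use the demo.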