Spaces:

emirhanbilgic
/

Text-to-speech-Turkish

Running

App Files Community

emirhanbilgic committed on Aug 29, 2024

Commit

1610722

verified ·

1 Parent(s): 29a7123

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -52

app.py CHANGED Viewed

@@ -1,25 +1,42 @@
-import os
-import re
-import torch
 import gradio as gr
 from datasets import load_dataset
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 import soundfile as sf
-from speechbrain.pretrained import EncoderClassifier
 import spaces
 device = "cuda" if torch.cuda.is_available() else "cpu"
 replacements = [
-    ("â", "a"),
-    ("ç", "ch"),
-    ("ğ", "gh"),
-    ("ı", "i"),
-    ("î", "i"),
-    ("ö", "oe"),
-    ("ş", "sh"),
-    ("ü", "ue"),
-    ("û", "u"),
 ]
 number_words = {
@@ -54,61 +71,44 @@ def replace_numbers_with_words(text):
     def replace(match):
         number = int(match.group())
         return number_to_words(number)
-    return re.sub(r'\b\d+\b', replace, text)
-def cleanup_text(text):
-    for old, new in replacements:
-        text = text.replace(old, new)
-    return text
 def normalize_text(text):
     text = replace_numbers_with_words(text)
-    text = cleanup_text(text)
     return text
-def load_models_and_data():
-    model_name = "microsoft/speecht5_tts"
-    processor = SpeechT5Processor.from_pretrained(model_name)
-    model = SpeechT5ForTextToSpeech.from_pretrained("emirhanbilgic/speecht5_finetuned_emirhan_tr").to(device)
-    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
-    speaker_model = EncoderClassifier.from_hparams(
-        source="speechbrain/spkrec-xvect-voxceleb",
-        run_opts={"device": device},
-        savedir=os.path.join("/tmp", "speechbrain/spkrec-xvect-voxceleb"),
-    )
-    return model, processor, vocoder, speaker_model
-model, processor, vocoder, speaker_model = load_models_and_data()
-def create_speaker_embedding(waveform):
-    with torch.no_grad():
-        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform).unsqueeze(0))
-        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
-        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
-    return speaker_embeddings
 @spaces.GPU(duration = 60)
-def text_to_speech(text, waveform):
-    final_text = normalize_text(text)
     speaker_embeddings = create_speaker_embedding(waveform)
-    speaker_embeddings = torch.tensor(speaker_embeddings).unsqueeze(0).to(device)
-    inputs = processor(text=final_text, return_tensors="pt").to(device)
     speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
     sf.write("output.wav", speech.cpu().numpy(), samplerate=16000)
-    return "output.wav"
 iface = gr.Interface(
     fn=text_to_speech,
     inputs=[
         gr.Textbox(label="Enter Turkish text to convert to speech"),
-        gr.Audio(type="numpy", label="Upload Speaker Audio"),  # Updated this line
     ],
-    outputs=gr.Audio(label="Generated Speech"),
-    title="Turkish SpeechT5 Text-to-Speech Demo with Custom Speaker Embeddings",
-    description="Enter Turkish text and upload an audio file to generate speech using the fine-tuned SpeechT5 model with custom speaker embeddings. The text is normalized with custom replacements and number-to-word conversions."
 )
-iface.launch()

 import gradio as gr
+import torch
 from datasets import load_dataset
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 import soundfile as sf
 import spaces
+import os
+from speechbrain.pretrained import EncoderClassifier
+import re
 device = "cuda" if torch.cuda.is_available() else "cpu"
+def load_models_and_data():
+    model_name = "microsoft/speecht5_tts"
+    processor = SpeechT5Processor.from_pretrained(model_name)
+    model = SpeechT5ForTextToSpeech.from_pretrained("emirhanbilgic/speecht5_finetuned_emirhan_tr").to(device)
+    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+    spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
+    speaker_model = EncoderClassifier.from_hparams(
+        source=spk_model_name,
+        run_opts={"device": device},
+        savedir=os.path.join("/tmp", spk_model_name),
+    )
+    return model, processor, vocoder, speaker_model
+model, processor, vocoder, speaker_model = load_models_and_data()
+def create_speaker_embedding(waveform):
+    with torch.no_grad():
+        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform).unsqueeze(0).to(device))
+        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+        speaker_embeddings = speaker_embeddings.squeeze()
+    return speaker_embeddings
 replacements = [
+    ("â", "a"), ("ç", "ch"), ("ğ", "gh"), ("ı", "i"), ("î", "i"),
+    ("ö", "oe"), ("ş", "sh"), ("ü", "ue"), ("û", "u"),
 ]
 number_words = {
     def replace(match):
         number = int(match.group())
         return number_to_words(number)
+    return re.sub(r'\b\d+\b', replace, text)
 def normalize_text(text):
+    text = text.lower()
     text = replace_numbers_with_words(text)
+    for old, new in replacements:
+        text = text.replace(old, new)
     return text
 @spaces.GPU(duration = 60)
+def text_to_speech(text, audio_file):
+    normalized_text = normalize_text(text)
+    inputs = processor(text=normalized_text, return_tensors="pt").to(device)
+    waveform, sample_rate = sf.read(audio_file)
+    if len(waveform.shape) > 1:
+        waveform = waveform[:, 0]  # Take the first channel if stereo
+    if sample_rate != 16000:
+        print("Warning: The model expects 16kHz sampling rate")
     speaker_embeddings = create_speaker_embedding(waveform)
     speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
     sf.write("output.wav", speech.cpu().numpy(), samplerate=16000)
+    return "output.wav", normalized_text
 iface = gr.Interface(
     fn=text_to_speech,
     inputs=[
         gr.Textbox(label="Enter Turkish text to convert to speech"),
+        gr.Audio(label="Upload a short audio file of the target speaker", type="filepath")
+    ],
+    outputs=[
+        gr.Audio(label="Generated Speech"),
+        gr.Textbox(label="Normalized Text")
     ],
+    title="Turkish SpeechT5 Text-to-Speech Demo with Custom Speaker",
+    description="Enter Turkish text, upload a short audio file of the target speaker, and listen to the generated speech using the fine-tuned SpeechT5 model. The text will be normalized for better pronunciation."
 )
+iface.launch()