Spaces:

emirhanbilgic
/

Text-to-speech-Turkish

Runtime error

App Files Files Community

emirhanbilgic committed on Aug 29, 2024

Commit

05020c4

verified ·

1 Parent(s): ff597d6

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -9

app.py CHANGED Viewed

@@ -1,8 +1,10 @@
 import gradio as gr
 import torch
 from datasets import load_dataset
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 import soundfile as sf
 import spaces
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -13,15 +15,28 @@ def load_models_and_data():
     model = SpeechT5ForTextToSpeech.from_pretrained("emirhanbilgic/speecht5_finetuned_emirhan_tr").to(device)
     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
-    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
-    return model, processor, vocoder, speaker_embeddings
-model, processor, vocoder, speaker_embeddings = load_models_and_data()
 @spaces.GPU(duration = 60)
-def text_to_speech(text):
     inputs = processor(text=text, return_tensors="pt").to(device)
     speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
     sf.write("output.wav", speech.cpu().numpy(), samplerate=16000)
@@ -29,10 +44,13 @@ def text_to_speech(text):
 iface = gr.Interface(
     fn=text_to_speech,
-    inputs=gr.Textbox(label="Enter Turkish text to convert to speech"),
     outputs=gr.Audio(label="Generated Speech"),
-    title="Turkish SpeechT5 Text-to-Speech Demo",
-    description="Enter Turkish text and listen to the generated speech using the fine-tuned SpeechT5 model."
 )
-iface.launch()

+import os
 import gradio as gr
 import torch
 from datasets import load_dataset
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 import soundfile as sf
+from speechbrain.pretrained import EncoderClassifier
 import spaces
 device = "cuda" if torch.cuda.is_available() else "cpu"
     model = SpeechT5ForTextToSpeech.from_pretrained("emirhanbilgic/speecht5_finetuned_emirhan_tr").to(device)
     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+    speaker_model = EncoderClassifier.from_hparams(
+        source="speechbrain/spkrec-xvect-voxceleb",
+        run_opts={"device": device},
+        savedir=os.path.join("/tmp", "speechbrain/spkrec-xvect-voxceleb"),
+    )
+    return model, processor, vocoder, speaker_model
+model, processor, vocoder, speaker_model = load_models_and_data()
+def create_speaker_embedding(waveform):
+    with torch.no_grad():
+        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform).unsqueeze(0))
+        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
+    return speaker_embeddings
 @spaces.GPU(duration = 60)
+def text_to_speech(text, waveform):
+    speaker_embeddings = create_speaker_embedding(waveform)
+    speaker_embeddings = torch.tensor(speaker_embeddings).unsqueeze(0).to(device)
     inputs = processor(text=text, return_tensors="pt").to(device)
     speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
     sf.write("output.wav", speech.cpu().numpy(), samplerate=16000)
 iface = gr.Interface(
     fn=text_to_speech,
+    inputs=[
+        gr.Textbox(label="Enter Turkish text to convert to speech"),
+        gr.Audio(source="upload", type="numpy", label="Upload Speaker Audio"),  # NOTE(review): Gradio 4.x replaced `source=` with `sources=[...]` - confirm against the Gradio version this Space runs
+    ],
     outputs=gr.Audio(label="Generated Speech"),
+    title="Turkish SpeechT5 Text-to-Speech Demo with Custom Speaker Embeddings",
+    description="Enter Turkish text and upload an audio file to generate speech using the fine-tuned SpeechT5 model with custom speaker embeddings."
 )
+iface.launch()