Spaces:

emirhanbilgic
/

Text-to-speech-Turkish

Runtime error

App Files Files Community

emirhanbilgic committed on Aug 29, 2024

Commit

94cde68

verified ·

1 Parent(s): c7fbbca

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -14

app.py CHANGED Viewed

@@ -3,8 +3,10 @@ import torch
 import soundfile as sf
 import spaces
 import os
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from speechbrain.pretrained import EncoderClassifier
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -21,28 +23,42 @@ def load_models_and_data():
         savedir=os.path.join("/tmp", spk_model_name),
     )
-    return model, processor, vocoder, speaker_model
-model, processor, vocoder, speaker_model = load_models_and_data()
 def create_speaker_embedding(waveform):
     with torch.no_grad():
         speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform).unsqueeze(0).to(device))
         speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
-        speaker_embeddings = speaker_embeddings.squeeze().to(device)
     return speaker_embeddings
 @spaces.GPU(duration = 60)
-def text_to_speech(text, audio_file):
     inputs = processor(text=text, return_tensors="pt").to(device)
-    # Load the audio file and create speaker embedding
-    waveform, sample_rate = sf.read(audio_file)
-    if len(waveform.shape) > 1:
-        waveform = waveform[:, 0]  # Take the first channel if stereo
-    speaker_embeddings = create_speaker_embedding(waveform)
-    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
     sf.write("output.wav", speech.cpu().numpy(), samplerate=16000)
     return "output.wav"
@@ -50,11 +66,11 @@ iface = gr.Interface(
     fn=text_to_speech,
     inputs=[
         gr.Textbox(label="Enter Turkish text to convert to speech"),
-        gr.Audio(label="Upload a short audio sample of the target speaker", type="filepath")
     ],
     outputs=gr.Audio(label="Generated Speech"),
-    title="Turkish SpeechT5 Text-to-Speech Demo with Custom Voice",
-    description="Enter Turkish text, upload a short audio sample of the target speaker, and listen to the generated speech using the fine-tuned SpeechT5 model."
 )
-iface.launch()

 import soundfile as sf
 import spaces
 import os
+import numpy as np
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from speechbrain.pretrained import EncoderClassifier
+from datasets import load_dataset
 device = "cuda" if torch.cuda.is_available() else "cpu"
         savedir=os.path.join("/tmp", spk_model_name),
     )
+    # Load a sample from a dataset for default embedding
+    dataset = load_dataset("erenfazlioglu/turkishvoicedataset", split="train")
+    example = dataset[304]
+    return model, processor, vocoder, speaker_model, example
+model, processor, vocoder, speaker_model, default_example = load_models_and_data()
 def create_speaker_embedding(waveform):
     with torch.no_grad():
         speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform).unsqueeze(0).to(device))
         speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+        speaker_embeddings = speaker_embeddings.squeeze()
     return speaker_embeddings
+def prepare_default_embedding(example):
+    audio = example["audio"]
+    return create_speaker_embedding(audio["array"])
+default_embedding = prepare_default_embedding(default_example)
 @spaces.GPU(duration = 60)
+def text_to_speech(text, audio_file=None):
     inputs = processor(text=text, return_tensors="pt").to(device)
+    if audio_file is not None:
+        # Load the audio file and create speaker embedding
+        waveform, sample_rate = sf.read(audio_file)
+        if len(waveform.shape) > 1:
+            waveform = waveform[:, 0]  # Take the first channel if stereo
+        speaker_embeddings = create_speaker_embedding(waveform)
+    else:
+        # Use default embedding if no audio file is provided
+        speaker_embeddings = default_embedding
+    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings.unsqueeze(0), vocoder=vocoder)
     sf.write("output.wav", speech.cpu().numpy(), samplerate=16000)
     return "output.wav"
     fn=text_to_speech,
     inputs=[
         gr.Textbox(label="Enter Turkish text to convert to speech"),
+        gr.Audio(label="Upload a short audio sample of the target speaker (optional)", type="filepath")
     ],
     outputs=gr.Audio(label="Generated Speech"),
+    title="Turkish SpeechT5 Text-to-Speech Demo with Optional Custom Voice",
+    description="Enter Turkish text, optionally upload a short audio sample of the target speaker, and listen to the generated speech using the fine-tuned SpeechT5 model."
 )
+iface.launch(share=True)