Spaces:

Aumkeshchy2003
/

Italian_TTS

Running

App Files Community

Aumkeshchy2003 committed on Oct 25, 2024

Commit

405ddc5

verified ·

1 Parent(s): 61b0ed4

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -88

app.py CHANGED Viewed

@@ -1,50 +1,21 @@
 import gradio as gr
 import torch
-import soundfile as sf
-import spaces
-import os
-import numpy as np
-import re
-from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-from speechbrain.pretrained import EncoderClassifier
 from datasets import load_dataset
-device = "cuda" if torch.cuda.is_available() else "cpu"
-def load_models_and_data():
-    model_name = "microsoft/speecht5_tts"
-    processor = SpeechT5Processor.from_pretrained(model_name)
-    model = SpeechT5ForTextToSpeech.from_pretrained("Aumkeshchy2003/speecht5_finetuned_AumkeshChy_italian_tts").to(device)
-    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
-    spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
-    speaker_model = EncoderClassifier.from_hparams(
-        source=spk_model_name,
-        run_opts={"device": device},
-        savedir=os.path.join("/tmp", spk_model_name),
-    )
-    # Load a sample from a dataset for default embedding
-    dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-    example = dataset[14]
-    return model, processor, vocoder, speaker_model, example
-model, processor, vocoder, speaker_model, default_example = load_models_and_data()
-def create_speaker_embedding(waveform):
-    with torch.no_grad():
-        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform).unsqueeze(0).to(device))
-        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
-        speaker_embeddings = speaker_embeddings.squeeze()
-    return speaker_embeddings
-def prepare_default_embedding(example):
-    audio = example["audio"]
-    return create_speaker_embedding(audio["array"])
-default_embedding = prepare_default_embedding(default_example)
 replacements = [
     ('à', 'ah'),
@@ -96,51 +67,37 @@ def replace_numbers_with_words(text):
     return result
-def normalize_text(text):
-    # Convert to lowercase
-    text = text.lower()
-    # Replace numbers with words
-    text = replace_numbers_with_words(text)
-    # Apply character replacements
-    for old, new in replacements:
-        text = text.replace(old, new)
-    # Remove punctuation
-    text = re.sub(r'[^\w\s]', '', text)
-    return text
-@spaces.GPU(duration=60)
-def text_to_speech(text, audio_file=None):
-    # Normalize the input text
-    normalized_text = normalize_text(text)
-    # Prepare the input for the model
-    inputs = processor(text=normalized_text, return_tensors="pt").to(device)
-    # Use the default speaker embedding
-    speaker_embeddings = default_embedding
-    # Generate speech
-    with torch.no_grad():
-        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings.unsqueeze(0), vocoder=vocoder)
-    speech_np = speech.cpu().numpy()
-    return (24000, speech_np)
-iface = gr.Interface(
-    fn=text_to_speech,
-    inputs=[
-        gr.Textbox(label="Enter Italian text to convert to speech")
-    ],
-    outputs=[
-        gr.Audio(label="Generated Speech", type="numpy")
-    ],
-    title="Italian SpeechT5 Text-to-Speech Demo",
-    description="Enter Italian text, and listen to the generated speech."
 )
-iface.launch(share=True)

 import gradio as gr
 import torch
 from datasets import load_dataset
+from transformers import SpeechT5Processor, SpeechT5HifiGan, SpeechT5ForTextToSpeech
+# Fine-tuned Italian checkpoint; both the processor and the acoustic model are loaded from it.
+model_id = "Aumkeshchy2003/speecht5_finetuned_AumkeshChy_italian_tts"
+processor = SpeechT5Processor.from_pretrained(model_id)
+model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
+# Vocoder handed to model.generate_speech together with the speaker embedding.
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+# One fixed x-vector from the validation split is used as the speaker embedding.
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+xvector_row = 7440
+xvector = embeddings_dataset[xvector_row]["xvector"]
+# unsqueeze(0) adds a leading axis before the tensor is passed to generate_speech.
+speaker_embeddings = torch.tensor(xvector).unsqueeze(0)
 replacements = [
     ('à', 'ah'),
     return result
+def synthesize_speech(text):
+    """Convert Italian text into speech with the fine-tuned SpeechT5 model.
+
+    Args:
+        text: Text entered in the Gradio textbox.
+
+    Returns:
+        A ``(sample_rate, waveform)`` tuple, where the sample rate is 16000
+        and the waveform is the NumPy array built from the output of
+        ``model.generate_speech``.
+
+    Raises:
+        gr.Error: If ``text`` is empty or contains only whitespace.
+    """
+    # Nothing to synthesize: show a message instead of running the model on an empty prompt.
+    if not text or not text.strip():
+        raise gr.Error("Please enter some Italian text to synthesize.")
+    # Rewrite accented characters using the module-level replacement table.
+    for src, dst in replacements:
+        text = text.replace(src, dst)
+    inputs = processor(text=text, return_tensors="pt")
+    # The model is configured for at most `max_text_positions` input tokens;
+    # clip longer inputs rather than letting generation fail on them.
+    input_ids = inputs["input_ids"][..., : model.config.max_text_positions]
+    speech = model.generate_speech(input_ids, speaker_embeddings, vocoder=vocoder)
+    # Gradio's Audio output accepts a (sample_rate, audio_array) tuple.
+    return (16000, speech.cpu().numpy())
+# Title and description passed to the Gradio interface.
+title = "Fine-tuning TTS for the Italian Language Using SpeechT5"
+description = """
+This Space generates speech in Italian using the fine-tuned SpeechT5 model from Hugging Face.
+The model is fine-tuned on the VoxPopuli Italian dataset.
+"""
+# Build the UI: a single text input wired to synthesize_speech, with one audio output.
+text_input = gr.Textbox(label="Input Text", placeholder="Enter Italian text")
+audio_output = gr.Audio(label="Generated Speech")
+interface = gr.Interface(
+    fn=synthesize_speech,
+    inputs=text_input,
+    outputs=audio_output,
+    title=title,
+    description=description,
+    examples=["Buongiorno, come sta? Buona giornata"],
 )
+# Start serving the app.
+interface.launch()