Aumkeshchy2003 committed on
Commit
405ddc5
·
verified ·
1 Parent(s): 61b0ed4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -88
app.py CHANGED
@@ -1,50 +1,21 @@
1
 
2
  import gradio as gr
3
  import torch
4
- import soundfile as sf
5
- import spaces
6
- import os
7
- import numpy as np
8
- import re
9
- from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
10
- from speechbrain.pretrained import EncoderClassifier
11
  from datasets import load_dataset
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- device = "cuda" if torch.cuda.is_available() else "cpu"
14
-
15
- def load_models_and_data():
16
- model_name = "microsoft/speecht5_tts"
17
- processor = SpeechT5Processor.from_pretrained(model_name)
18
- model = SpeechT5ForTextToSpeech.from_pretrained("Aumkeshchy2003/speecht5_finetuned_AumkeshChy_italian_tts").to(device)
19
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
20
-
21
- spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
22
- speaker_model = EncoderClassifier.from_hparams(
23
- source=spk_model_name,
24
- run_opts={"device": device},
25
- savedir=os.path.join("/tmp", spk_model_name),
26
- )
27
-
28
- # Load a sample from a dataset for default embedding
29
- dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
30
- example = dataset[14]
31
-
32
- return model, processor, vocoder, speaker_model, example
33
-
34
- model, processor, vocoder, speaker_model, default_example = load_models_and_data()
35
-
36
- def create_speaker_embedding(waveform):
37
- with torch.no_grad():
38
- speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform).unsqueeze(0).to(device))
39
- speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
40
- speaker_embeddings = speaker_embeddings.squeeze()
41
- return speaker_embeddings
42
-
43
- def prepare_default_embedding(example):
44
- audio = example["audio"]
45
- return create_speaker_embedding(audio["array"])
46
-
47
- default_embedding = prepare_default_embedding(default_example)
48
 
49
  replacements = [
50
  ('à', 'ah'),
@@ -96,51 +67,37 @@ def replace_numbers_with_words(text):
96
 
97
  return result
98
 
99
- def normalize_text(text):
100
- # Convert to lowercase
101
- text = text.lower()
102
-
103
- # Replace numbers with words
104
- text = replace_numbers_with_words(text)
105
-
106
- # Apply character replacements
107
- for old, new in replacements:
108
- text = text.replace(old, new)
109
-
110
- # Remove punctuation
111
- text = re.sub(r'[^\w\s]', '', text)
112
-
113
- return text
114
-
115
- @spaces.GPU(duration=60)
116
- def text_to_speech(text, audio_file=None):
117
- # Normalize the input text
118
- normalized_text = normalize_text(text)
119
-
120
- # Prepare the input for the model
121
- inputs = processor(text=normalized_text, return_tensors="pt").to(device)
122
-
123
- # Use the default speaker embedding
124
- speaker_embeddings = default_embedding
125
-
126
- # Generate speech
127
- with torch.no_grad():
128
- speech = model.generate_speech(inputs["input_ids"], speaker_embeddings.unsqueeze(0), vocoder=vocoder)
129
-
130
- speech_np = speech.cpu().numpy()
131
-
132
- return (24000, speech_np)
133
-
134
- iface = gr.Interface(
135
- fn=text_to_speech,
136
- inputs=[
137
- gr.Textbox(label="Enter Italian text to convert to speech")
138
- ],
139
- outputs=[
140
- gr.Audio(label="Generated Speech", type="numpy")
141
- ],
142
- title="Italian SpeechT5 Text-to-Speech Demo",
143
- description="Enter Italian text, and listen to the generated speech."
144
  )
145
 
146
- iface.launch(share=True)
 
 
1
 
2
  import gradio as gr
3
  import torch
 
 
 
 
 
 
 
4
  from datasets import load_dataset
5
+ from transformers import SpeechT5Processor, SpeechT5HifiGan, SpeechT5ForTextToSpeech
6
+
7
+ # Load the fine-tuned Italian SpeechT5 model from its model ID, and the stock microsoft/speecht5_hifigan vocoder
8
+ model_id = "Aumkeshchy2003/speecht5_finetuned_AumkeshChy_italian_tts"
9
+ model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
10
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
11
+
12
+ # Load speaker embeddings dataset
13
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
14
+ speaker_embeddings = torch.tensor(embeddings_dataset[7440]["xvector"]).unsqueeze(0)
15
+
16
+ # Load processor for the new Italian model
17
+ processor = SpeechT5Processor.from_pretrained(model_id)
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  replacements = [
21
  ('à', 'ah'),
 
67
 
68
  return result
69
 
70
+ # Text-to-speech synthesis function
71
+ def synthesize_speech(text):
72
+ # Clean up text for Italian-specific accents
73
+ for src, dst in replacements:
74
+ text = text.replace(src, dst)
75
+
76
+ # Process input text
77
+ inputs = processor(text=text, return_tensors="pt")
78
+
79
+ # Generate speech using the model and vocoder
80
+ speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
81
+
82
+ # Return the generated speech as (sample_rate, audio_array)
83
+ return (16000, speech.cpu().numpy())
84
+
85
+ # Title and description for the Gradio interface
86
+ title = "Fine-tuning TTS for a Italian Language Using SpeechT5"
87
+ description = """
88
+ This Space generates speech in Italian using the fine-tuned SpeechT5 model from Hugging Face.
89
+ The model is fine-tuned on the VoxPopuli Italian dataset.
90
+ """
91
+
92
+ # Create Gradio interface
93
+ interface = gr.Interface(
94
+ fn=synthesize_speech,
95
+ inputs=gr.Textbox(label="Input Text", placeholder="Enter Italian text"),
96
+ outputs=gr.Audio(label="Generated Speech"),
97
+ title=title,
98
+ description=description,
99
+ examples=["Buongiorno, come sta? Buona giornata"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  )
101
 
102
+ # Launch the interface
103
+ interface.launch()