Moustapha91 committed on
Commit
2feeb4f
·
verified ·
1 Parent(s): a2ecab0

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +224 -0
app.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """app.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1E-6YoHEsjoKc6FVXzQHbtnRja-68brYc
8
+
9
+ numpy==1.23.5
10
+ transformers
11
+ datasets
12
+ soundfile
13
+ torch
14
+ torchaudio
15
+ sentencepiece
16
+ speechbrain==0.5.16
17
+ librosa
18
+ """
19
+
20
# NOTE: the notebook export left IPython shell magics ("!pip install ...") here.
# They are not valid Python, so running this file as a plain script fails with
# a SyntaxError before anything else executes. They are kept as comments only;
# install the packages before starting the app (e.g. via a requirements file):
#   pip install transformers datasets soundfile torch torchaudio sentencepiece speechbrain librosa
#   pip install -q gradio
#   pip install -q spaces
25
+
26
+ import gradio as gr
27
+ import torch
28
+ import soundfile as sf
29
+ import spaces
30
+ import os
31
+ import numpy as np
32
+ import re
33
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
34
+ from speechbrain.pretrained import EncoderClassifier
35
+ from datasets import load_dataset
36
+
37
# Run on the GPU when torch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
38
+
39
def load_models_and_data():
    """Load the TTS model stack and one dataset row used for the default voice.

    Returns:
        A tuple ``(model, processor, vocoder, speaker_model, example)``:
        the SpeechT5 text-to-speech model loaded from
        ``Moustapha91/speecht5_tts__v3_sn``, the SpeechT5 processor, the
        SpeechT5 HiFi-GAN vocoder, the SpeechBrain speaker encoder, and one
        row of the ``Alwaly/Wolof_TTS`` dataset.
    """
    model_name = "microsoft/speecht5_tts"
    processor = SpeechT5Processor.from_pretrained(model_name)
    model = SpeechT5ForTextToSpeech.from_pretrained("Moustapha91/speecht5_tts__v3_sn").to(device)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

    spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
    speaker_model = EncoderClassifier.from_hparams(
        source=spk_model_name,
        run_opts={"device": device},
        savedir=os.path.join("/tmp", spk_model_name),
    )

    # Load a sample from a dataset for default embedding.
    dataset = load_dataset("Alwaly/Wolof_TTS", split="train")
    dataset = dataset.select(range(12003))
    # Pin the shuffle seed: an unseeded split is random, so index 8 of the
    # test split (and therefore the default voice) would differ on each start.
    dataset = dataset.train_test_split(test_size=0.1, seed=42)
    example = dataset["test"][8]

    return model, processor, vocoder, speaker_model, example


# Loaded once at import time so every request reuses the same objects.
model, processor, vocoder, speaker_model, default_example = load_models_and_data()
61
+
62
def create_speaker_embedding(waveform):
    """Encode ``waveform`` with the speaker model and return its embedding.

    Args:
        waveform: Audio samples accepted by ``torch.tensor``; a leading batch
            axis is added before encoding (presumably 1-D mono audio — confirm
            against the dataset).

    Returns:
        The encoder output, L2-normalized along dimension 2 and squeezed of
        all size-1 dimensions. The tensor stays on ``device``.
    """
    batch = torch.tensor(waveform).unsqueeze(0).to(device)
    # Inference only: no autograd graph is needed for the embedding.
    with torch.no_grad():
        encoded = speaker_model.encode_batch(batch)
        unit_length = torch.nn.functional.normalize(encoded, dim=2)
        return unit_length.squeeze()
68
+
69
def prepare_default_embedding(example):
    """Build the speaker embedding for one dataset row.

    Args:
        example: Mapping with an ``"audio"`` entry whose ``"array"`` field
            holds the waveform samples.

    Returns:
        Whatever ``create_speaker_embedding`` returns for that waveform.
    """
    samples = example["audio"]["array"]
    return create_speaker_embedding(samples)


# Computed once at import time; used as the voice for every request.
default_embedding = prepare_default_embedding(default_example)
74
+
75
# Ordered (character, ASCII replacement) pairs applied by normalize_text().
# Accented Latin, Cyrillic and Greek look-alikes are folded onto plain
# lowercase ASCII letters; entries are grouped by their target letter.
replacements = [
    # -> a
    ("а", "a"),
    ("à", "a"),
    ("á", "a"),
    ("ã", "a"),
    ("ä", "a"),
    ("α", "a"),
    ("ă", "a"),
    ("â", "a"),
    ("ā", "a"),
    ("ą", "a"),
    # -> b
    ("ɓ", "b"),
    ("β", "b"),
    # -> c
    ("ç", "c"),
    ("с", "c"),
    # -> d
    ("ɗ", "d"),
    ("đ", "d"),
    # -> e
    ("è", "e"),
    ("ẽ", "e"),
    ("ë", "e"),
    ("ð", "e"),
    ("έ", "e"),
    ("ɐ", "e"),
    ("ə", "e"),
    ("ξ", "e"),
    # -> h
    ("н", "h"),
    # -> i
    ("î", "i"),
    ("í", "i"),
    ("ị", "i"),
    ("ï", "i"),
    # -> k
    ("к", "k"),
    # -> l
    ("ł", "l"),
    # -> n
    ("ŋ", "n"),
    ("ń", "n"),
    ("ñ", "n"),
    ("ῆ", "n"),
    ("й", "n"),
    ("η", "n"),
    # -> o
    ("ó", "o"),
    ("ồ", "o"),
    ("ớ", "o"),
    ("ὀ", "o"),
    ("ô", "o"),
    ("õ", "o"),
    ("ò", "o"),
    ("ø", "o"),
    ("σ", "o"),
    ("ο", "o"),
    ("ο", "o"),
    # -> p
    ("р", "p"),
    ("ρ", "p"),
    # -> t
    ("т", "t"),
    ("ƭ", "t"),
    # -> u
    ("ц", "u"),
    ("ù", "u"),
    ("û", "u"),
    ("μ", "u"),
    # -> w
    ("ш", "w"),
    # Disabled in the original; kept for reference:
    # ('у', "y"),
    # ('ý', "y"),
    # ('γ', "y"),
]
137
+
138
# Wolof words for the numbers that number_to_words() looks up directly:
# 0-19, the round tens, 100 and 1000. Values carry no leading or trailing
# spaces so that joined phrases never contain double spaces.
number_words = {
    0: "dara", 1: "benn", 2: "ñaar", 3: "ñett", 4: "ñent", 5: "juróom",
    6: "juróom ak benn", 7: "juróom ak ñaar", 8: "juróom ak ñett", 9: "juróom ak ñent",
    10: "fukk", 11: "fukk ak benn", 12: "fukk ak ñaar", 13: "fukk ak ñett", 14: "fukk ak ñent",
    15: "fukk ak juróom", 16: "fukk ak juróom benn", 17: "fukk ak juróom ñaar",
    18: "fukk ak juróom ñett", 19: "fukk ak juróom ñent",
    20: "ñaar fukk", 30: "ñett fukk", 40: "ñent fukk", 50: "juróom fukk",
    60: "juróom benn fukk", 70: "juróom ñaar fukk", 80: "juróom ñett fukk", 90: "juróom ñent fukk",
    100: "téeméer", 1000: "junni",
}


def number_to_words(number):
    """Spell out an integer in Wolof.

    The previous version produced Turkish scale words ("yüz", "bin",
    "milyon", "milyar"); the hundreds and thousands words now come from
    ``number_words`` and the larger scales use Wolof terms.

    Args:
        number: Integer to convert.

    Returns:
        The Wolof phrase for ``number``. Tens and units are joined with
        "ak", matching the 11-19 entries of ``number_words``; larger groups
        are joined with a single space. Values below 0 or at/above 10**12
        are returned as their decimal digits.
    """
    if number < 0:
        # No wording is defined for negatives; fall back to digits like the
        # upper out-of-range case instead of raising KeyError.
        return str(number)
    if number < 20:
        return number_words[number]
    if number < 100:
        tens, unit = divmod(number, 10)
        tens_words = number_words[tens * 10]
        if not unit:
            return tens_words
        return tens_words + " ak " + number_words[unit]

    # (scale, word, say_one): say_one tells whether a count of exactly one is
    # spoken ("benn tamndareet") or left implicit ("téeméer", "junni").
    if number < 1000:
        scale, scale_word, say_one = 100, number_words[100], False
    elif number < 1000000:
        scale, scale_word, say_one = 1000, number_words[1000], False
    elif number < 1000000000:
        scale, scale_word, say_one = 1000000, "tamndareet", True
    elif number < 1000000000000:
        scale, scale_word, say_one = 1000000000, "tamñareet", True
    else:
        return str(number)

    count, remainder = divmod(number, scale)
    parts = []
    if say_one or count > 1:
        parts.append(number_to_words(count))
    parts.append(scale_word)
    if remainder:
        parts.append(number_to_words(remainder))
    return " ".join(parts)
165
+
166
def replace_numbers_with_words(text):
    """Return ``text`` with each standalone run of digits spelled out.

    Every match of ``\\b\\d+\\b`` is parsed as an integer and replaced by the
    result of ``number_to_words``; all other characters are left untouched.
    """
    return re.sub(
        r'\b\d+\b',
        lambda digits: number_to_words(int(digits.group())),
        text,
    )
175
+
176
def normalize_text(text):
    """Prepare raw user text for the TTS processor.

    Steps, in order: lowercase the text, spell out digit runs, apply every
    pair in ``replacements``, then delete each character that is neither a
    word character nor whitespace.

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    # Numbers are expanded before the character folding so that the letters
    # of the generated number words are folded as well.
    cleaned = replace_numbers_with_words(text.lower())

    for source_char, target_char in replacements:
        cleaned = cleaned.replace(source_char, target_char)

    # Strip punctuation.
    return re.sub(r'[^\w\s]', '', cleaned)
191
+
192
@spaces.GPU(duration=60)
def text_to_speech(text, audio_file=None):
    """Synthesize speech for ``text`` using the default speaker embedding.

    Args:
        text: Text to speak; it is passed through ``normalize_text`` first.
        audio_file: Accepted but not used by this function.

    Returns:
        A tuple ``(16000, samples)`` where ``samples`` is the generated
        waveform as a NumPy array on the CPU (16000 is presumably the sample
        rate in Hz expected by the audio output — confirm).
    """
    model_inputs = processor(text=normalize_text(text), return_tensors="pt").to(device)

    # Every request uses the same voice; add the batch axis the model is given.
    voice = default_embedding.unsqueeze(0)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        waveform = model.generate_speech(model_inputs["input_ids"], voice, vocoder=vocoder)

    return (16000, waveform.cpu().numpy())
210
+
211
# Gradio front end: one text box in, one audio player out, wired to
# text_to_speech().
iface = gr.Interface(
    title="Wolof SpeechT5 Text-to-Speech Demo",
    description="Enter Wolof text, and listen to the generated speech.",
    fn=text_to_speech,
    inputs=[gr.Textbox(label="Enter Woloftext to convert to speech")],
    outputs=[gr.Audio(label="Generated Speech", type="numpy")],
)

# Start the web app; share=True is forwarded to Gradio unchanged.
iface.launch(share=True)
224
+