Moustapha91 committed on
Commit
d88d8e3
·
verified ·
1 Parent(s): 3bec276

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +154 -194
app.py CHANGED
@@ -1,201 +1,161 @@
1
 
2
 
3
- import gradio as gr
4
- import torch
5
- import soundfile as sf
6
- import spaces
7
- import os
8
  import numpy as np
 
 
 
 
 
9
  import re
10
- from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
11
- from speechbrain.pretrained import EncoderClassifier
12
- from datasets import load_dataset
13
-
14
- device = "cuda" if torch.cuda.is_available() else "cpu"
15
-
16
- def load_models_and_data():
17
- model_name = "Moustapha91/TTS_WOLOF_FINAL"
18
- processor = SpeechT5Processor.from_pretrained(model_name)
19
- model = SpeechT5ForTextToSpeech.from_pretrained("model_name").to(device)
20
- vocoder = SpeechT5HifiGan.from_pretrained("model_name").to(device)
21
-
22
- spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
23
- speaker_model = EncoderClassifier.from_hparams(
24
- source=spk_model_name,
25
- run_opts={"device": device},
26
- savedir=os.path.join("/tmp", spk_model_name),
27
- )
28
-
29
- # Load a sample from a dataset for default embedding
30
- dataset = load_dataset("Alwaly/Wolof_TTS", split="train")
31
- dataset = dataset.select(range(12003))
32
- dataset = dataset.train_test_split(test_size=0.1)
33
- example = dataset['test'][8]
34
-
35
- return model, processor, vocoder, speaker_model, example
36
-
37
- model, processor, vocoder, speaker_model, default_example = load_models_and_data()
38
-
39
- def create_speaker_embedding(waveform):
40
- with torch.no_grad():
41
- speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform).unsqueeze(0).to(device))
42
- speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
43
- speaker_embeddings = speaker_embeddings.squeeze()
44
- return speaker_embeddings
45
-
46
- def prepare_default_embedding(example):
47
- audio = example["audio"]
48
- return create_speaker_embedding(audio["array"])
49
-
50
- default_embedding = prepare_default_embedding(default_example)
51
-
52
- replacements = [
53
- ("а", "a"),
54
- ("à", "a"),
55
- ('á', "a"),
56
- ('ã', "a"),
57
- ('ä', "a"),
58
- ('α', "a"),
59
- ("ă", "a"),
60
- ("â", "a"),
61
- ('ā', "a"),
62
- ('ą', "a"),
63
- ('ɓ', "b"),
64
- ('β', "b"),
65
- ("ç", "c"),
66
- ('с', "c"),
67
- ('ɗ', "d"),
68
- ('đ', "d"),
69
- ("è", "e"),
70
- ('ẽ', "e"),
71
- ("ë", "e"),
72
- ("ð", "e"),
73
- ('έ', "e"),
74
- ('ɐ', "e"),
75
- ('ə', "e"),
76
- ('ξ', "e"),
77
- ('н', "h"),
78
- ("î", "i"),
79
- ('í', "i"),
80
- ("ị", "i"),
81
- ("ï", "i"),
82
- ('к', "k"),
83
- ('ł', "l"),
84
- ("ŋ", "n"),
85
- ('ń', "n"),
86
- ("ñ", "n"),
87
- ('ῆ', "n"),
88
- ('й', "n"),
89
- ('η', "n"),
90
- ("ó", "o"),
91
- ('ồ', "o"),
92
- ('ớ', "o"),
93
- ('ὀ', "o"),
94
- ("ô", "o"),
95
- ("õ", "o"),
96
- ('ò', "o"),
97
- ('ø', "o"),
98
- ('σ', "o"),
99
- ("ο", "o"),
100
- ("ο", "o"),
101
- ('р', "p"),
102
- ('ρ', "p"),
103
- ("т", "t"),
104
- ('ƭ', "t"),
105
- ('ц', "u"),
106
- ("ù", "u"),
107
- ("û", "u"),
108
- ('μ', "u"),
109
- ('ш', "w"),
110
- # ('у', "y"),
111
- # ('ý', "y"),
112
- # ('γ', "y"),
113
- ]
114
-
115
- number_words = {
116
- 0: "dara", 1: "benn", 2: "ñaar", 3: "ñett", 4: "ñent", 5: "juróom ", 6: "juróom ak benn", 7: "juróom ak ñaar", 8: "juróom ak ñett", 9: "juróom ak ñent",
117
- 10: "fukk", 11: "fukk ak benn", 12: "fukk ak ñaar", 13: "fukk ak ñett", 14: "fukk ak ñent", 15: "fukk", 16: "fukk ak juróom ben", 17: "fukk ak juróom ñaar",
118
- 18: "fukk ak juróom ñett", 19: "fukk ak juróom ñent", 20: "ñaar fukk", 30: "ñett fukk", 40: "ñent fukk", 50: "juróom fukk", 60: "juróom benn fukk", 70: "juróom ñaar fukk",
119
- 80: "juróom ñett fukk", 90: "juróom ñent fukk", 100: "téeméer", 1000: " junni"
120
- }
121
-
122
- def number_to_words(number):
123
- if number < 20:
124
- return number_words[number]
125
- elif number < 100:
126
- tens, unit = divmod(number, 10)
127
- return number_words[tens * 10] + (" " + number_words[unit] if unit else "")
128
- elif number < 1000:
129
- hundreds, remainder = divmod(number, 100)
130
- return (number_words[hundreds] + " yüz" if hundreds > 1 else "yüz") + (" " + number_to_words(remainder) if remainder else "")
131
- elif number < 1000000:
132
- thousands, remainder = divmod(number, 1000)
133
- return (number_to_words(thousands) + " bin" if thousands > 1 else "bin") + (" " + number_to_words(remainder) if remainder else "")
134
- elif number < 1000000000:
135
- millions, remainder = divmod(number, 1000000)
136
- return number_to_words(millions) + " milyon" + (" " + number_to_words(remainder) if remainder else "")
137
- elif number < 1000000000000:
138
- billions, remainder = divmod(number, 1000000000)
139
- return number_to_words(billions) + " milyar" + (" " + number_to_words(remainder) if remainder else "")
140
- else:
141
- return str(number)
142
-
143
- def replace_numbers_with_words(text):
144
- def replace(match):
145
- number = int(match.group())
146
- return number_to_words(number)
147
-
148
- # Find the numbers and change with words.
149
- result = re.sub(r'\b\d+\b', replace, text)
150
-
151
- return result
152
-
153
- def normalize_text(text):
154
- # Convert to lowercase
155
- text = text.lower()
156
-
157
- # Replace numbers with words
158
- text = replace_numbers_with_words(text)
159
-
160
- # Apply character replacements
161
- for old, new in replacements:
162
- text = text.replace(old, new)
163
-
164
- # Remove punctuation
165
- text = re.sub(r'[^\w\s]', '', text)
166
-
167
  return text
168
 
169
- @spaces.GPU(duration=60)
170
- def text_to_speech(text, audio_file=None):
171
- # Normalize the input text
172
- normalized_text = normalize_text(text)
173
-
174
- # Prepare the input for the model
175
- inputs = processor(text=normalized_text, return_tensors="pt").to(device)
176
-
177
- # Use the default speaker embedding
178
- speaker_embeddings = default_embedding
179
-
180
- # Generate speech
181
- with torch.no_grad():
182
- speech = model.generate_speech(inputs["input_ids"], speaker_embeddings.unsqueeze(0), vocoder=vocoder)
183
-
184
- speech_np = speech.cpu().numpy()
185
-
186
- return (16000, speech_np)
187
-
188
- iface = gr.Interface(
189
- fn=text_to_speech,
190
- inputs=[
191
- gr.Textbox(label="Enter Woloftext to convert to speech")
192
- ],
193
- outputs=[
194
- gr.Audio(label="Generated Speech", type="numpy")
195
- ],
196
- title="Wolof SpeechT5 Text-to-Speech Demo",
197
- description="Enter Wolof text, and listen to the generated speech."
198
- )
199
-
200
- iface.launch(share=True)
201
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
 
 
 
 
 
 
3
  import numpy as np
4
+ from scipy.io import wavfile
5
+ import torch
6
+ from parler_tts import ParlerTTSForConditionalGeneration
7
+ from transformers import AutoTokenizer
8
+ import gradio as gr
9
  import re
10
+ from num2words import num2words
11
+
12
# Run on the first CUDA device when one is available, otherwise on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Hub repository that provides both the model weights and the tokenizer.
_MODEL_ID = "Moustapha91/parler-tts-mutli-wolof"

# Half precision is only requested on GPU.
# NOTE(review): float16 on CPU is assumed to be slow or partially unsupported
# depending on the torch build, so CPU falls back to float32 - confirm.
_model_dtype = torch.float16 if device != "cpu" else torch.float32

# Load the model and the tokenizer once, at import time.
try:
    model = ParlerTTSForConditionalGeneration.from_pretrained(
        _MODEL_ID, torch_dtype=_model_dtype
    ).to(device)
    tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
except Exception as e:
    # Keep the original exception chained so the root cause stays visible
    # in the traceback.
    raise RuntimeError(f"Erreur lors du chargement du modèle : {e}") from e
21
+
22
+
23
+
24
+
25
class EnglishNumberNormalizer:
    """Replace every run of digits in a text with its spelled-out form.

    Despite the class name, numbers are spelled out in French by default
    (``lang="fr"``), which is what the original implementation hard-coded.

    Args:
        lang: Language code forwarded to the converter as ``lang=``.
        converter: Optional callable ``converter(number, lang=...) -> str``.
            When omitted, the module-level ``num2words`` function is used.
    """

    _DIGIT_RUN = re.compile(r"\d+")

    def __init__(self, lang="fr", converter=None):
        self.lang = lang
        self._converter = converter

    def __call__(self, text):
        """Return ``text`` with each digit run replaced by words."""

        def spell_out(match):
            # Resolve the converter lazily so texts without digits never
            # touch it.
            convert = self._converter if self._converter is not None else num2words
            return convert(int(match.group()), lang=self.lang)

        # One left-to-right pass over the matches. Replacing each number
        # with str.replace would also rewrite digits that belong to a
        # longer number (e.g. the "1" inside "10").
        return self._DIGIT_RUN.sub(spell_out, text)


number_normalizer = EnglishNumberNormalizer()
36
+
37
def preprocess(text):
    """Normalise a prompt before it is tokenised.

    Steps, in order:
      1. spell out numbers with ``number_normalizer`` and strip the ends;
      2. replace hyphens with spaces;
      3. append a full stop when the text does not already end with
         ``.``, ``!`` or ``?``;
      4. split upper-case abbreviations into separate letters
         (for example ``"U.S.A."`` becomes ``"U S A."``).

    Args:
        text: Raw prompt string.

    Returns:
        The normalised prompt string.
    """
    text = number_normalizer(text).strip()
    text = text.replace("-", " ")

    if not text.endswith((".", "!", "?")):
        text += "."

    def spell_letters(match):
        # Drop the dots, then put a space between the remaining letters.
        return " ".join(match.group().replace(".", ""))

    # Substitute each match in place. A global str.replace per abbreviation
    # would also rewrite the same letters wherever they occur inside a
    # longer token (e.g. "US" inside "USA").
    return re.sub(r"\b[A-Z][A-Z\.]+\b", spell_letters, text)
58
 
59
# Default prompt text and default voice description used to pre-fill the UI.
default_prompt = (
    "Entreprenariat ci Senegal dafa am solo lool ci yokkuteg koom-koom, "
    "di gëna yokk liggéey ak indi gis-gis yu bees ci dëkk bi."
)
default_description = (
    "A crystal clear and distinct voice, with a moderate reading rate that "
    "facilitates understanding. The tone is monotonous, without variations "
    "or inflections, which provides a uniform listening experience. The "
    "voice is free of background noise and allows for continuous reading, "
    "without inappropriate pauses, thus ensuring a constant and pleasant flow."
)
62
+
63
+
64
+
65
def _peak_normalize(audio_arr):
    """Scale ``audio_arr`` so its largest absolute sample is 1.0.

    Empty or all-zero audio is returned unchanged, so the division can
    neither raise nor produce NaN.
    """
    peak = np.max(np.abs(audio_arr)) if audio_arr.size else 0.0
    return audio_arr / peak if peak > 0 else audio_arr


def generate_audio(prompt, description):
    """Generate speech for ``prompt`` in one pass, without segmentation.

    Args:
        prompt: Text to speak; it is run through ``preprocess`` first.
        description: Free-text voice description; it is stripped and
            tokenised as the model's ``input_ids``.

    Returns:
        A ``(sampling_rate, audio)`` tuple where ``audio`` is a
        peak-normalised float32 NumPy array.
    """
    prompt = preprocess(prompt)

    # The description goes in as input_ids, the text to speak as
    # prompt_input_ids.
    input_ids = tokenizer(description.strip(), return_tensors="pt").input_ids.to(device)
    prompt_input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    generation_config = {
        "temperature": 0.8,         # controls the variability of the output
        "max_new_tokens": 1000,     # upper bound on the generated sequence length
        "do_sample": True,          # enable random sampling
        "top_k": 50,                # limit the number of candidate tokens
        "repetition_penalty": 1.2,  # penalise repeated tokens
    }

    generation = model.generate(
        input_ids=input_ids,
        prompt_input_ids=prompt_input_ids,
        **generation_config
    )

    # Cast to float32 before leaving torch so the normalisation below is
    # not done in half precision.
    audio_arr = generation.cpu().float().numpy().squeeze()

    sampling_rate = model.config.sampling_rate

    audio_arr = _peak_normalize(audio_arr)

    # Release cached GPU memory between requests; skipped on CPU.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return sampling_rate, audio_arr
99
+
100
def update_char_counter(text, max_chars=180):
    """Return the "characters remaining" label for the prompt box.

    Args:
        text: Current content of the prompt box; ``None`` or an empty
            string counts as zero characters.
        max_chars: Character budget. Defaults to 180.

    Returns:
        The label text. The count is negative when ``text`` is longer
        than ``max_chars``.
    """
    used = len(text) if text else 0
    return f"Caractères restants : {max_chars - used}"
104
+
105
def create_interface():
    """Build and return the Gradio Blocks app (it is not launched here).

    The page has a logo and an introduction, a row with the prompt and
    voice-description inputs next to the audio output, and a list of
    clickable examples. Both the examples and the button call
    ``generate_audio``.
    """
    # Each example row is [prompt text, voice description].
    example_rows = [
        [
            """Yeneen jumtukaay yuy dimbal mooy yi ñeel saytu xaalis, comptabilite ak yoriinu liggéey administratif. Marketing digital itam doon na jumtukaay bu am solo ngir yokk sa gis-gis.""",
            "A warm, natural, and friendly voice with a steady pace and clear pronunciation. The audio is crisp with no background noise, and the delivery is expressive but calm, giving a conversational feel.",
        ],
        [
            """Liggéeyukaay ci wàllu mbay mi ci Senegal dafa am solo lool ci wàllu kaaraange dundu ak sos liggéey, ndax dafay boole xeeti liggéey yu bees yu melni agroecologie.""",
            "A crystal clear and distinct voice, with a moderate reading rate that facilitates understanding. The tone is monotonous, without variations or inflections, which provides a uniform listening experience. The voice is free of background noise and allows for continuous reading, without inappropriate pauses, thus ensuring a constant and pleasant flow.",
        ],
        [
            """Politigu Senegaal, doxalinu demokraasi bu dëgër la, am wote yuy faral di am, te askan wi di ci bokk bu baax. Waaye, waxtaan yi am ci wàllu nguur, leer ak coppite yi am ci liggéeyuk""",
            "Adia's voice is very clear with a slight hint of expressiveness. The recording quality is moderate, with some background noise.",
        ],
    ]

    with gr.Blocks() as demo:
        # Logo
        gr.Markdown("![Logo](https://huggingface.co/spaces/CONCREE/Adia_TTS/resolve/main/adia.png)")

        # Title and introduction
        gr.Markdown("# 🌟 Bienvenue sur Adia TTS 🌟")
        gr.Markdown("[Adia TTS](https://huggingface.co/CONCREE/Adia_TTS) est un modèle de génération audio en wolof. Cette interface vous permet de générer des fichiers audio à partir de textes en wolof. Vous pouvez choisir une description pour personnaliser la voix générée.")

        with gr.Row():
            with gr.Column():
                prompt_box = gr.Textbox(
                    label="Entrez votre texte en wolof",
                    placeholder="Adia TTS xeetu audio wolof la buy jàppale boroom këru liggéey yi ñu mëna def seen projet",
                    value=default_prompt,
                    max_length=180,
                )
                counter_label = gr.Label(value=update_char_counter(default_prompt))
                description_box = gr.Textbox(
                    label="Entrez une description pour la voix",
                    value=default_description,
                )
                run_button = gr.Button("Générer l'audio", variant="primary")

            with gr.Column():
                audio_player = gr.Audio(label="Audio généré", type="numpy")

        # Examples section
        gr.Markdown("## Exemples de textes et descriptions")
        gr.Examples(
            examples=example_rows,
            inputs=[prompt_box, description_box],
            outputs=audio_player,
            fn=generate_audio,  # called when an example is clicked
            label="Cliquez sur un exemple pour générer l'audio",
            cache_examples=False,
        )

        # Refresh the character counter on every change of the prompt.
        prompt_box.change(fn=update_char_counter, inputs=prompt_box, outputs=counter_label)

        run_button.click(
            fn=generate_audio,
            inputs=[prompt_box, description_box],
            outputs=audio_player,
        )

    return demo
157
+
158
# Build and launch the interface only when this file is run as a script.
if __name__ == "__main__":
    create_interface().launch()