Aumkeshchy2003 committed on
Commit
0452208
·
verified ·
1 Parent(s): c526086

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +166 -0
app.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import soundfile as sf
4
+ import spaces
5
+ import os
6
+ import numpy as np
7
+ import re
8
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
9
+ from speechbrain.pretrained import EncoderClassifier
10
+ from datasets import load_dataset
11
+
12
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
13
+
14
def load_models_and_data():
    """Load the TTS stack, the speaker encoder and one reference recording.

    Returns:
        A 5-tuple ``(model, processor, vocoder, speaker_model, example)``
        where ``example`` is record 304 of the train split of
        ``Yassmen/TTS_English_Technical_data``.
    """
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

    # NOTE(review): this id has no namespace, so it presumably resolves to a
    # local directory next to the app -- confirm it exists where this runs.
    tts_model = SpeechT5ForTextToSpeech.from_pretrained("speecht5_finetuned_Aumkesh_tr")
    tts_model = tts_model.to(device)

    hifigan = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    hifigan = hifigan.to(device)

    # Speaker encoder; its files are stored under /tmp/<model name>.
    encoder_id = "speechbrain/spkrec-xvect-voxceleb"
    encoder = EncoderClassifier.from_hparams(
        source=encoder_id,
        run_opts={"device": device},
        savedir=os.path.join("/tmp", encoder_id),
    )

    # One fixed recording supplies the audio for the default speaker embedding.
    train_split = load_dataset("Yassmen/TTS_English_Technical_data", split="train")
    reference = train_split[304]

    return tts_model, processor, hifigan, encoder, reference


model, processor, vocoder, speaker_model, default_example = load_models_and_data()
34
+
35
def create_speaker_embedding(waveform):
    """Encode a waveform into a normalized speaker embedding.

    Args:
        waveform: Audio samples accepted by ``torch.tensor``.

    Returns:
        The output of ``speaker_model.encode_batch`` for a batch of one,
        normalized along dimension 2 and squeezed of singleton dimensions.
    """
    # Add a leading batch dimension and move the samples to the model's device.
    batch = torch.tensor(waveform).unsqueeze(0).to(device)
    with torch.no_grad():
        raw = speaker_model.encode_batch(batch)
        unit = torch.nn.functional.normalize(raw, dim=2)
    return unit.squeeze()
41
+
42
def prepare_default_embedding(example):
    """Build a speaker embedding from a dataset record.

    Args:
        example: Mapping whose ``"audio"`` entry holds the samples under
            ``"array"``.

    Returns:
        Whatever ``create_speaker_embedding`` returns for those samples.
    """
    samples = example["audio"]["array"]
    return create_speaker_embedding(samples)


# Computed once at import time; text_to_speech uses it for every request.
default_embedding = prepare_default_embedding(default_example)
47
+
48
# (term, pronunciation hint) pairs that normalize_text substitutes in order.
replacements = [
    # Acronyms, spelled out letter by letter.
    ("API", "A-P-I"),
    ("CUDA", "Coo-da"),
    ("ChatGPT", "Chat-G-P-T"),
    ("HTTP", "H-T-T-P"),
    ("JSON", "J-S-O-N"),
    ("GPU", "G-P-U"),
    ("RAM", "R-A-M"),
    ("CPU", "C-P-U"),
    ("SQL", "S-Q-L"),
    ("NLP", "N-L-P"),
    # Product and project names, split into syllables.
    ("PyTorch", "Pie-torch"),
    ("TensorFlow", "Ten-sor-flow"),
    ("SaaS", "SaaS"),
    ("GitHub", "Git-Hub"),
    ("Docker", "Dock-er"),
    ("Kubernetes", "Koo-ber-net-ees"),
    ("OpenAI", "Open-A-I"),
    ("IOT", "I-O-T"),
    ("Linux", "Li-nux"),
]
69
+
70
# Spoken English forms for the integers that number_to_words looks up
# directly: 0-19, the multiples of ten up to 90, 100 and 1000.
# Fixed spellings: "three" (was "tree"), "twenty" (was "tweenty"),
# "ninety" (was "ninty").
number_words = {
    0: "zero", 1: "one", 2: "two", 3: "three", 4: "four",
    5: "five", 6: "six", 7: "seven", 8: "eight", 9: "nine",
    10: "ten", 11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen",
    15: "fifteen", 16: "sixteen", 17: "seventeen", 18: "eighteen", 19: "nineteen",
    20: "twenty", 30: "thirty", 40: "forty", 50: "fifty",
    60: "sixty", 70: "seventy", 80: "eighty", 90: "ninety",
    100: "hundred", 1000: "thousand",
}
76
+
77
def number_to_words(number):
    """Spell out an integer in English words.

    Args:
        number: The integer to convert.

    Returns:
        The spoken form built from ``number_words`` (for example
        ``"one hundred five"`` for 105). Negative values are prefixed with
        ``"minus"``. Values of 10**12 and above are returned unchanged as a
        decimal string.
    """
    if number < 0:
        return "minus " + number_to_words(-number)

    if number < 20:
        return number_words[number]

    if number < 100:
        tens, unit = divmod(number, 10)
        words = number_words[tens * 10]
        return words + " " + number_words[unit] if unit else words

    # (exclusive upper bound, divisor, scale word), smallest scale first.
    scales = (
        (1_000, 100, "hundred"),
        (1_000_000, 1_000, "thousand"),
        (1_000_000_000, 1_000_000, "million"),
        (1_000_000_000_000, 1_000_000_000, "billion"),
    )
    for limit, divisor, scale_word in scales:
        if number < limit:
            count, remainder = divmod(number, divisor)
            # Always say the count, including a count of one: the previous
            # code produced " hundred" / " thousand" for 100-199 and
            # 1000-1999, dropping "one" and leaving a leading space.
            words = number_to_words(count) + " " + scale_word
            if remainder:
                words += " " + number_to_words(remainder)
            return words

    return str(number)
97
+
98
def replace_numbers_with_words(text):
    """Replace every standalone run of digits in ``text`` with English words.

    The function was previously defined twice with identical bodies, the
    second silently shadowing the first; this is the single remaining copy.

    Args:
        text: The string to process.

    Returns:
        ``text`` with each match of ``\\b\\d+\\b`` replaced by the result of
        ``number_to_words``. Text without such a match is returned unchanged.
    """
    def replace(match):
        return number_to_words(int(match.group()))

    # \b on both sides restricts matches to digit runs that are not glued to
    # letters or underscores.
    return re.sub(r'\b\d+\b', replace, text)
118
+
119
def normalize_text(text):
    """Prepare raw input text for the TTS processor.

    Steps, in order: apply the ``replacements`` pronunciation hints,
    lower-case, spell out digit runs, then strip every character that is
    neither a word character nor whitespace.

    Args:
        text: The raw input string.

    Returns:
        The normalized string.
    """
    # The keys in `replacements` are case-sensitive ('API', 'PyTorch', ...),
    # so they must run before lower-casing. Previously the text was
    # lower-cased first and none of them could ever match.
    for old, new in replacements:
        text = text.replace(old, new)

    text = text.lower()

    text = replace_numbers_with_words(text)

    # NOTE(review): this also removes the hyphens inserted by the hints
    # above, so 'A-P-I' ends up as 'api' -- confirm whether hyphens should
    # survive or become spaces.
    text = re.sub(r'[^\w\s]', '', text)

    return text
134
+
135
@spaces.GPU(duration=60)
def text_to_speech(text, audio_file=None):
    """Synthesize speech for ``text`` using the default speaker embedding.

    Args:
        text: The text to speak; it is passed through ``normalize_text``.
        audio_file: Accepted but not used by this function.

    Returns:
        A ``(16000, samples)`` tuple, where ``samples`` is the generated
        speech as a NumPy array.
    """
    model_inputs = processor(text=normalize_text(text), return_tensors="pt").to(device)

    # The stored embedding was squeezed, so add the batch dimension back.
    voice = default_embedding.unsqueeze(0)

    with torch.no_grad():
        waveform = model.generate_speech(model_inputs["input_ids"], voice, vocoder=vocoder)

    return (16000, waveform.cpu().numpy())
153
+
154
# Gradio UI: one text box in, one audio player out.
# The input label previously said "Turkish" while the title, description,
# reference dataset and text normalizer are all English; it now agrees.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Enter English text to convert to speech")
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy")
    ],
    title="English SpeechT5 Text-to-Speech Demo",
    description="Enter English text, and listen to the generated speech."
)

iface.launch(share=True)