Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -1237,6 +1237,102 @@ def generate_audio_mars5(text):
|
|
1237 |
return combined_audio_path
|
1238 |
|
1239 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1240 |
def generate_audio_toucan_tts(text):
|
1241 |
tts_interface = TTS_Interface()
|
1242 |
sr, audio_arr = tts_interface.read(text, "English", "English", "English Speaker's Voice")
|
@@ -1303,6 +1399,9 @@ demo.launch(share=True)
|
|
1303 |
|
1304 |
|
1305 |
|
|
|
|
|
|
|
1306 |
# import gradio as gr
|
1307 |
# import requests
|
1308 |
# import os
|
|
|
1237 |
return combined_audio_path
|
1238 |
|
1239 |
|
1240 |
+
def float2pcm(sig, dtype='int16'):
    """Convert a floating-point signal to integer PCM samples.

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Each sample is multiplied by ``2 ** (bits - 1)`` of the target type,
    shifted by an offset (zero for signed targets, ``2 ** (bits - 1)`` for
    unsigned targets), clipped to the representable range of the target
    type and then cast. The cast truncates; no rounding or dithering is
    applied.

    Args:
        sig: Array-like of floating-point samples.
        dtype: Target integer dtype (anything accepted by ``np.dtype``).

    Returns:
        A NumPy array of ``dtype`` holding the converted samples.

    Raises:
        TypeError: If ``sig`` is not a float array or ``dtype`` is not an
            integer type.
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    info = np.iinfo(dtype)
    abs_max = 2 ** (info.bits - 1)
    offset = info.min + abs_max

    # Scale in at least float64. In float32/float16 the upper clip bound of
    # a 32-bit target (2**31 - 1) is not representable and rounds up to
    # 2**31, so a full-scale sample would overflow in the final cast and
    # wrap to the most negative value. Wider input types (e.g. longdouble)
    # keep their own precision.
    # NOTE(review): 64-bit integer targets remain limited by float64
    # precision near full scale.
    work_dtype = np.promote_types(sig.dtype, np.float64)
    scaled = sig.astype(work_dtype, copy=False) * abs_max + offset
    return scaled.clip(info.min, info.max).astype(dtype)
|
1254 |
+
|
1255 |
+
class TTS_Interface:
    """Stateful wrapper around a ``Meta_FastSpeech2`` model for text-to-speech.

    The wrapper remembers the currently configured language, accent and
    speaker so that the model is only reconfigured when a call to ``read``
    requests something different from the previous call.
    """

    # Phoneme strings longer than this are not synthesised; a fixed
    # "input too long" notice in the requested language is spoken instead.
    MAX_PHONE_LENGTH = 1800

    # First element of the tuple returned by ``read``.
    # NOTE(review): callers unpack it as ``sr``; presumably the model's
    # output sample rate in Hz -- confirm against Meta_FastSpeech2.
    SAMPLE_RATE = 48000

    # Replacement prompt per language, used when the input is too long.
    TOO_LONG_MESSAGES = {
        "English": "Your input was too long. Please try either a shorter text or split it into several parts.",
        "German": "Deine Eingabe war zu lang. Bitte versuche es entweder mit einem kürzeren Text oder teile ihn in mehrere Teile auf.",
        "Greek": "Η εισήγησή σας ήταν πολύ μεγάλη. Παρακαλώ δοκιμάστε είτε ένα μικρότερο κείμενο είτε χωρίστε το σε διάφορα μέρη.",
        "Spanish": "Su entrada es demasiado larga. Por favor, intente un texto más corto o divídalo en varias partes.",
        "Finnish": "Vastauksesi oli liian pitkä. Kokeile joko lyhyempää tekstiä tai jaa se useampaan osaan.",
        "Russian": "Ваш текст слишком длинный. Пожалуйста, попробуйте либо сократить текст, либо разделить его на несколько частей.",
        "Hungarian": "Túl hosszú volt a bevitele. Kérjük, próbáljon meg rövidebb szöveget írni, vagy ossza több részre.",
        "Dutch": "Uw input was te lang. Probeer een kortere tekst of splits het in verschillende delen.",
        "French": "Votre saisie était trop longue. Veuillez essayer un texte plus court ou le diviser en plusieurs parties.",
        "Polish": "Twój wpis był zbyt długi. Spróbuj skrócić tekst lub podzielić go na kilka części.",
        "Portuguese": "O seu contributo foi demasiado longo. Por favor, tente um texto mais curto ou divida-o em várias partes.",
        "Italian": "Il tuo input era troppo lungo. Per favore, prova un testo più corto o dividilo in più parti.",
    }

    def __init__(self):
        """Load the model and select the default English voice."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = Meta_FastSpeech2(device=self.device)
        self.current_speaker = "English Speaker's Voice"
        self.current_language = "English"
        self.current_accent = "English"
        # Display name -> language code handed to the model.
        self.language_id_lookup = {
            "English": "en",
            "German": "de",
            "Greek": "el",
            "Spanish": "es",
            "Finnish": "fi",
            "Russian": "ru",
            "Hungarian": "hu",
            "Dutch": "nl",
            "French": "fr",
            "Polish": "pl",
            "Portuguese": "pt",
            "Italian": "it",
        }
        # Speaker display name -> reference audio file used for the
        # utterance embedding.
        self.speaker_path_lookup = {
            "English Speaker's Voice": "reference_audios/english.wav",
            "German Speaker's Voice": "reference_audios/german.wav",
            "Greek Speaker's Voice": "reference_audios/greek.wav",
            "Spanish Speaker's Voice": "reference_audios/spanish.wav",
            "Finnish Speaker's Voice": "reference_audios/finnish.wav",
            "Russian Speaker's Voice": "reference_audios/russian.wav",
            "Hungarian Speaker's Voice": "reference_audios/hungarian.wav",
            "Dutch Speaker's Voice": "reference_audios/dutch.wav",
            "French Speaker's Voice": "reference_audios/french.wav",
            "Polish Speaker's Voice": "reference_audios/polish.flac",
            "Portuguese Speaker's Voice": "reference_audios/portuguese.flac",
            "Italian Speaker's Voice": "reference_audios/italian.flac",
        }
        self.model.set_utterance_embedding(self.speaker_path_lookup[self.current_speaker])

    def read(self, prompt, language, accent, speaker):
        """Synthesise ``prompt`` and return it as integer PCM audio.

        Args:
            prompt: Text to speak.
            language: Language name; only its first whitespace-separated
                word is used and it must be a key of
                ``language_id_lookup``.
            accent: Accent name, handled the same way as ``language``.
            speaker: Key of ``speaker_path_lookup``.

        Returns:
            A ``(SAMPLE_RATE, samples)`` tuple where ``samples`` is the
            result of ``float2pcm`` applied to the model output.

        Raises:
            KeyError: If a changed language, accent or speaker is not
                present in the corresponding lookup table.
        """
        language = language.split()[0]
        accent = accent.split()[0]

        # Only reconfigure the model for settings that actually changed;
        # the remembered state is updated after the model call succeeds.
        if self.current_language != language:
            self.model.set_phonemizer_language(self.language_id_lookup[language])
            self.current_language = language
        if self.current_accent != accent:
            self.model.set_accent_language(self.language_id_lookup[accent])
            self.current_accent = accent
        if self.current_speaker != speaker:
            self.model.set_utterance_embedding(self.speaker_path_lookup[speaker])
            self.current_speaker = speaker

        phones = self.model.text2phone.get_phone_string(prompt)
        if len(phones) > self.MAX_PHONE_LENGTH:
            # Speak a notice in the requested language instead of the
            # over-long input; a language without a notice keeps the
            # original prompt.
            prompt = self.TOO_LONG_MESSAGES.get(language, prompt)
            phones = self.model.text2phone.get_phone_string(prompt)

        wav = self.model(phones)
        # NOTE(review): ``wav`` is presumably a torch tensor (it is moved
        # to the CPU and converted to a NumPy array) -- confirm.
        return self.SAMPLE_RATE, float2pcm(wav.cpu().numpy())
|
1335 |
+
|
1336 |
def generate_audio_toucan_tts(text):
|
1337 |
tts_interface = TTS_Interface()
|
1338 |
sr, audio_arr = tts_interface.read(text, "English", "English", "English Speaker's Voice")
|
|
|
1399 |
|
1400 |
|
1401 |
|
1402 |
+
|
1403 |
+
|
1404 |
+
|
1405 |
# import gradio as gr
|
1406 |
# import requests
|
1407 |
# import os
|