Pijush2023 committed on
Commit
cd80d42
·
verified ·
1 Parent(s): f50b611

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -0
app.py CHANGED
@@ -1237,6 +1237,102 @@ def generate_audio_mars5(text):
1237
  return combined_audio_path
1238
 
1239
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1240
  def generate_audio_toucan_tts(text):
1241
  tts_interface = TTS_Interface()
1242
  sr, audio_arr = tts_interface.read(text, "English", "English", "English Speaker's Voice")
@@ -1303,6 +1399,9 @@ demo.launch(share=True)
1303
 
1304
 
1305
 
 
 
 
1306
  # import gradio as gr
1307
  # import requests
1308
  # import os
 
1237
  return combined_audio_path
1238
 
1239
 
1240
def float2pcm(sig, dtype='int16'):
    """Convert a floating-point signal to integer PCM samples.

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    The input is scaled by ``2 ** (bits - 1)`` of the target type, shifted
    by an offset (zero for signed targets, mid-range for unsigned ones),
    clipped to the representable range of ``dtype`` and then cast.

    Args:
        sig: Array-like of floating-point samples. Values outside
            [-1.0, 1.0) are clipped after scaling.
        dtype: Target integer dtype (anything ``np.dtype`` accepts).

    Returns:
        A NumPy array of ``dtype`` with the same shape as ``sig``.

    Raises:
        TypeError: If ``sig`` is not a float array or ``dtype`` is not an
            integer type.
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    info = np.iinfo(dtype)
    abs_max = 2 ** (info.bits - 1)
    offset = info.min + abs_max

    # Scale in at least float64. In float32/float16 the upper clip bound of
    # a 32-bit target (2**31 - 1) is not representable and rounds up to
    # 2**31, so positive full-scale samples overflowed in the integer cast.
    # NOTE: 64-bit targets still exceed float64's exact integer range.
    work_dtype = np.promote_types(sig.dtype, np.float64)
    scaled = sig.astype(work_dtype) * abs_max + offset
    return scaled.clip(info.min, info.max).astype(dtype)
1254
+
1255
class TTS_Interface:
    """Wrapper around a ``Meta_FastSpeech2`` model for multilingual speech.

    The instance remembers the language, accent and speaker that were last
    applied to the model so that ``read`` only calls the model's setters
    when one of them actually changes.
    """

    # Phoneme strings longer than this are replaced by a spoken notice.
    MAX_PHONE_LENGTH = 1800

    # Sample rate returned alongside the audio from ``read``.
    # NOTE(review): presumably the model's native output rate - confirm.
    SAMPLE_RATE = 48000

    # Spoken replacement for over-long input, keyed by language name.
    TOO_LONG_MESSAGES = {
        "English": "Your input was too long. Please try either a shorter text or split it into several parts.",
        "German": "Deine Eingabe war zu lang. Bitte versuche es entweder mit einem kürzeren Text oder teile ihn in mehrere Teile auf.",
        "Greek": "Η εισήγησή σας ήταν πολύ μεγάλη. Παρακαλώ δοκιμάστε είτε ένα μικρότερο κείμενο είτε χωρίστε το σε διάφορα μέρη.",
        "Spanish": "Su entrada es demasiado larga. Por favor, intente un texto más corto o divídalo en varias partes.",
        "Finnish": "Vastauksesi oli liian pitkä. Kokeile joko lyhyempää tekstiä tai jaa se useampaan osaan.",
        "Russian": "Ваш текст слишком длинный. Пожалуйста, попробуйте либо сократить текст, либо разделить его на несколько частей.",
        "Hungarian": "Túl hosszú volt a bevitele. Kérjük, próbáljon meg rövidebb szöveget írni, vagy ossza több részre.",
        "Dutch": "Uw input was te lang. Probeer een kortere tekst of splits het in verschillende delen.",
        "French": "Votre saisie était trop longue. Veuillez essayer un texte plus court ou le diviser en plusieurs parties.",
        "Polish": "Twój wpis był zbyt długi. Spróbuj skrócić tekst lub podzielić go na kilka części.",
        "Portuguese": "O seu contributo foi demasiado longo. Por favor, tente um texto mais curto ou divida-o em várias partes.",
        "Italian": "Il tuo input era troppo lungo. Per favore, prova un testo più corto o dividilo in più parti.",
    }

    def __init__(self):
        """Load the model and apply the default English speaker embedding."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = Meta_FastSpeech2(device=self.device)

        # Defaults tracked here; only the speaker embedding is pushed to the
        # model below. NOTE(review): this assumes the freshly constructed
        # model already uses English for phonemizer and accent - confirm.
        self.current_speaker = "English Speaker's Voice"
        self.current_language = "English"
        self.current_accent = "English"

        # Display name -> language id passed to the model's setters.
        self.language_id_lookup = {
            "English": "en",
            "German": "de",
            "Greek": "el",
            "Spanish": "es",
            "Finnish": "fi",
            "Russian": "ru",
            "Hungarian": "hu",
            "Dutch": "nl",
            "French": "fr",
            "Polish": "pl",
            "Portuguese": "pt",
            "Italian": "it",
        }

        # Speaker display name -> reference audio used for the embedding.
        self.speaker_path_lookup = {
            "English Speaker's Voice": "reference_audios/english.wav",
            "German Speaker's Voice": "reference_audios/german.wav",
            "Greek Speaker's Voice": "reference_audios/greek.wav",
            "Spanish Speaker's Voice": "reference_audios/spanish.wav",
            "Finnish Speaker's Voice": "reference_audios/finnish.wav",
            "Russian Speaker's Voice": "reference_audios/russian.wav",
            "Hungarian Speaker's Voice": "reference_audios/hungarian.wav",
            "Dutch Speaker's Voice": "reference_audios/dutch.wav",
            "French Speaker's Voice": "reference_audios/french.wav",
            "Polish Speaker's Voice": "reference_audios/polish.flac",
            "Portuguese Speaker's Voice": "reference_audios/portuguese.flac",
            "Italian Speaker's Voice": "reference_audios/italian.flac",
        }
        self.model.set_utterance_embedding(self.speaker_path_lookup[self.current_speaker])

    def read(self, prompt, language, accent, speaker):
        """Synthesize ``prompt`` and return ``(sample_rate, pcm_samples)``.

        Args:
            prompt: Text to speak.
            language: Language name; only the first whitespace-separated
                word is used as the lookup key.
            accent: Accent language name; handled like ``language``.
            speaker: Key of ``speaker_path_lookup``.

        Returns:
            Tuple of ``SAMPLE_RATE`` and the ``float2pcm``-converted audio.

        Raises:
            KeyError: If a changed language, accent or speaker is not in the
                corresponding lookup table.
            IndexError: If ``language`` or ``accent`` is empty or blank.
        """
        language = language.split()[0]
        accent = accent.split()[0]

        # Reconfigure the model only for settings that actually changed.
        if self.current_language != language:
            self.model.set_phonemizer_language(self.language_id_lookup[language])
            self.current_language = language
        if self.current_accent != accent:
            self.model.set_accent_language(self.language_id_lookup[accent])
            self.current_accent = accent
        if self.current_speaker != speaker:
            self.model.set_utterance_embedding(self.speaker_path_lookup[speaker])
            self.current_speaker = speaker

        phones = self.model.text2phone.get_phone_string(prompt)
        if len(phones) > self.MAX_PHONE_LENGTH:
            # Speak a notice in the selected language instead of the input;
            # with no notice for that language the prompt is left unchanged.
            prompt = self.TOO_LONG_MESSAGES.get(language, prompt)
            phones = self.model.text2phone.get_phone_string(prompt)

        wav = self.model(phones)
        # The model output is moved to CPU and converted to a NumPy array
        # before the float -> integer PCM conversion.
        return self.SAMPLE_RATE, float2pcm(wav.cpu().numpy())
1335
+
1336
  def generate_audio_toucan_tts(text):
1337
  tts_interface = TTS_Interface()
1338
  sr, audio_arr = tts_interface.read(text, "English", "English", "English Speaker's Voice")
 
1399
 
1400
 
1401
 
1402
+
1403
+
1404
+
1405
  # import gradio as gr
1406
  # import requests
1407
  # import os