Dionyssos committed
Commit c2687b7 · 1 Parent(s): 4eabff6

beta 16kHz

Files changed (5)
  1. Modules/hifigan.py +5 -5
  2. Utils/text_utils.py +1 -1
  3. api.py +134 -51
  4. models.py +3 -3
  5. msinference.py +27 -23
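
The thread running through all five files is a move of the serving pipeline from 24 kHz to 16 kHz: StyleTTS2 still synthesizes at 24 kHz, so msinference.inference() now downsamples its output to match AudioGen / MMS, and every soundfile.write call and sample-count constant follows. A minimal sketch of that resampling step on a dummy signal, assuming the audresample package already used in msinference.py:

    import numpy as np
    import audresample

    x = np.random.randn(24000).astype(np.float32)        # 1 s of 24 kHz StyleTTS2-style output
    x /= np.abs(x).max() + 1e-7                          # peak-normalise, as in inference()
    x16 = audresample.resample(signal=x,
                               original_rate=24000,
                               target_rate=16000)[0, :]  # resample returns (1, N); keep the mono row
    print(x16.shape)                                     # roughly (16000,)
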
Modules/hifigan.py CHANGED
@@ -122,14 +122,14 @@ class SineGen(torch.nn.Module):

        rad_values = (f0_values / self.sampling_rate) % 1  # -21 % 10 = 9 as -3*10 + 9 = 21 NOTICE THAT LCM IS SIGNED HENCE not POSITIVE integer

-       # print('BEF', rad_values.shape)
+



        rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
                                                     scale_factor=1/self.upsample_scale,
                                                     mode="linear").transpose(1, 2)
-       print('AFt', rad_values.shape)  # downsamples the phases to 1/300 and sums them to be 0,,1,100000,20000*2*pi
+
        phase = torch.cumsum(rad_values, dim=1) * 1.84 * np.pi  # 1.89 sounds also nice has woofer at punctuation
        phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
                                                scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
@@ -215,7 +215,7 @@ class Generator(torch.nn.Module):

        # x.shape=torch.Size([1, 512, 484]) s.shape=torch.Size([1, 1, 1, 128]) f0.shape=torch.Size([1, 484]) GENERAT 249
        f0 = self.f0_upsamp(f0).transpose(1, 2)
-       print(f'{x.shape=} {s.shape=} {f0.shape=} GENERAT 249 LALALALALA\n\n')
+
        # x.shape=torch.Size([1, 512, 484]) s.shape=torch.Size([1, 1, 1, 128]) f0.shape=torch.Size([1, 145200, 1]) GENERAT 253

        har_source = self.m_source(f0)  # [1, 145400, 1] f0 enters already upsampled to full wav 24kHz length
@@ -229,7 +229,7 @@ class Generator(torch.nn.Module):
            x_source = self.noise_res[i](x_source, s)

            x = self.ups[i](x)
-           print(x.min(), x.max(), x_source.min(), x_source.max())
+           # print(x.min(), x.max(), x_source.min(), x_source.max())
            x = x + x_source

        xs = None
@@ -351,7 +351,7 @@ class Decoder(nn.Module):
        N = self.N_conv(N)


-       print(asr.shape, F0.shape, N.shape, 'TF')
+       # print(asr.shape, F0.shape, N.shape, 'TF')


        x = torch.cat([asr, F0, N], axis=1)
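
For intuition on the SineGen hunk above: rad_values holds per-sample phase increments (f0 divided by the sample rate, wrapped to [0, 1)), and the cumulative sum turns those increments into a running phase for the harmonic source. A rough, self-contained sketch with a dummy constant f0 (not the real module, just the two lines kept in the diff):

    import numpy as np
    import torch

    sampling_rate = 24000
    f0_values = torch.full((1, 480, 1), 220.0)              # [bs, time, harmonics], constant 220 Hz
    rad_values = (f0_values / sampling_rate) % 1            # per-sample phase increment in cycles
    phase = torch.cumsum(rad_values, dim=1) * 1.84 * np.pi  # accumulated phase, as in SineGen
    sine = torch.sin(phase)                                 # harmonic source before noise / upsampling
    print(sine.shape)                                       # torch.Size([1, 480, 1])
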
Utils/text_utils.py CHANGED
@@ -85,7 +85,7 @@ def split_into_sentences(text):

    # Split Very long sentences >500 phoneme - StyleTTS2 crashes
    # -- even 400 phonemes sometimes OOM in cuda:4
-   sentences = [sub_sent+' ' for s in sentences for sub_sent in textwrap.wrap(s, 300, break_long_words=0)]
+   sentences = [sub_sent+' ' for s in sentences for sub_sent in textwrap.wrap(s, 200, break_long_words=0)]

    # if sentences and not sentences[-1]:
    #     sentences = sentences[:-1]
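
The wrap limit drops from 300 to 200 characters per chunk; a quick sketch of what textwrap.wrap(..., 200, break_long_words=0) does to one long sentence:

    import textwrap

    sentence = "word " * 100                                          # roughly 500 characters
    chunks = [c + ' ' for c in textwrap.wrap(sentence, 200, break_long_words=0)]
    print(len(chunks), [len(c) for c in chunks])                      # 3 chunks, none longer than ~200 chars
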
api.py CHANGED
@@ -6,6 +6,7 @@ from Utils.text_utils import split_into_sentences
 import msinference
 import re
 import srt
+import time
 import subprocess
 import cv2
 from pathlib import Path
@@ -20,6 +21,54 @@ sound_generator = AudioGen().to('cuda:0').eval()  # duration chosen in generate(

 Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)

+
+def resize_with_white_padding(image):
+    """
+    Resizes an image to 1920x1080 while preserving aspect ratio
+    by adding white padding.
+
+    Args:
+        image (np.ndarray): The input image as a NumPy array.
+
+    Returns:
+        np.ndarray: The resized image with white padding.
+    """
+    h, w = image.shape[:2]
+    target_h, target_w = 1080, 1920
+    aspect_ratio = w / h
+    target_aspect_ratio = target_w / target_h
+
+    if aspect_ratio > target_aspect_ratio:
+        # Image is wider than the target, pad top and bottom
+        new_w = target_w
+        new_h = int(new_w / aspect_ratio)
+        resized_image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+        padding_h = target_h - new_h
+        top_padding = padding_h // 2
+        bottom_padding = padding_h - top_padding
+        padding = [(top_padding, bottom_padding), (0, 0)]
+        if len(image.shape) == 3:
+            padding.append((0, 0))  # Add padding for color channels
+        padded_image = np.pad(resized_image, padding, mode='constant', constant_values=255)
+    elif aspect_ratio < target_aspect_ratio:
+        # Image is taller than the target, pad left and right
+        new_h = target_h
+        new_w = int(new_h * aspect_ratio)
+        resized_image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+        padding_w = target_w - new_w
+        left_padding = padding_w // 2
+        right_padding = padding_w - left_padding
+        padding = [(0, 0), (left_padding, right_padding)]
+        if len(image.shape) == 3:
+            padding.append((0, 0))  # Add padding for color channels
+        padded_image = np.pad(resized_image, padding, mode='constant', constant_values=255)
+    else:
+        # Aspect ratio matches the target, just resize
+        padded_image = cv2.resize(image, (target_w, target_h), interpolation=cv2.INTER_LANCZOS4)
+
+    return padded_image  # image 2 speech
+
+
 def _shorten(filename):
     return filename.replace("/","")[-6:]

@@ -57,15 +106,19 @@ def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):

 def overlay(x, soundscape=None):
     if soundscape is not None:
+        # AudioGen sound is suffice to be ~10s long
         background = sound_generator.generate(soundscape,
-                                              duration=len(x)/24000 + .74,  # seconds - TTS @ 24kHz
+                                              duration=len(x)/16000 + .74,  # sound duration = TTS dur
                                               ).detach().cpu().numpy()  # bs, 11400 @.74s
-
-        # blend TTS
-        # background /= np.abs(background).max() + 1e-7  # amplify speech to full [-1,1]
-        x = .4 * x + .46 * background[:len(x)]
-        # TTS & AudioGen at 24kHz
-        return x
+
+        # len_soundscape = len(background)
+
+        # fading = .5 + .5 * np.tanh(4*(np.linspace(10, -10, len_soundscape) + 9.4))  # fade heaviside 1,1,1,1,...,0
+
+        # x = np.concatenate([fading * background, x], 0)  # blend TTS with AudioGen
+        # background /= np.abs(background).max() + 1e-7  # amplify speech to full [-1,1]
+        x = .4 * x + .46 * background[:len(x)]  # background will be longer by xtra .74s
+        return x  # TTS / AudioGen @ 16kHz


 def tts_multi_sentence(precomputed_style_vector=None,
@@ -87,6 +140,8 @@ def tts_multi_sentence(precomputed_style_vector=None,

    if precomputed_style_vector is not None:
        x = []
+       if not isinstance(text, list):
+           text = split_into_sentences(text)  # Avoid OOM in StyleTTS2
        for _sentence in text:

            # StyleTTS2 - pronounciation Fx
@@ -96,7 +151,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
            # fix sounding of sleepy AAABS TRAACT
            _sentence = _sentence.replace('abstract', 'ahbstract')  # 'ahstract'
            x.append(msinference.inference(_sentence,
-                                          precomputed_style_vector)
+                                          precomputed_style_vector)
                     )
        x = np.concatenate(x)

@@ -104,7 +159,7 @@ def tts_multi_sentence(precomputed_style_vector=None,

    else:

-       # dont split foreign sentences: Avoids re-load of VITS & random speaker change issue
+       # dont split foreign sentences: Avoids speaker change issue
        x = msinference.foreign(text=text,
                                lang=voice,   # voice = 'romanian', 'serbian' 'hungarian'
                                speed=speed)  # normalisation externally
@@ -164,7 +219,7 @@ def serve_wav():
        text = [[j.content, j.start.total_seconds(), j.end.total_seconds()] for j in srt.parse(s)]
        assert args.video is not None
        native_audio_file = '_tmp.wav'
-       subprocess.call(
+       subprocess.run(
            ["ffmpeg",
             "-y",  # https://stackoverflow.com/questions/39788972/ffmpeg-overwrite-output-file-if-exists
             "-i",
@@ -172,20 +227,22 @@ def serve_wav():
             "-f",
             "mp3",
             "-ar",
-            "24000",  # "22050 for mimic3",
+            "16000",  # "22050 for mimic3",
             "-vn",
             native_audio_file])
        x_native, _ = soundfile.read(native_audio_file)  # reads mp3
-       x_native = x_native[:, 0]  # stereo
+
+       # stereo in video
+       if x_native.ndim > 1:
+           x_native = x_native[:, 0]  # stereo
+
        # ffmpeg -i Sandra\ Kotevska\,\ Painting\ Rose\ bush\,\ mixed\ media\,\ 2017.\ \[NMzC_036MtE\].mkv -f mp3 -ar 22050 -vn out44.wa
    else:
        with open(args.text, 'r') as f:
-           t = ''.join(f)
-           t = re.sub(' +', ' ', t)  # delete spaces
-           # -- sub all punctuation with ' '
-           text = split_into_sentences(t)  # split to short sentences (~100 phonemes max for OOM)
+           text = ''.join(f)
+           text = re.sub(' +', ' ', text)  # delete spaces / split in list in tts_multi_sentence()

-   # ====STYLE VECTOR====
+   # == STYLE VECTOR ==

    precomputed_style_vector = None

@@ -199,15 +256,13 @@ def serve_wav():
            native_audio_file += '__native_audio_track.wav'
            soundfile.write('tgt_spk.wav',
                            np.concatenate([
-                               x_native[:int(4 * 24000)]], 0).astype(np.float32), 24000)  # 27400?
+                               x_native[:int(4 * 16000)]], 0).astype(np.float32), 16000)  # 27400?
            precomputed_style_vector = msinference.compute_style('tgt_spk.wav')

-   # NOTE: style vector may be None
+   # NOTE: style vector is normally None here - except if --native arg was passed

-   # Native Eng
-
+   # Native English Accent TTS
    if precomputed_style_vector is None:
-
        if 'en_US' in args.voice or 'en_UK' in args.voice:
            _dir = '/' if args.affective else '_v2/'
            precomputed_style_vector = msinference.compute_style(
@@ -216,23 +271,20 @@ def serve_wav():
                '#', '_').replace(
                'cmu-arctic', 'cmu_arctic').replace(
                '_low', '') + '.wav')
-
-       # Non-Native Eng
-
+       # Non-Native English Accent TTS
        elif '_' in args.voice:
            precomputed_style_vector = msinference.compute_style('assets/wavs/mimic3_foreign_4x/' + args.voice.replace(
                '/', '_').replace('#', '_').replace(
                'cmu-arctic', 'cmu_arctic').replace(
                '_low', '') + '.wav')
-
-
-       # Foreign Lang - MMS/TTS
+       # Foreign Lang
        else:
            print(f'\n\n\n\n\n FallBack to MMS TTS due to: {args.voice=}')


-   # precomputed_style_vector is None for Foreign langs
-   # ====SILENT VIDEO====
+   # NOTE : precomputed_style_vector is still None if MMS TTS
+
+   # == SILENT VIDEO ==

    if args.video is not None:
        # banner - precomput @ 1920 pixels
@@ -304,14 +356,17 @@ def serve_wav():
            im = np.copy(get_frame(t))  # pic


-           ix = int(t * 24000)
-
-           if is_tts[ix] > .5:  # mask == 1 => tts / mask == 0 -> native
-               frame = frame_tts  # rename frame to rsz_frame_... because if frame_tts is mod
-               # then is considered a "local variable" thus the "outer var"
-               # is not observed by python raising referenced before assign
+           ix = int(t * 16000)  # ix may overflow the is_tts.shape
+           if ix < num:
+               if is_tts[ix] > .5:  # mask == 1 => tts / mask == 0 -> native
+                   frame = frame_tts  # rename frame to rsz_frame_... because if frame_tts is mod
+                   # then is considered a "local variable" thus the "outer var"
+                   # is not observed by python raising referenced before assign
+               else:
+                   frame = frame_orig
+           # For the ix that is out of bounds of num assume frame_tts
            else:
-               frame = frame_orig
+               frame = frame_tts

            # im[-h:, -w:, :] = (.4 * im[-h:, -w:, :] + .6 * frame_orig).astype(np.uint8)

@@ -352,16 +407,13 @@ def serve_wav():
        if do_video_dub:
            OUT_FILE = 'tmp.mp4'  # args.out_file + '_video_dub.mp4'
            subtitles = text
-           MAX_LEN = int(subtitles[-1][2] + 17) * 24000
+           MAX_LEN = int(subtitles[-1][2] + 17) * 16000
            # 17 extra seconds fail-safe for long-last-segment
            print("TOTAL LEN SAMPLES ", MAX_LEN, '\n====================')
            pieces = []
            for k, (_text_, orig_start, orig_end) in enumerate(subtitles):

-               # PAUSES ?????????????????????????
-
-
-               pieces.append(tts_multi_sentence(text=[_text_],
+               pieces.append(tts_multi_sentence(text=_text_,
                                                 precomputed_style_vector=precomputed_style_vector,
                                                 voice=args.voice,
                                                 soundscape=args.soundscape,
@@ -379,7 +431,7 @@ def serve_wav():
            soundfile.write(AUDIO_TRACK,
                            # (is_tts * total + (1-is_tts) * x_native)[:, None],
                            (.64 * total + .27 * x_native)[:, None],
-                           24000)
+                           16000)
        else:  # Video from plain (.txt)
            OUT_FILE = 'tmp.mp4'
            x = tts_multi_sentence(text=text,
@@ -387,13 +439,20 @@ def serve_wav():
                                   voice=args.voice,
                                   soundscape=args.soundscape,
                                   speed=args.speed)
-           soundfile.write(AUDIO_TRACK, x, 24000)
+           soundfile.write(AUDIO_TRACK, x, 16000)

    # IMAGE 2 SPEECH

    if args.image is not None:
-
-       STATIC_FRAME = args.image  # 'assets/image_from_T31.jpg'
+
+       # Resize Input Image to 1920x1080 - Issue of .mp4 non visible for other aspect ratios
+
+       STATIC_FRAME = args.image + '.jpg'  # 'assets/image_from_T31.jpg'
+       cv2.imwrite(
+           STATIC_FRAME,
+           resize_with_white_padding(cv2.imread(args.image)
+           ))
+
        OUT_FILE = 'tmp.mp4'  # args.out_file + '_image_to_speech.mp4'

        # SILENT CLIP
@@ -408,10 +467,10 @@ def serve_wav():
                               soundscape=args.soundscape,
                               speed=args.speed
                               )
-       soundfile.write(AUDIO_TRACK, x, 24000)
+       soundfile.write(AUDIO_TRACK, x, 16000)
    if args.video or args.image:
        # write final output video
-       subprocess.call(
+       subprocess.run(
            ["ffmpeg",
             "-y",
             "-i",
@@ -437,7 +496,7 @@ def serve_wav():
                               soundscape=args.soundscape,
                               speed=args.speed)
        OUT_FILE = 'tmp.wav'
-       soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
+       soundfile.write(CACHE_DIR + OUT_FILE, x, 16000)



@@ -451,7 +510,7 @@ def serve_wav():
    # response.headers["Content-Type"] = "audio/wav"
    # https://stackoverflow.com/questions/67591467/
    # flask-shows-typeerror-send-from-directory-missing-1-required-positional-argum
-
+   # time.sleep(4)


    # send server's output as default file -> srv_result.xx
@@ -461,6 +520,30 @@ def serve_wav():
    print('________________\n ? \n_______________')
    return response

-
 if __name__ == "__main__":
    app.run(host="0.0.0.0")
+
+
+# Concat. .mp4
+
+# _list.txt
+#
+# file out/som_utasitvany_en_txt.mp4
+# file out/som_utasitvany_hu_txt.mp4
+#
+#
+# subprocess.run(
+#     [
+#         "ffmpeg",
+#         "-f",
+#         "concat",
+#         '-safe',
+#         '0',
+#         '-i',
+#         '_list.txt',
+#         '-c',
+#         'copy',
+#         f'fusion.mp4',  # save to correct location is handled in client
+#     ])
+#
+# ffmpeg -f concat -i mylist.txt -c copy output.mp4
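
The new resize_with_white_padding helper letterboxes arbitrary stills to 1920x1080 before the image-to-speech branch builds its silent clip, which sidesteps the ".mp4 non visible for other aspect ratios" issue noted in the diff. A minimal usage sketch, with a hypothetical input path:

    import cv2

    img = cv2.imread('some_painting.jpg')              # hypothetical input image, any aspect ratio
    padded = resize_with_white_padding(img)            # helper from api.py above
    print(padded.shape)                                # (1080, 1920, 3): white bars fill the unused border
    cv2.imwrite('some_painting.jpg' + '.jpg', padded)  # mirrors the STATIC_FRAME handling in serve_wav()
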
models.py CHANGED
@@ -304,7 +304,7 @@ class ProsodyPredictor(nn.Module):
        self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)

    def F0Ntrain(self, x, s):
-       print(x.shape, s.shape, 'F)N T T T')
+
        x, _ = self.shared(x.transpose(1, 2))  # [bs, time, ch] LSTM

        x = x.transpose(1, 2)  # [bs, ch, time]
@@ -313,11 +313,11 @@ class ProsodyPredictor(nn.Module):
        F0 = x

        for block in self.F0:
-           print(f'LOOP {F0.shape=} {s.shape=}\n')
+           # print(f'LOOP {F0.shape=} {s.shape=}\n')
            # )N F0.shape=torch.Size([1, 512, 147]) s.shape=torch.Size([1, 128])
            F0 = block(F0, s)  # This is an AdainResBlk1d expects conv1d dimensions
        F0 = self.F0_proj(F0)
-       print('____________________________2nd F0Ntra')
+
        N = x

        for block in self.N:
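
The only change here is silencing the debug prints in F0Ntrain; the tensor layout they were tracking is the usual one: self.shared is an LSTM that wants [bs, time, ch], while the AdainResBlk1d stack wants [bs, ch, time]. A toy illustration of that round trip with a plain nn.LSTM (a stand-in, not the actual ProsodyPredictor):

    import torch
    import torch.nn as nn

    x = torch.randn(1, 512, 147)                 # [bs, ch, time], as it arrives in F0Ntrain
    lstm = nn.LSTM(512, 256, batch_first=True)   # stand-in for self.shared
    y, _ = lstm(x.transpose(1, 2))               # LSTM consumes [bs, time, ch]
    y = y.transpose(1, 2)                        # back to [bs, ch, time] for the conv / AdainResBlk1d blocks
    print(y.shape)                               # torch.Size([1, 256, 147])
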
msinference.py CHANGED
@@ -223,10 +223,15 @@ def inference(text,
                       s=ref)

    x = x.cpu().numpy()[0, 0, :-400]  # weird pulse at the end of sentences
-
-   print(x.shape,' A')
+
+   # StyleTTS2 is 24kHz -> Resample to 16kHz ofAudioGen / MMS
+
    if x.shape[0] > 10:
        x /= np.abs(x).max() + 1e-7
+       x = audresample.resample(signal=x.astype(np.float32),
+                                original_rate=24000,
+                                target_rate=16000)[0, :]  # reshapes (64,) -> (1,64)
+
    else:
        print('\n\n\n\n\nEMPTY TTS\n\n\n\n\n\nn', x.shape)
        x = np.zeros(0)
@@ -393,18 +398,20 @@ def foreign(text=None,  # split sentences here so we can prepend a txt for germ
    tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')

    # CALL MMS TTS VITS
-
+
    total_audio = []
-
+
    # Split long sentences if deu to control voice switch - for other languages let text no-split
    if not isinstance(text, list):
        if lang_code == 'deu':
            # Split Very long sentences >500 phoneme - StyleTTS2 crashes # -- even 400 phonemes sometimes OOM in cuda:4
            # However prosody is nicer on non-split for MMS TTS
-           text = [sub_sent+' ' for sub_sent in textwrap.wrap(text, 300, break_long_words=0)]
+           text = [sub_sent+' ' for sub_sent in textwrap.wrap(text, 200, break_long_words=0)]  # prepend txt snippet
+           # assert that it chooses unique voice
        else:
-           text = [text]
-
+           text = [sub_sent+' ' for sub_sent in textwrap.wrap(text, 140, break_long_words=0)]  # allow longer non split text
+           # for non deu MMS TTS lang.
+
    for _t in text:

        _t = _t.lower()
@@ -413,9 +420,9 @@ def foreign(text=None,  # split sentences here so we can prepend a txt for germ

            _t = re.sub(r'\d+', number_to_phonemes, _t)
            _t = fix_phones(_t)
-
+
        elif lang_code == 'ron':
-
+
            # numerals
            _t = romanian_num2str(_t)

@@ -425,31 +432,28 @@ def foreign(text=None,  # split sentences here so we can prepend a txt for germ

        # /data/dkounadis/.hf7/hub/models--facebook--mms-tts/snapshots/44cc7fb408064ef9ea6e7c59130d88cac1274671/models/rmc-script_latin/vocab.txt
        inputs = tokenizer(_t, return_tensors="pt")  # input_ids / attention_mask
-
+
        with torch.no_grad():
-
+
            # MMS
-
+
            x = net_g(input_ids=inputs.input_ids.to(device),
                      attention_mask=inputs.attention_mask.to(device),
-                     speed = 1.14 + .44 * np.random.rand()  # variable speed / sentence
+                     speed = speed + .44 * np.random.rand()  # variable speed for different sentence
                      )[0, :]
-
+
        # crop the 1st audio - is PREFIX text 156000 samples to chose deu voice / VitsAttention()
-
+
        total_audio.append(x)
-
+
        print(f'\n\n_______________________________ {_t} {x.shape=}')
-
+
    x = torch.cat(total_audio).cpu().numpy()
-
+
    x /= np.abs(x).max() + 1e-7

    # print(x.shape, x.min(), x.max(), hps.data.sampling_rate)
-
-   x = audresample.resample(signal=x.astype(np.float32),
-                            original_rate=16000,
-                            target_rate=24000)[0, :]  # reshapes (64,) -> (1,64)
-   return x
+
+   return x  # 16kHz - only resample StyleTTS2 from 24Hkz -> 16kHz
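
One behavioural note on the foreign() hunk: the per-sentence speed was previously a hard-coded 1.14 + .44 * rand draw and now starts from the caller's speed argument, so each sentence lands somewhere in [speed, speed + .44). A tiny sketch of that draw, outside the model call:

    import numpy as np

    speed = 1.14                                    # example base value; the old code hard-coded this
    per_sentence = [speed + .44 * np.random.rand() for _ in range(4)]
    print(per_sentence)                             # e.g. [1.21, 1.50, 1.17, 1.38], one value per sentence
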