Dionyssos committed
Commit c2687b7 · 1 Parent(s): 4eabff6

beta 16kHz

Files changed (5)
  1. Modules/hifigan.py +5 -5
  2. Utils/text_utils.py +1 -1
  3. api.py +134 -51
  4. models.py +3 -3
  5. msinference.py +27 -23
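
The thread running through all five files is a move of the serving pipeline from 24 kHz to 16 kHz: StyleTTS2 still synthesizes at 24 kHz, so msinference.inference() now downsamples its output to match AudioGen / MMS, and every soundfile.write call and sample-count constant follows. A minimal sketch of that resampling step on a dummy signal, assuming the audresample package already used in msinference.py:

    import numpy as np
    import audresample

    x = np.random.randn(24000).astype(np.float32)        # 1 s of 24 kHz StyleTTS2-style output
    x /= np.abs(x).max() + 1e-7                          # peak-normalise, as in inference()
    x16 = audresample.resample(signal=x,
                               original_rate=24000,
                               target_rate=16000)[0, :]  # resample returns (1, N); keep the mono row
    print(x16.shape)                                     # roughly (16000,)
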
Modules/hifigan.py CHANGED
@@ -122,14 +122,14 @@ class SineGen(torch.nn.Module):

        rad_values = (f0_values / self.sampling_rate) % 1  # -21 % 10 = 9 as -3*10 + 9 = 21 NOTICE THAT LCM IS SIGNED HENCE not POSITIVE integer

-       # print('BEF', rad_values.shape)
+



        rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
                                                     scale_factor=1/self.upsample_scale,
                                                     mode="linear").transpose(1, 2)
-       print('AFt', rad_values.shape)  # downsamples the phases to 1/300 and sums them to be 0,,1,100000,20000*2*pi
+
        phase = torch.cumsum(rad_values, dim=1) * 1.84 * np.pi  # 1.89 sounds also nice has woofer at punctuation
        phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
                                                scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
@@ -215,7 +215,7 @@ class Generator(torch.nn.Module):

        # x.shape=torch.Size([1, 512, 484]) s.shape=torch.Size([1, 1, 1, 128]) f0.shape=torch.Size([1, 484]) GENERAT 249
        f0 = self.f0_upsamp(f0).transpose(1, 2)
-       print(f'{x.shape=} {s.shape=} {f0.shape=} GENERAT 249 LALALALALA\n\n')
+
        # x.shape=torch.Size([1, 512, 484]) s.shape=torch.Size([1, 1, 1, 128]) f0.shape=torch.Size([1, 145200, 1]) GENERAT 253

        har_source = self.m_source(f0)  # [1, 145400, 1] f0 enters already upsampled to full wav 24kHz length
@@ -229,7 +229,7 @@ class Generator(torch.nn.Module):
            x_source = self.noise_res[i](x_source, s)

            x = self.ups[i](x)
-           print(x.min(), x.max(), x_source.min(), x_source.max())
+           # print(x.min(), x.max(), x_source.min(), x_source.max())
            x = x + x_source

        xs = None
@@ -351,7 +351,7 @@ class Decoder(nn.Module):
        N = self.N_conv(N)


-       print(asr.shape, F0.shape, N.shape, 'TF')
+       # print(asr.shape, F0.shape, N.shape, 'TF')


        x = torch.cat([asr, F0, N], axis=1)
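
For intuition on the SineGen hunk above: rad_values holds per-sample phase increments (f0 divided by the sample rate, wrapped to [0, 1)), and the cumulative sum turns those increments into a running phase for the harmonic source. A rough, self-contained sketch with a dummy constant f0 (not the real module, just the two lines kept in the diff):

    import numpy as np
    import torch

    sampling_rate = 24000
    f0_values = torch.full((1, 480, 1), 220.0)              # [bs, time, harmonics], constant 220 Hz
    rad_values = (f0_values / sampling_rate) % 1            # per-sample phase increment in cycles
    phase = torch.cumsum(rad_values, dim=1) * 1.84 * np.pi  # accumulated phase, as in SineGen
    sine = torch.sin(phase)                                 # harmonic source before noise / upsampling
    print(sine.shape)                                       # torch.Size([1, 480, 1])
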
Utils/text_utils.py CHANGED
@@ -85,7 +85,7 @@ def split_into_sentences(text):

    # Split Very long sentences >500 phoneme - StyleTTS2 crashes
    # -- even 400 phonemes sometimes OOM in cuda:4
-   sentences = [sub_sent+' ' for s in sentences for sub_sent in textwrap.wrap(s, 300, break_long_words=0)]
+   sentences = [sub_sent+' ' for s in sentences for sub_sent in textwrap.wrap(s, 200, break_long_words=0)]

    # if sentences and not sentences[-1]:
    #     sentences = sentences[:-1]
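
The wrap limit drops from 300 to 200 characters per chunk; a quick sketch of what textwrap.wrap(..., 200, break_long_words=0) does to one long sentence:

    import textwrap

    sentence = "word " * 100                                          # roughly 500 characters
    chunks = [c + ' ' for c in textwrap.wrap(sentence, 200, break_long_words=0)]
    print(len(chunks), [len(c) for c in chunks])                      # 3 chunks, none longer than ~200 chars
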
api.py CHANGED
@@ -6,6 +6,7 @@ from Utils.text_utils import split_into_sentences
 import msinference
 import re
 import srt
+import time
 import subprocess
 import cv2
 from pathlib import Path
@@ -20,6 +21,54 @@ sound_generator = AudioGen().to('cuda:0').eval()  # duration chosen in generate(

 Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)

+
+def resize_with_white_padding(image):
+    """
+    Resizes an image to 1920x1080 while preserving aspect ratio
+    by adding white padding.
+
+    Args:
+        image (np.ndarray): The input image as a NumPy array.
+
+    Returns:
+        np.ndarray: The resized image with white padding.
+    """
+    h, w = image.shape[:2]
+    target_h, target_w = 1080, 1920
+    aspect_ratio = w / h
+    target_aspect_ratio = target_w / target_h
+
+    if aspect_ratio > target_aspect_ratio:
+        # Image is wider than the target, pad top and bottom
+        new_w = target_w
+        new_h = int(new_w / aspect_ratio)
+        resized_image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+        padding_h = target_h - new_h
+        top_padding = padding_h // 2
+        bottom_padding = padding_h - top_padding
+        padding = [(top_padding, bottom_padding), (0, 0)]
+        if len(image.shape) == 3:
+            padding.append((0, 0))  # Add padding for color channels
+        padded_image = np.pad(resized_image, padding, mode='constant', constant_values=255)
+    elif aspect_ratio < target_aspect_ratio:
+        # Image is taller than the target, pad left and right
+        new_h = target_h
+        new_w = int(new_h * aspect_ratio)
+        resized_image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+        padding_w = target_w - new_w
+        left_padding = padding_w // 2
+        right_padding = padding_w - left_padding
+        padding = [(0, 0), (left_padding, right_padding)]
+        if len(image.shape) == 3:
+            padding.append((0, 0))  # Add padding for color channels
+        padded_image = np.pad(resized_image, padding, mode='constant', constant_values=255)
+    else:
+        # Aspect ratio matches the target, just resize
+        padded_image = cv2.resize(image, (target_w, target_h), interpolation=cv2.INTER_LANCZOS4)
+
+    return padded_image  # image 2 speech
+
+
 def _shorten(filename):
     return filename.replace("/","")[-6:]

@@ -57,15 +106,19 @@ def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):

 def overlay(x, soundscape=None):
     if soundscape is not None:
+        # AudioGen sound is suffice to be ~10s long
         background = sound_generator.generate(soundscape,
-                                              duration=len(x)/24000 + .74,  # seconds - TTS @ 24kHz
+                                              duration=len(x)/16000 + .74,  # sound duration = TTS dur
                                               ).detach().cpu().numpy()  # bs, 11400 @.74s
-
-        # blend TTS
-        # background /= np.abs(background).max() + 1e-7  # amplify speech to full [-1,1]
-        x = .4 * x + .46 * background[:len(x)]
-        # TTS & AudioGen at 24kHz
-        return x
+
+        # len_soundscape = len(background)
+
+        # fading = .5 + .5 * np.tanh(4*(np.linspace(10, -10, len_soundscape) + 9.4))  # fade heaviside 1,1,1,1,...,0
+
+        # x = np.concatenate([fading * background, x], 0)  # blend TTS with AudioGen
+        # background /= np.abs(background).max() + 1e-7  # amplify speech to full [-1,1]
+        x = .4 * x + .46 * background[:len(x)]  # background will be longer by xtra .74s
+        return x  # TTS / AudioGen @ 16kHz


 def tts_multi_sentence(precomputed_style_vector=None,
@@ -87,6 +140,8 @@ def tts_multi_sentence(precomputed_style_vector=None,

    if precomputed_style_vector is not None:
        x = []
+       if not isinstance(text, list):
+           text = split_into_sentences(text)  # Avoid OOM in StyleTTS2
        for _sentence in text:

            # StyleTTS2 - pronounciation Fx
@@ -96,7 +151,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
            # fix sounding of sleepy AAABS TRAACT
            _sentence = _sentence.replace('abstract', 'ahbstract')  # 'ahstract'
            x.append(msinference.inference(_sentence,
-                                          precomputed_style_vector)
+                                          precomputed_style_vector)
                     )
        x = np.concatenate(x)

@@ -104,7 +159,7 @@ def tts_multi_sentence(precomputed_style_vector=None,

    else:

-       # dont split foreign sentences: Avoids re-load of VITS & random speaker change issue
+       # dont split foreign sentences: Avoids speaker change issue
        x = msinference.foreign(text=text,
                                lang=voice,   # voice = 'romanian', 'serbian' 'hungarian'
                                speed=speed)  # normalisation externally
@@ -164,7 +219,7 @@ def serve_wav():
        text = [[j.content, j.start.total_seconds(), j.end.total_seconds()] for j in srt.parse(s)]
        assert args.video is not None
        native_audio_file = '_tmp.wav'
-       subprocess.call(
+       subprocess.run(
            ["ffmpeg",
             "-y",  # https://stackoverflow.com/questions/39788972/ffmpeg-overwrite-output-file-if-exists
             "-i",
@@ -172,20 +227,22 @@ def serve_wav():
             "-f",
             "mp3",
             "-ar",
-            "24000",  # "22050 for mimic3",
+            "16000",  # "22050 for mimic3",
             "-vn",
             native_audio_file])
        x_native, _ = soundfile.read(native_audio_file)  # reads mp3
-       x_native = x_native[:, 0]  # stereo
+
+       # stereo in video
+       if x_native.ndim > 1:
+           x_native = x_native[:, 0]  # stereo
+
        # ffmpeg -i Sandra\ Kotevska\,\ Painting\ Rose\ bush\,\ mixed\ media\,\ 2017.\ \[NMzC_036MtE\].mkv -f mp3 -ar 22050 -vn out44.wa
    else:
        with open(args.text, 'r') as f:
-           t = ''.join(f)
-           t = re.sub(' +', ' ', t)  # delete spaces
-           # -- sub all punctuation with ' '
-           text = split_into_sentences(t)  # split to short sentences (~100 phonemes max for OOM)
+           text = ''.join(f)
+           text = re.sub(' +', ' ', text)  # delete spaces / split in list in tts_multi_sentence()

-   # ====STYLE VECTOR====
+   # == STYLE VECTOR ==

    precomputed_style_vector = None

@@ -199,15 +256,13 @@ def serve_wav():
            native_audio_file += '__native_audio_track.wav'
            soundfile.write('tgt_spk.wav',
                            np.concatenate([
-                               x_native[:int(4 * 24000)]], 0).astype(np.float32), 24000)  # 27400?
+                               x_native[:int(4 * 16000)]], 0).astype(np.float32), 16000)  # 27400?
            precomputed_style_vector = msinference.compute_style('tgt_spk.wav')

-   # NOTE: style vector may be None
+   # NOTE: style vector is normally None here - except if --native arg was passed

-   # Native Eng
-
+   # Native English Accent TTS
    if precomputed_style_vector is None:
-
        if 'en_US' in args.voice or 'en_UK' in args.voice:
            _dir = '/' if args.affective else '_v2/'
            precomputed_style_vector = msinference.compute_style(
@@ -216,23 +271,20 @@ def serve_wav():
                '#', '_').replace(
                'cmu-arctic', 'cmu_arctic').replace(
                '_low', '') + '.wav')
-
-       # Non-Native Eng
-
+       # Non-Native English Accent TTS
        elif '_' in args.voice:
            precomputed_style_vector = msinference.compute_style('assets/wavs/mimic3_foreign_4x/' + args.voice.replace(
                '/', '_').replace('#', '_').replace(
                'cmu-arctic', 'cmu_arctic').replace(
                '_low', '') + '.wav')
-
-
-       # Foreign Lang - MMS/TTS
+       # Foreign Lang
        else:
            print(f'\n\n\n\n\n FallBack to MMS TTS due to: {args.voice=}')


-   # precomputed_style_vector is None for Foreign langs
-   # ====SILENT VIDEO====
+   # NOTE : precomputed_style_vector is still None if MMS TTS
+
+   # == SILENT VIDEO ==

    if args.video is not None:
        # banner - precomput @ 1920 pixels
@@ -304,14 +356,17 @@ def serve_wav():
            im = np.copy(get_frame(t))  # pic


-           ix = int(t * 24000)
-
-           if is_tts[ix] > .5:  # mask == 1 => tts / mask == 0 -> native
-               frame = frame_tts  # rename frame to rsz_frame_... because if frame_tts is mod
-               # then is considered a "local variable" thus the "outer var"
-               # is not observed by python raising referenced before assign
+           ix = int(t * 16000)  # ix may overflow the is_tts.shape
+           if ix < num:
+               if is_tts[ix] > .5:  # mask == 1 => tts / mask == 0 -> native
+                   frame = frame_tts  # rename frame to rsz_frame_... because if frame_tts is mod
+                   # then is considered a "local variable" thus the "outer var"
+                   # is not observed by python raising referenced before assign
+               else:
+                   frame = frame_orig
+           # For the ix that is out of bounds of num assume frame_tts
            else:
-               frame = frame_orig
+               frame = frame_tts

            # im[-h:, -w:, :] = (.4 * im[-h:, -w:, :] + .6 * frame_orig).astype(np.uint8)

@@ -352,16 +407,13 @@ def serve_wav():
        if do_video_dub:
            OUT_FILE = 'tmp.mp4'  # args.out_file + '_video_dub.mp4'
            subtitles = text
-           MAX_LEN = int(subtitles[-1][2] + 17) * 24000
+           MAX_LEN = int(subtitles[-1][2] + 17) * 16000
            # 17 extra seconds fail-safe for long-last-segment
            print("TOTAL LEN SAMPLES ", MAX_LEN, '\n====================')
            pieces = []
            for k, (_text_, orig_start, orig_end) in enumerate(subtitles):

-               # PAUSES ?????????????????????????
-
-
-               pieces.append(tts_multi_sentence(text=[_text_],
+               pieces.append(tts_multi_sentence(text=_text_,
                                                 precomputed_style_vector=precomputed_style_vector,
                                                 voice=args.voice,
                                                 soundscape=args.soundscape,
@@ -379,7 +431,7 @@ def serve_wav():
            soundfile.write(AUDIO_TRACK,
                            # (is_tts * total + (1-is_tts) * x_native)[:, None],
                            (.64 * total + .27 * x_native)[:, None],
-                           24000)
+                           16000)
        else:  # Video from plain (.txt)
            OUT_FILE = 'tmp.mp4'
            x = tts_multi_sentence(text=text,
@@ -387,13 +439,20 @@ def serve_wav():
                                   voice=args.voice,
                                   soundscape=args.soundscape,
                                   speed=args.speed)
-           soundfile.write(AUDIO_TRACK, x, 24000)
+           soundfile.write(AUDIO_TRACK, x, 16000)

    # IMAGE 2 SPEECH

    if args.image is not None:
-
-       STATIC_FRAME = args.image  # 'assets/image_from_T31.jpg'
+
+       # Resize Input Image to 1920x1080 - Issue of .mp4 non visible for other aspect ratios
+
+       STATIC_FRAME = args.image + '.jpg'  # 'assets/image_from_T31.jpg'
+       cv2.imwrite(
+           STATIC_FRAME,
+           resize_with_white_padding(cv2.imread(args.image)
+           ))
+
        OUT_FILE = 'tmp.mp4'  # args.out_file + '_image_to_speech.mp4'

        # SILENT CLIP
@@ -408,10 +467,10 @@ def serve_wav():
                               soundscape=args.soundscape,
                               speed=args.speed
                               )
-       soundfile.write(AUDIO_TRACK, x, 24000)
+       soundfile.write(AUDIO_TRACK, x, 16000)
    if args.video or args.image:
        # write final output video
-       subprocess.call(
+       subprocess.run(
            ["ffmpeg",
             "-y",
             "-i",
@@ -437,7 +496,7 @@ def serve_wav():
                               soundscape=args.soundscape,
                               speed=args.speed)
        OUT_FILE = 'tmp.wav'
-       soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
+       soundfile.write(CACHE_DIR + OUT_FILE, x, 16000)



@@ -451,7 +510,7 @@ def serve_wav():
    # response.headers["Content-Type"] = "audio/wav"
    # https://stackoverflow.com/questions/67591467/
    # flask-shows-typeerror-send-from-directory-missing-1-required-positional-argum
-
+   # time.sleep(4)


    # send server's output as default file -> srv_result.xx
@@ -461,6 +520,30 @@ def serve_wav():
    print('________________\n ? \n_______________')
    return response

-
 if __name__ == "__main__":
    app.run(host="0.0.0.0")
+
+
+# Concat. .mp4
+
+# _list.txt
+#
+# file out/som_utasitvany_en_txt.mp4
+# file out/som_utasitvany_hu_txt.mp4
+#
+#
+# subprocess.run(
+#     [
+#         "ffmpeg",
+#         "-f",
+#         "concat",
+#         '-safe',
+#         '0',
+#         '-i',
+#         '_list.txt',
+#         '-c',
+#         'copy',
+#         f'fusion.mp4',  # save to correct location is handled in client
+#     ])
+#
+# ffmpeg -f concat -i mylist.txt -c copy output.mp4
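
The new resize_with_white_padding helper letterboxes arbitrary stills to 1920x1080 before the image-to-speech branch builds its silent clip, which sidesteps the ".mp4 non visible for other aspect ratios" issue noted in the diff. A minimal usage sketch, with a hypothetical input path:

    import cv2

    img = cv2.imread('some_painting.jpg')              # hypothetical input image, any aspect ratio
    padded = resize_with_white_padding(img)            # helper from api.py above
    print(padded.shape)                                # (1080, 1920, 3): white bars fill the unused border
    cv2.imwrite('some_painting.jpg' + '.jpg', padded)  # mirrors the STATIC_FRAME handling in serve_wav()
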
models.py CHANGED
@@ -304,7 +304,7 @@ class ProsodyPredictor(nn.Module):
        self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)

    def F0Ntrain(self, x, s):
-       print(x.shape, s.shape, 'F)N T T T')
+
        x, _ = self.shared(x.transpose(1, 2))  # [bs, time, ch] LSTM

        x = x.transpose(1, 2)  # [bs, ch, time]
@@ -313,11 +313,11 @@ class ProsodyPredictor(nn.Module):
        F0 = x

        for block in self.F0:
-           print(f'LOOP {F0.shape=} {s.shape=}\n')
+           # print(f'LOOP {F0.shape=} {s.shape=}\n')
            # )N F0.shape=torch.Size([1, 512, 147]) s.shape=torch.Size([1, 128])
            F0 = block(F0, s)  # This is an AdainResBlk1d expects conv1d dimensions
        F0 = self.F0_proj(F0)
-       print('____________________________2nd F0Ntra')
+
        N = x

        for block in self.N:
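
The only change here is silencing the debug prints in F0Ntrain; the tensor layout they were tracking is the usual one: self.shared is an LSTM that wants [bs, time, ch], while the AdainResBlk1d stack wants [bs, ch, time]. A toy illustration of that round trip with a plain nn.LSTM (a stand-in, not the actual ProsodyPredictor):

    import torch
    import torch.nn as nn

    x = torch.randn(1, 512, 147)                 # [bs, ch, time], as it arrives in F0Ntrain
    lstm = nn.LSTM(512, 256, batch_first=True)   # stand-in for self.shared
    y, _ = lstm(x.transpose(1, 2))               # LSTM consumes [bs, time, ch]
    y = y.transpose(1, 2)                        # back to [bs, ch, time] for the conv / AdainResBlk1d blocks
    print(y.shape)                               # torch.Size([1, 256, 147])
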
msinference.py CHANGED
@@ -223,10 +223,15 @@ def inference(text,
                       s=ref)

    x = x.cpu().numpy()[0, 0, :-400]  # weird pulse at the end of sentences
-
-   print(x.shape,' A')
+
+   # StyleTTS2 is 24kHz -> Resample to 16kHz ofAudioGen / MMS
+
    if x.shape[0] > 10:
        x /= np.abs(x).max() + 1e-7
+       x = audresample.resample(signal=x.astype(np.float32),
+                                original_rate=24000,
+                                target_rate=16000)[0, :]  # reshapes (64,) -> (1,64)
+
    else:
        print('\n\n\n\n\nEMPTY TTS\n\n\n\n\n\nn', x.shape)
        x = np.zeros(0)
@@ -393,18 +398,20 @@ def foreign(text=None,  # split sentences here so we can prepend a txt for germ
    tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')

    # CALL MMS TTS VITS
-
+
    total_audio = []
-
+
    # Split long sentences if deu to control voice switch - for other languages let text no-split
    if not isinstance(text, list):
        if lang_code == 'deu':
            # Split Very long sentences >500 phoneme - StyleTTS2 crashes # -- even 400 phonemes sometimes OOM in cuda:4
            # However prosody is nicer on non-split for MMS TTS
-           text = [sub_sent+' ' for sub_sent in textwrap.wrap(text, 300, break_long_words=0)]
+           text = [sub_sent+' ' for sub_sent in textwrap.wrap(text, 200, break_long_words=0)]  # prepend txt snippet
+           # assert that it chooses unique voice
        else:
-           text = [text]
-
+           text = [sub_sent+' ' for sub_sent in textwrap.wrap(text, 140, break_long_words=0)]  # allow longer non split text
+           # for non deu MMS TTS lang.
+
    for _t in text:

        _t = _t.lower()
@@ -413,9 +420,9 @@ def foreign(text=None,  # split sentences here so we can prepend a txt for germ

            _t = re.sub(r'\d+', number_to_phonemes, _t)
            _t = fix_phones(_t)
-
+
        elif lang_code == 'ron':
-
+
            # numerals
            _t = romanian_num2str(_t)

@@ -425,31 +432,28 @@ def foreign(text=None,  # split sentences here so we can prepend a txt for germ

        # /data/dkounadis/.hf7/hub/models--facebook--mms-tts/snapshots/44cc7fb408064ef9ea6e7c59130d88cac1274671/models/rmc-script_latin/vocab.txt
        inputs = tokenizer(_t, return_tensors="pt")  # input_ids / attention_mask
-
+
        with torch.no_grad():
-
+
            # MMS
-
+
            x = net_g(input_ids=inputs.input_ids.to(device),
                      attention_mask=inputs.attention_mask.to(device),
-                     speed = 1.14 + .44 * np.random.rand()  # variable speed / sentence
+                     speed = speed + .44 * np.random.rand()  # variable speed for different sentence
                      )[0, :]
-
+
        # crop the 1st audio - is PREFIX text 156000 samples to chose deu voice / VitsAttention()
-
+
        total_audio.append(x)
-
+
        print(f'\n\n_______________________________ {_t} {x.shape=}')
-
+
    x = torch.cat(total_audio).cpu().numpy()
-
+
    x /= np.abs(x).max() + 1e-7

    # print(x.shape, x.min(), x.max(), hps.data.sampling_rate)
-
-   x = audresample.resample(signal=x.astype(np.float32),
-                            original_rate=16000,
-                            target_rate=24000)[0, :]  # reshapes (64,) -> (1,64)
-   return x
+
+   return x  # 16kHz - only resample StyleTTS2 from 24Hkz -> 16kHz
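
One behavioural note on the foreign() hunk: the per-sentence speed was previously a hard-coded 1.14 + .44 * rand draw and now starts from the caller's speed argument, so each sentence lands somewhere in [speed, speed + .44). A tiny sketch of that draw, outside the model call:

    import numpy as np

    speed = 1.14                                    # example base value; the old code hard-coded this
    per_sentence = [speed + .44 * np.random.rand() for _ in range(4)]
    print(per_sentence)                             # e.g. [1.21, 1.50, 1.17, 1.38], one value per sentence
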