mrfakename committed on
Commit
abc80dc
·
verified ·
1 Parent(s): 62711be

Sync from GitHub repo

Browse files

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the GitHub repo instead.

Files changed (2) hide show
  1. app.py +1 -1
  2. src/f5_tts/infer/utils_infer.py +26 -15
app.py CHANGED
@@ -189,7 +189,7 @@ def infer(
189
 
190
  # Remove silence
191
  if remove_silence:
192
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
193
  sf.write(f.name, final_wave, final_sample_rate)
194
  remove_silence_for_generated_wav(f.name)
195
  final_wave, _ = torchaudio.load(f.name)
 
189
 
190
  # Remove silence
191
  if remove_silence:
192
+ with tempfile.NamedTemporaryFile(suffix=".wav") as f:
193
  sf.write(f.name, final_wave, final_sample_rate)
194
  remove_silence_for_generated_wav(f.name)
195
  final_wave, _ = torchaudio.load(f.name)
src/f5_tts/infer/utils_infer.py CHANGED
@@ -33,6 +33,7 @@ from f5_tts.model.utils import convert_char_to_pinyin, get_tokenizer
33
 
34
 
35
  _ref_audio_cache = {}
 
36
 
37
  device = (
38
  "cuda"
@@ -290,12 +291,24 @@ def remove_silence_edges(audio, silence_threshold=-42):
290
  # preprocess reference audio and text
291
 
292
 
293
- def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_info=print):
294
  show_info("Converting audio...")
295
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
296
- aseg = AudioSegment.from_file(ref_audio_orig)
297
 
298
- if clip_short:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  # 1. try to find long silence for clipping
300
  non_silent_segs = silence.split_on_silence(
301
  aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
@@ -326,26 +339,24 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
326
  aseg = aseg[:12000]
327
  show_info("Audio is over 12s, clipping short. (3)")
328
 
329
- aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
330
- aseg.export(f.name, format="wav")
331
- ref_audio = f.name
332
 
333
- # Compute a hash of the reference audio file
334
- with open(ref_audio, "rb") as audio_file:
335
- audio_data = audio_file.read()
336
- audio_hash = hashlib.md5(audio_data).hexdigest()
337
 
338
  if not ref_text.strip():
339
- global _ref_audio_cache
340
- if audio_hash in _ref_audio_cache:
341
  # Use cached asr transcription
342
  show_info("Using cached reference text...")
343
- ref_text = _ref_audio_cache[audio_hash]
344
  else:
345
  show_info("No reference text provided, transcribing reference audio...")
346
  ref_text = transcribe(ref_audio)
347
  # Cache the transcribed text (not caching custom ref_text, enabling users to do manual tweak)
348
- _ref_audio_cache[audio_hash] = ref_text
349
  else:
350
  show_info("Using custom reference text...")
351
 
 
33
 
34
 
35
  _ref_audio_cache = {}
36
+ _ref_text_cache = {}
37
 
38
  device = (
39
  "cuda"
 
291
  # preprocess reference audio and text
292
 
293
 
294
+ def preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=print):
295
  show_info("Converting audio...")
 
 
296
 
297
+ # Compute a hash of the reference audio file
298
+ with open(ref_audio_orig, "rb") as audio_file:
299
+ audio_data = audio_file.read()
300
+ audio_hash = hashlib.md5(audio_data).hexdigest()
301
+
302
+ global _ref_audio_cache
303
+
304
+ if audio_hash in _ref_audio_cache:
305
+ show_info("Using cached preprocessed reference audio...")
306
+ ref_audio = _ref_audio_cache[audio_hash]
307
+
308
+ else: # first pass, do preprocess
309
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
310
+ aseg = AudioSegment.from_file(ref_audio_orig)
311
+
312
  # 1. try to find long silence for clipping
313
  non_silent_segs = silence.split_on_silence(
314
  aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
 
339
  aseg = aseg[:12000]
340
  show_info("Audio is over 12s, clipping short. (3)")
341
 
342
+ aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
343
+ aseg.export(f.name, format="wav")
344
+ ref_audio = f.name
345
 
346
+ # Cache the processed reference audio
347
+ _ref_audio_cache[audio_hash] = ref_audio
 
 
348
 
349
  if not ref_text.strip():
350
+ global _ref_text_cache
351
+ if audio_hash in _ref_text_cache:
352
  # Use cached asr transcription
353
  show_info("Using cached reference text...")
354
+ ref_text = _ref_text_cache[audio_hash]
355
  else:
356
  show_info("No reference text provided, transcribing reference audio...")
357
  ref_text = transcribe(ref_audio)
358
  # Cache the transcribed text (not caching custom ref_text, enabling users to do manual tweak)
359
+ _ref_text_cache[audio_hash] = ref_text
360
  else:
361
  show_info("Using custom reference text...")
362