mrfakename committed
Commit 34d3b0e · verified · 1 Parent(s): 833791a

Sync from GitHub repo

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions there.

app.py CHANGED
@@ -758,9 +758,9 @@ This is {"a local web UI for [F5 TTS](https://github.com/SWivid/F5-TTS)" if not
 
 The checkpoints currently support English and Chinese.
 
-If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s with ✂ in the bottom right corner (otherwise might have non-optimal auto-trimmed result).
+If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 12s with ✂ in the bottom right corner (otherwise might have non-optimal auto-trimmed result).
 
-**NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
+**NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<12s). Ensure the audio is fully uploaded before generating.**
 """
 )
 
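The user-facing limit here drops from 15 s to 12 s to match the clipping thresholds in utils_infer.py below. As a minimal sketch of how a user could pre-trim a reference clip to the new cap before uploading (pydub is the same library the Space uses for clipping; the file names are illustrative, not part of the commit):

```python
# Sketch: pre-trim a reference clip to the 12 s cap before uploading.
from pydub import AudioSegment

ref = AudioSegment.from_file("my_reference.m4a")            # example input path
ref[:12_000].export("my_reference_12s.wav", format="wav")   # pydub slices in milliseconds
```
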
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "f5-tts"
-version = "1.0.4"
+version = "1.0.5"
 description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
 readme = "README.md"
 license = {text = "MIT License"}
src/f5_tts/infer/utils_infer.py CHANGED
@@ -302,7 +302,7 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
             non_silent_wave = AudioSegment.silent(duration=0)
             for non_silent_seg in non_silent_segs:
                 if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
-                    show_info("Audio is over 15s, clipping short. (1)")
+                    show_info("Audio is over 12s, clipping short. (1)")
                     break
                 non_silent_wave += non_silent_seg
 
@@ -314,7 +314,7 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
                 non_silent_wave = AudioSegment.silent(duration=0)
                 for non_silent_seg in non_silent_segs:
                     if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
-                        show_info("Audio is over 15s, clipping short. (2)")
+                        show_info("Audio is over 12s, clipping short. (2)")
                         break
                     non_silent_wave += non_silent_seg
 
@@ -323,7 +323,7 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
             # 3. if no proper silence found for clipping
             if len(aseg) > 12000:
                 aseg = aseg[:12000]
-                show_info("Audio is over 15s, clipping short. (3)")
+                show_info("Audio is over 12s, clipping short. (3)")
 
         aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
         aseg.export(f.name, format="wav")
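The three message strings above are the whole fix: the surrounding thresholds were already 12000, and pydub measures AudioSegment length in milliseconds, so the code has always clipped at 12 s while the messages claimed 15 s. A small sketch of the unit convention, for the avoidance of doubt:

```python
# Sketch: len() on a pydub AudioSegment returns milliseconds, so the 12000
# threshold in preprocess_ref_audio_text corresponds to the 12 s now named
# in the show_info messages.
from pydub import AudioSegment

clip = AudioSegment.silent(duration=12_000)  # duration argument is in ms
assert len(clip) == 12_000                   # length is reported in ms too
print(len(clip) / 1000, "s")                 # -> 12.0 s
```
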
src/f5_tts/train/finetune_cli.py CHANGED
@@ -40,15 +40,15 @@ def parse_args():
     parser.add_argument("--grad_accumulation_steps", type=int, default=1, help="Gradient accumulation steps")
     parser.add_argument("--max_grad_norm", type=float, default=1.0, help="Max gradient norm for clipping")
     parser.add_argument("--epochs", type=int, default=100, help="Number of training epochs")
-    parser.add_argument("--num_warmup_updates", type=int, default=300, help="Warmup updates")
-    parser.add_argument("--save_per_updates", type=int, default=10000, help="Save checkpoint every X updates")
+    parser.add_argument("--num_warmup_updates", type=int, default=20000, help="Warmup updates")
+    parser.add_argument("--save_per_updates", type=int, default=50000, help="Save checkpoint every N updates")
     parser.add_argument(
         "--keep_last_n_checkpoints",
         type=int,
         default=-1,
         help="-1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints",
     )
-    parser.add_argument("--last_per_updates", type=int, default=50000, help="Save last checkpoint every X updates")
+    parser.add_argument("--last_per_updates", type=int, default=5000, help="Save last checkpoint every N updates")
     parser.add_argument("--finetune", action="store_true", help="Use Finetune")
     parser.add_argument("--pretrain", type=str, default=None, help="the path to the checkpoint")
     parser.add_argument(
@@ -65,7 +65,7 @@ def parse_args():
         action="store_true",
         help="Log inferenced samples per ckpt save updates",
     )
-    parser.add_argument("--logger", type=str, default=None, choices=["wandb", "tensorboard"], help="logger")
+    parser.add_argument("--logger", type=str, default=None, choices=[None, "wandb", "tensorboard"], help="logger")
     parser.add_argument(
         "--bnb_optimizer",
         action="store_true",
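Two kinds of change here: the schedule defaults are retuned (warmup 300 → 20000 updates, checkpoint saves 10000 → 50000, last-checkpoint saves 50000 → 5000), and --logger now lists None among its choices so the "no logger" default is an explicitly valid value. A standalone sketch of just that flag's behavior (this argument only, not the full parser):

```python
# Sketch reproducing only the changed --logger argument in isolation.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--logger", type=str, default=None, choices=[None, "wandb", "tensorboard"], help="logger")

print(parser.parse_args([]).logger)                     # None -> logging disabled
print(parser.parse_args(["--logger", "wandb"]).logger)  # "wandb"
```
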
src/f5_tts/train/finetune_gradio.py CHANGED
@@ -120,11 +120,11 @@ def load_settings(project_name):
     default_settings = {
         "exp_name": "F5TTS_v1_Base",
         "learning_rate": 1e-5,
-        "batch_size_per_gpu": 1,
-        "batch_size_type": "sample",
+        "batch_size_per_gpu": 3200,
+        "batch_size_type": "frame",
         "max_samples": 64,
-        "grad_accumulation_steps": 4,
-        "max_grad_norm": 1,
+        "grad_accumulation_steps": 1,
+        "max_grad_norm": 1.0,
         "epochs": 100,
         "num_warmup_updates": 100,
         "save_per_updates": 500,
@@ -134,8 +134,8 @@ def load_settings(project_name):
         "file_checkpoint_train": "",
         "tokenizer_type": "pinyin",
         "tokenizer_file": "",
-        "mixed_precision": "none",
-        "logger": "wandb",
+        "mixed_precision": "fp16",
+        "logger": "none",
         "bnb_optimizer": False,
     }
 
@@ -361,27 +361,27 @@ def terminate_process(pid):
 
 
 def start_training(
-    dataset_name="",
-    exp_name="F5TTS_v1_Base",
-    learning_rate=1e-5,
-    batch_size_per_gpu=1,
-    batch_size_type="sample",
-    max_samples=64,
-    grad_accumulation_steps=4,
-    max_grad_norm=1.0,
-    epochs=100,
-    num_warmup_updates=100,
-    save_per_updates=500,
-    keep_last_n_checkpoints=-1,
-    last_per_updates=100,
-    finetune=True,
-    file_checkpoint_train="",
-    tokenizer_type="pinyin",
-    tokenizer_file="",
-    mixed_precision="fp16",
-    stream=False,
-    logger="wandb",
-    ch_8bit_adam=False,
+    dataset_name,
+    exp_name,
+    learning_rate,
+    batch_size_per_gpu,
+    batch_size_type,
+    max_samples,
+    grad_accumulation_steps,
+    max_grad_norm,
+    epochs,
+    num_warmup_updates,
+    save_per_updates,
+    keep_last_n_checkpoints,
+    last_per_updates,
+    finetune,
+    file_checkpoint_train,
+    tokenizer_type,
+    tokenizer_file,
+    mixed_precision,
+    stream,
+    logger,
+    ch_8bit_adam,
 ):
     global training_process, tts_api, stop_signal
 
@@ -458,7 +458,10 @@ def start_training(
 
     cmd += f" --tokenizer {tokenizer_type}"
 
-    cmd += f" --log_samples --logger {logger}"
+    if logger != "none":
+        cmd += f" --logger {logger}"
+
+    cmd += " --log_samples"
 
     if ch_8bit_adam:
         cmd += " --bnb_optimizer"
@@ -515,7 +518,7 @@ def start_training(
         training_process = subprocess.Popen(
             cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1, env=env
         )
-        yield "Training started...", gr.update(interactive=False), gr.update(interactive=True)
+        yield "Training started ...", gr.update(interactive=False), gr.update(interactive=True)
 
         stdout_queue = queue.Queue()
         stderr_queue = queue.Queue()
@@ -584,7 +587,11 @@ def start_training(
                     gr.update(interactive=True),
                 )
             else:
-                yield "Training complete!", gr.update(interactive=False), gr.update(interactive=True)
+                yield (
+                    "Training complete or paused ...",
+                    gr.update(interactive=False),
+                    gr.update(interactive=True),
+                )
                 break
 
             # Small sleep to prevent CPU thrashing
@@ -598,9 +605,9 @@ def start_training(
            time.sleep(1)
 
        if training_process is None:
-            text_info = "train stop"
+            text_info = "Train stopped !"
        else:
-            text_info = "train complete !"
+            text_info = "Train complete at end !"
 
    except Exception as e:  # Catch all exceptions
        # Ensure that we reset the training process variable in case of an error
@@ -615,11 +622,11 @@ def stop_training():
     global training_process, stop_signal
 
     if training_process is None:
-        return "Train not run !", gr.update(interactive=True), gr.update(interactive=False)
+        return "Train not running !", gr.update(interactive=True), gr.update(interactive=False)
     terminate_process_tree(training_process.pid)
     # training_process = None
     stop_signal = True
-    return "train stop", gr.update(interactive=True), gr.update(interactive=False)
+    return "Train stopped !", gr.update(interactive=True), gr.update(interactive=False)
 
 
 def get_list_projects():
@@ -1128,7 +1135,7 @@ def vocab_check(project_name):
         info = "You can train using your language !"
     else:
         vocab_miss = ",".join(miss_symbols)
-        info = f"The following symbols are missing in your language {len(miss_symbols)}\n\n"
+        info = f"The following {len(miss_symbols)} symbols are missing in your language\n\n"
 
     return info, vocab_miss
 
@@ -1215,6 +1222,9 @@ def infer(
 
     print("update >> ", device_test, file_checkpoint, use_ema)
 
+    if seed == -1:  # -1 used for random
+        seed = None
+
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
         tts_api.infer(
             ref_file=ref_audio,
@@ -1433,9 +1443,9 @@ Skip this step if you have your dataset, metadata.csv, and a folder wavs with al
             )
 
             audio_speaker = gr.File(label="Voice", type="filepath", file_count="multiple")
-            txt_lang = gr.Text(label="Language", value="English")
+            txt_lang = gr.Textbox(label="Language", value="English")
             bt_transcribe = bt_create = gr.Button("Transcribe")
-            txt_info_transcribe = gr.Text(label="Info", value="")
+            txt_info_transcribe = gr.Textbox(label="Info", value="")
             bt_transcribe.click(
                 fn=transcribe_all,
                 inputs=[cm_project, audio_speaker, txt_lang, ch_manual],
@@ -1446,7 +1456,7 @@ Skip this step if you have your dataset, metadata.csv, and a folder wavs with al
             random_sample_transcribe = gr.Button("Random Sample")
 
             with gr.Row():
-                random_text_transcribe = gr.Text(label="Text")
+                random_text_transcribe = gr.Textbox(label="Text")
                 random_audio_transcribe = gr.Audio(label="Audio", type="filepath")
 
             random_sample_transcribe.click(
@@ -1461,7 +1471,7 @@ Check the vocabulary for fine-tuning Emilia_ZH_EN to ensure all symbols are incl
 ```""")
 
             check_button = gr.Button("Check Vocab")
-            txt_info_check = gr.Text(label="Info", value="")
+            txt_info_check = gr.Textbox(label="Info", value="")
 
             gr.Markdown("""```plaintext
 Using the extended model, you can finetune to a new language that is missing symbols in the vocab. This creates a new model with a new vocabulary size and saves it in your ckpts/project folder.
@@ -1481,7 +1491,7 @@ Using the extended model, you can finetune to a new language that is missing sym
                 txt_count_symbol = gr.Textbox(label="New Vocab Size", value="", scale=1)
 
             extend_button = gr.Button("Extend")
-            txt_info_extend = gr.Text(label="Info", value="")
+            txt_info_extend = gr.Textbox(label="Info", value="")
 
             txt_extend.change(vocab_count, inputs=[txt_extend], outputs=[txt_count_symbol])
             check_button.click(fn=vocab_check, inputs=[cm_project], outputs=[txt_info_check, txt_extend])
@@ -1521,8 +1531,8 @@ Skip this step if you have your dataset, raw.arrow, duration.json, and vocab.txt
             ch_tokenizern = gr.Checkbox(label="Create Vocabulary", value=False, visible=False)
 
             bt_prepare = bt_create = gr.Button("Prepare")
-            txt_info_prepare = gr.Text(label="Info", value="")
-            txt_vocab_prepare = gr.Text(label="Vocab", value="")
+            txt_info_prepare = gr.Textbox(label="Info", value="")
+            txt_vocab_prepare = gr.Textbox(label="Vocab", value="")
 
             bt_prepare.click(
                 fn=create_metadata, inputs=[cm_project, ch_tokenizern], outputs=[txt_info_prepare, txt_vocab_prepare]
@@ -1531,7 +1541,7 @@ Skip this step if you have your dataset, raw.arrow, duration.json, and vocab.txt
             random_sample_prepare = gr.Button("Random Sample")
 
            with gr.Row():
-                random_text_prepare = gr.Text(label="Tokenizer")
+                random_text_prepare = gr.Textbox(label="Tokenizer")
                 random_audio_prepare = gr.Audio(label="Audio", type="filepath")
 
            random_sample_prepare.click(
@@ -1544,50 +1554,60 @@ The auto-setting is still experimental. Set a large value of epoch if not sure;
 If you encounter a memory error, try reducing the batch size per GPU to a smaller number.
 ```""")
             with gr.Row():
-                bt_calculate = bt_create = gr.Button("Auto Settings")
-                lb_samples = gr.Label(label="Samples")
-                batch_size_type = gr.Radio(label="Batch Size Type", choices=["frame", "sample"], value="frame")
+                exp_name = gr.Radio(label="Model", choices=["F5TTS_v1_Base", "F5TTS_Base", "E2TTS_Base"])
+                tokenizer_file = gr.Textbox(label="Tokenizer File")
+                file_checkpoint_train = gr.Textbox(label="Path to the Pretrained Checkpoint")
 
             with gr.Row():
-                ch_finetune = bt_create = gr.Checkbox(label="Finetune", value=True)
-                tokenizer_file = gr.Textbox(label="Tokenizer File", value="")
-                file_checkpoint_train = gr.Textbox(label="Path to the Pretrained Checkpoint", value="")
-
-            with gr.Row():
-                exp_name = gr.Radio(
-                    label="Model", choices=["F5TTS_v1_Base", "F5TTS_Base", "E2TTS_Base"], value="F5TTS_v1_Base"
-                )
-                learning_rate = gr.Number(label="Learning Rate", value=1e-5, step=1e-5)
-
-            with gr.Row():
-                batch_size_per_gpu = gr.Number(label="Batch Size per GPU", value=3200)
-                max_samples = gr.Number(label="Max Samples", value=64)
+                ch_finetune = bt_create = gr.Checkbox(label="Finetune")
+                lb_samples = gr.Label(label="Samples")
+                bt_calculate = bt_create = gr.Button("Auto Settings")
 
             with gr.Row():
-                grad_accumulation_steps = gr.Number(label="Gradient Accumulation Steps", value=1)
-                max_grad_norm = gr.Number(label="Max Gradient Norm", value=1.0)
+                epochs = gr.Number(label="Epochs")
+                learning_rate = gr.Number(label="Learning Rate", step=0.5e-5)
+                max_grad_norm = gr.Number(label="Max Gradient Norm")
+                num_warmup_updates = gr.Number(label="Warmup Updates")
 
             with gr.Row():
-                epochs = gr.Number(label="Epochs", value=100)
-                num_warmup_updates = gr.Number(label="Warmup Updates", value=100)
+                batch_size_type = gr.Radio(
+                    label="Batch Size Type",
+                    choices=["frame", "sample"],
+                    info="frame is calculated as seconds * sampling_rate / hop_length",
+                )
+                batch_size_per_gpu = gr.Number(label="Batch Size per GPU", info="N frames or N samples")
+                grad_accumulation_steps = gr.Number(
+                    label="Gradient Accumulation Steps", info="Effective batch size is multiplied by this value"
+                )
+                max_samples = gr.Number(label="Max Samples", info="Maximum number of samples per single GPU batch")
 
             with gr.Row():
-                save_per_updates = gr.Number(label="Save per Updates", value=500)
+                save_per_updates = gr.Number(
+                    label="Save per Updates",
+                    info="Save intermediate checkpoints every N updates",
+                    minimum=10,
+                )
                 keep_last_n_checkpoints = gr.Number(
                     label="Keep Last N Checkpoints",
-                    value=-1,
                     step=1,
                     precision=0,
-                    info="-1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints",
+                    info="-1 to keep all, 0 to not save intermediate, > 0 to keep last N",
+                    minimum=-1,
                 )
-                last_per_updates = gr.Number(label="Last per Updates", value=100)
+                last_per_updates = gr.Number(
+                    label="Last per Updates",
+                    info="Save latest checkpoint with suffix _last.pt every N updates",
+                    minimum=10,
+                )
+                gr.Radio(label="")  # placeholder
 
             with gr.Row():
                 ch_8bit_adam = gr.Checkbox(label="Use 8-bit Adam optimizer")
-                mixed_precision = gr.Radio(label="mixed_precision", choices=["none", "fp16", "bf16"], value="fp16")
-                cd_logger = gr.Radio(label="logger", choices=["wandb", "tensorboard"], value="wandb")
-                start_button = gr.Button("Start Training")
-                stop_button = gr.Button("Stop Training", interactive=False)
+                mixed_precision = gr.Radio(label="Mixed Precision", choices=["none", "fp16", "bf16"])
+                cd_logger = gr.Radio(label="Logger", choices=["none", "wandb", "tensorboard"])
+                with gr.Column():
+                    start_button = gr.Button("Start Training")
+                    stop_button = gr.Button("Stop Training", interactive=False)
 
             if projects_selelect is not None:
                 (
@@ -1634,7 +1654,7 @@ If you encounter a memory error, try reducing the batch size per GPU to a smalle
                 ch_8bit_adam.value = bnb_optimizer_value
 
             ch_stream = gr.Checkbox(label="Stream Output Experiment", value=True)
-            txt_info_train = gr.Text(label="Info", value="")
+            txt_info_train = gr.Textbox(label="Info", value="")
 
             list_audios, select_audio = get_audio_project(projects_selelect, False)
 
@@ -1763,7 +1783,7 @@ If you encounter a memory error, try reducing the batch size per GPU to a smalle
 
         with gr.TabItem("Test Model"):
             gr.Markdown("""```plaintext
-SOS: Check the use_ema setting (True or False) for your model to see what works best for you. use seed -1 from random
+Check the use_ema setting (True or False) for your model to see what works best for you. Set seed to -1 for random.
 ```""")
             exp_name = gr.Radio(
                 label="Model", choices=["F5TTS_v1_Base", "F5TTS_Base", "E2TTS_Base"], value="F5TTS_v1_Base"
@@ -1773,11 +1793,13 @@ SOS: Check the use_ema setting (True or False) for your model to see what works
             with gr.Row():
                 nfe_step = gr.Number(label="NFE Step", value=32)
                 speed = gr.Slider(label="Speed", value=1.0, minimum=0.3, maximum=2.0, step=0.1)
-                seed = gr.Number(label="Seed", value=-1, minimum=-1)
+                seed = gr.Number(label="Random Seed", value=-1, minimum=-1)
                 remove_silence = gr.Checkbox(label="Remove Silence")
 
-            ch_use_ema = gr.Checkbox(label="Use EMA", value=True)
             with gr.Row():
+                ch_use_ema = gr.Checkbox(
+                    label="Use EMA", value=True, info="Turn off at early stage might offer better results"
+                )
                 cm_checkpoint = gr.Dropdown(
                     choices=list_checkpoints, value=checkpoint_select, label="Checkpoints", allow_custom_value=True
                 )
@@ -1785,20 +1807,20 @@ SOS: Check the use_ema setting (True or False) for your model to see what works
 
             random_sample_infer = gr.Button("Random Sample")
 
-            ref_text = gr.Textbox(label="Ref Text")
-            ref_audio = gr.Audio(label="Audio Ref", type="filepath")
-            gen_text = gr.Textbox(label="Gen Text")
+            ref_text = gr.Textbox(label="Reference Text")
+            ref_audio = gr.Audio(label="Reference Audio", type="filepath")
+            gen_text = gr.Textbox(label="Text to Generate")
 
             random_sample_infer.click(
                 fn=get_random_sample_infer, inputs=[cm_project], outputs=[ref_text, gen_text, ref_audio]
             )
 
             with gr.Row():
-                txt_info_gpu = gr.Textbox("", label="Device")
-                seed_info = gr.Text(label="Seed :")
-                check_button_infer = gr.Button("Infer")
+                txt_info_gpu = gr.Textbox("", label="Inference on Device :")
+                seed_info = gr.Textbox(label="Used Random Seed :")
+                check_button_infer = gr.Button("Inference")
 
-            gen_audio = gr.Audio(label="Audio Gen", type="filepath")
+            gen_audio = gr.Audio(label="Generated Audio", type="filepath")
 
             check_button_infer.click(
                 fn=infer,
@@ -1825,10 +1847,10 @@ SOS: Check the use_ema setting (True or False) for your model to see what works
             gr.Markdown("""```plaintext
 Reduce the Base model size from 5GB to 1.3GB. The new checkpoint file prunes out optimizer and etc., can be used for inference or finetuning afterward, but not able to resume pretraining.
 ```""")
-            txt_path_checkpoint = gr.Text(label="Path to Checkpoint:")
-            txt_path_checkpoint_small = gr.Text(label="Path to Output:")
+            txt_path_checkpoint = gr.Textbox(label="Path to Checkpoint:")
+            txt_path_checkpoint_small = gr.Textbox(label="Path to Output:")
             ch_safetensors = gr.Checkbox(label="Safetensors", value="")
-            txt_info_reduse = gr.Text(label="Info", value="")
+            txt_info_reduse = gr.Textbox(label="Info", value="")
             reduse_button = gr.Button("Reduce")
             reduse_button.click(
                 fn=extract_and_save_ema_model,
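Most of this diff is UI cleanup (gr.Text → gr.Textbox, clearer labels, widget defaults now supplied by load_settings instead of being hard-coded), plus two behavioral conventions: a logger choice of "none" suppresses the --logger flag so finetune_cli falls back to its None default, and a seed of -1 in the Test Model tab means "random". A sketch of both conventions under those assumptions (build_logger_args and normalize_seed are illustrative helpers, not functions from the repo):

```python
# Sketch of the two conventions introduced in this diff.

def build_logger_args(logger: str) -> str:
    # "none" omits --logger entirely, so finetune_cli keeps its None default
    args = "" if logger == "none" else f" --logger {logger}"
    return args + " --log_samples"

def normalize_seed(seed: int):
    # The Test Model tab uses -1 for "pick a random seed"; infer() maps it to None
    return None if seed == -1 else seed

assert build_logger_args("none") == " --log_samples"
assert build_logger_args("wandb") == " --logger wandb --log_samples"
assert normalize_seed(-1) is None
assert normalize_seed(42) == 42
```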