Sync from GitHub repo
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.
- app.py +2 -2
- pyproject.toml +1 -1
- src/f5_tts/infer/utils_infer.py +3 -3
- src/f5_tts/train/finetune_cli.py +4 -4
- src/f5_tts/train/finetune_gradio.py +107 -85
app.py
CHANGED
@@ -758,9 +758,9 @@ This is {"a local web UI for [F5 TTS](https://github.com/SWivid/F5-TTS)" if not
 
 The checkpoints currently support English and Chinese.
 
-If you're having issues, try converting your reference audio to WAV or MP3, clipping it to
-**NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<
+If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 12s with ✂ in the bottom right corner (otherwise might have non-optimal auto-trimmed result).
+**NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<12s). Ensure the audio is fully uploaded before generating.**
 """
 )
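The updated note refers to the app's automatic-transcription fallback: when no reference text is supplied, the reference clip is transcribed with Whisper. A minimal sketch of that pattern, using the standalone openai-whisper package rather than the repo's exact ASR pipeline (function name and model size are illustrative):

```python
import whisper  # standalone openai-whisper package

def ensure_ref_text(ref_audio_path: str, ref_text: str) -> str:
    # Fall back to Whisper transcription when no reference text is given.
    if ref_text.strip():
        return ref_text  # user-provided reference text wins
    model = whisper.load_model("base")  # illustrative model size
    result = model.transcribe(ref_audio_path)
    return result["text"].strip()
```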
pyproject.toml
CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "f5-tts"
-version = "1.0.
+version = "1.0.5"
 description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
 readme = "README.md"
 license = {text = "MIT License"}
src/f5_tts/infer/utils_infer.py
CHANGED
@@ -302,7 +302,7 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
         non_silent_wave = AudioSegment.silent(duration=0)
         for non_silent_seg in non_silent_segs:
             if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
-                show_info("Audio is over
+                show_info("Audio is over 12s, clipping short. (1)")
                 break
             non_silent_wave += non_silent_seg
 
@@ -314,7 +314,7 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
         non_silent_wave = AudioSegment.silent(duration=0)
         for non_silent_seg in non_silent_segs:
             if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
-                show_info("Audio is over
+                show_info("Audio is over 12s, clipping short. (2)")
                 break
             non_silent_wave += non_silent_seg
 
@@ -323,7 +323,7 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
         # 3. if no proper silence found for clipping
         if len(aseg) > 12000:
             aseg = aseg[:12000]
-            show_info("Audio is over
+            show_info("Audio is over 12s, clipping short. (3)")
 
         aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
         aseg.export(f.name, format="wav")
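All three updated messages come from one clipping strategy: non-silent chunks are accumulated until the audio already exceeds 6 s and the next chunk would push it past 12 s, with a hard cut at 12 s as a last resort. A condensed, self-contained sketch of that logic (helper name and silence-detection parameters are illustrative, not the repo's exact values):

```python
from pydub import AudioSegment
from pydub.silence import split_on_silence

def clip_ref_audio(aseg: AudioSegment) -> AudioSegment:
    # pydub lengths are in milliseconds
    non_silent_segs = split_on_silence(
        aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000
    )
    non_silent_wave = AudioSegment.silent(duration=0)
    for seg in non_silent_segs:
        # already > 6 s and the next chunk would pass 12 s: stop at the boundary
        if len(non_silent_wave) > 6000 and len(non_silent_wave + seg) > 12000:
            break  # "Audio is over 12s, clipping short."
        non_silent_wave += seg
    # last resort: no usable silence boundary found, hard-cut at 12 s
    if len(non_silent_wave) == 0 or len(non_silent_wave) > 12000:
        non_silent_wave = aseg[:12000]
    return non_silent_wave
```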
src/f5_tts/train/finetune_cli.py
CHANGED
@@ -40,15 +40,15 @@ def parse_args():
     parser.add_argument("--grad_accumulation_steps", type=int, default=1, help="Gradient accumulation steps")
     parser.add_argument("--max_grad_norm", type=float, default=1.0, help="Max gradient norm for clipping")
     parser.add_argument("--epochs", type=int, default=100, help="Number of training epochs")
-    parser.add_argument("--num_warmup_updates", type=int, default=
-    parser.add_argument("--save_per_updates", type=int, default=
+    parser.add_argument("--num_warmup_updates", type=int, default=20000, help="Warmup updates")
+    parser.add_argument("--save_per_updates", type=int, default=50000, help="Save checkpoint every N updates")
     parser.add_argument(
         "--keep_last_n_checkpoints",
         type=int,
         default=-1,
         help="-1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints",
     )
-    parser.add_argument("--last_per_updates", type=int, default=
+    parser.add_argument("--last_per_updates", type=int, default=5000, help="Save last checkpoint every N updates")
     parser.add_argument("--finetune", action="store_true", help="Use Finetune")
     parser.add_argument("--pretrain", type=str, default=None, help="the path to the checkpoint")
     parser.add_argument(
@@ -65,7 +65,7 @@ def parse_args():
         action="store_true",
         help="Log inferenced samples per ckpt save updates",
     )
-    parser.add_argument("--logger", type=str, default=None, choices=["wandb", "tensorboard"], help="logger")
+    parser.add_argument("--logger", type=str, default=None, choices=[None, "wandb", "tensorboard"], help="logger")
     parser.add_argument(
         "--bnb_optimizer",
         action="store_true",
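Adding None to choices keeps the declared option list consistent with default=None ("no logger"), so code consuming args.logger can branch cleanly on an unset value. A standalone sketch of the pattern (not the repo's full parser):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--logger", type=str, default=None, choices=[None, "wandb", "tensorboard"], help="logger")

args = parser.parse_args([])  # flag omitted -> args.logger stays None
if args.logger is not None:
    print(f"would configure {args.logger} logging")
else:
    print("no logger configured")
```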
src/f5_tts/train/finetune_gradio.py
CHANGED
@@ -120,11 +120,11 @@ def load_settings(project_name):
     default_settings = {
         "exp_name": "F5TTS_v1_Base",
         "learning_rate": 1e-5,
-        "batch_size_per_gpu":
-        "batch_size_type": "
+        "batch_size_per_gpu": 3200,
+        "batch_size_type": "frame",
         "max_samples": 64,
-        "grad_accumulation_steps":
-        "max_grad_norm": 1,
+        "grad_accumulation_steps": 1,
+        "max_grad_norm": 1.0,
         "epochs": 100,
         "num_warmup_updates": 100,
         "save_per_updates": 500,
@@ -134,8 +134,8 @@ def load_settings(project_name):
         "file_checkpoint_train": "",
         "tokenizer_type": "pinyin",
         "tokenizer_file": "",
-        "mixed_precision": "
-        "logger": "
+        "mixed_precision": "fp16",
+        "logger": "none",
         "bnb_optimizer": False,
     }
 
@@ -361,27 +361,27 @@ def terminate_process(pid):
 
 
 def start_training(
-    dataset_name
-    exp_name
-    learning_rate
-    batch_size_per_gpu
-    batch_size_type
-    max_samples
-    grad_accumulation_steps
-    max_grad_norm
-    epochs
-    num_warmup_updates
-    save_per_updates
-    keep_last_n_checkpoints
-    last_per_updates
-    finetune
-    file_checkpoint_train
-    tokenizer_type
-    tokenizer_file
-    mixed_precision
-    stream
-    logger
-    ch_8bit_adam
+    dataset_name,
+    exp_name,
+    learning_rate,
+    batch_size_per_gpu,
+    batch_size_type,
+    max_samples,
+    grad_accumulation_steps,
+    max_grad_norm,
+    epochs,
+    num_warmup_updates,
+    save_per_updates,
+    keep_last_n_checkpoints,
+    last_per_updates,
+    finetune,
+    file_checkpoint_train,
+    tokenizer_type,
+    tokenizer_file,
+    mixed_precision,
+    stream,
+    logger,
+    ch_8bit_adam,
 ):
     global training_process, tts_api, stop_signal
 
@@ -458,7 +458,10 @@ def start_training(
 
     cmd += f" --tokenizer {tokenizer_type}"
 
-
+    if logger != "none":
+        cmd += f" --logger {logger}"
+
+    cmd += " --log_samples"
 
     if ch_8bit_adam:
         cmd += " --bnb_optimizer"
@@ -515,7 +518,7 @@ def start_training(
         training_process = subprocess.Popen(
             cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1, env=env
         )
-        yield "Training started...", gr.update(interactive=False), gr.update(interactive=True)
+        yield "Training started ...", gr.update(interactive=False), gr.update(interactive=True)
 
         stdout_queue = queue.Queue()
         stderr_queue = queue.Queue()
@@ -584,7 +587,11 @@ def start_training(
                     gr.update(interactive=True),
                 )
             else:
-                yield
+                yield (
+                    "Training complete or paused ...",
+                    gr.update(interactive=False),
+                    gr.update(interactive=True),
+                )
                 break
 
         # Small sleep to prevent CPU thrashing
@@ -598,9 +605,9 @@ def start_training(
             time.sleep(1)
 
         if training_process is None:
-            text_info = "
+            text_info = "Train stopped !"
         else:
-            text_info = "
+            text_info = "Train complete at end !"
 
     except Exception as e:  # Catch all exceptions
         # Ensure that we reset the training process variable in case of an error
@@ -615,11 +622,11 @@ def stop_training():
    global training_process, stop_signal
 
    if training_process is None:
-        return "Train not
+        return "Train not running !", gr.update(interactive=True), gr.update(interactive=False)
    terminate_process_tree(training_process.pid)
    # training_process = None
    stop_signal = True
-    return "
+    return "Train stopped !", gr.update(interactive=True), gr.update(interactive=False)
 
 
 def get_list_projects():
@@ -1128,7 +1135,7 @@ def vocab_check(project_name):
        info = "You can train using your language !"
    else:
        vocab_miss = ",".join(miss_symbols)
-        info = f"The following symbols are missing in your language
+        info = f"The following {len(miss_symbols)} symbols are missing in your language\n\n"
 
    return info, vocab_miss
 
@@ -1215,6 +1222,9 @@ def infer(
 
    print("update >> ", device_test, file_checkpoint, use_ema)
 
+    if seed == -1:  # -1 used for random
+        seed = None
+
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        tts_api.infer(
            ref_file=ref_audio,
@@ -1433,9 +1443,9 @@ Skip this step if you have your dataset, metadata.csv, and a folder wavs with al
    )
 
    audio_speaker = gr.File(label="Voice", type="filepath", file_count="multiple")
-    txt_lang = gr.
+    txt_lang = gr.Textbox(label="Language", value="English")
    bt_transcribe = bt_create = gr.Button("Transcribe")
-    txt_info_transcribe = gr.
+    txt_info_transcribe = gr.Textbox(label="Info", value="")
    bt_transcribe.click(
        fn=transcribe_all,
        inputs=[cm_project, audio_speaker, txt_lang, ch_manual],
@@ -1446,7 +1456,7 @@ Skip this step if you have your dataset, metadata.csv, and a folder wavs with al
    random_sample_transcribe = gr.Button("Random Sample")
 
    with gr.Row():
-        random_text_transcribe = gr.
+        random_text_transcribe = gr.Textbox(label="Text")
        random_audio_transcribe = gr.Audio(label="Audio", type="filepath")
 
    random_sample_transcribe.click(
@@ -1461,7 +1471,7 @@ Check the vocabulary for fine-tuning Emilia_ZH_EN to ensure all symbols are incl
 ```""")
 
    check_button = gr.Button("Check Vocab")
-    txt_info_check = gr.
+    txt_info_check = gr.Textbox(label="Info", value="")
 
    gr.Markdown("""```plaintext
 Using the extended model, you can finetune to a new language that is missing symbols in the vocab. This creates a new model with a new vocabulary size and saves it in your ckpts/project folder.
@@ -1481,7 +1491,7 @@ Using the extended model, you can finetune to a new language that is missing sym
        txt_count_symbol = gr.Textbox(label="New Vocab Size", value="", scale=1)
 
    extend_button = gr.Button("Extend")
-    txt_info_extend = gr.
+    txt_info_extend = gr.Textbox(label="Info", value="")
 
    txt_extend.change(vocab_count, inputs=[txt_extend], outputs=[txt_count_symbol])
    check_button.click(fn=vocab_check, inputs=[cm_project], outputs=[txt_info_check, txt_extend])
@@ -1521,8 +1531,8 @@ Skip this step if you have your dataset, raw.arrow, duration.json, and vocab.txt
    ch_tokenizern = gr.Checkbox(label="Create Vocabulary", value=False, visible=False)
 
    bt_prepare = bt_create = gr.Button("Prepare")
-    txt_info_prepare = gr.
-    txt_vocab_prepare = gr.
+    txt_info_prepare = gr.Textbox(label="Info", value="")
+    txt_vocab_prepare = gr.Textbox(label="Vocab", value="")
 
    bt_prepare.click(
        fn=create_metadata, inputs=[cm_project, ch_tokenizern], outputs=[txt_info_prepare, txt_vocab_prepare]
@@ -1531,7 +1541,7 @@ Skip this step if you have your dataset, raw.arrow, duration.json, and vocab.txt
    random_sample_prepare = gr.Button("Random Sample")
 
    with gr.Row():
-        random_text_prepare = gr.
+        random_text_prepare = gr.Textbox(label="Tokenizer")
        random_audio_prepare = gr.Audio(label="Audio", type="filepath")
 
    random_sample_prepare.click(
@@ -1544,50 +1554,60 @@ The auto-setting is still experimental. Set a large value of epoch if not sure;
 If you encounter a memory error, try reducing the batch size per GPU to a smaller number.
 ```""")
    with gr.Row():
-
-
-
+        exp_name = gr.Radio(label="Model", choices=["F5TTS_v1_Base", "F5TTS_Base", "E2TTS_Base"])
+        tokenizer_file = gr.Textbox(label="Tokenizer File")
+        file_checkpoint_train = gr.Textbox(label="Path to the Pretrained Checkpoint")
 
    with gr.Row():
-        ch_finetune = bt_create = gr.Checkbox(label="Finetune"
-
-
-
-    with gr.Row():
-        exp_name = gr.Radio(
-            label="Model", choices=["F5TTS_v1_Base", "F5TTS_Base", "E2TTS_Base"], value="F5TTS_v1_Base"
-        )
-        learning_rate = gr.Number(label="Learning Rate", value=1e-5, step=1e-5)
-
-    with gr.Row():
-        batch_size_per_gpu = gr.Number(label="Batch Size per GPU", value=3200)
-        max_samples = gr.Number(label="Max Samples", value=64)
+        ch_finetune = bt_create = gr.Checkbox(label="Finetune")
+        lb_samples = gr.Label(label="Samples")
+        bt_calculate = bt_create = gr.Button("Auto Settings")
 
    with gr.Row():
-
-
+        epochs = gr.Number(label="Epochs")
+        learning_rate = gr.Number(label="Learning Rate", step=0.5e-5)
+        max_grad_norm = gr.Number(label="Max Gradient Norm")
+        num_warmup_updates = gr.Number(label="Warmup Updates")
 
    with gr.Row():
-
-
+        batch_size_type = gr.Radio(
+            label="Batch Size Type",
+            choices=["frame", "sample"],
+            info="frame is calculated as seconds * sampling_rate / hop_length",
+        )
+        batch_size_per_gpu = gr.Number(label="Batch Size per GPU", info="N frames or N samples")
+        grad_accumulation_steps = gr.Number(
+            label="Gradient Accumulation Steps", info="Effective batch size is multiplied by this value"
+        )
+        max_samples = gr.Number(label="Max Samples", info="Maximum number of samples per single GPU batch")
 
    with gr.Row():
-        save_per_updates = gr.Number(
+        save_per_updates = gr.Number(
+            label="Save per Updates",
+            info="Save intermediate checkpoints every N updates",
+            minimum=10,
+        )
        keep_last_n_checkpoints = gr.Number(
            label="Keep Last N Checkpoints",
-            value=-1,
            step=1,
            precision=0,
-            info="-1 to keep all, 0 to not save intermediate, > 0 to keep last N
+            info="-1 to keep all, 0 to not save intermediate, > 0 to keep last N",
+            minimum=-1,
        )
-        last_per_updates = gr.Number(
+        last_per_updates = gr.Number(
+            label="Last per Updates",
+            info="Save latest checkpoint with suffix _last.pt every N updates",
+            minimum=10,
+        )
+        gr.Radio(label="")  # placeholder
 
    with gr.Row():
        ch_8bit_adam = gr.Checkbox(label="Use 8-bit Adam optimizer")
-        mixed_precision = gr.Radio(label="
-        cd_logger = gr.Radio(label="
-
-
+        mixed_precision = gr.Radio(label="Mixed Precision", choices=["none", "fp16", "bf16"])
+        cd_logger = gr.Radio(label="Logger", choices=["none", "wandb", "tensorboard"])
+        with gr.Column():
+            start_button = gr.Button("Start Training")
+            stop_button = gr.Button("Stop Training", interactive=False)
 
    if projects_selelect is not None:
        (
@@ -1634,7 +1654,7 @@ If you encounter a memory error, try reducing the batch size per GPU to a smalle
    ch_8bit_adam.value = bnb_optimizer_value
 
    ch_stream = gr.Checkbox(label="Stream Output Experiment", value=True)
-    txt_info_train = gr.
+    txt_info_train = gr.Textbox(label="Info", value="")
 
    list_audios, select_audio = get_audio_project(projects_selelect, False)
@@ -1763,7 +1783,7 @@ If you encounter a memory error, try reducing the batch size per GPU to a smalle
 
    with gr.TabItem("Test Model"):
        gr.Markdown("""```plaintext
-SOS: Check the use_ema setting (True or False) for your model to see what works
+Check the use_ema setting (True or False) for your model to see what works best for you. Set seed to -1 for random.
 ```""")
        exp_name = gr.Radio(
            label="Model", choices=["F5TTS_v1_Base", "F5TTS_Base", "E2TTS_Base"], value="F5TTS_v1_Base"
@@ -1773,11 +1793,13 @@ SOS: Check the use_ema setting (True or False) for your model to see what works
        with gr.Row():
            nfe_step = gr.Number(label="NFE Step", value=32)
            speed = gr.Slider(label="Speed", value=1.0, minimum=0.3, maximum=2.0, step=0.1)
-            seed = gr.Number(label="Seed", value=-1, minimum=-1)
+            seed = gr.Number(label="Random Seed", value=-1, minimum=-1)
            remove_silence = gr.Checkbox(label="Remove Silence")
 
-        ch_use_ema = gr.Checkbox(label="Use EMA", value=True)
        with gr.Row():
+            ch_use_ema = gr.Checkbox(
+                label="Use EMA", value=True, info="Turn off at early stage might offer better results"
+            )
            cm_checkpoint = gr.Dropdown(
                choices=list_checkpoints, value=checkpoint_select, label="Checkpoints", allow_custom_value=True
            )
@@ -1785,20 +1807,20 @@ SOS: Check the use_ema setting (True or False) for your model to see what works
 
        random_sample_infer = gr.Button("Random Sample")
 
-        ref_text = gr.Textbox(label="
-        ref_audio = gr.Audio(label="Audio
-        gen_text = gr.Textbox(label="
+        ref_text = gr.Textbox(label="Reference Text")
+        ref_audio = gr.Audio(label="Reference Audio", type="filepath")
+        gen_text = gr.Textbox(label="Text to Generate")
 
        random_sample_infer.click(
            fn=get_random_sample_infer, inputs=[cm_project], outputs=[ref_text, gen_text, ref_audio]
        )
 
        with gr.Row():
-            txt_info_gpu = gr.Textbox("", label="Device")
-            seed_info = gr.
-            check_button_infer = gr.Button("
+            txt_info_gpu = gr.Textbox("", label="Inference on Device :")
+            seed_info = gr.Textbox(label="Used Random Seed :")
+            check_button_infer = gr.Button("Inference")
 
-        gen_audio = gr.Audio(label="Audio
+        gen_audio = gr.Audio(label="Generated Audio", type="filepath")
 
        check_button_infer.click(
            fn=infer,
@@ -1825,10 +1847,10 @@ SOS: Check the use_ema setting (True or False) for your model to see what works
    gr.Markdown("""```plaintext
 Reduce the Base model size from 5GB to 1.3GB. The new checkpoint file prunes out optimizer and etc., can be used for inference or finetuning afterward, but not able to resume pretraining.
 ```""")
-    txt_path_checkpoint = gr.
-    txt_path_checkpoint_small = gr.
+    txt_path_checkpoint = gr.Textbox(label="Path to Checkpoint:")
+    txt_path_checkpoint_small = gr.Textbox(label="Path to Output:")
    ch_safetensors = gr.Checkbox(label="Safetensors", value="")
-    txt_info_reduse = gr.
+    txt_info_reduse = gr.Textbox(label="Info", value="")
    reduse_button = gr.Button("Reduce")
    reduse_button.click(
        fn=extract_and_save_ema_model,