In [1]:
!python --version
Python 3.10.12
In [2]:
!sudo apt-get -y install espeak-ng > /dev/null 2>&1
!sudo apt-get -y install python3.9-distutils > /dev/null 2>&1
!sudo apt-get -y install python3.9 > /dev/null 2>&1
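espeak-ng supplies the phonemizer backend that coqui-tts relies on for Persian when `use_phonemes=True` (set in the training config below); a quick check that the binary installed correctly:

!espeak-ng --version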
In [3]:
!pip -q install virtualenv > /dev/null 2>&1
In [4]:
!virtualenv -p python3.9 my_env
created virtual environment CPython3.9.21.final.0-64 in 772ms
  creator CPython3Posix(dest=/kaggle/working/my_env, clear=False, no_vcs_ignore=False, global=False)
  seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/root/.local/share/virtualenv)
    added seed packages: pip==24.3.1, setuptools==75.8.0, wheel==0.45.1
  activators BashActivator,CShellActivator,FishActivator,NushellActivator,PowerShellActivator,PythonActivator
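Note that each `!` command runs in its own shell, so activating the venv does not persist between lines or cells; that is why every later cell re-activates with `source my_env/bin/activate; ...`. The venv's interpreter can also be invoked directly:

!my_env/bin/python --version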
In [5]:
!source my_env/bin/activate; pip list
Package    Version
---------- -------
pip        24.3.1
setuptools 75.8.0
wheel      0.45.1
In [6]:
# pip install coqui-tts==0.25.1
In [7]:
!source my_env/bin/activate; pip -q install uv
!source my_env/bin/activate; uv pip -q install coqui-tts==0.25.1
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 16.2/16.2 MB 139.8 MB/s eta 0:00:00
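A quick import check confirms the library landed in the venv rather than in the notebook's Python 3.10 (assuming the installed `TTS` package exposes `__version__`):

!source my_env/bin/activate; python -c "import TTS; print(TTS.__version__)"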
In [8]:
!mkdir -p train_output/kagg
In [9]:
code='''import os

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.shared_configs import BaseDatasetConfig, CharactersConfig
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits, VitsAudioConfig
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

output_path = os.path.dirname(os.path.abspath(__file__))



# DEFINE DATASET CONFIG (Mozilla-style metadata.csv inside the Kaggle dataset)
dataset_config = BaseDatasetConfig(
    formatter="mozilla", meta_file_train="metadata.csv", path="/kaggle/input/persian-tts-dataset"
)



# DEFINE AUDIO CONFIG
audio_config = BaseAudioConfig(
    sample_rate=22050,
    do_trim_silence=False,
    resample=False,
    mel_fmin=0,
    mel_fmax=None
)
# DEFINE CHARACTER SET (Persian characters plus an IPA phoneme inventory)
character_config = CharactersConfig(
  characters='ءابتثجحخدذرزسشصضطظعغفقلمنهويِپچژکگیآأؤإئًَُّ',
  punctuations='!(),-.:;? ̠،؛؟‌<>',
  phonemes='ˈˌːˑpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟaegiouwyɪʊ̩æɑɔəɚɛɝɨ̃ʉʌʍ0123456789"#$%*+/=ABCDEFGHIJKLMNOPRSTUVWXYZ[]^_{}',
  pad="<PAD>",
  eos="<EOS>",
  bos="<BOS>",
  blank="<BLNK>",
  characters_class="TTS.tts.utils.text.characters.IPAPhonemes",
)
# DEFINE TRAINING CONFIG
config = VitsConfig(
    audio=audio_config,
    run_name="vits_fa_female",
    batch_size=32,
    eval_batch_size=16,
    batch_group_size=5,
    num_loader_workers=0,
    num_eval_loader_workers=2,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    save_step=1000,
    text_cleaner="basic_cleaners",
    use_phonemes=True,
    phoneme_language="fa",
    characters=character_config,
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    compute_input_seq_cache=True,
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    test_sentences=[
        ["سلطان محمود در زمستانی سخت به طلخک گفت که: با این جامه ی یک لا در این سرما چه می کنی "],
        ["مردی نزد بقالی آمد و گفت پیاز هم ده تا دهان بدان خو شبوی سازم."],
        ["از مال خود پاره ای گوشت بستان و زیره بایی معطّر بساز"],
        ["یک بار هم از جهنم بگویید."],
        ["یکی اسبی به عاریت خواست"]
    ],
    output_path=output_path,
    datasets=[dataset_config],
)

# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves to the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)

# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# config is updated with the default characters if not defined in the config.
tokenizer, config = TTSTokenizer.init_from_config(config)

# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# init model
model = Vits(config, ap, tokenizer, speaker_manager=None)

# init the trainer and 🚀
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()'''
f=open("train_output/train_vits.py","w",encoding="utf-8")

f.write(code)

f.close()
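Before committing to a long run, a cheap sanity check of the dataset can save time; a minimal sketch, assuming `metadata.csv` sits at the configured dataset path:

with open("/kaggle/input/persian-tts-dataset/metadata.csv", encoding="utf-8") as fh:
    rows = fh.readlines()
print(len(rows), "rows; first:", rows[0].strip())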
In [10]:
#!wget "https://huggingface.co/Kamtera/persian-tts-male-vits/resolve/main/train_vits.py" -O train_output/train_vits.py
!wget "https://huggingface.co/Kamtera/persian-tts-male1-vits/resolve/main/checkpoint_218000.pth" \
    -O train_output/kagg/checkpoint_218000.pth > /dev/null 2>&1
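Loading the checkpoint on CPU verifies the download is intact and shows the saved training step; `weights_only=False` is assumed to be needed since Coqui trainer checkpoints pickle config objects alongside the tensors:

import torch
ckpt = torch.load("train_output/kagg/checkpoint_218000.pth", map_location="cpu", weights_only=False)
print(sorted(ckpt.keys()))
print("step:", ckpt.get("step"))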
In [11]:
!ls -ul 'train_output'
!ls -ul /kaggle/working/
total 8
drwxr-xr-x 2 root root 4096 Jan 25 13:05 kagg
-rw-r--r-- 1 root root 3700 Jan 25 13:05 train_vits.py
total 28
drwxr-xr-x 5 root root  4096 Jan 25 13:04 my_env
---------- 1 root root 17051 Jan 25 13:04 __notebook__.ipynb
drwxr-xr-x 3 root root  4096 Jan 25 13:06 train_output
In [12]:
import torch
torch.cuda.empty_cache()
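`torch.cuda.empty_cache()` only releases memory cached by this kernel's own allocator; memory held by other processes is untouched, so it is worth confirming what the GPU actually has free before launching training:

!nvidia-smi --query-gpu=memory.used,memory.free --format=csv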
In [13]:
!source my_env/bin/activate; PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:512" python "train_output/train_vits.py" \
--restore_path "train_output/kagg/checkpoint_218000.pth" \
--coqpit.run_name "vits-male-finetune" 
fatal: not a git repository (or any parent up to mount point /kaggle)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
fatal: not a git repository (or any parent up to mount point /kaggle)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 4
 | > Num. of Torch Threads: 2
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=/kaggle/working/train_output/vits-male-finetune-January-25-2025_01+06PM-0000000
 > Restoring from checkpoint_218000.pth ...
 > Restoring Model...
 > Restoring Optimizer...
 > Model restored from step 218000

 > Model has 83063980 parameters

 > EPOCH: 0/1000
 --> /kaggle/working/train_output/vits-male-finetune-January-25-2025_01+06PM-0000000
100%|███████████████████████████████████████| 3689/3689 [05:58<00:00, 10.30it/s]

 > TRAINING (2025-01-25 13:13:58) 
 ! Run is removed from /kaggle/working/train_output/vits-male-finetune-January-25-2025_01+06PM-0000000
Traceback (most recent call last):
  File "/kaggle/working/my_env/lib/python3.9/site-packages/trainer/trainer.py", line 1633, in fit
    self._fit()
  File "/kaggle/working/my_env/lib/python3.9/site-packages/trainer/trainer.py", line 1585, in _fit
    self.train_epoch()
  File "/kaggle/working/my_env/lib/python3.9/site-packages/trainer/trainer.py", line 1302, in train_epoch
    outputs, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time)
  File "/kaggle/working/my_env/lib/python3.9/site-packages/trainer/trainer.py", line 1179, in train_step
    outputs, loss_dict_new, step_time = self.optimize(
  File "/kaggle/working/my_env/lib/python3.9/site-packages/trainer/trainer.py", line 1018, in optimize
    outputs, loss_dict = self._compute_loss(
  File "/kaggle/working/my_env/lib/python3.9/site-packages/trainer/trainer.py", line 947, in _compute_loss
    outputs, loss_dict = self._model_train_step(batch, model, criterion, optimizer_idx=optimizer_idx)
  File "/kaggle/working/my_env/lib/python3.9/site-packages/trainer/trainer.py", line 896, in _model_train_step
    return model.train_step(*input_args)
  File "/kaggle/working/my_env/lib/python3.9/site-packages/TTS/tts/models/vits.py", line 1110, in train_step
    scores_disc_fake, _, scores_disc_real, _ = self.disc(
  File "/kaggle/working/my_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/kaggle/working/my_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/kaggle/working/my_env/lib/python3.9/site-packages/TTS/tts/layers/vits/discriminator.py", line 82, in forward
    x_score, x_feat = net(x)
  File "/kaggle/working/my_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/kaggle/working/my_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/kaggle/working/my_env/lib/python3.9/site-packages/TTS/vocoder/models/hifigan_discriminator.py", line 69, in forward
    x = l(x)
  File "/kaggle/working/my_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/kaggle/working/my_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/kaggle/working/my_env/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 554, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "/kaggle/working/my_env/lib/python3.9/site-packages/torch/nn/utils/parametrize.py", line 407, in get_parametrized
    return parametrization()
  File "/kaggle/working/my_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/kaggle/working/my_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/kaggle/working/my_env/lib/python3.9/site-packages/torch/nn/utils/parametrize.py", line 303, in forward
    x = self[0](*originals)
  File "/kaggle/working/my_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/kaggle/working/my_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/kaggle/working/my_env/lib/python3.9/site-packages/torch/nn/utils/parametrizations.py", line 325, in forward
    return torch._weight_norm(weight_v, weight_g, self.dim)
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 35.12 MiB is free. Process 4102 has 15.85 GiB memory in use. Of the allocated memory 14.83 GiB is allocated by PyTorch, and 730.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
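The run restores the 218k-step checkpoint but runs out of memory in the discriminator forward pass on the 16 GiB GPU. A retry sketch that follows the allocator hint from the error message and halves the batch size; `--coqpit.batch_size` uses the same coqpit CLI override mechanism as `--coqpit.run_name` above, and `batch_size=16` is an assumption to tune against the available memory:

!source my_env/bin/activate; PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" python "train_output/train_vits.py" \
    --restore_path "train_output/kagg/checkpoint_218000.pth" \
    --coqpit.run_name "vits-male-finetune" \
    --coqpit.batch_size 16

Enabling `mixed_precision=True` in the config is another common lever for memory pressure, at some risk to GAN training stability.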
In [ ]: