grupafox committed on
Commit 90ca2ad · 1 Parent(s): 8d1ea49

Upload 16 files

Files changed (16)
  1. .gitignore +12 -0
  2. FUNDING.yml +1 -0
  3. LICENSE.md +33 -0
  4. README.md +190 -3
  5. clone_voice.ipynb +255 -0
  6. generate.ipynb +173 -0
  7. generate_chunked.ipynb +344 -0
  8. model-card.md +40 -0
  9. pyproject.toml +58 -0
  10. rvc_infer.py +169 -0
  11. rvc_test.ipynb +85 -0
  12. setup.py +3 -0
  13. test_models.ipynb +454 -0
  14. train_coarse.ipynb +936 -0
  15. train_fine.ipynb +919 -0
  16. train_semantic.ipynb +899 -0
.gitignore ADDED
@@ -0,0 +1,12 @@
+ __pycache__/
+ *.wav
+ _temp/
+ models/
+ wandb/
+ *_output/
+ output.npz
+ joe_biden_state_of_union/
+ Retrieval-based-Voice-Conversion-WebUI/
+ devin-youtube/
+ train_rvc.ipynb
+ *.pt
FUNDING.yml ADDED
@@ -0,0 +1 @@
+ github: serp-ai
LICENSE.md ADDED
@@ -0,0 +1,33 @@
+ MIT License
+
+ Copyright (c) 2023 [SERP](https://serp.co/) | [SERP AI](https://serp.ai/) | [DS](https://devinschumacher.com/)
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+ # A humble request
+
+ Our mission is to make artificial intelligence accessible & enjoyable, so we can all build bridges to the future, together.
+
+ Please feel free to use this as you see fit, in accordance with the law and, ideally, in line with our values of accessibility, equality & AI for all.
+
+ We have only one humble request (not a requirement): that you represent these values by adding one of our (extremely awesome) AI badges to your website / GitHub / etc.
+
+ 👉 You can generate & customize your own here: [https://serp.ly/@serpai/badges/ai](https://serp.ly/@serpai/badges/ai)
+
+ Thank you!
README.md CHANGED
@@ -1,3 +1,190 @@
- ---
- license: apache-2.0
- ---
+ # 🐶 BARK AI: but with the ability to use voice cloning on custom audio samples
+
+ For RVC, `git clone https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI` and train your own model, or point the code to an existing model (the RVC repo must be cloned inside the bark-with-voice-clone directory).
+
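As a rough sketch of how the RVC step is wired up in this repo (condensed from `rvc_test.ipynb`; the weight/index paths and tuning values below are just the example placeholders used there):

```python
# Condensed from rvc_test.ipynb: convert a Bark-generated WAV with a trained RVC model.
# Paths and settings are example placeholders; point them at your own model.
from rvc_infer import get_vc, vc_single

model_path = "Retrieval-based-Voice-Conversion-WebUI/weights/mi-test.pth"
index_path = "Retrieval-based-Voice-Conversion-WebUI/logs/mi-test/added_IVF256_Flat_nprobe_1_mi-test_v2.index"

get_vc(model_path, "cuda:0", True)  # load the RVC checkpoint (device, half precision)

converted = vc_single(
    0,                   # speaker id
    "output/audio.wav",  # WAV previously generated by Bark
    -6,                  # f0 up-key (pitch shift in semitones)
    None,                # optional f0 file
    "harvest",           # f0 method: "harvest" or "pm"
    index_path,
    0.75,                # index rate
    filter_radius=3,
    resample_sr=48000,
    rms_mix_rate=0.25,
    protect=0.33,
)
```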
+ If you want to clone a voice, just follow the `clone_voice.ipynb` notebook. If you want to generate audio from text, follow the `generate.ipynb` notebook.
+
+ To create a voice clone sample, you need an audio sample of around 5-12 seconds (see the condensed sketch below).
+
+ You will get the best results by making generations with your cloned voice until you find one that is really close to the source, then using that as the new history prompt (since it comes from the model itself, it should theoretically be more consistent).
+
+ - [BARK text to speech @ SERP AI](https://serp.ai/tools/bark-text-to-speech-ai-voice-clone-app/)
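Cloning a voice boils down to turning your sample into an `.npz` history prompt. A condensed sketch of what `clone_voice.ipynb` does (file names and paths are placeholders; the HuBERT checkpoints are fetched by `HuBERTManager` in the notebook):

```python
import numpy as np
import torch
import torchaudio
from bark.generation import load_codec_model
from encodec.utils import convert_audio
from hubert.pre_kmeans_hubert import CustomHubert
from hubert.customtokenizer import CustomTokenizer

device = 'cuda'  # or 'cpu'
model = load_codec_model(use_gpu=(device == 'cuda'))

# HuBERT + tokenizer from gitmylo's bark-voice-cloning-HuBERT-quantizer
hubert_model = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt').to(device)
tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth').to(device)

# load the ~5-12 second sample you want to clone
wav, sr = torchaudio.load('audio.wav')
wav = convert_audio(wav, sr, model.sample_rate, model.channels).to(device)

# semantic tokens from HuBERT, acoustic codes from EnCodec
semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)
semantic_tokens = tokenizer.get_token(semantic_vectors)
with torch.no_grad():
    encoded_frames = model.encode(wav.unsqueeze(0))
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()

# save as a history prompt; pass voice_name='output' to generate_audio later
np.savez('bark/assets/prompts/output.npz',
         fine_prompt=codes.cpu().numpy(),
         coarse_prompt=codes[:2, :].cpu().numpy(),
         semantic_prompt=semantic_tokens.cpu().numpy())
```

The reverse trick mentioned above is shown in `generate_chunked.ipynb`: each `full_generation` can be saved back to an `.npz` and reused as the history prompt for the next chunk.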
+
+ # Contributors
+
+ Huge shoutout & thank you to:
+
+ [gitmylo](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/)
+ for the solution to the semantic token generation for better voice clones and finetunes (HuBERT, etc.)
+
+ ***
+
+ <div style="display: flex; flex-wrap: wrap;">
+ <a href="https://github.com/francislabountyjr" target="_blank" style="margin: 5px; display: inline-block;"><img src="https://avatars.githubusercontent.com/u/73464335?v=4" alt="francislabountyjr" style="border-radius: 50%; width: 75px; height: 75px;"></a>
+ <a href="https://github.com/gkucsko" target="_blank" style="margin: 5px; display: inline-block;"><img src="https://avatars.githubusercontent.com/u/5068315?v=4" alt="gkucsko" style="border-radius: 50%; width: 75px; height: 75px;"></a>
+ <a href="https://github.com/kmfreyberg" target="_blank" style="margin: 5px; display: inline-block;"><img src="https://avatars.githubusercontent.com/u/32879321?v=4" alt="kmfreyberg" style="border-radius: 50%; width: 75px; height: 75px;"></a>
+ <a href="https://github.com/Vaibhavs10" target="_blank" style="margin: 5px; display: inline-block;"><img src="https://avatars.githubusercontent.com/u/18682411?v=4" alt="Vaibhavs10" style="border-radius: 50%; width: 75px; height: 75px;"></a>
+ <a href="https://github.com/devinschumacher" target="_blank" style="margin: 5px; display: inline-block;"><img src="https://avatars.githubusercontent.com/u/45643901?v=4" alt="devinschumacher" style="border-radius: 50%; width: 75px; height: 75px;"></a>
+ <a href="https://github.com/mcamac" target="_blank" style="margin: 5px; display: inline-block;"><img src="https://avatars.githubusercontent.com/u/461009?v=4" alt="mcamac" style="border-radius: 50%; width: 75px; height: 75px;"></a>
+ <a href="https://github.com/fiq" target="_blank" style="margin: 5px; display: inline-block;"><img src="https://avatars.githubusercontent.com/u/236293?v=4" alt="fiq" style="border-radius: 50%; width: 75px; height: 75px;"></a>
+ <a href="https://github.com/zygi" target="_blank" style="margin: 5px; display: inline-block;"><img src="https://avatars.githubusercontent.com/u/2059901?v=4" alt="zygi" style="border-radius: 50%; width: 75px; height: 75px;"></a>
+ <a href="https://github.com/jn-jairo" target="_blank" style="margin: 5px; display: inline-block;"><img src="https://avatars.githubusercontent.com/u/5104869?v=4" alt="jn-jairo" style="border-radius: 50%; width: 75px; height: 75px;"></a>
+ <a href="https://github.com/gitmylo" target="_blank" style="margin: 5px; display: inline-block;"><img src="https://avatars.githubusercontent.com/u/36931363?v=4" alt="gitmylo" style="border-radius: 50%; width: 75px; height: 75px;"></a>
+ <a href="https://github.com/alyxdow" target="_blank" style="margin: 5px; display: inline-block;"><img src="https://avatars.githubusercontent.com/u/84633629?v=4" alt="alyxdow" style="border-radius: 50%; width: 75px; height: 75px;"></a>
+ <a href="https://github.com/mikeyshulman" target="_blank" style="margin: 5px; display: inline-block;"><img src="https://avatars.githubusercontent.com/u/2565833?v=4" alt="mikeyshulman" style="border-radius: 50%; width: 75px; height: 75px;"></a>
+ </div>
+
+ -------------------------------------------------------------------
+ # Original README.md
+ ## 🤖 Usage
+
+ ```python
+ from bark import SAMPLE_RATE, generate_audio, preload_models
+ from IPython.display import Audio
+
+ # download and load all models
+ preload_models()
+
+ # generate audio from text
+ text_prompt = """
+ Hello, my name is Serpy. And, uh — and I like pizza. [laughs]
+ But I also have other interests such as playing tic tac toe.
+ """
+ audio_array = generate_audio(text_prompt)
+
+ # play text in notebook
+ Audio(audio_array, rate=SAMPLE_RATE)
+ ```
+
+ [pizza.webm](https://user-images.githubusercontent.com/5068315/230490503-417e688d-5115-4eee-9550-b46a2b465ee3.webm)
+
+ To save `audio_array` as a WAV file:
+
+ ```python
+ from scipy.io.wavfile import write as write_wav
+
+ write_wav("/path/to/audio.wav", SAMPLE_RATE, audio_array)
+ ```
+
+ ### 🌎 Foreign Language
+
+ Bark supports various languages out-of-the-box and automatically determines language from input text. When prompted with code-switched text, Bark will attempt to employ the native accent for the respective languages. English quality is best for the time being, and we expect other languages to further improve with scaling.
+
+ ```python
+ text_prompt = """
+ Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo.
+ But I suppose your english isn't terrible.
+ """
+ audio_array = generate_audio(text_prompt)
+ ```
+
+ [miguel.webm](https://user-images.githubusercontent.com/5068315/230684752-10baadfe-1e7c-46a2-8323-43282aef2c8c.webm)
+
+ ### 🎶 Music
+
+ Bark can generate all types of audio, and, in principle, doesn't see a difference between speech and music. Sometimes Bark chooses to generate text as music, but you can help it out by adding music notes around your lyrics.
+
+ ```python
+ text_prompt = """
+ ♪ In the jungle, the mighty jungle, the lion barks tonight ♪
+ """
+ audio_array = generate_audio(text_prompt)
+ ```
+
+ [lion.webm](https://user-images.githubusercontent.com/5068315/230684766-97f5ea23-ad99-473c-924b-66b6fab24289.webm)
+
+ ### 🎤 Voice Presets and Voice/Audio Cloning
+
+ Bark has the capability to fully clone voices - including tone, pitch, emotion and prosody. The model also attempts to preserve music, ambient noise, etc. from input audio. However, to mitigate misuse of this technology, we limit the audio history prompts to a set of fully synthetic, Suno-provided options for each language. Specify a preset following the pattern: `{lang_code}_speaker_{0-9}`.
+
+ ```python
+ text_prompt = """
+ I have a silky smooth voice, and today I will tell you about
+ the exercise regimen of the common sloth.
+ """
+ audio_array = generate_audio(text_prompt, history_prompt="en_speaker_1")
+ ```
+
+ [sloth.webm](https://user-images.githubusercontent.com/5068315/230684883-a344c619-a560-4ff5-8b99-b4463a34487b.webm)
+
+ *Note: since Bark recognizes languages automatically from input text, it is possible to use, for example, a German history prompt with English text. This usually leads to English audio with a German accent.*
+
+ ### 👥 Speaker Prompts
+
+ You can provide certain speaker prompts such as NARRATOR, MAN, WOMAN, etc. Please note that these are not always respected, especially if a conflicting audio history prompt is given.
+
+ ```python
+ text_prompt = """
+ WOMAN: I would like an oatmilk latte please.
+ MAN: Wow, that's expensive!
+ """
+ audio_array = generate_audio(text_prompt)
+ ```
+
+ [latte.webm](https://user-images.githubusercontent.com/5068315/230684864-12d101a1-a726-471d-9d56-d18b108efcb8.webm)
+
+ ## 💻 Installation
+
+ ```
+ pip install git+https://github.com/suno-ai/bark.git
+ ```
+
+ or
+
+ ```
+ git clone https://github.com/suno-ai/bark
+ cd bark && pip install .
+ ```
+
+ ## 🛠️ Hardware and Inference Speed
+
+ Bark has been tested and works on both CPU and GPU (`pytorch 2.0+`, CUDA 11.7 and CUDA 12.0).
+ Running Bark requires running >100M parameter transformer models.
+ On modern GPUs and PyTorch nightly, Bark can generate audio in roughly realtime. On older GPUs, default colab, or CPU, inference time might be 10-100x slower.
+
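If VRAM or speed is a concern (older GPUs, default Colab, CPU), the `preload_models` call used throughout the notebooks here also exposes `*_use_small` flags. A sketch, assuming you opt into the smaller checkpoints (the notebooks themselves leave these set to `False`):

```python
from bark.generation import preload_models

# Sketch: load the smaller text/coarse/fine checkpoints to cut memory use and latency,
# at some cost in quality. Flag names mirror the preload_models call in the notebooks here.
preload_models(
    text_use_gpu=True,
    text_use_small=True,     # assumption: opt into the small text (semantic) model
    coarse_use_gpu=True,
    coarse_use_small=True,   # assumption: small coarse model
    fine_use_gpu=True,
    fine_use_small=True,     # assumption: small fine model
    codec_use_gpu=True,
    force_reload=False,
    path="models",           # local model cache dir used by the notebooks in this repo
)
```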
+ ## ⚙️ Details
+
+ Similar to [Vall-E](https://arxiv.org/abs/2301.02111) and some other amazing work in the field, Bark uses GPT-style
+ models to generate audio from scratch. Different from Vall-E, the initial text prompt is embedded into high-level semantic tokens without the use of phonemes. It can therefore generalize to arbitrary instructions beyond speech that occur in the training data, such as music lyrics, sound effects or other non-speech sounds. A second model is then used to convert the generated semantic tokens into audio codec tokens to generate the full waveform. To enable the community to use Bark via public code, we used the fantastic
+ [EnCodec codec](https://github.com/facebookresearch/encodec) from Facebook to act as an audio representation.
+
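The notebooks in this repo expose the same pipeline step by step. A condensed sketch of the semantic → coarse → fine → decode path, taken from `generate.ipynb` (the prompt text and preset name are just example values):

```python
from bark.generation import (
    SAMPLE_RATE, preload_models, generate_text_semantic,
    generate_coarse, generate_fine, codec_decode,
)
from scipy.io.wavfile import write as write_wav

preload_models()  # download/load the text, coarse, fine and codec models

text_prompt = "Hello, my name is Serpy. And, uh — and I like pizza. [laughs]"
voice_name = "en_speaker_0"  # or the name of a cloned .npz history prompt

# text -> semantic tokens
x_semantic = generate_text_semantic(text_prompt, history_prompt=voice_name,
                                    temp=0.7, top_k=50, top_p=0.95)
# semantic tokens -> coarse EnCodec codebooks (first two)
x_coarse = generate_coarse(x_semantic, history_prompt=voice_name,
                           temp=0.7, top_k=50, top_p=0.95)
# coarse -> all fine codebooks
x_fine = generate_fine(x_coarse, history_prompt=voice_name, temp=0.5)
# fine codes -> waveform via EnCodec
audio_array = codec_decode(x_fine)
write_wav("audio.wav", SAMPLE_RATE, audio_array)
```

Running the three stages separately is also what makes finetuning and chunked long-form generation possible (see `generate_chunked.ipynb`), since each stage's output can be cached and reused as a history prompt.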
+ Below is a list of some known non-speech sounds:
+
+ - `[laughter]`
+ - `[laughs]`
+ - `[sighs]`
+ - `[music]`
+ - `[gasps]`
+ - `[clears throat]`
+ - `—` or `...` for hesitations
+ - `♪` for song lyrics
+ - capitalization for emphasis of a word
+ - `MAN/WOMAN:` for bias towards speaker
+
+ **Supported Languages**
+
+ | Language | Status |
+ | --- | --- |
+ | English (en) | ✅ |
+ | German (de) | ✅ |
+ | Spanish (es) | ✅ |
+ | French (fr) | ✅ |
+ | Hindi (hi) | ✅ |
+ | Italian (it) | ✅ |
+ | Japanese (ja) | ✅ |
+ | Korean (ko) | ✅ |
+ | Polish (pl) | ✅ |
+ | Portuguese (pt) | ✅ |
+ | Russian (ru) | ✅ |
+ | Turkish (tr) | ✅ |
+ | Chinese, simplified (zh) | ✅ |
+ | Arabic | Coming soon! |
+ | Bengali | Coming soon! |
+ | Telugu | Coming soon! |
clone_voice.ipynb ADDED
@@ -0,0 +1,255 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from bark.generation import load_codec_model, generate_text_semantic\n",
10
+ "from encodec.utils import convert_audio\n",
11
+ "\n",
12
+ "import torchaudio\n",
13
+ "import torch\n",
14
+ "\n",
15
+ "device = 'cuda' # or 'cpu'\n",
16
+ "model = load_codec_model(use_gpu=True if device == 'cuda' else False)"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer\n",
26
+ "from hubert.hubert_manager import HuBERTManager\n",
27
+ "hubert_manager = HuBERTManager()\n",
28
+ "hubert_manager.make_sure_hubert_installed()\n",
29
+ "hubert_manager.make_sure_tokenizer_installed()"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer \n",
39
+ "# Load HuBERT for semantic tokens\n",
40
+ "from hubert.pre_kmeans_hubert import CustomHubert\n",
41
+ "from hubert.customtokenizer import CustomTokenizer\n",
42
+ "\n",
43
+ "# Load the HuBERT model\n",
44
+ "hubert_model = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt').to(device)\n",
45
+ "\n",
46
+ "# Load the CustomTokenizer model\n",
47
+ "tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth').to(device) # Automatically uses the right layers"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": null,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "# Load and pre-process the audio waveform\n",
57
+ "audio_filepath = 'audio.wav' # the audio you want to clone (under 13 seconds)\n",
58
+ "wav, sr = torchaudio.load(audio_filepath)\n",
59
+ "wav = convert_audio(wav, sr, model.sample_rate, model.channels)\n",
60
+ "wav = wav.to(device)"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": null,
66
+ "metadata": {},
67
+ "outputs": [],
68
+ "source": [
69
+ "semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)\n",
70
+ "semantic_tokens = tokenizer.get_token(semantic_vectors)"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": null,
76
+ "metadata": {},
77
+ "outputs": [],
78
+ "source": [
79
+ "# Extract discrete codes from EnCodec\n",
80
+ "with torch.no_grad():\n",
81
+ " encoded_frames = model.encode(wav.unsqueeze(0))\n",
82
+ "codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": null,
88
+ "metadata": {},
89
+ "outputs": [],
90
+ "source": [
91
+ "# move codes to cpu\n",
92
+ "codes = codes.cpu().numpy()\n",
93
+ "# move semantic tokens to cpu\n",
94
+ "semantic_tokens = semantic_tokens.cpu().numpy()"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": null,
100
+ "metadata": {},
101
+ "outputs": [],
102
+ "source": [
103
+ "import numpy as np\n",
104
+ "voice_name = 'output' # whatever you want the name of the voice to be\n",
105
+ "output_path = 'bark/assets/prompts/' + voice_name + '.npz'\n",
106
+ "np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": null,
112
+ "metadata": {},
113
+ "outputs": [],
114
+ "source": [
115
+ "# That's it! Now you can head over to the generate.ipynb and use your voice_name for the 'history_prompt'"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": null,
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": []
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": null,
128
+ "metadata": {},
129
+ "outputs": [],
130
+ "source": [
131
+ "# Heres the generation stuff copy-pasted for convenience"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": null,
137
+ "metadata": {},
138
+ "outputs": [],
139
+ "source": [
140
+ "from bark.api import generate_audio\n",
141
+ "from transformers import BertTokenizer\n",
142
+ "from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic\n",
143
+ "\n",
144
+ "# Enter your prompt and speaker here\n",
145
+ "text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\"\n",
146
+ "voice_name = \"output\" # use your custom voice name here if you have one"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": null,
152
+ "metadata": {},
153
+ "outputs": [],
154
+ "source": [
155
+ "# download and load all models\n",
156
+ "preload_models(\n",
157
+ " text_use_gpu=True,\n",
158
+ " text_use_small=False,\n",
159
+ " coarse_use_gpu=True,\n",
160
+ " coarse_use_small=False,\n",
161
+ " fine_use_gpu=True,\n",
162
+ " fine_use_small=False,\n",
163
+ " codec_use_gpu=True,\n",
164
+ " force_reload=False,\n",
165
+ " path=\"models\"\n",
166
+ ")"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": null,
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "# simple generation\n",
176
+ "audio_array = generate_audio(text_prompt, history_prompt=voice_name, text_temp=0.7, waveform_temp=0.7)"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": null,
182
+ "metadata": {},
183
+ "outputs": [],
184
+ "source": [
185
+ "# generation with more control\n",
186
+ "x_semantic = generate_text_semantic(\n",
187
+ " text_prompt,\n",
188
+ " history_prompt=voice_name,\n",
189
+ " temp=0.7,\n",
190
+ " top_k=50,\n",
191
+ " top_p=0.95,\n",
192
+ ")\n",
193
+ "\n",
194
+ "x_coarse_gen = generate_coarse(\n",
195
+ " x_semantic,\n",
196
+ " history_prompt=voice_name,\n",
197
+ " temp=0.7,\n",
198
+ " top_k=50,\n",
199
+ " top_p=0.95,\n",
200
+ ")\n",
201
+ "x_fine_gen = generate_fine(\n",
202
+ " x_coarse_gen,\n",
203
+ " history_prompt=voice_name,\n",
204
+ " temp=0.5,\n",
205
+ ")\n",
206
+ "audio_array = codec_decode(x_fine_gen)"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "execution_count": null,
212
+ "metadata": {},
213
+ "outputs": [],
214
+ "source": [
215
+ "from IPython.display import Audio\n",
216
+ "# play audio\n",
217
+ "Audio(audio_array, rate=SAMPLE_RATE)"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "code",
222
+ "execution_count": null,
223
+ "metadata": {},
224
+ "outputs": [],
225
+ "source": [
226
+ "from scipy.io.wavfile import write as write_wav\n",
227
+ "# save audio\n",
228
+ "filepath = \"/output/audio.wav\" # change this to your desired output path\n",
229
+ "write_wav(filepath, SAMPLE_RATE, audio_array)"
230
+ ]
231
+ }
232
+ ],
233
+ "metadata": {
234
+ "kernelspec": {
235
+ "display_name": "Python 3",
236
+ "language": "python",
237
+ "name": "python3"
238
+ },
239
+ "language_info": {
240
+ "codemirror_mode": {
241
+ "name": "ipython",
242
+ "version": 3
243
+ },
244
+ "file_extension": ".py",
245
+ "mimetype": "text/x-python",
246
+ "name": "python",
247
+ "nbconvert_exporter": "python",
248
+ "pygments_lexer": "ipython3",
249
+ "version": "3.10.8"
250
+ },
251
+ "orig_nbformat": 4
252
+ },
253
+ "nbformat": 4,
254
+ "nbformat_minor": 2
255
+ }
generate.ipynb ADDED
@@ -0,0 +1,173 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from IPython.display import Audio\n",
10
+ "from scipy.io.wavfile import write as write_wav\n",
11
+ "\n",
12
+ "from bark.api import generate_audio\n",
13
+ "from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": null,
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "semantic_path = \"semantic_output/pytorch_model.bin\" # set to None if you don't want to use finetuned semantic\n",
23
+ "coarse_path = \"coarse_output/pytorch_model.bin\" # set to None if you don't want to use finetuned coarse\n",
24
+ "fine_path = \"fine_output/pytorch_model.bin\" # set to None if you don't want to use finetuned fine\n",
25
+ "use_rvc = True # Set to False to use bark without RVC\n",
26
+ "rvc_name = 'mi-test'\n",
27
+ "rvc_path = f\"Retrieval-based-Voice-Conversion-WebUI/weights/{rvc_name}.pth\"\n",
28
+ "index_path = f\"Retrieval-based-Voice-Conversion-WebUI/logs/{rvc_name}/added_IVF256_Flat_nprobe_1_{rvc_name}_v2.index\"\n",
29
+ "device=\"cuda:0\"\n",
30
+ "is_half=True"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": null,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "# download and load all models\n",
40
+ "preload_models(\n",
41
+ " text_use_gpu=True,\n",
42
+ " text_use_small=False,\n",
43
+ " text_model_path=semantic_path,\n",
44
+ " coarse_use_gpu=True,\n",
45
+ " coarse_use_small=False,\n",
46
+ " coarse_model_path=coarse_path,\n",
47
+ " fine_use_gpu=True,\n",
48
+ " fine_use_small=False,\n",
49
+ " fine_model_path=fine_path,\n",
50
+ " codec_use_gpu=True,\n",
51
+ " force_reload=False,\n",
52
+ " path=\"models\"\n",
53
+ ")\n",
54
+ "\n",
55
+ "if use_rvc:\n",
56
+ " from rvc_infer import get_vc, vc_single\n",
57
+ " get_vc(rvc_path, device, is_half)"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": null,
63
+ "metadata": {},
64
+ "outputs": [],
65
+ "source": [
66
+ "# simple generation\n",
67
+ "text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\"\n",
68
+ "voice_name = \"speaker_0\" # use your custom voice name here if you have on\n",
69
+ "\n",
70
+ "filepath = \"output/audio.wav\"\n",
71
+ "audio_array = generate_audio(text_prompt, history_prompt=voice_name, text_temp=0.7, waveform_temp=0.7)\n",
72
+ "write_wav(filepath, SAMPLE_RATE, audio_array)\n",
73
+ "\n",
74
+ "if use_rvc:\n",
75
+ " index_rate = 0.75\n",
76
+ " f0up_key = -6\n",
77
+ " filter_radius = 3\n",
78
+ " rms_mix_rate = 0.25\n",
79
+ " protect = 0.33\n",
80
+ " resample_sr = SAMPLE_RATE\n",
81
+ " f0method = \"harvest\" #harvest or pm\n",
82
+ " try:\n",
83
+ " audio_array = vc_single(0,filepath,f0up_key,None,f0method,index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
84
+ " except:\n",
85
+ " audio_array = vc_single(0,filepath,f0up_key,None,'pm',index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
86
+ " write_wav(filepath, SAMPLE_RATE, audio_array)\n",
87
+ "\n",
88
+ "Audio(audio_array, rate=SAMPLE_RATE)"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": null,
94
+ "metadata": {},
95
+ "outputs": [],
96
+ "source": [
97
+ "# generation with more control\n",
98
+ "text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\"\n",
99
+ "voice_name = \"speaker_0\" # use your custom voice name here if you have on\n",
100
+ "\n",
101
+ "filepath = \"output/audio.wav\"\n",
102
+ "\n",
103
+ "x_semantic = generate_text_semantic(\n",
104
+ " text_prompt,\n",
105
+ " history_prompt=voice_name,\n",
106
+ " temp=0.7,\n",
107
+ " top_k=50,\n",
108
+ " top_p=0.95,\n",
109
+ ")\n",
110
+ "\n",
111
+ "x_coarse_gen = generate_coarse(\n",
112
+ " x_semantic,\n",
113
+ " history_prompt=voice_name,\n",
114
+ " temp=0.7,\n",
115
+ " top_k=50,\n",
116
+ " top_p=0.95,\n",
117
+ ")\n",
118
+ "x_fine_gen = generate_fine(\n",
119
+ " x_coarse_gen,\n",
120
+ " history_prompt=voice_name,\n",
121
+ " temp=0.5,\n",
122
+ ")\n",
123
+ "audio_array = codec_decode(x_fine_gen)\n",
124
+ "write_wav(filepath, SAMPLE_RATE, audio_array)\n",
125
+ "\n",
126
+ "if use_rvc:\n",
127
+ " index_rate = 0.75\n",
128
+ " f0up_key = -6\n",
129
+ " filter_radius = 3\n",
130
+ " rms_mix_rate = 0.25\n",
131
+ " protect = 0.33\n",
132
+ " resample_sr = SAMPLE_RATE\n",
133
+ " f0method = \"harvest\" #harvest or pm\n",
134
+ " try:\n",
135
+ " audio_array = vc_single(0,filepath,f0up_key,None,f0method,index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
136
+ " except:\n",
137
+ " audio_array = vc_single(0,filepath,f0up_key,None,'pm',index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
138
+ " write_wav(filepath, SAMPLE_RATE, audio_array)\n",
139
+ "\n",
140
+ "Audio(audio_array, rate=SAMPLE_RATE)"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": null,
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": []
149
+ }
150
+ ],
151
+ "metadata": {
152
+ "kernelspec": {
153
+ "display_name": "Python 3",
154
+ "language": "python",
155
+ "name": "python3"
156
+ },
157
+ "language_info": {
158
+ "codemirror_mode": {
159
+ "name": "ipython",
160
+ "version": 3
161
+ },
162
+ "file_extension": ".py",
163
+ "mimetype": "text/x-python",
164
+ "name": "python",
165
+ "nbconvert_exporter": "python",
166
+ "pygments_lexer": "ipython3",
167
+ "version": "3.10.8"
168
+ },
169
+ "orig_nbformat": 4
170
+ },
171
+ "nbformat": 4,
172
+ "nbformat_minor": 2
173
+ }
generate_chunked.ipynb ADDED
@@ -0,0 +1,344 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from IPython.display import Audio\n",
10
+ "from scipy.io.wavfile import write as write_wav\n",
11
+ "\n",
12
+ "from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": null,
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "semantic_path = \"semantic_output/pytorch_model.bin\" # set to None if you don't want to use finetuned semantic\n",
22
+ "coarse_path = \"coarse_output/pytorch_model.bin\" # set to None if you don't want to use finetuned coarse\n",
23
+ "fine_path = \"fine_output/pytorch_model.bin\" # set to None if you don't want to use finetuned fine\n",
24
+ "use_rvc = True # Set to False to use bark without RVC\n",
25
+ "rvc_name = 'mi-test'\n",
26
+ "rvc_path = f\"Retrieval-based-Voice-Conversion-WebUI/weights/{rvc_name}.pth\"\n",
27
+ "index_path = f\"Retrieval-based-Voice-Conversion-WebUI/logs/{rvc_name}/added_IVF256_Flat_nprobe_1_{rvc_name}_v2.index\" \n",
28
+ "device=\"cuda:0\"\n",
29
+ "is_half=True"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "import re\n",
39
+ "def split_and_recombine_text(text, desired_length=100, max_length=150):\n",
40
+ " # from https://github.com/neonbjb/tortoise-tts\n",
41
+ " \"\"\"Split text it into chunks of a desired length trying to keep sentences intact.\"\"\"\n",
42
+ " # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii\n",
43
+ " text = re.sub(r\"\\n\\n+\", \"\\n\", text)\n",
44
+ " text = re.sub(r\"\\s+\", \" \", text)\n",
45
+ " text = re.sub(r\"[“”]\", '\"', text)\n",
46
+ "\n",
47
+ " rv = []\n",
48
+ " in_quote = False\n",
49
+ " current = \"\"\n",
50
+ " split_pos = []\n",
51
+ " pos = -1\n",
52
+ " end_pos = len(text) - 1\n",
53
+ "\n",
54
+ " def seek(delta):\n",
55
+ " nonlocal pos, in_quote, current\n",
56
+ " is_neg = delta < 0\n",
57
+ " for _ in range(abs(delta)):\n",
58
+ " if is_neg:\n",
59
+ " pos -= 1\n",
60
+ " current = current[:-1]\n",
61
+ " else:\n",
62
+ " pos += 1\n",
63
+ " current += text[pos]\n",
64
+ " if text[pos] == '\"':\n",
65
+ " in_quote = not in_quote\n",
66
+ " return text[pos]\n",
67
+ "\n",
68
+ " def peek(delta):\n",
69
+ " p = pos + delta\n",
70
+ " return text[p] if p < end_pos and p >= 0 else \"\"\n",
71
+ "\n",
72
+ " def commit():\n",
73
+ " nonlocal rv, current, split_pos\n",
74
+ " rv.append(current)\n",
75
+ " current = \"\"\n",
76
+ " split_pos = []\n",
77
+ "\n",
78
+ " while pos < end_pos:\n",
79
+ " c = seek(1)\n",
80
+ " # do we need to force a split?\n",
81
+ " if len(current) >= max_length:\n",
82
+ " if len(split_pos) > 0 and len(current) > (desired_length / 2):\n",
83
+ " # we have at least one sentence and we are over half the desired length, seek back to the last split\n",
84
+ " d = pos - split_pos[-1]\n",
85
+ " seek(-d)\n",
86
+ " else:\n",
87
+ " # no full sentences, seek back until we are not in the middle of a word and split there\n",
88
+ " while c not in \"!?.\\n \" and pos > 0 and len(current) > desired_length:\n",
89
+ " c = seek(-1)\n",
90
+ " commit()\n",
91
+ " # check for sentence boundaries\n",
92
+ " elif not in_quote and (c in \"!?\\n\" or (c == \".\" and peek(1) in \"\\n \")):\n",
93
+ " # seek forward if we have consecutive boundary markers but still within the max length\n",
94
+ " while (\n",
95
+ " pos < len(text) - 1 and len(current) < max_length and peek(1) in \"!?.\"\n",
96
+ " ):\n",
97
+ " c = seek(1)\n",
98
+ " split_pos.append(pos)\n",
99
+ " if len(current) >= desired_length:\n",
100
+ " commit()\n",
101
+ " # treat end of quote as a boundary if its followed by a space or newline\n",
102
+ " elif in_quote and peek(1) == '\"' and peek(2) in \"\\n \":\n",
103
+ " seek(2)\n",
104
+ " split_pos.append(pos)\n",
105
+ " rv.append(current)\n",
106
+ "\n",
107
+ " # clean up, remove lines with only whitespace or punctuation\n",
108
+ " rv = [s.strip() for s in rv]\n",
109
+ " rv = [s for s in rv if len(s) > 0 and not re.match(r\"^[\\s\\.,;:!?]*$\", s)]\n",
110
+ "\n",
111
+ " return rv\n",
112
+ "\n",
113
+ "def generate_with_settings(text_prompt, semantic_temp=0.7, semantic_top_k=50, semantic_top_p=0.95, coarse_temp=0.7, coarse_top_k=50, coarse_top_p=0.95, fine_temp=0.5, voice_name=None, use_semantic_history_prompt=True, use_coarse_history_prompt=True, use_fine_history_prompt=True, output_full=False):\n",
114
+ " # generation with more control\n",
115
+ " x_semantic = generate_text_semantic(\n",
116
+ " text_prompt,\n",
117
+ " history_prompt=voice_name if use_semantic_history_prompt else None,\n",
118
+ " temp=semantic_temp,\n",
119
+ " top_k=semantic_top_k,\n",
120
+ " top_p=semantic_top_p,\n",
121
+ " )\n",
122
+ "\n",
123
+ " x_coarse_gen = generate_coarse(\n",
124
+ " x_semantic,\n",
125
+ " history_prompt=voice_name if use_coarse_history_prompt else None,\n",
126
+ " temp=coarse_temp,\n",
127
+ " top_k=coarse_top_k,\n",
128
+ " top_p=coarse_top_p,\n",
129
+ " )\n",
130
+ " x_fine_gen = generate_fine(\n",
131
+ " x_coarse_gen,\n",
132
+ " history_prompt=voice_name if use_fine_history_prompt else None,\n",
133
+ " temp=fine_temp,\n",
134
+ " )\n",
135
+ "\n",
136
+ " if output_full:\n",
137
+ " full_generation = {\n",
138
+ " 'semantic_prompt': x_semantic,\n",
139
+ " 'coarse_prompt': x_coarse_gen,\n",
140
+ " 'fine_prompt': x_fine_gen,\n",
141
+ " }\n",
142
+ " return full_generation, codec_decode(x_fine_gen)\n",
143
+ " return codec_decode(x_fine_gen)"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": null,
149
+ "metadata": {},
150
+ "outputs": [],
151
+ "source": [
152
+ "# `[laughter]`\n",
153
+ "# - `[laughs]`\n",
154
+ "# - `[sighs]`\n",
155
+ "# - `[music]`\n",
156
+ "# - `[gasps]`\n",
157
+ "# - `[clears throat]`\n",
158
+ "# - `—` or `...` for hesitations\n",
159
+ "# - `♪` for song lyrics"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": null,
165
+ "metadata": {},
166
+ "outputs": [],
167
+ "source": [
168
+ "# download and load all models\n",
169
+ "preload_models(\n",
170
+ " text_use_gpu=True,\n",
171
+ " text_use_small=False,\n",
172
+ " text_model_path=semantic_path,\n",
173
+ " coarse_use_gpu=True,\n",
174
+ " coarse_use_small=False,\n",
175
+ " coarse_model_path=coarse_path,\n",
176
+ " fine_use_gpu=True,\n",
177
+ " fine_use_small=False,\n",
178
+ " fine_model_path=fine_path,\n",
179
+ " codec_use_gpu=True,\n",
180
+ " force_reload=False,\n",
181
+ " path=\"models\"\n",
182
+ ")\n",
183
+ "\n",
184
+ "if use_rvc:\n",
185
+ " from rvc_infer import get_vc, vc_single\n",
186
+ " get_vc(rvc_path, device, is_half)"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": null,
192
+ "metadata": {},
193
+ "outputs": [],
194
+ "source": [
195
+ "text = \"\"\"The Uncharted Land of Discovery: A Journey Through Time and Space\n",
196
+ "[clears throat]\n",
197
+ "Chapter 1: The Dawn of Curiosity\n",
198
+ "[takes breath]\n",
199
+ "Since the dawn of humankind, our species has been driven by a powerful force: curiosity. It is an innate, unquenchable desire to explore, understand, and unravel the mysteries of the world around us. This primal urge has led us on countless adventures, pushing us to the farthest reaches of our planet and beyond.\n",
200
+ "\n",
201
+ "Early humans, huddled around a flickering fire, gazed up at the night sky and wondered what those twinkling lights were. They had no idea that their curiosity would eventually propel us into the vast, uncharted realm of space. As time progressed, our ancestors began to explore their surroundings, venturing beyond their caves and settlements, driven by the need to discover what lay beyond the horizon.\n",
202
+ "\n",
203
+ "hapter 2: The Age of Exploration\n",
204
+ "\n",
205
+ "The Age of Exploration marked a turning point in human history, as brave souls took to the seas in search of new lands, wealth, and knowledge. Pioneers like Christopher Columbus, Vasco da Gama, and Ferdinand Magellan set sail on perilous voyages, pushing the boundaries of what was known and understood.\n",
206
+ "[clears throat]\n",
207
+ "These intrepid explorers discovered new continents, mapped out previously unknown territories, and encountered diverse cultures. They also established trade routes, allowing for the exchange of goods, ideas, and innovations between distant societies. The Age of Exploration was not without its dark moments, however, as conquest, colonization, and exploitation often went hand in hand with discovery.\n",
208
+ "[clears throat]\n",
209
+ "Chapter 3: The Scientific Revolution\n",
210
+ "[laughs]\n",
211
+ "The Scientific Revolution was a period of profound change, as humanity began to question long-held beliefs and seek empirical evidence. Pioneers like Galileo Galilei, Isaac Newton, and Johannes Kepler sought to understand the natural world through observation, experimentation, and reason.\n",
212
+ "[sighs]\n",
213
+ "Their discoveries laid the foundation for modern science, transforming the way we view the universe and our place within it. New technologies, such as the telescope and the microscope, allowed us to peer deeper into the cosmos and the microscopic world, further expanding our understanding of reality.\n",
214
+ "[gasps]\n",
215
+ "Chapter 4: The Information Age\n",
216
+ "\n",
217
+ "The Information Age, sometimes referred to as the Digital Age, has revolutionized the way we communicate, learn, and access knowledge. With the advent of the internet and personal computers, information that was once reserved for the privileged few is now available to the masses.\n",
218
+ "...\n",
219
+ "This democratization of knowledge has led to an explosion of innovation, as ideas and information are shared across borders and cultures at lightning speed. The Information Age has also brought new challenges, as the rapid pace of technological advancements threatens to outpace our ability to adapt and raises questions about the ethical implications of our increasingly interconnected world.\n",
220
+ "[laughter]\n",
221
+ "Chapter 5: The Final Frontier\n",
222
+ "[clears throat]\n",
223
+ "As our knowledge of the universe expands, so too does our desire to explore the cosmos. Space exploration has come a long way since the first successful satellite, Sputnik, was launched in 1957. We have landed humans on the moon, sent probes to the far reaches of our solar system, and even glimpsed distant galaxies through powerful telescopes.\n",
224
+ "\n",
225
+ "The future of space exploration is filled with possibilities, from establishing colonies on Mars to the search for extraterrestrial life. As we venture further into the unknown, we continue to be driven by the same curiosity that has propelled us throughout history, always seeking to uncover the secrets of the universe and our place within it.\n",
226
+ "...\n",
227
+ "In conclusion, the human journey is one of discovery, driven by our innate curiosity and desire to understand the world around us. From the dawn of our species to the present day, we have continued to explore, learn, and adapt, pushing the boundaries of what is known and possible. As we continue to unravel the mysteries of the cosmos, our spirit.\"\"\""
228
+ ]
229
+ },
230
+ {
231
+ "cell_type": "code",
232
+ "execution_count": null,
233
+ "metadata": {},
234
+ "outputs": [],
235
+ "source": [
236
+ "# Chunk the text into smaller pieces then combine the generated audio\n",
237
+ "from time import time\n",
238
+ "from tqdm.auto import tqdm\n",
239
+ "from IPython.display import Audio\n",
240
+ "from scipy.io.wavfile import write as write_wav\n",
241
+ "import os\n",
242
+ "import numpy as np\n",
243
+ "\n",
244
+ "# generation settings\n",
245
+ "voice_name = 'en_speaker_0'\n",
246
+ "out_filepath = 'audio/audio.wav'\n",
247
+ "\n",
248
+ "semantic_temp = 0.7\n",
249
+ "semantic_top_k = 50\n",
250
+ "semantic_top_p = 0.95\n",
251
+ "\n",
252
+ "coarse_temp = 0.7\n",
253
+ "coarse_top_k = 50\n",
254
+ "coarse_top_p = 0.95\n",
255
+ "\n",
256
+ "fine_temp = 0.5\n",
257
+ "\n",
258
+ "use_semantic_history_prompt = True\n",
259
+ "use_coarse_history_prompt = True\n",
260
+ "use_fine_history_prompt = True\n",
261
+ "\n",
262
+ "use_last_generation_as_history = True\n",
263
+ "\n",
264
+ "if use_rvc:\n",
265
+ " index_rate = 0.75\n",
266
+ " f0up_key = -10\n",
267
+ " filter_radius = 3\n",
268
+ " rms_mix_rate = 0.25\n",
269
+ " protect = 0.33\n",
270
+ " resample_sr = SAMPLE_RATE\n",
271
+ " f0method = \"harvest\" #harvest or pm\n",
272
+ "\n",
273
+ "texts = split_and_recombine_text(text)\n",
274
+ "\n",
275
+ "all_parts = []\n",
276
+ "for i, text in tqdm(enumerate(texts), total=len(texts)):\n",
277
+ " full_generation, audio_array = generate_with_settings(\n",
278
+ " text,\n",
279
+ " semantic_temp=semantic_temp,\n",
280
+ " semantic_top_k=semantic_top_k,\n",
281
+ " semantic_top_p=semantic_top_p,\n",
282
+ " coarse_temp=coarse_temp,\n",
283
+ " coarse_top_k=coarse_top_k,\n",
284
+ " coarse_top_p=coarse_top_p,\n",
285
+ " fine_temp=fine_temp,\n",
286
+ " voice_name=voice_name,\n",
287
+ " use_semantic_history_prompt=use_semantic_history_prompt,\n",
288
+ " use_coarse_history_prompt=use_coarse_history_prompt,\n",
289
+ " use_fine_history_prompt=use_fine_history_prompt,\n",
290
+ " output_full=True\n",
291
+ " )\n",
292
+ " if use_last_generation_as_history:\n",
293
+ " # save to npz\n",
294
+ " os.makedirs('_temp', exist_ok=True)\n",
295
+ " np.savez_compressed(\n",
296
+ " '_temp/history.npz',\n",
297
+ " semantic_prompt=full_generation['semantic_prompt'],\n",
298
+ " coarse_prompt=full_generation['coarse_prompt'],\n",
299
+ " fine_prompt=full_generation['fine_prompt'],\n",
300
+ " )\n",
301
+ " voice_name = '_temp/history.npz'\n",
302
+ " write_wav(out_filepath.replace('.wav', f'_{i}') + '.wav', SAMPLE_RATE, audio_array)\n",
303
+ "\n",
304
+ " if use_rvc:\n",
305
+ " try:\n",
306
+ " audio_array = vc_single(0,out_filepath.replace('.wav', f'_{i}') + '.wav',f0up_key,None,f0method,index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
307
+ " except:\n",
308
+ " audio_array = vc_single(0,out_filepath.replace('.wav', f'_{i}') + '.wav',f0up_key,None,'pm',index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
309
+ " write_wav(out_filepath.replace('.wav', f'_{i}') + '.wav', SAMPLE_RATE, audio_array)\n",
310
+ " all_parts.append(audio_array)\n",
311
+ "\n",
312
+ "audio_array = np.concatenate(all_parts, axis=-1)\n",
313
+ "\n",
314
+ "# save audio\n",
315
+ "write_wav(out_filepath, SAMPLE_RATE, audio_array)\n",
316
+ "\n",
317
+ "# play audio\n",
318
+ "Audio(audio_array, rate=SAMPLE_RATE)"
319
+ ]
320
+ }
321
+ ],
322
+ "metadata": {
323
+ "kernelspec": {
324
+ "display_name": "Python 3",
325
+ "language": "python",
326
+ "name": "python3"
327
+ },
328
+ "language_info": {
329
+ "codemirror_mode": {
330
+ "name": "ipython",
331
+ "version": 3
332
+ },
333
+ "file_extension": ".py",
334
+ "mimetype": "text/x-python",
335
+ "name": "python",
336
+ "nbconvert_exporter": "python",
337
+ "pygments_lexer": "ipython3",
338
+ "version": "3.10.8"
339
+ },
340
+ "orig_nbformat": 4
341
+ },
342
+ "nbformat": 4,
343
+ "nbformat_minor": 2
344
+ }
model-card.md ADDED
@@ -0,0 +1,40 @@
+ # Model Card: Bark
+
+ This is the official codebase for running the text-to-audio model from Suno.ai.
+
+ The following is additional information about the models released here.
+
+ ## Model Details
+
+ Bark is a series of three transformer models that turn text into audio.
+ ### Text to semantic tokens
+ - Input: text, tokenized with [BERT tokenizer from Hugging Face](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer)
+ - Output: semantic tokens that encode the audio to be generated
+
+ ### Semantic to coarse tokens
+ - Input: semantic tokens
+ - Output: tokens from the first two codebooks of the [EnCodec Codec](https://github.com/facebookresearch/encodec) from Facebook
+
+ ### Coarse to fine tokens
+ - Input: the first two codebooks from EnCodec
+ - Output: 8 codebooks from EnCodec
+
+ ### Architecture
+ | Model | Parameters | Attention | Output Vocab size |
+ |:-------------------------:|:----------:|------------|:-----------------:|
+ | Text to semantic tokens | 80 M | Causal | 10,000 |
+ | Semantic to coarse tokens | 80 M | Causal | 2x 1,024 |
+ | Coarse to fine tokens | 80 M | Non-causal | 6x 1,024 |
+
+
+ ### Release date
+ April 2023
+
+ ## Broader Implications
+ We anticipate that this model's text-to-audio capabilities can be used to improve accessibility tools in a variety of languages.
+ Straightforward improvements will allow models to run faster than realtime, rendering them useful for applications such as virtual assistants.
+
+ While we hope that this release will enable users to express their creativity and build applications that are a force
+ for good, we acknowledge that any text-to-audio model has the potential for dual use. While it is not straightforward
+ to voice clone known people with Bark, it can still be used for nefarious purposes. To further reduce the chances of unintended use of Bark,
+ we also release a simple classifier to detect Bark-generated audio with high accuracy (see notebooks section of the main repository).
pyproject.toml ADDED
@@ -0,0 +1,58 @@
+ [build-system]
+ requires = ["setuptools"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "suno-bark"
+ version = "0.0.1a"
+ description = "Bark text to audio model"
+ readme = "README.md"
+ requires-python = ">=3.8"
+ authors = [
+     {name = "Suno Inc", email = "[email protected]"},
+ ]
+ # Apache 2.0
+ license = {file = "LICENSE"}
+
+ dependencies = [
+     "boto3",
+     "encodec",
+     "funcy",
+     "numpy",
+     "scipy",
+     "tokenizers",
+     "torch",
+     "tqdm",
+     "transformers",
+ ]
+
+ [project.urls]
+ source = "https://github.com/suno-ai/bark"
+
+ [project.optional-dependencies]
+ dev = [
+     "bandit",
+     "black",
+     "codecov",
+     "flake8",
+     "huggingface-hub",
+     "hypothesis>=6.14,<7",
+     "isort>=5.0.0,<6",
+     "jupyter",
+     "mypy",
+     "nbconvert",
+     "nbformat",
+     "pydocstyle",
+     "pylint",
+     "pytest",
+     "pytest-cov",
+ ]
+
+ [tool.setuptools]
+ packages = ["bark"]
+
+ [tool.setuptools.package-data]
+ bark = ["assets/prompts/*.npz"]
+
+ [tool.black]
+ line-length = 100
rvc_infer.py ADDED
@@ -0,0 +1,169 @@
1
+ import os,sys,pdb,torch
2
+ now_dir = os.getcwd()
3
+ sys.path.append(now_dir)
4
+ import argparse
5
+ import glob
6
+ import sys
7
+ import torch
8
+ from multiprocessing import cpu_count
9
+ import ffmpeg
10
+ import numpy as np
11
+
12
+
13
+ def load_audio(file, sr):
14
+ try:
15
+ # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
16
+ # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
17
+ # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
18
+ file = (
19
+ file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
20
+ ) # strip stray spaces, quotes and newlines that users sometimes paste around the path
21
+ out, _ = (
22
+ ffmpeg.input(file, threads=0)
23
+ .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
24
+ .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
25
+ )
26
+ except Exception as e:
27
+ raise RuntimeError(f"Failed to load audio: {e}")
28
+
29
+ return np.frombuffer(out, np.float32).flatten()
30
+
31
+
32
+ class Config:
33
+ def __init__(self,device,is_half):
34
+ self.device = device
35
+ self.is_half = is_half
36
+ self.n_cpu = 0
37
+ self.gpu_name = None
38
+ self.gpu_mem = None
39
+ self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
40
+
41
+ def device_config(self) -> tuple:
42
+ if torch.cuda.is_available():
43
+ i_device = int(self.device.split(":")[-1])
44
+ self.gpu_name = torch.cuda.get_device_name(i_device)
45
+ if (
46
+ ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
47
+ or "P40" in self.gpu_name.upper()
48
+ or "1060" in self.gpu_name
49
+ or "1070" in self.gpu_name
50
+ or "1080" in self.gpu_name
51
+ ):
52
+ print("16系/10系显卡和P40强制单精度")
53
+ self.is_half = False
54
+ for config_file in ["32k.json", "40k.json", "48k.json"]:
55
+ with open(f"configs/{config_file}", "r") as f:
56
+ strr = f.read().replace("true", "false")
57
+ with open(f"configs/{config_file}", "w") as f:
58
+ f.write(strr)
59
+ with open("trainset_preprocess_pipeline_print.py", "r") as f:
60
+ strr = f.read().replace("3.7", "3.0")
61
+ with open("trainset_preprocess_pipeline_print.py", "w") as f:
62
+ f.write(strr)
63
+ else:
64
+ self.gpu_name = None
65
+ self.gpu_mem = int(
66
+ torch.cuda.get_device_properties(i_device).total_memory
67
+ / 1024
68
+ / 1024
69
+ / 1024
70
+ + 0.4
71
+ )
72
+ if self.gpu_mem <= 4:
73
+ with open("trainset_preprocess_pipeline_print.py", "r") as f:
74
+ strr = f.read().replace("3.7", "3.0")
75
+ with open("trainset_preprocess_pipeline_print.py", "w") as f:
76
+ f.write(strr)
77
+ elif torch.backends.mps.is_available():
78
+ print("没有发现支持的N卡, 使用MPS进行推理")
79
+ self.device = "mps"
80
+ else:
81
+ print("没有发现支持的N卡, 使用CPU进行推理")
82
+ self.device = "cpu"
83
+ self.is_half = True
84
+
85
+ if self.n_cpu == 0:
86
+ self.n_cpu = cpu_count()
87
+
88
+ if self.is_half:
89
+ # settings for ~6 GB of VRAM
90
+ x_pad = 3
91
+ x_query = 10
92
+ x_center = 60
93
+ x_max = 65
94
+ else:
95
+ # settings for ~5 GB of VRAM
96
+ x_pad = 1
97
+ x_query = 6
98
+ x_center = 38
99
+ x_max = 41
100
+
101
+ if self.gpu_mem != None and self.gpu_mem <= 4:
102
+ x_pad = 1
103
+ x_query = 5
104
+ x_center = 30
105
+ x_max = 32
106
+
107
+ return x_pad, x_query, x_center, x_max
108
+
109
+
110
+ now_dir=os.getcwd()
111
+ sys.path.append(now_dir)
112
+ sys.path.append(os.path.join(now_dir,"Retrieval-based-Voice-Conversion-WebUI"))
113
+ from vc_infer_pipeline import VC
114
+ from lib.infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono, SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono
115
+ from fairseq import checkpoint_utils
116
+ from scipy.io import wavfile
117
+
118
+ hubert_model=None
119
+ def load_hubert():
120
+ global hubert_model
121
+ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(["hubert_base.pt"],suffix="",)
122
+ hubert_model = models[0]
123
+ hubert_model = hubert_model.to(device)
124
+ if(is_half):hubert_model = hubert_model.half()
125
+ else:hubert_model = hubert_model.float()
126
+ hubert_model.eval()
127
+
128
+ def vc_single(sid,input_audio,f0_up_key,f0_file,f0_method,file_index,index_rate,filter_radius=3,resample_sr=48000,rms_mix_rate=0.25, protect=0.33):
129
+ global tgt_sr,net_g,vc,hubert_model
130
+ if input_audio is None:return "You need to upload an audio", None
131
+ f0_up_key = int(f0_up_key)
132
+ audio=load_audio(input_audio,16000)
133
+ times = [0, 0, 0]
134
+ if(hubert_model==None):load_hubert()
135
+ if_f0 = cpt.get("f0", 1)
136
+ version = cpt.get("version")
137
+ audio_opt=vc.pipeline(hubert_model,net_g,sid,audio,input_audio,times,f0_up_key,f0_method,file_index,index_rate,if_f0,filter_radius=filter_radius,tgt_sr=tgt_sr,resample_sr=resample_sr,rms_mix_rate=rms_mix_rate,version=version,protect=protect,f0_file=f0_file)
138
+ # print(times)
139
+ return audio_opt
140
+
141
+
142
+ def get_vc(model_path, device_, is_half_):
143
+ global n_spk,tgt_sr,net_g,vc,cpt,device,is_half
144
+ device = device_
145
+ is_half = is_half_
146
+ config = Config(device, is_half)
147
+ print("loading pth %s"%model_path)
148
+ cpt = torch.load(model_path, map_location="cpu")
149
+ tgt_sr = cpt["config"][-1]
150
+ cpt["config"][-3]=cpt["weight"]["emb_g.weight"].shape[0]#n_spk
151
+ if_f0=cpt.get("f0",1)
152
+ version=cpt.get("version", "v2")
153
+ if(if_f0==1):
154
+ if version == "v1":
155
+ net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
156
+ else:
157
+ net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half)
158
+ else:
159
+ if version == "v1":
160
+ net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
161
+ else:
162
+ net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
163
+ del net_g.enc_q
164
+ print(net_g.load_state_dict(cpt["weight"], strict=False)) # loading the state dict here is required for a clean load
165
+ net_g.eval().to(device)
166
+ if (is_half):net_g = net_g.half()
167
+ else:net_g = net_g.float()
168
+ vc = VC(tgt_sr, config)
169
+ n_spk=cpt["config"][-3]
rvc_test.ipynb ADDED
@@ -0,0 +1,85 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from rvc_infer import get_vc, vc_single"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "model_path = \"Retrieval-based-Voice-Conversion-WebUI/weights/mi-test.pth\"\n",
19
+ "device=\"cuda:0\"\n",
20
+ "is_half=True"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": null,
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "get_vc(model_path, device, is_half)"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "index_rate = 0.75\n",
39
+ "f0up_key = -6\n",
40
+ "filter_radius = 3\n",
41
+ "rms_mix_rate = 0.25\n",
42
+ "protect = 0.33\n",
43
+ "resample_sr = 48000\n",
44
+ "f0method = \"harvest\" #harvest or pm\n",
45
+ "input_path = \"output/audio.wav\"\n",
46
+ "index_path = \"Retrieval-based-Voice-Conversion-WebUI/logs/mi-test/added_IVF256_Flat_nprobe_1_mi-test_v2.index\"\n",
47
+ "\n",
48
+ "wav_opt = vc_single(0,input_path,f0up_key,None,f0method,index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": null,
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": [
57
+ "from IPython.display import Audio\n",
58
+ "# play audio\n",
59
+ "Audio(wav_opt, rate=48000)"
60
+ ]
61
+ }
62
+ ],
63
+ "metadata": {
64
+ "kernelspec": {
65
+ "display_name": "Python 3",
66
+ "language": "python",
67
+ "name": "python3"
68
+ },
69
+ "language_info": {
70
+ "codemirror_mode": {
71
+ "name": "ipython",
72
+ "version": 3
73
+ },
74
+ "file_extension": ".py",
75
+ "mimetype": "text/x-python",
76
+ "name": "python",
77
+ "nbconvert_exporter": "python",
78
+ "pygments_lexer": "ipython3",
79
+ "version": "3.10.8"
80
+ },
81
+ "orig_nbformat": 4
82
+ },
83
+ "nbformat": 4,
84
+ "nbformat_minor": 2
85
+ }
setup.py ADDED
@@ -0,0 +1,3 @@
+ from setuptools import setup
+
+ setup()
test_models.ipynb ADDED
@@ -0,0 +1,454 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from IPython.display import Audio\n",
10
+ "from scipy.io.wavfile import write as write_wav\n",
11
+ "\n",
12
+ "from bark.api import generate_audio\n",
13
+ "from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": null,
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "semantic_path = \"semantic_output/pytorch_model.bin\" # set to None if you don't want to use finetuned semantic\n",
23
+ "coarse_path = \"coarse_output/pytorch_model.bin\" # set to None if you don't want to use finetuned coarse\n",
24
+ "fine_path = \"fine_output/pytorch_model.bin\" # set to None if you don't want to use finetuned fine\n",
25
+ "use_rvc = True # Set to False to use bark without RVC\n",
26
+ "rvc_name = 'mi-test'\n",
27
+ "rvc_path = f\"Retrieval-based-Voice-Conversion-WebUI/weights/{rvc_name}.pth\"\n",
28
+ "index_path = f\"Retrieval-based-Voice-Conversion-WebUI/logs/{rvc_name}/added_IVF256_Flat_nprobe_1_{rvc_name}_v2.index\"\n",
29
+ "device=\"cuda:0\"\n",
30
+ "is_half=True"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": null,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "preload_models(\n",
40
+ " text_use_gpu=True,\n",
41
+ " text_use_small=False,\n",
42
+ " text_model_path=semantic_path,\n",
43
+ " coarse_use_gpu=True,\n",
44
+ " coarse_use_small=False,\n",
45
+ " coarse_model_path=coarse_path,\n",
46
+ " fine_use_gpu=True,\n",
47
+ " fine_use_small=False,\n",
48
+ " fine_model_path=fine_path,\n",
49
+ " codec_use_gpu=True,\n",
50
+ " force_reload=False,\n",
51
+ " path=\"models\"\n",
52
+ ")\n",
53
+ "\n",
54
+ "if use_rvc:\n",
55
+ " from rvc_infer import get_vc, vc_single\n",
56
+ " get_vc(rvc_path, device, is_half)"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": null,
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "# simple generation\n",
66
+ "text_prompt = \"I am Joe Biden... and this is the finetuned semantic, coarse and fine model! A lot better than the original!\"\n",
67
+ "filepath = \"output/audio.wav\" # change this to your desired output path\n",
68
+ "audio_array = generate_audio(text_prompt, history_prompt=None, text_temp=0.7, waveform_temp=0.7)\n",
69
+ "write_wav(filepath, SAMPLE_RATE, audio_array)\n",
70
+ "\n",
71
+ "if use_rvc:\n",
72
+ " index_rate = 0.75\n",
73
+ " f0up_key = -6\n",
74
+ " filter_radius = 3\n",
75
+ " rms_mix_rate = 0.25\n",
76
+ " protect = 0.33\n",
77
+ " resample_sr = SAMPLE_RATE\n",
78
+ " f0method = \"harvest\" #harvest or pm\n",
79
+ " try:\n",
80
+ " audio_array = vc_single(0,filepath,f0up_key,None,f0method,index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
81
+ " except:\n",
82
+ " audio_array = vc_single(0,filepath,f0up_key,None,'pm',index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
83
+ " write_wav(filepath, SAMPLE_RATE, audio_array)\n",
84
+ "\n",
85
+ "Audio(audio_array, rate=SAMPLE_RATE)"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": null,
91
+ "metadata": {},
92
+ "outputs": [],
93
+ "source": [
94
+ "def generate_with_settings(text_prompt, semantic_temp=0.7, semantic_top_k=50, semantic_top_p=0.95, coarse_temp=0.7, coarse_top_k=50, coarse_top_p=0.95, fine_temp=0.5, voice_name=None, use_semantic_history_prompt=True, use_coarse_history_prompt=True, use_fine_history_prompt=True, output_full=False):\n",
95
+ " # generation with more control\n",
96
+ " x_semantic = generate_text_semantic(\n",
97
+ " text_prompt,\n",
98
+ " history_prompt=voice_name if use_semantic_history_prompt else None,\n",
99
+ " temp=semantic_temp,\n",
100
+ " top_k=semantic_top_k,\n",
101
+ " top_p=semantic_top_p,\n",
102
+ " )\n",
103
+ "\n",
104
+ " x_coarse_gen = generate_coarse(\n",
105
+ " x_semantic,\n",
106
+ " history_prompt=voice_name if use_coarse_history_prompt else None,\n",
107
+ " temp=coarse_temp,\n",
108
+ " top_k=coarse_top_k,\n",
109
+ " top_p=coarse_top_p,\n",
110
+ " )\n",
111
+ " x_fine_gen = generate_fine(\n",
112
+ " x_coarse_gen,\n",
113
+ " history_prompt=voice_name if use_fine_history_prompt else None,\n",
114
+ " temp=fine_temp,\n",
115
+ " )\n",
116
+ "\n",
117
+ " if output_full:\n",
118
+ " full_generation = {\n",
119
+ " 'semantic_prompt': x_semantic,\n",
120
+ " 'coarse_prompt': x_coarse_gen,\n",
121
+ " 'fine_prompt': x_fine_gen,\n",
122
+ " }\n",
123
+ " return full_generation, codec_decode(x_fine_gen)\n",
124
+ " return codec_decode(x_fine_gen)"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": null,
130
+ "metadata": {},
131
+ "outputs": [],
132
+ "source": [
133
+ "text_prompt = \"I am Joe Biden... and this is the finetuned semantic, coarse and fine model! A lot better than the original!\"\n",
134
+ "filepath = \"output/audio.wav\" # change this to your desired output path\n",
135
+ "\n",
136
+ "audio_array = generate_with_settings(\n",
137
+ " text_prompt,\n",
138
+ " semantic_temp=0.7,\n",
139
+ " semantic_top_k=50,\n",
140
+ " semantic_top_p=0.99,\n",
141
+ " coarse_temp=0.7,\n",
142
+ " coarse_top_k=50,\n",
143
+ " coarse_top_p=0.95,\n",
144
+ " fine_temp=0.5,\n",
145
+ " voice_name=\"datasets/joe_biden_state_of_union/tokens/257.npz\",\n",
146
+ " use_semantic_history_prompt=False,\n",
147
+ " use_coarse_history_prompt=True,\n",
148
+ " use_fine_history_prompt=True,\n",
149
+ " output_full=False\n",
150
+ ")\n",
151
+ "\n",
152
+ "write_wav(filepath, SAMPLE_RATE, audio_array)\n",
153
+ "\n",
154
+ "if use_rvc:\n",
155
+ " index_rate = 0.75\n",
156
+ " f0up_key = -6\n",
157
+ " filter_radius = 3\n",
158
+ " rms_mix_rate = 0.25\n",
159
+ " protect = 0.33\n",
160
+ " resample_sr = SAMPLE_RATE\n",
161
+ " f0method = \"harvest\" #harvest or pm\n",
162
+ " try:\n",
163
+ " audio_array = vc_single(0,filepath,f0up_key,None,f0method,index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
164
+ " except:\n",
165
+ " audio_array = vc_single(0,filepath,f0up_key,None,'pm',index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
166
+ " write_wav(filepath, SAMPLE_RATE, audio_array)\n",
167
+ "\n",
168
+ "Audio(audio_array, rate=SAMPLE_RATE)"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": null,
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": []
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "execution_count": null,
181
+ "metadata": {},
182
+ "outputs": [],
183
+ "source": [
184
+ "import re\n",
185
+ "def split_and_recombine_text(text, desired_length=100, max_length=150):\n",
186
+ " # from https://github.com/neonbjb/tortoise-tts\n",
187
+ " \"\"\"Split text it into chunks of a desired length trying to keep sentences intact.\"\"\"\n",
188
+ " # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii\n",
189
+ " text = re.sub(r\"\\n\\n+\", \"\\n\", text)\n",
190
+ " text = re.sub(r\"\\s+\", \" \", text)\n",
191
+ " text = re.sub(r\"[“”]\", '\"', text)\n",
192
+ "\n",
193
+ " rv = []\n",
194
+ " in_quote = False\n",
195
+ " current = \"\"\n",
196
+ " split_pos = []\n",
197
+ " pos = -1\n",
198
+ " end_pos = len(text) - 1\n",
199
+ "\n",
200
+ " def seek(delta):\n",
201
+ " nonlocal pos, in_quote, current\n",
202
+ " is_neg = delta < 0\n",
203
+ " for _ in range(abs(delta)):\n",
204
+ " if is_neg:\n",
205
+ " pos -= 1\n",
206
+ " current = current[:-1]\n",
207
+ " else:\n",
208
+ " pos += 1\n",
209
+ " current += text[pos]\n",
210
+ " if text[pos] == '\"':\n",
211
+ " in_quote = not in_quote\n",
212
+ " return text[pos]\n",
213
+ "\n",
214
+ " def peek(delta):\n",
215
+ " p = pos + delta\n",
216
+ " return text[p] if p < end_pos and p >= 0 else \"\"\n",
217
+ "\n",
218
+ " def commit():\n",
219
+ " nonlocal rv, current, split_pos\n",
220
+ " rv.append(current)\n",
221
+ " current = \"\"\n",
222
+ " split_pos = []\n",
223
+ "\n",
224
+ " while pos < end_pos:\n",
225
+ " c = seek(1)\n",
226
+ " # do we need to force a split?\n",
227
+ " if len(current) >= max_length:\n",
228
+ " if len(split_pos) > 0 and len(current) > (desired_length / 2):\n",
229
+ " # we have at least one sentence and we are over half the desired length, seek back to the last split\n",
230
+ " d = pos - split_pos[-1]\n",
231
+ " seek(-d)\n",
232
+ " else:\n",
233
+ " # no full sentences, seek back until we are not in the middle of a word and split there\n",
234
+ " while c not in \"!?.\\n \" and pos > 0 and len(current) > desired_length:\n",
235
+ " c = seek(-1)\n",
236
+ " commit()\n",
237
+ " # check for sentence boundaries\n",
238
+ " elif not in_quote and (c in \"!?\\n\" or (c == \".\" and peek(1) in \"\\n \")):\n",
239
+ " # seek forward if we have consecutive boundary markers but still within the max length\n",
240
+ " while (\n",
241
+ " pos < len(text) - 1 and len(current) < max_length and peek(1) in \"!?.\"\n",
242
+ " ):\n",
243
+ " c = seek(1)\n",
244
+ " split_pos.append(pos)\n",
245
+ " if len(current) >= desired_length:\n",
246
+ " commit()\n",
247
+ " # treat end of quote as a boundary if its followed by a space or newline\n",
248
+ " elif in_quote and peek(1) == '\"' and peek(2) in \"\\n \":\n",
249
+ " seek(2)\n",
250
+ " split_pos.append(pos)\n",
251
+ " rv.append(current)\n",
252
+ "\n",
253
+ " # clean up, remove lines with only whitespace or punctuation\n",
254
+ " rv = [s.strip() for s in rv]\n",
255
+ " rv = [s for s in rv if len(s) > 0 and not re.match(r\"^[\\s\\.,;:!?]*$\", s)]\n",
256
+ "\n",
257
+ " return rv\n",
258
+ "\n",
259
+ "def generate_with_settings(text_prompt, semantic_temp=0.7, semantic_top_k=50, semantic_top_p=0.95, coarse_temp=0.7, coarse_top_k=50, coarse_top_p=0.95, fine_temp=0.5, voice_name=None, use_semantic_history_prompt=True, use_coarse_history_prompt=True, use_fine_history_prompt=True, output_full=False):\n",
260
+ " # generation with more control\n",
261
+ " x_semantic = generate_text_semantic(\n",
262
+ " text_prompt,\n",
263
+ " history_prompt=voice_name if use_semantic_history_prompt else None,\n",
264
+ " temp=semantic_temp,\n",
265
+ " top_k=semantic_top_k,\n",
266
+ " top_p=semantic_top_p,\n",
267
+ " )\n",
268
+ "\n",
269
+ " x_coarse_gen = generate_coarse(\n",
270
+ " x_semantic,\n",
271
+ " history_prompt=voice_name if use_coarse_history_prompt else None,\n",
272
+ " temp=coarse_temp,\n",
273
+ " top_k=coarse_top_k,\n",
274
+ " top_p=coarse_top_p,\n",
275
+ " )\n",
276
+ " x_fine_gen = generate_fine(\n",
277
+ " x_coarse_gen,\n",
278
+ " history_prompt=voice_name if use_fine_history_prompt else None,\n",
279
+ " temp=fine_temp,\n",
280
+ " )\n",
281
+ "\n",
282
+ " if output_full:\n",
283
+ " full_generation = {\n",
284
+ " 'semantic_prompt': x_semantic,\n",
285
+ " 'coarse_prompt': x_coarse_gen,\n",
286
+ " 'fine_prompt': x_fine_gen,\n",
287
+ " }\n",
288
+ " return full_generation, codec_decode(x_fine_gen)\n",
289
+ " return codec_decode(x_fine_gen)"
290
+ ]
291
+ },
292
+ {
293
+ "cell_type": "code",
294
+ "execution_count": null,
295
+ "metadata": {},
296
+ "outputs": [],
297
+ "source": [
298
+ "text = \"\"\"The Uncharted Land of Discovery: A Journey Through Time and Space\n",
299
+ "[clears throat]\n",
300
+ "Chapter 1: The Dawn of Curiosity\n",
301
+ "[takes breath]\n",
302
+ "Since the dawn of humankind, our species has been driven by a powerful force: curiosity. It is an innate, unquenchable desire to explore, understand, and unravel the mysteries of the world around us. This primal urge has led us on countless adventures, pushing us to the farthest reaches of our planet and beyond.\n",
303
+ "\n",
304
+ "Early humans, huddled around a flickering fire, gazed up at the night sky and wondered what those twinkling lights were. They had no idea that their curiosity would eventually propel us into the vast, uncharted realm of space. As time progressed, our ancestors began to explore their surroundings, venturing beyond their caves and settlements, driven by the need to discover what lay beyond the horizon.\n",
305
+ "\n",
306
+ "hapter 2: The Age of Exploration\n",
307
+ "\n",
308
+ "The Age of Exploration marked a turning point in human history, as brave souls took to the seas in search of new lands, wealth, and knowledge. Pioneers like Christopher Columbus, Vasco da Gama, and Ferdinand Magellan set sail on perilous voyages, pushing the boundaries of what was known and understood.\n",
309
+ "[clears throat]\n",
310
+ "These intrepid explorers discovered new continents, mapped out previously unknown territories, and encountered diverse cultures. They also established trade routes, allowing for the exchange of goods, ideas, and innovations between distant societies. The Age of Exploration was not without its dark moments, however, as conquest, colonization, and exploitation often went hand in hand with discovery.\n",
311
+ "[clears throat]\n",
312
+ "Chapter 3: The Scientific Revolution\n",
313
+ "[laughs]\n",
314
+ "The Scientific Revolution was a period of profound change, as humanity began to question long-held beliefs and seek empirical evidence. Pioneers like Galileo Galilei, Isaac Newton, and Johannes Kepler sought to understand the natural world through observation, experimentation, and reason.\n",
315
+ "[sighs]\n",
316
+ "Their discoveries laid the foundation for modern science, transforming the way we view the universe and our place within it. New technologies, such as the telescope and the microscope, allowed us to peer deeper into the cosmos and the microscopic world, further expanding our understanding of reality.\n",
317
+ "[gasps]\n",
318
+ "Chapter 4: The Information Age\n",
319
+ "\n",
320
+ "The Information Age, sometimes referred to as the Digital Age, has revolutionized the way we communicate, learn, and access knowledge. With the advent of the internet and personal computers, information that was once reserved for the privileged few is now available to the masses.\n",
321
+ "...\n",
322
+ "This democratization of knowledge has led to an explosion of innovation, as ideas and information are shared across borders and cultures at lightning speed. The Information Age has also brought new challenges, as the rapid pace of technological advancements threatens to outpace our ability to adapt and raises questions about the ethical implications of our increasingly interconnected world.\n",
323
+ "[laughter]\n",
324
+ "Chapter 5: The Final Frontier\n",
325
+ "[clears throat]\n",
326
+ "As our knowledge of the universe expands, so too does our desire to explore the cosmos. Space exploration has come a long way since the first successful satellite, Sputnik, was launched in 1957. We have landed humans on the moon, sent probes to the far reaches of our solar system, and even glimpsed distant galaxies through powerful telescopes.\n",
327
+ "\n",
328
+ "The future of space exploration is filled with possibilities, from establishing colonies on Mars to the search for extraterrestrial life. As we venture further into the unknown, we continue to be driven by the same curiosity that has propelled us throughout history, always seeking to uncover the secrets of the universe and our place within it.\n",
329
+ "...\n",
330
+ "In conclusion, the human journey is one of discovery, driven by our innate curiosity and desire to understand the world around us. From the dawn of our species to the present day, we have continued to explore, learn, and adapt, pushing the boundaries of what is known and possible. As we continue to unravel the mysteries of the cosmos, our spirit.\"\"\""
331
+ ]
332
+ },
333
+ {
334
+ "cell_type": "code",
335
+ "execution_count": null,
336
+ "metadata": {},
337
+ "outputs": [],
338
+ "source": [
339
+ "# Chunk the text into smaller pieces then combine the generated audio\n",
340
+ "from time import time\n",
341
+ "from tqdm.auto import tqdm\n",
342
+ "from IPython.display import Audio\n",
343
+ "from scipy.io.wavfile import write as write_wav\n",
344
+ "import os\n",
345
+ "import numpy as np\n",
346
+ "\n",
347
+ "# generation settings\n",
348
+ "voice_name = \"datasets/joe_biden_state_of_union/tokens/257.npz\"\n",
349
+ "out_filepath = 'audio/audio.wav'\n",
350
+ "\n",
351
+ "semantic_temp = 0.7\n",
352
+ "semantic_top_k = 100\n",
353
+ "semantic_top_p = 0.99\n",
354
+ "\n",
355
+ "coarse_temp = 0.7\n",
356
+ "coarse_top_k = 100\n",
357
+ "coarse_top_p = 0.95\n",
358
+ "\n",
359
+ "fine_temp = 0.7\n",
360
+ "\n",
361
+ "use_semantic_history_prompt = True\n",
362
+ "use_coarse_history_prompt = True\n",
363
+ "use_fine_history_prompt = True\n",
364
+ "\n",
365
+ "use_last_generation_as_history = False\n",
366
+ "\n",
367
+ "if use_rvc:\n",
368
+ " index_rate = 0.75\n",
369
+ " f0up_key = -6\n",
370
+ " filter_radius = 3\n",
371
+ " rms_mix_rate = 0.25\n",
372
+ " protect = 0.33\n",
373
+ " resample_sr = SAMPLE_RATE\n",
374
+ " f0method = \"harvest\" #harvest or pm\n",
375
+ "\n",
376
+ "texts = split_and_recombine_text(text)\n",
377
+ "\n",
378
+ "all_parts = []\n",
379
+ "for i, text in tqdm(enumerate(texts), total=len(texts)):\n",
380
+ " full_generation, audio_array = generate_with_settings(\n",
381
+ " text,\n",
382
+ " semantic_temp=semantic_temp,\n",
383
+ " semantic_top_k=semantic_top_k,\n",
384
+ " semantic_top_p=semantic_top_p,\n",
385
+ " coarse_temp=coarse_temp,\n",
386
+ " coarse_top_k=coarse_top_k,\n",
387
+ " coarse_top_p=coarse_top_p,\n",
388
+ " fine_temp=fine_temp,\n",
389
+ " voice_name=voice_name,\n",
390
+ " use_semantic_history_prompt=use_semantic_history_prompt,\n",
391
+ " use_coarse_history_prompt=use_coarse_history_prompt,\n",
392
+ " use_fine_history_prompt=use_fine_history_prompt,\n",
393
+ " output_full=True\n",
394
+ " )\n",
395
+ " if use_last_generation_as_history:\n",
396
+ " # save to npz\n",
397
+ " os.makedirs('_temp', exist_ok=True)\n",
398
+ " np.savez_compressed(\n",
399
+ " '_temp/history.npz',\n",
400
+ " semantic_prompt=full_generation['semantic_prompt'],\n",
401
+ " coarse_prompt=full_generation['coarse_prompt'],\n",
402
+ " fine_prompt=full_generation['fine_prompt'],\n",
403
+ " )\n",
404
+ " voice_name = '_temp/history.npz'\n",
405
+ " write_wav(out_filepath.replace('.wav', f'_{i}') + '.wav', SAMPLE_RATE, audio_array)\n",
406
+ "\n",
407
+ " if use_rvc:\n",
408
+ " try:\n",
409
+ " audio_array = vc_single(0,out_filepath.replace('.wav', f'_{i}') + '.wav',f0up_key,None,f0method,index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
410
+ " except:\n",
411
+ " audio_array = vc_single(0,out_filepath.replace('.wav', f'_{i}') + '.wav',f0up_key,None,'pm',index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
412
+ " write_wav(out_filepath.replace('.wav', f'_{i}') + '.wav', SAMPLE_RATE, audio_array)\n",
413
+ " all_parts.append(audio_array)\n",
414
+ "\n",
415
+ "audio_array = np.concatenate(all_parts, axis=-1)\n",
416
+ "\n",
417
+ "# save audio\n",
418
+ "write_wav(out_filepath, SAMPLE_RATE, audio_array)\n",
419
+ "\n",
420
+ "# play audio\n",
421
+ "Audio(audio_array, rate=SAMPLE_RATE)"
422
+ ]
423
+ },
424
+ {
425
+ "cell_type": "code",
426
+ "execution_count": null,
427
+ "metadata": {},
428
+ "outputs": [],
429
+ "source": []
430
+ }
431
+ ],
432
+ "metadata": {
433
+ "kernelspec": {
434
+ "display_name": "Python 3",
435
+ "language": "python",
436
+ "name": "python3"
437
+ },
438
+ "language_info": {
439
+ "codemirror_mode": {
440
+ "name": "ipython",
441
+ "version": 3
442
+ },
443
+ "file_extension": ".py",
444
+ "mimetype": "text/x-python",
445
+ "name": "python",
446
+ "nbconvert_exporter": "python",
447
+ "pygments_lexer": "ipython3",
448
+ "version": "3.10.8"
449
+ },
450
+ "orig_nbformat": 4
451
+ },
452
+ "nbformat": 4,
453
+ "nbformat_minor": 2
454
+ }
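test_models.ipynb repeats the same RVC post-processing block (with a harvest-to-pm fallback) after every Bark render. A minimal sketch of that block factored into one helper, assuming `get_vc(...)` has already been called and `index_path` is set as in the notebook:

```python
# Sketch: run RVC over a rendered wav and overwrite it with the result,
# mirroring the notebook's try/except fallback from "harvest" to "pm".
from scipy.io.wavfile import write as write_wav
from rvc_infer import vc_single

def rvc_postprocess(filepath, index_path, sample_rate,
                    f0up_key=-6, index_rate=0.75, filter_radius=3,
                    rms_mix_rate=0.25, protect=0.33, f0method="harvest"):
    try:
        audio = vc_single(0, filepath, f0up_key, None, f0method, index_path,
                          index_rate, filter_radius=filter_radius,
                          resample_sr=sample_rate, rms_mix_rate=rms_mix_rate,
                          protect=protect)
    except Exception:
        # fall back to "pm" if harvest fails, as the notebook does
        audio = vc_single(0, filepath, f0up_key, None, "pm", index_path,
                          index_rate, filter_radius=filter_radius,
                          resample_sr=sample_rate, rms_mix_rate=rms_mix_rate,
                          protect=protect)
    write_wav(filepath, sample_rate, audio)
    return audio
```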
train_coarse.ipynb ADDED
@@ -0,0 +1,936 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Imports"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "metadata": {},
15
+ "outputs": [],
16
+ "source": [
17
+ "import torch\n",
18
+ "import torch.nn as nn\n",
19
+ "import os\n",
20
+ "import re\n",
21
+ "import gc\n",
22
+ "import math\n",
23
+ "import json\n",
24
+ "import hashlib\n",
25
+ "import numpy as np\n",
26
+ "import logging\n",
27
+ "import torchaudio\n",
28
+ "from tqdm.auto import tqdm\n",
29
+ "import torch.nn.functional as F\n",
30
+ "from encodec.utils import convert_audio\n",
31
+ "from accelerate import Accelerator\n",
32
+ "from accelerate.utils import set_seed\n",
33
+ "from transformers import BertTokenizer\n",
34
+ "from huggingface_hub import hf_hub_download\n",
35
+ "from packaging import version\n",
36
+ "from diffusers.optimization import get_scheduler\n",
37
+ "\n",
38
+ "from utils.bitsandbytes import BitsAndBytesConfig, importlib_metadata, get_keys_to_not_convert, replace_with_bnb_linear, set_module_quantized_tensor_to_device\n",
39
+ "from utils.lora import convert_linear_layer_to_lora, only_optimize_lora_parameters, convert_lora_to_linear_layer\n",
40
+ "from bark.model import GPTConfig, GPT\n",
41
+ "from bark.model_fine import FineGPT, FineGPTConfig"
42
+ ]
43
+ },
44
+ {
45
+ "attachments": {},
46
+ "cell_type": "markdown",
47
+ "metadata": {},
48
+ "source": [
49
+ "# Training Args"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": null,
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "train_batch_size = 8\n",
59
+ "eval_batch_size = 8\n",
60
+ "grad_accum = 2\n",
61
+ "ckpt_path = 'models/coarse_2.pt'\n",
62
+ "model_type = \"coarse\"\n",
63
+ "dataset_path = 'datasets/joe_biden_state_of_union/'\n",
64
+ "logging_dir = 'logs/'\n",
65
+ "log_with = 'wandb'\n",
66
+ "hubert_path = 'data/models/hubert/hubert.pt'\n",
67
+ "hubert_tokenizer_path = 'data/models/hubert/tokenizer.pth'\n",
68
+ "\n",
69
+ "output_dir = 'coarse_output/'\n",
70
+ "resume_from_checkpoint = None\n",
71
+ "\n",
72
+ "checkpointing_steps = 1000\n",
73
+ "\n",
74
+ "mixed_precision = 'bf16'\n",
75
+ "bits = 16 #4 4 and 8 bit are a work in progress\n",
76
+ "compute_dtype = torch.bfloat16\n",
77
+ "double_quant = True\n",
78
+ "quant_type = 'nf4'\n",
79
+ "\n",
80
+ "lora_dim = 64\n",
81
+ "lora_scaling = 1\n",
82
+ "lora_dropout = 0.1\n",
83
+ "lora_module_name = 'transformer.h'\n",
84
+ "optimize_lora_params_only = False\n",
85
+ "\n",
86
+ "learning_rate = 1e-4\n",
87
+ "scale_lr = False\n",
88
+ "use_8bit_adam = False\n",
89
+ "adam_beta1 = 0.9\n",
90
+ "adam_beta2 = 0.999\n",
91
+ "adam_epsilon = 1e-8\n",
92
+ "weight_decay = 0.01\n",
93
+ "\n",
94
+ "llm_int8_skip_modules = None\n",
95
+ "keep_in_fp32_modules = ['lm_head']\n",
96
+ "\n",
97
+ "lr_scheduler_type = 'linear'\n",
98
+ "lr_warmup_steps = 60\n",
99
+ "num_train_epochs = 5\n",
100
+ "max_train_steps = None\n",
101
+ "max_grad_norm = 1.0\n",
102
+ "\n",
103
+ "semantic_cross_entropy_loss_weight = 0.0\n",
104
+ "\n",
105
+ "seed = 741"
106
+ ]
107
+ },
108
+ {
109
+ "attachments": {},
110
+ "cell_type": "markdown",
111
+ "metadata": {},
112
+ "source": [
113
+ "# Define Functions"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": null,
119
+ "metadata": {},
120
+ "outputs": [],
121
+ "source": [
122
+ "CONTEXT_WINDOW_SIZE = 1024\n",
123
+ "\n",
124
+ "MAX_SEMANTIC_LEN = 256\n",
125
+ "\n",
126
+ "SEMANTIC_RATE_HZ = 49.9\n",
127
+ "SEMANTIC_VOCAB_SIZE = 10_000\n",
128
+ "\n",
129
+ "TEXT_ENCODING_OFFSET = 10_048\n",
130
+ "SEMANTIC_PAD_TOKEN = 10_000\n",
131
+ "TEXT_PAD_TOKEN = 129_595\n",
132
+ "SEMANTIC_INFER_TOKEN = 129_599\n",
133
+ "\n",
134
+ "MAX_COARSE_LEN = 768\n",
135
+ "\n",
136
+ "SAMPLE_RATE = 24_000\n",
137
+ "CHANNELS = 1\n",
138
+ "\n",
139
+ "COARSE_SEMANTIC_PAD_TOKEN = 12_048\n",
140
+ "COARSE_INFER_TOKEN = 12_050\n",
141
+ "\n",
142
+ "CODEBOOK_SIZE = 1024\n",
143
+ "N_COARSE_CODEBOOKS = 2\n",
144
+ "N_FINE_CODEBOOKS = 8\n",
145
+ "COARSE_RATE_HZ = 75\n",
146
+ "\n",
147
+ "logger = logging.getLogger(__name__)\n",
148
+ "\n",
149
+ "\n",
150
+ "USE_SMALL_MODELS = os.environ.get(\"SERP_USE_SMALL_MODELS\", False)\n",
151
+ "\n",
152
+ "default_cache_dir = os.path.join(os.path.expanduser(\"~\"), \".cache\")\n",
153
+ "CACHE_DIR = os.path.join(os.getenv(\"XDG_CACHE_HOME\", default_cache_dir), \"serp\", \"bark_v0\")\n",
154
+ "\n",
155
+ "\n",
156
+ "def _clear_cuda_cache():\n",
157
+ " if torch.cuda.is_available():\n",
158
+ " torch.cuda.empty_cache()\n",
159
+ " torch.cuda.synchronize()\n",
160
+ "\n",
161
+ "\n",
162
+ "def _md5(fname):\n",
163
+ " hash_md5 = hashlib.md5()\n",
164
+ " with open(fname, \"rb\") as f:\n",
165
+ " for chunk in iter(lambda: f.read(4096), b\"\"):\n",
166
+ " hash_md5.update(chunk)\n",
167
+ " return hash_md5.hexdigest()\n",
168
+ "\n",
169
+ "\n",
170
+ "def _download(from_hf_path, file_name, to_local_path):\n",
171
+ " to_local_path = to_local_path.replace(\"\\\\\", \"/\")\n",
172
+ " path = '/'.join(to_local_path.split(\"/\")[:-1])\n",
173
+ " os.makedirs(path, exist_ok=True)\n",
174
+ " hf_hub_download(repo_id=from_hf_path, filename=file_name, local_dir=path)\n",
175
+ " os.replace(os.path.join(path, file_name), to_local_path)\n",
176
+ "\n",
177
+ "\n",
178
+ "def _tokenize(tokenizer, text):\n",
179
+ " return tokenizer.encode(text, add_special_tokens=False)\n",
180
+ "\n",
181
+ "\n",
182
+ "def _detokenize(tokenizer, enc_text):\n",
183
+ " return tokenizer.decode(enc_text)\n",
184
+ "\n",
185
+ "\n",
186
+ "def _normalize_whitespace(text):\n",
187
+ " return re.sub(r\"\\s+\", \" \", text).strip()\n",
188
+ "\n",
189
+ "\n",
190
+ "REMOTE_MODEL_PATHS = {\n",
191
+ " \"text_small\": {\n",
192
+ " \"repo_id\": \"suno/bark\",\n",
193
+ " \"file_name\": \"text.pt\",\n",
194
+ " \"checksum\": \"b3e42bcbab23b688355cd44128c4cdd3\",\n",
195
+ " },\n",
196
+ " \"coarse_small\": {\n",
197
+ " \"repo_id\": \"suno/bark\",\n",
198
+ " \"file_name\": \"coarse.pt\",\n",
199
+ " \"checksum\": \"5fe964825e3b0321f9d5f3857b89194d\",\n",
200
+ " },\n",
201
+ " \"fine_small\": {\n",
202
+ " \"repo_id\": \"suno/bark\",\n",
203
+ " \"file_name\": \"fine.pt\",\n",
204
+ " \"checksum\": \"5428d1befe05be2ba32195496e58dc90\",\n",
205
+ " },\n",
206
+ " \"text\": {\n",
207
+ " \"repo_id\": \"suno/bark\",\n",
208
+ " \"file_name\": \"text_2.pt\",\n",
209
+ " \"checksum\": \"54afa89d65e318d4f5f80e8e8799026a\",\n",
210
+ " },\n",
211
+ " \"coarse\": {\n",
212
+ " \"repo_id\": \"suno/bark\",\n",
213
+ " \"file_name\": \"coarse_2.pt\",\n",
214
+ " \"checksum\": \"8a98094e5e3a255a5c9c0ab7efe8fd28\",\n",
215
+ " },\n",
216
+ " \"fine\": {\n",
217
+ " \"repo_id\": \"suno/bark\",\n",
218
+ " \"file_name\": \"fine_2.pt\",\n",
219
+ " \"checksum\": \"59d184ed44e3650774a2f0503a48a97b\",\n",
220
+ " },\n",
221
+ "}\n",
222
+ "\n",
223
+ "\n",
224
+ "def _load_model(ckpt_path, device, use_small=False, model_type=\"text\"):\n",
225
+ " if model_type == \"text\":\n",
226
+ " ConfigClass = GPTConfig\n",
227
+ " ModelClass = GPT\n",
228
+ " elif model_type == \"coarse\":\n",
229
+ " ConfigClass = GPTConfig\n",
230
+ " ModelClass = GPT\n",
231
+ " elif model_type == \"fine\":\n",
232
+ " ConfigClass = FineGPTConfig\n",
233
+ " ModelClass = FineGPT\n",
234
+ " else:\n",
235
+ " raise NotImplementedError()\n",
236
+ " model_key = f\"{model_type}_small\" if use_small or USE_SMALL_MODELS else model_type\n",
237
+ " model_info = REMOTE_MODEL_PATHS[model_key]\n",
238
+ " if ckpt_path in [None, '']:\n",
239
+ " ckpt_path = os.path.join(CACHE_DIR, model_info[\"file_name\"])\n",
240
+ " if not os.path.exists(ckpt_path):\n",
241
+ " logger.info(f\"{model_type} model not found, downloading into `{CACHE_DIR}`.\")\n",
242
+ " _download(model_info[\"repo_id\"], model_info[\"file_name\"], ckpt_path)\n",
243
+ " checkpoint = torch.load(ckpt_path, map_location=device)\n",
244
+ " # this is a hack\n",
245
+ " model_args = checkpoint[\"model_args\"]\n",
246
+ " if \"input_vocab_size\" not in model_args:\n",
247
+ " model_args[\"input_vocab_size\"] = model_args[\"vocab_size\"]\n",
248
+ " model_args[\"output_vocab_size\"] = model_args[\"vocab_size\"]\n",
249
+ " del model_args[\"vocab_size\"]\n",
250
+ " gptconf = ConfigClass(**checkpoint[\"model_args\"])\n",
251
+ " model = ModelClass(gptconf)\n",
252
+ " state_dict = checkpoint[\"model\"]\n",
253
+ " # fixup checkpoint\n",
254
+ " unwanted_prefix = \"_orig_mod.\"\n",
255
+ " for k, v in list(state_dict.items()):\n",
256
+ " if k.startswith(unwanted_prefix):\n",
257
+ " state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)\n",
258
+ " extra_keys = set(state_dict.keys()) - set(model.state_dict().keys())\n",
259
+ " extra_keys = set([k for k in extra_keys if not k.endswith(\".attn.bias\")])\n",
260
+ " missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())\n",
261
+ " missing_keys = set([k for k in missing_keys if not k.endswith(\".attn.bias\")])\n",
262
+ " if len(extra_keys) != 0:\n",
263
+ " raise ValueError(f\"extra keys found: {extra_keys}\")\n",
264
+ " if len(missing_keys) != 0:\n",
265
+ " raise ValueError(f\"missing keys: {missing_keys}\")\n",
266
+ " model.load_state_dict(state_dict, strict=False)\n",
267
+ " n_params = model.get_num_params()\n",
268
+ " val_loss = checkpoint[\"best_val_loss\"].item()\n",
269
+ " print(f\"Loaded {model_type} model with {n_params} params, val_loss={val_loss:.4f}.\")\n",
270
+ " del checkpoint, state_dict\n",
271
+ " _clear_cuda_cache()\n",
272
+ " if model_type == \"text\":\n",
273
+ " tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\")\n",
274
+ " return model, tokenizer\n",
275
+ " return model\n",
276
+ "\n",
277
+ "\n",
278
+ "def _flatten_codebooks(arr, offset_size=CODEBOOK_SIZE):\n",
279
+ " assert len(arr.shape) == 2\n",
280
+ " arr = arr.copy()\n",
281
+ " if offset_size is not None:\n",
282
+ " for n in range(1, arr.shape[0]):\n",
283
+ " arr[n, :] += offset_size * n\n",
284
+ " flat_arr = arr.ravel(\"F\")\n",
285
+ " return flat_arr\n",
286
+ "\n",
287
+ "\n",
288
+ "def load_filepaths_and_text(filename, split=\"|\"):\n",
289
+ " with open(filename, encoding='utf-8', errors='ignore') as f:\n",
290
+ " filepaths_and_text = [line.strip().split(split) for line in f]\n",
291
+ " base = os.path.dirname(filename)\n",
292
+ " for j in range(len(filepaths_and_text)):\n",
293
+ " filepaths_and_text[j][0] = os.path.join(base, filepaths_and_text[j][0])\n",
294
+ " return filepaths_and_text\n",
295
+ "\n",
296
+ "\n",
297
+ "class TtsDataset(torch.utils.data.Dataset):\n",
298
+ " def __init__(self, opt):\n",
299
+ " self.path = os.path.dirname(opt['path'])\n",
300
+ " self.mode = opt['mode']\n",
301
+ " self.audiopaths_and_text = load_filepaths_and_text(os.path.join(opt['path'] , opt['mode'] + '.txt'))\n",
302
+ "\n",
303
+ " def __getitem__(self, index):\n",
304
+ " audiopath_and_text = self.audiopaths_and_text[index]\n",
305
+ " audiopath = audiopath_and_text[0]\n",
306
+ "\n",
307
+ " tokens = np.load(audiopath.replace('.wav', '.npz').replace('wavs', 'tokens'))\n",
308
+ " semantic_tokens = tokens['semantic']\n",
309
+ " coarse_tokens = _flatten_codebooks(tokens['coarse'], offset_size=CODEBOOK_SIZE) + SEMANTIC_VOCAB_SIZE\n",
310
+ "\n",
311
+ " return torch.from_numpy(semantic_tokens), torch.from_numpy(coarse_tokens)\n",
312
+ "\n",
313
+ " def __len__(self):\n",
314
+ " return len(self.audiopaths_and_text)\n",
315
+ "\n",
316
+ "\n",
317
+ "class TtsCollater():\n",
318
+ " def __init__(self):\n",
319
+ " pass\n",
320
+ " def __call__(self, batch):\n",
321
+ " max_semantic_len = MAX_SEMANTIC_LEN\n",
322
+ " max_coarse_len = MAX_COARSE_LEN\n",
323
+ " semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS\n",
324
+ " semantic_tokens = []\n",
325
+ " coarse_tokens = []\n",
326
+ "\n",
327
+ " for b in batch:\n",
328
+ " semantic_tokens_, coarse_tokens_ = b\n",
329
+ " start_idx = None\n",
330
+ " if len(semantic_tokens_) > max_semantic_len:\n",
331
+ " start_idx = np.random.randint(0, len(semantic_tokens_) - max_semantic_len + 1)\n",
332
+ " semantic_tokens_ = semantic_tokens_[start_idx:start_idx+max_semantic_len]\n",
333
+ " semantic_tokens_ = F.pad(semantic_tokens_, (0, max_semantic_len-len(semantic_tokens_)), value=COARSE_SEMANTIC_PAD_TOKEN)\n",
334
+ " semantic_tokens_ = torch.cat([semantic_tokens_, torch.tensor([COARSE_INFER_TOKEN])])\n",
335
+ " semantic_tokens.append(semantic_tokens_)\n",
336
+ "\n",
337
+ " if start_idx is not None:\n",
338
+ " start_idx_coarse = int(start_idx * semantic_to_coarse_ratio) \n",
339
+ " coarse_tokens_ = coarse_tokens_[start_idx_coarse:start_idx_coarse+max_coarse_len]\n",
340
+ " coarse_tokens_ = F.pad(coarse_tokens_, (0, max_coarse_len-len(coarse_tokens_)), value=COARSE_SEMANTIC_PAD_TOKEN)\n",
341
+ " coarse_tokens.append(coarse_tokens_)\n",
342
+ "\n",
343
+ " return {\n",
344
+ " 'semantic_tokens': torch.stack(semantic_tokens).contiguous(),\n",
345
+ " 'coarse_tokens': torch.stack(coarse_tokens).contiguous()\n",
346
+ " }\n",
347
+ " \n",
348
+ "\n",
349
+ "accelerator = Accelerator(\n",
350
+ " gradient_accumulation_steps=grad_accum,\n",
351
+ " mixed_precision=mixed_precision,\n",
352
+ " log_with=log_with,\n",
353
+ " logging_dir=logging_dir,\n",
354
+ ")\n",
355
+ "device = accelerator.device\n",
356
+ "\n",
357
+ "os.makedirs(output_dir, exist_ok=True)\n",
358
+ "\n",
359
+ "set_seed(seed)"
360
+ ]
361
+ },
362
+ {
363
+ "attachments": {},
364
+ "cell_type": "markdown",
365
+ "metadata": {},
366
+ "source": [
367
+ "# Setup Dataset (only need to do this once)"
368
+ ]
369
+ },
370
+ {
371
+ "cell_type": "code",
372
+ "execution_count": null,
373
+ "metadata": {},
374
+ "outputs": [],
375
+ "source": [
376
+ "# max_duration_sec = 15.12 # the maximum allowed duration in seconds\n",
377
+ "\n",
378
+ "# path = dataset_path\n",
379
+ "\n",
380
+ "# # From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer\n",
381
+ "# from hubert.hubert_manager import HuBERTManager\n",
382
+ "# hubert_manager = HuBERTManager()\n",
383
+ "# from hubert.pre_kmeans_hubert import CustomHubert\n",
384
+ "# from hubert.customtokenizer import CustomTokenizer\n",
385
+ "# hubert_manager.make_sure_hubert_installed()\n",
386
+ "# hubert_manager.make_sure_tokenizer_installed()\n",
387
+ "\n",
388
+ "# # Load the HuBERT model\n",
389
+ "# hubert_model = CustomHubert(checkpoint_path=hubert_path).to(device)\n",
390
+ "# hubert_model.eval()\n",
391
+ "# for param in hubert_model.parameters():\n",
392
+ "# param.requires_grad = False\n",
393
+ "\n",
394
+ "# # Load the CustomTokenizer model\n",
395
+ "# hubert_tokenizer = CustomTokenizer.load_from_checkpoint(hubert_tokenizer_path).to(device) # Automatically uses the right layers\n",
396
+ "\n",
397
+ "# from bark.generation import load_codec_model\n",
398
+ "# codec_model = load_codec_model(use_gpu=True)\n",
399
+ "# codec_model.eval()\n",
400
+ "# for param in codec_model.parameters():\n",
401
+ "# param.requires_grad = False\n",
402
+ "\n",
403
+ "\n",
404
+ "# def get_duration(wav, sr):\n",
405
+ "# return wav.shape[1] / sr\n",
406
+ "\n",
407
+ "# valid_lines_train = []\n",
408
+ "# # convert wavs to semantic tokens\n",
409
+ "# for wav_path, txt in load_filepaths_and_text(path + 'train.txt'):\n",
410
+ "# wav, sr = torchaudio.load(wav_path)\n",
411
+ "# if not get_duration(wav, sr) > max_duration_sec:\n",
412
+ "# valid_lines_train.append((wav_path, txt))\n",
413
+ "# wav = convert_audio(wav, sr, SAMPLE_RATE, CHANNELS).to(device)\n",
414
+ "\n",
415
+ "# semantic_vectors = hubert_model.forward(wav, input_sample_hz=SAMPLE_RATE)\n",
416
+ "# semantic_tokens = hubert_tokenizer.get_token(semantic_vectors)\n",
417
+ "\n",
418
+ "# # save semantic tokens\n",
419
+ "# os.makedirs(os.path.join(path, 'tokens'), exist_ok=True)\n",
420
+ "# semantic_tokens = semantic_tokens.cpu().numpy()\n",
421
+ "\n",
422
+ "# # Extract discrete codes from EnCodec\n",
423
+ "# with torch.no_grad():\n",
424
+ "# encoded_frames = codec_model.encode(wav.unsqueeze(0))\n",
425
+ "# codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]\n",
426
+ "\n",
427
+ "# # move codes to cpu\n",
428
+ "# codes = codes.cpu().numpy()\n",
429
+ "\n",
430
+ "# # save tokens\n",
431
+ "# np.savez_compressed(os.path.join(path, 'tokens', os.path.basename(wav_path).replace('.wav', '.npz')), fine=codes, coarse=codes[:2, :], semantic=semantic_tokens)\n",
432
+ "\n",
433
+ "# # rewrite train.txt with valid lines\n",
434
+ "# with open(path + 'train_valid.txt', 'w', encoding='utf-8') as f:\n",
435
+ "# for wav_path, txt in valid_lines_train:\n",
436
+ "# wav_path = os.path.relpath(wav_path, dataset_path).replace('\\\\', '/')\n",
437
+ "# f.write(f'{wav_path}|{txt}\\n')\n",
438
+ "\n",
439
+ "# valid_lines_valid = []\n",
440
+ "# for wav_path, txt in load_filepaths_and_text(path + 'valid.txt'):\n",
441
+ "# wav, sr = torchaudio.load(wav_path)\n",
442
+ "# if not get_duration(wav, sr) > max_duration_sec:\n",
443
+ "# valid_lines_valid.append((wav_path, txt))\n",
444
+ "# wav = convert_audio(wav, sr, SAMPLE_RATE, CHANNELS).to(device)\n",
445
+ "\n",
446
+ "# semantic_vectors = hubert_model.forward(wav, input_sample_hz=SAMPLE_RATE)\n",
447
+ "# semantic_tokens = hubert_tokenizer.get_token(semantic_vectors)\n",
448
+ "\n",
449
+ "# # save semantic tokens\n",
450
+ "# os.makedirs(os.path.join(path, 'tokens'), exist_ok=True)\n",
451
+ "# semantic_tokens = semantic_tokens.cpu().numpy()\n",
452
+ " \n",
453
+ "# # Extract discrete codes from EnCodec\n",
454
+ "# with torch.no_grad():\n",
455
+ "# encoded_frames = codec_model.encode(wav.unsqueeze(0))\n",
456
+ "# codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]\n",
457
+ "\n",
458
+ "# # move codes to cpu\n",
459
+ "# codes = codes.cpu().numpy()\n",
460
+ "\n",
461
+ "# # save tokens\n",
462
+ "# np.savez_compressed(os.path.join(path, 'tokens', os.path.basename(wav_path).replace('.wav', '.npz')), fine=codes, coarse=codes[:2, :], semantic=semantic_tokens)\n",
463
+ "\n",
464
+ "# # rewrite valid.txt with valid lines\n",
465
+ "# with open(path + 'valid_valid.txt', 'w', encoding='utf-8') as f:\n",
466
+ "# for wav_path, txt in valid_lines_valid:\n",
467
+ "# wav_path = os.path.relpath(wav_path, dataset_path).replace('\\\\', '/')\n",
468
+ "# f.write(f'{wav_path}|{txt}\\n')\n",
469
+ "\n",
470
+ "# del hubert_model\n",
471
+ "# del hubert_tokenizer\n",
472
+ "# del codec_model\n",
473
+ "# gc.collect()\n",
474
+ "# torch.cuda.empty_cache()"
475
+ ]
476
+ },
477
+ {
478
+ "attachments": {},
479
+ "cell_type": "markdown",
480
+ "metadata": {},
481
+ "source": [
482
+ "# Setup"
483
+ ]
484
+ },
485
+ {
486
+ "cell_type": "code",
487
+ "execution_count": null,
488
+ "metadata": {},
489
+ "outputs": [],
490
+ "source": [
491
+ "model = _load_model(ckpt_path, device, use_small=False, model_type=model_type)"
492
+ ]
493
+ },
494
+ {
495
+ "cell_type": "code",
496
+ "execution_count": null,
497
+ "metadata": {},
498
+ "outputs": [],
499
+ "source": [
500
+ "if scale_lr:\n",
501
+ " learning_rate = (\n",
502
+ " learning_rate * grad_accum * train_batch_size * accelerator.num_processes\n",
503
+ " )\n",
504
+ "\n",
505
+ "if use_8bit_adam:\n",
506
+ " try:\n",
507
+ " import bitsandbytes as bnb\n",
508
+ " except ImportError:\n",
509
+ " raise ImportError(\n",
510
+ " \"To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`.\"\n",
511
+ " )\n",
512
+ "\n",
513
+ " optimizer_class = bnb.optim.AdamW8bit\n",
514
+ "else:\n",
515
+ " optimizer_class = torch.optim.AdamW"
516
+ ]
517
+ },
518
+ {
519
+ "cell_type": "code",
520
+ "execution_count": null,
521
+ "metadata": {},
522
+ "outputs": [],
523
+ "source": [
524
+ "quantization_config=BitsAndBytesConfig(\n",
525
+ " load_in_4bit=bits == 4,\n",
526
+ " load_in_8bit=bits == 8,\n",
527
+ " llm_int8_threshold=6.0,\n",
528
+ " llm_int8_has_fp16_weight=False,\n",
529
+ " bnb_4bit_compute_dtype=compute_dtype,\n",
530
+ " bnb_4bit_use_double_quant=double_quant,\n",
531
+ " bnb_4bit_quant_type=quant_type # {'fp4', 'nf4'}\n",
532
+ ")\n",
533
+ "\n",
534
+ "# if quantization_config.load_in_8bit or quantization_config.load_in_4bit:\n",
535
+ "# if quantization_config.load_in_8bit:\n",
536
+ "# logger.info(\"Detected 8-bit loading: activating 8-bit loading for this model\")\n",
537
+ "# elif quantization_config.load_in_4bit:\n",
538
+ "# logger.info(\"Detected 4-bit loading: activating 4-bit loading for this model\")\n",
539
+ "\n",
540
+ "# # We keep some modules such as the lm_head in their original dtype for numerical stability reasons\n",
541
+ "# if llm_int8_skip_modules is None or len(llm_int8_skip_modules) == 0:\n",
542
+ "# modules_to_not_convert = [] # get_keys_to_not_convert(model)\n",
543
+ "# else:\n",
544
+ "# modules_to_not_convert = llm_int8_skip_modules\n",
545
+ "\n",
546
+ "# if not isinstance(modules_to_not_convert, list):\n",
547
+ "# modules_to_not_convert = [modules_to_not_convert]\n",
548
+ "\n",
549
+ "# modules_to_not_convert.extend(keep_in_fp32_modules)\n",
550
+ "\n",
551
+ "# supports_4bit = version.parse(importlib_metadata.version(\"bitsandbytes\")) >= version.parse(\"0.39.0\")\n",
552
+ "\n",
553
+ "# if quantization_config.load_in_4bit and not supports_4bit:\n",
554
+ "# raise ValueError(\n",
555
+ "# \"You have a version of `bitsandbytes` that is not compatible with 4bit inference and training\"\n",
556
+ "# \" make sure you have the latest version of `bitsandbytes` installed\"\n",
557
+ "# )\n",
558
+ " \n",
559
+ "# if len(modules_to_not_convert) == 0:\n",
560
+ "# modules_to_not_convert = None\n",
561
+ "\n",
562
+ "# model = replace_with_bnb_linear(\n",
563
+ "# model, modules_to_not_convert=modules_to_not_convert, quantization_config=quantization_config\n",
564
+ "# )\n",
565
+ "\n",
566
+ "# # training in 8-bit is only available in 0.37.0+\n",
567
+ "# model._is_kbit_training_enabled = version.parse(\n",
568
+ "# importlib_metadata.version(\"bitsandbytes\")\n",
569
+ "# ) >= version.parse(\"0.37.0\")\n",
570
+ "\n",
571
+ "# model.config.quantization_config = quantization_config"
572
+ ]
573
+ },
574
+ {
575
+ "cell_type": "code",
576
+ "execution_count": null,
577
+ "metadata": {},
578
+ "outputs": [],
579
+ "source": [
580
+ "if bits == 4:\n",
581
+ " from accelerate.utils import CustomDtype\n",
582
+ " target_dtype = CustomDtype.INT4\n",
583
+ "elif bits == 8:\n",
584
+ " target_dtype = torch.int8\n",
585
+ "\n",
586
+ "if lora_dim > 0:\n",
587
+ " for param in model.parameters():\n",
588
+ " if param.ndim == 1:\n",
589
+ " # cast the small parameters (e.g. layernorm) to fp32 for stability\n",
590
+ " param.data = param.data.to(torch.float32)\n",
591
+ " \n",
592
+ " class CastOutputToFloat(nn.Sequential):\n",
593
+ " def forward(self, x):\n",
594
+ " return super().forward(x).to(torch.float32)\n",
595
+ "\n",
596
+ " model.lm_head = CastOutputToFloat(model.lm_head)\n",
597
+ "\n",
598
+ " model = convert_linear_layer_to_lora(model, lora_module_name,\n",
599
+ " lora_dim=lora_dim, lora_scaling=lora_scaling,\n",
600
+ " lora_dropout=lora_dropout)\n",
601
+ " if optimize_lora_params_only:\n",
602
+ " model = only_optimize_lora_parameters(model)"
603
+ ]
604
+ },
605
+ {
606
+ "cell_type": "code",
607
+ "execution_count": null,
608
+ "metadata": {},
609
+ "outputs": [],
610
+ "source": [
611
+ "params_to_optimize = (\n",
612
+ " param for param in model.parameters() if param.requires_grad\n",
613
+ " )\n",
614
+ "\n",
615
+ "optimizer = optimizer_class(\n",
616
+ " params_to_optimize,\n",
617
+ " lr=learning_rate,\n",
618
+ " betas=(adam_beta1, adam_beta2),\n",
619
+ " weight_decay=weight_decay,\n",
620
+ " eps=adam_epsilon,\n",
621
+ ")"
622
+ ]
623
+ },
624
+ {
625
+ "cell_type": "code",
626
+ "execution_count": null,
627
+ "metadata": {},
628
+ "outputs": [],
629
+ "source": [
630
+ "opt_train = {\n",
631
+ " 'path': dataset_path,\n",
632
+ " 'mode': 'train',\n",
633
+ "}\n",
634
+ "\n",
635
+ "opt_val = {\n",
636
+ " 'path': dataset_path,\n",
637
+ " 'mode': 'valid',\n",
638
+ "}\n",
639
+ "\n",
640
+ "train_dataset = TtsDataset(opt_train)\n",
641
+ "validation_dataset = TtsDataset(opt_val)\n",
642
+ "\n",
643
+ "train_dataloader = torch.utils.data.DataLoader(\n",
644
+ " train_dataset,\n",
645
+ " batch_size=train_batch_size,\n",
646
+ " collate_fn=TtsCollater(),\n",
647
+ ")\n",
648
+ "\n",
649
+ "validation_dataloader = torch.utils.data.DataLoader(\n",
650
+ " validation_dataset,\n",
651
+ " batch_size=eval_batch_size,\n",
652
+ " collate_fn=TtsCollater(),\n",
653
+ ")\n",
654
+ "\n",
655
+ "criterion = torch.nn.CrossEntropyLoss(ignore_index=COARSE_SEMANTIC_PAD_TOKEN)\n",
656
+ "\n",
657
+ "# Scheduler and math around the number of training steps.\n",
658
+ "overrode_max_train_steps = False\n",
659
+ "num_update_steps_per_epoch = math.ceil(len(train_dataloader) / grad_accum)\n",
660
+ "if max_train_steps is None:\n",
661
+ " max_train_steps = num_train_epochs * num_update_steps_per_epoch\n",
662
+ " overrode_max_train_steps = True\n",
663
+ "\n",
664
+ "lr_scheduler = get_scheduler(\n",
665
+ " lr_scheduler_type,\n",
666
+ " optimizer=optimizer,\n",
667
+ " num_warmup_steps=lr_warmup_steps * grad_accum,\n",
668
+ " num_training_steps=max_train_steps * grad_accum,\n",
669
+ ")"
670
+ ]
671
+ },
672
+ {
673
+ "cell_type": "code",
674
+ "execution_count": null,
675
+ "metadata": {},
676
+ "outputs": [],
677
+ "source": [
678
+ "model, optimizer, train_dataloader, validation_dataloader, lr_scheduler = accelerator.prepare(\n",
679
+ " model, optimizer, train_dataloader, validation_dataloader, lr_scheduler\n",
680
+ ")\n",
681
+ "accelerator.register_for_checkpointing(lr_scheduler)\n",
682
+ "\n",
683
+ "weight_dtype = torch.float32\n",
684
+ "if accelerator.mixed_precision == \"fp16\":\n",
685
+ " weight_dtype = torch.float16\n",
686
+ "elif accelerator.mixed_precision == \"bf16\":\n",
687
+ " weight_dtype = torch.bfloat16"
688
+ ]
689
+ },
690
+ {
691
+ "cell_type": "code",
692
+ "execution_count": null,
693
+ "metadata": {},
694
+ "outputs": [],
695
+ "source": [
696
+ "# We need to recalculate our total training steps as the size of the training dataloader may have changed.\n",
697
+ "num_update_steps_per_epoch = math.ceil(len(train_dataloader) / grad_accum)\n",
698
+ "if overrode_max_train_steps:\n",
699
+ " max_train_steps = num_train_epochs * num_update_steps_per_epoch\n",
700
+ "# Afterwards we recalculate our number of training epochs\n",
701
+ "num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)\n",
702
+ "\n",
703
+ "# We need to initialize the trackers we use, and also store our configuration.\n",
704
+ "# The trackers initializes automatically on the main process.\n",
705
+ "if accelerator.is_main_process:\n",
706
+ " accelerator.init_trackers(\"bark_coarse\", config={})\n",
707
+ "\n",
708
+ "# Train!\n",
709
+ "total_batch_size = train_batch_size * accelerator.num_processes * grad_accum\n",
710
+ "logger.info(\"***** Running training *****\")\n",
711
+ "logger.info(f\" Num examples = {len(train_dataset)}\")\n",
712
+ "logger.info(f\" Num batches each epoch = {len(train_dataloader)}\")\n",
713
+ "logger.info(f\" Num Epochs = {num_train_epochs}\")\n",
714
+ "logger.info(f\" Instantaneous batch size per device = {train_batch_size}\")\n",
715
+ "logger.info(f\" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}\")\n",
716
+ "logger.info(f\" Gradient Accumulation steps = {grad_accum}\")\n",
717
+ "logger.info(f\" Total optimization steps = {max_train_steps}\")\n",
718
+ "global_step = 0\n",
719
+ "first_epoch = 0\n",
720
+ "\n",
721
+ "if resume_from_checkpoint:\n",
722
+ " if resume_from_checkpoint != \"latest\":\n",
723
+ " path = os.path.basename(resume_from_checkpoint)\n",
724
+ " else:\n",
725
+ " # Get the most recent checkpoint\n",
726
+ " dirs = os.listdir(output_dir)\n",
727
+ " dirs = [d for d in dirs if d.startswith(\"checkpoint\")]\n",
728
+ " dirs = sorted(dirs, key=lambda x: int(x.split(\"-\")[1]))\n",
729
+ " path = dirs[-1]\n",
730
+ " accelerator.print(f\"Resuming from checkpoint {path}\")\n",
731
+ " accelerator.load_state(os.path.join(output_dir, path))\n",
732
+ " global_step = int(path.split(\"-\")[1])\n",
733
+ "\n",
734
+ " resume_global_step = global_step * grad_accum\n",
735
+ " first_epoch = resume_global_step // num_update_steps_per_epoch\n",
736
+ " resume_step = resume_global_step % num_update_steps_per_epoch\n"
737
+ ]
738
+ },
739
+ {
740
+ "cell_type": "code",
741
+ "execution_count": null,
742
+ "metadata": {},
743
+ "outputs": [],
744
+ "source": [
745
+ "if accelerator.is_main_process:\n",
746
+ " model.eval()\n",
747
+ " validation_loss = 0.0\n",
748
+ " num_batches = 0\n",
749
+ " num_samples = 0\n",
750
+ " with torch.no_grad():\n",
751
+ " for val_step, val_batch in enumerate(validation_dataloader):\n",
752
+ " # Similar to training, process the validation batch\n",
753
+ " val_targets = val_batch['coarse_tokens'][:, 1:].contiguous()\n",
754
+ " val_coarse_inputs = val_batch['coarse_tokens'][:, :-1]\n",
755
+ " val_inputs = torch.cat([val_batch['semantic_tokens'], val_coarse_inputs], dim=1)\n",
756
+ "\n",
757
+ " # Forward pass for validation\n",
758
+ " val_logits = model(val_inputs, training=True)\n",
759
+ " val_coarse_logits = val_logits[:, val_batch['semantic_tokens'].size(1):].contiguous()\n",
760
+ "\n",
761
+ " # Calculate the validation loss\n",
762
+ " val_loss = criterion(val_coarse_logits.view(-1, model.config.output_vocab_size), val_targets.view(-1))\n",
763
+ " validation_loss += val_loss.item()\n",
764
+ " num_batches += 1\n",
765
+ " num_samples += val_batch['semantic_tokens'].size(0)\n",
766
+ "\n",
767
+ " average_validation_loss = validation_loss / num_batches\n",
768
+ " logger.info(f\"Validation Loss: {average_validation_loss} over {num_samples} samples and {num_batches} batches.\")\n",
769
+ " print(f\"Validation Loss: {average_validation_loss} over {num_samples} samples and {num_batches} batches.\")"
770
+ ]
771
+ },
772
+ {
773
+ "attachments": {},
774
+ "cell_type": "markdown",
775
+ "metadata": {},
776
+ "source": [
777
+ "# Training"
778
+ ]
779
+ },
780
+ {
781
+ "cell_type": "code",
782
+ "execution_count": null,
783
+ "metadata": {},
784
+ "outputs": [],
785
+ "source": [
786
+ "# Only show the progress bar once on each machine.\n",
787
+ "progress_bar = tqdm(range(global_step, max_train_steps), disable=not accelerator.is_local_main_process)\n",
788
+ "progress_bar.set_description(\"Steps\")\n",
789
+ "\n",
790
+ "for epoch in range(first_epoch, num_train_epochs):\n",
791
+ " model.train()\n",
792
+ " for step, batch in enumerate(train_dataloader):\n",
793
+ " # Skip steps until we reach the resumed step\n",
794
+ " if resume_from_checkpoint and epoch == first_epoch and step < resume_step:\n",
795
+ " if step % grad_accum == 0:\n",
796
+ " progress_bar.update(1)\n",
797
+ " continue\n",
798
+ "\n",
799
+ " with accelerator.accumulate(model):\n",
800
+ " targets = batch['coarse_tokens'][:, 1:].contiguous()\n",
801
+ " \n",
802
+ " # Remove the last coarse token from the inputs since there is no target for it.\n",
803
+ " coarse_inputs = batch['coarse_tokens'][:, :-1]\n",
804
+ "\n",
805
+ " # Combine the semantic tokens and coarse tokens and feed them into the model.\n",
806
+ " inputs = torch.cat([batch['semantic_tokens'], coarse_inputs], dim=1)\n",
807
+ " logits = model(inputs, training=True)\n",
808
+ "\n",
809
+ " # We're only interested in the logits for the coarse tokens, so we ignore the logits for the input text tokens.\n",
810
+ " coarse_logits = logits[:, batch['semantic_tokens'].size(1):].contiguous()\n",
811
+ "\n",
812
+ " # Compute the loss.\n",
813
+ " loss = criterion(coarse_logits.view(-1, model.config.output_vocab_size), targets.view(-1))\n",
814
+ "\n",
815
+ " if semantic_cross_entropy_loss_weight > 0 and semantic_cross_entropy_loss_weight is not None:\n",
816
+ " semantic_logits = logits[:, :batch['semantic_tokens'].size(1)].contiguous()\n",
817
+ " semantic_loss = criterion(\n",
818
+ " semantic_logits.view(-1, model.config.input_vocab_size),\n",
819
+ " batch['semantic_tokens'].view(-1),\n",
820
+ " )\n",
821
+ " num_semantic_logits = semantic_logits.size(1)\n",
822
+ " num_coarse_logits = coarse_logits.size(1)\n",
823
+ " loss = (\n",
824
+ " semantic_loss * num_semantic_logits * semantic_cross_entropy_loss_weight +\n",
825
+ " loss * num_coarse_logits\n",
826
+ " ) / (num_semantic_logits + num_coarse_logits)\n",
827
+ "\n",
828
+ " accelerator.backward(loss)\n",
829
+ " if accelerator.sync_gradients:\n",
830
+ " params_to_clip = (\n",
831
+ " param for param in model.parameters() if param.requires_grad\n",
832
+ " )\n",
833
+ " accelerator.clip_grad_norm_(params_to_clip, max_grad_norm)\n",
834
+ " optimizer.step()\n",
835
+ " lr_scheduler.step()\n",
836
+ " optimizer.zero_grad()\n",
837
+ "\n",
838
+ " # Checks if the accelerator has performed an optimization step behind the scenes\n",
839
+ " if accelerator.sync_gradients:\n",
840
+ " progress_bar.update(1)\n",
841
+ " global_step += 1\n",
842
+ "\n",
843
+ " if global_step % checkpointing_steps == 0:\n",
844
+ " if accelerator.is_main_process:\n",
845
+ " save_path = os.path.join(output_dir, f\"checkpoint-{global_step}\")\n",
846
+ " accelerator.save_state(save_path)\n",
847
+ " logger.info(f\"Saved state to {save_path}\")\n",
848
+ "\n",
849
+ " logs = {\"loss\": loss.detach().item(), \"lr\": lr_scheduler.get_last_lr()[0]}\n",
850
+ " progress_bar.set_postfix(**logs)\n",
851
+ " accelerator.log(logs, step=global_step)\n",
852
+ "\n",
853
+ " if global_step >= max_train_steps:\n",
854
+ " break\n",
855
+ " \n",
856
+ " accelerator.wait_for_everyone()\n",
857
+ "\n",
858
+ "if accelerator.is_main_process:\n",
859
+ " if lora_dim > 0:\n",
860
+ " model = convert_lora_to_linear_layer(model)\n",
861
+ " # save model\n",
862
+ " accelerator.save(model.state_dict(), os.path.join(output_dir, \"pytorch_model.bin\"))\n",
863
+ " \n",
864
+ " config = model.config.__dict__\n",
865
+ " # save config\n",
866
+ " with open(os.path.join(output_dir, \"config.json\"), \"w\") as f:\n",
867
+ " json.dump(config, f, indent=2)\n",
868
+ "\n",
869
+ "accelerator.end_training()"
870
+ ]
871
+ },
872
+ {
873
+ "attachments": {},
874
+ "cell_type": "markdown",
875
+ "metadata": {},
876
+ "source": [
877
+ "# Validation"
878
+ ]
879
+ },
880
+ {
881
+ "cell_type": "code",
882
+ "execution_count": null,
883
+ "metadata": {},
884
+ "outputs": [],
885
+ "source": [
886
+ "if accelerator.is_main_process:\n",
887
+ " model.eval()\n",
888
+ " validation_loss = 0.0\n",
889
+ " num_batches = 0\n",
890
+ " num_samples = 0\n",
891
+ " with torch.no_grad():\n",
892
+ " for val_step, val_batch in enumerate(validation_dataloader):\n",
893
+ " # Similar to training, process the validation batch\n",
894
+ " val_targets = val_batch['coarse_tokens'][:, 1:].contiguous()\n",
895
+ " val_coarse_inputs = val_batch['coarse_tokens'][:, :-1]\n",
896
+ " val_inputs = torch.cat([val_batch['semantic_tokens'], val_coarse_inputs], dim=1)\n",
897
+ "\n",
898
+ " # Forward pass for validation\n",
899
+ " val_logits = model(val_inputs, training=True)\n",
900
+ " val_coarse_logits = val_logits[:, val_batch['semantic_tokens'].size(1):].contiguous()\n",
901
+ "\n",
902
+ " # Calculate the validation loss\n",
903
+ " val_loss = criterion(val_coarse_logits.view(-1, model.config.output_vocab_size), val_targets.view(-1))\n",
904
+ " validation_loss += val_loss.item()\n",
905
+ " num_batches += 1\n",
906
+ " num_samples += val_batch['semantic_tokens'].size(0)\n",
907
+ "\n",
908
+ " average_validation_loss = validation_loss / num_batches\n",
909
+ " logger.info(f\"Validation Loss: {average_validation_loss} over {num_samples} samples and {num_batches} batches.\")\n",
910
+ " print(f\"Validation Loss: {average_validation_loss} over {num_samples} samples and {num_batches} batches.\")"
911
+ ]
912
+ }
913
+ ],
914
+ "metadata": {
915
+ "kernelspec": {
916
+ "display_name": "Python 3",
917
+ "language": "python",
918
+ "name": "python3"
919
+ },
920
+ "language_info": {
921
+ "codemirror_mode": {
922
+ "name": "ipython",
923
+ "version": 3
924
+ },
925
+ "file_extension": ".py",
926
+ "mimetype": "text/x-python",
927
+ "name": "python",
928
+ "nbconvert_exporter": "python",
929
+ "pygments_lexer": "ipython3",
930
+ "version": "3.10.8"
931
+ },
932
+ "orig_nbformat": 4
933
+ },
934
+ "nbformat": 4,
935
+ "nbformat_minor": 2
936
+ }
train_fine.ipynb ADDED
@@ -0,0 +1,919 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Imports"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "metadata": {},
15
+ "outputs": [],
16
+ "source": [
17
+ "import torch\n",
18
+ "import torch.nn as nn\n",
19
+ "import os\n",
20
+ "import re\n",
21
+ "import gc\n",
22
+ "import json\n",
23
+ "import math\n",
24
+ "import hashlib\n",
25
+ "import numpy as np\n",
26
+ "import logging\n",
27
+ "import torchaudio\n",
28
+ "from tqdm.auto import tqdm\n",
29
+ "import torch.nn.functional as F\n",
30
+ "from encodec.utils import convert_audio\n",
31
+ "from accelerate import Accelerator\n",
32
+ "from accelerate.utils import set_seed\n",
33
+ "from transformers import BertTokenizer\n",
34
+ "from huggingface_hub import hf_hub_download\n",
35
+ "from packaging import version\n",
36
+ "from diffusers.optimization import get_scheduler\n",
37
+ "\n",
38
+ "from utils.bitsandbytes import BitsAndBytesConfig, importlib_metadata, get_keys_to_not_convert, replace_with_bnb_linear, set_module_quantized_tensor_to_device\n",
39
+ "from utils.lora import convert_linear_layer_to_lora, only_optimize_lora_parameters, convert_lora_to_linear_layer\n",
40
+ "from bark.model import GPTConfig, GPT\n",
41
+ "from bark.model_fine import FineGPT, FineGPTConfig"
42
+ ]
43
+ },
44
+ {
45
+ "attachments": {},
46
+ "cell_type": "markdown",
47
+ "metadata": {},
48
+ "source": [
49
+ "# Training Args"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": null,
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "train_batch_size = 8\n",
59
+ "eval_batch_size = 8\n",
60
+ "grad_accum = 2\n",
61
+ "ckpt_path = 'models/fine_2.pt'\n",
62
+ "model_type = \"fine\"\n",
63
+ "dataset_path = 'datasets/joe_biden_state_of_union/'\n",
64
+ "logging_dir = 'logs/'\n",
65
+ "log_with = 'wandb'\n",
66
+ "hubert_path = 'data/models/hubert/hubert.pt'\n",
67
+ "hubert_tokenizer_path = 'data/models/hubert/tokenizer.pth'\n",
68
+ "\n",
69
+ "output_dir = 'fine_output/'\n",
70
+ "resume_from_checkpoint = None\n",
71
+ "\n",
72
+ "checkpointing_steps = 1000\n",
73
+ "\n",
74
+ "mixed_precision = 'bf16'\n",
75
+ "bits = 16 #4 4 and 8 bit are a work in progress\n",
76
+ "compute_dtype = torch.bfloat16\n",
77
+ "double_quant = True\n",
78
+ "quant_type = 'nf4'\n",
79
+ "\n",
80
+ "lora_dim = 64\n",
81
+ "lora_scaling = 1\n",
82
+ "lora_dropout = 0.1\n",
83
+ "lora_module_name = 'transformer.h'\n",
84
+ "optimize_lora_params_only = False\n",
85
+ "\n",
86
+ "learning_rate = 1e-4\n",
87
+ "scale_lr = False\n",
88
+ "use_8bit_adam = False\n",
89
+ "adam_beta1 = 0.9\n",
90
+ "adam_beta2 = 0.999\n",
91
+ "adam_epsilon = 1e-8\n",
92
+ "weight_decay = 0.01\n",
93
+ "\n",
94
+ "llm_int8_skip_modules = None\n",
95
+ "keep_in_fp32_modules = ['lm_head']\n",
96
+ "\n",
97
+ "lr_scheduler_type = 'linear'\n",
98
+ "lr_warmup_steps = 60\n",
99
+ "num_train_epochs = 5\n",
100
+ "max_train_steps = None\n",
101
+ "max_grad_norm = 1.0\n",
102
+ "\n",
103
+ "seed = 741"
104
+ ]
105
+ },
106
+ {
107
+ "attachments": {},
108
+ "cell_type": "markdown",
109
+ "metadata": {},
110
+ "source": [
111
+ "# Define Functions"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": null,
117
+ "metadata": {},
118
+ "outputs": [],
119
+ "source": [
120
+ "CONTEXT_WINDOW_SIZE = 1024\n",
121
+ "\n",
122
+ "MAX_SEMANTIC_LEN = 256\n",
123
+ "\n",
124
+ "SEMANTIC_RATE_HZ = 49.9\n",
125
+ "SEMANTIC_VOCAB_SIZE = 10_000\n",
126
+ "\n",
127
+ "TEXT_ENCODING_OFFSET = 10_048\n",
128
+ "SEMANTIC_PAD_TOKEN = 10_000\n",
129
+ "TEXT_PAD_TOKEN = 129_595\n",
130
+ "SEMANTIC_INFER_TOKEN = 129_599\n",
131
+ "\n",
132
+ "MAX_COARSE_LEN = 768\n",
133
+ "\n",
134
+ "SAMPLE_RATE = 24_000\n",
135
+ "CHANNELS = 1\n",
136
+ "\n",
137
+ "COARSE_SEMANTIC_PAD_TOKEN = 12_048\n",
138
+ "COARSE_INFER_TOKEN = 12_050\n",
139
+ "\n",
140
+ "CODEBOOK_SIZE = 1024\n",
141
+ "N_COARSE_CODEBOOKS = 2\n",
142
+ "N_FINE_CODEBOOKS = 8\n",
143
+ "COARSE_RATE_HZ = 75\n",
144
+ "\n",
145
+ "logger = logging.getLogger(__name__)\n",
146
+ "\n",
147
+ "\n",
148
+ "USE_SMALL_MODELS = os.environ.get(\"SERP_USE_SMALL_MODELS\", False)\n",
149
+ "\n",
150
+ "default_cache_dir = os.path.join(os.path.expanduser(\"~\"), \".cache\")\n",
151
+ "CACHE_DIR = os.path.join(os.getenv(\"XDG_CACHE_HOME\", default_cache_dir), \"serp\", \"bark_v0\")\n",
152
+ "\n",
153
+ "\n",
154
+ "def _clear_cuda_cache():\n",
155
+ " if torch.cuda.is_available():\n",
156
+ " torch.cuda.empty_cache()\n",
157
+ " torch.cuda.synchronize()\n",
158
+ "\n",
159
+ "\n",
160
+ "def _md5(fname):\n",
161
+ " hash_md5 = hashlib.md5()\n",
162
+ " with open(fname, \"rb\") as f:\n",
163
+ " for chunk in iter(lambda: f.read(4096), b\"\"):\n",
164
+ " hash_md5.update(chunk)\n",
165
+ " return hash_md5.hexdigest()\n",
166
+ "\n",
167
+ "\n",
168
+ "def _download(from_hf_path, file_name, to_local_path):\n",
169
+ " to_local_path = to_local_path.replace(\"\\\\\", \"/\")\n",
170
+ " path = '/'.join(to_local_path.split(\"/\")[:-1])\n",
171
+ " os.makedirs(path, exist_ok=True)\n",
172
+ " hf_hub_download(repo_id=from_hf_path, filename=file_name, local_dir=path)\n",
173
+ " os.replace(os.path.join(path, file_name), to_local_path)\n",
174
+ "\n",
175
+ "\n",
176
+ "def _tokenize(tokenizer, text):\n",
177
+ " return tokenizer.encode(text, add_special_tokens=False)\n",
178
+ "\n",
179
+ "\n",
180
+ "def _detokenize(tokenizer, enc_text):\n",
181
+ " return tokenizer.decode(enc_text)\n",
182
+ "\n",
183
+ "\n",
184
+ "def _normalize_whitespace(text):\n",
185
+ " return re.sub(r\"\\s+\", \" \", text).strip()\n",
186
+ "\n",
187
+ "\n",
188
+ "REMOTE_MODEL_PATHS = {\n",
189
+ " \"text_small\": {\n",
190
+ " \"repo_id\": \"suno/bark\",\n",
191
+ " \"file_name\": \"text.pt\",\n",
192
+ " \"checksum\": \"b3e42bcbab23b688355cd44128c4cdd3\",\n",
193
+ " },\n",
194
+ " \"coarse_small\": {\n",
195
+ " \"repo_id\": \"suno/bark\",\n",
196
+ " \"file_name\": \"coarse.pt\",\n",
197
+ " \"checksum\": \"5fe964825e3b0321f9d5f3857b89194d\",\n",
198
+ " },\n",
199
+ " \"fine_small\": {\n",
200
+ " \"repo_id\": \"suno/bark\",\n",
201
+ " \"file_name\": \"fine.pt\",\n",
202
+ " \"checksum\": \"5428d1befe05be2ba32195496e58dc90\",\n",
203
+ " },\n",
204
+ " \"text\": {\n",
205
+ " \"repo_id\": \"suno/bark\",\n",
206
+ " \"file_name\": \"text_2.pt\",\n",
207
+ " \"checksum\": \"54afa89d65e318d4f5f80e8e8799026a\",\n",
208
+ " },\n",
209
+ " \"coarse\": {\n",
210
+ " \"repo_id\": \"suno/bark\",\n",
211
+ " \"file_name\": \"coarse_2.pt\",\n",
212
+ " \"checksum\": \"8a98094e5e3a255a5c9c0ab7efe8fd28\",\n",
213
+ " },\n",
214
+ " \"fine\": {\n",
215
+ " \"repo_id\": \"suno/bark\",\n",
216
+ " \"file_name\": \"fine_2.pt\",\n",
217
+ " \"checksum\": \"59d184ed44e3650774a2f0503a48a97b\",\n",
218
+ " },\n",
219
+ "}\n",
220
+ "\n",
221
+ "\n",
222
+ "def _load_model(ckpt_path, device, use_small=False, model_type=\"text\"):\n",
223
+ " if model_type == \"text\":\n",
224
+ " ConfigClass = GPTConfig\n",
225
+ " ModelClass = GPT\n",
226
+ " elif model_type == \"coarse\":\n",
227
+ " ConfigClass = GPTConfig\n",
228
+ " ModelClass = GPT\n",
229
+ " elif model_type == \"fine\":\n",
230
+ " ConfigClass = FineGPTConfig\n",
231
+ " ModelClass = FineGPT\n",
232
+ " else:\n",
233
+ " raise NotImplementedError()\n",
234
+ " model_key = f\"{model_type}_small\" if use_small or USE_SMALL_MODELS else model_type\n",
235
+ " model_info = REMOTE_MODEL_PATHS[model_key]\n",
236
+ " if ckpt_path in [None, '']:\n",
237
+ " ckpt_path = os.path.join(CACHE_DIR, model_info[\"file_name\"])\n",
238
+ " if not os.path.exists(ckpt_path):\n",
239
+ " logger.info(f\"{model_type} model not found, downloading into `{CACHE_DIR}`.\")\n",
240
+ " _download(model_info[\"repo_id\"], model_info[\"file_name\"], ckpt_path)\n",
241
+ " checkpoint = torch.load(ckpt_path, map_location=device)\n",
242
+ " # this is a hack\n",
243
+ " model_args = checkpoint[\"model_args\"]\n",
244
+ " if \"input_vocab_size\" not in model_args:\n",
245
+ " model_args[\"input_vocab_size\"] = model_args[\"vocab_size\"]\n",
246
+ " model_args[\"output_vocab_size\"] = model_args[\"vocab_size\"]\n",
247
+ " del model_args[\"vocab_size\"]\n",
248
+ " gptconf = ConfigClass(**checkpoint[\"model_args\"])\n",
249
+ " model = ModelClass(gptconf)\n",
250
+ " state_dict = checkpoint[\"model\"]\n",
251
+ " # fixup checkpoint\n",
252
+ " unwanted_prefix = \"_orig_mod.\"\n",
253
+ " for k, v in list(state_dict.items()):\n",
254
+ " if k.startswith(unwanted_prefix):\n",
255
+ " state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)\n",
256
+ " extra_keys = set(state_dict.keys()) - set(model.state_dict().keys())\n",
257
+ " extra_keys = set([k for k in extra_keys if not k.endswith(\".attn.bias\")])\n",
258
+ " missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())\n",
259
+ " missing_keys = set([k for k in missing_keys if not k.endswith(\".attn.bias\")])\n",
260
+ " if len(extra_keys) != 0:\n",
261
+ " raise ValueError(f\"extra keys found: {extra_keys}\")\n",
262
+ " if len(missing_keys) != 0:\n",
263
+ " raise ValueError(f\"missing keys: {missing_keys}\")\n",
264
+ " model.load_state_dict(state_dict, strict=False)\n",
265
+ " n_params = model.get_num_params()\n",
266
+ " val_loss = checkpoint[\"best_val_loss\"].item()\n",
267
+ " print(f\"Loaded {model_type} model with {n_params} params, val_loss={val_loss:.4f}.\")\n",
268
+ " del checkpoint, state_dict\n",
269
+ " _clear_cuda_cache()\n",
270
+ " if model_type == \"text\":\n",
271
+ " tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\")\n",
272
+ " return model, tokenizer\n",
273
+ " return model\n",
274
+ "\n",
275
+ "\n",
276
+ "def _flatten_codebooks(arr, offset_size=CODEBOOK_SIZE):\n",
277
+ " assert len(arr.shape) == 2\n",
278
+ " arr = arr.copy()\n",
279
+ " if offset_size is not None:\n",
280
+ " for n in range(1, arr.shape[0]):\n",
281
+ " arr[n, :] += offset_size * n\n",
282
+ " flat_arr = arr.ravel(\"F\")\n",
283
+ " return flat_arr\n",
284
+ "\n",
285
+ "\n",
286
+ "def load_filepaths_and_text(filename, split=\"|\"):\n",
287
+ " with open(filename, encoding='utf-8', errors='ignore') as f:\n",
288
+ " filepaths_and_text = [line.strip().split(split) for line in f]\n",
289
+ " base = os.path.dirname(filename)\n",
290
+ " for j in range(len(filepaths_and_text)):\n",
291
+ " filepaths_and_text[j][0] = os.path.join(base, filepaths_and_text[j][0])\n",
292
+ " return filepaths_and_text\n",
293
+ "\n",
294
+ "\n",
295
+ "class TtsDataset(torch.utils.data.Dataset):\n",
296
+ " def __init__(self, opt):\n",
297
+ " self.path = os.path.dirname(opt['path'])\n",
298
+ " self.mode = opt['mode']\n",
299
+ " self.audiopaths_and_text = load_filepaths_and_text(os.path.join(opt['path'] , opt['mode'] + '.txt'))\n",
300
+ "\n",
301
+ " def __getitem__(self, index):\n",
302
+ " audiopath_and_text = self.audiopaths_and_text[index]\n",
303
+ " audiopath = audiopath_and_text[0]\n",
304
+ "\n",
305
+ " tokens = np.load(audiopath.replace('.wav', '.npz').replace('wavs', 'tokens'))\n",
306
+ " fine_tokens = tokens['fine']\n",
307
+ "\n",
308
+ " return torch.from_numpy(fine_tokens)\n",
309
+ "\n",
310
+ " def __len__(self):\n",
311
+ " return len(self.audiopaths_and_text)\n",
312
+ "\n",
313
+ "\n",
314
+ "class TtsCollater():\n",
315
+ " def __init__(self):\n",
316
+ " pass\n",
317
+ " def __call__(self, batch):\n",
318
+ " max_len = 1024\n",
319
+ " fine_tokens = []\n",
320
+ "\n",
321
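+ " # Randomly crop each example to at most 1024 frames, pad shorter ones with\n",
+ " # CODEBOOK_SIZE, then transpose to (time, codebooks) for the fine model.\n",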
+ " for fine_tokens_ in batch:\n",
322
+ " if fine_tokens_.shape[1] > max_len:\n",
323
+ " start_idx = np.random.randint(0, fine_tokens_.shape[1] - max_len + 1)\n",
324
+ " fine_tokens_ = fine_tokens_[:, start_idx : start_idx + max_len]\n",
325
+ "\n",
326
+ " pad_size = max_len - fine_tokens_.shape[1]\n",
327
+ " fine_tokens_ = F.pad(fine_tokens_, (0, pad_size), value=CODEBOOK_SIZE)\n",
328
+ "\n",
329
+ " fine_tokens_ = fine_tokens_.T\n",
330
+ "\n",
331
+ " fine_tokens.append(fine_tokens_)\n",
332
+ "\n",
333
+ " return {'fine_tokens': torch.stack(fine_tokens).contiguous()}\n",
334
+ " \n",
335
+ "\n",
336
+ "accelerator = Accelerator(\n",
337
+ " gradient_accumulation_steps=grad_accum,\n",
338
+ " mixed_precision=mixed_precision,\n",
339
+ " log_with=log_with,\n",
340
+ " logging_dir=logging_dir,\n",
341
+ ")\n",
342
+ "device = accelerator.device\n",
343
+ "\n",
344
+ "os.makedirs(output_dir, exist_ok=True)\n",
345
+ "\n",
346
+ "set_seed(seed)"
347
+ ]
348
+ },
349
+ {
350
+ "attachments": {},
351
+ "cell_type": "markdown",
352
+ "metadata": {},
353
+ "source": [
354
+ "# Setup Dataset (only need to do this once)"
355
+ ]
356
+ },
357
+ {
358
+ "cell_type": "code",
359
+ "execution_count": null,
360
+ "metadata": {},
361
+ "outputs": [],
362
+ "source": [
363
+ "# max_duration_sec = 15.12 # the maximum allowed duration in seconds\n",
364
+ "\n",
365
+ "# path = dataset_path\n",
366
+ "\n",
367
+ "# # From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer\n",
368
+ "# from hubert.hubert_manager import HuBERTManager\n",
369
+ "# hubert_manager = HuBERTManager()\n",
370
+ "# from hubert.pre_kmeans_hubert import CustomHubert\n",
371
+ "# from hubert.customtokenizer import CustomTokenizer\n",
372
+ "# hubert_manager.make_sure_hubert_installed()\n",
373
+ "# hubert_manager.make_sure_tokenizer_installed()\n",
374
+ "\n",
375
+ "# # Load the HuBERT model\n",
376
+ "# hubert_model = CustomHubert(checkpoint_path=hubert_path).to(device)\n",
377
+ "# hubert_model.eval()\n",
378
+ "# for param in hubert_model.parameters():\n",
379
+ "# param.requires_grad = False\n",
380
+ "\n",
381
+ "# # Load the CustomTokenizer model\n",
382
+ "# hubert_tokenizer = CustomTokenizer.load_from_checkpoint(hubert_tokenizer_path).to(device) # Automatically uses the right layers\n",
383
+ "\n",
384
+ "# from bark.generation import load_codec_model\n",
385
+ "# codec_model = load_codec_model(use_gpu=True)\n",
386
+ "# codec_model.eval()\n",
387
+ "# for param in codec_model.parameters():\n",
388
+ "# param.requires_grad = False\n",
389
+ "\n",
390
+ "\n",
391
+ "# def get_duration(wav, sr):\n",
392
+ "# return wav.shape[1] / sr\n",
393
+ "\n",
394
+ "# valid_lines_train = []\n",
395
+ "# # convert wavs to semantic tokens\n",
396
+ "# for wav_path, txt in load_filepaths_and_text(path + 'train.txt'):\n",
397
+ "# wav, sr = torchaudio.load(wav_path)\n",
398
+ "# if not get_duration(wav, sr) > max_duration_sec:\n",
399
+ "# valid_lines_train.append((wav_path, txt))\n",
400
+ "# wav = convert_audio(wav, sr, SAMPLE_RATE, CHANNELS).to(device)\n",
401
+ "\n",
402
+ "# semantic_vectors = hubert_model.forward(wav, input_sample_hz=SAMPLE_RATE)\n",
403
+ "# semantic_tokens = hubert_tokenizer.get_token(semantic_vectors)\n",
404
+ "\n",
405
+ "# # save semantic tokens\n",
406
+ "# os.makedirs(os.path.join(path, 'tokens'), exist_ok=True)\n",
407
+ "# semantic_tokens = semantic_tokens.cpu().numpy()\n",
408
+ "\n",
409
+ "# # Extract discrete codes from EnCodec\n",
410
+ "# with torch.no_grad():\n",
411
+ "# encoded_frames = codec_model.encode(wav.unsqueeze(0))\n",
412
+ "# codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]\n",
413
+ "\n",
414
+ "# # move codes to cpu\n",
415
+ "# codes = codes.cpu().numpy()\n",
416
+ "\n",
417
+ "# # save tokens\n",
418
+ "# np.savez_compressed(os.path.join(path, 'tokens', os.path.basename(wav_path).replace('.wav', '.npz')), fine=codes, coarse=codes[:2, :], semantic=semantic_tokens)\n",
419
+ "\n",
420
+ "# # rewrite train.txt with valid lines\n",
421
+ "# with open(path + 'train_valid.txt', 'w', encoding='utf-8') as f:\n",
422
+ "# for wav_path, txt in valid_lines_train:\n",
423
+ "# wav_path = os.path.relpath(wav_path, dataset_path).replace('\\\\', '/')\n",
424
+ "# f.write(f'{wav_path}|{txt}\\n')\n",
425
+ "\n",
426
+ "# valid_lines_valid = []\n",
427
+ "# for wav_path, txt in load_filepaths_and_text(path + 'valid.txt'):\n",
428
+ "# wav, sr = torchaudio.load(wav_path)\n",
429
+ "# if not get_duration(wav, sr) > max_duration_sec:\n",
430
+ "# valid_lines_valid.append((wav_path, txt))\n",
431
+ "# wav = convert_audio(wav, sr, SAMPLE_RATE, CHANNELS).to(device)\n",
432
+ "\n",
433
+ "# semantic_vectors = hubert_model.forward(wav, input_sample_hz=SAMPLE_RATE)\n",
434
+ "# semantic_tokens = hubert_tokenizer.get_token(semantic_vectors)\n",
435
+ "\n",
436
+ "# # save semantic tokens\n",
437
+ "# os.makedirs(os.path.join(path, 'tokens'), exist_ok=True)\n",
438
+ "# semantic_tokens = semantic_tokens.cpu().numpy()\n",
439
+ " \n",
440
+ "# # Extract discrete codes from EnCodec\n",
441
+ "# with torch.no_grad():\n",
442
+ "# encoded_frames = codec_model.encode(wav.unsqueeze(0))\n",
443
+ "# codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]\n",
444
+ "\n",
445
+ "# # move codes to cpu\n",
446
+ "# codes = codes.cpu().numpy()\n",
447
+ "\n",
448
+ "# # save tokens\n",
449
+ "# np.savez_compressed(os.path.join(path, 'tokens', os.path.basename(wav_path).replace('.wav', '.npz')), fine=codes, coarse=codes[:2, :], semantic=semantic_tokens)\n",
450
+ "\n",
451
+ "# # rewrite valid.txt with valid lines\n",
452
+ "# with open(path + 'valid_valid.txt', 'w', encoding='utf-8') as f:\n",
453
+ "# for wav_path, txt in valid_lines_valid:\n",
454
+ "# wav_path = os.path.relpath(wav_path, dataset_path).replace('\\\\', '/')\n",
455
+ "# f.write(f'{wav_path}|{txt}\\n')\n",
456
+ "\n",
457
+ "# del hubert_model\n",
458
+ "# del hubert_tokenizer\n",
459
+ "# del codec_model\n",
460
+ "# gc.collect()\n",
461
+ "# torch.cuda.empty_cache()"
462
+ ]
463
+ },
464
+ {
465
+ "attachments": {},
466
+ "cell_type": "markdown",
467
+ "metadata": {},
468
+ "source": [
469
+ "# Setup"
470
+ ]
471
+ },
472
+ {
473
+ "cell_type": "code",
474
+ "execution_count": null,
475
+ "metadata": {},
476
+ "outputs": [],
477
+ "source": [
478
+ "model = _load_model(ckpt_path, device, use_small=False, model_type=model_type)"
479
+ ]
480
+ },
481
+ {
482
+ "cell_type": "code",
483
+ "execution_count": null,
484
+ "metadata": {},
485
+ "outputs": [],
486
+ "source": [
487
+ "if scale_lr:\n",
488
+ " learning_rate = (\n",
489
+ " learning_rate * grad_accum * train_batch_size * accelerator.num_processes\n",
490
+ " )\n",
491
+ "\n",
492
+ "if use_8bit_adam:\n",
493
+ " try:\n",
494
+ " import bitsandbytes as bnb\n",
495
+ " except ImportError:\n",
496
+ " raise ImportError(\n",
497
+ " \"To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`.\"\n",
498
+ " )\n",
499
+ "\n",
500
+ " optimizer_class = bnb.optim.AdamW8bit\n",
501
+ "else:\n",
502
+ " optimizer_class = torch.optim.AdamW"
503
+ ]
504
+ },
505
+ {
506
+ "cell_type": "code",
507
+ "execution_count": null,
508
+ "metadata": {},
509
+ "outputs": [],
510
+ "source": [
511
+ "quantization_config=BitsAndBytesConfig(\n",
512
+ " load_in_4bit=bits == 4,\n",
513
+ " load_in_8bit=bits == 8,\n",
514
+ " llm_int8_threshold=6.0,\n",
515
+ " llm_int8_has_fp16_weight=False,\n",
516
+ " bnb_4bit_compute_dtype=compute_dtype,\n",
517
+ " bnb_4bit_use_double_quant=double_quant,\n",
518
+ " bnb_4bit_quant_type=quant_type # {'fp4', 'nf4'}\n",
519
+ ")\n",
520
+ "\n",
521
+ "# if quantization_config.load_in_8bit or quantization_config.load_in_4bit:\n",
522
+ "# if quantization_config.load_in_8bit:\n",
523
+ "# logger.info(\"Detected 8-bit loading: activating 8-bit loading for this model\")\n",
524
+ "# elif quantization_config.load_in_4bit:\n",
525
+ "# logger.info(\"Detected 4-bit loading: activating 4-bit loading for this model\")\n",
526
+ "\n",
527
+ "# # We keep some modules such as the lm_head in their original dtype for numerical stability reasons\n",
528
+ "# if llm_int8_skip_modules is None or len(llm_int8_skip_modules) == 0:\n",
529
+ "# modules_to_not_convert = [] # get_keys_to_not_convert(model)\n",
530
+ "# else:\n",
531
+ "# modules_to_not_convert = llm_int8_skip_modules\n",
532
+ "\n",
533
+ "# if not isinstance(modules_to_not_convert, list):\n",
534
+ "# modules_to_not_convert = [modules_to_not_convert]\n",
535
+ "\n",
536
+ "# modules_to_not_convert.extend(keep_in_fp32_modules)\n",
537
+ "\n",
538
+ "# supports_4bit = version.parse(importlib_metadata.version(\"bitsandbytes\")) >= version.parse(\"0.39.0\")\n",
539
+ "\n",
540
+ "# if quantization_config.load_in_4bit and not supports_4bit:\n",
541
+ "# raise ValueError(\n",
542
+ "# \"You have a version of `bitsandbytes` that is not compatible with 4bit inference and training\"\n",
543
+ "# \" make sure you have the latest version of `bitsandbytes` installed\"\n",
544
+ "# )\n",
545
+ " \n",
546
+ "# if len(modules_to_not_convert) == 0:\n",
547
+ "# modules_to_not_convert = None\n",
548
+ "\n",
549
+ "# model = replace_with_bnb_linear(\n",
550
+ "# model, modules_to_not_convert=modules_to_not_convert, quantization_config=quantization_config\n",
551
+ "# )\n",
552
+ "\n",
553
+ "# # training in 8-bit is only available in 0.37.0+\n",
554
+ "# model._is_kbit_training_enabled = version.parse(\n",
555
+ "# importlib_metadata.version(\"bitsandbytes\")\n",
556
+ "# ) >= version.parse(\"0.37.0\")\n",
557
+ "\n",
558
+ "# model.config.quantization_config = quantization_config"
559
+ ]
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "execution_count": null,
564
+ "metadata": {},
565
+ "outputs": [],
566
+ "source": [
567
+ "if bits == 4:\n",
568
+ " from accelerate.utils import CustomDtype\n",
569
+ " target_dtype = CustomDtype.INT4\n",
570
+ "elif bits == 8:\n",
571
+ " target_dtype = torch.int8\n",
572
+ "\n",
573
+ "if lora_dim > 0:\n",
574
+ " for param in model.parameters():\n",
575
+ " if param.ndim == 1:\n",
576
+ " # cast the small parameters (e.g. layernorm) to fp32 for stability\n",
577
+ " param.data = param.data.to(torch.float32)\n",
578
+ " \n",
579
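+ " # Cast each lm_head's logits back to fp32 so the loss is computed in full precision.\n",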
+ " class CastOutputToFloat(nn.Sequential):\n",
580
+ " def forward(self, x):\n",
581
+ " return super().forward(x).to(torch.float32)\n",
582
+ "\n",
583
+ " # model.lm_head = CastOutputToFloat(model.lm_head)\n",
584
+ " for i, lm_head in enumerate(model.lm_heads):\n",
585
+ " model.lm_heads[i] = CastOutputToFloat(lm_head)\n",
586
+ "\n",
587
+ " model = convert_linear_layer_to_lora(model, lora_module_name,\n",
588
+ " lora_dim=lora_dim, lora_scaling=lora_scaling,\n",
589
+ " lora_dropout=lora_dropout)\n",
590
+ " if optimize_lora_params_only:\n",
591
+ " model = only_optimize_lora_parameters(model)"
592
+ ]
593
+ },
594
+ {
595
+ "cell_type": "code",
596
+ "execution_count": null,
597
+ "metadata": {},
598
+ "outputs": [],
599
+ "source": [
600
+ "params_to_optimize = (\n",
601
+ " param for param in model.parameters() if param.requires_grad\n",
602
+ " )\n",
603
+ "\n",
604
+ "optimizer = optimizer_class(\n",
605
+ " params_to_optimize,\n",
606
+ " lr=learning_rate,\n",
607
+ " betas=(adam_beta1, adam_beta2),\n",
608
+ " weight_decay=weight_decay,\n",
609
+ " eps=adam_epsilon,\n",
610
+ ")"
611
+ ]
612
+ },
613
+ {
614
+ "cell_type": "code",
615
+ "execution_count": null,
616
+ "metadata": {},
617
+ "outputs": [],
618
+ "source": [
619
+ "opt_train = {\n",
620
+ " 'path': dataset_path,\n",
621
+ " 'mode': 'train',\n",
622
+ "}\n",
623
+ "\n",
624
+ "opt_val = {\n",
625
+ " 'path': dataset_path,\n",
626
+ " 'mode': 'valid',\n",
627
+ "}\n",
628
+ "\n",
629
+ "train_dataset = TtsDataset(opt_train)\n",
630
+ "validation_dataset = TtsDataset(opt_val)\n",
631
+ "\n",
632
+ "train_dataloader = torch.utils.data.DataLoader(\n",
633
+ " train_dataset,\n",
634
+ " batch_size=train_batch_size,\n",
635
+ " collate_fn=TtsCollater(),\n",
636
+ ")\n",
637
+ "\n",
638
+ "validation_dataloader = torch.utils.data.DataLoader(\n",
639
+ " validation_dataset,\n",
640
+ " batch_size=eval_batch_size,\n",
641
+ " collate_fn=TtsCollater(),\n",
642
+ ")\n",
643
+ "\n",
644
+ "criterion = torch.nn.CrossEntropyLoss(ignore_index=COARSE_SEMANTIC_PAD_TOKEN)\n",
645
+ "\n",
646
+ "# Scheduler and math around the number of training steps.\n",
647
+ "overrode_max_train_steps = False\n",
648
+ "num_update_steps_per_epoch = math.ceil(len(train_dataloader) / grad_accum)\n",
649
+ "if max_train_steps is None:\n",
650
+ " max_train_steps = num_train_epochs * num_update_steps_per_epoch\n",
651
+ " overrode_max_train_steps = True\n",
652
+ "\n",
653
+ "lr_scheduler = get_scheduler(\n",
654
+ " lr_scheduler_type,\n",
655
+ " optimizer=optimizer,\n",
656
+ " num_warmup_steps=lr_warmup_steps * grad_accum,\n",
657
+ " num_training_steps=max_train_steps * grad_accum,\n",
658
+ ")"
659
+ ]
660
+ },
661
+ {
662
+ "cell_type": "code",
663
+ "execution_count": null,
664
+ "metadata": {},
665
+ "outputs": [],
666
+ "source": [
667
+ "model, optimizer, train_dataloader, validation_dataloader, lr_scheduler = accelerator.prepare(\n",
668
+ " model, optimizer, train_dataloader, validation_dataloader, lr_scheduler\n",
669
+ ")\n",
670
+ "accelerator.register_for_checkpointing(lr_scheduler)\n",
671
+ "\n",
672
+ "weight_dtype = torch.float32\n",
673
+ "if accelerator.mixed_precision == \"fp16\":\n",
674
+ " weight_dtype = torch.float16\n",
675
+ "elif accelerator.mixed_precision == \"bf16\":\n",
676
+ " weight_dtype = torch.bfloat16"
677
+ ]
678
+ },
679
+ {
680
+ "cell_type": "code",
681
+ "execution_count": null,
682
+ "metadata": {},
683
+ "outputs": [],
684
+ "source": [
685
+ "# We need to recalculate our total training steps as the size of the training dataloader may have changed.\n",
686
+ "num_update_steps_per_epoch = math.ceil(len(train_dataloader) / grad_accum)\n",
687
+ "if overrode_max_train_steps:\n",
688
+ " max_train_steps = num_train_epochs * num_update_steps_per_epoch\n",
689
+ "# Afterwards we recalculate our number of training epochs\n",
690
+ "num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)\n",
691
+ "\n",
692
+ "# We need to initialize the trackers we use, and also store our configuration.\n",
693
+ "# The trackers initializes automatically on the main process.\n",
694
+ "if accelerator.is_main_process:\n",
695
+ " accelerator.init_trackers(\"bark_coarse\", config={})\n",
696
+ "\n",
697
+ "total_batch_size = train_batch_size * accelerator.num_processes * grad_accum\n",
698
+ "logger.info(\"***** Running training *****\")\n",
699
+ "logger.info(f\" Num examples = {len(train_dataset)}\")\n",
700
+ "logger.info(f\" Num batches each epoch = {len(train_dataloader)}\")\n",
701
+ "logger.info(f\" Num Epochs = {num_train_epochs}\")\n",
702
+ "logger.info(f\" Instantaneous batch size per device = {train_batch_size}\")\n",
703
+ "logger.info(f\" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}\")\n",
704
+ "logger.info(f\" Gradient Accumulation steps = {grad_accum}\")\n",
705
+ "logger.info(f\" Total optimization steps = {max_train_steps}\")\n",
706
+ "global_step = 0\n",
707
+ "first_epoch = 0\n",
708
+ "\n",
709
+ "if resume_from_checkpoint:\n",
710
+ " if resume_from_checkpoint != \"latest\":\n",
711
+ " path = os.path.basename(resume_from_checkpoint)\n",
712
+ " else:\n",
713
+ " # Get the most recent checkpoint\n",
714
+ " dirs = os.listdir(output_dir)\n",
715
+ " dirs = [d for d in dirs if d.startswith(\"checkpoint\")]\n",
716
+ " dirs = sorted(dirs, key=lambda x: int(x.split(\"-\")[1]))\n",
717
+ " path = dirs[-1]\n",
718
+ " accelerator.print(f\"Resuming from checkpoint {path}\")\n",
719
+ " accelerator.load_state(os.path.join(output_dir, path))\n",
720
+ " global_step = int(path.split(\"-\")[1])\n",
721
+ "\n",
722
+ " resume_global_step = global_step * grad_accum\n",
723
+ " first_epoch = resume_global_step // num_update_steps_per_epoch\n",
724
+ " resume_step = resume_global_step % num_update_steps_per_epoch\n"
725
+ ]
726
+ },
727
+ {
728
+ "cell_type": "code",
729
+ "execution_count": null,
730
+ "metadata": {},
731
+ "outputs": [],
732
+ "source": [
733
+ "if accelerator.is_main_process:\n",
734
+ " model.eval()\n",
735
+ " validation_loss = 0.0\n",
736
+ " num_batches = 0\n",
737
+ " num_samples = 0\n",
738
+ " with torch.no_grad():\n",
739
+ " for val_step, val_batch in enumerate(validation_dataloader):\n",
740
+ " # Similar to training, process the validation batch\n",
741
+ " fine_targets_7 = val_batch['fine_tokens'][:, :, 6]\n",
742
+ " fine_tokens_input_7 = torch.cat([val_batch['fine_tokens'][:, :, :6], torch.zeros_like(val_batch['fine_tokens'][:, :, 6:])], dim=2)\n",
743
+ " fine_targets_8 = val_batch['fine_tokens'][:, :, 7]\n",
744
+ " fine_tokens_input_8 = torch.cat([val_batch['fine_tokens'][:, :, :7], torch.zeros_like(val_batch['fine_tokens'][:, :, 7:])], dim=2)\n",
745
+ "\n",
746
+ " # Forward pass for validation\n",
747
+ " logits_7 = model(6, fine_tokens_input_7)\n",
748
+ " logits_8 = model(7, fine_tokens_input_8)\n",
749
+ "\n",
750
+ " # Calculate the validation loss\n",
751
+ " loss_7 = criterion(logits_7.view(-1, model.config.output_vocab_size), fine_targets_7.view(-1))\n",
752
+ " loss_8 = criterion(logits_8.view(-1, model.config.output_vocab_size), fine_targets_8.view(-1))\n",
753
+ "\n",
754
+ " loss = (loss_7 + loss_8) / 2\n",
755
+ " validation_loss += loss.item()\n",
756
+ " num_batches += 1\n",
757
+ " num_samples += val_batch['fine_tokens'].size(0)\n",
758
+ "\n",
759
+ " average_validation_loss = validation_loss / num_batches\n",
760
+ " logger.info(f\"Validation Loss: {average_validation_loss} over {num_samples} samples and {num_batches} batches.\")\n",
761
+ " print(f\"Validation Loss: {average_validation_loss} over {num_samples} samples and {num_batches} batches.\")"
762
+ ]
763
+ },
764
+ {
765
+ "attachments": {},
766
+ "cell_type": "markdown",
767
+ "metadata": {},
768
+ "source": [
769
+ "# Training"
770
+ ]
771
+ },
772
+ {
773
+ "cell_type": "code",
774
+ "execution_count": null,
775
+ "metadata": {},
776
+ "outputs": [],
777
+ "source": [
778
+ "# Only show the progress bar once on each machine.\n",
779
+ "progress_bar = tqdm(range(global_step, max_train_steps), disable=not accelerator.is_local_main_process)\n",
780
+ "progress_bar.set_description(\"Steps\")\n",
781
+ "\n",
782
+ "for epoch in range(first_epoch, num_train_epochs):\n",
783
+ " model.train()\n",
784
+ " for step, batch in enumerate(train_dataloader):\n",
785
+ " # Skip steps until we reach the resumed step\n",
786
+ " if resume_from_checkpoint and epoch == first_epoch and step < resume_step:\n",
787
+ " if step % grad_accum == 0:\n",
788
+ " progress_bar.update(1)\n",
789
+ " continue\n",
790
+ "\n",
791
+ " with accelerator.accumulate(model):\n",
792
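+ " # The fine model predicts codebook n from codebooks 0..n-1, so the inputs for each\n",
+ " # target codebook have the later codebooks zeroed out. Only the 7th and 8th\n",
+ " # codebooks (indices 6 and 7) are trained here.\n",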
+ " fine_targets_7 = batch['fine_tokens'][:, :, 6]\n",
793
+ " fine_tokens_input_7 = torch.cat([batch['fine_tokens'][:, :, :6], torch.zeros_like(batch['fine_tokens'][:, :, 6:])], dim=2)\n",
794
+ " fine_targets_8 = batch['fine_tokens'][:, :, 7]\n",
795
+ " fine_tokens_input_8 = torch.cat([batch['fine_tokens'][:, :, :7], torch.zeros_like(batch['fine_tokens'][:, :, 7:])], dim=2)\n",
796
+ "\n",
797
+ " # Forward pass\n",
798
+ " logits_7 = model(6, fine_tokens_input_7)\n",
799
+ " logits_8 = model(7, fine_tokens_input_8)\n",
800
+ "\n",
801
+ " # Calculate the loss\n",
802
+ " loss_7 = criterion(logits_7.view(-1, model.config.output_vocab_size), fine_targets_7.view(-1))\n",
803
+ " loss_8 = criterion(logits_8.view(-1, model.config.output_vocab_size), fine_targets_8.view(-1))\n",
804
+ "\n",
805
+ " loss = (loss_7 + loss_8) / 2\n",
806
+ "\n",
807
+ " accelerator.backward(loss)\n",
808
+ " if accelerator.sync_gradients:\n",
809
+ " params_to_clip = (\n",
810
+ " param for param in model.parameters() if param.requires_grad\n",
811
+ " )\n",
812
+ " accelerator.clip_grad_norm_(params_to_clip, max_grad_norm)\n",
813
+ " optimizer.step()\n",
814
+ " lr_scheduler.step()\n",
815
+ " optimizer.zero_grad()\n",
816
+ "\n",
817
+ " # Checks if the accelerator has performed an optimization step behind the scenes\n",
818
+ " if accelerator.sync_gradients:\n",
819
+ " progress_bar.update(1)\n",
820
+ " global_step += 1\n",
821
+ "\n",
822
+ " if global_step % checkpointing_steps == 0:\n",
823
+ " if accelerator.is_main_process:\n",
824
+ " save_path = os.path.join(output_dir, f\"checkpoint-{global_step}\")\n",
825
+ " accelerator.save_state(save_path)\n",
826
+ " logger.info(f\"Saved state to {save_path}\")\n",
827
+ "\n",
828
+ " logs = {\"loss\": loss.detach().item(), \"lr\": lr_scheduler.get_last_lr()[0]}\n",
829
+ " progress_bar.set_postfix(**logs)\n",
830
+ " accelerator.log(logs, step=global_step)\n",
831
+ "\n",
832
+ " if global_step >= max_train_steps:\n",
833
+ " break\n",
834
+ " \n",
835
+ " accelerator.wait_for_everyone()\n",
836
+ "\n",
837
+ "if accelerator.is_main_process:\n",
838
+ " if lora_dim > 0:\n",
839
+ " model = convert_lora_to_linear_layer(model)\n",
840
+ " # save model\n",
841
+ " accelerator.save(model.state_dict(), os.path.join(output_dir, \"pytorch_model.bin\"))\n",
842
+ " \n",
843
+ " config = model.config.__dict__\n",
844
+ " # save config\n",
845
+ " with open(os.path.join(output_dir, \"config.json\"), \"w\") as f:\n",
846
+ " json.dump(config, f, indent=2)\n",
847
+ "\n",
848
+ "accelerator.end_training()"
849
+ ]
850
+ },
851
+ {
852
+ "attachments": {},
853
+ "cell_type": "markdown",
854
+ "metadata": {},
855
+ "source": [
856
+ "# Validation"
857
+ ]
858
+ },
859
+ {
860
+ "cell_type": "code",
861
+ "execution_count": null,
862
+ "metadata": {},
863
+ "outputs": [],
864
+ "source": [
865
+ "if accelerator.is_main_process:\n",
866
+ " model.eval()\n",
867
+ " validation_loss = 0.0\n",
868
+ " num_batches = 0\n",
869
+ " num_samples = 0\n",
870
+ " with torch.no_grad():\n",
871
+ " for val_step, val_batch in enumerate(validation_dataloader):\n",
872
+ " # Similar to training, process the validation batch\n",
873
+ " fine_targets_7 = val_batch['fine_tokens'][:, :, 6]\n",
874
+ " fine_tokens_input_7 = torch.cat([val_batch['fine_tokens'][:, :, :6], torch.zeros_like(val_batch['fine_tokens'][:, :, 6:])], dim=2)\n",
875
+ " fine_targets_8 = val_batch['fine_tokens'][:, :, 7]\n",
876
+ " fine_tokens_input_8 = torch.cat([val_batch['fine_tokens'][:, :, :7], torch.zeros_like(val_batch['fine_tokens'][:, :, 7:])], dim=2)\n",
877
+ "\n",
878
+ " # Forward pass for validation\n",
879
+ " logits_7 = model(6, fine_tokens_input_7)\n",
880
+ " logits_8 = model(7, fine_tokens_input_8)\n",
881
+ "\n",
882
+ " # Calculate the validation loss\n",
883
+ " loss_7 = criterion(logits_7.view(-1, model.config.output_vocab_size), fine_targets_7.view(-1))\n",
884
+ " loss_8 = criterion(logits_8.view(-1, model.config.output_vocab_size), fine_targets_8.view(-1))\n",
885
+ "\n",
886
+ " loss = (loss_7 + loss_8) / 2\n",
887
+ " validation_loss += loss.item()\n",
888
+ " num_batches += 1\n",
889
+ " num_samples += val_batch['fine_tokens'].size(0)\n",
890
+ "\n",
891
+ " average_validation_loss = validation_loss / num_batches\n",
892
+ " logger.info(f\"Validation Loss: {average_validation_loss} over {num_samples} samples and {num_batches} batches.\")\n",
893
+ " print(f\"Validation Loss: {average_validation_loss} over {num_samples} samples and {num_batches} batches.\")"
894
+ ]
895
+ }
896
+ ],
897
+ "metadata": {
898
+ "kernelspec": {
899
+ "display_name": "Python 3",
900
+ "language": "python",
901
+ "name": "python3"
902
+ },
903
+ "language_info": {
904
+ "codemirror_mode": {
905
+ "name": "ipython",
906
+ "version": 3
907
+ },
908
+ "file_extension": ".py",
909
+ "mimetype": "text/x-python",
910
+ "name": "python",
911
+ "nbconvert_exporter": "python",
912
+ "pygments_lexer": "ipython3",
913
+ "version": "3.10.8"
914
+ },
915
+ "orig_nbformat": 4
916
+ },
917
+ "nbformat": 4,
918
+ "nbformat_minor": 2
919
+ }
train_semantic.ipynb ADDED
@@ -0,0 +1,899 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Imports"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "metadata": {},
15
+ "outputs": [],
16
+ "source": [
17
+ "import torch\n",
18
+ "import torch.nn as nn\n",
19
+ "import os\n",
20
+ "import re\n",
21
+ "import gc\n",
22
+ "import json\n",
23
+ "import math\n",
24
+ "import hashlib\n",
25
+ "import numpy as np\n",
26
+ "import logging\n",
27
+ "import torchaudio\n",
28
+ "from tqdm.auto import tqdm\n",
29
+ "import torch.nn.functional as F\n",
30
+ "from encodec.utils import convert_audio\n",
31
+ "from accelerate import Accelerator\n",
32
+ "from accelerate.utils import set_seed\n",
33
+ "from transformers import BertTokenizer\n",
34
+ "from huggingface_hub import hf_hub_download\n",
35
+ "from packaging import version\n",
36
+ "from diffusers.optimization import get_scheduler\n",
37
+ "\n",
38
+ "from utils.bitsandbytes import BitsAndBytesConfig, importlib_metadata, get_keys_to_not_convert, replace_with_bnb_linear, set_module_quantized_tensor_to_device\n",
39
+ "from utils.lora import convert_linear_layer_to_lora, only_optimize_lora_parameters, convert_lora_to_linear_layer\n",
40
+ "from bark.model import GPTConfig, GPT\n",
41
+ "from bark.model_fine import FineGPT, FineGPTConfig"
42
+ ]
43
+ },
44
+ {
45
+ "attachments": {},
46
+ "cell_type": "markdown",
47
+ "metadata": {},
48
+ "source": [
49
+ "# Training Args"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": null,
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "train_batch_size = 8\n",
59
+ "eval_batch_size = 8\n",
60
+ "grad_accum = 2\n",
61
+ "ckpt_path = 'models/text_2.pt'\n",
62
+ "model_type = \"text\"\n",
63
+ "dataset_path = 'datasets/joe_biden_state_of_union/'\n",
64
+ "logging_dir = 'logs/'\n",
65
+ "log_with = 'wandb'\n",
66
+ "hubert_path = 'data/models/hubert/hubert.pt'\n",
67
+ "hubert_tokenizer_path = 'data/models/hubert/tokenizer.pth'\n",
68
+ "\n",
69
+ "output_dir = 'semantic_output/'\n",
70
+ "resume_from_checkpoint = None\n",
71
+ "\n",
72
+ "checkpointing_steps = 1000\n",
73
+ "\n",
74
+ "mixed_precision = 'bf16'\n",
75
+ "bits = 16 #4 4 and 8 bit are a work in progress\n",
76
+ "compute_dtype = torch.bfloat16\n",
77
+ "double_quant = True\n",
78
+ "quant_type = 'nf4'\n",
79
+ "\n",
80
+ "lora_dim = 64\n",
81
+ "lora_scaling = 1\n",
82
+ "lora_dropout = 0.1\n",
83
+ "lora_module_name = 'transformer.h'\n",
84
+ "optimize_lora_params_only = False\n",
85
+ "\n",
86
+ "learning_rate = 1e-4\n",
87
+ "scale_lr = False\n",
88
+ "use_8bit_adam = False\n",
89
+ "adam_beta1 = 0.9\n",
90
+ "adam_beta2 = 0.999\n",
91
+ "adam_epsilon = 1e-8\n",
92
+ "weight_decay = 0.01\n",
93
+ "\n",
94
+ "llm_int8_skip_modules = None\n",
95
+ "keep_in_fp32_modules = ['lm_head']\n",
96
+ "\n",
97
+ "lr_scheduler_type = 'linear'\n",
98
+ "lr_warmup_steps = 60\n",
99
+ "num_train_epochs = 5\n",
100
+ "max_train_steps = None\n",
101
+ "max_grad_norm = 1.0\n",
102
+ "\n",
103
+ "seed = 741"
104
+ ]
105
+ },
106
+ {
107
+ "attachments": {},
108
+ "cell_type": "markdown",
109
+ "metadata": {},
110
+ "source": [
111
+ "# Define Functions"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": null,
117
+ "metadata": {},
118
+ "outputs": [],
119
+ "source": [
120
+ "CONTEXT_WINDOW_SIZE = 1024\n",
121
+ "\n",
122
+ "MAX_TEXT_LEN = 256\n",
123
+ "\n",
124
+ "SEMANTIC_RATE_HZ = 49.9\n",
125
+ "SEMANTIC_VOCAB_SIZE = 10_000\n",
126
+ "\n",
127
+ "TEXT_ENCODING_OFFSET = 10_048\n",
128
+ "SEMANTIC_PAD_TOKEN = 10_000\n",
129
+ "TEXT_PAD_TOKEN = 129_595\n",
130
+ "SEMANTIC_INFER_TOKEN = 129_599\n",
131
+ "\n",
132
+ "MAX_SEMANTIC_LEN = 511\n",
133
+ "\n",
134
+ "SAMPLE_RATE = 24_000\n",
135
+ "CHANNELS = 1\n",
136
+ "\n",
137
+ "logger = logging.getLogger(__name__)\n",
138
+ "\n",
139
+ "\n",
140
+ "USE_SMALL_MODELS = os.environ.get(\"SERP_USE_SMALL_MODELS\", False)\n",
141
+ "\n",
142
+ "default_cache_dir = os.path.join(os.path.expanduser(\"~\"), \".cache\")\n",
143
+ "CACHE_DIR = os.path.join(os.getenv(\"XDG_CACHE_HOME\", default_cache_dir), \"serp\", \"bark_v0\")\n",
144
+ "\n",
145
+ "\n",
146
+ "def _clear_cuda_cache():\n",
147
+ " if torch.cuda.is_available():\n",
148
+ " torch.cuda.empty_cache()\n",
149
+ " torch.cuda.synchronize()\n",
150
+ "\n",
151
+ "\n",
152
+ "def _md5(fname):\n",
153
+ " hash_md5 = hashlib.md5()\n",
154
+ " with open(fname, \"rb\") as f:\n",
155
+ " for chunk in iter(lambda: f.read(4096), b\"\"):\n",
156
+ " hash_md5.update(chunk)\n",
157
+ " return hash_md5.hexdigest()\n",
158
+ "\n",
159
+ "\n",
160
+ "def _download(from_hf_path, file_name, to_local_path):\n",
161
+ " to_local_path = to_local_path.replace(\"\\\\\", \"/\")\n",
162
+ " path = '/'.join(to_local_path.split(\"/\")[:-1])\n",
163
+ " os.makedirs(path, exist_ok=True)\n",
164
+ " hf_hub_download(repo_id=from_hf_path, filename=file_name, local_dir=path)\n",
165
+ " os.replace(os.path.join(path, file_name), to_local_path)\n",
166
+ "\n",
167
+ "\n",
168
+ "def _tokenize(tokenizer, text):\n",
169
+ " return tokenizer.encode(text, add_special_tokens=False)\n",
170
+ "\n",
171
+ "\n",
172
+ "def _detokenize(tokenizer, enc_text):\n",
173
+ " return tokenizer.decode(enc_text)\n",
174
+ "\n",
175
+ "\n",
176
+ "def _normalize_whitespace(text):\n",
177
+ " return re.sub(r\"\\s+\", \" \", text).strip()\n",
178
+ "\n",
179
+ "\n",
180
+ "REMOTE_MODEL_PATHS = {\n",
181
+ " \"text_small\": {\n",
182
+ " \"repo_id\": \"suno/bark\",\n",
183
+ " \"file_name\": \"text.pt\",\n",
184
+ " \"checksum\": \"b3e42bcbab23b688355cd44128c4cdd3\",\n",
185
+ " },\n",
186
+ " \"coarse_small\": {\n",
187
+ " \"repo_id\": \"suno/bark\",\n",
188
+ " \"file_name\": \"coarse.pt\",\n",
189
+ " \"checksum\": \"5fe964825e3b0321f9d5f3857b89194d\",\n",
190
+ " },\n",
191
+ " \"fine_small\": {\n",
192
+ " \"repo_id\": \"suno/bark\",\n",
193
+ " \"file_name\": \"fine.pt\",\n",
194
+ " \"checksum\": \"5428d1befe05be2ba32195496e58dc90\",\n",
195
+ " },\n",
196
+ " \"text\": {\n",
197
+ " \"repo_id\": \"suno/bark\",\n",
198
+ " \"file_name\": \"text_2.pt\",\n",
199
+ " \"checksum\": \"54afa89d65e318d4f5f80e8e8799026a\",\n",
200
+ " },\n",
201
+ " \"coarse\": {\n",
202
+ " \"repo_id\": \"suno/bark\",\n",
203
+ " \"file_name\": \"coarse_2.pt\",\n",
204
+ " \"checksum\": \"8a98094e5e3a255a5c9c0ab7efe8fd28\",\n",
205
+ " },\n",
206
+ " \"fine\": {\n",
207
+ " \"repo_id\": \"suno/bark\",\n",
208
+ " \"file_name\": \"fine_2.pt\",\n",
209
+ " \"checksum\": \"59d184ed44e3650774a2f0503a48a97b\",\n",
210
+ " },\n",
211
+ "}\n",
212
+ "\n",
213
+ "\n",
214
+ "def _load_model(ckpt_path, device, use_small=False, model_type=\"text\"):\n",
215
+ " if model_type == \"text\":\n",
216
+ " ConfigClass = GPTConfig\n",
217
+ " ModelClass = GPT\n",
218
+ " elif model_type == \"coarse\":\n",
219
+ " ConfigClass = GPTConfig\n",
220
+ " ModelClass = GPT\n",
221
+ " elif model_type == \"fine\":\n",
222
+ " ConfigClass = FineGPTConfig\n",
223
+ " ModelClass = FineGPT\n",
224
+ " else:\n",
225
+ " raise NotImplementedError()\n",
226
+ " model_key = f\"{model_type}_small\" if use_small or USE_SMALL_MODELS else model_type\n",
227
+ " model_info = REMOTE_MODEL_PATHS[model_key]\n",
228
+ " if ckpt_path in [None, '']:\n",
229
+ " ckpt_path = os.path.join(CACHE_DIR, model_info[\"file_name\"])\n",
230
+ " if not os.path.exists(ckpt_path):\n",
231
+ " logger.info(f\"{model_type} model not found, downloading into `{CACHE_DIR}`.\")\n",
232
+ " _download(model_info[\"repo_id\"], model_info[\"file_name\"], ckpt_path)\n",
233
+ " checkpoint = torch.load(ckpt_path, map_location=device)\n",
234
+ " # this is a hack\n",
235
+ " model_args = checkpoint[\"model_args\"]\n",
236
+ " if \"input_vocab_size\" not in model_args:\n",
237
+ " model_args[\"input_vocab_size\"] = model_args[\"vocab_size\"]\n",
238
+ " model_args[\"output_vocab_size\"] = model_args[\"vocab_size\"]\n",
239
+ " del model_args[\"vocab_size\"]\n",
240
+ " gptconf = ConfigClass(**checkpoint[\"model_args\"])\n",
241
+ " model = ModelClass(gptconf)\n",
242
+ " state_dict = checkpoint[\"model\"]\n",
243
+ " # fixup checkpoint\n",
244
+ " unwanted_prefix = \"_orig_mod.\"\n",
245
+ " for k, v in list(state_dict.items()):\n",
246
+ " if k.startswith(unwanted_prefix):\n",
247
+ " state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)\n",
248
+ " extra_keys = set(state_dict.keys()) - set(model.state_dict().keys())\n",
249
+ " extra_keys = set([k for k in extra_keys if not k.endswith(\".attn.bias\")])\n",
250
+ " missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())\n",
251
+ " missing_keys = set([k for k in missing_keys if not k.endswith(\".attn.bias\")])\n",
252
+ " if len(extra_keys) != 0:\n",
253
+ " raise ValueError(f\"extra keys found: {extra_keys}\")\n",
254
+ " if len(missing_keys) != 0:\n",
255
+ " raise ValueError(f\"missing keys: {missing_keys}\")\n",
256
+ " model.load_state_dict(state_dict, strict=False)\n",
257
+ " n_params = model.get_num_params()\n",
258
+ " val_loss = checkpoint[\"best_val_loss\"].item()\n",
259
+ " print(f\"Loaded {model_type} model with {n_params} params, val_loss={val_loss:.4f}.\")\n",
260
+ " del checkpoint, state_dict\n",
261
+ " _clear_cuda_cache()\n",
262
+ " if model_type == \"text\":\n",
263
+ " tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\")\n",
264
+ " return model, tokenizer\n",
265
+ " return model\n",
266
+ "\n",
267
+ "\n",
268
+ "def load_filepaths_and_text(filename, split=\"|\"):\n",
269
+ " with open(filename, encoding='utf-8', errors='ignore') as f:\n",
270
+ " filepaths_and_text = [line.strip().split(split) for line in f]\n",
271
+ " base = os.path.dirname(filename)\n",
272
+ " for j in range(len(filepaths_and_text)):\n",
273
+ " filepaths_and_text[j][0] = os.path.join(base, filepaths_and_text[j][0])\n",
274
+ " return filepaths_and_text\n",
275
+ "\n",
276
+ "class TtsDataset(torch.utils.data.Dataset):\n",
277
+ " def __init__(self, opt):\n",
278
+ " self.path = os.path.dirname(opt['path'])\n",
279
+ " self.mode = opt['mode']\n",
280
+ " self.audiopaths_and_text = load_filepaths_and_text(os.path.join(opt['path'] , opt['mode'] + '_valid.txt'))\n",
281
+ " self.tokenizer = opt['tokenizer']\n",
282
+ "\n",
283
+ " def __getitem__(self, index):\n",
284
+ " audiopath_and_text = self.audiopaths_and_text[index]\n",
285
+ " audiopath, text = audiopath_and_text[0], audiopath_and_text[1]\n",
286
+ "\n",
287
+ " input_ids = np.array(_tokenize(self.tokenizer, text)) + TEXT_ENCODING_OFFSET\n",
288
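+ " # Offset the BERT token ids by TEXT_ENCODING_OFFSET so they land in the text region of Bark's joint vocabulary.\n",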
+ " input_ids = torch.from_numpy(input_ids).long()\n",
289
+ " tokens = np.load(audiopath.replace('.wav', '.npz').replace('wavs', 'tokens'))\n",
290
+ " semantic_tokens = tokens['semantic']\n",
291
+ " semantic_tokens = torch.from_numpy(semantic_tokens).long()\n",
292
+ "\n",
293
+ " return input_ids, semantic_tokens\n",
294
+ "\n",
295
+ " def __len__(self):\n",
296
+ " return len(self.audiopaths_and_text)\n",
297
+ "\n",
298
+ "\n",
299
+ "class TtsCollater():\n",
300
+ " def __init__(self):\n",
301
+ " pass\n",
302
+ " def __call__(self, batch):\n",
303
+ " max_text_len = MAX_TEXT_LEN\n",
304
+ " max_semantic_tokens_len = MAX_SEMANTIC_LEN\n",
305
+ " texts = []\n",
306
+ " semantic_tokens = []\n",
307
+ "\n",
308
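+ " # Each prompt is laid out as: padded text tokens, 256 semantic-history slots\n",
+ " # (filled with SEMANTIC_PAD_TOKEN since no voice history is used), then the\n",
+ " # SEMANTIC_INFER_TOKEN; targets are semantic tokens padded/truncated to MAX_SEMANTIC_LEN.\n",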
+ " for b in batch:\n",
309
+ " text, semantic_tokens_ = b\n",
310
+ " text = F.pad(text, (0, max_text_len-len(text)), value=TEXT_PAD_TOKEN)\n",
311
+ " semantic_history = torch.from_numpy(np.array([SEMANTIC_PAD_TOKEN] * 256))\n",
312
+ " text = torch.cat([text, semantic_history, torch.tensor([SEMANTIC_INFER_TOKEN])])\n",
313
+ " texts.append(text)\n",
314
+ " semantic_tokens_ = semantic_tokens_[:max_semantic_tokens_len]\n",
315
+ " semantic_tokens.append(F.pad(semantic_tokens_, (0, max_semantic_tokens_len-len(semantic_tokens_)), value=SEMANTIC_PAD_TOKEN))\n",
316
+ "\n",
317
+ " return {\n",
318
+ " 'input_ids': torch.stack(texts).contiguous(),\n",
319
+ " 'semantic_tokens': torch.stack(semantic_tokens).contiguous()\n",
320
+ " }\n",
321
+ " \n",
322
+ "\n",
323
+ "accelerator = Accelerator(\n",
324
+ " gradient_accumulation_steps=grad_accum,\n",
325
+ " mixed_precision=mixed_precision,\n",
326
+ " log_with=log_with,\n",
327
+ " logging_dir=logging_dir,\n",
328
+ ")\n",
329
+ "device = accelerator.device\n",
330
+ "\n",
331
+ "os.makedirs(output_dir, exist_ok=True)\n",
332
+ "\n",
333
+ "set_seed(seed)"
334
+ ]
335
+ },
336
+ {
337
+ "attachments": {},
338
+ "cell_type": "markdown",
339
+ "metadata": {},
340
+ "source": [
341
+ "# Setup Dataset (only need to do this once)"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": null,
347
+ "metadata": {},
348
+ "outputs": [],
349
+ "source": [
350
+ "max_duration_sec = 15.12 # the maximum allowed duration in seconds\n",
351
+ "\n",
352
+ "path = dataset_path\n",
353
+ "\n",
354
+ "# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer\n",
355
+ "from hubert.hubert_manager import HuBERTManager\n",
356
+ "hubert_manager = HuBERTManager()\n",
357
+ "from hubert.pre_kmeans_hubert import CustomHubert\n",
358
+ "from hubert.customtokenizer import CustomTokenizer\n",
359
+ "hubert_manager.make_sure_hubert_installed()\n",
360
+ "hubert_manager.make_sure_tokenizer_installed()\n",
361
+ "\n",
362
+ "# Load the HuBERT model\n",
363
+ "hubert_model = CustomHubert(checkpoint_path=hubert_path).to(device)\n",
364
+ "hubert_model.eval()\n",
365
+ "for param in hubert_model.parameters():\n",
366
+ " param.requires_grad = False\n",
367
+ "\n",
368
+ "# Load the CustomTokenizer model\n",
369
+ "hubert_tokenizer = CustomTokenizer.load_from_checkpoint(hubert_tokenizer_path).to(device) # Automatically uses the right layers\n",
370
+ "\n",
371
+ "from bark.generation import load_codec_model\n",
372
+ "codec_model = load_codec_model(use_gpu=True)\n",
373
+ "codec_model.eval()\n",
374
+ "for param in codec_model.parameters():\n",
375
+ " param.requires_grad = False\n",
376
+ "\n",
377
+ "\n",
378
+ "def get_duration(wav, sr):\n",
379
+ " return wav.shape[1] / sr\n",
380
+ "\n",
381
+ "valid_lines_train = []\n",
382
+ "# convert wavs to semantic tokens\n",
383
+ "for wav_path, txt in load_filepaths_and_text(path + 'train.txt'):\n",
384
+ " wav, sr = torchaudio.load(wav_path)\n",
385
+ " if not get_duration(wav, sr) > max_duration_sec:\n",
386
+ " valid_lines_train.append((wav_path, txt))\n",
387
+ " wav = convert_audio(wav, sr, SAMPLE_RATE, CHANNELS).to(device)\n",
388
+ "\n",
389
+ " semantic_vectors = hubert_model.forward(wav, input_sample_hz=SAMPLE_RATE)\n",
390
+ " semantic_tokens = hubert_tokenizer.get_token(semantic_vectors)\n",
391
+ "\n",
392
+ " # save semantic tokens\n",
393
+ " os.makedirs(os.path.join(path, 'tokens'), exist_ok=True)\n",
394
+ " semantic_tokens = semantic_tokens.cpu().numpy()\n",
395
+ "\n",
396
+ " # Extract discrete codes from EnCodec\n",
397
+ " with torch.no_grad():\n",
398
+ " encoded_frames = codec_model.encode(wav.unsqueeze(0))\n",
399
+ " codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]\n",
400
+ "\n",
401
+ " # move codes to cpu\n",
402
+ " codes = codes.cpu().numpy()\n",
403
+ "\n",
404
+ " # save tokens\n",
405
+ " np.savez_compressed(os.path.join(path, 'tokens', os.path.basename(wav_path).replace('.wav', '.npz')), fine=codes, coarse=codes[:2, :], semantic=semantic_tokens)\n",
406
+ "\n",
407
+ "# rewrite train.txt with valid lines\n",
408
+ "with open(path + 'train_valid.txt', 'w', encoding='utf-8') as f:\n",
409
+ " for wav_path, txt in valid_lines_train:\n",
410
+ " wav_path = os.path.relpath(wav_path, dataset_path).replace('\\\\', '/')\n",
411
+ " f.write(f'{wav_path}|{txt}\\n')\n",
412
+ "\n",
413
+ "valid_lines_valid = []\n",
414
+ "for wav_path, txt in load_filepaths_and_text(path + 'valid.txt'):\n",
415
+ " wav, sr = torchaudio.load(wav_path)\n",
416
+ " if not get_duration(wav, sr) > max_duration_sec:\n",
417
+ " valid_lines_valid.append((wav_path, txt))\n",
418
+ " wav = convert_audio(wav, sr, SAMPLE_RATE, CHANNELS).to(device)\n",
419
+ "\n",
420
+ " semantic_vectors = hubert_model.forward(wav, input_sample_hz=SAMPLE_RATE)\n",
421
+ " semantic_tokens = hubert_tokenizer.get_token(semantic_vectors)\n",
422
+ "\n",
423
+ " # save semantic tokens\n",
424
+ " os.makedirs(os.path.join(path, 'tokens'), exist_ok=True)\n",
425
+ " semantic_tokens = semantic_tokens.cpu().numpy()\n",
426
+ " \n",
427
+ " # Extract discrete codes from EnCodec\n",
428
+ " with torch.no_grad():\n",
429
+ " encoded_frames = codec_model.encode(wav.unsqueeze(0))\n",
430
+ " codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]\n",
431
+ "\n",
432
+ " # move codes to cpu\n",
433
+ " codes = codes.cpu().numpy()\n",
434
+ "\n",
435
+ " # save tokens\n",
436
+ " np.savez_compressed(os.path.join(path, 'tokens', os.path.basename(wav_path).replace('.wav', '.npz')), fine=codes, coarse=codes[:2, :], semantic=semantic_tokens)\n",
437
+ "\n",
438
+ "# rewrite valid.txt with valid lines\n",
439
+ "with open(path + 'valid_valid.txt', 'w', encoding='utf-8') as f:\n",
440
+ " for wav_path, txt in valid_lines_valid:\n",
441
+ " wav_path = os.path.relpath(wav_path, dataset_path).replace('\\\\', '/')\n",
442
+ " f.write(f'{wav_path}|{txt}\\n')\n",
443
+ "\n",
444
+ "del hubert_model\n",
445
+ "del hubert_tokenizer\n",
446
+ "del codec_model\n",
447
+ "gc.collect()\n",
448
+ "torch.cuda.empty_cache()"
449
+ ]
450
+ },
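After this cell runs, every kept clip has a compressed token file under `tokens/`, and the filtered filelists (`train_valid.txt` / `valid_valid.txt`) use the `wav_path|transcript` format written above. A minimal sanity check, with placeholder paths you should swap for your own `dataset_path` and one of the generated files:

```python
import numpy as np

# Placeholder paths: substitute your dataset_path and an actual clip basename.
tokens = np.load("my_dataset/tokens/clip_0001.npz")

print(tokens["semantic"].shape)  # semantic token ids from the HuBERT tokenizer
print(tokens["coarse"].shape)    # first two EnCodec codebooks (codes[:2, :])
print(tokens["fine"].shape)      # all EnCodec codebooks, shape [n_q, T]

# Each filelist line is "relative/path/to/clip.wav|transcript"
with open("my_dataset/train_valid.txt", encoding="utf-8") as f:
    print(f.readline().strip())
```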
451
+ {
452
+ "attachments": {},
453
+ "cell_type": "markdown",
454
+ "metadata": {},
455
+ "source": [
456
+ "# Setup"
457
+ ]
458
+ },
459
+ {
460
+ "cell_type": "code",
461
+ "execution_count": null,
462
+ "metadata": {},
463
+ "outputs": [],
464
+ "source": [
465
+ "model, tokenizer = _load_model(ckpt_path, device, use_small=False, model_type=model_type)"
466
+ ]
467
+ },
468
+ {
469
+ "cell_type": "code",
470
+ "execution_count": null,
471
+ "metadata": {},
472
+ "outputs": [],
473
+ "source": [
474
+ "if scale_lr:\n",
475
+ " learning_rate = (\n",
476
+ " learning_rate * grad_accum * train_batch_size * accelerator.num_processes\n",
477
+ " )\n",
478
+ "\n",
479
+ "if use_8bit_adam:\n",
480
+ " try:\n",
481
+ " import bitsandbytes as bnb\n",
482
+ " except ImportError:\n",
483
+ " raise ImportError(\n",
484
+ " \"To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`.\"\n",
485
+ " )\n",
486
+ "\n",
487
+ " optimizer_class = bnb.optim.AdamW8bit\n",
488
+ "else:\n",
489
+ " optimizer_class = torch.optim.AdamW"
490
+ ]
491
+ },
492
+ {
493
+ "cell_type": "code",
494
+ "execution_count": null,
495
+ "metadata": {},
496
+ "outputs": [],
497
+ "source": [
498
+ "quantization_config=BitsAndBytesConfig(\n",
499
+ " load_in_4bit=bits == 4,\n",
500
+ " load_in_8bit=bits == 8,\n",
501
+ " llm_int8_threshold=6.0,\n",
502
+ " llm_int8_has_fp16_weight=False,\n",
503
+ " bnb_4bit_compute_dtype=compute_dtype,\n",
504
+ " bnb_4bit_use_double_quant=double_quant,\n",
505
+ " bnb_4bit_quant_type=quant_type # {'fp4', 'nf4'}\n",
506
+ ")\n",
507
+ "\n",
508
+ "# if quantization_config.load_in_8bit or quantization_config.load_in_4bit:\n",
509
+ "# if quantization_config.load_in_8bit:\n",
510
+ "# logger.info(\"Detected 8-bit loading: activating 8-bit loading for this model\")\n",
511
+ "# elif quantization_config.load_in_4bit:\n",
512
+ "# logger.info(\"Detected 4-bit loading: activating 4-bit loading for this model\")\n",
513
+ "\n",
514
+ "# # We keep some modules such as the lm_head in their original dtype for numerical stability reasons\n",
515
+ "# if llm_int8_skip_modules is None or len(llm_int8_skip_modules) == 0:\n",
516
+ "# modules_to_not_convert = [] # get_keys_to_not_convert(model)\n",
517
+ "# else:\n",
518
+ "# modules_to_not_convert = llm_int8_skip_modules\n",
519
+ "\n",
520
+ "# if not isinstance(modules_to_not_convert, list):\n",
521
+ "# modules_to_not_convert = [modules_to_not_convert]\n",
522
+ "\n",
523
+ "# modules_to_not_convert.extend(keep_in_fp32_modules)\n",
524
+ "\n",
525
+ "# supports_4bit = version.parse(importlib_metadata.version(\"bitsandbytes\")) >= version.parse(\"0.39.0\")\n",
526
+ "\n",
527
+ "# if quantization_config.load_in_4bit and not supports_4bit:\n",
528
+ "# raise ValueError(\n",
529
+ "# \"You have a version of `bitsandbytes` that is not compatible with 4bit inference and training\"\n",
530
+ "# \" make sure you have the latest version of `bitsandbytes` installed\"\n",
531
+ "# )\n",
532
+ " \n",
533
+ "# if len(modules_to_not_convert) == 0:\n",
534
+ "# modules_to_not_convert = None\n",
535
+ "\n",
536
+ "# model = replace_with_bnb_linear(\n",
537
+ "# model, modules_to_not_convert=modules_to_not_convert, quantization_config=quantization_config\n",
538
+ "# )\n",
539
+ "\n",
540
+ "# # training in 8-bit is only available in 0.37.0+\n",
541
+ "# model._is_kbit_training_enabled = version.parse(\n",
542
+ "# importlib_metadata.version(\"bitsandbytes\")\n",
543
+ "# ) >= version.parse(\"0.37.0\")\n",
544
+ "\n",
545
+ "# model.config.quantization_config = quantization_config"
546
+ ]
547
+ },
548
+ {
549
+ "cell_type": "code",
550
+ "execution_count": null,
551
+ "metadata": {},
552
+ "outputs": [],
553
+ "source": [
554
+ "if bits == 4:\n",
555
+ " from accelerate.utils import CustomDtype\n",
556
+ " target_dtype = CustomDtype.INT4\n",
557
+ "elif bits == 8:\n",
558
+ " target_dtype = torch.int8\n",
559
+ "\n",
560
+ "if lora_dim > 0:\n",
561
+ " for param in model.parameters():\n",
562
+ " if param.ndim == 1:\n",
563
+ " # cast the small parameters (e.g. layernorm) to fp32 for stability\n",
564
+ " param.data = param.data.to(torch.float32)\n",
565
+ " \n",
566
+ " class CastOutputToFloat(nn.Sequential):\n",
567
+ " def forward(self, x):\n",
568
+ " return super().forward(x).to(torch.float32)\n",
569
+ "\n",
570
+ " model.lm_head = CastOutputToFloat(model.lm_head)\n",
571
+ "\n",
572
+ " model = convert_linear_layer_to_lora(model, lora_module_name,\n",
573
+ " lora_dim=lora_dim, lora_scaling=lora_scaling,\n",
574
+ " lora_dropout=lora_dropout)\n",
575
+ " if optimize_lora_params_only:\n",
576
+ " model = only_optimize_lora_parameters(model)"
577
+ ]
578
+ },
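When `lora_dim > 0` and `optimize_lora_params_only` is set, only the injected LoRA weights should remain trainable and feed into `params_to_optimize` in the next cell. A quick check that the freeze took effect (a sketch that only assumes the `model` object from the cell above):

```python
# Count how many parameters will actually receive gradients after the LoRA conversion.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable params: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")
```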
579
+ {
580
+ "cell_type": "code",
581
+ "execution_count": null,
582
+ "metadata": {},
583
+ "outputs": [],
584
+ "source": [
585
+ "params_to_optimize = (\n",
586
+ " param for param in model.parameters() if param.requires_grad\n",
587
+ " )\n",
588
+ "\n",
589
+ "optimizer = optimizer_class(\n",
590
+ " params_to_optimize,\n",
591
+ " lr=learning_rate,\n",
592
+ " betas=(adam_beta1, adam_beta2),\n",
593
+ " weight_decay=weight_decay,\n",
594
+ " eps=adam_epsilon,\n",
595
+ ")"
596
+ ]
597
+ },
598
+ {
599
+ "cell_type": "code",
600
+ "execution_count": null,
601
+ "metadata": {},
602
+ "outputs": [],
603
+ "source": [
604
+ "opt_train = {\n",
605
+ " 'path': dataset_path,\n",
606
+ " 'tokenizer': tokenizer,\n",
607
+ " 'mode': 'train',\n",
608
+ "}\n",
609
+ "\n",
610
+ "opt_val = {\n",
611
+ " 'path': dataset_path,\n",
612
+ " 'tokenizer': tokenizer,\n",
613
+ " 'mode': 'valid',\n",
614
+ "}\n",
615
+ "\n",
616
+ "train_dataset = TtsDataset(opt_train)\n",
617
+ "validation_dataset = TtsDataset(opt_val)\n",
618
+ "\n",
619
+ "train_dataloader = torch.utils.data.DataLoader(\n",
620
+ " train_dataset,\n",
621
+ " batch_size=train_batch_size,\n",
622
+ " collate_fn=TtsCollater(),\n",
623
+ ")\n",
624
+ "\n",
625
+ "validation_dataloader = torch.utils.data.DataLoader(\n",
626
+ " validation_dataset,\n",
627
+ " batch_size=eval_batch_size,\n",
628
+ " collate_fn=TtsCollater(),\n",
629
+ ")\n",
630
+ "\n",
631
+ "criterion = torch.nn.CrossEntropyLoss() #ignore_index=SEMANTIC_PAD_TOKEN)\n",
632
+ "\n",
633
+ "# Scheduler and math around the number of training steps.\n",
634
+ "overrode_max_train_steps = False\n",
635
+ "num_update_steps_per_epoch = math.ceil(len(train_dataloader) / grad_accum)\n",
636
+ "if max_train_steps is None:\n",
637
+ " max_train_steps = num_train_epochs * num_update_steps_per_epoch\n",
638
+ " overrode_max_train_steps = True\n",
639
+ "\n",
640
+ "lr_scheduler = get_scheduler(\n",
641
+ " lr_scheduler_type,\n",
642
+ " optimizer=optimizer,\n",
643
+ " num_warmup_steps=lr_warmup_steps * grad_accum,\n",
644
+ " num_training_steps=max_train_steps * grad_accum,\n",
645
+ ")"
646
+ ]
647
+ },
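For intuition about the step bookkeeping above, here is a worked example with made-up numbers (none of these values come from the notebook):

```python
import math

# Illustrative only: 1000 training clips, train_batch_size=8, grad_accum=2, 10 epochs.
batches_per_epoch = math.ceil(1000 / 8)                        # 125 micro-batches per epoch
num_update_steps_per_epoch = math.ceil(batches_per_epoch / 2)  # 63 optimizer updates per epoch
max_train_steps = 10 * num_update_steps_per_epoch              # 630 updates in total

# The scheduler is stepped once per micro-batch, which is why num_warmup_steps and
# num_training_steps are both multiplied by grad_accum in the cell above.
print(num_update_steps_per_epoch, max_train_steps)
```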
648
+ {
649
+ "cell_type": "code",
650
+ "execution_count": null,
651
+ "metadata": {},
652
+ "outputs": [],
653
+ "source": [
654
+ "model, optimizer, train_dataloader, validation_dataloader, lr_scheduler = accelerator.prepare(\n",
655
+ " model, optimizer, train_dataloader, validation_dataloader, lr_scheduler\n",
656
+ ")\n",
657
+ "accelerator.register_for_checkpointing(lr_scheduler)\n",
658
+ "\n",
659
+ "weight_dtype = torch.float32\n",
660
+ "if accelerator.mixed_precision == \"fp16\":\n",
661
+ " weight_dtype = torch.float16\n",
662
+ "elif accelerator.mixed_precision == \"bf16\":\n",
663
+ " weight_dtype = torch.bfloat16"
664
+ ]
665
+ },
666
+ {
667
+ "cell_type": "code",
668
+ "execution_count": null,
669
+ "metadata": {},
670
+ "outputs": [],
671
+ "source": [
672
+ "# We need to recalculate our total training steps as the size of the training dataloader may have changed.\n",
673
+ "num_update_steps_per_epoch = math.ceil(len(train_dataloader) / grad_accum)\n",
674
+ "if overrode_max_train_steps:\n",
675
+ " max_train_steps = num_train_epochs * num_update_steps_per_epoch\n",
676
+ "# Afterwards we recalculate our number of training epochs\n",
677
+ "num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)\n",
678
+ "\n",
679
+ "# We need to initialize the trackers we use, and also store our configuration.\n",
680
+ "# The trackers initializes automatically on the main process.\n",
681
+ "if accelerator.is_main_process:\n",
682
+ " accelerator.init_trackers(\"bark_semantic\", config={})\n",
683
+ "\n",
684
+ "# Train!\n",
685
+ "total_batch_size = train_batch_size * accelerator.num_processes * grad_accum\n",
686
+ "logger.info(\"***** Running training *****\")\n",
687
+ "logger.info(f\" Num examples = {len(train_dataset)}\")\n",
688
+ "logger.info(f\" Num batches each epoch = {len(train_dataloader)}\")\n",
689
+ "logger.info(f\" Num Epochs = {num_train_epochs}\")\n",
690
+ "logger.info(f\" Instantaneous batch size per device = {train_batch_size}\")\n",
691
+ "logger.info(f\" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}\")\n",
692
+ "logger.info(f\" Gradient Accumulation steps = {grad_accum}\")\n",
693
+ "logger.info(f\" Total optimization steps = {max_train_steps}\")\n",
694
+ "global_step = 0\n",
695
+ "first_epoch = 0\n",
696
+ "\n",
697
+ "if resume_from_checkpoint:\n",
698
+ " if resume_from_checkpoint != \"latest\":\n",
699
+ " path = os.path.basename(resume_from_checkpoint)\n",
700
+ " else:\n",
701
+ " # Get the most recent checkpoint\n",
702
+ " dirs = os.listdir(output_dir)\n",
703
+ " dirs = [d for d in dirs if d.startswith(\"checkpoint\")]\n",
704
+ " dirs = sorted(dirs, key=lambda x: int(x.split(\"-\")[1]))\n",
705
+ " path = dirs[-1]\n",
706
+ " accelerator.print(f\"Resuming from checkpoint {path}\")\n",
707
+ " accelerator.load_state(os.path.join(output_dir, path))\n",
708
+ " global_step = int(path.split(\"-\")[1])\n",
709
+ "\n",
710
+ " resume_global_step = global_step * grad_accum\n",
711
+ " first_epoch = resume_global_step // num_update_steps_per_epoch\n",
712
+ " resume_step = resume_global_step % num_update_steps_per_epoch\n"
713
+ ]
714
+ },
715
+ {
716
+ "cell_type": "code",
717
+ "execution_count": null,
718
+ "metadata": {},
719
+ "outputs": [],
720
+ "source": [
721
+ "if accelerator.is_main_process:\n",
722
+ " model.eval()\n",
723
+ " validation_loss = 0.0\n",
724
+ " num_batches = 0\n",
725
+ " num_samples = 0\n",
726
+ " with torch.no_grad():\n",
727
+ " for val_step, val_batch in enumerate(validation_dataloader):\n",
728
+ " # Similar to training, process the validation batch\n",
729
+ " val_targets = val_batch['semantic_tokens'][:, 1:].contiguous()\n",
730
+ " val_semantic_inputs = val_batch['semantic_tokens'][:, :-1]\n",
731
+ " val_inputs = torch.cat([val_batch['input_ids'], val_semantic_inputs], dim=1)\n",
732
+ "\n",
733
+ " # Forward pass for validation\n",
734
+ " val_logits = model(val_inputs, training=True)\n",
735
+ " val_semantic_logits = val_logits[:, val_batch['input_ids'].size(1):].contiguous()\n",
736
+ "\n",
737
+ " # Calculate the validation loss\n",
738
+ " val_loss = criterion(val_semantic_logits.view(-1, model.config.output_vocab_size), val_targets.view(-1))\n",
739
+ " validation_loss += val_loss.item()\n",
740
+ " num_batches += 1\n",
741
+ " num_samples += val_batch['input_ids'].size(0)\n",
742
+ "\n",
743
+ " average_validation_loss = validation_loss / num_batches\n",
744
+ " logger.info(f\"Validation Loss: {average_validation_loss} over {num_samples} samples and {num_batches} batches.\")\n",
745
+ " print(f\"Validation Loss: {average_validation_loss} over {num_samples} samples and {num_batches} batches.\")"
746
+ ]
747
+ },
748
+ {
749
+ "attachments": {},
750
+ "cell_type": "markdown",
751
+ "metadata": {},
752
+ "source": [
753
+ "# Training"
754
+ ]
755
+ },
756
+ {
757
+ "cell_type": "code",
758
+ "execution_count": null,
759
+ "metadata": {},
760
+ "outputs": [],
761
+ "source": [
762
+ "# Only show the progress bar once on each machine.\n",
763
+ "progress_bar = tqdm(range(global_step, max_train_steps), disable=not accelerator.is_local_main_process)\n",
764
+ "progress_bar.set_description(\"Steps\")\n",
765
+ "\n",
766
+ "for epoch in range(first_epoch, num_train_epochs):\n",
767
+ " model.train()\n",
768
+ " for step, batch in enumerate(train_dataloader):\n",
769
+ " # Skip steps until we reach the resumed step\n",
770
+ " if resume_from_checkpoint and epoch == first_epoch and step < resume_step:\n",
771
+ " if step % grad_accum == 0:\n",
772
+ " progress_bar.update(1)\n",
773
+ " continue\n",
774
+ "\n",
775
+ " with accelerator.accumulate(model):\n",
776
+ " targets = batch['semantic_tokens'][:, 1:].contiguous()\n",
777
+ " \n",
778
+ " # Remove the last semantic token from the inputs since there is no target for it.\n",
779
+ " semantic_inputs = batch['semantic_tokens'][:, :-1]\n",
780
+ "\n",
781
+ " # Combine the text and semantic tokens and feed them into the model.\n",
782
+ " inputs = torch.cat([batch['input_ids'], semantic_inputs], dim=1)\n",
783
+ " logits = model(inputs, training=True)\n",
784
+ "\n",
785
+ " # We're only interested in the logits for the semantic tokens, so we ignore the logits for the input text tokens.\n",
786
+ " semantic_logits = logits[:, batch['input_ids'].size(1):].contiguous()\n",
787
+ "\n",
788
+ " # Compute the loss.\n",
789
+ " loss = criterion(semantic_logits.view(-1, model.config.output_vocab_size), targets.view(-1))\n",
790
+ "\n",
791
+ " accelerator.backward(loss)\n",
792
+ " if accelerator.sync_gradients:\n",
793
+ " params_to_clip = (\n",
794
+ " param for param in model.parameters() if param.requires_grad\n",
795
+ " )\n",
796
+ " accelerator.clip_grad_norm_(params_to_clip, max_grad_norm)\n",
797
+ " optimizer.step()\n",
798
+ " lr_scheduler.step()\n",
799
+ " optimizer.zero_grad()\n",
800
+ "\n",
801
+ " # Checks if the accelerator has performed an optimization step behind the scenes\n",
802
+ " if accelerator.sync_gradients:\n",
803
+ " progress_bar.update(1)\n",
804
+ " global_step += 1\n",
805
+ "\n",
806
+ " if global_step % checkpointing_steps == 0:\n",
807
+ " if accelerator.is_main_process:\n",
808
+ " save_path = os.path.join(output_dir, f\"checkpoint-{global_step}\")\n",
809
+ " accelerator.save_state(save_path)\n",
810
+ " logger.info(f\"Saved state to {save_path}\")\n",
811
+ "\n",
812
+ " logs = {\"loss\": loss.detach().item(), \"lr\": lr_scheduler.get_last_lr()[0]}\n",
813
+ " progress_bar.set_postfix(**logs)\n",
814
+ " accelerator.log(logs, step=global_step)\n",
815
+ "\n",
816
+ " if global_step >= max_train_steps:\n",
817
+ " break\n",
818
+ " \n",
819
+ " accelerator.wait_for_everyone()\n",
820
+ "\n",
821
+ "if accelerator.is_main_process:\n",
822
+ " if lora_dim > 0:\n",
823
+ " model = convert_lora_to_linear_layer(model)\n",
824
+ " # save model\n",
825
+ " accelerator.save(model.state_dict(), os.path.join(output_dir, \"pytorch_model.bin\"))\n",
826
+ "\n",
827
+ " config = model.config.__dict__\n",
828
+ " # save config\n",
829
+ " with open(os.path.join(output_dir, \"config.json\"), \"w\") as f:\n",
830
+ " json.dump(config, f, indent=2)\n",
831
+ "\n",
832
+ "accelerator.end_training()"
833
+ ]
834
+ },
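After training, the merged weights are saved to `output_dir/pytorch_model.bin` with the model config beside it. A minimal sketch for reloading them for inference, assuming the same `_load_model`, `ckpt_path`, `model_type`, `device`, and `output_dir` used earlier in this notebook:

```python
import os
import torch

# Reload the base model/tokenizer exactly as in the Setup section above.
model, tokenizer = _load_model(ckpt_path, device, use_small=False, model_type=model_type)

# Overwrite its weights with the fine-tuned checkpoint saved by this notebook.
state_dict = torch.load(os.path.join(output_dir, "pytorch_model.bin"), map_location=device)
model.load_state_dict(state_dict, strict=False)  # strict=False as a safety net for any renamed keys
model.eval()
```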
835
+ {
836
+ "attachments": {},
837
+ "cell_type": "markdown",
838
+ "metadata": {},
839
+ "source": [
840
+ "# Validation"
841
+ ]
842
+ },
843
+ {
844
+ "cell_type": "code",
845
+ "execution_count": null,
846
+ "metadata": {},
847
+ "outputs": [],
848
+ "source": [
849
+ "if accelerator.is_main_process:\n",
850
+ " model.eval()\n",
851
+ " validation_loss = 0.0\n",
852
+ " num_batches = 0\n",
853
+ " num_samples = 0\n",
854
+ " with torch.no_grad():\n",
855
+ " for val_step, val_batch in enumerate(validation_dataloader):\n",
856
+ " # Similar to training, process the validation batch\n",
857
+ " val_targets = val_batch['semantic_tokens'][:, 1:].contiguous()\n",
858
+ " val_semantic_inputs = val_batch['semantic_tokens'][:, :-1]\n",
859
+ " val_inputs = torch.cat([val_batch['input_ids'], val_semantic_inputs], dim=1)\n",
860
+ "\n",
861
+ " # Forward pass for validation\n",
862
+ " val_logits = model(val_inputs, training=True)\n",
863
+ " val_semantic_logits = val_logits[:, val_batch['input_ids'].size(1):].contiguous()\n",
864
+ "\n",
865
+ " # Calculate the validation loss\n",
866
+ " val_loss = criterion(val_semantic_logits.view(-1, model.config.output_vocab_size), val_targets.view(-1))\n",
867
+ " validation_loss += val_loss.item()\n",
868
+ " num_batches += 1\n",
869
+ " num_samples += val_batch['input_ids'].size(0)\n",
870
+ "\n",
871
+ " average_validation_loss = validation_loss / num_batches\n",
872
+ " logger.info(f\"Validation Loss: {average_validation_loss} over {num_samples} samples and {num_batches} batches.\")\n",
873
+ " print(f\"Validation Loss: {average_validation_loss} over {num_samples} samples and {num_batches} batches.\")"
874
+ ]
875
+ }
876
+ ],
877
+ "metadata": {
878
+ "kernelspec": {
879
+ "display_name": "Python 3",
880
+ "language": "python",
881
+ "name": "python3"
882
+ },
883
+ "language_info": {
884
+ "codemirror_mode": {
885
+ "name": "ipython",
886
+ "version": 3
887
+ },
888
+ "file_extension": ".py",
889
+ "mimetype": "text/x-python",
890
+ "name": "python",
891
+ "nbconvert_exporter": "python",
892
+ "pygments_lexer": "ipython3",
893
+ "version": "3.10.8"
894
+ },
895
+ "orig_nbformat": 4
896
+ },
897
+ "nbformat": 4,
898
+ "nbformat_minor": 2
899
+ }