mrfakename committed on
Commit 775c5c6 · 1 Parent(s): fc3fbd4
.gitattributes DELETED
@@ -1,37 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- prompts/countdown_mono.wav filter=lfs diff=lfs merge=lfs -text
37
- prompts/toaskanymore.wav filter=lfs diff=lfs merge=lfs -text
 
LICENSE DELETED
@@ -1,201 +0,0 @@
1
- Apache License
2
- Version 2.0, January 2004
3
- http://www.apache.org/licenses/
4
-
5
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
-
7
- 1. Definitions.
8
-
9
- "License" shall mean the terms and conditions for use, reproduction,
10
- and distribution as defined by Sections 1 through 9 of this document.
11
-
12
- "Licensor" shall mean the copyright owner or entity authorized by
13
- the copyright owner that is granting the License.
14
-
15
- "Legal Entity" shall mean the union of the acting entity and all
16
- other entities that control, are controlled by, or are under common
17
- control with that entity. For the purposes of this definition,
18
- "control" means (i) the power, direct or indirect, to cause the
19
- direction or management of such entity, whether by contract or
20
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
- outstanding shares, or (iii) beneficial ownership of such entity.
22
-
23
- "You" (or "Your") shall mean an individual or Legal Entity
24
- exercising permissions granted by this License.
25
-
26
- "Source" form shall mean the preferred form for making modifications,
27
- including but not limited to software source code, documentation
28
- source, and configuration files.
29
-
30
- "Object" form shall mean any form resulting from mechanical
31
- transformation or translation of a Source form, including but
32
- not limited to compiled object code, generated documentation,
33
- and conversions to other media types.
34
-
35
- "Work" shall mean the work of authorship, whether in Source or
36
- Object form, made available under the License, as indicated by a
37
- copyright notice that is included in or attached to the work
38
- (an example is provided in the Appendix below).
39
-
40
- "Derivative Works" shall mean any work, whether in Source or Object
41
- form, that is based on (or derived from) the Work and for which the
42
- editorial revisions, annotations, elaborations, or other modifications
43
- represent, as a whole, an original work of authorship. For the purposes
44
- of this License, Derivative Works shall not include works that remain
45
- separable from, or merely link (or bind by name) to the interfaces of,
46
- the Work and Derivative Works thereof.
47
-
48
- "Contribution" shall mean any work of authorship, including
49
- the original version of the Work and any modifications or additions
50
- to that Work or Derivative Works thereof, that is intentionally
51
- submitted to Licensor for inclusion in the Work by the copyright owner
52
- or by an individual or Legal Entity authorized to submit on behalf of
53
- the copyright owner. For the purposes of this definition, "submitted"
54
- means any form of electronic, verbal, or written communication sent
55
- to the Licensor or its representatives, including but not limited to
56
- communication on electronic mailing lists, source code control systems,
57
- and issue tracking systems that are managed by, or on behalf of, the
58
- Licensor for the purpose of discussing and improving the Work, but
59
- excluding communication that is conspicuously marked or otherwise
60
- designated in writing by the copyright owner as "Not a Contribution."
61
-
62
- "Contributor" shall mean Licensor and any individual or Legal Entity
63
- on behalf of whom a Contribution has been received by Licensor and
64
- subsequently incorporated within the Work.
65
-
66
- 2. Grant of Copyright License. Subject to the terms and conditions of
67
- this License, each Contributor hereby grants to You a perpetual,
68
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
- copyright license to reproduce, prepare Derivative Works of,
70
- publicly display, publicly perform, sublicense, and distribute the
71
- Work and such Derivative Works in Source or Object form.
72
-
73
- 3. Grant of Patent License. Subject to the terms and conditions of
74
- this License, each Contributor hereby grants to You a perpetual,
75
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
- (except as stated in this section) patent license to make, have made,
77
- use, offer to sell, sell, import, and otherwise transfer the Work,
78
- where such license applies only to those patent claims licensable
79
- by such Contributor that are necessarily infringed by their
80
- Contribution(s) alone or by combination of their Contribution(s)
81
- with the Work to which such Contribution(s) was submitted. If You
82
- institute patent litigation against any entity (including a
83
- cross-claim or counterclaim in a lawsuit) alleging that the Work
84
- or a Contribution incorporated within the Work constitutes direct
85
- or contributory patent infringement, then any patent licenses
86
- granted to You under this License for that Work shall terminate
87
- as of the date such litigation is filed.
88
-
89
- 4. Redistribution. You may reproduce and distribute copies of the
90
- Work or Derivative Works thereof in any medium, with or without
91
- modifications, and in Source or Object form, provided that You
92
- meet the following conditions:
93
-
94
- (a) You must give any other recipients of the Work or
95
- Derivative Works a copy of this License; and
96
-
97
- (b) You must cause any modified files to carry prominent notices
98
- stating that You changed the files; and
99
-
100
- (c) You must retain, in the Source form of any Derivative Works
101
- that You distribute, all copyright, patent, trademark, and
102
- attribution notices from the Source form of the Work,
103
- excluding those notices that do not pertain to any part of
104
- the Derivative Works; and
105
-
106
- (d) If the Work includes a "NOTICE" text file as part of its
107
- distribution, then any Derivative Works that You distribute must
108
- include a readable copy of the attribution notices contained
109
- within such NOTICE file, excluding those notices that do not
110
- pertain to any part of the Derivative Works, in at least one
111
- of the following places: within a NOTICE text file distributed
112
- as part of the Derivative Works; within the Source form or
113
- documentation, if provided along with the Derivative Works; or,
114
- within a display generated by the Derivative Works, if and
115
- wherever such third-party notices normally appear. The contents
116
- of the NOTICE file are for informational purposes only and
117
- do not modify the License. You may add Your own attribution
118
- notices within Derivative Works that You distribute, alongside
119
- or as an addendum to the NOTICE text from the Work, provided
120
- that such additional attribution notices cannot be construed
121
- as modifying the License.
122
-
123
- You may add Your own copyright statement to Your modifications and
124
- may provide additional or different license terms and conditions
125
- for use, reproduction, or distribution of Your modifications, or
126
- for any such Derivative Works as a whole, provided Your use,
127
- reproduction, and distribution of the Work otherwise complies with
128
- the conditions stated in this License.
129
-
130
- 5. Submission of Contributions. Unless You explicitly state otherwise,
131
- any Contribution intentionally submitted for inclusion in the Work
132
- by You to the Licensor shall be under the terms and conditions of
133
- this License, without any additional terms or conditions.
134
- Notwithstanding the above, nothing herein shall supersede or modify
135
- the terms of any separate license agreement you may have executed
136
- with Licensor regarding such Contributions.
137
-
138
- 6. Trademarks. This License does not grant permission to use the trade
139
- names, trademarks, service marks, or product names of the Licensor,
140
- except as required for reasonable and customary use in describing the
141
- origin of the Work and reproducing the content of the NOTICE file.
142
-
143
- 7. Disclaimer of Warranty. Unless required by applicable law or
144
- agreed to in writing, Licensor provides the Work (and each
145
- Contributor provides its Contributions) on an "AS IS" BASIS,
146
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
- implied, including, without limitation, any warranties or conditions
148
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
- PARTICULAR PURPOSE. You are solely responsible for determining the
150
- appropriateness of using or redistributing the Work and assume any
151
- risks associated with Your exercise of permissions under this License.
152
-
153
- 8. Limitation of Liability. In no event and under no legal theory,
154
- whether in tort (including negligence), contract, or otherwise,
155
- unless required by applicable law (such as deliberate and grossly
156
- negligent acts) or agreed to in writing, shall any Contributor be
157
- liable to You for damages, including any direct, indirect, special,
158
- incidental, or consequential damages of any character arising as a
159
- result of this License or out of the use or inability to use the
160
- Work (including but not limited to damages for loss of goodwill,
161
- work stoppage, computer failure or malfunction, or any and all
162
- other commercial damages or losses), even if such Contributor
163
- has been advised of the possibility of such damages.
164
-
165
- 9. Accepting Warranty or Additional Liability. While redistributing
166
- the Work or Derivative Works thereof, You may choose to offer,
167
- and charge a fee for, acceptance of support, warranty, indemnity,
168
- or other liability obligations and/or rights consistent with this
169
- License. However, in accepting such obligations, You may act only
170
- on Your own behalf and on Your sole responsibility, not on behalf
171
- of any other Contributor, and only if You agree to indemnify,
172
- defend, and hold each Contributor harmless for any liability
173
- incurred by, or claims asserted against, such Contributor by reason
174
- of your accepting any such warranty or additional liability.
175
-
176
- END OF TERMS AND CONDITIONS
177
-
178
- APPENDIX: How to apply the Apache License to your work.
179
-
180
- To apply the Apache License to your work, attach the following
181
- boilerplate notice, with the fields enclosed by brackets "[]"
182
- replaced with your own identifying information. (Don't include
183
- the brackets!) The text should be enclosed in the appropriate
184
- comment syntax for the file format. We also recommend that a
185
- file or class name and description of purpose be included on the
186
- same "printed page" as the copyright notice for easier
187
- identification within third-party archives.
188
-
189
- Copyright 2024 Standard Intelligence PBC
190
-
191
- Licensed under the Apache License, Version 2.0 (the "License");
192
- you may not use this file except in compliance with the License.
193
- You may obtain a copy of the License at
194
-
195
- http://www.apache.org/licenses/LICENSE-2.0
196
-
197
- Unless required by applicable law or agreed to in writing, software
198
- distributed under the License is distributed on an "AS IS" BASIS,
199
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
- See the License for the specific language governing permissions and
201
- limitations under the License.
 
README.md DELETED
@@ -1,13 +0,0 @@
1
- ---
2
- title: hertz-dev
3
- emoji: ⚡
4
- colorFrom: gray
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 5.4.0
8
- app_file: app.py
9
- pinned: false
10
- short_description: Unofficial demo for hertz-dev
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
app.py DELETED
@@ -1,142 +0,0 @@
1
- import torch as T
2
- import torch.nn as nn
3
- import torch.nn.functional as F
4
- import torchaudio
5
- from utils import load_ckpt, print_colored
6
- from tokenizer import make_tokenizer
7
- from model import get_hertz_dev_config
8
- import matplotlib.pyplot as plt
9
- import spaces
10
- import gradio as gr
11
-
12
- device = 'cuda' if T.cuda.is_available() else 'cpu'
13
- #T.cuda.set_device(0)
14
- print(f"Using device: {device}")
15
-
16
- audio_tokenizer = make_tokenizer(device)
17
-
18
- TWO_SPEAKER = False
19
-
20
- model_config = get_hertz_dev_config(is_split=TWO_SPEAKER)
21
-
22
- generator = model_config()
23
- generator = generator.eval().to(T.bfloat16).to(device)
24
-
25
-
26
-
27
- ##############
28
- # Load audio
29
-
30
- def load_and_preprocess_audio(audio_path):
31
- gr.Info("Loading and preprocessing audio...")
32
- # Load audio file
33
- audio_tensor, sr = torchaudio.load(audio_path)
34
- gr.Info(f"Loaded audio shape: {audio_tensor.shape}")
35
-
36
- if TWO_SPEAKER:
37
- if audio_tensor.shape[0] == 1:
38
- gr.Info("Converting mono to stereo...")
39
- audio_tensor = audio_tensor.repeat(2, 1)
40
- gr.Info(f"Stereo audio shape: {audio_tensor.shape}")
41
- else:
42
- if audio_tensor.shape[0] == 2:
43
- gr.Info("Converting stereo to mono...")
44
- audio_tensor = audio_tensor.mean(dim=0).unsqueeze(0)
45
- gr.Info(f"Mono audio shape: {audio_tensor.shape}")
46
-
47
- # Resample to 16kHz if needed
48
- if sr != 16000:
49
- gr.Info(f"Resampling from {sr}Hz to 16000Hz...")
50
- resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
51
- audio_tensor = resampler(audio_tensor)
52
-
53
- # Clip to 5 minutes if needed
54
- max_samples = 16000 * 60 * 5
55
- if audio_tensor.shape[1] > max_samples:
56
- # gr.Info("Clipping audio to 5 minutes...")
57
- raise gr.Error("Maximum prompt is 5 minutes")
58
- # audio_tensor = audio_tensor[:, :max_samples]
59
-
60
- duration_seconds = audio_tensor.shape[1] / 16000  # audio has already been resampled to 16 kHz above
61
-
62
- gr.Info("Audio preprocessing complete!")
63
- return audio_tensor.unsqueeze(0), duration_seconds
64
-
65
- ##############
66
- # Return audio to gradio
67
-
68
- def display_audio(audio_tensor):
69
- audio_tensor = audio_tensor.cpu().squeeze()
70
- if audio_tensor.ndim == 1:
71
- audio_tensor = audio_tensor.unsqueeze(0)
72
- audio_tensor = audio_tensor.float()
73
-
74
- # Make a waveform plot
75
- # plt.figure(figsize=(4, 1))
76
- # plt.plot(audio_tensor.numpy()[0], linewidth=0.5)
77
- # plt.axis('off')
78
- # plt.show()
79
-
80
- # Make an audio player
81
- return (16000, audio_tensor.numpy())
82
-
83
- def get_completion(encoded_prompt_audio, prompt_len):
84
- prompt_len_seconds = prompt_len / 8
85
- gr.Info(f"Prompt length: {prompt_len_seconds:.2f}s")
86
- with T.autocast(device_type='cuda', dtype=T.bfloat16):
87
- completed_audio_batch = generator.completion(
88
- encoded_prompt_audio,
89
- temps=(.8, (0.5, 0.1)), # (token_temp, (categorical_temp, gaussian_temp))
90
- use_cache=True)
91
-
92
- completed_audio = completed_audio_batch
93
- print_colored(f"Decoding completion...", "blue")
94
- if TWO_SPEAKER:
95
- decoded_completion_ch1 = audio_tokenizer.data_from_latent(completed_audio[:, :, :32].bfloat16())
96
- decoded_completion_ch2 = audio_tokenizer.data_from_latent(completed_audio[:, :, 32:].bfloat16())
97
- decoded_completion = T.cat([decoded_completion_ch1, decoded_completion_ch2], dim=0)
98
- else:
99
- decoded_completion = audio_tokenizer.data_from_latent(completed_audio.bfloat16())
100
- gr.Info(f"Decoded completion shape: {decoded_completion.shape}")
101
-
102
- gr.Info("Preparing audio for playback...")
103
-
104
- audio_tensor = decoded_completion.cpu().squeeze()
105
- if audio_tensor.ndim == 1:
106
- audio_tensor = audio_tensor.unsqueeze(0)
107
- audio_tensor = audio_tensor.float()
108
-
109
- if audio_tensor.abs().max() > 1:
110
- audio_tensor = audio_tensor / audio_tensor.abs().max()
111
-
112
- # return audio_tensor[:, max(prompt_len*2000 - 16000, 0):]
113
- return audio_tensor
114
-
115
- @spaces.GPU
116
- def run(audio_path):
117
- prompt_audio, prompt_len_seconds = load_and_preprocess_audio(audio_path)
118
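- # the audio tokenizer works at 8 latent frames per second, so seconds are converted to frames here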
- prompt_len = prompt_len_seconds * 8
119
- gr.Info("Encoding prompt...")
120
- with T.autocast(device_type='cuda', dtype=T.bfloat16):
121
- if TWO_SPEAKER:
122
- encoded_prompt_audio_ch1 = audio_tokenizer.latent_from_data(prompt_audio[:, 0:1].to(device))
123
- encoded_prompt_audio_ch2 = audio_tokenizer.latent_from_data(prompt_audio[:, 1:2].to(device))
124
- encoded_prompt_audio = T.cat([encoded_prompt_audio_ch1, encoded_prompt_audio_ch2], dim=-1)
125
- else:
126
- encoded_prompt_audio = audio_tokenizer.latent_from_data(prompt_audio.to(device))
127
- gr.Info(f"Encoded prompt shape: {encoded_prompt_audio.shape}")
128
- gr.Info("Prompt encoded successfully!")
129
- # num_completions = 10
130
- completion = get_completion(encoded_prompt_audio, prompt_len)
131
- return display_audio(completion)
132
-
133
-
134
-
135
- with gr.Blocks() as demo:
136
- gr.Markdown("# hertz-dev")
137
- inp = gr.Audio(label="Input Audio", type="filepath", interactive=True)
138
- btn = gr.Button("Continue", variant="primary")
139
- out = gr.Audio(label="Output", interactive=False)
140
- btn.click(run, inputs=inp, outputs=out)
141
-
142
- demo.queue().launch()
 
inference.ipynb DELETED
@@ -1,236 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": null,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "%load_ext autoreload\n",
10
- "%autoreload 2"
11
- ]
12
- },
13
- {
14
- "cell_type": "code",
15
- "execution_count": null,
16
- "metadata": {},
17
- "outputs": [],
18
- "source": [
19
- "import torch as T\n",
20
- "import torch.nn as nn\n",
21
- "import torch.nn.functional as F\n",
22
- "import torchaudio\n",
23
- "from utils import load_ckpt, print_colored\n",
24
- "from tokenizer import make_tokenizer\n",
25
- "from model import get_hertz_dev_config\n",
26
- "import matplotlib.pyplot as plt\n",
27
- "from IPython.display import Audio, display\n",
28
- "\n",
29
- "\n",
30
- "# If you get an error like \"undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12\",\n",
31
- "# you need to install PyTorch with the correct CUDA version. Run:\n",
32
- "# `pip3 uninstall torch torchaudio && pip3 install torch torchaudio --index-url https://download.pytorch.org/whl/cu121`\n",
33
- "\n",
34
- "device = 'cuda' if T.cuda.is_available() else 'cpu'\n",
35
- "T.cuda.set_device(0)\n",
36
- "print_colored(f\"Using device: {device}\", \"grey\")"
37
- ]
38
- },
39
- {
40
- "cell_type": "code",
41
- "execution_count": null,
42
- "metadata": {},
43
- "outputs": [],
44
- "source": [
45
- "# If you've already downloaded the model checkpoints, save them in ckpt/.\n",
46
- "# This code will automatically download them if it can't find them.\n",
47
- "audio_tokenizer = make_tokenizer(device)"
48
- ]
49
- },
50
- {
51
- "cell_type": "code",
52
- "execution_count": 7,
53
- "metadata": {},
54
- "outputs": [],
55
- "source": [
56
- "# We have different checkpoints for the single-speaker and two-speaker models\n",
57
- "# Set to True to load and run inference with the two-speaker model\n",
58
- "TWO_SPEAKER = False"
59
- ]
60
- },
61
- {
62
- "cell_type": "code",
63
- "execution_count": null,
64
- "metadata": {},
65
- "outputs": [],
66
- "source": [
67
- "model_config = get_hertz_dev_config(is_split=TWO_SPEAKER)\n",
68
- "\n",
69
- "generator = model_config()\n",
70
- "generator = generator.eval().to(T.bfloat16).to(device)"
71
- ]
72
- },
73
- {
74
- "cell_type": "code",
75
- "execution_count": null,
76
- "metadata": {},
77
- "outputs": [],
78
- "source": [
79
- "def load_and_preprocess_audio(audio_path):\n",
80
- " print_colored(\"Loading and preprocessing audio...\", \"blue\", bold=True)\n",
81
- " # Load audio file\n",
82
- " audio_tensor, sr = torchaudio.load(audio_path)\n",
83
- " print_colored(f\"Loaded audio shape: {audio_tensor.shape}\", \"grey\")\n",
84
- " \n",
85
- " if TWO_SPEAKER:\n",
86
- " if audio_tensor.shape[0] == 1:\n",
87
- " print_colored(\"Converting mono to stereo...\", \"grey\")\n",
88
- " audio_tensor = audio_tensor.repeat(2, 1)\n",
89
- " print_colored(f\"Stereo audio shape: {audio_tensor.shape}\", \"grey\")\n",
90
- " else:\n",
91
- " if audio_tensor.shape[0] == 2:\n",
92
- " print_colored(\"Converting stereo to mono...\", \"grey\")\n",
93
- " audio_tensor = audio_tensor.mean(dim=0).unsqueeze(0)\n",
94
- " print_colored(f\"Mono audio shape: {audio_tensor.shape}\", \"grey\")\n",
95
- " \n",
96
- " # Resample to 16kHz if needed\n",
97
- " if sr != 16000:\n",
98
- " print_colored(f\"Resampling from {sr}Hz to 16000Hz...\", \"grey\")\n",
99
- " resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)\n",
100
- " audio_tensor = resampler(audio_tensor)\n",
101
- " \n",
102
- " # Clip to 5 minutes if needed\n",
103
- " max_samples = 16000 * 60 * 5\n",
104
- " if audio_tensor.shape[1] > max_samples:\n",
105
- " print_colored(\"Clipping audio to 5 minutes...\", \"grey\")\n",
106
- " audio_tensor = audio_tensor[:, :max_samples]\n",
107
- "\n",
108
- " \n",
109
- " print_colored(\"Audio preprocessing complete!\", \"green\")\n",
110
- " return audio_tensor.unsqueeze(0)\n",
111
- "\n",
112
- "def display_audio(audio_tensor):\n",
113
- " audio_tensor = audio_tensor.cpu().squeeze()\n",
114
- " if audio_tensor.ndim == 1:\n",
115
- " audio_tensor = audio_tensor.unsqueeze(0)\n",
116
- " audio_tensor = audio_tensor.float()\n",
117
- "\n",
118
- " # Make a waveform plot\n",
119
- " plt.figure(figsize=(4, 1))\n",
120
- " plt.plot(audio_tensor.numpy()[0], linewidth=0.5)\n",
121
- " plt.axis('off')\n",
122
- " plt.show()\n",
123
- "\n",
124
- " # Make an audio player\n",
125
- " display(Audio(audio_tensor.numpy(), rate=16000))\n",
126
- " print_colored(f\"Audio ready for playback ↑\", \"green\", bold=True)\n",
127
- " \n",
128
- " \n",
129
- "\n",
130
- "# Our model is very prompt-sensitive, so we recommend experimenting with a diverse set of prompts.\n",
131
- "prompt_audio = load_and_preprocess_audio('./prompts/toaskanymore.wav')\n",
132
- "display_audio(prompt_audio)\n",
133
- "prompt_len_seconds = 3\n",
134
- "prompt_len = prompt_len_seconds * 8"
135
- ]
136
- },
137
- {
138
- "cell_type": "code",
139
- "execution_count": null,
140
- "metadata": {},
141
- "outputs": [],
142
- "source": [
143
- "print_colored(\"Encoding prompt...\", \"blue\")\n",
144
- "with T.autocast(device_type='cuda', dtype=T.bfloat16):\n",
145
- " if TWO_SPEAKER:\n",
146
- " encoded_prompt_audio_ch1 = audio_tokenizer.latent_from_data(prompt_audio[:, 0:1].to(device))\n",
147
- " encoded_prompt_audio_ch2 = audio_tokenizer.latent_from_data(prompt_audio[:, 1:2].to(device))\n",
148
- " encoded_prompt_audio = T.cat([encoded_prompt_audio_ch1, encoded_prompt_audio_ch2], dim=-1)\n",
149
- " else:\n",
150
- " encoded_prompt_audio = audio_tokenizer.latent_from_data(prompt_audio.to(device))\n",
151
- "print_colored(f\"Encoded prompt shape: {encoded_prompt_audio.shape}\", \"grey\")\n",
152
- "print_colored(\"Prompt encoded successfully!\", \"green\")"
153
- ]
154
- },
155
- {
156
- "cell_type": "code",
157
- "execution_count": null,
158
- "metadata": {},
159
- "outputs": [],
160
- "source": [
161
- "def get_completion(encoded_prompt_audio, prompt_len):\n",
162
- " prompt_len_seconds = prompt_len / 8\n",
163
- " print_colored(f\"Prompt length: {prompt_len_seconds:.2f}s\", \"grey\")\n",
164
- " print_colored(\"Completing audio...\", \"blue\")\n",
165
- " with T.autocast(device_type='cuda', dtype=T.bfloat16):\n",
166
- " completed_audio_batch = generator.completion(\n",
167
- " encoded_prompt_audio, \n",
168
- " temps=(.8, (0.5, 0.1)), # (token_temp, (categorical_temp, gaussian_temp))\n",
169
- " use_cache=True)\n",
170
- "\n",
171
- " completed_audio = completed_audio_batch\n",
172
- " print_colored(f\"Decoding completion...\", \"blue\")\n",
173
- " if TWO_SPEAKER:\n",
174
- " decoded_completion_ch1 = audio_tokenizer.data_from_latent(completed_audio[:, :, :32].bfloat16())\n",
175
- " decoded_completion_ch2 = audio_tokenizer.data_from_latent(completed_audio[:, :, 32:].bfloat16())\n",
176
- " decoded_completion = T.cat([decoded_completion_ch1, decoded_completion_ch2], dim=0)\n",
177
- " else:\n",
178
- " decoded_completion = audio_tokenizer.data_from_latent(completed_audio.bfloat16())\n",
179
- " print_colored(f\"Decoded completion shape: {decoded_completion.shape}\", \"grey\")\n",
180
- "\n",
181
- " print_colored(\"Preparing audio for playback...\", \"blue\")\n",
182
- "\n",
183
- " audio_tensor = decoded_completion.cpu().squeeze()\n",
184
- " if audio_tensor.ndim == 1:\n",
185
- " audio_tensor = audio_tensor.unsqueeze(0)\n",
186
- " audio_tensor = audio_tensor.float()\n",
187
- "\n",
188
- " if audio_tensor.abs().max() > 1:\n",
189
- " audio_tensor = audio_tensor / audio_tensor.abs().max()\n",
190
- "\n",
191
- " return audio_tensor[:, max(prompt_len*2000 - 16000, 0):]\n",
192
- "\n",
193
- "num_completions = 10\n",
194
- "print_colored(f\"Generating {num_completions} completions...\", \"blue\")\n",
195
- "for _ in range(num_completions):\n",
196
- " completion = get_completion(encoded_prompt_audio, prompt_len)\n",
197
- " display_audio(completion)"
198
- ]
199
- },
200
- {
201
- "cell_type": "code",
202
- "execution_count": null,
203
- "metadata": {},
204
- "outputs": [],
205
- "source": []
206
- },
207
- {
208
- "cell_type": "code",
209
- "execution_count": null,
210
- "metadata": {},
211
- "outputs": [],
212
- "source": []
213
- }
214
- ],
215
- "metadata": {
216
- "kernelspec": {
217
- "display_name": ".venv",
218
- "language": "python",
219
- "name": "python3"
220
- },
221
- "language_info": {
222
- "codemirror_mode": {
223
- "name": "ipython",
224
- "version": 3
225
- },
226
- "file_extension": ".py",
227
- "mimetype": "text/x-python",
228
- "name": "python",
229
- "nbconvert_exporter": "python",
230
- "pygments_lexer": "ipython3",
231
- "version": "3.10.12"
232
- }
233
- },
234
- "nbformat": 4,
235
- "nbformat_minor": 2
236
- }
 
inference_client.py DELETED
@@ -1,161 +0,0 @@
1
- # server.py remains the same as before
2
-
3
- # Updated client.py
4
- import asyncio
5
- import websockets
6
- import sounddevice as sd
7
- import numpy as np
8
- import base64
9
- import queue
10
- import argparse
11
- import requests
12
- import time
13
-
14
- class AudioClient:
15
- def __init__(self, server_url="ws://localhost:8000", token_temp=None, categorical_temp=None, gaussian_temp=None):
16
- # Convert ws:// to http:// for the base URL
17
- self.base_url = server_url.replace("ws://", "http://")
18
- self.server_url = f"{server_url}/audio"
19
-
20
- # Set temperatures if provided
21
- if any(t is not None for t in [token_temp, categorical_temp, gaussian_temp]):
22
- self.set_temperature_and_echo(token_temp, categorical_temp, gaussian_temp)
23
-
24
- # Initialize queues
25
- self.audio_queue = queue.Queue()
26
- self.output_queue = queue.Queue()
27
-
28
- def set_temperature_and_echo(self, token_temp=None, categorical_temp=None, gaussian_temp=None, echo_testing = False):
29
- """Send temperature settings to server"""
30
- params = {}
31
- if token_temp is not None:
32
- params['token_temp'] = token_temp
33
- if categorical_temp is not None:
34
- params['categorical_temp'] = categorical_temp
35
- if gaussian_temp is not None:
36
- params['gaussian_temp'] = gaussian_temp
37
-
38
- response = requests.post(f"{self.base_url}/set_temperature", params=params)
39
- print(response.json()['message'])
40
-
41
- def audio_callback(self, indata, frames, time, status):
42
- """This is called for each audio block"""
43
- if status:
44
- print(status)
45
- # if np.isclose(indata, 0).all():
46
- # raise Exception('Audio input is not working - received all zeros')
47
- # Convert float32 to int16 for efficient transmission
48
- indata_int16 = (indata.copy() * 32767).astype(np.int16)
49
- # indata_int16 = np.zeros_like(indata_int16)
50
- self.audio_queue.put(indata_int16)
51
-
52
- def output_stream_callback(self, outdata, frames, time, status):
53
- """Callback for output stream to get audio data"""
54
- if status:
55
- print(status)
56
-
57
- try:
58
- data = self.output_queue.get_nowait()
59
- data = data.astype(np.float32) / 32767.0
60
- if len(data) < len(outdata):
61
- outdata[:len(data)] = data
62
- outdata[len(data):] = 0
63
- else:
64
- outdata[:] = data[:len(outdata)]
65
- except queue.Empty:
66
- outdata.fill(0)
67
-
68
- async def process_audio(self):
69
- async with websockets.connect(self.server_url) as ws:
70
- while self.running:
71
- if not self.audio_queue.empty():
72
- # Get recorded audio
73
- audio_data = self.audio_queue.get()
74
- print(f'Data from microphone:{audio_data.shape, audio_data.dtype, audio_data.min(), audio_data.max()}')
75
-
76
- # Convert to base64
77
- audio_b64 = base64.b64encode(audio_data.tobytes()).decode('utf-8')
78
-
79
- # Send to server
80
- time_sent = time.time()
81
- await ws.send(f"data:audio/raw;base64,{audio_b64}")
82
-
83
- # Receive processed audio
84
- response = await ws.recv()
85
- response = response.split(",")[1]
86
- time_received = time.time()
87
- print(f"Data sent: {audio_b64[:10]}. Data received: {response[:10]}. Received in {(time_received - time_sent) * 1000:.2f} ms")
88
- processed_audio = np.frombuffer(
89
- base64.b64decode(response),
90
- dtype=np.int16
91
- ).reshape(-1, CHANNELS)
92
- print(f'Data from model:{processed_audio.shape, processed_audio.dtype, processed_audio.min(), processed_audio.max()}')
93
-
94
- self.output_queue.put(processed_audio)
95
-
96
- def start(self):
97
- self.running = True
98
- # Print audio device information
99
- devices = sd.query_devices()
100
- default_input = sd.query_devices(kind='input')
101
- default_output = sd.query_devices(kind='output')
102
-
103
- print("\nAudio Device Configuration:")
104
- print("-" * 50)
105
- print(f"Default Input Device:\n{default_input}\n")
106
- print(f"Default Output Device:\n{default_output}\n")
107
- print("\nAll Available Devices:")
108
- print("-" * 50)
109
- for i, device in enumerate(devices):
110
- print(f"Device {i}:")
111
- print(f"Name: {device['name']}")
112
- print(f"Channels (in/out): {device['max_input_channels']}/{device['max_output_channels']}")
113
- print(f"Sample Rates: {device['default_samplerate']}")
114
- print()
115
- input_device = input("Enter the index of the input device or press enter for default: ")
116
- output_device = input("Enter the index of the output device or press enter for default: ")
117
- if input_device == "":
118
- input_device = default_input['index']
119
- if output_device == "":
120
- output_device = default_output['index']
121
- with sd.InputStream(callback=self.audio_callback,
122
- channels=CHANNELS,
123
- samplerate=SAMPLE_RATE,
124
- device=int(input_device),
125
- blocksize=2000), \
126
- sd.OutputStream(callback=self.output_stream_callback,
127
- channels=CHANNELS,
128
- samplerate=SAMPLE_RATE,
129
- blocksize=2000,
130
- device=int(output_device)):
131
-
132
- asyncio.run(self.process_audio())
133
-
134
- def stop(self):
135
- self.running = False
136
-
137
- if __name__ == "__main__":
138
- parser = argparse.ArgumentParser(description='Audio Client with Temperature Control')
139
- parser.add_argument('--token_temp', '-t1', type=float, help='Token (LM) temperature parameter')
140
- parser.add_argument('--categorical_temp', '-t2', type=float, help='Categorical (VAE) temperature parameter')
141
- parser.add_argument('--gaussian_temp', '-t3', type=float, help='Gaussian (VAE) temperature parameter')
142
- parser.add_argument('--server', '-s', default="ws://localhost:8000",
143
- help='Server URL (default: ws://localhost:8000)')
144
-
145
- args = parser.parse_args()
146
-
147
- # Audio settings
148
- SAMPLE_RATE = 16000
149
- CHANNELS = 1
150
-
151
- client = AudioClient(
152
- server_url=args.server,
153
- token_temp=args.token_temp,
154
- categorical_temp=args.categorical_temp,
155
- gaussian_temp=args.gaussian_temp
156
- )
157
-
158
- try:
159
- client.start()
160
- except KeyboardInterrupt:
161
- client.stop()
 
inference_server.py DELETED
@@ -1,170 +0,0 @@
1
- import time
2
- import numpy as np
3
- from fastapi import FastAPI, WebSocket
4
- from fastapi.middleware.cors import CORSMiddleware
5
- import base64
6
- import uvicorn
7
- import traceback
8
- import numpy as np
9
- import argparse
10
-
11
- import torch as T
12
- import torch.nn.functional as F
13
- import torchaudio
14
-
15
- import os
16
- from typing import Optional
17
-
18
- from utils import print_colored
19
- from model import get_hertz_dev_config
20
-
21
-
22
- parser = argparse.ArgumentParser()
23
-
24
- parser.add_argument('--prompt_path', type=str, default='./prompts/bob_mono.wav', help="""
25
- We highly recommend making your own prompt based on a conversation between you and another person.
26
- bob_mono.wav seems to work better for two-channel than bob_stereo.wav.
27
- """)
28
- args = parser.parse_args()
29
-
30
-
31
- device = 'cuda' if T.cuda.is_available() else 'cpu'
32
- print_colored(f"Using device: {device}", "grey")
33
-
34
- model_config = get_hertz_dev_config(is_split=True)
35
-
36
- model = model_config()
37
- model = model.eval().bfloat16().to(device)
38
-
39
- app = FastAPI()
40
-
41
- app.add_middleware(
42
- CORSMiddleware,
43
- allow_origins=["*"],
44
- allow_credentials=True,
45
- allow_methods=["*"],
46
- allow_headers=["*"],
47
- )
48
-
49
-
50
- # Hyperparams or something.
51
- SAMPLE_RATE = 16000 # Don't change this
52
- TEMPS = (0.8, (0.4, 0.1)) # You can change this, but there's also an endpoint for it.
53
-
54
- class AudioProcessor:
55
- def __init__(self, model, prompt_path):
56
- self.model = model
57
- self.prompt_path = prompt_path
58
- self.initialize_state(prompt_path)
59
-
60
- def initialize_state(self, prompt_path):
61
- loaded_audio, sr = torchaudio.load(prompt_path)
62
-
63
- if sr != SAMPLE_RATE:
64
- resampler = torchaudio.transforms.Resample(sr, SAMPLE_RATE)
65
- loaded_audio = resampler(loaded_audio)
66
-
67
- if loaded_audio.shape[0] == 1:
68
- loaded_audio = loaded_audio.repeat(2, 1)
69
-
70
- audio_length = loaded_audio.shape[-1]
71
- num_chunks = audio_length // 2000
72
- loaded_audio = loaded_audio[..., :num_chunks * 2000]
73
-
74
- self.loaded_audio = loaded_audio.to(device)
75
-
76
- with T.autocast(device_type=device, dtype=T.bfloat16), T.inference_mode():
77
- self.model.init_cache(bsize=1, device=device, dtype=T.bfloat16, length=1024)
78
- self.next_model_audio = self.model.next_audio_from_audio(self.loaded_audio.unsqueeze(0), temps=TEMPS)
79
- self.prompt_buffer = None
80
- self.prompt_position = 0
81
- self.chunks_until_live = 24
82
- self.initialize_prompt_buffer()
83
- print_colored("AudioProcessor state initialized", "green")
84
-
85
- def initialize_prompt_buffer(self):
86
- self.recorded_audio = self.loaded_audio
87
- prompt_audio = self.loaded_audio.reshape(1, 2, -1)
88
- prompt_audio = prompt_audio[:, :, -48000:].cpu().numpy()
89
- prompt_audio_mono = prompt_audio.mean(axis=1)
90
- self.prompt_buffer = np.array_split(prompt_audio_mono[0], 24)
91
- print_colored(f"Initialized prompt buffer with {len(self.prompt_buffer)} chunks", "grey")
92
-
93
- async def process_audio(self, audio_data):
94
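- # Warm-up phase: replay the last ~3 s of the prompt (24 chunks of 2000 samples at 16 kHz) before switching to live generation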
- if self.chunks_until_live > 0:
95
- print_colored(f"Serving from prompt buffer, {self.chunks_until_live} chunks left", "grey")
96
- chunk = self.prompt_buffer[24 - self.chunks_until_live]
97
- self.chunks_until_live -= 1
98
-
99
- if self.chunks_until_live == 0:
100
- print_colored("Switching to live processing mode", "green")
101
-
102
- time.sleep(0.05)
103
- return chunk
104
-
105
- audio_tensor = T.from_numpy(audio_data).to(device)
106
- audio_tensor = audio_tensor.reshape(1, 1, -1)
107
- audio_tensor = T.cat([audio_tensor, self.next_model_audio], dim=1)
108
-
109
- with T.autocast(device_type=device, dtype=T.bfloat16), T.inference_mode():
110
- curr_model_audio = self.model.next_audio_from_audio(
111
- audio_tensor,
112
- temps=TEMPS
113
- )
114
- print(f"Recorded audio shape {self.recorded_audio.shape}, audio tensor shape {audio_tensor.shape}")
115
- self.recorded_audio = T.cat([self.recorded_audio.cpu(), audio_tensor.squeeze(0).cpu()], dim=-1)
116
-
117
- self.next_model_audio = curr_model_audio
118
-
119
- return curr_model_audio.float().cpu().numpy()
120
-
121
- def cleanup(self):
122
- print_colored("Cleaning up audio processor...", "blue")
123
- os.makedirs('audio_recordings', exist_ok=True)
124
- torchaudio.save(f'audio_recordings/{time.strftime("%d-%H-%M")}.wav', self.recorded_audio.cpu(), SAMPLE_RATE)
125
- self.model.deinit_cache()
126
- self.initialize_state(self.prompt_path)
127
- print_colored("Audio processor cleanup complete", "green")
128
-
129
- @app.post("/set_temperature")
130
- async def set_temperature(token_temp: Optional[float] = None, categorical_temp: Optional[float] = None, gaussian_temp: Optional[float] = None):
131
- try:
132
- global TEMPS
133
- TEMPS = (token_temp, (categorical_temp, gaussian_temp))
134
-
135
- print_colored(f"Temperature updated to: {TEMPS}", "green")
136
- return {"message": f"Temperature updated to: {TEMPS}", "status": "success"}
137
- except Exception as e:
138
- print_colored(f"Error setting temperature: {str(e)}", "red")
139
- return {"message": f"Error setting temperature: {str(e)}", "status": "error"}
140
-
141
- @app.websocket("/audio")
142
- async def websocket_endpoint(websocket: WebSocket):
143
- await websocket.accept()
144
- try:
145
- while True:
146
- data = await websocket.receive_text()
147
- audio_data = np.frombuffer(
148
- base64.b64decode(data.split(",")[1]),
149
- dtype=np.int16
150
- )
151
- audio_data = audio_data.astype(np.float32) / 32767.0
152
- processed_audio = await audio_processor.process_audio(audio_data)
153
- processed_audio = (processed_audio * 32767).astype(np.int16)
154
-
155
- processed_data = base64.b64encode(processed_audio.tobytes()).decode('utf-8')
156
- await websocket.send_text(f"data:audio/raw;base64,{processed_data}")
157
-
158
- except Exception as e:
159
- print_colored(f"WebSocket error: {e}", "red")
160
- print_colored(f"Full traceback:\n{traceback.format_exc()}", "red")
161
- finally:
162
- audio_processor.cleanup()
163
- await websocket.close()
164
-
165
-
166
- audio_processor = AudioProcessor(model=model, prompt_path=args.prompt_path)
167
-
168
- if __name__ == "__main__":
169
- uvicorn.run(app, host="0.0.0.0", port=8000)
170
- print("Server started")
 
ioblocks.py DELETED
@@ -1,333 +0,0 @@
1
- from __future__ import annotations
2
- from functools import partial
3
- from contextlib import nullcontext
4
- from typing import List, Tuple
5
- from math import ceil
6
-
7
- import torch as T
8
- import torch.nn as nn
9
- import torch.nn.functional as F
10
- import torch.distributed as dist
11
- from torch import Tensor, int32
12
- from torch.amp import autocast
13
-
14
- from einops import rearrange, pack, unpack
15
-
16
-
17
- from utils import si_module, exists, default, maybe
18
-
19
-
20
- @si_module
21
- class GaussianMixtureIOLayer(nn.Module):
22
- class Config:
23
- latent_dim: int
24
- dim: int
25
- num_components: int
26
-
27
- def __init__(self, c: Config):
28
- super().__init__()
29
- self.latent_dim = c.latent_dim
30
- self.num_components = c.num_components
31
- self.input_projection = nn.Linear(c.latent_dim, c.dim)
32
-
33
- self.fc_loc = nn.Linear(c.dim, c.num_components * c.latent_dim)
34
- self.fc_scale = nn.Linear(c.dim, c.num_components * c.latent_dim)
35
- self.fc_weight = nn.Linear(c.dim, c.num_components)
36
-
37
- def _square_plus(self, x):
38
- return (x + T.sqrt(T.square(x) + 4)) / 2
39
-
40
- def input(self, sampled_latents: T.Tensor) -> T.Tensor:
41
- """Pre-sampled latents T.Tensor (B, L, Z) -> float tensor (B, L, D)"""
42
- hidden = self.input_projection(sampled_latents)
43
- return hidden
44
-
45
- def output(self, h: T.Tensor) -> Tuple[T.Tensor, T.Tensor, T.Tensor]:
46
- """float tensor (B, L, D) -> Tuple of locs, scales, and weights"""
47
- batch_size, seq_len, _ = h.shape
48
-
49
- locs = self.fc_loc(h).view(batch_size, seq_len, self.num_components, self.latent_dim)
50
- scales = T.clamp(self._square_plus(self.fc_scale(h)), min=1e-6).view(batch_size, seq_len, self.num_components, self.latent_dim)
51
- weights = self.fc_weight(h).view(batch_size, seq_len, self.num_components)
52
-
53
- return (locs, scales, weights)
54
-
55
- def loss(self, data, dataHat):
56
- locs, scales, weights = dataHat
57
- log_probs = -0.5 * T.sum(
58
- (data.unsqueeze(-2) - locs).pow(2) / scales.pow(2) +
59
- 2 * T.log(scales) +
60
- T.log(T.tensor(2 * T.pi)),
61
- dim=-1
62
- )
63
- log_weights = F.log_softmax(weights, dim=-1)
64
- return -T.logsumexp(log_weights + log_probs, dim=-1)
65
-
66
-
67
- def temp_sample(self, orig_pdist, temp):
68
- locs, scales, weights = orig_pdist
69
- if temp is None:
70
- component_samples = locs + scales * T.randn_like(scales)
71
- mixture_samples = F.gumbel_softmax(weights, hard=True)
72
- sampled = (component_samples * mixture_samples.unsqueeze(-1)).sum(dim=-2)
73
- elif isinstance(temp, tuple):
74
- assert len(temp) == 2
75
- categorical_temp, gaussian_temp = temp
76
- component_samples = locs + scales * gaussian_temp * T.randn_like(scales)
77
- mixture_samples = F.gumbel_softmax(weights / categorical_temp, hard=True)
78
- sampled = (component_samples * mixture_samples.unsqueeze(-1)).sum(dim=-2)
79
- else:
80
- component_samples = locs + scales * temp * T.randn_like(scales)
81
- mixture_samples = F.gumbel_softmax(weights / temp, hard=True)
82
- sampled = (component_samples * mixture_samples.unsqueeze(-1)).sum(dim=-2)
83
- return sampled
84
-
85
-
86
- class GPTOutput(nn.Module):
87
- def __init__(self, dim, vocab_size):
88
- super().__init__()
89
- self.output = nn.Linear(dim, vocab_size, bias=False)
90
-
91
- def forward(self, x):
92
- return self.output(x)
93
-
94
-
95
- # helper functions
96
-
97
- def pack_one(t, pattern):
98
- return pack([t], pattern)
99
-
100
- def unpack_one(t, ps, pattern):
101
- return unpack(t, ps, pattern)[0]
102
-
103
- def first(l):
104
- return l[0]
105
-
106
- def round_up_multiple(num, mult):
107
- return ceil(num / mult) * mult
108
-
109
- def get_code_utilization(codes, codebook_size, get_global=False):
110
- if get_global and dist.is_initialized():
111
- world_size = dist.get_world_size()
112
- else:
113
- world_size = 1
114
-
115
- if world_size > 1:
116
- gathered_tokens = [T.zeros_like(codes) for _ in range(world_size)]
117
- dist.all_gather(gathered_tokens, codes)
118
- gathered_tokens = T.cat(gathered_tokens, dim=0)
119
- else:
120
- gathered_tokens = codes
121
- unique_tokens = len(T.unique(gathered_tokens))
122
- code_utilization = unique_tokens / min(gathered_tokens.numel(), codebook_size)
123
- return code_utilization
124
-
125
- # tensor helpers
126
-
127
- def round_ste(z: Tensor) -> Tensor:
128
- """Round with straight through gradients."""
129
- zhat = z.round()
130
- return z + (zhat - z).detach()
131
-
132
- # main class
133
- # lucidrains fsq
134
- @si_module
135
- class FSQ(nn.Module):
136
- @property
137
- def needs_float32_params(self):
138
- return True
139
-
140
- class Config:
141
- levels: List[int]
142
- dim: int | None = None
143
- num_codebooks: int = 1
144
- keep_num_codebooks_dim: bool | None = None
145
- scale: float | None = None
146
- allowed_dtypes: Tuple[str, ...] = ('float32', 'float64')
147
- channel_first: bool = False
148
- projection_has_bias: bool = True
149
- return_indices: bool = True
150
- force_quantization_f32: bool = True
151
- use_rms: bool = False
152
-
153
- def __init__(self, c: Config):
154
- super().__init__()
155
- _levels = T.tensor(c.levels, dtype=int32)
156
- self.register_buffer("_levels", _levels, persistent = False)
157
-
158
- _basis = T.cumprod(T.tensor([1] + c.levels[:-1]), dim=0, dtype=int32)
159
- self.register_buffer("_basis", _basis, persistent = False)
160
-
161
- self.scale = c.scale
162
-
163
- codebook_dim = len(c.levels)
164
- self.codebook_dim = codebook_dim
165
-
166
- effective_codebook_dim = codebook_dim * c.num_codebooks
167
- self.num_codebooks = c.num_codebooks
168
-
169
- self.allowed_dtypes = []
170
- for dtype_str in c.allowed_dtypes:
171
- if hasattr(T, dtype_str):
172
- self.allowed_dtypes.append(getattr(T, dtype_str))
173
- else:
174
- raise ValueError(f"Invalid dtype string: {dtype_str}")
175
-
176
- self.effective_codebook_dim = effective_codebook_dim
177
-
178
- keep_num_codebooks_dim = default(c.keep_num_codebooks_dim, c.num_codebooks > 1)
179
- assert not (c.num_codebooks > 1 and not keep_num_codebooks_dim)
180
- self.keep_num_codebooks_dim = keep_num_codebooks_dim
181
-
182
- self.dim = default(c.dim, len(_levels) * c.num_codebooks)
183
-
184
- self.channel_first = c.channel_first
185
-
186
- has_projections = self.dim != effective_codebook_dim
187
- self.project_in = nn.Linear(self.dim, effective_codebook_dim, bias = c.projection_has_bias) if has_projections else nn.Identity()
188
- self.project_out = nn.Linear(effective_codebook_dim, self.dim, bias = c.projection_has_bias) if has_projections else nn.Identity()
189
-
190
- self.has_projections = has_projections
191
-
192
- self.return_indices = c.return_indices
193
- if c.return_indices:
194
- self.codebook_size = self._levels.prod().item()
195
- implicit_codebook = self._indices_to_codes(T.arange(self.codebook_size))
196
- self.register_buffer("implicit_codebook", implicit_codebook, persistent = False)
197
-
198
- self.allowed_dtypes = c.allowed_dtypes
199
- self.force_quantization_f32 = c.force_quantization_f32
200
-
201
- self.latent_loss = None
202
-
203
- def latent_metric(self, codes, get_global=False):
204
- return {'code_util_estimate': get_code_utilization(codes, self.codebook_size, get_global)}
205
-
206
- def repr_from_latent(self, latent):
207
- return self.indices_to_codes(latent)
208
-
209
- def bound(self, z, eps: float = 1e-3):
210
- """ Bound `z`, an array of shape (..., d). """
211
- half_l = (self._levels - 1) * (1 + eps) / 2
212
- offset = T.where(self._levels % 2 == 0, 0.5, 0.0)
213
- shift = (offset / half_l).atanh()
214
- return (z + shift).tanh() * half_l - offset
215
-
216
- def quantize(self, z):
217
- """ Quantizes z, returns quantized zhat, same shape as z. """
218
- quantized = round_ste(self.bound(z))
219
- half_width = self._levels // 2 # Renormalize to [-1, 1].
220
- return quantized / half_width
221
-
222
- def _scale_and_shift(self, zhat_normalized):
223
- half_width = self._levels // 2
224
- return (zhat_normalized * half_width) + half_width
225
-
226
- def _scale_and_shift_inverse(self, zhat):
227
- half_width = self._levels // 2
228
- return (zhat - half_width) / half_width
229
-
230
- def _indices_to_codes(self, indices):
231
- level_indices = self.indices_to_level_indices(indices)
232
- codes = self._scale_and_shift_inverse(level_indices)
233
- return codes
234
-
235
- def codes_to_indices(self, zhat):
236
- """ Converts a `code` to an index in the codebook. """
237
- assert zhat.shape[-1] == self.codebook_dim
238
- zhat = self._scale_and_shift(zhat)
239
- return (zhat * self._basis).sum(dim=-1).to(int32)
240
-
241
- def indices_to_level_indices(self, indices):
242
- """ Converts indices to indices at each level, perhaps needed for a transformer with factorized embeddings """
243
- indices = rearrange(indices, '... -> ... 1')
244
- codes_non_centered = (indices // self._basis) % self._levels
245
- return codes_non_centered
246
-
247
- def indices_to_codes(self, indices):
248
- """ Inverse of `codes_to_indices`. """
249
- assert exists(indices)
250
-
251
- is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim))
252
-
253
- codes = self._indices_to_codes(indices)
254
-
255
- if self.keep_num_codebooks_dim:
256
- codes = rearrange(codes, '... c d -> ... (c d)')
257
-
258
- codes = self.project_out(codes)
259
-
260
- if is_img_or_video or self.channel_first:
261
- codes = rearrange(codes, 'b ... d -> b d ...')
262
-
263
- return codes
264
-
265
- # @autocast(device_type='cuda', enabled = False)
266
- def forward(self, z, return_codes=False):
267
- """
268
- einstein notation
269
- b - batch
270
- n - sequence (or flattened spatial dimensions)
271
- d - feature dimension
272
- c - number of codebook dim
273
- """
274
-
275
- is_img_or_video = z.ndim >= 4
276
- need_move_channel_last = is_img_or_video or self.channel_first
277
-
278
- # standardize image or video into (batch, seq, dimension)
279
-
280
- if need_move_channel_last:
281
- z = rearrange(z, 'b d ... -> b ... d')
282
- z, ps = pack_one(z, 'b * d')
283
-
284
- assert z.shape[-1] == self.dim, f'expected dimension of {self.dim} but found dimension of {z.shape[-1]}'
285
-
286
- z = self.project_in(z)
287
-
288
- z = rearrange(z, 'b n (c d) -> b n c d', c = self.num_codebooks)
289
-
290
- # whether to force quantization step to be full precision or not
291
-
292
- force_f32 = self.force_quantization_f32
293
- quantization_context = partial(autocast, device_type='cuda', enabled = False) if force_f32 else nullcontext
294
-
295
- with quantization_context():
296
- orig_dtype = z.dtype
297
-
298
- if force_f32 and orig_dtype not in self.allowed_dtypes:
299
- z = z.float()
300
-
301
- codes = self.quantize(z)
302
-
303
- # returning indices could be optional
304
-
305
- indices = None
306
-
307
- if self.return_indices:
308
- indices = self.codes_to_indices(codes)
309
-
310
- codes = rearrange(codes, 'b n c d -> b n (c d)')
311
-
312
- codes = codes.type(orig_dtype)
313
-
314
- # project out
315
- if return_codes:
316
- return codes, indices
317
-
318
- out = self.project_out(codes)
319
-
320
- # reconstitute image or video dimensions
321
-
322
- if need_move_channel_last:
323
- out = unpack_one(out, ps, 'b * d')
324
- out = rearrange(out, 'b ... d -> b d ...')
325
-
326
- indices = maybe(unpack_one)(indices, ps, 'b * c')
327
-
328
- if not self.keep_num_codebooks_dim and self.return_indices:
329
- indices = maybe(rearrange)(indices, '... 1 -> ...')
330
-
331
- # return quantized output and indices
332
-
333
- return out, indices
 
model.py DELETED
@@ -1,441 +0,0 @@
1
- from typing import Optional, Tuple
2
-
3
- import torch as T
4
- import torch.nn as nn
5
- import torch.nn.functional as F
6
-
7
- from ioblocks import GaussianMixtureIOLayer, FSQ
8
-
9
- from transformer import Stack, ShapeRotator, Block as PerfBlock, GPTOutput, CACHE_FILL_VALUE, FFNN, Norm
10
- from tokenizer import make_tokenizer
11
-
12
-
13
- from utils import si_module, exists, isnt, tqdm0, print0, default, print0_colored
14
- from utils import load_ckpt
15
-
16
-
17
- @si_module
18
- class LatentQuantizer(nn.Module):
19
- class Config:
20
- compressor_config: Optional[FSQ.Config] = None
21
-
22
- dim: Optional[int] = None
23
- ff_dim: Optional[int] = None
24
- input_dim: int = None
25
-
26
- from_pretrained: Optional[Tuple[str, str]] = None
27
-
28
- def __init__(self, c: Config):
29
- super().__init__()
30
-
31
- if exists(c.from_pretrained):
32
- checkpoint = load_ckpt(*c.from_pretrained)
33
- else:
34
- assert exists(c.compressor_config), f'hmm {c}'
35
-
36
- self.compressor = c.compressor_config()
37
- self.ffnn = FFNN(c.dim, c.ff_dim)
38
- self.input = nn.Linear(c.input_dim, c.dim) if exists(c.input_dim) else nn.Identity()
39
-
40
- if exists(c.from_pretrained):
41
- self.load_state_dict(checkpoint)
42
-
43
- @T.no_grad()
44
- def forward(self, x, return_latent=False, known_latent=None):
45
- """
46
- x: (B, S, D)
47
- """
48
- if exists(known_latent):
49
- return self.compressor.indices_to_codes(known_latent)
50
-
51
- x = self.input(x)
52
- x = self.ffnn(x)
53
- x, tokens = self.compressor(x)
54
-
55
- if return_latent:
56
- return x, tokens
57
- return x
58
-
59
-
60
- @si_module
61
- class TransformerVAE(nn.Module):
62
- class Config:
63
- io_config: Optional[GaussianMixtureIOLayer.Config] = None
64
- stack_config: Optional[Stack.Config] = None
65
- quantizer_config: Optional[LatentQuantizer.Config] = None
66
-
67
- plex_layer: int = None
68
- plex_roll: int = 1
69
- split: bool = True
70
-
71
- from_pretrained: Optional[Tuple[str, str]] = None
72
-
73
- def __init__(self, c: Config):
74
- super().__init__()
75
-
76
- if exists(c.from_pretrained):
77
- checkpoint = load_ckpt(*c.from_pretrained)
78
- else:
79
- assert (exists(c.io_config) and exists(c.stack_config) and exists(c.quantizer_config)), f'hmm {c}'
80
-
81
- self.io = c.io_config()
82
- self.stack = c.stack_config()
83
-
84
- self.plex_layer = c.stack_config.layers//2
85
- self.plex_roll = c.plex_roll
86
- self.plex_dim = c.quantizer_config.dim
87
-
88
- assert self.plex_dim is not None and c.stack_config.dim is not None, f'One of the following are None: self.plex_dim: {self.plex_dim}, c.stack_config.dim: {c.stack_config.dim}'
89
- self.plex_projection = nn.Linear(self.plex_dim, c.stack_config.dim)
90
- self.out_norm = Norm(c.stack_config.dim)
91
-
92
- if c.split:
93
- self.io2 = c.io_config()
94
- self.plex_projection2 = nn.Linear(self.plex_dim, c.stack_config.dim)
95
-
96
- self.io2.fc_loc = None
97
- self.io2.fc_scale = None
98
- self.io2.fc_weight = None
99
-
100
- kv_heads = c.stack_config.kv_heads or c.stack_config.n_head
101
- head_dim = c.stack_config.dim // c.stack_config.n_head
102
- self.cache_num_layers = c.stack_config.layers + ((c.stack_config.layers - self.plex_layer) if c.split else 0)
103
- cache_shape = [self.cache_num_layers, c.stack_config.seq_len, 2, kv_heads, head_dim]
104
- self.cache_shape = cache_shape
105
- self.cache = [None] * self.cache_num_layers
106
-
107
- if exists(c.from_pretrained):
108
- result = self.load_state_dict(checkpoint, strict=False)
109
- print0_colored(result, 'yellow')
110
-
111
- self.quantizer = c.quantizer_config().eval()
112
- self.quantizer.requires_grad = False
113
-
114
- @T.no_grad()
115
- def quantize(self, x):
116
- if self.c.split:
117
- x1, x2 = x.chunk(2, dim=-1)
118
- with T.autocast(device_type='cuda', dtype=T.bfloat16):
119
- quantized1 = self.quantizer(x1)
120
- quantized2 = self.quantizer(x2)
121
- return quantized1, quantized2
122
- else:
123
- with T.autocast(device_type='cuda', dtype=T.bfloat16):
124
- return self.quantizer(x)
125
-
126
- @T.no_grad()
127
- def untokenize(self, token_data):
128
- return self.quantizer(None, known_latent=token_data)
129
-
130
- def init_cache(self, bsize, device, dtype, length:int=None):
131
- cache_shape = self.cache_shape.copy()
132
- cache_shape[1] = length or cache_shape[1]
133
- self.cache = T.full((bsize, *cache_shape), CACHE_FILL_VALUE, device=device, dtype=dtype).transpose(0, 1)
134
-
135
- def deinit_cache(self):
136
- self.cache = [None] * self.cache_num_layers
137
-
138
- @T.no_grad()
139
- def forward(self, data, next_tokens: Optional[Tuple[T.Tensor, T.Tensor]] = None, temps: Optional[Tuple[float, Tuple[float, float]]] = None):
140
- if self.c.split:
141
- x1, x2 = data.chunk(2, dim=-1)
142
- x = self.io.input(x1) + self.io2.input(x2)
143
- else:
144
- x = self.io.input(data)
145
-
146
- cache_idx = 0
147
- for l, layer in enumerate(self.stack.layers):
148
- if l == self.plex_layer:
149
- if self.c.split:
150
- plex1, plex2 = self.quantize(data)
151
- plex1 = T.roll(plex1, -self.c.plex_roll, dims=1)
152
- plex2 = T.roll(plex2, -self.c.plex_roll, dims=1)
153
- if exists(next_tokens):
154
- plex1[:, -1:] = self.untokenize(next_tokens[0])
155
- plex2[:, -1:] = self.untokenize(next_tokens[1])
156
- x1 = x + self.plex_projection(plex1)
157
- x2 = x + self.plex_projection2(plex2)
158
- else:
159
- plex = self.quantize(data)
160
- plex = T.roll(plex, -self.c.plex_roll, dims=1)
161
- if exists(next_tokens):
162
- plex[:, -1:] = self.untokenize(next_tokens)
163
- x = x + self.plex_projection(plex)
164
-
165
- if l < self.plex_layer:
166
- x = layer(x, kv=self.cache[l])
167
- else:
168
- if self.c.split:
169
- x1 = layer(x1, kv=self.cache[self.plex_layer + cache_idx])
170
- cache_idx += 1
171
- x2 = layer(x2, kv=self.cache[self.plex_layer + cache_idx])
172
- cache_idx += 1
173
- else:
174
- x = layer(x, kv=self.cache[l])
175
-
176
- with T.autocast(device_type='cuda', dtype=T.bfloat16):
177
- if self.c.split:
178
- x1, x2 = self.out_norm(x1), self.out_norm(x2)
179
- out1, out2 = self.io.output(x1), self.io.output(x2)
180
- else:
181
- x = self.out_norm(x)
182
- out = self.io.output(x)
183
-
184
- if isnt(temps):
185
- if self.c.split:
186
- return out1, out2
187
- else:
188
- return out
189
- else:
190
- if self.c.split:
191
- next_data1 = self.io.temp_sample(out1, temps)[:, -1:, :]
192
- next_data2 = self.io2.temp_sample(out2, temps)[:, -1:, :]
193
- next_data = T.cat([next_data1, next_data2], dim=-1)
194
- return next_data
195
- else:
196
- next_data = self.io.temp_sample(out, temps)[:, -1:, :]
197
- return next_data
198
-
199
- @si_module
200
- class HertzDevModel(nn.Module):
201
- class Config:
202
- dim: int
203
- vocab_size: int
204
- stack_config: Optional[Stack.Config] = None
205
- latent_size: int = 32
206
-
207
- split: bool = True
208
-
209
- quantizer_config: Optional[LatentQuantizer.Config] = None
210
- resynthesizer_config: Optional[TransformerVAE.Config] = None
211
-
212
- from_pretrained: Optional[Tuple[str, str]] = None
213
-
214
- def __init__(self, c: Config):
215
- super().__init__()
216
-
217
- if exists(c.from_pretrained):
218
- checkpoint = load_ckpt(*c.from_pretrained)
219
- else:
220
- assert (exists(c.stack_config)), f'hmm {c}'
221
-
222
- self.input = nn.Linear(c.latent_size, c.dim)
223
- if self.c.split:
224
- self.input2 = nn.Linear(c.latent_size, c.dim)
225
-
226
- self.shape_rotator = ShapeRotator(c.stack_config.dim//c.stack_config.n_head, c.stack_config.seq_len, theta=c.stack_config.theta)
227
-
228
- self.layers = nn.ModuleList([
229
- PerfBlock(
230
- dim=c.stack_config.dim,
231
- layer_id=l,
232
- n_head=c.stack_config.n_head,
233
- kv_heads=c.stack_config.kv_heads,
234
- ff_dim=c.stack_config.ff_dim,
235
- eps=c.stack_config.eps,
236
- shape_rotator=self.shape_rotator,
237
- ) for l in range(c.stack_config.layers)
238
- ])
239
-
240
- self.output = GPTOutput(c.dim, c.vocab_size)
241
- if self.c.split:
242
- self.output2 = GPTOutput(c.dim, c.vocab_size)
243
-
244
- self.cache = [None] * c.stack_config.layers
245
- self.kv_heads = c.stack_config.kv_heads or c.stack_config.n_head
246
- self.head_dim = c.stack_config.dim // c.stack_config.n_head
247
-
248
- if exists(c.from_pretrained):
249
- result = self.load_state_dict(checkpoint, strict=False)
250
- print0_colored(result, 'yellow')
251
-
252
- self.resynthesizer = c.resynthesizer_config().eval()
253
- self.resynthesizer.requires_grad = False
254
-
255
- self.audio_tokenizer = make_tokenizer(device='cpu')
256
- self.audio_cache = None
257
- self.audio_latent_cache = None
258
- self.use_audio_cache = False
259
-
260
- @T.no_grad()
261
- def tokenize(self, audio_data):
262
- orig_audio_shape = audio_data.shape
263
- if exists(self.audio_cache):
264
- audio_data = T.cat([self.audio_cache, audio_data], dim=-1)
265
- self.audio_cache = audio_data[..., -(6*16_000):]
266
- elif self.use_audio_cache:
267
- self.audio_cache = audio_data[..., -(6*16_000):]
268
-
269
- if audio_data.shape[1] == 2:
270
- enc_ch1 = self.audio_tokenizer.latent_from_data(audio_data[:, 0:1])
271
- enc_ch2 = self.audio_tokenizer.latent_from_data(audio_data[:, 1:2])
272
- return T.cat([enc_ch1, enc_ch2], dim=-1)[:, -(orig_audio_shape[-1]//2000):]
273
- else:
274
- return self.audio_tokenizer.latent_from_data(audio_data)[:, -(orig_audio_shape[-1]//2000):]
275
-
276
- @T.no_grad()
277
- def untokenize(self, token_data):
278
- if exists(self.audio_latent_cache):
279
- token_data = T.cat([self.audio_latent_cache, token_data], dim=1)
280
- self.audio_latent_cache = token_data[:, -(6*8):]
281
- elif self.use_audio_cache:
282
- self.audio_latent_cache = token_data[:, -(6*8):]
283
-
284
- if token_data.shape[-1] == 2*self.c.latent_size:
285
- dec_ch1 = self.audio_tokenizer.data_from_latent(token_data[:, :self.c.latent_size])
286
- dec_ch2 = self.audio_tokenizer.data_from_latent(token_data[:, self.c.latent_size:])
287
- return T.cat([dec_ch1, dec_ch2], dim=1)[..., -(token_data.shape[1]*2000):]
288
- else:
289
- return self.audio_tokenizer.data_from_latent(token_data)[..., -(token_data.shape[1]*2000):]
290
-
291
- def init_cache(self, bsize, device, dtype, length:int=None):
292
- cache_shape = [self.c.stack_config.layers, length or self.c.stack_config.seq_len, 2, self.kv_heads, self.head_dim]
293
- self.cache = T.full((bsize, *cache_shape), CACHE_FILL_VALUE, device=device, dtype=dtype).transpose(0, 1)
294
- self.resynthesizer.init_cache(bsize, device, dtype, length)
295
- self.use_audio_cache = True
296
-
297
- def deinit_cache(self):
298
- self.cache = [None] * len(self.layers)
299
- self.resynthesizer.deinit_cache()
300
- self.audio_cache = None
301
- self.audio_latent_cache = None
302
- self.use_audio_cache = False
303
-
304
- @T.no_grad()
305
- def forward(self, data):
306
- if self.c.split:
307
- x1, x2 = data.chunk(2, dim=-1)
308
- x = self.input(x1) + self.input2(x2)
309
- else:
310
- x = self.input(data)
311
-
312
- for l, layer in enumerate(self.layers):
313
- x = layer(x, kv=self.cache[l])
314
-
315
- if self.c.split:
316
- return self.output(x), self.output2(x)
317
- else:
318
- return self.output(x)
319
-
320
- @T.no_grad()
321
- def next_audio_from_audio(self, audio_data: T.Tensor, temps=(0.8, (0.5, 0.1))):
322
- latents_in = self.tokenize(audio_data)
323
- next_latents = self.next_latent(latents_in, temps)
324
- next_model_latent = next_latents[..., self.c.latent_size:]
325
- audio_decoded = self.untokenize(next_model_latent)[..., -2000:]
326
- return audio_decoded
327
-
328
-
329
- @T.no_grad()
330
- def next_latent(self, model_input: T.Tensor, temps=(0.8, (0.5, 0.1))):
331
-
332
- if self.c.split:
333
- logits1, logits2 = self.forward(model_input)
334
- next_logits1 = logits1[:, -1]
335
- next_logits2 = logits2[:, -1]
336
- next_token1 = F.softmax(next_logits1 / temps[0], dim=-1).multinomial(1)
337
- next_token2 = F.softmax(next_logits2 / temps[0], dim=-1).multinomial(1)
338
-
339
- next_input = self.resynthesizer(model_input, next_tokens=(next_token1, next_token2), temps=temps[1])
340
- else:
341
- logits = self.forward(model_input)
342
- next_logits = logits[:, -1]
343
- next_token = F.softmax(next_logits / temps[0], dim=-1).multinomial(1)
344
-
345
- next_input = self.resynthesizer(model_input, next_tokens=next_token, temps=temps[1])
346
-
347
- return next_input
348
-
349
-
350
- @T.no_grad()
351
- def completion(self, data: T.Tensor, temps=(0.8, (0.5, 0.1)), gen_len=None, use_cache=True) -> T.Tensor:
352
- """
353
- only accepts latent-space data.
354
- """
355
- if use_cache:
356
- self.init_cache(data.shape[0], data.device, T.bfloat16)
357
-
358
- next_input = generated = data
359
-
360
- target_len = min(data.shape[1] + default(gen_len, data.shape[1]), self.c.stack_config.seq_len)
361
-
362
- for _ in tqdm0(range(data.shape[1], target_len)):
363
- model_input = next_input if use_cache else generated
364
-
365
- next_input = self.next_latent(model_input, temps)
366
-
367
- generated = T.cat([generated, next_input], dim=1)
368
-
369
- if use_cache:
370
- self.deinit_cache()
371
- return generated
372
-
373
-
374
-
375
- def get_hertz_dev_config(is_split=True):
376
- if is_split:
377
- checkpoints = [('inference_care_50000', 'e4ff4fe5c7e9f066410d2a5673b7a935'), ('inference_scion_54000', 'cb8bc484423922747b277ebc2933af5d')]
378
- else:
379
- checkpoints = [('inference_whip_72000', '5e7cee7316900737d55fc5d44cc7a8f7'), ('inference_caraway_112000', 'fcb8368ef8ebf7712f3e31e6856da580')]
380
-
381
- quantizer_config=LatentQuantizer.Config(
382
- from_pretrained=('inference_volcano_3', 'd42bf674022c5f84b051d5d7794f6169'),
383
- compressor_config=FSQ.Config(
384
- levels=[8,8,8,8,8],
385
- dim=2048,
386
- num_codebooks=1,
387
- keep_num_codebooks_dim=None,
388
- scale=None,
389
- allowed_dtypes=['float32', 'float64', 'bfloat16'],
390
- channel_first=False,
391
- projection_has_bias=True,
392
- return_indices=True,
393
- force_quantization_f32=True,
394
- use_rms=False
395
- ),
396
- dim=2048,
397
- ff_dim=8192,
398
- input_dim=32
399
- )
400
-
401
- resynthesizer_config=TransformerVAE.Config(
402
- io_config=GaussianMixtureIOLayer.Config(
403
- latent_dim=32,
404
- dim=4096,
405
- num_components=8,
406
- ),
407
- stack_config=Stack.Config(
408
- layers=8,
409
- dim=4096,
410
- seq_len=8192,
411
- n_head=16,
412
- ff_dim=11008,
413
- kv_heads=16,
414
- eps=1e-5,
415
- theta=10_000
416
- ),
417
- quantizer_config=quantizer_config,
418
- plex_layer=None,
419
- plex_roll=1,
420
- split=is_split,
421
- from_pretrained=checkpoints[0],
422
- )
423
-
424
- return HertzDevModel.Config(
425
- dim=4096,
426
- vocab_size=32_768,
427
- stack_config=Stack.Config(
428
- layers=32,
429
- dim=4096,
430
- seq_len=2048,
431
- n_head=32,
432
- ff_dim=None,
433
- kv_heads=None,
434
- eps=1e-5,
435
- theta=10_000,
436
- ),
437
- quantizer_config=quantizer_config,
438
- resynthesizer_config=resynthesizer_config,
439
- split=is_split,
440
- from_pretrained=checkpoints[1],
441
- )
 
 
prompts/bob_duo.wav DELETED
Binary file (554 kB)
 
prompts/bob_mono.wav DELETED
Binary file (560 kB)
 
prompts/countdown_mono.wav DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:5f22399fd8039c043758ef527a588f608394f761395a437ffc02077fa0dca517
- size 1664746
 
 
 
 
prompts/toaskanymore.wav DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:5fa50bbee5e0f5a952f1b3d8afa2c051e00530b1c9040fcf45e7803a89ce6881
- size 1088044
 
 
 
 
requirements.txt DELETED
@@ -1,14 +0,0 @@
- torch
- torchaudio
- einops
- tqdm
- IPython
- numpy
- soundfile
- websockets
- requests
- sounddevice
- matplotlib
- fastapi
- uvicorn
- argparse
 
 
tokenizer.py DELETED
@@ -1,581 +0,0 @@
1
- import math
2
- from dataclasses import dataclass
3
- from typing import Union, Tuple, Literal
4
-
5
- import torch as T
6
- import torch.nn as nn
7
- from torch.nn.utils.parametrizations import weight_norm
8
-
9
- from utils import load_ckpt
10
- from utils.interp import print_colored
11
- from utils import si_module, get_activation
12
-
13
-
14
-
15
- # Adapted from https://github.com/facebookresearch/AudioDec
16
-
17
- def Conv1d1x1(in_channels, out_channels, bias=True):
18
- return nn.Conv1d(in_channels, out_channels, kernel_size=1, bias=bias)
19
-
20
-
21
- class NonCausalConv1d(nn.Module):
22
- """1D noncausal convloution w/ 2-sides padding."""
23
-
24
- def __init__(
25
- self,
26
- in_channels,
27
- out_channels,
28
- kernel_size,
29
- stride=1,
30
- padding=-1,
31
- dilation=1,
32
- groups=1,
33
- bias=True):
34
- super().__init__()
35
- self.in_channels = in_channels
36
- self.out_channels = out_channels
37
- self.kernel_size = kernel_size
38
- if padding < 0:
39
- padding = (kernel_size - 1) // 2 * dilation
40
- self.dilation = dilation
41
- self.conv = nn.Conv1d(
42
- in_channels=in_channels,
43
- out_channels=out_channels,
44
- kernel_size=kernel_size,
45
- stride=stride,
46
- padding=padding,
47
- dilation=dilation,
48
- groups=groups,
49
- bias=bias,
50
- )
51
-
52
- def forward(self, x):
53
- """
54
- Args:
55
- x (Tensor): Float tensor variable with the shape (B, C, T).
56
- Returns:
57
- Tensor: Float tensor variable with the shape (B, C, T).
58
- """
59
- x = self.conv(x)
60
- return x
61
-
62
-
63
- class NonCausalConvTranspose1d(nn.Module):
64
- """1D noncausal transpose convloution."""
65
-
66
- def __init__(
67
- self,
68
- in_channels,
69
- out_channels,
70
- kernel_size,
71
- stride,
72
- padding=-1,
73
- output_padding=-1,
74
- groups=1,
75
- bias=True,
76
- ):
77
- super().__init__()
78
- if padding < 0:
79
- padding = (stride+1) // 2
80
- if output_padding < 0:
81
- output_padding = 1 if stride % 2 else 0
82
- self.deconv = nn.ConvTranspose1d(
83
- in_channels=in_channels,
84
- out_channels=out_channels,
85
- kernel_size=kernel_size,
86
- stride=stride,
87
- padding=padding,
88
- output_padding=output_padding,
89
- groups=groups,
90
- bias=bias,
91
- )
92
-
93
- def forward(self, x):
94
- """
95
- Args:
96
- x (Tensor): Float tensor variable with the shape (B, C, T).
97
- Returns:
98
- Tensor: Float tensor variable with the shape (B, C', T').
99
- """
100
- x = self.deconv(x)
101
- return x
102
-
103
-
104
- class CausalConv1d(NonCausalConv1d):
105
- def __init__(
106
- self,
107
- in_channels,
108
- out_channels,
109
- kernel_size,
110
- stride=1,
111
- dilation=1,
112
- groups=1,
113
- bias=True
114
- ):
115
- super(CausalConv1d, self).__init__(
116
- in_channels=in_channels,
117
- out_channels=out_channels,
118
- kernel_size=kernel_size,
119
- stride=stride,
120
- padding=0,
121
- dilation=dilation,
122
- groups=groups,
123
- bias=bias,
124
- )
125
- self.stride = stride
126
- self.pad_length = (kernel_size - 1) * dilation
127
- def forward(self, x):
128
- pad = nn.ConstantPad1d((self.pad_length, 0), 0.0)
129
- x = pad(x)
130
- return self.conv(x)
131
-
132
-
133
- class CausalConvTranspose1d(NonCausalConvTranspose1d):
134
- def __init__(
135
- self,
136
- in_channels,
137
- out_channels,
138
- kernel_size,
139
- stride,
140
- bias=True,
141
- pad_buffer=None,
142
- ):
143
- super(CausalConvTranspose1d, self).__init__(
144
- in_channels=in_channels,
145
- out_channels=out_channels,
146
- kernel_size=kernel_size,
147
- stride=stride,
148
- padding=0,
149
- output_padding=0,
150
- bias=bias,
151
- )
152
- self.stride = stride
153
- self.pad_length = (math.ceil(kernel_size/stride) - 1)
154
- if pad_buffer is None:
155
- pad_buffer = T.zeros(1, in_channels, self.pad_length)
156
- self.register_buffer("pad_buffer", pad_buffer)
157
-
158
- def forward(self, x):
159
- pad = nn.ReplicationPad1d((self.pad_length, 0))
160
- x = pad(x)
161
- return self.deconv(x)[:, :, self.stride : -self.stride]
162
-
163
- def inference(self, x):
164
- x = T.cat((self.pad_buffer, x), -1)
165
- self.pad_buffer = x[:, :, -self.pad_length:]
166
- return self.deconv(x)[:, :, self.stride : -self.stride]
167
-
168
- def reset_buffer(self):
169
- self.pad_buffer.zero_()
170
-
171
-
172
- class NonCausalResUnit(nn.Module):
173
- def __init__(
174
- self,
175
- in_channels,
176
- out_channels,
177
- kernel_size=7,
178
- dilation=1,
179
- bias=False,
180
- ):
181
- super().__init__()
182
- self.activation = nn.ELU()
183
- self.conv1 = NonCausalConv1d(
184
- in_channels=in_channels,
185
- out_channels=out_channels,
186
- kernel_size=kernel_size,
187
- stride=1,
188
- dilation=dilation,
189
- bias=bias,
190
- )
191
- self.conv2 = Conv1d1x1(out_channels, out_channels, bias)
192
-
193
- def forward(self, x):
194
- y = self.conv1(self.activation(x))
195
- y = self.conv2(self.activation(y))
196
- return x + y
197
-
198
-
199
- class CausalResUnit(NonCausalResUnit):
200
- def __init__(
201
- self,
202
- in_channels,
203
- out_channels,
204
- kernel_size=7,
205
- dilation=1,
206
- bias=False,
207
- ):
208
- super(CausalResUnit, self).__init__(
209
- in_channels=in_channels,
210
- out_channels=out_channels,
211
- kernel_size=kernel_size,
212
- dilation=dilation,
213
- bias=bias,
214
- )
215
- self.conv1 = CausalConv1d(
216
- in_channels=in_channels,
217
- out_channels=out_channels,
218
- kernel_size=kernel_size,
219
- stride=1,
220
- dilation=dilation,
221
- bias=bias,
222
- )
223
-
224
- def inference(self, x):
225
- y = self.conv1.inference(self.activation(x))
226
- y = self.conv2(self.activation(y))
227
- return x + y
228
-
229
-
230
- class ResNetBlock(nn.Module):
231
- def __init__(self,
232
- in_channels,
233
- out_channels,
234
- stride,
235
- kernel_size=7,
236
- dilations=(1, 3, 9),
237
- bias=True,
238
- mode='encoder',
239
- ):
240
- super().__init__()
241
- assert mode in ('encoder', 'decoder'), f"Mode ({mode}) is not supported!"
242
-
243
- self.mode = mode
244
- self.stride = stride
245
-
246
- ConvUnit = CausalConv1d if mode == 'encoder' else CausalConvTranspose1d
247
-
248
- res_channels = in_channels if mode == 'encoder' else out_channels
249
-
250
- res_units = [CausalResUnit(
251
- res_channels,
252
- res_channels,
253
- kernel_size=kernel_size,
254
- dilation=dilation,
255
- ) for dilation in dilations]
256
-
257
- if in_channels == out_channels:
258
- if mode == 'encoder':
259
- self.pool = nn.AvgPool1d(kernel_size=stride, stride=stride)
260
- if mode == 'decoder':
261
- self.upsample = nn.Upsample(scale_factor=stride, mode='nearest')
262
- conv_unit = nn.Conv1d(
263
- in_channels=in_channels,
264
- out_channels=out_channels,
265
- kernel_size=1,
266
- bias=bias,
267
- ) if in_channels != out_channels else nn.Identity()
268
- else:
269
- conv_unit = ConvUnit(
270
- in_channels=in_channels,
271
- out_channels=out_channels,
272
- kernel_size=(2 * stride),
273
- stride=stride,
274
- bias=bias,
275
- )
276
-
277
- if mode == 'encoder':
278
- if in_channels == out_channels:
279
- self.res_block = nn.Sequential(*res_units, self.pool, conv_unit)
280
- else:
281
- self.res_block = nn.Sequential(*res_units, conv_unit)
282
- elif mode == 'decoder':
283
- if in_channels == out_channels:
284
- self.res_block = nn.Sequential(self.upsample, conv_unit, *res_units)
285
- else:
286
- self.res_block = nn.Sequential(conv_unit, *res_units)
287
-
288
- def forward(self, x):
289
- out = x
290
- for unit in self.res_block:
291
- out = unit(out)
292
- return out
293
-
294
- def inference(self, x):
295
- for unit in self.res_block:
296
- x = unit.inference(x)
297
- return x
298
-
299
-
300
-
301
-
302
- @si_module
303
- class ResNetStack(nn.Module):
304
- """
305
- ResNet encoder or decoder stack. Channel ratios
306
- and strides take the default order of from
307
- data/io-layer, to the middle of the model.
308
- """
309
- class Config:
310
- input_channels: int = 1
311
- output_channels: int = 1
312
- encode_channels: int = 32
313
- decode_channel_multiplier: int = 1
314
- latent_dim: int = None
315
- kernel_size: int = 7
316
- bias: bool = True
317
- channel_ratios: Tuple[int, ...] = (2, 4, 8, 16)
318
- strides: Tuple[int, ...] = (3, 4, 5, 5)
319
- mode: Literal['encoder', 'decoder'] = 'encoder'
320
-
321
- def __init__(self, c: Config):
322
- super().__init__()
323
- assert c.mode in ('encoder', 'decoder'), f"Mode ({c.mode}) is not supported!"
324
-
325
- self.mode = c.mode
326
-
327
- assert len(c.channel_ratios) == len(c.strides)
328
- channel_ratios = (1,) + c.channel_ratios
329
- strides = c.strides
330
- self.middle_channels = c.encode_channels * channel_ratios[-1]
331
- if c.mode == 'decoder':
332
- channel_ratios = tuple(reversed(channel_ratios))
333
- strides = tuple(reversed(strides))
334
-
335
- self.multiplier = c.decode_channel_multiplier if c.mode == 'decoder' else 1
336
- res_blocks = [ResNetBlock(
337
- c.encode_channels * channel_ratios[s_idx] * self.multiplier,
338
- c.encode_channels * channel_ratios[s_idx+1] * self.multiplier,
339
- stride,
340
- kernel_size=c.kernel_size,
341
- bias=c.bias,
342
- mode=c.mode,
343
- ) for s_idx, stride in enumerate(strides)]
344
-
345
- data_conv = CausalConv1d(
346
- in_channels=c.input_channels if c.mode == 'encoder' else c.encode_channels * self.multiplier,
347
- out_channels=c.encode_channels if c.mode == 'encoder' else c.output_channels,
348
- kernel_size=c.kernel_size,
349
- stride=1,
350
- bias=False,
351
- )
352
-
353
- if c.mode == 'encoder':
354
- self.res_stack = nn.Sequential(data_conv, *res_blocks)
355
- elif c.mode == 'decoder':
356
- self.res_stack = nn.Sequential(*res_blocks, data_conv)
357
-
358
- if c.latent_dim is not None:
359
- self.latent_proj = Conv1d1x1(self.middle_channels, c.latent_dim, bias=c.bias) if c.mode == 'encoder' else Conv1d1x1(c.latent_dim, self.middle_channels, bias=c.bias)
360
- if self.multiplier != 1:
361
- self.multiplier_proj = Conv1d1x1(self.middle_channels, self.middle_channels * self.multiplier, bias=c.bias)
362
-
363
- def forward(self, x, return_feats=False):
364
- if self.c.latent_dim is not None and self.mode == 'decoder':
365
- x = self.latent_proj(x)
366
- if self.multiplier != 1:
367
- x = self.multiplier_proj(x)
368
-
369
- feats = []
370
- for block in self.res_stack:
371
- x = block(x)
372
- if return_feats:
373
- feats.append(x)
374
- if self.c.latent_dim is not None and self.mode == 'encoder':
375
- x = self.latent_proj(x)
376
- if return_feats:
377
- feats.append(x)
378
- if return_feats:
379
- return feats
380
- return x
381
-
382
- def inference(self, x):
383
- for block in self.res_stack:
384
- x = block.inference(x)
385
- return x
386
-
387
- def reset_buffer(self):
388
- def _reset_buffer(m):
389
- if isinstance(m, CausalConv1d) or isinstance(m, CausalConvTranspose1d):
390
- m.reset_buffer()
391
- self.apply(_reset_buffer)
392
-
393
- def reset_parameters(self):
394
- def _reset_parameters(m):
395
- if isinstance(m, (nn.Conv1d, nn.ConvTranspose1d)):
396
- m.weight.data.normal_(0.0, 0.01)
397
-
398
- self.apply(_reset_parameters)
399
-
400
-
401
- def apply_weight_norm(self):
402
- def _apply_weight_norm(m):
403
- if isinstance(m, nn.Conv1d) or isinstance(
404
- m, nn.ConvTranspose1d
405
- ):
406
- nn.utils.parametrizations.weight_norm(m)
407
-
408
- self.apply(_apply_weight_norm)
409
-
410
-
411
- def remove_weight_norm(self):
412
- def _remove_weight_norm(m):
413
- try:
414
- print(m)
415
- nn.utils.remove_weight_norm(m)
416
- except ValueError: # this module didn't have weight norm
417
- return
418
-
419
- self.apply(_remove_weight_norm)
420
-
421
-
422
-
423
- @si_module
424
- class GaussianZ(nn.Module):
425
- class Config:
426
- dim: int
427
- latent_dim: int
428
- bias: bool = False
429
- use_weight_norm: bool = False
430
-
431
- def __init__(self, c: Config):
432
- super().__init__()
433
-
434
- self.proj_in = nn.Linear(c.dim, c.latent_dim * 2, bias=c.bias)
435
- self.proj_out = nn.Linear(c.latent_dim, c.dim, bias=c.bias)
436
-
437
- if c.use_weight_norm:
438
- self.proj_in = weight_norm(self.proj_in)
439
- self.proj_out = weight_norm(self.proj_out)
440
-
441
- def reparam(self, mu, logvar):
442
- std = T.exp(logvar / 2)
443
- eps = T.randn_like(std)
444
- return mu + eps * std
445
-
446
- def kl_divergence(self, mu, logvar):
447
- return T.mean(-0.5 * T.sum(
448
- 1 + logvar - mu.pow(2) - logvar.exp(),
449
- dim=(1, 2))
450
- )
451
-
452
- def repr_from_latent(self, latent: Union[dict, T.Tensor]):
453
- if isinstance(latent, T.Tensor):
454
- z = latent
455
- else:
456
- z = self.reparam(latent['mu'], latent['logvar'])
457
- l = self.proj_out(z)
458
- return l
459
-
460
- def forward(self, x: T.Tensor) -> Tuple[T.Tensor, dict]:
461
- mu, logvar = self.proj_in(x).chunk(2, dim=-1)
462
- kl_div = self.kl_divergence(mu, logvar)
463
- z = self.reparam(mu, logvar)
464
- xhat = self.proj_out(z)
465
- latent = {'mu': mu, 'logvar': logvar, 'z': z, 'kl_divergence': kl_div}
466
- return xhat, latent
467
-
468
-
469
-
470
- @si_module
471
- class WaveCodec(nn.Module):
472
- class Config:
473
- resnet_config: ResNetStack.Config = None
474
- sample_rate: int = 16_000
475
- use_weight_norm: bool = False
476
-
477
- compressor_config: dataclass = None
478
-
479
- norm_stddev: float = 1.0
480
-
481
- def __init__(self, c: Config):
482
- super().__init__()
483
- self.norm_stddev = c.norm_stddev
484
- self.encoder = c.resnet_config(mode='encoder')
485
- self.sample_rate = c.sample_rate
486
-
487
- self.total_stride = 1
488
- for stride in c.resnet_config.strides:
489
- self.total_stride *= stride
490
- self.tokens_per_second = self.sample_rate / self.total_stride
491
-
492
- self.compressor = c.compressor_config(dim=self.encoder.middle_channels)
493
-
494
- self.decoder = c.resnet_config(mode='decoder')
495
-
496
- if c.use_weight_norm:
497
- self.encoder.apply_weight_norm()
498
- self.decoder.apply_weight_norm()
499
- self.encoder.reset_parameters()
500
- self.decoder.reset_parameters()
501
-
502
- def encode(self, data):
503
- return self.encoder(data/self.norm_stddev)
504
-
505
- def decode(self, latent):
506
- return self.decoder(latent.transpose(1, 2))*self.norm_stddev
507
-
508
- @T.no_grad()
509
- def latent_from_data(self, data, get_parameters=False):
510
- x = self.encode(data)
511
- l_in = x.transpose(1, 2)
512
- l, latent = self.compressor(l_in)
513
- return latent['z'] if not get_parameters else {
514
- 'mu': latent['mu'],
515
- 'logvar': latent['logvar'],
516
- 'z': latent['z'],
517
- }
518
-
519
- @T.no_grad()
520
- def data_from_latent(self, latent):
521
- l = self.compressor.repr_from_latent(latent)
522
- x = self.decode(l)
523
- return x
524
-
525
- def process(self, x):
526
- return self.latent_from_data(x)
527
-
528
- def unprocess(self, latent):
529
- return self.data_from_latent(latent)
530
-
531
- def forward(self, audio_input):
532
- x = self.encode(audio_input)
533
-
534
- l_in = x.transpose(1, 2)
535
- l, latent = self.compressor(l_in)
536
-
537
- xhat = self.decode(l)
538
- return xhat, latent
539
-
540
-
541
-
542
- def make_tokenizer(device='cuda'):
543
- generator_config = WaveCodec.Config(
544
- resnet_config=ResNetStack.Config(
545
- input_channels=1,
546
- output_channels=1,
547
- encode_channels=16,
548
- decode_channel_multiplier=4,
549
- kernel_size=7,
550
- bias=True,
551
- channel_ratios=(4, 8, 16, 16, 16, 16),
552
- strides=(2, 2, 4, 5, 5, 5),
553
- mode=None,
554
- ),
555
- use_weight_norm=True,
556
-
557
- compressor_config=GaussianZ.Config(
558
- dim=None,
559
- latent_dim=32,
560
-
561
- bias=True,
562
- use_weight_norm=True
563
- ),
564
-
565
- norm_stddev=0.05,
566
- )
567
- checkpoint = load_ckpt("inference_apatosaurus_95000", expected_hash="ba876edb97b988e9196e449dd176ca97")
568
-
569
- tokenizer = generator_config()
570
-
571
- load_result = tokenizer.load_state_dict(checkpoint, strict=False)
572
- print_colored(f"Loaded tokenizer state dict: {load_result}", "grey")
573
-
574
- tokenizer = tokenizer.eval()
575
- # Only convert to bfloat16 if using CUDA
576
- if device == 'cuda':
577
- tokenizer = tokenizer.bfloat16()
578
- tokenizer = tokenizer.to(device)
579
- tokenizer.requires_grad_ = False
580
- return tokenizer
581
-
 
 
transformer.py DELETED
@@ -1,382 +0,0 @@
1
- from typing import Optional, Tuple, MutableMapping
2
- from typing import Union
3
- import math
4
- from contextlib import nullcontext
5
-
6
- import torch
7
- import torch as T
8
- import torch.nn as nn
9
- import torch.nn.functional as F
10
- from torch import Tensor
11
- from torch.nn.attention import SDPBackend
12
-
13
- from einops import rearrange
14
-
15
- from utils import si_module, default, exists, load_ckpt
16
-
17
- CACHE_FILL_VALUE = -1
18
-
19
- def get_cache_len(cache: Optional[Tensor]) -> int:
20
- """
21
- cache: (batch, seq_len, 2, kv_heads, head_dim)
22
- """
23
- if cache is None:
24
- return 0
25
- nonzeros = T.any(cache.flatten(2) != CACHE_FILL_VALUE, dim=-1)
26
- length = nonzeros.sum(dim=-1).int()
27
- assert T.all(length == length[0])
28
- return length[0]
29
-
30
-
31
- def rotate_half(x):
32
- x1, x2 = x.chunk(2, dim=-1)
33
- return torch.cat((-x2, x1), dim=-1)
34
-
35
-
36
- def apply_rotary_pos_emb(x, cos, sin, offset: int = 0):
37
- assert (
38
- cos.shape[1] >= offset + x.shape[1]
39
- ), f"Offset and/or input sequence is too large,\
40
- \n offset: {offset}, seq_len: {x.shape[1]}, max: {cos.shape[1]}"
41
-
42
- cos_out = cos[:, offset : offset + x.shape[1], :, :]
43
- sin_out = sin[:, offset : offset + x.shape[1], :, :]
44
-
45
- return (x * cos_out) + (rotate_half(x) * sin_out)
46
-
47
-
48
- # Adapted from https://github.com/foundation-model-stack/foundation-model-stack
49
- class ShapeRotator:
50
- def __init__(
51
- self,
52
- dim: int,
53
- end: int,
54
- theta: float = 10_000,
55
- ):
56
- super().__init__()
57
- self.dim = dim
58
- self.ratio = theta
59
- self.cached_freqs: MutableMapping[int, MutableMapping[int, torch.Tensor]] = {}
60
- self.max_seq_len_cached: MutableMapping[int, int] = {}
61
- self.ntk_scaling = False
62
- self.max_seq_len = end
63
-
64
- def compute_freqs_cis(self, device, max_seq_len=None):
65
- alpha = 1
66
- dev_idx = device.index
67
- max_seq_len = default(max_seq_len, self.max_seq_len)
68
-
69
- if dev_idx not in self.cached_freqs:
70
- self.cached_freqs[dev_idx] = {}
71
- if dev_idx not in self.max_seq_len_cached:
72
- self.max_seq_len_cached[dev_idx] = 0
73
-
74
-
75
- if self.max_seq_len_cached[dev_idx] > 0:
76
- return 1
77
- max_seq_len = max(max_seq_len, self.max_seq_len)
78
-
79
- if (
80
- 1 in self.cached_freqs[dev_idx]
81
- and max_seq_len <= self.max_seq_len_cached[dev_idx]
82
- ):
83
- return 1
84
-
85
- ratio = self.ratio
86
- dim = self.dim
87
-
88
- freqs = 1.0 / (ratio ** (torch.arange(0, dim, 2, device=device).float() / dim))
89
-
90
- t = torch.arange(max_seq_len, device=device, dtype=freqs.dtype)
91
- freqs = torch.einsum("i,j->ij", t, freqs)
92
- emb = torch.cat((freqs, freqs), dim=-1).to(device)
93
-
94
- cos_to_cache = emb.cos()[None, :, None, :]
95
- sin_to_cache = emb.sin()[None, :, None, :]
96
-
97
- self.max_seq_len_cached[dev_idx] = max_seq_len
98
-
99
- self.cached_freqs[dev_idx][alpha] = torch.stack(
100
- [
101
- cos_to_cache,
102
- sin_to_cache,
103
- ],
104
- dim=-1,
105
- )
106
-
107
- return alpha
108
-
109
- def rotate(
110
- self,
111
- q: Tensor,
112
- k: Tensor,
113
- offset: int = 0,
114
- ) -> Tuple[Tensor, Tensor]:
115
- """
116
- Args
117
- ----
118
- q : torch.Tensor
119
- Embedded query tensor, expected size is B x S x H x Eh
120
- k : torch.Tensor
121
- Embedded key tensor, expected size is B x S x H x Eh
122
- """
123
- assert len(q.size()) == 4
124
- assert len(k.size()) == 4
125
-
126
- seq_len = self.max_seq_len
127
- alpha = self.compute_freqs_cis(q.device, seq_len)
128
- freqs = self.cached_freqs[q.device.index][alpha]
129
-
130
- freqs = freqs.float() # 1 L D/2 2 2
131
- q_out = apply_rotary_pos_emb(q, freqs[..., 0], freqs[..., 1], offset=offset).type_as(q)
132
- k_out = apply_rotary_pos_emb(k, freqs[..., 0], freqs[..., 1], offset=offset).type_as(k)
133
-
134
- return q_out.view_as(q), k_out.view_as(k)
135
-
136
- class Linear(nn.Linear):
137
- def __init__(self, *args, **kwargs):
138
- super().__init__(*args, **kwargs, bias=False)
139
-
140
- class Norm(nn.Module):
141
- def __init__(self,
142
- dim: int,
143
- eps: float = 1e-5,) -> None:
144
- super().__init__()
145
- self.eps = eps
146
- self.weight = nn.Parameter(T.ones((dim,)))
147
-
148
- def forward(self, input: Tensor) -> Tensor:
149
- return F.layer_norm(input, (self.weight.shape[0],), weight=self.weight, bias=None, eps=self.eps)
150
-
151
-
152
- class FFNN(nn.Module):
153
- def __init__(self,
154
- dim: int,
155
- expand_dim: int = None,):
156
- super().__init__()
157
- expand_dim = default(expand_dim, 256 * ((int(2 * 4 * dim / 3) + 256 - 1) // 256))
158
- self.dim = dim
159
- self.expand_dim = expand_dim
160
-
161
- self.gateup_proj = Linear(dim, 2*expand_dim)
162
- self.down_proj = Linear(expand_dim, dim)
163
-
164
- def forward(self, x):
165
- gate, up = self.gateup_proj(x).chunk(2, dim=-1)
166
- return self.down_proj(up * F.silu(gate))
167
-
168
- class GQA(nn.Module):
169
- def __init__(self,
170
- dim: int,
171
- n_head: int,
172
- shape_rotator: ShapeRotator,
173
- kv_heads: Optional[int] = None,
174
- eps: float = 1e-5,
175
- causal: bool = True,):
176
- super().__init__()
177
- self.n_heads = n_head
178
- self.kv_heads = default(kv_heads, n_head)
179
- self.head_dim = dim // n_head
180
- self.causal = causal
181
-
182
- self.proj_qkv = Linear(dim, self.head_dim*(n_head+2*self.kv_heads))
183
-
184
- self.norm_q = Norm(self.head_dim*n_head, eps=eps)
185
- self.norm_k = Norm(self.head_dim*self.kv_heads, eps=eps)
186
-
187
- self.attn_out = Linear(dim, dim)
188
-
189
- self.shape_rotator = shape_rotator
190
-
191
- def _sdpa(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
192
- k = k.repeat_interleave(self.n_heads // self.kv_heads, dim=2)
193
- v = v.repeat_interleave(self.n_heads // self.kv_heads, dim=2)
194
- with nn.attention.sdpa_kernel(SDPBackend.FLASH_ATTENTION) if k.device.type == 'cuda' else nullcontext():
195
- x = F.scaled_dot_product_attention(
196
- q.transpose(1, 2),
197
- k.transpose(1, 2),
198
- v.transpose(1, 2),
199
- is_causal=False if (q.size(1) != k.size(1)) else self.causal,
200
- )
201
- x = x.transpose(1, 2).contiguous()
202
- return x
203
-
204
- def _attend(self, q: Tensor, k: Tensor, v: Tensor, kv_cache: Optional[Tensor] = None,):
205
- cache_len = get_cache_len(kv_cache)
206
- q, k = self.shape_rotator.rotate(q, k, offset=cache_len)
207
- if exists(kv_cache):
208
- k = T.cat([kv_cache[:, :cache_len, 0], k], dim=1)
209
- v = T.cat([kv_cache[:, :cache_len, 1], v], dim=1)
210
- kv_cache[:, :k.size(1), 0] = k
211
- kv_cache[:, :v.size(1), 1] = v
212
- x = self._sdpa(q, k, v)
213
- return self.attn_out(rearrange(x, 'b s h d -> b s (h d)'))
214
-
215
- def _project(self, x):
216
- full_q, full_k, full_v = self.proj_qkv(x).chunk(3, dim=-1)
217
- normed_full_q = self.norm_q(full_q).to(full_q.dtype)
218
- normed_full_k = self.norm_k(full_k).to(full_k.dtype)
219
-
220
- q = rearrange(normed_full_q, 'b s (h d) -> b s h d', h=self.n_heads)
221
- k = rearrange(normed_full_k, 'b s (h d) -> b s h d', h=self.kv_heads)
222
- v = rearrange(full_v, 'b s (h d) -> b s h d', h=self.kv_heads)
223
- return q, k, v
224
-
225
- def forward(self,
226
- x: Tensor,
227
- kv: Optional[Tensor] = None,):
228
- """
229
- x: (B, S, D)
230
- kv: (B, S, H, D)
231
- """
232
- q, k, v = self._project(x)
233
- return self._attend(q, k, v, kv_cache=kv)
234
-
235
-
236
- class PreNormAttn(nn.Module):
237
- def __init__(self,
238
- dim: int,
239
- n_head: int,
240
- shape_rotator: ShapeRotator,
241
- kv_heads: Optional[int] = None,
242
- eps: float = 1e-5,
243
- causal: bool = True,):
244
- super().__init__()
245
- self.attn_norm = Norm(dim, eps=eps)
246
- self.attn = GQA(dim, n_head, shape_rotator, kv_heads, eps=eps, causal=causal)
247
-
248
- def forward(self, x: Tensor, kv: Optional[Tensor] = None) -> Tensor:
249
- """
250
- x: (B, S, D)
251
- kv: (B, S, H, D)
252
- """
253
- return x + self.attn(self.attn_norm(x), kv)
254
-
255
- class PreNormFFNN(nn.Module):
256
- def __init__(self,
257
- dim: int,
258
- ff_dim: int,
259
- eps: float = 1e-5,):
260
- super().__init__()
261
- self.ffnn_norm = Norm(dim, eps=eps)
262
- self.ffnn = FFNN(dim, ff_dim)
263
-
264
- def forward(self, x: Tensor) -> Tensor:
265
- return x + self.ffnn(self.ffnn_norm(x))
266
-
267
- class Block(nn.Module):
268
- def __init__(self,
269
- dim: int,
270
- layer_id: int = 0,
271
- n_head: int = 16,
272
- kv_heads: Optional[int] = None,
273
- ff_dim: Optional[int] = None,
274
- eps: float = 1e-5,
275
- causal: bool = True,
276
- shape_rotator: ShapeRotator = None):
277
- super().__init__()
278
- self.attn = PreNormAttn(dim, n_head, shape_rotator, kv_heads, eps=eps, causal=causal)
279
- self.ffnn = PreNormFFNN(dim, ff_dim, eps=eps)
280
- self.dim = dim
281
- self.layer_id = layer_id
282
- self.head_dim = dim // n_head
283
- self.expand_dim = self.ffnn.ffnn.expand_dim
284
-
285
- self.reset_parameters()
286
-
287
- def reset_parameters(self):
288
- std = 1.0 / math.sqrt(self.dim)
289
- nn.init.trunc_normal_(self.ffnn.ffnn.gateup_proj.weight, std=std, a=-3 * std, b=3 * std)
290
- nn.init.trunc_normal_(self.attn.attn.proj_qkv.weight, std=std, a=-3 * std, b=3 * std)
291
- nn.init.trunc_normal_(self.attn.attn.attn_out.weight, std=std, a=-3 * std, b=3 * std)
292
-
293
- xstd = 1.0 / math.sqrt(self.expand_dim)
294
- nn.init.trunc_normal_(self.ffnn.ffnn.down_proj.weight, std=xstd, a=-3 * xstd, b=3 * xstd)
295
-
296
- def forward(self, x: Tensor, kv: Optional[Tensor] = None) -> Tensor:
297
- """
298
- x: (B, S, D)
299
- kv: (B, S, H, D)
300
- """
301
- h = self.attn(x, kv)
302
- out = self.ffnn(h)
303
- return out
304
-
305
-
306
-
307
- class GPTOutput(nn.Module):
308
- def __init__(self, dim, vocab_size):
309
- super().__init__()
310
- self.dim = dim
311
- self.norm = Norm(dim)
312
- self.output = Linear(dim, vocab_size)
313
-
314
- self.reset_parameters()
315
-
316
- def reset_parameters(self):
317
- std = 1.0 / math.sqrt(self.dim**2)
318
- nn.init.trunc_normal_(self.output.weight, std=std, a=-3 * std, b=3 * std)
319
-
320
- def forward(self, x):
321
- return self.output(self.norm(x))
322
-
323
- @si_module
324
- class Stack(nn.Module):
325
- class Config:
326
- layers: int
327
- dim: int
328
- seq_len: int
329
- n_head: int = 32
330
- ff_dim: int = None
331
- kv_heads: int = None
332
- eps: float = 1e-5
333
- theta: Union[int, float] = 10_000
334
- causal: bool = True
335
-
336
- from_pretrained: Optional[Tuple[str, int]] = None
337
-
338
- def __init__(self, c: Config):
339
- super().__init__()
340
-
341
- from_pretrained = c.from_pretrained
342
- if exists(from_pretrained):
343
- checkpoint = load_ckpt(c.from_pretrained)
344
-
345
- self.shape_rotator = ShapeRotator(c.dim//c.n_head, c.seq_len, theta=c.theta)
346
-
347
- self.layers = nn.ModuleList([
348
- Block(
349
- dim=c.dim,
350
- layer_id=l,
351
- n_head=c.n_head,
352
- kv_heads=c.kv_heads,
353
- ff_dim=c.ff_dim,
354
- eps=c.eps,
355
- causal=c.causal,
356
- shape_rotator=self.shape_rotator,
357
- ) for l in range(c.layers)
358
- ])
359
-
360
- kv_heads = c.kv_heads or c.n_head
361
- head_dim = c.dim // c.n_head
362
- cache_shape = [c.layers, c.seq_len, 2, kv_heads, head_dim]
363
- self.cache_shape = cache_shape
364
- self.cache = [None] * c.layers
365
-
366
- if exists(from_pretrained):
367
- self.load_state_dict(checkpoint)
368
-
369
- def init_cache(self, bsize, device, dtype, length:int=None):
370
- if self.cache_shape is None:
371
- return
372
- cache_shape = self.cache_shape.copy()
373
- cache_shape[1] = length or cache_shape[1]
374
- self.cache = T.full((bsize, *cache_shape), CACHE_FILL_VALUE, device=device, dtype=dtype).transpose(0, 1)
375
-
376
- def deinit_cache(self):
377
- self.cache = [None] * len(self.cache)
378
-
379
- def forward(self, x: Tensor) -> Tensor:
380
- for l, layer in enumerate(self.layers):
381
- x = layer(x, kv=self.cache[l])
382
- return x
 
 
utils/__init__.py DELETED
@@ -1,3 +0,0 @@
- from .blocks import *
- from .dist import *
- from .interp import *
 
 
 
 
utils/blocks.py DELETED
@@ -1,92 +0,0 @@
- from dataclasses import dataclass
- from typing import TypeVar, Generic, Type, Optional
- from functools import wraps
- import time
- import random
-
- import torch as T
- import torch.nn as nn
-
- # @TODO: remove si_module from codebase
- # we use this in our research codebase to make modules from callable configs
- si_module_TpV = TypeVar('si_module_TpV')
- def si_module(cls: Type[si_module_TpV]) -> Type[si_module_TpV]:
-     if not hasattr(cls, 'Config') or not isinstance(cls.Config, type):
-         class Config:
-             pass
-         cls.Config = Config
-
-     cls.Config = dataclass(cls.Config)
-
-     class ConfigWrapper(cls.Config, Generic[si_module_TpV]):
-         def __call__(self, *args, **kwargs) -> si_module_TpV:
-             if len(kwargs) > 0:
-                 config_dict = {field.name: getattr(self, field.name) for field in self.__dataclass_fields__.values()}
-                 config_dict.update(kwargs)
-                 new_config = type(self)(**config_dict)
-                 return cls(new_config)
-             else:
-                 return cls(self, *args)
-
-     ConfigWrapper.__module__ = cls.__module__
-     ConfigWrapper.__name__ = f"{cls.__name__}Config"
-     ConfigWrapper.__qualname__ = f"{cls.__qualname__}.Config"
-
-     cls.Config = ConfigWrapper
-
-     original_init = cls.__init__
-     def new_init(self, *args, **kwargs):
-         self.c = next((arg for arg in args if isinstance(arg, cls.Config)), None) or next((arg for arg in kwargs.values() if isinstance(arg, cls.Config)), None)
-         original_init(self, *args, **kwargs)
-         self.register_buffer('_device_tracker', T.Tensor(), persistent=False)
-
-     cls.__init__ = new_init
-
-     @property
-     def device(self):
-         return self._device_tracker.device
-
-     @property
-     def dtype(self):
-         return self._device_tracker.dtype
-
-     cls.device = device
-     cls.dtype = dtype
-
-     return cls
-
-
- def get_activation(nonlinear_activation, nonlinear_activation_params={}):
-     if hasattr(nn, nonlinear_activation):
-         return getattr(nn, nonlinear_activation)(**nonlinear_activation_params)
-     else:
-         raise NotImplementedError(f"Activation {nonlinear_activation} not found in torch.nn")
-
-
- def exists(v):
-     return v is not None
-
- def isnt(v):
-     return not exists(v)
-
- def truthyexists(v):
-     return exists(v) and v is not False
-
- def truthyattr(obj, attr):
-     return hasattr(obj, attr) and truthyexists(getattr(obj, attr))
-
- defaultT = TypeVar('defaultT')
-
- def default(*args: Optional[defaultT]) -> Optional[defaultT]:
-     for arg in args:
-         if exists(arg):
-             return arg
-     return None
-
- def maybe(fn):
-     @wraps(fn)
-     def inner(x, *args, **kwargs):
-         if not exists(x):
-             return x
-         return fn(x, *args, **kwargs)
-     return inner
 
 
utils/dist.py DELETED
@@ -1,99 +0,0 @@
- import os
- import torch as T
- import re
- from tqdm import tqdm
- from datetime import timedelta
-
- import requests
- import hashlib
-
- from io import BytesIO
-
- def rank0():
-     rank = os.environ.get('RANK')
-     if rank is None or rank == '0':
-         return True
-     else:
-         return False
-
- def local0():
-     local_rank = os.environ.get('LOCAL_RANK')
-     if local_rank is None or local_rank == '0':
-         return True
-     else:
-         return False
- class tqdm0(tqdm):
-     def __init__(self, *args, **kwargs):
-         total = kwargs.get('total', None)
-         if total is None and len(args) > 0:
-             try:
-                 total = len(args[0])
-             except TypeError:
-                 pass
-         if total is not None:
-             kwargs['miniters'] = max(1, total // 20)
-         super().__init__(*args, **kwargs, disable=not rank0(), bar_format='{bar}| {n_fmt}/{total_fmt} [{rate_fmt}{postfix}]')
-
- def print0(*args, **kwargs):
-     if rank0():
-         print(*args, **kwargs)
-
- _PRINTED_IDS = set()
-
- def printonce(*args, id=None, **kwargs):
-     if id is None:
-         id = ' '.join(map(str, args))
-
-     if id not in _PRINTED_IDS:
-         print(*args, **kwargs)
-         _PRINTED_IDS.add(id)
-
- def print0once(*args, **kwargs):
-     if rank0():
-         printonce(*args, **kwargs)
-
- def init_dist():
-     if T.distributed.is_initialized():
-         print0('Distributed already initialized')
-         rank = T.distributed.get_rank()
-         local_rank = int(os.environ.get('LOCAL_RANK', 0))
-         world_size = T.distributed.get_world_size()
-     else:
-         try:
-             rank = int(os.environ['RANK'])
-             local_rank = int(os.environ['LOCAL_RANK'])
-             world_size = int(os.environ['WORLD_SIZE'])
-             device = f'cuda:{local_rank}'
-             T.cuda.set_device(device)
-             T.distributed.init_process_group(backend='nccl', timeout=timedelta(minutes=30), rank=rank, world_size=world_size, device_id=T.device(device))
-             print(f'Rank {rank} of {world_size}.')
-         except Exception as e:
-             print0once(f'Not initializing distributed env: {e}')
-             rank = 0
-             local_rank = 0
-             world_size = 1
-     return rank, local_rank, world_size
-
- def load_ckpt(load_from_location, expected_hash=None):
-     if local0():
-         os.makedirs('ckpt', exist_ok=True)
-         url = f"https://ckpt.si.inc/hertz-dev/{load_from_location}.pt"
-         save_path = f"ckpt/{load_from_location}.pt"
-         if not os.path.exists(save_path):
-             response = requests.get(url, stream=True)
-             total_size = int(response.headers.get('content-length', 0))
-             with open(save_path, 'wb') as f, tqdm(total=total_size, desc=f'Downloading {load_from_location}.pt', unit='GB', unit_scale=1/(1024*1024*1024)) as pbar:
-                 for chunk in response.iter_content(chunk_size=8192):
-                     f.write(chunk)
-                     pbar.update(len(chunk))
-         if expected_hash is not None:
-             with open(save_path, 'rb') as f:
-                 file_hash = hashlib.md5(f.read()).hexdigest()
-             if file_hash != expected_hash:
-                 print(f'Hash mismatch for {save_path}. Expected {expected_hash} but got {file_hash}. Deleting checkpoint and trying again.')
-                 os.remove(save_path)
-                 return load_ckpt(load_from_location, expected_hash)
-     if T.distributed.is_initialized():
-         T.distributed.barrier() # so that ranks don't try to load checkpoint before it's finished downloading
-     loaded = T.load(f"ckpt/{load_from_location}.pt", weights_only=False, map_location='cpu')
-     return loaded
 
 
utils/interp.py DELETED
@@ -1,84 +0,0 @@
- import torch as T
- import os
-
- def rank0():
-     rank = os.environ.get('RANK')
-     if rank is None or rank == '0':
-         return True
-     else:
-         return False
-
- def print_colored(message, color='reset', bold=False, **kwargs):
-     color_dict = {
-         'bold': '\033[1m',
-         'green': '\033[92m',
-         'yellow': '\033[93m',
-         'red': '\033[91m',
-         'blue': '\033[94m',
-         'grey': '\033[90m',
-         'white': '\033[97m',
-         'reset': '\033[0m'
-     }
-
-     color_code = color_dict.get(color.lower(), color_dict['reset'])
-     prefix = color_dict['bold'] if bold else ''
-     print(f"{prefix}{color_code}{message}{color_dict['reset']}", **kwargs)
-
- def print0_colored(*args, **kwargs):
-     if rank0():
-         print_colored(*args, **kwargs)
-
- def param_count(module):
-     def count_parameters(model):
-         return sum(p.numel() for p in model.parameters() if p.requires_grad)
-
-     total_params = count_parameters(module)
-     output = [f'Total model parameters: {total_params:,}', '---------------------------']
-
-     for name, child in module.named_children():
-         params = count_parameters(child)
-         output.append(f'{name} parameters: {params:,}')
-
-     return '\n'.join(output)
-
- def model_size_estimation(module):
-     def estimate_size(model):
-         param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
-         buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())
-         return param_size + buffer_size
-
-     total_size = estimate_size(module)
-     output = [f'Total model size: {total_size / 1024**2:.2f} MB', '---------------------------']
-
-     for name, child in module.named_children():
-         child_size = estimate_size(child)
-         output.append(f'{name} size: {child_size / 1024**2:.2f} MB')
-
-     return '\n'.join(output)
-
- def layer_param_distribution(module):
-     def count_parameters(model):
-         return sum(p.numel() for p in model.parameters() if p.requires_grad)
-
-     def get_layer_types(model):
-         layer_types = {}
-         for name, module in model.named_modules():
-             layer_type = module.__class__.__name__
-             params = sum(p.numel() for p in module.parameters(recurse=False) if p.requires_grad)
-             if params > 0:
-                 if layer_type not in layer_types:
-                     layer_types[layer_type] = 0
-                 layer_types[layer_type] += params
-         return layer_types
-
-     total_params = count_parameters(module)
-     layer_types = get_layer_types(module)
-
-     output = [f'Total trainable parameters: {total_params:,}', '---------------------------']
-
-     for layer_type, count in sorted(layer_types.items(), key=lambda x: x[1], reverse=True):
-         percentage = (count / total_params) * 100
-         output.append(f'{layer_type}: {count:,} ({percentage:.2f}%)')
-
-     return '\n'.join(output)
-