lmz committed on
Commit b34416f · verified · 1 Parent(s): 1708070

Upload 4 files

Files changed (4)
  1. README.md +114 -3
  2. config.toml +60 -0
  3. [email protected] +3 -0
  4. tokenizer_spm_48k_multi6_2.model +3 -0
README.md CHANGED
@@ -1,3 +1,114 @@
- ---
- license: cc-by-4.0
- ---
+ ---
+ # For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
+ # Doc / guide: https://huggingface.co/docs/hub/model-cards
+ license: cc-by-4.0
+ language:
+ - fr
+ - en
+ library_name: hibiki
+ tags:
+ - speech
+ - translation
+ - streaming
+ metrics:
+ - bleu
+ ---
+
+ # Model Card for Hibiki
+
+ [Hibiki](https://github.com/kyutai-labs/hibiki) is a model for streaming speech translation (also known as *simultaneous* translation). Unlike offline translation, where one waits for the end of the source utterance before starting to translate, Hibiki adapts its flow to accumulate just enough context to produce a correct translation in real time, chunk by chunk. As the user speaks, Hibiki generates natural speech in the target language, optionally with voice transfer, along with a text translation.
+ Hibiki currently only supports French-to-English translation.
+
+ ## Model Details
+
+ This is the model referred to as *Hibiki-M* (for *Mobile*) in our [paper](https://arxiv.org/abs/2502.03382): a 1.7B-parameter
+ hierarchical Transformer producing speech and text tokens at a framerate of 12.5Hz, with audio generated at a
+ 1.1kbps bitrate.
+
+ ### Model Description
+
+ Hibiki is a decoder-only model for simultaneous speech translation. Hibiki leverages the multistream architecture of [Moshi](https://arxiv.org/abs/2410.00037)
+ to model source and target speech jointly. This allows Hibiki to continuously process the input stream while generating
+ the target speech. Hibiki produces text and audio tokens at a constant framerate of 12.5Hz, which allows for a continuous
+ output audio stream along with timestamped text translation. Since Hibiki relies on simple temperature sampling,
+ it is compatible with batching, unlike models that rely on complex inference policies. Moreover, the fidelity of Hibiki's
+ voice transfer can be controlled by changing the classifier-free guidance coefficient: a larger coefficient
+ increases voice similarity, but an excessive coefficient can lead to worse translations.
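+
+ As a rough, generic sketch of how such a guidance coefficient is typically applied at sampling time (this is not Hibiki's actual inference code; the function and variable names below are illustrative only):
+
+ ```python
+ import torch
+
+ def cfg_logits(logits_cond: torch.Tensor, logits_uncond: torch.Tensor, cfg_coef: float) -> torch.Tensor:
+     # Blend conditional and unconditional logits: cfg_coef = 1.0 disables guidance,
+     # larger values push generation towards the conditioning (here, the source voice).
+     return logits_uncond + cfg_coef * (logits_cond - logits_uncond)
+
+ def sample_next_token(logits: torch.Tensor, temperature: float = 0.8) -> torch.Tensor:
+     # Plain temperature sampling over the vocabulary dimension.
+     probs = torch.softmax(logits / temperature, dim=-1)
+     return torch.multinomial(probs, num_samples=1)
+ ```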
+
+ - **Developed by:** Kyutai
+ - **Model type:** Simultaneous speech-to-speech and speech-to-text translation
+ - **Language(s) (NLP):** French-to-English
+ - **License:** CC-BY 4.0
+
+ ### Model Sources
+
+ - **Repository:** [repo](https://github.com/kyutai-labs/hibiki)
+ - **Paper:** [paper](https://arxiv.org/abs/2502.03382)
+ - **Examples:** [demo](https://hf.co/spaces/kyutai/hibiki-samples)
+
+ ## Uses
+
+ ### Direct Use
+
+ The model can be used for streaming translation from French to English in real-time settings, or for batched
+ simultaneous translation of many input sequences. It is robust to noisy conditions and is trained on sequences up
+ to 120 seconds.
+
+ ### Downstream Use
+
+ Some components of the model can be used independently or repurposed relatively easily.
+ For instance, the Mimi codec is a state-of-the-art neural audio codec that combines semantic and acoustic information into audio tokens running at 12.5Hz and a bitrate of 1.1kbps, which makes it particularly well suited to training speech language models or text-to-speech systems. Regarding the main Hibiki architecture,
+ supporting other language pairs would require finetuning.
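+
+ As a quick sanity check on that bitrate, here is a back-of-the-envelope computation; the codebook count and size are assumptions carried over from the Moshi paper rather than stated in this card:
+
+ ```python
+ # Assumed: 8 codebooks of 2048 entries each (11 bits per codebook), 12.5 frames per second.
+ frame_rate_hz = 12.5
+ num_codebooks = 8
+ bits_per_codebook = 11  # log2(2048)
+ print(frame_rate_hz * num_codebooks * bits_per_codebook)  # 1100.0 bits/s, i.e. about 1.1 kbps
+ ```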
+
+ ### Out-of-Scope Use
+
+ The model is not intended to be used to impersonate other people, or for malicious use of any kind.
+
+ ## How to Get Started with the Model
+
+ See the main [README](https://github.com/kyutai-labs/hibiki) file.
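+
+ As a minimal, hypothetical sketch of fetching the files uploaded in this commit with `huggingface_hub` (the repo id is a placeholder; running inference itself is covered by the upstream README):
+
+ ```python
+ from huggingface_hub import hf_hub_download
+
+ repo_id = "kyutai/<this-repo>"  # placeholder: replace with this repository's id on the Hub
+
+ config_path = hf_hub_download(repo_id, "config.toml")
+ weights_path = hf_hub_download(repo_id, "[email protected]")
+ tokenizer_path = hf_hub_download(repo_id, "tokenizer_spm_48k_multi6_2.model")
+ print(config_path, weights_path, tokenizer_path)
+ ```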
+
+ ## Training Details
+
+ ### Training Data
+
+ - Textual data: The underlying [Helium](https://huggingface.co/kyutai/helium-1-preview-2b) model is trained on a mix of
+ data including Wikipedia, Stack Exchange, open-access scientific articles (from peS2o) and Common Crawl.
+
+ - Audio data
+
+   - **Unsupervised audio dataset:** used for pre-training, this is a collection of 7M hours of readily available audio content in English and 450k hours in French, following the preprocessing and recipe of [Moshi](https://arxiv.org/abs/2410.00037).
+   - **Synthetic translation dataset:** around 40k hours of parallel French-English data synthesized with *contextual alignment* (see [Section 3.2](https://arxiv.org/pdf/2502.03382)) with various levels of speaker similarity.
+   - **Translation finetuning:** a 900-hour mixture of a resynthesized version of [CVSS-T](https://github.com/google-research-datasets/cvss) and synthetic long-form utterances.
+
+ ### Training procedure and hyper-parameters
+
+ The different stages of the training procedure are detailed in the paper, along with the hyper-parameters.
+
+ ### Compute Infrastructure
+
+ The final model was trained on 48 Nvidia H100 GPUs.
+
+ ## Citation
+
+ ```
+ @misc{labiausse2025hibiki,
+   title={High-Fidelity Simultaneous Speech-To-Speech Translation},
+   author={Tom Labiausse and Laurent Mazaré and Edouard Grave and Patrick Pérez and Alexandre Défossez and Neil Zeghidour},
+   year={2025},
+   eprint={2502.03382},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL},
+   url={https://arxiv.org/abs/2502.03382},
+ }
+ ```
+
+ ## Model Card Authors
+
+ Tom Labiausse, Laurent Mazaré, Edouard Grave, Patrick Pérez, Alexandre Défossez, Neil Zeghidour
config.toml ADDED
@@ -0,0 +1,60 @@
+ mimi_name = "[email protected]"
+ moshi_name = "[email protected]"
+ tokenizer_name = "tokenizer_spm_48k_multi6_2.model"
+
+ [model]
+ text_in_vocab_size = 48001
+ text_out_vocab_size = 48000
+ audio_vocab_size = 2049
+ audio_codebooks = 16
+
+ [model.transformer]
+ d_model = 2048
+ num_heads = 16
+ num_layers = 16
+ dim_feedforward = 8192
+ causal = true
+ norm_first = true
+ bias_ff = false
+ bias_attn = false
+ context = 500
+ max_period = 100000
+ use_conv_block = false
+ use_conv_bias = true
+ gating = "silu"
+ norm = "RmsNorm"
+ positional_embedding = "Rope"
+ conv_layout = false
+ conv_kernel_size = 3
+ kv_repeat = 1
+ max_seq_len = 4096
+
+ [model.depformer]
+ num_slices = 8
+
+ [model.depformer.transformer]
+ d_model = 1024
+ num_heads = 16
+ num_layers = 6
+ dim_feedforward = 4096
+ causal = true
+ norm_first = true
+ bias_ff = false
+ bias_attn = false
+ context = 32
+ max_period = 10000
+ use_conv_block = false
+ use_conv_bias = true
+ gating = "silu"
+ norm = "RmsNorm"
+ positional_embedding = "None"
+ conv_layout = false
+ conv_kernel_size = 3
+ kv_repeat = 1
+ max_seq_len = 4096
+
+ [model.conditioners.description]
+ type = "Lut"
+ n_bins = 31
+ dim = 16
+ possible_values = ["very_bad", "bad", "neutral", "good", "very_good"]
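
A minimal sketch of reading the configuration above with Python's standard `tomllib` module (assumes Python 3.11+; the keys simply mirror the file as committed):

```python
import tomllib

with open("config.toml", "rb") as f:  # tomllib requires binary mode
    cfg = tomllib.load(f)

print(cfg["model"]["transformer"]["d_model"])   # 2048
print(cfg["model"]["depformer"]["num_slices"])  # 8
print(cfg["tokenizer_name"])                    # tokenizer_spm_48k_multi6_2.model
```
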
[email protected] ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31c14cf365353131094e8248150c6fe58e8642cf91899c50d9e450f861630e55
+ size 384644900
tokenizer_spm_48k_multi6_2.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c22110fb855aa049e17346ea2e88355bdd664f06cbfd09948380ab5e85b39697
+ size 857314