alpindale committed (verified)
Commit c689221 · 1 Parent(s): 5aa9ea3

Upload folder using huggingface_hub
7B_1T_1/consolidated.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7f0fd7688fea5a3edb4613fb40597715b7db1abbc30afee49a77c088e74fbe3b
+ size 26953703248
7B_1T_1/params.json ADDED
@@ -0,0 +1,12 @@
+ {
+     "dim": 4096,
+     "ffn_dim_multiplier": 1.0,
+     "multiple_of": 256,
+     "n_future_tokens": 1,
+     "n_heads": 32,
+     "n_kv_heads": 32,
+     "n_layers": 32,
+     "norm_eps": 1e-05,
+     "rope_theta": 10000.0,
+     "vocab_size": 32000
+ }
7B_1T_4/consolidated.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1dd2a79e8283ec51955a914b0a416650a62ea6d20efc75bc6bdc8513fb8a55b5
+ size 26953703376
7B_1T_4/params.json ADDED
@@ -0,0 +1,12 @@
+ {
+     "dim": 4096,
+     "ffn_dim_multiplier": 1.0,
+     "multiple_of": 256,
+     "n_future_tokens": 4,
+     "n_heads": 32,
+     "n_kv_heads": 32,
+     "n_layers": 32,
+     "norm_eps": 1e-05,
+     "rope_theta": 10000.0,
+     "vocab_size": 32000
+ }
7B_200B_1/consolidated.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:239c8662dae921cc26d4060b62eee086fcf5c6d6611253ead120ed260cfce42a
+ size 26953703039
7B_200B_1/params.json ADDED
@@ -0,0 +1,12 @@
+ {
+     "dim": 4096,
+     "ffn_dim_multiplier": 1.0,
+     "multiple_of": 256,
+     "n_future_tokens": 1,
+     "n_heads": 32,
+     "n_kv_heads": 32,
+     "n_layers": 32,
+     "norm_eps": 1e-05,
+     "rope_theta": 10000.0,
+     "vocab_size": 32000
+ }
7B_200B_4/consolidated.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d4f3209280186ad279b9f1421fc8a4bdd0f76a6ed18b14b277c9007c7cec8a0e
+ size 26953703376
7B_200B_4/params.json ADDED
@@ -0,0 +1,12 @@
+ {
+     "dim": 4096,
+     "ffn_dim_multiplier": 1.0,
+     "multiple_of": 256,
+     "n_future_tokens": 4,
+     "n_heads": 32,
+     "n_kv_heads": 32,
+     "n_layers": 32,
+     "norm_eps": 1e-05,
+     "rope_theta": 10000.0,
+     "vocab_size": 32000
+ }
LICENSE ADDED
@@ -0,0 +1,65 @@
+ Multi-token Prediction Research License 18th June 2024
+
+ This Multi-token Prediction Research License (“Agreement”) contains the terms and conditions that govern your access and use of the Materials (as defined below). You may not use the Materials if you do not accept this Agreement. By clicking “I Accept” to accept, or accessing, using, or distributing any portion or element of the Materials you hereby agree to be bound by the terms of this Agreement. If you are agreeing to be bound by the Agreement on behalf of your employer or other entity, you represent and warrant to Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland) (“Meta”) that you have full legal authority to bind your employer or such entity to this Agreement. If you do not have requisite authority, you may not accept the Agreement or access the Materials on behalf of your employer or other entity.
+
+ This Agreement is effective upon the earlier of the date that you first access the Materials or accept this Agreement (“Effective Date”), and is entered into by and between Meta, and you, or if you are entering into this Agreement on behalf of your employer or other entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules, or regulations to provide legal consent and, your employer or other entity and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf (“Licensee” or “You”).
+
+ 1. Definitions.
+
+ a. “Documentation” means the specifications, manuals and documentation accompanying this release distributed by Meta at https://huggingface.co/facebook/multi-token-prediction.
+
+ b. “Noncommercial Research Uses” means noncommercial research use cases related to research, development, education, processing, or analysis and in each case, is not primarily intended for commercial advantage or monetary compensation to you or others.
+
+ c. “Materials” means, collectively, Documentation and the models and software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code, demonstration materials and other elements of the foregoing distributed by Meta at https://huggingface.co/facebook/multi-token-prediction and made available under this Agreement.
+
+ d. “Trade Control Laws” means any applicable U.S. and non-U.S. export control and trade sanctions laws and regulations.
+
+ e. “Acceptable Use Policy” means the LLaMA Acceptable Use Policy applicable to Materials that is incorporated into this Agreement.
+
+ 2. License Rights and Redistribution. Subject to Your compliance with the terms and conditions of this Agreement, Meta hereby grants you the following:
+
+ a. Grant of Rights. You are hereby granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Materials solely for Noncommercial Research Uses.
+
+ b. Redistribution and Use.
+
+ i. Distribution of Materials, and any derivative works thereof, are subject to the terms of this Agreement. If you distribute or make the Materials, or any derivative works thereof, available to a third party, you may only do so under the terms of this Agreement. You shall also provide a copy of this Agreement to such third party.
+
+ ii. If you submit for publication the results of research you perform on, using, or otherwise in connection with Materials, you must acknowledge the use of Materials in your publication.
+
+ iii. You must retain in all copies of the Materials that you distribute and include the following attribution notice within a “Notice” text file distributed as a part of such copies: “Materials are licensed under the Multi-token Prediction Research License, Copyright © Meta Platforms, Inc. All Rights Reserved.”
+
+ iv. Your use of the Materials must comply with applicable laws and regulations (including Trade Control Laws) and adhere to the LLaMA Acceptable Use Policy, which is hereby incorporated by reference into this Agreement.
+
+ v. You agree to validate and confirm LLaMA outputs for compliance with the LLaMA Acceptable Use Policy, including before relying on LLaMA outputs in any way as part of research activities or incorporating these outputs in research, studies, and papers.
+
+ vi. You agree to report any violation of this Multi-token Prediction Research License or the Acceptable Use Policy as outlined in the LLaMA Acceptable Use Policy.
+
+ 3. Restrictions. You will not, and will not permit, assist or cause any third party to:
+
+ a. use the Materials or any outputs or results of the Materials in connection with any commercial uses or for any uses other than Noncommercial Research Uses;
+
+ b. disguise your or their location through IP proxying or other methods;
+
+ c. use or download Materials if you or they are: (a) located in a comprehensively sanctioned jurisdiction, (b) currently listed on any U.S. or non-U.S. restricted parties list, or (c) will use Materials for any purpose prohibited by Trade Control Laws; or
+
+ d. directly or indirectly export, re-export, provide, or otherwise transfer Materials: (a) to any individual, entity, or country prohibited by Trade Control Laws; (b) to anyone on U.S. or non-U.S. government restricted parties lists; or (c) for any purpose prohibited by Trade Control Laws, including nuclear, chemical or biological weapons, or missile technology applications.
+
+ 4. User Support. Your Noncommercial Research Use of the Materials is done at your own discretion; Meta does not process any information nor provide any service in relation to such use. Meta is under no obligation to provide any support services for the Materials. Any support provided is “as is”, “with all faults”, and without warranty of any kind.
+
+ 5. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE MATERIALS AND ANY OUTPUT AND RESULTS.
+
+ 6. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT OR INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
+
+ 7. Intellectual Property.
+
+ a. No trademark licenses are granted under this Agreement, and in connection with the Materials, neither Meta nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Materials.
+
+ b. Subject to Meta’s ownership of Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
+
+ c. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Materials or outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses and rights granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Materials.
+
+ 8. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Materials. Sections 3, 4, 5, 6, 7, 8 and 9 shall survive the termination of this Agreement.
+
+ 9. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
+
+ 10. Modifications and Amendments. Meta may modify this Agreement from time to time by posting a revised version at https://huggingface.co/facebook/multi-token-prediction/LICENSE; provided that they are similar in spirit to the current version of the Agreement, but may differ in detail to address new problems or concerns. All such changes will be effective immediately. Your continued use of the Materials after any modification to this Agreement constitutes your agreement to such modification. Except as provided in this Agreement, no other modification or addition to any provision of this Agreement will be binding unless it is in writing and signed by an authorized representative of both you and Meta.
README.md ADDED
@@ -0,0 +1,123 @@
+ ---
+ license: other
+ extra_gated_prompt: >-
+   ### MULTI-TOKEN PREDICTION RESEARCH LICENSE AGREEMENT 18th June 2024
+
+   This Multi-token Prediction Research License (“Agreement”) contains the terms and conditions that govern your access and use of the Materials (as defined below). You may not use the Materials if you do not accept this Agreement. By clicking "submit" below to accept, or accessing, using, or distributing any portion or element of the Materials you hereby agree to be bound by the terms of this Agreement. If you are agreeing to be bound by the Agreement on behalf of your employer or other entity, you represent and warrant to Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland) (“Meta”) that you have full legal authority to bind your employer or such entity to this Agreement. If you do not have requisite authority, you may not accept the Agreement or access the Materials on behalf of your employer or other entity.
+
+   This Agreement is effective upon the earlier of the date that you first access the Materials or accept this Agreement (“Effective Date”), and is entered into by and between Meta, and you, or if you are entering into this Agreement on behalf of your employer or other entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules, or regulations to provide legal consent and, your employer or other entity and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf (“Licensee” or “You”).
+
+   1. Definitions.
+
+   a. “Documentation” means the specifications, manuals and documentation accompanying this release distributed by Meta at https://huggingface.co/facebook/multi-token-prediction.
+
+   b. “Noncommercial Research Uses” means noncommercial research use cases related to research, development, education, processing, or analysis and in each case, is not primarily intended for commercial advantage or monetary compensation to you or others.
+
+   c. “Materials” means, collectively, Documentation and the models and software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code, demonstration materials and other elements of the foregoing distributed by Meta at https://huggingface.co/facebook/multi-token-prediction and made available under this Agreement.
+
+   d. “Trade Control Laws” means any applicable U.S. and non-U.S. export control and trade sanctions laws and regulations.
+
+   e. “Acceptable Use Policy” means the [LLaMA Acceptable Use Policy](https://ai.meta.com/llama/use-policy/) applicable to Materials that is incorporated into this Agreement.
+
+   2. License Rights and Redistribution. Subject to Your compliance with the terms and conditions of this Agreement, Meta hereby grants you the following:
+
+   a. Grant of Rights. You are hereby granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Materials solely for Noncommercial Research Uses.
+
+   b. Redistribution and Use.
+
+   i. Distribution of Materials, and any derivative works thereof, are subject to the terms of this Agreement. If you distribute or make the Materials, or any derivative works thereof, available to a third party, you may only do so under the terms of this Agreement. You shall also provide a copy of this Agreement to such third party.
+
+   ii. If you submit for publication the results of research you perform on, using, or otherwise in connection with Materials, you must acknowledge the use of Materials in your publication.
+
+   iii. You must retain in all copies of the Materials that you distribute and include the following attribution notice within a “Notice” text file distributed as a part of such copies: “Materials are licensed under the Multi-token Prediction Research License, Copyright © Meta Platforms, Inc. All Rights Reserved.”
+
+   iv. Your use of the Materials must comply with applicable laws and regulations (including Trade Control Laws) and adhere to the LLaMA Acceptable Use Policy, which is hereby incorporated by reference into this Agreement.
+
+   v. You agree to validate and confirm LLaMA outputs for compliance with the LLaMA Acceptable Use Policy, including before relying on LLaMA outputs in any way as part of research activities or incorporating these outputs in research, studies, and papers.
+
+   vi. You agree to report any violation of this Multi-token Prediction Research License or the Acceptable Use Policy, as outlined in the LLaMA Acceptable Use Policy.
+
+   3. Restrictions. You will not, and will not permit, assist or cause any third party to:
+
+   a. use the Materials or any outputs or results of the Materials in connection with any commercial uses or for any uses other than Noncommercial Research Uses;
+
+   b. disguise your or their location through IP proxying or other methods;
+
+   c. use or download Materials if you or they are: (a) located in a comprehensively sanctioned jurisdiction, (b) currently listed on any U.S. or non-U.S. restricted parties list, or (c) will use Materials for any purpose prohibited by Trade Control Laws; or
+
+   d. directly or indirectly export, re-export, provide, or otherwise transfer Materials: (a) to any individual, entity, or country prohibited by Trade Control Laws; (b) to anyone on U.S. or non-U.S. government restricted parties lists; or (c) for any purpose prohibited by Trade Control Laws, including nuclear, chemical or biological weapons, or missile technology applications.
+
+   4. User Support. Your Noncommercial Research Use of the Materials is done at your own discretion; Meta does not process any information nor provide any service in relation to such use. Meta is under no obligation to provide any support services for the Materials. Any support provided is “as is”, “with all faults”, and without warranty of any kind.
+
+   5. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE MATERIALS AND ANY OUTPUT AND RESULTS.
+
+   6. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT OR INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
+
+   7. Intellectual Property.
+
+   a. No trademark licenses are granted under this Agreement, and in connection with the Materials, neither Meta nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Materials.
+
+   b. Subject to Meta’s ownership of Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
+
+   c. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Materials or outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses and rights granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Materials.
+
+   8. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Materials. Sections 3, 4, 5, 6, 7, 8 and 9 shall survive the termination of this Agreement.
+
+   9. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
+
+   10. Modifications and Amendments. Meta may modify this Agreement from time to time by posting a revised version at https://huggingface.co/facebook/multi-token-prediction/LICENSE; provided that they are similar in spirit to the current version of the Agreement, but may differ in detail to address new problems or concerns. All such changes will be effective immediately. Your continued use of the Materials after any modification to this Agreement constitutes your agreement to such modification. Except as provided in this Agreement, no other modification or addition to any provision of this Agreement will be binding unless it is in writing and signed by an authorized representative of both you and Meta.
+
+ extra_gated_fields:
+   First Name: text
+   Last Name: text
+   Date of birth: date_picker
+   Country: country
+   Affiliation: text
+   geo: ip_location
+   By clicking Submit below I accept the terms of the license and acknowledge that the information I provide will be collected, stored, processed and shared in accordance with the Meta Privacy Policy: checkbox
+ extra_gated_description: The information you provide will be collected, stored, processed and shared in accordance with the [Meta Privacy Policy](https://www.facebook.com/privacy/policy/).
+ extra_gated_button_content: Submit
+ ---
+
+ # **Multi-token prediction models and baselines**
+
+ Models accompanying the research paper "Better & Faster Large Language Models via Multi-token Prediction" (https://arxiv.org/abs/2404.19737).
+
+ Included are the following four 7B parameter models trained on code:
+ - baseline model (`n=1`) trained on 200B tokens of code: [7B_200B_1/](7B_200B_1/)
+ - multi-token prediction model (`n=4`) trained on 200B tokens of code: [7B_200B_4/](7B_200B_4/)
+ - baseline model (`n=1`) trained on 1T tokens of code: [7B_1T_1/](7B_1T_1/)
+ - multi-token prediction model (`n=4`) trained on 1T tokens of code: [7B_1T_4/](7B_1T_4/)
+
+ Tokenizer: standard Llama 2 SentencePiece tokenizer in [tokenizer.model](tokenizer.model).
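+
+ The tokenizer can also be loaded on its own as a quick sanity check. A minimal sketch using the standard `sentencepiece` Python API (this snippet is illustrative and not part of the repository):
+ ```python
+ # Load the Llama 2 SentencePiece tokenizer shipped with this repo.
+ from sentencepiece import SentencePieceProcessor
+
+ sp = SentencePieceProcessor(model_file="tokenizer.model")
+ print(sp.vocab_size())  # expected: 32000, matching vocab_size in params.json
+ ids = sp.encode("def fizzbuzz(n: int):")
+ print(sp.decode(ids))  # round-trips to the original string
+ ```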
+
+ ## *Quickstart*
+
+ Install `torch`, `fairscale`, `fire` and `sentencepiece`, then run
+ ```
+ torchrun --nproc_per_node 1 example_completion.py --ckpt_dir 7B_200B_4/ --tokenizer_path tokenizer.model --max_seq_len 128 --max_batch_size 2
+ ```
+ replacing `7B_200B_4` with the desired checkpoint directory.
+
+ ## *Format*
+
+ The PyTorch `state_dicts` are compatible with the Llama format: the layers of the shared trunk and the next-token prediction head layer are numbered contiguously. Additional prediction heads for tokens further in the future are named `extra_heads` and can be ignored for standard autoregressive inference.
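+
+ For example, a checkpoint can be reduced to a standard Llama 2 layout by dropping the extra heads. A minimal sketch, assuming the checkpoint has been downloaded locally and that the additional heads are stored under keys prefixed with `extra_heads`:
+ ```python
+ import torch
+
+ # Strip the extra prediction heads so that the remaining state_dict
+ # matches a standard Llama 2 7B layout.
+ ckpt = torch.load("7B_200B_4/consolidated.pth", map_location="cpu")
+ trunk_only = {k: v for k, v in ckpt.items() if not k.startswith("extra_heads")}
+ # trunk_only can now be loaded into a vanilla Llama 2 model for
+ # standard autoregressive inference.
+ ```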
+
+ The implementation of `forward()` in [llama/model.py](llama/model.py) provides an additional argument `return_all_heads`. If set, the additional prediction heads are called and the logits are returned in shape `(batch_size, seq_len, n_future_tokens, vocab_size)`. Otherwise, the logits have shape `(batch_size, seq_len, 1, vocab_size)`.
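+
+ A sketch of what this looks like at call time, assuming a `generator` built as in the Quickstart from the `7B_200B_4` checkpoint (`n_future_tokens=4`); the token ids below are placeholders:
+ ```python
+ import torch
+
+ tokens = torch.tensor([[1, 822, 285]], device="cuda")  # (batch_size=1, seq_len=3)
+ # Query all prediction heads at once:
+ logits = generator.model.forward(tokens, 0, return_all_heads=True)
+ print(logits.shape)  # (1, 3, 4, 32000)
+
+ # Default: only the next-token head is evaluated.
+ logits = generator.model.forward(tokens, 0)
+ print(logits.shape)  # (1, 3, 1, 32000)
+ ```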
+
+ ## *Citation*
+
+ Gloeckle, F., Idrissi, B. Y., Rozière, B., Lopez-Paz, D., & Synnaeve, G. (2024). Better & faster large language models via multi-token prediction. arXiv preprint arXiv:2404.19737.
+
+ Bibtex entry:
+ ```
+ @article{gloeckle2024better,
+   title={Better \& faster large language models via multi-token prediction},
+   author={Gloeckle, Fabian and Idrissi, Badr Youbi and Rozi{\`e}re, Baptiste and Lopez-Paz, David and Synnaeve, Gabriel},
+   journal={arXiv preprint arXiv:2404.19737},
+   year={2024}
+ }
+ ```
+
+ ## Feedback and comments
+ Please report risks as indicated in the Acceptable Use Policy and address bugs and any other comments to the corresponding authors as indicated in the research paper.
checklist.chk ADDED
@@ -0,0 +1,9 @@
+ c91d557db711a63af14a1c6e50fda2b3 7B_1T_1/consolidated.pth
+ 2646a33bf77c20f9219351b40f183f5e 7B_1T_1/params.json
+ 4d42458cc14f8cdd1b13a59106ddff75 7B_1T_4/consolidated.pth
+ ad853f2e42dd19f7163c04d82093ac28 7B_1T_4/params.json
+ 8d117da90ce11aaf03b4fe4d8cbc9ff2 7B_200B_1/consolidated.pth
+ 2646a33bf77c20f9219351b40f183f5e 7B_200B_1/params.json
+ b7c2764583dd99aee35e6109100f53c5 7B_200B_4/consolidated.pth
+ ad853f2e42dd19f7163c04d82093ac28 7B_200B_4/params.json
+ eeec4125e9c7560836b4873b6f8e3025 tokenizer.model
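
The entries above are MD5 digests of the distributed files. A minimal sketch for verifying the downloads against them using only the Python standard library (run from the repository root):
```python
import hashlib

def md5(path: str, chunk_size: int = 1 << 20) -> str:
    """Compute the MD5 digest of a file, reading it in 1 MiB chunks."""
    h = hashlib.md5()
    with open(path, "rb") as f:
        while block := f.read(chunk_size):
            h.update(block)
    return h.hexdigest()

with open("checklist.chk") as f:
    for line in f:
        expected, path = line.split()
        status = "OK" if md5(path) == expected else "MISMATCH"
        print(f"{path}: {status}")
```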
example_completion.py ADDED
@@ -0,0 +1,53 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
+ from typing import Optional
+
+ import fire
+
+ from llama import Llama
+
+
+ def main(
+     ckpt_dir: str,
+     tokenizer_path: str,
+     temperature: float = 0.2,
+     top_p: float = 0.9,
+     max_seq_len: int = 256,
+     max_batch_size: int = 4,
+     max_gen_len: Optional[int] = None,
+ ):
+     generator = Llama.build(
+         ckpt_dir=ckpt_dir,
+         tokenizer_path=tokenizer_path,
+         max_seq_len=max_seq_len,
+         max_batch_size=max_batch_size,
+     )
+
+     prompts = [
+         # For these prompts, the expected answer is the natural continuation of the prompt
+         """\
+ def fizzbuzz(n: int):""",
+         """\
+ import argparse
+
+ def main(string: str):
+     print(string)
+     print(string[::-1])
+
+ if __name__ == "__main__":"""
+     ]
+     results = generator.text_completion(
+         prompts,
+         max_gen_len=max_gen_len,
+         temperature=temperature,
+         top_p=top_p,
+     )
+     for prompt, result in zip(prompts, results):
+         print(prompt)
+         print(f"> {result['generation']}")
+         print("\n==================================\n")
+
+
+ if __name__ == "__main__":
+     fire.Fire(main)
llama/__init__.py ADDED
@@ -0,0 +1,6 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
+ from .generation import Llama, Dialog
+ from .model import ModelArgs, Transformer
+ from .tokenizer import Tokenizer
llama/generation.py ADDED
@@ -0,0 +1,421 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
+ import json
+ import os
+ import sys
+ import time
+ from pathlib import Path
+ from typing import List, Literal, Optional, Tuple, TypedDict
+
+ import torch
+ import torch.nn.functional as F
+ from fairscale.nn.model_parallel.initialize import (
+     get_model_parallel_rank,
+     initialize_model_parallel,
+     model_parallel_is_initialized,
+ )
+
+ from llama.model import ModelArgs, Transformer
+ from llama.tokenizer import Tokenizer
+
+ Role = Literal["system", "user", "assistant"]
+
+
+ class Message(TypedDict):
+     role: Role
+     content: str
+
+
+ class CompletionPrediction(TypedDict, total=False):
+     generation: str
+     tokens: List[str]  # not required
+     logprobs: List[float]  # not required
+
+
+ class ChatPrediction(TypedDict, total=False):
+     generation: Message
+     tokens: List[str]  # not required
+     logprobs: List[float]  # not required
+
+
+ Dialog = List[Message]
+
+ B_INST, E_INST = "[INST]", "[/INST]"
+ B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
+
+ SPECIAL_TAGS = [B_INST, E_INST, "<<SYS>>", "<</SYS>>"]
+ UNSAFE_ERROR = "Error: special tags are not allowed as part of the prompt."
+
+
+ class Llama:
+     @staticmethod
+     def build(
+         ckpt_dir: str,
+         tokenizer_path: str,
+         max_seq_len: int,
+         max_batch_size: int,
+         model_parallel_size: Optional[int] = None,
+         seed: int = 1,
+     ) -> "Llama":
+         """
+         Build a Llama instance by initializing and loading a pre-trained model.
+
+         Args:
+             ckpt_dir (str): Path to the directory containing checkpoint files.
+             tokenizer_path (str): Path to the tokenizer file.
+             max_seq_len (int): Maximum sequence length for input text.
+             max_batch_size (int): Maximum batch size for inference.
+             model_parallel_size (Optional[int], optional): Number of model parallel processes.
+                 If not provided, it's determined from the environment. Defaults to None.
+
+         Returns:
+             Llama: An instance of the Llama class with the loaded model and tokenizer.
+
+         Raises:
+             AssertionError: If there are no checkpoint files in the specified directory,
+                 or if the model parallel size does not match the number of checkpoint files.
+
+         Note:
+             This method initializes the distributed process group, sets the device to CUDA,
+             and loads the pre-trained model and tokenizer.
+
+         """
+         if not torch.distributed.is_initialized():
+             torch.distributed.init_process_group("nccl")
+         if not model_parallel_is_initialized():
+             if model_parallel_size is None:
+                 model_parallel_size = int(os.environ.get("WORLD_SIZE", 1))
+             initialize_model_parallel(model_parallel_size)
+
+         local_rank = int(os.environ.get("LOCAL_RANK", 0))
+         torch.cuda.set_device(local_rank)
+
+         # seed must be the same in all processes
+         torch.manual_seed(seed)
+
+         if local_rank > 0:
+             sys.stdout = open(os.devnull, "w")
+
+         start_time = time.time()
+         checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
+         assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}"
+         assert model_parallel_size == len(
+             checkpoints
+         ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {model_parallel_size}"
+         ckpt_path = checkpoints[get_model_parallel_rank()]
+         checkpoint = torch.load(ckpt_path, map_location="cpu")
+         with open(Path(ckpt_dir) / "params.json", "r") as f:
+             params = json.loads(f.read())
+
+         model_args: ModelArgs = ModelArgs(
+             max_seq_len=max_seq_len,
+             max_batch_size=max_batch_size,
+             **params,
+         )
+         tokenizer = Tokenizer(model_path=tokenizer_path)
+         model_args.vocab_size = tokenizer.n_words
+         torch.set_default_tensor_type(torch.cuda.HalfTensor)
+         model = Transformer(model_args)
+         model.load_state_dict(checkpoint, strict=False)
+         print(f"Loaded in {time.time() - start_time:.2f} seconds")
+
+         return Llama(model, tokenizer)
+
+     def __init__(self, model: Transformer, tokenizer: Tokenizer):
+         self.model = model
+         self.tokenizer = tokenizer
+
+     @torch.inference_mode()
+     def generate(
+         self,
+         prompt_tokens: List[List[int]],
+         max_gen_len: int,
+         temperature: float = 0.6,
+         top_p: float = 0.9,
+         logprobs: bool = False,
+         echo: bool = False,
+     ) -> Tuple[List[List[int]], Optional[List[List[float]]]]:
+         """
+         Generate text sequences based on provided prompts using the language generation model.
+
+         Args:
+             prompt_tokens (List[List[int]]): List of tokenized prompts, where each prompt is represented as a list of integers.
+             max_gen_len (int): Maximum length of the generated text sequence.
+             temperature (float, optional): Temperature value for controlling randomness in sampling. Defaults to 0.6.
+             top_p (float, optional): Top-p probability threshold for nucleus sampling. Defaults to 0.9.
+             logprobs (bool, optional): Flag indicating whether to compute token log probabilities. Defaults to False.
+             echo (bool, optional): Flag indicating whether to include prompt tokens in the generated output. Defaults to False.
+
+         Returns:
+             Tuple[List[List[int]], Optional[List[List[float]]]]: A tuple containing generated token sequences and, if logprobs is True, corresponding token log probabilities.
+
+         Note:
+             This method uses the provided prompts as a basis for generating text. It employs nucleus sampling to produce text with controlled randomness.
+             If logprobs is True, token log probabilities are computed for each generated token.
+
+         """
+         params = self.model.params
+         bsz = len(prompt_tokens)
+         assert bsz <= params.max_batch_size, (bsz, params.max_batch_size)
+
+         min_prompt_len = min(len(t) for t in prompt_tokens)
+         max_prompt_len = max(len(t) for t in prompt_tokens)
+         assert max_prompt_len <= params.max_seq_len
+         total_len = min(params.max_seq_len, max_gen_len + max_prompt_len)
+
+         pad_id = self.tokenizer.pad_id
+         tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device="cuda")
+         for k, t in enumerate(prompt_tokens):
+             tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device="cuda")
+         if logprobs:
+             token_logprobs = torch.zeros_like(tokens, dtype=torch.float)
+
+         prev_pos = 0
+         eos_reached = torch.tensor([False] * bsz, device="cuda")
+         input_text_mask = tokens != pad_id
+         if min_prompt_len == total_len:
+             logits = self.model.forward(tokens, prev_pos).squeeze(2)
+             token_logprobs = -F.cross_entropy(
+                 input=logits.transpose(1, -1),
+                 target=tokens,
+                 reduction="none",
+                 ignore_index=pad_id,
+             )
+
+         for cur_pos in range(min_prompt_len, total_len):
+             logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos).squeeze(2)
+             if temperature > 0:
+                 probs = torch.softmax(logits[:, -1] / temperature, dim=-1)
+                 next_token = sample_top_p(probs, top_p)
+             else:
+                 next_token = torch.argmax(logits[:, -1], dim=-1)
+
+             next_token = next_token.reshape(-1)
+             # only replace token if prompt has already been generated
+             next_token = torch.where(
+                 input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token
+             )
+             tokens[:, cur_pos] = next_token
+             if logprobs:
+                 token_logprobs[:, prev_pos + 1 : cur_pos + 1] = -F.cross_entropy(
+                     input=logits.transpose(1, -1),
+                     target=tokens[:, prev_pos + 1 : cur_pos + 1],
+                     reduction="none",
+                     ignore_index=pad_id,
+                 )
+             eos_reached |= (~input_text_mask[:, cur_pos]) & (
+                 next_token == self.tokenizer.eos_id
+             )
+             prev_pos = cur_pos
+             if all(eos_reached):
+                 break
+
+         if logprobs:
+             token_logprobs = token_logprobs.tolist()
+         out_tokens, out_logprobs = [], []
+         for i, toks in enumerate(tokens.tolist()):
+             # cut to max gen len
+             start = 0 if echo else len(prompt_tokens[i])
+             toks = toks[start : len(prompt_tokens[i]) + max_gen_len]
+             probs = None
+             if logprobs:
+                 probs = token_logprobs[i][start : len(prompt_tokens[i]) + max_gen_len]
+             # cut to eos tok if any
+             if self.tokenizer.eos_id in toks:
+                 eos_idx = toks.index(self.tokenizer.eos_id)
+                 toks = toks[:eos_idx]
+                 probs = probs[:eos_idx] if logprobs else None
+             out_tokens.append(toks)
+             out_logprobs.append(probs)
+         return (out_tokens, out_logprobs if logprobs else None)
+
+     def text_completion(
+         self,
+         prompts: List[str],
+         temperature: float = 0.6,
+         top_p: float = 0.9,
+         max_gen_len: Optional[int] = None,
+         logprobs: bool = False,
+         echo: bool = False,
+     ) -> List[CompletionPrediction]:
+         """
+         Perform text completion for a list of prompts using the language generation model.
+
+         Args:
+             prompts (List[str]): List of text prompts for completion.
+             temperature (float, optional): Temperature value for controlling randomness in sampling. Defaults to 0.6.
+             top_p (float, optional): Top-p probability threshold for nucleus sampling. Defaults to 0.9.
+             max_gen_len (Optional[int], optional): Maximum length of the generated completion sequence.
+                 If not provided, it's set to the model's maximum sequence length minus 1.
+             logprobs (bool, optional): Flag indicating whether to compute token log probabilities. Defaults to False.
+             echo (bool, optional): Flag indicating whether to include prompt tokens in the generated output. Defaults to False.
+
+         Returns:
+             List[CompletionPrediction]: List of completion predictions, each containing the generated text completion.
+
+         Note:
+             This method generates text completions for the provided prompts, employing nucleus sampling to introduce controlled randomness.
+             If logprobs is True, token log probabilities are computed for each generated token.
+
+         """
+         if max_gen_len is None:
+             max_gen_len = self.model.params.max_seq_len - 1
+         prompt_tokens = [self.tokenizer.encode(x, bos=True, eos=False) for x in prompts]
+         generation_tokens, generation_logprobs = self.generate(
+             prompt_tokens=prompt_tokens,
+             max_gen_len=max_gen_len,
+             temperature=temperature,
+             top_p=top_p,
+             logprobs=logprobs,
+             echo=echo,
+         )
+         if logprobs:
+             return [
+                 {
+                     "generation": self.tokenizer.decode(t),
+                     "tokens": [self.tokenizer.decode(x) for x in t],
+                     "logprobs": logprobs_i,
+                 }
+                 for t, logprobs_i in zip(generation_tokens, generation_logprobs)
+             ]
+         return [{"generation": self.tokenizer.decode(t)} for t in generation_tokens]
+
+     def chat_completion(
+         self,
+         dialogs: List[Dialog],
+         temperature: float = 0.6,
+         top_p: float = 0.9,
+         max_gen_len: Optional[int] = None,
+         logprobs: bool = False,
+     ) -> List[ChatPrediction]:
+         """
+         Generate assistant responses for a list of conversational dialogs using the language generation model.
+
+         Args:
+             dialogs (List[Dialog]): List of conversational dialogs, where each dialog is a list of messages.
+             temperature (float, optional): Temperature value for controlling randomness in sampling. Defaults to 0.6.
+             top_p (float, optional): Top-p probability threshold for nucleus sampling. Defaults to 0.9.
+             max_gen_len (Optional[int], optional): Maximum length of the generated response sequence.
+                 If not provided, it's set to the model's maximum sequence length minus 1.
+             logprobs (bool, optional): Flag indicating whether to compute token log probabilities. Defaults to False.
+
+         Returns:
+             List[ChatPrediction]: List of chat predictions, each containing the assistant's generated response.
+
+         Raises:
+             AssertionError: If the last message in a dialog is not from the user.
+             AssertionError: If the dialog roles are not in the required 'user', 'assistant', and optional 'system' order.
+
+         Note:
+             This method generates assistant responses for the provided conversational dialogs.
+             It employs nucleus sampling to introduce controlled randomness in text generation.
+             If logprobs is True, token log probabilities are computed for each generated token.
+
+         """
+         if max_gen_len is None:
+             max_gen_len = self.model.params.max_seq_len - 1
+         prompt_tokens = []
+         unsafe_requests = []
+         for dialog in dialogs:
+             unsafe_requests.append(
+                 any([tag in msg["content"] for tag in SPECIAL_TAGS for msg in dialog])
+             )
+             if dialog[0]["role"] == "system":
+                 dialog = [
+                     {
+                         "role": dialog[1]["role"],
+                         "content": B_SYS
+                         + dialog[0]["content"]
+                         + E_SYS
+                         + dialog[1]["content"],
+                     }
+                 ] + dialog[2:]
+             assert all([msg["role"] == "user" for msg in dialog[::2]]) and all(
+                 [msg["role"] == "assistant" for msg in dialog[1::2]]
+             ), (
+                 "model only supports 'system', 'user' and 'assistant' roles, "
+                 "starting with 'system', then 'user' and alternating (u/a/u/a/u...)"
+             )
+             dialog_tokens: List[int] = sum(
+                 [
+                     self.tokenizer.encode(
+                         f"{B_INST} {(prompt['content']).strip()} {E_INST} {(answer['content']).strip()} ",
+                         bos=True,
+                         eos=True,
+                     )
+                     for prompt, answer in zip(
+                         dialog[::2],
+                         dialog[1::2],
+                     )
+                 ],
+                 [],
+             )
+             assert (
+                 dialog[-1]["role"] == "user"
+             ), f"Last message must be from user, got {dialog[-1]['role']}"
+             dialog_tokens += self.tokenizer.encode(
+                 f"{B_INST} {(dialog[-1]['content']).strip()} {E_INST}",
+                 bos=True,
+                 eos=False,
+             )
+             prompt_tokens.append(dialog_tokens)
+
+         generation_tokens, generation_logprobs = self.generate(
+             prompt_tokens=prompt_tokens,
+             max_gen_len=max_gen_len,
+             temperature=temperature,
+             top_p=top_p,
+             logprobs=logprobs,
+         )
+         if logprobs:
+             return [
+                 {
+                     "generation": {
+                         "role": "assistant",
+                         "content": self.tokenizer.decode(t)
+                         if not unsafe
+                         else UNSAFE_ERROR,
+                     },
+                     "tokens": [self.tokenizer.decode(x) for x in t],
+                     "logprobs": logprobs_i,
+                 }
+                 for t, logprobs_i, unsafe in zip(
+                     generation_tokens, generation_logprobs, unsafe_requests
+                 )
+             ]
+         return [
+             {
+                 "generation": {
+                     "role": "assistant",
+                     "content": self.tokenizer.decode(t) if not unsafe else UNSAFE_ERROR,
+                 }
+             }
+             for t, unsafe in zip(generation_tokens, unsafe_requests)
+         ]
+
+
+ def sample_top_p(probs, p):
+     """
+     Perform top-p (nucleus) sampling on a probability distribution.
+
+     Args:
+         probs (torch.Tensor): Probability distribution tensor.
+         p (float): Probability threshold for top-p sampling.
+
+     Returns:
+         torch.Tensor: Sampled token indices.
+
+     Note:
+         Top-p sampling selects the smallest set of tokens whose cumulative probability mass
+         exceeds the threshold p. The distribution is renormalized based on the selected tokens.
+
+     """
+     probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
+     probs_sum = torch.cumsum(probs_sort, dim=-1)
+     mask = probs_sum - probs_sort > p
+     probs_sort[mask] = 0.0
+     probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
+     next_token = torch.multinomial(probs_sort, num_samples=1)
+     next_token = torch.gather(probs_idx, -1, next_token)
+     return next_token
llama/model.py ADDED
@@ -0,0 +1,526 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
3
+
4
+ import math
5
+ from dataclasses import dataclass
6
+ from typing import Optional, Tuple
7
+
8
+ import fairscale.nn.model_parallel.initialize as fs_init
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from fairscale.nn.model_parallel.layers import (
12
+ ColumnParallelLinear,
13
+ ParallelEmbedding,
14
+ RowParallelLinear,
15
+ )
16
+ from torch import nn
17
+
18
+
19
+ @dataclass
20
+ class ModelArgs:
21
+ dim: int = 4096
22
+ n_layers: int = 32
23
+ n_heads: int = 32
24
+ n_kv_heads: Optional[int] = None
25
+ vocab_size: int = -1 # defined later by tokenizer
26
+ multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2
27
+ ffn_dim_multiplier: Optional[float] = None
28
+ norm_eps: float = 1e-5
29
+ n_future_tokens: int = 1
30
+ rope_theta: float = 10000.0
31
+
32
+ max_batch_size: int = 32
33
+ max_seq_len: int = 2048
34
+
35
+
36
+ class RMSNorm(torch.nn.Module):
37
+ def __init__(self, dim: int, eps: float = 1e-6):
38
+ """
39
+ Initialize the RMSNorm normalization layer.
40
+
41
+ Args:
42
+ dim (int): The dimension of the input tensor.
43
+ eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
44
+
45
+ Attributes:
46
+ eps (float): A small value added to the denominator for numerical stability.
47
+ weight (nn.Parameter): Learnable scaling parameter.
48
+
49
+ """
50
+ super().__init__()
51
+ self.eps = eps
52
+ self.weight = nn.Parameter(torch.ones(dim))
53
+
54
+ def _norm(self, x):
55
+ """
56
+ Apply the RMSNorm normalization to the input tensor.
57
+
58
+ Args:
59
+ x (torch.Tensor): The input tensor.
60
+
61
+ Returns:
62
+ torch.Tensor: The normalized tensor.
63
+
64
+ """
65
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
66
+
67
+ def forward(self, x):
68
+ """
69
+ Forward pass through the RMSNorm layer.
70
+
71
+ Args:
72
+ x (torch.Tensor): The input tensor.
73
+
74
+ Returns:
75
+ torch.Tensor: The output tensor after applying RMSNorm.
76
+
77
+ """
78
+ output = self._norm(x.float()).type_as(x)
79
+ return output * self.weight
80
+
81
+
82
+ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
83
+ """
84
+ Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
85
+
86
+ This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
87
+ and the end index 'end'. The 'theta' parameter scales the frequencies.
88
+ The returned tensor contains complex values in complex64 data type.
89
+
90
+ Args:
91
+ dim (int): Dimension of the frequency tensor.
92
+ end (int): End index for precomputing frequencies.
93
+ theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
94
+
95
+ Returns:
96
+ torch.Tensor: Precomputed frequency tensor with complex exponentials.
97
+
98
+
99
+
100
+
101
+ """
102
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
103
+ t = torch.arange(end, device=freqs.device) # type: ignore
104
+ freqs = torch.outer(t, freqs).float() # type: ignore
105
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64
106
+ return freqs_cis
107
+
108
+
109
+ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
110
+ """
111
+ Reshape frequency tensor for broadcasting it with another tensor.
112
+
113
+ This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
114
+ for the purpose of broadcasting the frequency tensor during element-wise operations.
115
+
116
+ Args:
117
+ freqs_cis (torch.Tensor): Frequency tensor to be reshaped.
118
+ x (torch.Tensor): Target tensor for broadcasting compatibility.
119
+
120
+ Returns:
121
+ torch.Tensor: Reshaped frequency tensor.
122
+
123
+ Raises:
124
+ AssertionError: If the frequency tensor doesn't match the expected shape.
125
+ AssertionError: If the target tensor 'x' doesn't have the expected number of dimensions.
126
+ """
127
+ ndim = x.ndim
128
+ assert 0 <= 1 < ndim
129
+ assert freqs_cis.shape == (x.shape[1], x.shape[-1])
130
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
131
+ return freqs_cis.view(*shape)
132
+
133
+
134
+ def apply_rotary_emb(
135
+ xq: torch.Tensor,
136
+ xk: torch.Tensor,
137
+ freqs_cis: torch.Tensor,
138
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
139
+ """
140
+ Apply rotary embeddings to input tensors using the given frequency tensor.
141
+
142
+ This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided
143
+ frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor
144
+ is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are
145
+ returned as real tensors.
146
+
147
+ Args:
148
+ xq (torch.Tensor): Query tensor to apply rotary embeddings.
149
+ xk (torch.Tensor): Key tensor to apply rotary embeddings.
150
+ freqs_cis (torch.Tensor): Precomputed frequency tensor for complex exponentials.
151
+
152
+ Returns:
153
+ Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
154
+
155
+
156
+
157
+ """
158
+ xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
159
+ xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
160
+ freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
161
+ xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
162
+ xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
163
+ return xq_out.type_as(xq), xk_out.type_as(xk)
164
+
165
+
166
+ def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
167
+ """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
168
+ bs, slen, n_kv_heads, head_dim = x.shape
169
+ if n_rep == 1:
170
+ return x
171
+ return (
172
+ x[:, :, :, None, :]
173
+ .expand(bs, slen, n_kv_heads, n_rep, head_dim)
174
+ .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
175
+ )
176
+
177
+
178
+ class Attention(nn.Module):
179
+ """Multi-head attention module."""
180
+ def __init__(self, args: ModelArgs):
181
+ """
182
+ Initialize the Attention module.
183
+
184
+ Args:
185
+ args (ModelArgs): Model configuration parameters.
186
+
187
+ Attributes:
188
+ n_kv_heads (int): Number of key and value heads.
189
+ n_local_heads (int): Number of local query heads.
190
+ n_local_kv_heads (int): Number of local key and value heads.
191
+ n_rep (int): Number of repetitions for local heads.
192
+ head_dim (int): Dimension size of each attention head.
193
+ wq (ColumnParallelLinear): Linear transformation for queries.
194
+ wk (ColumnParallelLinear): Linear transformation for keys.
195
+ wv (ColumnParallelLinear): Linear transformation for values.
196
+ wo (RowParallelLinear): Linear transformation for output.
197
+ cache_k (torch.Tensor): Cached keys for attention.
198
+ cache_v (torch.Tensor): Cached values for attention.
199
+
200
+ """
201
+ super().__init__()
202
+ self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
203
+ model_parallel_size = fs_init.get_model_parallel_world_size()
204
+ self.n_local_heads = args.n_heads // model_parallel_size
205
+ self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
206
+ self.n_rep = self.n_local_heads // self.n_local_kv_heads
207
+ self.head_dim = args.dim // args.n_heads
208
+
209
+ self.wq = ColumnParallelLinear(
210
+ args.dim,
211
+ args.n_heads * self.head_dim,
212
+ bias=False,
213
+ gather_output=False,
214
+ init_method=lambda x: x,
215
+ )
216
+ self.wk = ColumnParallelLinear(
217
+ args.dim,
218
+ self.n_kv_heads * self.head_dim,
219
+ bias=False,
220
+ gather_output=False,
221
+ init_method=lambda x: x,
222
+ )
223
+ self.wv = ColumnParallelLinear(
224
+ args.dim,
225
+ self.n_kv_heads * self.head_dim,
226
+ bias=False,
227
+ gather_output=False,
228
+ init_method=lambda x: x,
229
+ )
230
+ self.wo = RowParallelLinear(
231
+ args.n_heads * self.head_dim,
232
+ args.dim,
233
+ bias=False,
234
+ input_is_parallel=True,
235
+ init_method=lambda x: x,
236
+ )
237
+
238
+ self.cache_k = torch.zeros(
239
+ (
240
+ args.max_batch_size,
241
+ args.max_seq_len,
242
+ self.n_local_kv_heads,
243
+ self.head_dim,
244
+ )
245
+ ).cuda()
246
+ self.cache_v = torch.zeros(
247
+ (
248
+ args.max_batch_size,
249
+ args.max_seq_len,
250
+ self.n_local_kv_heads,
251
+ self.head_dim,
252
+ )
253
+ ).cuda()
254
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        start_pos: int,
+        freqs_cis: torch.Tensor,
+        mask: Optional[torch.Tensor],
+    ):
+        """
+        Forward pass of the attention module.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+            start_pos (int): Starting position for caching.
+            freqs_cis (torch.Tensor): Precomputed frequency tensor.
+            mask (torch.Tensor, optional): Attention mask tensor.
+
+        Returns:
+            torch.Tensor: Output tensor after attention.
+
+        """
+        bsz, seqlen, _ = x.shape
+        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
+
+        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
+        xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+        xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+
+        xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
+
+        self.cache_k = self.cache_k.to(xq)
+        self.cache_v = self.cache_v.to(xq)
+
+        self.cache_k[:bsz, start_pos : start_pos + seqlen] = xk
+        self.cache_v[:bsz, start_pos : start_pos + seqlen] = xv
+
+        keys = self.cache_k[:bsz, : start_pos + seqlen]
+        values = self.cache_v[:bsz, : start_pos + seqlen]
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        keys = repeat_kv(keys, self.n_rep)  # (bs, cache_len + seqlen, n_local_heads, head_dim)
+        values = repeat_kv(values, self.n_rep)  # (bs, cache_len + seqlen, n_local_heads, head_dim)
+
+        xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
+        keys = keys.transpose(1, 2)  # (bs, n_local_heads, cache_len + seqlen, head_dim)
+        values = values.transpose(1, 2)  # (bs, n_local_heads, cache_len + seqlen, head_dim)
+        scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if mask is not None:
+            scores = scores + mask  # (bs, n_local_heads, seqlen, cache_len + seqlen)
+        scores = F.softmax(scores.float(), dim=-1).type_as(xq)
+        output = torch.matmul(scores, values)  # (bs, n_local_heads, seqlen, head_dim)
+        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
+        return self.wo(output)
+
+
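The repeat_kv helper is defined earlier in model.py and not shown in this hunk. For reference, a minimal sketch consistent with the shape comments above, expanding each KV head n_rep times so grouped-query attention can reuse keys and values across query heads (equivalent to torch.repeat_interleave along the head dimension):

def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    # (bs, slen, n_kv_heads, head_dim) -> (bs, slen, n_kv_heads * n_rep, head_dim)
    bs, slen, n_kv_heads, head_dim = x.shape
    if n_rep == 1:
        return x
    return (
        x[:, :, :, None, :]
        .expand(bs, slen, n_kv_heads, n_rep, head_dim)
        .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
    )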
+class FeedForward(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+        multiple_of: int,
+        ffn_dim_multiplier: Optional[float],
+    ):
+        """
+        Initialize the FeedForward module.
+
+        Args:
+            dim (int): Input dimension.
+            hidden_dim (int): Hidden dimension of the feedforward layer.
+            multiple_of (int): Value to ensure hidden dimension is a multiple of this value.
+            ffn_dim_multiplier (float, optional): Custom multiplier for hidden dimension. Defaults to None.
+
+        Attributes:
+            w1 (ColumnParallelLinear): Linear transformation for the first layer.
+            w2 (RowParallelLinear): Linear transformation for the second layer.
+            w3 (ColumnParallelLinear): Linear transformation for the third layer.
+
+        """
+        super().__init__()
+        hidden_dim = int(2 * hidden_dim / 3)
+        # custom dim factor multiplier
+        if ffn_dim_multiplier is not None:
+            hidden_dim = int(ffn_dim_multiplier * hidden_dim)
+        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+
+        self.w1 = ColumnParallelLinear(
+            dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x
+        )
+        self.w2 = RowParallelLinear(
+            hidden_dim, dim, bias=False, input_is_parallel=True, init_method=lambda x: x
+        )
+        self.w3 = ColumnParallelLinear(
+            dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x
+        )
+
+    def forward(self, x):
+        return self.w2(F.silu(self.w1(x)) * self.w3(x))
+
+
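Worked through with example 7B-style hyperparameters (dim=4096, multiple_of=256, ffn_dim_multiplier=1.0), the rounding in the constructor above yields the familiar Llama FFN width:

dim, multiple_of, ffn_dim_multiplier = 4096, 256, 1.0  # example 7B-style values
hidden_dim = 4 * dim                    # 16384, as passed in by TransformerBlock
hidden_dim = int(2 * hidden_dim / 3)    # 10922
hidden_dim = int(ffn_dim_multiplier * hidden_dim)  # 10922
hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
print(hidden_dim)                       # 11008, rounded up to a multiple of 256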
+class TransformerBlock(nn.Module):
+    def __init__(self, layer_id: int, args: ModelArgs):
+        """
+        Initialize a TransformerBlock.
+
+        Args:
+            layer_id (int): Identifier for the layer.
+            args (ModelArgs): Model configuration parameters.
+
+        Attributes:
+            n_heads (int): Number of attention heads.
+            dim (int): Dimension size of the model.
+            head_dim (int): Dimension size of each attention head.
+            attention (Attention): Attention module.
+            feed_forward (FeedForward): FeedForward module.
+            layer_id (int): Identifier for the layer.
+            attention_norm (RMSNorm): Layer normalization for attention output.
+            ffn_norm (RMSNorm): Layer normalization for feedforward output.
+
+        """
+        super().__init__()
+        self.n_heads = args.n_heads
+        self.dim = args.dim
+        self.head_dim = args.dim // args.n_heads
+        self.attention = Attention(args)
+        self.feed_forward = FeedForward(
+            dim=args.dim,
+            hidden_dim=4 * args.dim,
+            multiple_of=args.multiple_of,
+            ffn_dim_multiplier=args.ffn_dim_multiplier,
+        )
+        self.layer_id = layer_id
+        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
+        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        start_pos: int,
+        freqs_cis: torch.Tensor,
+        mask: Optional[torch.Tensor],
+    ):
+        """
+        Perform a forward pass through the TransformerBlock.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+            start_pos (int): Starting position for attention caching.
+            freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.
+            mask (torch.Tensor, optional): Masking tensor for attention. Defaults to None.
+
+        Returns:
+            torch.Tensor: Output tensor after applying attention and feedforward layers.
+
+        """
+        h = x + self.attention(
+            self.attention_norm(x), start_pos, freqs_cis, mask
+        )
+        out = h + self.feed_forward(self.ffn_norm(h))
+        return out
+
+
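RMSNorm is likewise defined earlier in model.py and not shown in this hunk. A minimal sketch of the usual definition, consistent with how it is constructed above (a dim plus an eps of args.norm_eps); the authoritative version is the one in the file:

class RMSNorm(torch.nn.Module):
    # Sketch of the standard RMSNorm used by Llama-style models.
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = torch.nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize by the root-mean-square over the last dimension, then scale.
        norm = x.float() * torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + self.eps)
        return norm.type_as(x) * self.weight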
+class Transformer(nn.Module):
+    def __init__(self, params: ModelArgs):
+        """
+        Initialize a Transformer model.
+
+        Args:
+            params (ModelArgs): Model configuration parameters.
+
+        Attributes:
+            params (ModelArgs): Model configuration parameters.
+            vocab_size (int): Vocabulary size.
+            n_layers (int): Total number of layers in the model (including extra heads).
+            n_future_tokens (int): Number of prediction heads in the model (= 1 + `len(extra_heads)`).
+            tok_embeddings (ParallelEmbedding): Token embeddings.
+            layers (torch.nn.ModuleList): List of Transformer blocks (trunk + next-token head).
+            extra_heads (torch.nn.ModuleList): List of Transformer blocks
+                (additional prediction heads for multi-token prediction).
+            norm (RMSNorm): Layer normalization for the model output.
+            output (ColumnParallelLinear): Linear layer for final output.
+            freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.
+
+        """
+        super().__init__()
+        self.params = params
+        self.vocab_size = params.vocab_size
+        self.n_layers = params.n_layers
+        self.n_future_tokens = params.n_future_tokens
+
+        self.tok_embeddings = ParallelEmbedding(
+            params.vocab_size, params.dim, init_method=lambda x: x
+        )
+
+        self.layers = torch.nn.ModuleList()
+        for layer_id in range(params.n_layers - self.n_future_tokens + 1):
+            self.layers.append(TransformerBlock(layer_id, params))
+
+        # Additional prediction heads for multi-token prediction.
+        # `layer_id` counts contiguously from the first Transformer block.
+        self.extra_heads = torch.nn.ModuleList()
+        for layer_id in range(self.n_layers - self.n_future_tokens + 1, self.n_layers):
+            self.extra_heads.append(TransformerBlock(layer_id, params))
+
+        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
+        self.output = ColumnParallelLinear(
+            params.dim, params.vocab_size, bias=False, init_method=lambda x: x
+        )
+
+        self.freqs_cis = precompute_freqs_cis(
+            # Note that self.params.max_seq_len is multiplied by 2 because the token limit for the Llama 2 generation of models is 4096.
+            # Adding this multiplier instead of using 4096 directly allows for dynamism of token lengths while training or fine-tuning.
+            self.params.dim // self.params.n_heads, self.params.max_seq_len * 2, theta=self.params.rope_theta
+        )
+
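The split above keeps the total block count equal to n_layers. With illustrative values n_layers=32 and n_future_tokens=4 (the 4-token 7B setting):

n_layers, n_future_tokens = 32, 4            # example values
len_layers = n_layers - n_future_tokens + 1  # 29 blocks in self.layers
len_extra_heads = n_layers - len_layers      # 3 blocks in self.extra_heads
assert len_layers + len_extra_heads == n_layers
# forward() runs self.layers[:-1] (28 blocks) as the shared trunk, then treats
# self.layers[-1] plus the 3 extra heads as 4 parallel prediction heads.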
+    @torch.inference_mode()
+    def forward(self, tokens: torch.Tensor, start_pos: int, return_all_heads: bool = False):
+        """
+        Perform a forward pass through the Transformer model.
+
+        Args:
+            tokens (torch.Tensor): Input token indices.
+            start_pos (int): Starting position for attention caching.
+            return_all_heads (bool, optional): Whether to return logits
+                for all prediction heads. Defaults to False.
+
+        Returns:
+            torch.Tensor: Output logits after applying the Transformer model,
+                of shape (batch_size, seq_len, n_heads_used, vocab_size), where
+                n_heads_used is n_future_tokens if return_all_heads else 1.
+
+        Note:
+            If return_all_heads is False, the head dimension is a singleton, so
+            the output logits broadcast to (batch_size, seq_len, vocab_size) and
+            are compatible with standard decoding.
+        """
+        _bsz, seqlen = tokens.shape
+        h = self.tok_embeddings(tokens)
+        self.freqs_cis = self.freqs_cis.to(h.device)
+        freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen]
+
+        mask = None
+        if seqlen > 1:
+            mask = torch.full(
+                (seqlen, seqlen), float("-inf"), device=tokens.device
+            )
+
+            mask = torch.triu(mask, diagonal=1)
+
+            # When performing key-value caching, we compute the attention scores
+            # only for the new sequence. Thus, the matrix of scores is of size
+            # (seqlen, cache_len + seqlen), and the only masked entries are (i, j) for
+            # j > cache_len + i, since row i corresponds to token cache_len + i.
+            mask = torch.hstack([
+                torch.zeros((seqlen, start_pos), device=tokens.device),
+                mask
+            ]).type_as(h)
+
+        # Model trunk.
+        for layer in self.layers[:-1]:
+            h = layer(h, start_pos, freqs_cis, mask)
+        h_trunk = h
+
+        # Prediction heads.
+        latents = []
+        n_heads_to_use = self.n_future_tokens if return_all_heads else 1
+        prediction_heads = [self.layers[-1]] + list(self.extra_heads)
+        for layer in prediction_heads[:n_heads_to_use]:
+            h = layer(h_trunk, start_pos, freqs_cis, mask)
+            latents.append(h)
+
+        h = torch.stack(latents, dim=-2)  # (_bsz, seqlen, n_heads_to_use, dim)
+        h = self.norm(h)
+        output = self.output(h).float()
+        return output
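A usage sketch for this forward pass. The names model and tok are hypothetical (a constructed Transformer and the Tokenizer below), and model-parallel initialization is omitted:

import torch

# Hypothetical objects: `model` is a Transformer, `tok` is a Tokenizer.
tokens = torch.tensor([tok.encode("The capital of France is", bos=True, eos=False)]).cuda()

# Default path: one head, logits broadcastable to (bsz, seqlen, vocab_size).
logits = model(tokens, start_pos=0)          # (1, seqlen, 1, vocab_size)
next_id = logits[0, -1, 0].argmax().item()   # standard next-token decoding

# Multi-token path: head k predicts token t + k + 1, which is what
# self-speculative decoding schemes consume.
logits_all = model(tokens, start_pos=0, return_all_heads=True)
# logits_all.shape == (1, seqlen, model.n_future_tokens, vocab_size)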
llama/tokenizer.py ADDED
@@ -0,0 +1,68 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
+import os
+from logging import getLogger
+from typing import List
+
+from sentencepiece import SentencePieceProcessor
+
+
+logger = getLogger()
+
+
+class Tokenizer:
+    """tokenizing and encoding/decoding text using SentencePiece."""
+    def __init__(self, model_path: str):
+        """
+        Initializes the Tokenizer with a SentencePiece model.
+
+        Args:
+            model_path (str): The path to the SentencePiece model file.
+        """
+        # reload tokenizer
+        assert os.path.isfile(model_path), model_path
+        self.sp_model = SentencePieceProcessor(model_file=model_path)
+        logger.info(f"Reloaded SentencePiece model from {model_path}")
+
+        # BOS / EOS token IDs
+        self.n_words: int = self.sp_model.vocab_size()
+        self.bos_id: int = self.sp_model.bos_id()
+        self.eos_id: int = self.sp_model.eos_id()
+        self.pad_id: int = self.sp_model.pad_id()
+        logger.info(
+            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
+        )
+        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
+
+    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
+        """
+        Encodes a string into a list of token IDs.
+
+        Args:
+            s (str): The input string to be encoded.
+            bos (bool): Whether to prepend the beginning-of-sequence token.
+            eos (bool): Whether to append the end-of-sequence token.
+
+        Returns:
+            List[int]: A list of token IDs.
+        """
+        assert type(s) is str
+        t = self.sp_model.encode(s)
+        if bos:
+            t = [self.bos_id] + t
+        if eos:
+            t = t + [self.eos_id]
+        return t
+
+    def decode(self, t: List[int]) -> str:
+        """
+        Decodes a list of token IDs into a string.
+
+        Args:
+            t (List[int]): The list of token IDs to be decoded.
+
+        Returns:
+            str: The decoded string.
+        """
+        return self.sp_model.decode(t)
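A round-trip sketch; the path refers to the tokenizer.model file added below, and the exact token IDs depend on that SentencePiece model:

tok = Tokenizer(model_path="tokenizer.model")
ids = tok.encode("Hello world", bos=True, eos=False)  # [tok.bos_id, ...]
assert tok.decode(ids) == "Hello world"  # control tokens such as BOS are dropped on decode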
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723