trhacknon pseudotensor committed on
Commit d7185d6 · 0 Parent(s)

Duplicate from h2oai/h2ogpt-chatbot

Co-authored-by: Jonathan McKinney <[email protected]>

Files changed (11)
  1. .gitattributes +34 -0
  2. LICENSE +201 -0
  3. README.md +14 -0
  4. app.py +0 -0
  5. client_test.py +121 -0
  6. finetune.py +932 -0
  7. h2o-logo.svg +1 -0
  8. prompter.py +106 -0
  9. requirements.txt +50 -0
  10. stopping.py +139 -0
  11. utils.py +186 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,14 @@
1
+ ---
2
+ title: H2ogpt Chatbot
3
+ emoji: 📚
4
+ colorFrom: yellow
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 3.27.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ duplicated_from: h2oai/h2ogpt-chatbot
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
The diff for this file is too large to render. See raw diff
 
client_test.py ADDED
@@ -0,0 +1,121 @@
1
+ """
2
+ Client test.
3
+
4
+ Run server:
5
+
6
+ python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6.9b
7
+
8
+ NOTE: For private models, add --use_auth_token=True
9
+
10
+ NOTE: --infer_devices=True (default) must be used for multi-GPU if you see failures with cuda:x cuda:y mismatches.
11
+ Currently, this will force the model onto a single GPU.
12
+
13
+ Then run this client as:
14
+
15
+ python client_test.py
16
+
17
+
18
+
19
+ For HF spaces:
20
+
21
+ HOST="https://h2oai-h2ogpt-chatbot.hf.space" python client_test.py
22
+
23
+ Result:
24
+
25
+ Loaded as API: https://h2oai-h2ogpt-chatbot.hf.space ✔
26
+ {'instruction_nochat': 'Who are you?', 'iinput_nochat': '', 'response': 'I am h2oGPT, a large language model developed by LAION.'}
27
+
28
+
29
+ For demo:
30
+
31
+ HOST="https://gpt.h2o.ai" python client_test.py
32
+
33
+ Result:
34
+
35
+ Loaded as API: https://gpt.h2o.ai ✔
36
+ {'instruction_nochat': 'Who are you?', 'iinput_nochat': '', 'response': 'I am h2oGPT, a chatbot created by LAION.'}
37
+
38
+ """
39
+
40
+ debug = False
41
+
42
+ import os
43
+ os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
44
+
45
+
46
+ def get_client():
47
+ from gradio_client import Client
48
+
49
+ client = Client(os.getenv('HOST', "http://localhost:7860"))
50
+ if debug:
51
+ print(client.view_api(all_endpoints=True))
52
+ return client
53
+
54
+
55
+ def test_client_basic():
56
+ instruction = '' # only for chat=True
57
+ iinput = '' # only for chat=True
58
+ context = ''
59
+ # streaming output is supported, loops over and outputs each generation in streaming mode
60
+ # but leave stream_output=False for simple input/output mode
61
+ stream_output = False
62
+ prompt_type = 'human_bot'
63
+ temperature = 0.1
64
+ top_p = 0.75
65
+ top_k = 40
66
+ num_beams = 1
67
+ max_new_tokens = 50
68
+ min_new_tokens = 0
69
+ early_stopping = False
70
+ max_time = 20
71
+ repetition_penalty = 1.0
72
+ num_return_sequences = 1
73
+ do_sample = True
74
+ # only these 2 below used if pass chat=False
75
+ chat = False
76
+ instruction_nochat = "Who are you?"
77
+ iinput_nochat = ''
78
+
79
+ args = [instruction,
80
+ iinput,
81
+ context,
82
+ stream_output,
83
+ prompt_type,
84
+ temperature,
85
+ top_p,
86
+ top_k,
87
+ num_beams,
88
+ max_new_tokens,
89
+ min_new_tokens,
90
+ early_stopping,
91
+ max_time,
92
+ repetition_penalty,
93
+ num_return_sequences,
94
+ do_sample,
95
+ chat,
96
+ instruction_nochat,
97
+ iinput_nochat,
98
+ ]
99
+ api_name = '/submit_nochat'
100
+ client = get_client()
101
+ res = client.predict(
102
+ *tuple(args),
103
+ api_name=api_name,
104
+ )
105
+ res_dict = dict(instruction_nochat=instruction_nochat, iinput_nochat=iinput_nochat, response=md_to_text(res))
106
+ print(res_dict)
107
+ return res_dict
108
+
109
+
110
+ import markdown # pip install markdown
111
+ from bs4 import BeautifulSoup # pip install beautifulsoup4
112
+
113
+
114
+ def md_to_text(md):
115
+ html = markdown.markdown(md)
116
+ soup = BeautifulSoup(html, features='html.parser')
117
+ return soup.get_text()
118
+
119
+
120
+ if __name__ == '__main__':
121
+ test_client_basic()
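
For reference, a minimal sketch of the same request made directly with gradio_client (not part of the commit; it assumes a server is already running at HOST and exposes the /submit_nochat endpoint used above):

    import os
    from gradio_client import Client

    client = Client(os.getenv('HOST', "http://localhost:7860"))
    # positional arguments follow the same order as the args list in test_client_basic
    res = client.predict('', '', '', False, 'human_bot', 0.1, 0.75, 40, 1, 50, 0,
                         False, 20, 1.0, 1, True, False, "Who are you?", '',
                         api_name='/submit_nochat')
    print(res)
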
finetune.py ADDED
@@ -0,0 +1,932 @@
1
+ import os
2
+ import pathlib
3
+ import random
4
+ import shutil
5
+ import subprocess
6
+ import sys
7
+ import time
8
+ from datetime import datetime
9
+ from typing import List, Union
10
+ import fire
11
+ import numpy as np
12
+ import torch
13
+ from datasets import load_dataset, concatenate_datasets
14
+ import transformers
15
+ import torch.distributed as dist
16
+
17
+ from peft import (
18
+ prepare_model_for_int8_training,
19
+ LoraConfig,
20
+ get_peft_model,
21
+ get_peft_model_state_dict,
22
+ set_peft_model_state_dict,
23
+ )
24
+
25
+ from peft import mapping
26
+ lora_mappings = mapping.TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING
27
+
28
+
29
+ def log(*args, **kwargs):
30
+ if int(os.environ.get("LOCAL_RANK", 0)) == 0:
31
+ print(*args, **kwargs)
32
+
33
+
34
+ try:
35
+ import neptune
36
+ from transformers.integrations import NeptuneCallback
37
+
38
+ neptune_run = neptune.init_run(
39
+ source_files=[],
40
+ )
41
+ log("Connected to Neptune.")
42
+ except ImportError:
43
+ neptune_run = None
44
+ log("Please pip install neptune for tracking.")
45
+ except neptune.exceptions.NeptuneMissingApiTokenException:
46
+ neptune_run = None
47
+ os.environ["NEPTUNE_MODE"] = 'debug'
48
+ log("No neptune configured, set NEPTUNE_API_TOKEN env var.")
49
+
50
+ from enum import Enum
51
+
52
+
53
+ class PromptType(Enum):
54
+ plain = 0
55
+ instruct = 1
56
+ quality = 2
57
+ human_bot = 3
58
+ dai_faq = 4
59
+ summarize = 5
60
+ simple_instruct = 6
61
+ instruct_vicuna = 7
62
+ instruct_with_end = 8
63
+ human_bot_orig = 9
64
+
65
+
66
+ prompt_type_to_model_name = {
67
+ 'plain': [
68
+ 'EleutherAI/gpt-j-6B',
69
+ 'EleutherAI/pythia-6.9b',
70
+ 'EleutherAI/pythia-12b',
71
+ 'EleutherAI/pythia-12b-deduped',
72
+ 'EleutherAI/gpt-neox-20b',
73
+ 'decapoda-research/llama-7b-hf',
74
+ 'decapoda-research/llama-13b-hf',
75
+ 'decapoda-research/llama-30b-hf',
76
+ 'decapoda-research/llama-65b-hf',
77
+ 'facebook/mbart-large-50-many-to-many-mmt',
78
+ 'philschmid/bart-large-cnn-samsum',
79
+ 'philschmid/flan-t5-base-samsum',
80
+ 'gpt2',
81
+ 'distilgpt2',
82
+ ],
83
+ 'instruct': [],
84
+ 'instruct_with_end': ['databricks/dolly-v2-12b'],
85
+ 'quality': [],
86
+ 'human_bot': [
87
+ 'h2oai/h2ogpt-oasst1-512-12b',
88
+ 'h2oai/h2ogpt-oasst1-512-20b',
89
+ 'h2oai/h2ogpt-oig-oasst1-512-6.9b',
90
+ ],
91
+ 'dai_faq': [],
92
+ 'summarize': [],
93
+ 'simple_instruct': ['t5-small', 't5-large', 'google/flan-t5', 'google/flan-t5-xxl', 'google/flan-ul2'],
94
+ 'instruct_vicuna': ['AlekseyKorshuk/vicuna-7b'],
95
+ 'human_bot_orig': ['togethercomputer/GPT-NeoXT-Chat-Base-20B'],
96
+ }
97
+
98
+ inv_prompt_type_to_model_name = {v.strip(): k for k, l in prompt_type_to_model_name.items() for v in l}
99
+ inv_prompt_type_to_model_lower = {v.strip().lower(): k for k, l in prompt_type_to_model_name.items() for v in l}
100
+
101
+ human = '<human>:'
102
+ bot = "<bot>:"
103
+
104
+ prompt_types_strings = []
105
+ for p in PromptType:
106
+ prompt_types_strings.extend([p.name])
107
+
108
+
109
+ prompt_types = []
110
+ for p in PromptType:
111
+ prompt_types.extend([p.name, p.value, str(p.value)])
112
+
113
+
114
+ # supported by huggingface evaluate
115
+ supported_metrics = ['bleu', 'rouge', 'sacrebleu', 'meteor']
116
+
117
+
118
+ def train(
119
+ save_code: bool = False,
120
+ run_id: int = None,
121
+
122
+ base_model: str = 'h2oai/h2ogpt-oig-oasst1-512-6.9b',
123
+ # base_model: str = 'h2oai/h2ogpt-oasst1-512-12b',
124
+ # base_model: str = 'h2oai/h2ogpt-oasst1-512-20b',
125
+ # base_model: str = 'EleutherAI/gpt-neox-20b',
126
+ # base_model: str = 'EleutherAI/pythia-12b-deduped',
127
+ # base_model: str = 'togethercomputer/GPT-NeoXT-Chat-Base-20B',
128
+ # base_model: str = 'decapoda-research/llama-7b-hf',
129
+ # base_model: str = 'decapoda-research/llama-13b-hf',
130
+ # base_model: str = 'decapoda-research/llama-30b-hf',
131
+ # base_model: str = 'EleutherAI/gpt-j-6B',
132
+
133
+ # only needed if base_model is self-exported HF state without tokenizer
134
+ tokenizer_base_model: str = None,
135
+ # tokenizer_base_model: str = 'EleutherAI/gpt-neox-20b',
136
+
137
+ data_path: str = None,
138
+ data_col_dict: dict = None,
139
+ # data_path: str = "./dai_docs.train.json",
140
+ prompt_type: Union[str, int] = "plain", # "plain", "instruct", "quality", "human_bot", "dai_faq"
141
+
142
+ valid_path: str = None,
143
+ # valid_path: str = "./dai_docs.valid.json",
144
+
145
+ # data_mix_in_path: str = "laion/OIG", # way too big, medium quality
146
+ data_mix_in_path: str = "0-hero/OIG-small-chip2", # high quality, 50 MB, good enough for now
147
+ data_mix_in_factor: float = 0.0, # >1: more mix-in data, <1: more of data_path data
148
+ data_mix_in_col_dict: dict = {'user': 'instruction', 'chip2': 'output'},
149
+ data_mix_in_prompt_type: str = "instruct", # just instruction->output, same as instruct
150
+
151
+ output_dir: str = None,
152
+
153
+ # LoRA checkpoint continuation
154
+ lora_weights: str = "",
155
+
156
+ # batching training hyperparams
157
+ batch_size: int = 128,
158
+ micro_batch_size: int = 4,
159
+ gradient_checkpointing=False, # unnecessary with gradient accumulation enabled
160
+ fp16=True,
161
+
162
+ # general training hyperparams
163
+ num_epochs: float = 1,
164
+ learning_rate: float = 3e-4,
165
+
166
+ # validation settings
167
+ val_set_size: int = None,
168
+ val_metrics: List[str] = [],
169
+ eval_steps: int = None, # to control eval steps via steps
170
+ eval_epochs: float = None, # to control eval steps via epochs
171
+
172
+ # lora hyperparams
173
+ lora_r: int = 8,
174
+ lora_alpha: int = 16,
175
+ lora_dropout: float = 0.05,
176
+ lora_target_modules: List[str] = None,
177
+ llama_type: bool = None,
178
+
179
+ # llm hyperparams
180
+ train_on_inputs: bool = True, # if False, masks out inputs in loss
181
+ group_by_length: bool = False, # if True, faster, but produces an odd training loss curve
182
+ resume_from_checkpoint: str = None, # either training checkpoint or final adapter
183
+ cutoff_len: int = 1024, # Good default, especially when you have high-quality, non-trivial data
184
+
185
+ # torch training params
186
+ ddp: bool = True, # set to False if OOM with True, for multi-GPU model parallelism
187
+ local_files_only: bool = False, # else will download new versions, normally unwanted
188
+ resume_download: bool = True,
189
+ use_auth_token: Union[str, bool] = False, # True requires having run huggingface-cli login beforehand
190
+ warmup_steps: int = 100,
191
+ logging_steps: int = 1,
192
+ save_steps: int = None, # must be round multiple of eval_steps
193
+ add_eos_token: bool = False,
194
+ ):
195
+ # allow set token directly
196
+ use_auth_token = os.environ.get("HUGGINGFACE_API_TOKEN", use_auth_token)
197
+
198
+ prompt_type = str(prompt_type) # migration from integers
199
+ assert prompt_type in prompt_types
200
+
201
+ world_size = int(os.getenv("WORLD_SIZE", 1))
202
+ local_rank = int(os.getenv("LOCAL_RANK", 0))
203
+ rank = int(os.getenv("RANK", 0))
204
+ print(f"local_rank: {local_rank}")
205
+ print(f"global rank: {rank}")
206
+
207
+ gpus = max(world_size, torch.cuda.device_count())
208
+ run_id = run_id or 0
209
+ if not data_path:
210
+ raise ValueError("No data_path provided")
211
+ if not output_dir:
212
+ output_dir = f"{base_model.split('/')[-1]}.{data_path.replace('/', '')}.{num_epochs}_epochs.{get_githash() or 'nogit'}.{run_id}"
213
+ if os.path.exists(output_dir) and not resume_from_checkpoint:
214
+ raise FileExistsError(f"output_dir based on run_id {run_id} already exists. Please pick a different run_id.")
215
+ else:
216
+ if os.path.exists(output_dir) and not resume_from_checkpoint:
217
+ raise FileExistsError(f"output_dir {output_dir} already exists. Please pick a different output_dir, or specify a run_id instead.")
218
+ device_map = "auto"
219
+
220
+ if save_code:
221
+ copy_code(run_id)
222
+ if tokenizer_base_model is None:
223
+ tokenizer_base_model = base_model
224
+ if llama_type is None:
225
+ llama_type = "llama" in base_model.lower()
226
+ assert (
227
+ base_model
228
+ ), "Please specify a --base_model, e.g. --base_model='decapoda-research/llama-7b-hf'"
229
+ gradient_accumulation_steps = batch_size // micro_batch_size
230
+ assert gradient_accumulation_steps >= world_size, "must increase batch_size for multi-GPU"
231
+
232
+ device_map = "auto"
233
+
234
+ locals_dict = locals()
235
+ locals_print = '\n'.join(['%s: %s' % (k, v) for k, v in locals_dict.items()])
236
+ log(f"Training model with params:\n{locals_print}")
237
+ log("Command: %s\nHash: %s" % (str(' '.join(sys.argv)), get_githash()))
238
+
239
+ max_memory = None
240
+ if gpus > 1:
241
+ if ddp:
242
+ log("Distributed: data parallel")
243
+ device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
244
+ gradient_accumulation_steps = gradient_accumulation_steps // world_size
245
+ else:
246
+ free_in_GB = int(min(torch.cuda.mem_get_info()) / 1024 ** 3)
247
+ max_memory = f"{free_in_GB - 2}GB"
248
+ max_memory = {i: max_memory for i in range(gpus)}
249
+ log("world_size: %d" % world_size)
250
+ log("num_gpus: %d" % gpus)
251
+ log("max mem: %s" % max_memory)
252
+
253
+ model_loader, tokenizer_loader = get_loaders(llama_type=llama_type, model_name=base_model, reward_type=False)
254
+
255
+ model = model_loader.from_pretrained(
256
+ base_model,
257
+ load_in_8bit=True,
258
+ device_map=device_map,
259
+ torch_dtype=torch.float16,
260
+ max_memory=max_memory,
261
+ local_files_only=local_files_only,
262
+ resume_download=resume_download,
263
+ use_auth_token=use_auth_token,
264
+ )
265
+ if gpus > 1:
266
+ if not ddp:
267
+ log("model parallel")
268
+ model.is_parallelizable = True
269
+ model.model_parallel = True
270
+
271
+ tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model,
272
+ local_files_only=local_files_only,
273
+ resume_download=resume_download,
274
+ use_auth_token=use_auth_token)
275
+
276
+ tokenizer.pad_token_id = 0 # different from the eos token
277
+ # when generating, we will use the logits of right-most token to predict the next token
278
+ # so the padding should be on the left,
279
+ # e.g. see: https://huggingface.co/transformers/v4.11.3/model_doc/t5.html#inference
280
+ tokenizer.padding_side = "left" # Allow batched inference
281
+
282
+ def tokenize(prompt, add_eos_token=True):
283
+ # there's probably a way to do this with the tokenizer settings
284
+ # but again, gotta move fast
285
+ result = tokenizer(
286
+ prompt,
287
+ truncation=True,
288
+ max_length=cutoff_len,
289
+ padding=False,
290
+ return_tensors=None,
291
+ )
292
+ if (
293
+ result["input_ids"][-1] != tokenizer.eos_token_id
294
+ and len(result["input_ids"]) < cutoff_len
295
+ and add_eos_token
296
+ ):
297
+ result["input_ids"].append(tokenizer.eos_token_id)
298
+ result["attention_mask"].append(1)
299
+
300
+ result["labels"] = result["input_ids"].copy()
301
+
302
+ return result
303
+
304
+ def generate_and_tokenize_prompt(data_point, add_eos=add_eos_token):
305
+ full_prompt, _, _ = generate_prompt(data_point, prompt_type, False, False)
306
+ tokenized_full_prompt = tokenize(full_prompt)
307
+ if not train_on_inputs:
308
+ user_prompt, _, _ = generate_prompt({**data_point, "output": ""}, prompt_type, False, False)
309
+ tokenized_user_prompt = tokenize(user_prompt, add_eos_token=add_eos)
310
+ user_prompt_len = len(tokenized_user_prompt["input_ids"])
311
+ if add_eos:
312
+ user_prompt_len -= 1
313
+
314
+ # ignore_index=-100 ensures torch/tf don't include padding token id in CrossEntropyLoss
315
+ tokenized_full_prompt["labels"] = [
316
+ -100
317
+ ] * user_prompt_len + tokenized_full_prompt["labels"][
318
+ user_prompt_len:
319
+ ] # could be sped up, probably
320
+ return tokenized_full_prompt
321
+
322
+ if "gpt-neox" not in base_model or True:
323
+ model = prepare_model_for_int8_training(model)
324
+ else:
325
+ model = prepare_model_for_int8_training(
326
+ model,
327
+ output_embedding_layer_name="embed_out", # keep output logits in float32
328
+ layer_norm_names=["layer_norm", "layernorm"], # keep all layer norms in higher precision
329
+ )
330
+ if lora_weights:
331
+ from peft import PeftModel
332
+ model = PeftModel.from_pretrained(
333
+ model,
334
+ lora_weights,
335
+ torch_dtype=torch.float16,
336
+ device_map=device_map,
337
+ local_files_only=local_files_only,
338
+ resume_download=resume_download,
339
+ use_auth_token=use_auth_token,
340
+ )
341
+ else:
342
+ if lora_target_modules is None:
343
+ base_model_lower = base_model.lower()
344
+ if base_model_lower in lora_mappings:
345
+ lora_target_modules_cand = [lora_mappings[base_model_lower]]
346
+ else:
347
+ lora_target_modules_cand = [["query_key_value"], ["q_proj", "v_proj"]]
348
+ else:
349
+ lora_target_modules_cand = [lora_target_modules]
350
+
351
+ for lora_target_modules in lora_target_modules_cand:
352
+ try:
353
+ config = LoraConfig(
354
+ r=lora_r,
355
+ lora_alpha=lora_alpha,
356
+ target_modules=lora_target_modules,
357
+ lora_dropout=lora_dropout,
358
+ bias="none",
359
+ task_type="CAUSAL_LM",
360
+ )
361
+ model = get_peft_model(model, config)
362
+ break
363
+ except ValueError as e:
364
+ if "Target modules" in str(e) and "not found" in str(e):
365
+ continue
366
+ else:
367
+ raise
368
+ from peft import PeftModel
369
+ assert isinstance(model, PeftModel), "LoRA failed. Please provide --lora_target_modules explicitly."
370
+ if resume_from_checkpoint:
371
+ # Check the available weights and load them
372
+ checkpoint_name = os.path.join(
373
+ resume_from_checkpoint, "pytorch_model.bin"
374
+ ) # Full checkpoint
375
+ if not os.path.exists(checkpoint_name):
376
+ checkpoint_name = os.path.join(
377
+ resume_from_checkpoint, "adapter_model.bin"
378
+ ) # only LoRA model - LoRA config above has to fit
379
+ resume_from_checkpoint = False # So the trainer won't try loading its state
380
+ # The two files above have a different name depending on how they were saved, but are actually the same.
381
+ if os.path.exists(checkpoint_name):
382
+ log(f"Restarting from {checkpoint_name}")
383
+ adapters_weights = torch.load(checkpoint_name)
384
+ model = set_peft_model_state_dict(model, adapters_weights)
385
+ else:
386
+ log(f"Checkpoint {checkpoint_name} not found")
387
+
388
+ print(model)
389
+ model.print_trainable_parameters() # Be more transparent about the % of trainable params.
390
+
391
+ metrics = {}
392
+ for name in supported_metrics:
393
+ if name in val_metrics:
394
+ import evaluate # Causes hang for 'python generate.py' on dual 4090 if imported early, 100% reproducible
395
+ metrics[name] = evaluate.load(name)
396
+ log("Using Validation Metrics: %s" % str(list(metrics.keys())))
397
+ log("Supported Metrics: %s" % supported_metrics)
398
+
399
+ if val_set_size is None:
400
+ if len(metrics) == 0:
401
+ val_set_size = 1000
402
+ else:
403
+ val_set_size = 100
404
+ log("Auto set val_set_size %s" % val_set_size)
405
+ elif val_set_size < 1.0 and val_set_size != 0:
406
+ raise RuntimeError("Fractional validation size not supported.")
407
+
408
+ if valid_path:
409
+ data = load_dataset("json", data_files={"train": data_path, "valid": valid_path})
410
+ else:
411
+ if "json" in data_path:
412
+ data = load_dataset("json", data_files={"train": data_path})
413
+ else:
414
+ data = load_dataset(data_path)
415
+ data = data.rename_columns(data_col_dict or {})
416
+
417
+ valid_data = None
418
+ train_data_mix_in = None
419
+ valid_data_mix_in = None
420
+
421
+ if data_mix_in_path and data_mix_in_factor > 0:
422
+ # get mix-in training/validation data - to keep model "sane"
423
+ num_rows = data["train"].num_rows
424
+ log("Loading mix-in dataset: %s" % data_mix_in_path)
425
+ if "json" in data_mix_in_path:
426
+ data_mix_in = load_dataset("json", data_files={"train": data_mix_in_path})["train"]
427
+ else:
428
+ data_mix_in = load_dataset(data_mix_in_path)["train"] # can be large
429
+ data_mix_in = data_mix_in.rename_columns(data_mix_in_col_dict or {})
430
+
431
+ # only get as much as we need to balance
432
+ valid_size = min(data_mix_in.num_rows // 2, val_set_size or 0)
433
+ train_size = max(1, min(data_mix_in.num_rows - valid_size, int(num_rows * data_mix_in_factor)))
434
+ mixin_small = data_mix_in.train_test_split(
435
+ test_size=train_size + valid_size,
436
+ shuffle=True, seed=np.random.randint(10000),
437
+ )["test"]
438
+ if valid_size:
439
+ mixin_train_test = mixin_small.train_test_split(
440
+ test_size=valid_size, shuffle=False,
441
+ )
442
+ train_data_mix_in = mixin_train_test["train"]
443
+ valid_data_mix_in = mixin_train_test["test"]
444
+ else:
445
+ train_data_mix_in = mixin_small
446
+
447
+ if "prompt_type" not in train_data_mix_in.column_names:
448
+ train_data_mix_in = train_data_mix_in.add_column(
449
+ "prompt_type",
450
+ [data_mix_in_prompt_type] * train_data_mix_in.num_rows,
451
+ )
452
+ log("Added prompt type %s to mix-in training data" % data_mix_in_prompt_type)
453
+ if valid_data_mix_in and "prompt_type" not in valid_data_mix_in.column_names:
454
+ valid_data_mix_in = valid_data_mix_in.add_column(
455
+ "prompt_type",
456
+ [data_mix_in_prompt_type] * valid_data_mix_in.num_rows,
457
+ )
458
+ log("Added prompt type %s to mix-in validation data" % data_mix_in_prompt_type)
459
+ log("Created mix-in data:\nTrain %s\nValid %s" % (train_data_mix_in, valid_data_mix_in))
460
+
461
+ # get our own training/validation data - for fine-tuning
462
+ if val_set_size > 0 and not valid_path and not data_mix_in_path:
463
+ # create valid split from train
464
+ train_val = data["train"].train_test_split(
465
+ test_size=val_set_size, shuffle=True, seed=42
466
+ )
467
+ train_data = train_val["train"]
468
+ valid_data = train_val["test"]
469
+ else:
470
+ train_data = data["train"]
471
+ if valid_path:
472
+ # use given valid split, has priority over data_mix_in_path
473
+ valid_data = data["valid"]
474
+ if "prompt_type" not in train_data.column_names:
475
+ train_data = train_data.add_column(
476
+ "prompt_type",
477
+ [prompt_type] * train_data.num_rows,
478
+ )
479
+ log("Added prompt type %s to training data" % prompt_type)
480
+ if valid_data and "prompt_type" not in valid_data.column_names:
481
+ valid_data = valid_data.add_column(
482
+ "prompt_type",
483
+ [prompt_type] * valid_data.num_rows,
484
+ )
485
+ log("Added prompt type %s to validation data" % prompt_type)
486
+
487
+ assert train_data is not None
488
+
489
+ # shuffle and tokenize data
490
+ if train_data_mix_in:
491
+ train_data = concatenate_datasets([train_data, train_data_mix_in])
492
+ train_data = train_data.shuffle().map(generate_and_tokenize_prompt, num_proc=os.cpu_count() // torch.cuda.device_count())
493
+ train_set_size = len(train_data)
494
+
495
+ if valid_data and valid_data_mix_in:
496
+ valid_data = concatenate_datasets([valid_data, valid_data_mix_in])
497
+ elif valid_data_mix_in:
498
+ valid_data = valid_data_mix_in
499
+
500
+ if valid_data:
501
+ valid_data = valid_data.shuffle().map(generate_and_tokenize_prompt, num_proc=os.cpu_count() // torch.cuda.device_count())
502
+ val_set_size = len(valid_data)
503
+ else:
504
+ val_set_size = 0
505
+ log("Final fine-tuning data:\nTrain %s\nValid %s" % (train_data, valid_data))
506
+ sample_row_dict = train_data[:1]
507
+ del sample_row_dict['input_ids']
508
+ del sample_row_dict['attention_mask']
509
+ del sample_row_dict['labels']
510
+ log("Sample input: %s" % sample_row_dict)
511
+
512
+ if neptune_run:
513
+ neptune_callback = NeptuneCallback(run=neptune_run)
514
+ callbacks = [neptune_callback]
515
+ else:
516
+ from transformers.integrations import TensorBoardCallback, is_tensorboard_available
517
+ if is_tensorboard_available:
518
+ # tensorboard --logdir=runs/
519
+ from torch.utils.tensorboard import SummaryWriter
520
+ tb_writer = SummaryWriter()
521
+ callbacks = [TensorBoardCallback(tb_writer=tb_writer)]
522
+ else:
523
+ callbacks = []
524
+
525
+ expected_steps = (train_set_size * num_epochs) // batch_size
526
+ if eval_steps is None and eval_epochs is None:
527
+ # 20 evaluations for a run
528
+ eval_steps = max(1, int(expected_steps / 20))
529
+ log("Auto set eval_steps to %s out of %s total training steps" % (eval_steps, expected_steps))
530
+ elif eval_steps is None and eval_epochs is not None:
531
+ eval_steps = max(1, int(expected_steps * eval_epochs / num_epochs))
532
+ log("Auto converted eval_epochs=%s to eval_steps %s"
533
+ " out of %s total training steps" % (eval_epochs, eval_steps, expected_steps))
534
+ if save_steps is None:
535
+ save_steps = eval_steps
536
+ log("Auto set save_steps to %s" % save_steps)
537
+ elif save_steps > eval_steps:
538
+ # save steps must be round multiple of eval_steps
539
+ save_steps0 = save_steps
540
+ save_steps = max(1, (save_steps//eval_steps)) * eval_steps
541
+ if save_steps0 != save_steps:
542
+ log("Auto converted save_steps from %s to %s" % (save_steps0, save_steps))
543
+
544
+ def compute_metrics(eval_preds):
545
+ # e.g. see: https://huggingface.co/docs/transformers/v4.25.1/en/tasks/translation#evaluate
546
+ inputs = eval_preds.inputs
547
+ label_ids = eval_preds.label_ids
548
+ predictions = eval_preds.predictions
549
+
550
+ #inputs = np.where(inputs != -100, inputs, tokenizer.pad_token_id)
551
+ #decoded_inputs = tokenizer.batch_decode(inputs, skip_special_tokens=True)
552
+ #decoded_inputs = [pred.strip() for pred in decoded_inputs]
553
+
554
+ label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
555
+ # tokenizer behavior like generate time
556
+ decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True,
557
+ clean_up_tokenization_spaces=True)
558
+ decoded_labels = [pred.strip() for pred in decoded_labels]
559
+
560
+ predictions = np.argmax(predictions, -1)
561
+ predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
562
+ # tokenizer behavior like generate time
563
+ decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True,
564
+ clean_up_tokenization_spaces=True)
565
+ decoded_predictions = [pred.strip() for pred in decoded_predictions]
566
+
567
+ result = {}
568
+ for metric in metrics.values():
569
+ result1 = metric.compute(predictions=decoded_predictions, references=decoded_labels)
570
+ # get rid of lists, for precision etc., for now
571
+ numeric_results = {k: v for k, v in result1.items() if isinstance(v, (int, float))}
572
+ result.update(numeric_results)
573
+ return result
574
+
575
+ # the callback that computes metrics of interest
576
+ if val_metrics:
577
+ trainer_kwargs = dict(compute_metrics=compute_metrics)
578
+ else:
579
+ trainer_kwargs = dict()
580
+
581
+ trainer = transformers.Trainer(
582
+ model=model,
583
+ tokenizer=tokenizer,
584
+ train_dataset=train_data,
585
+ eval_dataset=valid_data,
586
+ # NOTE: CausalLM does not use the Seq2Seq-specific arguments of Seq2SeqTrainingArguments, but they are not incompatible
587
+ args=transformers.Seq2SeqTrainingArguments(
588
+ per_device_train_batch_size=micro_batch_size,
589
+ per_device_eval_batch_size=1,
590
+ eval_accumulation_steps=10,
591
+ # predict_with_generate=True, # SEQ2SEQ only
592
+ include_inputs_for_metrics=True,
593
+ gradient_accumulation_steps=gradient_accumulation_steps,
594
+ warmup_steps=warmup_steps,
595
+ num_train_epochs=num_epochs,
596
+ learning_rate=learning_rate,
597
+ gradient_checkpointing=gradient_checkpointing,
598
+ fp16=fp16,
599
+ # cosnider 8-bit adam: https://huggingface.co/docs/transformers/v4.18.0/en/performance#8bit-adam
600
+ optim="adamw_torch", # consider "adafactor" to save memory
601
+ logging_steps=logging_steps,
602
+ logging_strategy="steps",
603
+ evaluation_strategy="steps" if val_set_size > 0 else "no",
604
+ save_strategy="steps",
605
+ eval_steps=eval_steps if val_set_size > 0 else None,
606
+ save_steps=save_steps,
607
+ output_dir=output_dir,
608
+ save_total_limit=3,
609
+ load_best_model_at_end=True if val_set_size > 0 else False,
610
+ ddp_find_unused_parameters=False if ddp else None,
611
+ group_by_length=group_by_length,
612
+ #fsdp="shard_grad_op auto_wrap" if gpus > 1 and not ddp else None,
613
+ #fsdp_min_num_params=20000 if gpus > 1 and not ddp else None,
614
+ report_to='tensorboard' if not neptune_run else 'neptune',
615
+ ),
616
+ data_collator=transformers.DataCollatorForSeq2Seq(
617
+ tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
618
+ ),
619
+ callbacks=callbacks,
620
+ **trainer_kwargs,
621
+ )
622
+ model.config.use_cache = False
623
+
624
+ old_state_dict = model.state_dict
625
+ model.state_dict = (
626
+ lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
627
+ ).__get__(model, type(model))
628
+
629
+ if torch.__version__ >= "2" and sys.platform != "win32":
630
+ model = torch.compile(model)
631
+ # WIP (not generally replacing layers until pytorch 2.1)
632
+ torch.backends.cuda.enable_flash_sdp(True)
633
+
634
+ if gpus > 1 and not ddp:
635
+ assert trainer.is_model_parallel
636
+ else:
637
+ assert not trainer.is_model_parallel
638
+ trainer.train(resume_from_checkpoint=resume_from_checkpoint)
639
+
640
+ model.save_pretrained(output_dir)
641
+
642
+ log("\n If there's a warning about missing keys above, please disregard :)")
643
+
644
+
645
+ def get_loaders(llama_type, model_name, reward_type):
646
+ # NOTE: Some models need specific new prompt_type
647
+ # E.g. t5_xxl_true_nli_mixture has input format: "premise: PREMISE_TEXT hypothesis: HYPOTHESIS_TEXT".)
648
+ if llama_type:
649
+ from transformers import LlamaForCausalLM, LlamaTokenizer
650
+ model_loader = LlamaForCausalLM
651
+ tokenizer_loader = LlamaTokenizer
652
+ elif 'gpt2' in model_name.lower():
653
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer
654
+ return GPT2LMHeadModel, GPT2Tokenizer
655
+ elif 'mbart-' in model_name.lower():
656
+ from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
657
+ return MBartForConditionalGeneration, MBart50TokenizerFast
658
+ elif 't5' == model_name.lower() or \
659
+ 't5-' in model_name.lower() or \
660
+ 'flan-' in model_name.lower():
661
+ from transformers import AutoTokenizer, T5ForConditionalGeneration
662
+ return T5ForConditionalGeneration, AutoTokenizer
663
+ elif 'bigbird' in model_name:
664
+ from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer
665
+ return BigBirdPegasusForConditionalGeneration, AutoTokenizer
666
+ elif 'bart-large-cnn-samsum' in model_name or 'flan-t5-base-samsum' in model_name:
667
+ from transformers import pipeline
668
+ return pipeline, "summarization"
669
+ elif reward_type or 'OpenAssistant/reward-model'.lower() in model_name.lower():
670
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
671
+ return AutoModelForSequenceClassification, AutoTokenizer
672
+ else:
673
+ from transformers import AutoTokenizer, AutoModelForCausalLM
674
+ model_loader = AutoModelForCausalLM
675
+ tokenizer_loader = AutoTokenizer
676
+ return model_loader, tokenizer_loader
677
+
678
+
679
+ def get_githash():
680
+ try:
681
+ githash = subprocess.run(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE).stdout.decode('utf-8')[0:-1]
682
+ except:
683
+ githash = ''
684
+ return githash
685
+
686
+
687
+ def copy_code(run_id):
688
+ """
689
+ copy code to track changes
690
+ :param run_id:
691
+ :return:
692
+ """
693
+ rnd_num = str(random.randint(0, 2 ** 31))
694
+ run_id = 'run_' + str(run_id)
695
+ os.makedirs(run_id, exist_ok=True)
696
+ me_full = os.path.join(pathlib.Path(__file__).parent.resolve(), __file__)
697
+ me_file = os.path.basename(__file__)
698
+ new_me = os.path.join(run_id, me_file + '_' + get_githash())
699
+ if os.path.isfile(new_me):
700
+ new_me = os.path.join(run_id, me_file + '_' + get_githash() + '_' + rnd_num)
701
+ shutil.copy(me_full, new_me)
702
+ else:
703
+ shutil.copy(me_full, new_me)
704
+
705
+
706
+ def get_prompt(prompt_type, chat, context, reduced):
707
+ if prompt_type in [-1, "-1", "plain"]:
708
+ promptA = promptB = PreInstruct = PreInput = PreResponse = ''
709
+ terminate_response = []
710
+ elif prompt_type == 'simple_instruct':
711
+ promptA = promptB = PreInstruct = PreInput = PreResponse = None
712
+ terminate_response = []
713
+ elif prompt_type in [0, "0", "instruct"] or prompt_type in [7, "7", "instruct_with_end"]:
714
+ promptA = 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n' if not (chat and reduced) else ''
715
+ promptB = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n' if not (chat and reduced) else ''
716
+
717
+ PreInstruct = """
718
+ ### Instruction:
719
+ """
720
+
721
+ PreInput = """
722
+ ### Input:
723
+ """
724
+
725
+ PreResponse = """
726
+ ### Response:
727
+ """
728
+ if prompt_type in [7, "7", "instruct_with_end"]:
729
+ terminate_response = ['### End']
730
+ else:
731
+ terminate_response = None
732
+ elif prompt_type in [1, "1", "quality"]:
733
+ promptA = 'Write a detailed high-quality, accurate, fair, Response with about 100 words by following the Instruction as applied on the Input.\n' if not (chat and reduced) else ''
734
+ promptB = 'Write a detailed high-quality, accurate, fair, Response with about 100 words by following the Instruction.\n' if not (chat and reduced) else ''
735
+
736
+ PreInstruct = """
737
+ ### Instruction:
738
+ """
739
+
740
+ PreInput = """
741
+ ### Input:
742
+ """
743
+
744
+ PreResponse = """
745
+ ### Response:
746
+ """
747
+ terminate_response = None
748
+ elif prompt_type in [2, "2", "human_bot", 9, "9", "human_bot_orig"]:
749
+ if reduced or context or prompt_type in [2, "2", "human_bot"]:
750
+ preprompt = ''
751
+ else:
752
+ cur_date = time.strftime('%Y-%m-%d')
753
+ cur_time = time.strftime('%H:%M:%S %p %Z')
754
+
755
+ PRE_PROMPT = """\
756
+ Current Date: {}
757
+ Current Time: {}
758
+
759
+ """
760
+ preprompt = PRE_PROMPT.format(cur_date, cur_time)
761
+ start = human
762
+ promptB = promptA = '%s%s ' % (preprompt, start)
763
+
764
+ PreInstruct = ""
765
+
766
+ PreInput = None
767
+
768
+ PreResponse = bot
769
+
770
+ terminate_response = [start, PreResponse]
771
+ elif prompt_type in [3, "3", "dai_faq"]:
772
+ promptA = ''
773
+ promptB = 'Answer the following Driverless AI question.\n'
774
+
775
+ PreInstruct = """
776
+ ### Driverless AI frequently asked question:
777
+ """
778
+
779
+ PreInput = None
780
+
781
+ PreResponse = """
782
+ ### Driverless AI documentation answer:
783
+ """
784
+ terminate_response = ['\n\n']
785
+ elif prompt_type in [5, "5", "summarize"]:
786
+ promptA = promptB = PreInput = ''
787
+ PreInstruct = '## Main Text\n\n'
788
+ PreResponse = '\n\n## Summary\n\n'
789
+ terminate_response = None
790
+ elif prompt_type in [6, "6", "instruct_vicuna"]:
791
+ promptA = promptB = "A chat between a curious human and an artificial intelligence assistant. " \
792
+ "The assistant gives helpful, detailed, and polite answers to the human's questions." if not (chat and reduced) else ''
793
+
794
+ PreInstruct = """
795
+ ### Human:
796
+ """
797
+
798
+ PreInput = None
799
+
800
+ PreResponse = """
801
+ ### Assistant:
802
+ """
803
+ terminate_response = ['### Human:'] # only allow termination after the prompt is found correctly, else can't terminate
804
+ else:
805
+ raise RuntimeError("No such prompt_type=%s" % prompt_type)
806
+
807
+ return promptA, promptB, PreInstruct, PreInput, PreResponse, terminate_response
808
+
809
+
810
+ def generate_prompt(data_point, prompt_type, chat, reduced):
811
+ context = data_point.get('context')
812
+ if context is None:
813
+ context = ''
814
+ instruction = data_point.get('instruction')
815
+ input = data_point.get('input')
816
+ output = data_point.get('output')
817
+ prompt_type = data_point.get('prompt_type', prompt_type)
818
+ assert prompt_type in prompt_types, "Bad prompt type: %s" % prompt_type
819
+ promptA, promptB, PreInstruct, PreInput, PreResponse, terminate_response = get_prompt(prompt_type, chat, context, reduced)
820
+
821
+ prompt = context
822
+
823
+ if input and promptA:
824
+ prompt += f"""{promptA}"""
825
+ elif promptB:
826
+ prompt += f"""{promptB}"""
827
+
828
+ if instruction and PreInstruct is not None and input and PreInput is not None:
829
+ prompt += f"""{PreInstruct}{instruction}{PreInput}{input}"""
830
+ prompt = inject_newline(prompt_type, prompt)
831
+ elif instruction and input and PreInstruct is None and PreInput is not None:
832
+ prompt += f"""{PreInput}{instruction}
833
+ {input}"""
834
+ prompt = inject_newline(prompt_type, prompt)
835
+ elif input and instruction and PreInput is None and PreInstruct is not None:
836
+ prompt += f"""{PreInstruct}{instruction}
837
+ {input}"""
838
+ prompt = inject_newline(prompt_type, prompt)
839
+ elif instruction and PreInstruct is not None:
840
+ prompt += f"""{PreInstruct}{instruction}"""
841
+ prompt = inject_newline(prompt_type, prompt)
842
+ elif input and PreInput is not None:
843
+ prompt += f"""{PreInput}{input}"""
844
+ prompt = inject_newline(prompt_type, prompt)
845
+ elif input and instruction and PreInput is not None:
846
+ prompt += f"""{PreInput}{instruction}{input}"""
847
+ prompt = inject_newline(prompt_type, prompt)
848
+ elif input and instruction and PreInstruct is not None:
849
+ prompt += f"""{PreInstruct}{instruction}{input}"""
850
+ prompt = inject_newline(prompt_type, prompt)
851
+ elif input and instruction:
852
+ # i.e. for simple_instruct
853
+ prompt += f"""{instruction}: {input}"""
854
+ prompt = inject_newline(prompt_type, prompt)
855
+ elif input:
856
+ prompt += f"""{input}"""
857
+ prompt = inject_newline(prompt_type, prompt)
858
+ elif instruction:
859
+ prompt += f"""{instruction}"""
860
+ prompt = inject_newline(prompt_type, prompt)
861
+
862
+ if PreResponse is not None:
863
+ prompt += f"""{PreResponse}"""
864
+ pre_response = PreResponse # Don't use strip
865
+ else:
866
+ pre_response = ''
867
+
868
+ if output:
869
+ prompt += f"""{output}"""
870
+
871
+ return prompt, pre_response, terminate_response
872
+
873
+
874
+ def inject_newline(prompt_type, prompt):
875
+ if prompt_type not in [-1, '-1', 'plain', 'simple_instruct']:
876
+ # only add new line if structured prompt, while 'plain' is just generation of next tokens from input
877
+ prompt += '\n'
878
+ return prompt
879
+
880
+
881
+ example_data_point0 = dict(instruction="Summarize",
882
+ input="Ducks eat seeds by the lake, then swim in the lake where fish eat small animals.",
883
+ output="Ducks eat and swim at the lake.")
884
+
885
+ example_data_point1 = dict(instruction="Who is smarter, Einstein or Newton?",
886
+ output="Einstein.")
887
+
888
+ example_data_point2 = dict(input="Who is smarter, Einstein or Newton?",
889
+ output="Einstein.")
890
+
891
+ example_data_points = [example_data_point0, example_data_point1, example_data_point2]
892
+
893
+
894
+ def test_train_prompt(prompt_type='instruct', data_point=0):
895
+ example_data_point = example_data_points[data_point]
896
+ return generate_prompt(example_data_point, prompt_type, False, False)
897
+
898
+
899
+ def test_debug():
900
+ fire.Fire(train)
901
+
902
+
903
+ if __name__ == "__main__":
904
+ CONFIG = "NCCL_P2P_LEVEL=LOC WORLD_SIZE=5 torchrun --nnodes=5 --master_addr=10.10.10.2 --master_port=1111 --nproc_per_node=1"
905
+ CMD = "finetune.py --data_path=config.json --num_epochs=1 --base_model=decapoda-research/llama-13b-hf"
906
+ log(f"""
907
+ Example runs on 4 GPUs:
908
+ WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 finetune.py --base_model='decapoda-research/llama-7b-hf' --data_path=data/config.json --run_id=0 &> 0.log
909
+ WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 finetune.py --base_model='decapoda-research/llama-30b-hf' --data_path=data/config.json --batch_size=16 --micro_batch_size=1 --run_id=1 --save_code=True &> 1.log
910
+ WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 finetune.py --base_model='EleutherAI/gpt-j-6B' --data_path=data/config.json --run_id=2 &> 2.log
911
+ WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 finetune.py --base_model='EleutherAI/gpt-neox-20b' --data_path=data/config.json --run_id=8 --batch_size=16 --micro_batch_size=4 &> 8.log
912
+ WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 finetune.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --data_path=data/config.json --prompt_type='dai_faq' --run_id=13 --batch_size=16 --micro_batch_size=4 --num_epochs=100 --val_set_size=0 --data_mix_in_path='' &> 13.log
913
+ WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 finetune.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --data_path=data/config.json --run_id=28 --batch_size=16 --micro_batch_size=4 --num_epochs=8 --val_set_size=0 --data_mix_in_factor=0.1 --data_mix_in_prompt_type='human_bot' --save_code=True --cutoff_len=512 &> 28.log
914
+
915
+ All metrics:
916
+ CUDA_VISIBLE_DEVICES= finetune.py --data_mix_in_factor=0 --eval_steps=100 --warmup_steps=2 --val_set_size=100 --val_metrics="['bleu', 'rouge', 'sacrebleu', 'meteor']"
917
+
918
+ # Fine-tune 20B on 24GB GPUs across 3 nodes with 3+2+2 GPUs
919
+ rippa>
920
+ NCCL_P2P_LEVEL=LOC WORLD_SIZE=7 CUDA_VISIBLE_DEVICES="0,1,2" torchrun --node_rank 0 --nproc_per_node=3 --master_port=1234 --nnodes=3 --master_addr=10.10.10.2 finetune.py --data_path=merged_shuffled_OIG_87f6a1e788.json --micro_batch_size=1 --batch_size=7 --cutoff_len=512 --run_id=17 &>log.17.rank0
921
+ ova>
922
+ NCCL_P2P_LEVEL=LOC WORLD_SIZE=7 CUDA_VISIBLE_DEVICES="0,1" torchrun --node_rank 1 --nproc_per_node=2 --master_port=1234 --nnodes=3 --master_addr=10.10.10.2 finetune.py --data_path=merged_shuffled_OIG_87f6a1e788.json --micro_batch_size=1 --batch_size=7 --cutoff_len=512 --run_id=17 &>log.17.rank1
923
+ timemachine>
924
+ NCCL_P2P_LEVEL=LOC WORLD_SIZE=7 CUDA_VISIBLE_DEVICES="0,1" torchrun --node_rank 2 --nproc_per_node=2 --master_port=1234 --nnodes=3 --master_addr=10.10.10.2 finetune.py --data_path=merged_shuffled_OIG_87f6a1e788.json --micro_batch_size=1 --batch_size=7 --cutoff_len=512 --run_id=17 &>log.17.rank2
925
+
926
+ """, flush=True)
927
+
928
+ if os.environ.get("LOCAL_RANK") is None:
929
+ # not launched via torchrun, so no distributed training; require CUDA_VISIBLE_DEVICES to be set
930
+ assert os.environ.get("CUDA_VISIBLE_DEVICES") is not None, "Run the script using: torchrun finetune.py OR set CUDA_VISIBLE_DEVICES to a single GPU"
931
+
932
+ fire.Fire(train)
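A quick way to sanity-check the prompt templates is to call generate_prompt directly on one of the bundled example data points. This is a minimal sketch, assuming finetune.py and its dependencies are importable from the working directory; 'instruct' is simply the prompt type that test_train_prompt defaults to.

# Sketch: inspect the prompt built for example_data_point0 (assumes finetune.py is importable)
from finetune import generate_prompt, example_data_point0

prompt, pre_response, terminate_response = generate_prompt(example_data_point0, 'instruct', False, False)
print(prompt)
print("pre_response:", repr(pre_response))
print("terminate_response:", terminate_response)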
h2o-logo.svg ADDED
prompter.py ADDED
@@ -0,0 +1,106 @@
1
+ from finetune import generate_prompt
2
+
3
+
4
+ class Prompter(object):
5
+ def __init__(self, prompt_type, debug=False, chat=False, stream_output=False, repeat_penalty=True,
6
+ allowed_repeat_line_length=10):
7
+ self.prompt_type = prompt_type
8
+ data_point = dict(instruction='', input='', output='')
9
+ _, self.pre_response, self.terminate_response = generate_prompt(data_point, prompt_type, chat, False)
10
+ self.debug = debug
11
+ self.chat = chat
12
+ self.stream_output = stream_output
13
+ self.repeat_penalty = repeat_penalty
14
+ self.allowed_repeat_line_length = allowed_repeat_line_length
15
+
16
+ def generate_prompt(self, data_point):
17
+ reduced = False
18
+ prompt, _, _ = generate_prompt(data_point, self.prompt_type, self.chat, reduced)
19
+ if self.debug:
20
+ print("prompt: ", prompt, flush=True)
21
+ self.prompt = prompt
22
+ return prompt
23
+
24
+ def get_response(self, outputs, prompt=None, sanitize_bot_response=True):
25
+ if isinstance(outputs, str):
26
+ outputs = [outputs]
27
+ if self.debug:
28
+ print("output: ", '\n\n'.join(outputs), flush=True)
29
+ if prompt is not None:
30
+ self.prompt = prompt
+ prompt = self.prompt  # fall back to the prompt stored by generate_prompt if none was passed in
31
+
32
+ def clean_response(response):
33
+ meaningless_words = ['<pad>', '</s>', '<|endoftext|>', '”\n']
34
+ for word in meaningless_words:
35
+ response = response.replace(word, "")
36
+ if sanitize_bot_response:
37
+ from better_profanity import profanity
38
+ response = profanity.censor(response)
39
+ response = response.strip("\n")
40
+ return response
41
+
42
+ def clean_repeats(response):
43
+ lines = response.split('\n')
44
+ new_lines = []
45
+ [new_lines.append(line) for line in lines if
46
+ line not in new_lines or len(line) < self.allowed_repeat_line_length]
47
+ if self.debug and len(lines) != len(new_lines):
48
+ print("cleaned repeats: %s %s" % (len(lines), len(new_lines)), flush=True)
49
+ response = '\n'.join(new_lines)
50
+ return response
51
+
52
+ multi_output = len(outputs) > 1
53
+
54
+ for oi, output in enumerate(outputs):
55
+ if self.prompt_type in [0, '0', 'plain']:
56
+ output = clean_response(output)
57
+ else:
58
+ # find first instance of pre_response
59
+ # prompt sometimes has odd characters that mutate its length,
60
+ # so we can't go by length alone
61
+ if self.pre_response:
62
+ outputi = output.find(prompt)
63
+ if outputi >= 0:
64
+ output = output[outputi + len(prompt):]
65
+ allow_terminate = True
66
+ else:
67
+ # subtraction is risky due to space offsets sometimes, so only do if necessary
68
+ output = output[len(prompt) - len(self.pre_response):]
69
+ # [1] to avoid repeated pre_response, just take first (after prompt - pre_response for chat)
70
+ if self.pre_response in output:
71
+ output = output.split(self.pre_response)[1]
72
+ allow_terminate = True
73
+ else:
74
+ print("Failure of parsing: %s" % output, flush=True)
75
+ allow_terminate = False
76
+ else:
77
+ allow_terminate = True
78
+ output = output[len(prompt):]
79
+ # clean after subtract prompt out, so correct removal of pre_response
80
+ output = clean_response(output).strip()
81
+ if self.repeat_penalty:
82
+ output = clean_repeats(output).strip()
83
+ if self.terminate_response and allow_terminate:
84
+ finds = []
85
+ for term in self.terminate_response:
86
+ finds.append(output.find(term))
87
+ finds = [x for x in finds if x >= 0]
88
+ if len(finds) > 0:
89
+ termi = finds[0]
90
+ output = output[:termi].strip()
91
+ else:
92
+ output = output.strip()
93
+ else:
94
+ output = output.strip()
95
+ if multi_output:
96
+ # prefix with output counter
97
+ output = "\n=========== Output %d\n\n" % (1 + oi) + output
98
+ if oi > 0:
99
+ # postfix outputs with separator
100
+ output += '\n'
101
+ outputs[oi] = output
102
+ # join all outputs, only one extra new line between outputs
103
+ output = '\n'.join(outputs)
104
+ if self.debug:
105
+ print("outputclean: ", '\n\n'.join(outputs), flush=True)
106
+ return output
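The class above is normally paired with a model's raw decoded output: generate_prompt builds the model input, and get_response strips the prompt, the pre_response marker, repeated lines, and terminate strings from the generation. A minimal usage sketch, assuming prompter.py and finetune.py are importable and using a stand-in string in place of a real model generation:

from prompter import Prompter

prompter = Prompter('instruct', debug=True, chat=False)
data_point = dict(instruction="Summarize", input="Ducks eat seeds by the lake.", output='')
prompt = prompter.generate_prompt(data_point)
# stand-in for tokenizer.decode(model.generate(...)) output, which echoes the prompt for decoder-only models
raw_output = prompt + "Ducks eat at the lake."
print(prompter.get_response(raw_output, prompt=prompt))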
requirements.txt ADDED
@@ -0,0 +1,50 @@
1
+ # for generate (gradio server) and finetune
2
+ datasets==2.11.0
3
+ sentencepiece==0.1.97
4
+ accelerate==0.18.0
5
+ gradio==3.27.0
6
+ huggingface_hub==0.13.4
7
+ appdirs==1.4.4
8
+ fire==0.5.0
9
+ docutils==0.19
10
+ torch==2.0.0
11
+ evaluate==0.4.0
12
+ rouge_score==0.1.2
13
+ sacrebleu==2.3.1
14
+ scikit-learn==1.2.2
15
+ alt-profanity-check==1.2.2
16
+ better-profanity==0.6.1
17
+ numpy==1.24.2
18
+ pandas==2.0.0
19
+ matplotlib==3.7.1
20
+ loralib==0.1.1
21
+ bitsandbytes==0.38.1
22
+ git+https://github.com/huggingface/peft.git@098962fa6515f2e4fe83a757f5995d3ffbb1c373
23
+ transformers==4.28.1
24
+ tokenizers==0.13.3
25
+
26
+ # optional for generate
27
+ pynvml==11.5.0
28
+ psutil==5.9.4
29
+ boto3==1.26.101
30
+ botocore==1.29.101
31
+
32
+ # optional for finetune
33
+ tensorboard==2.12.1
34
+ neptune==1.1.1
35
+
36
+ # for gradio client
37
+ gradio_client==0.1.3
38
+ beautifulsoup4==4.12.2
39
+ markdown==3.4.1
40
+
41
+ # data and testing
42
+ pytest==7.2.2
43
+ pytest-xdist==3.2.1
44
+ nltk==3.8.1
45
+ textstat==0.7.3
46
+ pandoc==2.3
47
+ pypandoc==1.11
48
+ openpyxl==3.1.2
49
+ lm_dataformat==0.0.20
50
+ bioc==2.0
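These pins can drift from what is actually installed in a given environment. A small sketch to spot-check a few of them, assuming the distribution names match the pins above (the selection here is arbitrary):

from importlib.metadata import version, PackageNotFoundError

pins = {"transformers": "4.28.1", "gradio": "3.27.0", "torch": "2.0.0"}
for name, expected in pins.items():
    try:
        found = version(name)
        print(f"{name}: expected {expected}, found {found}")
    except PackageNotFoundError:
        print(f"{name}: not installed")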
stopping.py ADDED
@@ -0,0 +1,139 @@
1
+ import traceback
2
+ from queue import Queue
3
+ from threading import Thread
4
+ import collections.abc
5
+
6
+ import torch
7
+ from transformers import StoppingCriteria
8
+
9
+
10
+ class StoppingCriteriaSub(StoppingCriteria):
11
+
12
+ def __init__(self, stops=[], encounters=[]):
13
+ super().__init__()
14
+ assert len(stops) % len(encounters) == 0, "Number of stops must be a multiple of the number of encounters"
15
+ self.encounters = encounters
16
+ self.stops = [stop.to("cuda") for stop in stops]
17
+ self.num_stops = [0] * len(stops)
18
+
19
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
20
+ for stopi, stop in enumerate(self.stops):
21
+ if torch.all((stop == input_ids[0][-len(stop):])).item():
22
+ self.num_stops[stopi] += 1
23
+ if self.num_stops[stopi] >= self.encounters[stopi % len(self.encounters)]:
24
+ return True
25
+ # print("Tokens: %s" % input_ids[0].cpu().numpy(), flush=True)
26
+ # print("Stop Tokens: %s" % [x.cpu().numpy() for x in self.stops], flush=True)
27
+ return False
28
+
29
+
30
+ class Stream(StoppingCriteria):
31
+ """
32
+ This class can be used to run a callback during generation. Keep
33
+ in mind that for decoder-only transformers, the passed tokens include the initial prompt tokens.
34
+
35
+ Args:
36
+ func (`callable`):
37
+ A callable applied to the first sequence in the batch at every generation step
38
+ """
39
+
40
+ def __init__(self, func=None):
41
+ self.func = func
42
+
43
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
44
+ if self.func is not None:
45
+ # only consume first of multiple responses
46
+ self.func(input_ids[0])
47
+ return False
48
+
49
+
50
+ class CallbackToGenerator(collections.abc.Generator):
51
+ """
52
+ A generator wrapper for a function that invokes a callback multiple times.
53
+
54
+ Calling `send` on the generator passes a value back to the pending callback and
55
+ returns the value supplied to the next callback invocation.
56
+
57
+ Note this starts a background thread
58
+ """
59
+
60
+ def __init__(self, func, *args, callback=None, **kwargs):
61
+ self.func = func
62
+ self.args = args
63
+ self.kwargs = kwargs
64
+ self.callback = callback
65
+
66
+ self._ready_queue = Queue(1)
67
+ self._done_queue = Queue(1)
68
+ self._done_holder = [False]
69
+
70
+ # local to avoid reference cycles
71
+ ready_queue = self._ready_queue
72
+ done_queue = self._done_queue
73
+ done_holder = self._done_holder
74
+
75
+ def val_callback(value):
76
+ done_queue.put((False, value))
77
+ cmd, val = ready_queue.get()
78
+ if cmd == 'send':
79
+ return val
80
+ elif cmd == 'throw':
81
+ raise val
82
+ else:
83
+ assert False # pragma: no cover
84
+
85
+ def thread_func():
86
+ while True:
87
+ cmd, val = ready_queue.get()
88
+ if cmd == 'send' and val is not None:
89
+ done_queue.put((True, TypeError("can't send non-None value to a just-started generator")))
90
+ continue
91
+ break
92
+ try:
93
+ if cmd == 'throw':
94
+ raise val
95
+ ret = func(callback=val_callback, **self.kwargs)
96
+ raise StopIteration(ret) if ret is not None else StopIteration
97
+ except BaseException as e:
98
+ done_holder[0] = True
99
+ done_queue.put((True, e))
100
+
101
+ self._thread = Thread(target=thread_func)
102
+ self._thread.start()
103
+
104
+ def _put(self, *args):
105
+ if self._done_holder[0]:
106
+ raise StopIteration
107
+ self._ready_queue.put(args)
108
+ is_exception, val = self._done_queue.get()
109
+ if is_exception:
110
+ try:
111
+ raise val
112
+ finally:
113
+ # prevent val's traceback containing a reference cycle
114
+ del val
115
+ else:
116
+ return val
117
+
118
+ def send(self, value):
119
+ return self._put('send', value)
120
+
121
+ def throw(self, exc):
122
+ return self._put('throw', exc)
123
+
124
+ def close(self):
125
+ try:
126
+ self.throw(GeneratorExit)
127
+ except StopIteration:
128
+ self._thread.join()
129
+ except GeneratorExit:
130
+ self._thread.join()
131
+ except BaseException:
132
+ self._thread.join()
133
+ raise
134
+ else:
135
+ # yielded again, can't clean up the thread
136
+ raise RuntimeError('Task with callback ignored GeneratorExit')
137
+
138
+ def __del__(self):
139
+ self.close()
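StoppingCriteriaSub plugs into transformers generation via StoppingCriteriaList. A minimal sketch, assuming a CUDA device is available (the class moves stop tensors to "cuda") and an illustrative small model; the "<human>:" stop phrase mirrors the convention used elsewhere in this repo:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteriaList
from stopping import StoppingCriteriaSub

model_name = "EleutherAI/gpt-neo-125M"  # illustrative; any causal LM works
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")

# stop generation the first time the model emits "<human>:"
stop_ids = tokenizer("<human>:", return_tensors="pt").input_ids[0]
stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=[stop_ids], encounters=[1])])

inputs = tokenizer("<human>: Hello, who are you?\n<bot>:", return_tensors="pt").to("cuda")
output = model.generate(**inputs, max_new_tokens=64, stopping_criteria=stopping_criteria)
print(tokenizer.decode(output[0]))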
utils.py ADDED
@@ -0,0 +1,186 @@
1
+ import os
2
+ import gc
3
+ import random
4
+ import time
5
+ import traceback
6
+ import zipfile
7
+ from datetime import datetime
8
+ import filelock
9
+ import numpy as np
10
+ import pandas as pd
11
+ import torch
12
+
13
+
14
+ def set_seed(seed: int):
15
+ """
16
+ Sets the random seeds for numpy, random, and torch so results are the same on every run.
17
+ This is for REPRODUCIBILITY.
18
+ """
19
+ np.random.seed(seed)
20
+ random_state = np.random.RandomState(seed)
21
+ random.seed(seed)
22
+ torch.manual_seed(seed)
23
+ torch.cuda.manual_seed(seed)
24
+ torch.backends.cudnn.deterministic = True
25
+ torch.backends.cudnn.benchmark = False
26
+ os.environ['PYTHONHASHSEED'] = str(seed)
27
+ return random_state
28
+
29
+
30
+ def flatten_list(lis):
31
+ """Given a list, possibly nested to any level, return it flattened."""
32
+ new_lis = []
33
+ for item in lis:
34
+ if isinstance(item, list):
35
+ new_lis.extend(flatten_list(item))
36
+ else:
37
+ new_lis.append(item)
38
+ return new_lis
39
+
40
+
41
+ def clear_torch_cache():
42
+ if torch.cuda.is_available():
43
+ torch.cuda.empty_cache()
44
+ torch.cuda.ipc_collect()
45
+ gc.collect()
46
+
47
+
48
+ def system_info():
49
+ import psutil
50
+
51
+ system = {}
52
+ # https://stackoverflow.com/questions/48951136/plot-multiple-graphs-in-one-plot-using-tensorboard
53
+ # https://arshren.medium.com/monitoring-your-devices-in-python-5191d672f749
54
+ temps = psutil.sensors_temperatures(fahrenheit=False)
55
+ if 'coretemp' in temps:
56
+ coretemp = temps['coretemp']
57
+ temp_dict = {k.label: k.current for k in coretemp}
58
+ for k, v in temp_dict.items():
59
+ system['CPU_C/%s' % k] = v
60
+
61
+ # https://github.com/gpuopenanalytics/pynvml/blob/master/help_query_gpu.txt
62
+ from pynvml.smi import nvidia_smi
63
+ nvsmi = nvidia_smi.getInstance()
64
+
65
+ gpu_power_dict = {'W_gpu%d' % i: x['power_readings']['power_draw'] for i, x in
66
+ enumerate(nvsmi.DeviceQuery('power.draw')['gpu'])}
67
+ for k, v in gpu_power_dict.items():
68
+ system['GPU_W/%s' % k] = v
69
+
70
+ gpu_temp_dict = {'C_gpu%d' % i: x['temperature']['gpu_temp'] for i, x in
71
+ enumerate(nvsmi.DeviceQuery('temperature.gpu')['gpu'])}
72
+ for k, v in gpu_temp_dict.items():
73
+ system['GPU_C/%s' % k] = v
74
+
75
+ gpu_memory_free_dict = {'MiB_gpu%d' % i: x['fb_memory_usage']['free'] for i, x in
76
+ enumerate(nvsmi.DeviceQuery('memory.free')['gpu'])}
77
+ gpu_memory_total_dict = {'MiB_gpu%d' % i: x['fb_memory_usage']['total'] for i, x in
78
+ enumerate(nvsmi.DeviceQuery('memory.total')['gpu'])}
79
+ gpu_memory_frac_dict = {k: gpu_memory_free_dict[k] / gpu_memory_total_dict[k] for k in gpu_memory_total_dict}
80
+ for k, v in gpu_memory_frac_dict.items():
81
+ system['GPU_M/%s' % k] = v
82
+
83
+ return system
84
+
85
+
86
+ def system_info_print():
87
+ try:
88
+ df = pd.DataFrame.from_dict(system_info(), orient='index')
89
+ # avoid slamming GPUs
90
+ time.sleep(1)
91
+ return df.to_markdown()
92
+ except Exception as e:
93
+ return "Error: %s" % str(e)
94
+
95
+
96
+ def zip_data(root_dirs=None, zip_file=None, base_dir='./'):
97
+ try:
98
+ return _zip_data(zip_file=zip_file, base_dir=base_dir, root_dirs=root_dirs)
99
+ except Exception as e:
100
+ traceback.print_exc()
101
+ print('Exception in zipping: %s' % str(e))
102
+
103
+
104
+ def _zip_data(root_dirs=None, zip_file=None, base_dir='./'):
105
+ if zip_file is None:
106
+ datetime_str = str(datetime.now()).replace(" ", "_").replace(":", "_")
107
+ host_name = os.getenv('HF_HOSTNAME', 'emptyhost')
108
+ zip_file = "data_%s_%s.zip" % (datetime_str, host_name)
109
+ assert root_dirs is not None
110
+
111
+ with zipfile.ZipFile(zip_file, "w") as expt_zip:
112
+ for root_dir in root_dirs:
113
+ if root_dir is None:
114
+ continue
115
+ for root, d, files in os.walk(root_dir):
116
+ for file in files:
117
+ file_to_archive = os.path.join(root, file)
118
+ assert os.path.exists(file_to_archive)
119
+ path_to_archive = os.path.relpath(file_to_archive, base_dir)
120
+ expt_zip.write(filename=file_to_archive, arcname=path_to_archive)
121
+ return zip_file, zip_file
122
+
123
+
124
+ def save_generate_output(output=None, base_model=None, save_dir=None):
125
+ try:
126
+ return _save_generate_output(output=output, base_model=base_model, save_dir=save_dir)
127
+ except Exception as e:
128
+ traceback.print_exc()
129
+ print('Exception in saving: %s' % str(e))
130
+
131
+
132
+ def _save_generate_output(output=None, base_model=None, save_dir=None):
133
+ """
134
+ Save conversation to .json, row by row.
135
+ save_dir is the directory holding the output history.json; it is created if it does not exist.
136
+ Appends if the file already exists.
137
+ """
138
+ assert save_dir, "save_dir must be provided"
139
+ if os.path.exists(save_dir) and not os.path.isdir(save_dir):
140
+ raise RuntimeError("save_dir already exists and is not a directory!")
141
+ os.makedirs(save_dir, exist_ok=True)
142
+ import json
143
+ if output[-10:] == '\n\n<human>:':
144
+ # remove trailing <human>:
145
+ output = output[:-10]
146
+ with filelock.FileLock("save_dir.lock"):
147
+ # lock logging in case have concurrency
148
+ with open(os.path.join(save_dir, "history.json"), "a") as f:
149
+ # just add [ at start, and ] at end, and have proper JSON dataset
150
+ f.write(
151
+ " " + json.dumps(
152
+ dict(text=output, time=time.ctime(), base_model=base_model)
153
+ ) + ",\n"
154
+ )
155
+
156
+
157
+ def s3up(filename):
158
+ try:
159
+ return _s3up(filename)
160
+ except Exception as e:
161
+ traceback.print_exc()
162
+ print('Exception for file %s in s3up: %s' % (filename, str(e)))
163
+ return "Failed to upload %s: Error: %s" % (filename, str(e))
164
+
165
+
166
+ def _s3up(filename):
167
+ import boto3
168
+
169
+ aws_access_key_id = os.getenv('AWS_SERVER_PUBLIC_KEY')
170
+ aws_secret_access_key = os.getenv('AWS_SERVER_SECRET_KEY')
171
+ bucket = os.getenv('AWS_BUCKET')
172
+ assert aws_access_key_id, "Set AWS key"
173
+ assert aws_secret_access_key, "Set AWS secret"
174
+ assert bucket, "Set AWS Bucket"
175
+
176
+ s3 = boto3.client('s3',
177
+ aws_access_key_id=aws_access_key_id,
178
+ aws_secret_access_key=aws_secret_access_key,
179
+ )
180
+ ret = s3.upload_file(
181
+ Filename=filename,
182
+ Bucket=bucket,
183
+ Key=filename,
184
+ )
185
+ if ret in [None, '']:
186
+ return "Successfully uploaded %s" % filename