{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "gpuClass": "standard", "widgets": { "application/vnd.jupyter.widget-state+json": { "c1f06c162a994fe39bc1c72dcd732eb5": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_0972d5d3a6c94e6aa5da01ac427bc98a", "IPY_MODEL_ad7adfc018ca4ebbbf582ea6e370dafe", "IPY_MODEL_5ed735ca184b45158e432a280e6c6b5c" ], "layout": "IPY_MODEL_8d9e6b2e8e3147118c319ba4788795c5" } }, "0972d5d3a6c94e6aa5da01ac427bc98a": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_826c9c8d73d448b182343775d0004feb", "placeholder": "​", "style": "IPY_MODEL_cb6b93777f914372bb582e331faaae17", "value": "Loading checkpoint shards: 100%" } }, "ad7adfc018ca4ebbbf582ea6e370dafe": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": 
"ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_71cfafa9755245de98399af9ea8a1cce", "max": 3, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_f01bf5b1b1b0433388823e3d3e2f7608", "value": 3 } }, "5ed735ca184b45158e432a280e6c6b5c": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_839b0090d16949f8ab5ca3f550759432", "placeholder": "​", "style": "IPY_MODEL_9e7bcd41202041eb91035eb005e2341f", "value": " 3/3 [00:26<00:00, 8.65s/it]" } }, "8d9e6b2e8e3147118c319ba4788795c5": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, 
"visibility": null, "width": null } }, "826c9c8d73d448b182343775d0004feb": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "cb6b93777f914372bb582e331faaae17": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "71cfafa9755245de98399af9ea8a1cce": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", 
"align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f01bf5b1b1b0433388823e3d3e2f7608": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "839b0090d16949f8ab5ca3f550759432": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, 
"justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "9e7bcd41202041eb91035eb005e2341f": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "markdown", "source": [ "# `transformers` meets `bitsandbytes` for democratizing Large Language Models (LLMs) through 4bit quantization - **Fork by [crumb](https://hf.co/crumbly) for GPT2-linear-XL**\n", "\n", "
\n", "\"drawing\"\n", "
\n", "\n", "Welcome to this notebook that goes through the recent `bitsandbytes` integration that includes the work that introduces 4bit quantization techniques with no performance degradation, for democratizing LLM inference and training.\n", "\n", "In this notebook, we will learn together how to load a large model in 4bit ~~(`gpt-neo-x-20b`)~~ (`gpt2-xl`) and train it using Google Colab and the PEFT library from Hugging Face 🤗.\n", "\n", "[In the general usage notebook](https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf?usp=sharing), you can learn how to properly load a model in 4bit with all its variants.\n", "\n", "If you liked the previous work for integrating [*LLM.int8*](https://arxiv.org/abs/2208.07339), you can have a look at the [introduction blogpost](https://huggingface.co/blog/hf-bitsandbytes-integration) to learn more about that quantization method.\n" ], "metadata": { "id": "XIyP_0r6zuVc" } }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "FuXIFTFapAMI", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "e9be514a-cf54-49d1-f359-6851312f4e65" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n" ] } ], "source": [ "!pip install -q -U bitsandbytes\n", "!pip install -q -U git+https://github.com/huggingface/transformers.git\n", "!pip install -q -U git+https://github.com/huggingface/peft.git\n", "!pip install -q -U git+https://github.com/huggingface/accelerate.git\n", "!pip install -q datasets\n", "!pip install -q wandb" ] }, { "cell_type": "markdown", "source": [ "First let's load the model we are going to use - GPT2-XL" ], "metadata": { "id": "MJ-5idQwzvg-" } }, { "cell_type": "code", "source": [ "import torch\n", "from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n", "\n", "# we'll use the bf16 version because it takes up 1/2 the space\n", "# and is quicker to download\n", "model_id = \"crumbly/gpt2-linear-xl-sharded-bf16\"\n", "bnb_config = BitsAndBytesConfig(\n", " load_in_4bit=True,\n", " bnb_4bit_use_double_quant=True,\n", " bnb_4bit_quant_type=\"nf4\",\n", " bnb_4bit_compute_dtype=torch.bfloat16\n", ")\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", "model = AutoModelForCausalLM.from_pretrained(model_id, device_map={\"\":0}, quantization_config=bnb_config, trust_remote_code=True)" ], "metadata": { "id": "E0Nl5mWL0k2T", "colab": { "base_uri": "https://localhost:8080/", "height": 153, "referenced_widgets": [ "c1f06c162a994fe39bc1c72dcd732eb5", "0972d5d3a6c94e6aa5da01ac427bc98a", "ad7adfc018ca4ebbbf582ea6e370dafe", "5ed735ca184b45158e432a280e6c6b5c", "8d9e6b2e8e3147118c319ba4788795c5", "826c9c8d73d448b182343775d0004feb", "cb6b93777f914372bb582e331faaae17", "71cfafa9755245de98399af9ea8a1cce", "f01bf5b1b1b0433388823e3d3e2f7608", "839b0090d16949f8ab5ca3f550759432", "9e7bcd41202041eb91035eb005e2341f" ] }, "outputId": "a550a9bf-0715-4be5-d03a-f5c348d0031b" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "A new version of the following files was downloaded from https://huggingface.co/crumbly/gpt2-linear-xl:\n", "- 
configuration_gpt2l.py\n", ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n", "A new version of the following files was downloaded from https://huggingface.co/crumbly/gpt2-linear-xl:\n", "- modeling_gpt2l.py\n", ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "Loading checkpoint shards: 0%| | 0/3 [00:00" ], "text/html": [ "\n", "
\n", " \n", " \n", " [64/64 04:43, Epoch 1/9223372036854775807]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining Loss
42.849300
82.507900
122.744300
162.537700
202.808800
242.619400
282.521000
322.543500
362.439600
402.369900
442.448100
482.389500
522.331100
562.366500
602.401100
642.153900

" ] }, "metadata": {} }, { "output_type": "execute_result", "data": { "text/plain": [ "TrainOutput(global_step=64, training_loss=2.5019835233688354, metrics={'train_runtime': 303.3326, 'train_samples_per_second': 1.688, 'train_steps_per_second': 0.211, 'total_flos': 802220553600000.0, 'train_loss': 2.5019835233688354, 'epoch': 1.0})" ] }, "metadata": {}, "execution_count": 8 } ] }, { "cell_type": "code", "source": [ "inputs = {k:v.cuda() for k,v in tokenizer(\"\"\"\n", "You are an AI assistant. You will be given a question. You must generate a short and factual answer.\n", "What is the capital city of France?\n", "\"\"\", return_tensors='pt').items()}\n", "outputs = model.generate(**inputs, max_new_tokens=16, temperature=0.5, do_sample=True)\n", "print(tokenizer.decode(outputs[0]), \"...\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wr6bfZ0wyk3c", "outputId": "ca4ede1b-7456-43ea-ce52-961c1383dff8" }, "execution_count": 16, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "\n", "You are an AI assistant. You will be given a question. You must generate a short and factual answer.\n", "What is the capital city of France?\n", "\n", "\n", "Paris\n", "\n", "Paris is the capital of France. 
The city is located ...\n" ] } ] }, { "cell_type": "markdown", "source": [ "To save your adapters, you can either use\n", "\n", "```python\n", "model.save_pretrained(\"local_folder\")\n", "```\n", "\n", "or push them to the hub with\n", "\n", "```python\n", "model.push_to_hub(\"myusername/my_repo\")\n", "```\n", "\n", "If you would like to merge the adapters into your model, you'll have to load the base model again without quantization, and merge them like this.\n", "\n", "```python\n", "from peft import PeftModel\n", "from transformers import AutoModelForCausalLM, AutoTokenizer\n", "\n", "model = AutoModelForCausalLM.from_pretrained(\"crumbly/gpt2-linear-xl-sharded-bf16\")\n", "model = PeftModel.from_pretrained(model, \"myusername/my_repo\")\n", "model = model.merge_and_unload()\n", "```\n", "\n", "You can then push that to the hub or save it to a local folder like before, but including all of the weights." ], "metadata": { "id": "NsGnWFe8mr0p" } } ] }