dfurman committed
Commit 9bbf6f2 · 1 Parent(s): 5480208

Upload basic_inference_llama_2_13b_dolphin.ipynb

assets/basic_inference_llama_2_13b_dolphin.ipynb ADDED
@@ -0,0 +1,179 @@
+ {
+   "nbformat": 4,
+   "nbformat_minor": 0,
+   "metadata": {
+     "colab": {
+       "provenance": [],
+       "gpuType": "A100"
+     },
+     "kernelspec": {
+       "name": "python3",
+       "display_name": "Python 3"
+     },
+     "language_info": {
+       "name": "python"
+     },
+     "accelerator": "GPU"
+   },
+   "cells": [
+     {
+       "cell_type": "code",
+       "execution_count": null,
+       "metadata": {
+         "id": "LqFeWyhye38d"
+       },
+       "outputs": [],
+       "source": [
+         "!pip install -q -U huggingface_hub peft transformers torch accelerate"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "source": [
+         "!nvidia-smi\n"
+       ],
+       "metadata": {
+         "id": "y5FkaLZcfAHm"
+       },
+       "execution_count": null,
+       "outputs": []
+     },
+     {
+       "cell_type": "code",
+       "source": [
+         "import torch\n",
+         "from peft import PeftModel, PeftConfig\n",
+         "from transformers import AutoModelForCausalLM, AutoTokenizer\n"
+       ],
+       "metadata": {
+         "id": "EKXLttEgf06g"
+       },
+       "execution_count": null,
+       "outputs": []
+     },
+     {
+       "cell_type": "code",
+       "source": [
+         "!huggingface-cli login"
+       ],
+       "metadata": {
+         "id": "Q_8EpxK4gUZI"
+       },
+       "execution_count": null,
+       "outputs": []
+     },
+     {
+       "cell_type": "code",
+       "source": [
+         "peft_model_id = \"dfurman/llama-2-13b-dolphin-peft\"\n",
+         "config = PeftConfig.from_pretrained(peft_model_id)\n",
+         "\n",
+         "tokenizer = AutoTokenizer.from_pretrained(\n",
+         "    config.base_model_name_or_path,\n",
+         "    use_auth_token=True\n",
+         ")\n",
+         "tokenizer.pad_token = tokenizer.eos_token\n",
+         "model = AutoModelForCausalLM.from_pretrained(\n",
+         "    config.base_model_name_or_path,\n",
+         "    torch_dtype=torch.bfloat16,\n",
+         "    device_map=\"auto\",\n",
+         "    use_auth_token=True,\n",
+         ")\n",
+         "\n",
+         "# Load the LoRA model\n",
+         "model = PeftModel.from_pretrained(model, peft_model_id)"
+       ],
+       "metadata": {
+         "id": "AGxrbUqDgD8D"
+       },
+       "execution_count": null,
+       "outputs": []
+     },
+     {
+       "cell_type": "code",
+       "source": [
+         "def llama_generate(\n",
+         "    model: AutoModelForCausalLM,\n",
+         "    tokenizer: AutoTokenizer,\n",
+         "    prompt: str,\n",
+         "    max_new_tokens: int = 128,\n",
+         "    temperature: float = 1.0,\n",
+         ") -> str:\n",
+         "    \"\"\"\n",
+         "    Generate a response from the model for a given prompt.\n",
+         "    Uses Hugging Face GenerationConfig defaults\n",
+         "    https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationConfig\n",
+         "    Args:\n",
+         "        model (transformers.AutoModelForCausalLM): Llama 2 model for text generation\n",
+         "        tokenizer (transformers.AutoTokenizer): Tokenizer for model\n",
+         "        prompt (str): Prompt for text generation\n",
+         "        max_new_tokens (int, optional): Max new tokens after the prompt to generate. Defaults to 128.\n",
+         "        temperature (float, optional): The value used to modulate the next token probabilities.\n",
+         "            Defaults to 1.0.\n",
+         "    \"\"\"\n",
+         "    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+         "\n",
+         "    inputs = tokenizer(\n",
+         "        [prompt],\n",
+         "        return_tensors=\"pt\",\n",
+         "        return_token_type_ids=False,\n",
+         "    ).to(\n",
+         "        device\n",
+         "    )  # tokenize inputs, load on device\n",
+         "\n",
+         "    # When running Torch modules in lower precision, it is best practice to use the torch.autocast context manager.\n",
+         "    with torch.autocast(\"cuda\", dtype=torch.bfloat16):\n",
+         "        response = model.generate(\n",
+         "            **inputs,\n",
+         "            max_new_tokens=max_new_tokens,\n",
+         "            temperature=temperature,\n",
+         "            return_dict_in_generate=True,\n",
+         "            eos_token_id=tokenizer.eos_token_id,\n",
+         "            pad_token_id=tokenizer.pad_token_id,\n",
+         "        )\n",
+         "\n",
+         "    decoded_output = tokenizer.decode(\n",
+         "        response[\"sequences\"][0],\n",
+         "        skip_special_tokens=True,\n",
+         "    )  # grab output in natural language\n",
+         "\n",
+         "    return decoded_output[len(prompt) :]  # remove prompt from output\n"
+       ],
+       "metadata": {
+         "id": "OQD_s1-egFjB"
+       },
+       "execution_count": null,
+       "outputs": []
+     },
+     {
+       "cell_type": "code",
+       "source": [
+         "prompt = \"You are a helpful AI assistant. Write me a numbered list of things to do in New York City.\\n\"\n",
+         "\n",
+         "response = llama_generate(\n",
+         "    model,\n",
+         "    tokenizer,\n",
+         "    prompt,\n",
+         "    max_new_tokens=150,\n",
+         "    temperature=0.92,\n",
+         ")\n",
+         "\n",
+         "print(response)"
+       ],
+       "metadata": {
+         "id": "mKXUkc6BgjdL"
+       },
+       "execution_count": null,
+       "outputs": []
+     },
+     {
+       "cell_type": "code",
+       "source": [],
+       "metadata": {
+         "id": "JOgPF_UdgnWr"
+       },
+       "execution_count": null,
+       "outputs": []
+     }
+   ]
+ }