Delete assets/basic_inference_llama_2_13b_dolphin.ipynb

Browse files

Files changed (1) hide show

assets/basic_inference_llama_2_13b_dolphin.ipynb +0 -179

assets/basic_inference_llama_2_13b_dolphin.ipynb DELETED Viewed

@@ -1,179 +0,0 @@
-{
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": [],
-      "gpuType": "A100"
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    },
-    "accelerator": "GPU"
-  },
-  "cells": [
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "LqFeWyhye38d"
-      },
-      "outputs": [],
-      "source": [
-        "%pip install -q -U huggingface_hub peft transformers torch accelerate"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "!nvidia-smi\n"
-      ],
-      "metadata": {
-        "id": "y5FkaLZcfAHm"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "import torch\n",
-        "from peft import PeftModel, PeftConfig\n",
-        "from transformers import AutoModelForCausalLM, AutoTokenizer\n"
-      ],
-      "metadata": {
-        "id": "EKXLttEgf06g"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "!huggingface-cli login"
-      ],
-      "metadata": {
-        "id": "Q_8EpxK4gUZI"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# Adapter repo on the Hugging Face Hub; its config names the base model to load.\n",
-        "peft_model_id = \"dfurman/llama-2-13b-dolphin-peft\"\n",
-        "config = PeftConfig.from_pretrained(peft_model_id)\n",
-        "\n",
-        "# `token=True` reuses the credential stored by `huggingface-cli login`;\n",
-        "# it replaces the deprecated `use_auth_token` argument.\n",
-        "tokenizer = AutoTokenizer.from_pretrained(\n",
-        "    config.base_model_name_or_path,\n",
-        "    token=True,\n",
-        ")\n",
-        "# Reuse the EOS token for padding so that `tokenizer.pad_token_id` is set.\n",
-        "tokenizer.pad_token = tokenizer.eos_token\n",
-        "\n",
-        "model = AutoModelForCausalLM.from_pretrained(\n",
-        "    config.base_model_name_or_path,\n",
-        "    torch_dtype=torch.bfloat16,\n",
-        "    device_map=\"auto\",\n",
-        "    token=True,\n",
-        ")\n",
-        "\n",
-        "# Wrap the base model with the LoRA adapter weights\n",
-        "model = PeftModel.from_pretrained(model, peft_model_id)"
-      ],
-      "metadata": {
-        "id": "AGxrbUqDgD8D"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "def llama_generate(\n",
-        "    model: AutoModelForCausalLM,\n",
-        "    tokenizer: AutoTokenizer,\n",
-        "    prompt: str,\n",
-        "    max_new_tokens: int = 128,\n",
-        "    temperature: float = 1.0,\n",
-        ") -> str:\n",
-        "    \"\"\"\n",
-        "    Generate a completion for `prompt` and return only the newly generated text.\n",
-        "\n",
-        "    Apart from the arguments passed explicitly to `model.generate` below, generation\n",
-        "    relies on the Hugging Face GenerationConfig settings:\n",
-        "        https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationConfig\n",
-        "\n",
-        "    Args:\n",
-        "        model (transformers.AutoModelForCausalLM): Llama model for text generation\n",
-        "        tokenizer (transformers.AutoTokenizer): Tokenizer for model\n",
-        "        prompt (str): Prompt for text generation\n",
-        "        max_new_tokens (int, optional): Max new tokens after the prompt to generate. Defaults to 128.\n",
-        "        temperature (float, optional): The value used to modulate the next token probabilities.\n",
-        "            Defaults to 1.0\n",
-        "\n",
-        "    Returns:\n",
-        "        str: The generated continuation, decoded with special tokens skipped and\n",
-        "            without the prompt.\n",
-        "    \"\"\"\n",
-        "    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
-        "\n",
-        "    inputs = tokenizer(\n",
-        "        [prompt],\n",
-        "        return_tensors=\"pt\",\n",
-        "        return_token_type_ids=False,\n",
-        "    ).to(device)  # tokenize inputs, load on device\n",
-        "\n",
-        "    # number of prompt tokens; the generated sequence starts with these same tokens\n",
-        "    prompt_length = inputs[\"input_ids\"].shape[1]\n",
-        "\n",
-        "    # when running Torch modules in lower precision, it is best practice to use the torch.autocast context manager.\n",
-        "    with torch.autocast(\"cuda\", dtype=torch.bfloat16):\n",
-        "        response = model.generate(\n",
-        "            **inputs,\n",
-        "            max_new_tokens=max_new_tokens,\n",
-        "            temperature=temperature,\n",
-        "            return_dict_in_generate=True,\n",
-        "            eos_token_id=tokenizer.eos_token_id,\n",
-        "            pad_token_id=tokenizer.pad_token_id,\n",
-        "        )\n",
-        "\n",
-        "    # Drop the prompt by token count rather than by character count: the decoded\n",
-        "    # prompt is not guaranteed to match the input string character-for-character,\n",
-        "    # so slicing the decoded text by len(prompt) can cut at the wrong position.\n",
-        "    generated_tokens = response[\"sequences\"][0][prompt_length:]\n",
-        "\n",
-        "    return tokenizer.decode(\n",
-        "        generated_tokens,\n",
-        "        skip_special_tokens=True,\n",
-        "    )  # grab output in natural language\n"
-      ],
-      "metadata": {
-        "id": "OQD_s1-egFjB"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# Example prompt passed to the model\n",
-        "prompt = \"You are a helpful AI assistant. Write me a numbered list of things to do in New York City.\\n\"\n",
-        "\n",
-        "response = llama_generate(\n",
-        "    model,\n",
-        "    tokenizer,\n",
-        "    prompt,\n",
-        "    max_new_tokens=150,\n",
-        "    temperature=0.92,\n",
-        ")\n",
-        "\n",
-        "print(response)"
-      ],
-      "metadata": {
-        "id": "mKXUkc6BgjdL"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [],
-      "metadata": {
-        "id": "JOgPF_UdgnWr"
-      },
-      "execution_count": null,
-      "outputs": []
-    }
-  ]
-}