dfurman committed
Commit 9bbf6f2 · 1 Parent(s): 5480208

Upload basic_inference_llama_2_13b_dolphin.ipynb

assets/basic_inference_llama_2_13b_dolphin.ipynb ADDED
@@ -0,0 +1,179 @@
+ {
+   "nbformat": 4,
+   "nbformat_minor": 0,
+   "metadata": {
+     "colab": {
+       "provenance": [],
+       "gpuType": "A100"
+     },
+     "kernelspec": {
+       "name": "python3",
+       "display_name": "Python 3"
+     },
+     "language_info": {
+       "name": "python"
+     },
+     "accelerator": "GPU"
+   },
+   "cells": [
+     {
+       "cell_type": "code",
+       "execution_count": null,
+       "metadata": {
+         "id": "LqFeWyhye38d"
+       },
+       "outputs": [],
+       "source": [
+         "!pip install -q -U huggingface_hub peft transformers torch accelerate"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "source": [
+         "!nvidia-smi\n"
+       ],
+       "metadata": {
+         "id": "y5FkaLZcfAHm"
+       },
+       "execution_count": null,
+       "outputs": []
+     },
+     {
+       "cell_type": "code",
+       "source": [
+         "import torch\n",
+         "from peft import PeftModel, PeftConfig\n",
+         "from transformers import AutoModelForCausalLM, AutoTokenizer\n"
+       ],
+       "metadata": {
+         "id": "EKXLttEgf06g"
+       },
+       "execution_count": null,
+       "outputs": []
+     },
+     {
+       "cell_type": "code",
+       "source": [
+         "!huggingface-cli login"
+       ],
+       "metadata": {
+         "id": "Q_8EpxK4gUZI"
+       },
+       "execution_count": null,
+       "outputs": []
+     },
+     {
+       "cell_type": "code",
+       "source": [
+         "peft_model_id = \"dfurman/llama-2-13b-dolphin-peft\"\n",
+         "config = PeftConfig.from_pretrained(peft_model_id)\n",
+         "\n",
+         "tokenizer = AutoTokenizer.from_pretrained(\n",
+         "    config.base_model_name_or_path,\n",
+         "    use_auth_token=True\n",
+         ")\n",
+         "tokenizer.pad_token = tokenizer.eos_token\n",
+         "model = AutoModelForCausalLM.from_pretrained(\n",
+         "    config.base_model_name_or_path,\n",
+         "    torch_dtype=torch.bfloat16,\n",
+         "    device_map=\"auto\",\n",
+         "    use_auth_token=True,\n",
+         ")\n",
+         "\n",
+         "# Load the LoRA model\n",
+         "model = PeftModel.from_pretrained(model, peft_model_id)"
+       ],
+       "metadata": {
+         "id": "AGxrbUqDgD8D"
+       },
+       "execution_count": null,
+       "outputs": []
+     },
+     {
+       "cell_type": "code",
+       "source": [
+         "def llama_generate(\n",
+         "    model: AutoModelForCausalLM,\n",
+         "    tokenizer: AutoTokenizer,\n",
+         "    prompt: str,\n",
+         "    max_new_tokens: int = 128,\n",
+         "    temperature: float = 1.0,\n",
+         ") -> str:\n",
+         "    \"\"\"\n",
+         "    Generate a response from the model for a given prompt.\n",
+         "    Uses Hugging Face GenerationConfig defaults\n",
+         "    https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationConfig\n",
+         "    Args:\n",
+         "        model (transformers.AutoModelForCausalLM): Llama 2 model for text generation\n",
+         "        tokenizer (transformers.AutoTokenizer): Tokenizer for model\n",
+         "        prompt (str): Prompt for text generation\n",
+         "        max_new_tokens (int, optional): Max new tokens after the prompt to generate. Defaults to 128.\n",
+         "        temperature (float, optional): The value used to modulate the next token probabilities.\n",
+         "            Defaults to 1.0.\n",
+         "    \"\"\"\n",
+         "    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+         "\n",
+         "    inputs = tokenizer(\n",
+         "        [prompt],\n",
+         "        return_tensors=\"pt\",\n",
+         "        return_token_type_ids=False,\n",
+         "    ).to(\n",
+         "        device\n",
+         "    )  # tokenize inputs, load on device\n",
+         "\n",
+         "    # When running Torch modules in lower precision, it is best practice to use the torch.autocast context manager.\n",
+         "    with torch.autocast(\"cuda\", dtype=torch.bfloat16):\n",
+         "        response = model.generate(\n",
+         "            **inputs,\n",
+         "            max_new_tokens=max_new_tokens,\n",
+         "            temperature=temperature,\n",
+         "            return_dict_in_generate=True,\n",
+         "            eos_token_id=tokenizer.eos_token_id,\n",
+         "            pad_token_id=tokenizer.pad_token_id,\n",
+         "        )\n",
+         "\n",
+         "    decoded_output = tokenizer.decode(\n",
+         "        response[\"sequences\"][0],\n",
+         "        skip_special_tokens=True,\n",
+         "    )  # grab output in natural language\n",
+         "\n",
+         "    return decoded_output[len(prompt) :]  # remove prompt from output\n"
+       ],
+       "metadata": {
+         "id": "OQD_s1-egFjB"
+       },
+       "execution_count": null,
+       "outputs": []
+     },
+     {
+       "cell_type": "code",
+       "source": [
+         "prompt = \"You are a helpful AI assistant. Write me a numbered list of things to do in New York City.\\n\"\n",
+         "\n",
+         "response = llama_generate(\n",
+         "    model,\n",
+         "    tokenizer,\n",
+         "    prompt,\n",
+         "    max_new_tokens=150,\n",
+         "    temperature=0.92,\n",
+         ")\n",
+         "\n",
+         "print(response)"
+       ],
+       "metadata": {
+         "id": "mKXUkc6BgjdL"
+       },
+       "execution_count": null,
+       "outputs": []
+     },
+     {
+       "cell_type": "code",
+       "source": [],
+       "metadata": {
+         "id": "JOgPF_UdgnWr"
+       },
+       "execution_count": null,
+       "outputs": []
+     }
+   ]
+ }