Text Generation · PEFT · Safetensors · mistral · conversational · Eval Results
dfurman committed on
Commit 3f13085 · 1 Parent(s): b80a593

Delete assets/basic_inference_llama_2_13b_dolphin.ipynb

assets/basic_inference_llama_2_13b_dolphin.ipynb DELETED
@@ -1,179 +0,0 @@
- {
-   "nbformat": 4,
-   "nbformat_minor": 0,
-   "metadata": {
-     "colab": {
-       "provenance": [],
-       "gpuType": "A100"
-     },
-     "kernelspec": {
-       "name": "python3",
-       "display_name": "Python 3"
-     },
-     "language_info": {
-       "name": "python"
-     },
-     "accelerator": "GPU"
-   },
-   "cells": [
-     {
-       "cell_type": "code",
-       "execution_count": null,
-       "metadata": {
-         "id": "LqFeWyhye38d"
-       },
-       "outputs": [],
-       "source": [
-         "!pip install -q -U huggingface_hub peft transformers torch accelerate"
-       ]
-     },
-     {
-       "cell_type": "code",
-       "source": [
-         "!nvidia-smi\n"
-       ],
-       "metadata": {
-         "id": "y5FkaLZcfAHm"
-       },
-       "execution_count": null,
-       "outputs": []
-     },
-     {
-       "cell_type": "code",
-       "source": [
-         "import torch\n",
-         "from peft import PeftModel, PeftConfig\n",
-         "from transformers import AutoModelForCausalLM, AutoTokenizer\n"
-       ],
-       "metadata": {
-         "id": "EKXLttEgf06g"
-       },
-       "execution_count": null,
-       "outputs": []
-     },
-     {
-       "cell_type": "code",
-       "source": [
-         "!huggingface-cli login"
-       ],
-       "metadata": {
-         "id": "Q_8EpxK4gUZI"
-       },
-       "execution_count": null,
-       "outputs": []
-     },
-     {
-       "cell_type": "code",
-       "source": [
-         "peft_model_id = \"dfurman/llama-2-13b-dolphin-peft\"\n",
-         "config = PeftConfig.from_pretrained(peft_model_id)\n",
-         "\n",
-         "tokenizer = AutoTokenizer.from_pretrained(\n",
-         "    config.base_model_name_or_path,\n",
-         "    use_auth_token=True\n",
-         ")\n",
-         "tokenizer.pad_token = tokenizer.eos_token\n",
-         "model = AutoModelForCausalLM.from_pretrained(\n",
-         "    config.base_model_name_or_path,\n",
-         "    torch_dtype=torch.bfloat16,\n",
-         "    device_map=\"auto\",\n",
-         "    use_auth_token=True,\n",
-         ")\n",
-         "\n",
-         "# Load the Lora model\n",
-         "model = PeftModel.from_pretrained(model, peft_model_id)"
-       ],
-       "metadata": {
-         "id": "AGxrbUqDgD8D"
-       },
-       "execution_count": null,
-       "outputs": []
-     },
-     {
-       "cell_type": "code",
-       "source": [
-         "def llama_generate(\n",
-         "    model: AutoModelForCausalLM,\n",
-         "    tokenizer: AutoTokenizer,\n",
-         "    prompt: str,\n",
-         "    max_new_tokens: int = 128,\n",
-         "    temperature: int = 1.0,\n",
-         ") -> str:\n",
-         "    \"\"\"\n",
-         "    Initialize the pipeline\n",
-         "    Uses Hugging Face GenerationConfig defaults\n",
-         "        https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationConfig\n",
-         "    Args:\n",
-         "        model (transformers.AutoModelForCausalLM): Falcon model for text generation\n",
-         "        tokenizer (transformers.AutoTokenizer): Tokenizer for model\n",
-         "        prompt (str): Prompt for text generation\n",
-         "        max_new_tokens (int, optional): Max new tokens after the prompt to generate. Defaults to 128.\n",
-         "        temperature (float, optional): The value used to modulate the next token probabilities.\n",
-         "            Defaults to 1.0\n",
-         "    \"\"\"\n",
-         "    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
-         "\n",
-         "    inputs = tokenizer(\n",
-         "        [prompt],\n",
-         "        return_tensors=\"pt\",\n",
-         "        return_token_type_ids=False,\n",
-         "    ).to(\n",
-         "        device\n",
-         "    )  # tokenize inputs, load on device\n",
-         "\n",
-         "    # when running Torch modules in lower precision, it is best practice to use the torch.autocast context manager.\n",
-         "    with torch.autocast(\"cuda\", dtype=torch.bfloat16):\n",
-         "        response = model.generate(\n",
-         "            **inputs,\n",
-         "            max_new_tokens=max_new_tokens,\n",
-         "            temperature=temperature,\n",
-         "            return_dict_in_generate=True,\n",
-         "            eos_token_id=tokenizer.eos_token_id,\n",
-         "            pad_token_id=tokenizer.pad_token_id,\n",
-         "        )\n",
-         "\n",
-         "    decoded_output = tokenizer.decode(\n",
-         "        response[\"sequences\"][0],\n",
-         "        skip_special_tokens=True,\n",
-         "    )  # grab output in natural language\n",
-         "\n",
-         "    return decoded_output[len(prompt) :]  # remove prompt from output\n"
-       ],
-       "metadata": {
-         "id": "OQD_s1-egFjB"
-       },
-       "execution_count": null,
-       "outputs": []
-     },
-     {
-       "cell_type": "code",
-       "source": [
-         "prompt = \"Your are a helpful AI assistant. Write me a numbered list of things to do in New York City.\\n\"\n",
-         "\n",
-         "response = llama_generate(\n",
-         "    model,\n",
-         "    tokenizer,\n",
-         "    prompt,\n",
-         "    max_new_tokens=150,\n",
-         "    temperature=0.92,\n",
-         ")\n",
-         "\n",
-         "print(response)"
-       ],
-       "metadata": {
-         "id": "mKXUkc6BgjdL"
-       },
-       "execution_count": null,
-       "outputs": []
-     },
-     {
-       "cell_type": "code",
-       "source": [],
-       "metadata": {
-         "id": "JOgPF_UdgnWr"
-       },
-       "execution_count": null,
-       "outputs": []
-     }
-   ]
- }