Ffftdtd5dtft committed
Commit e73380c (verified)
1 Parent(s): dbe2a2e

Create main.py

Files changed (1):
  1. main.py +241 -0
main.py ADDED
@@ -0,0 +1,241 @@
+ # Import the required libraries
+ from unsloth import FastLanguageModel
+ import torch
+ from dotenv import load_dotenv
+ import os
+ import gc
+
+ # Load environment variables
+ load_dotenv()
+ token = os.getenv("HF_TOKEN")
+
+ # Parameter configuration
+ max_seq_length = 2048
+ dtype = None  # None for auto-detection. Float16 for Tesla T4/V100, Bfloat16 for Ampere+
+ load_in_4bit = True  # Use 4-bit quantization to reduce memory usage
+ load_in_1bit = True  # Use 1-bit quantization for further memory savings
+ optimize_storage = True  # Optimize storage to minimize disk usage
+ optimize_ram = True  # Optimize RAM usage by offloading unused weights
+ optimize_model_space = True  # Optimize model space by removing unused components
+
+ # List of models pre-quantized in 4-bit and 1-bit
+ quantized_models = [
+     "unsloth/mistral-7b-bnb-4bit",
+     "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
+     "unsloth/llama-2-7b-bnb-4bit",
+     "unsloth/gemma-7b-bnb-4bit",
+     "unsloth/gemma-7b-it-bnb-4bit",
+     "unsloth/gemma-2b-bnb-4bit",
+     "unsloth/gemma-2b-it-bnb-4bit",
+     "unsloth/gemma-7b-bnb-1bit",  # 1-bit quantized model
+     "unsloth/gemma-2b-bnb-1bit",  # 1-bit quantized model
+ ]
+
+ # Load the model and tokenizer
+ model, tokenizer = FastLanguageModel.from_pretrained(
+     model_name="unsloth/gemma-7b-bnb-1bit",
+     max_seq_length=max_seq_length,
+     dtype=dtype,
+     load_in_4bit=load_in_4bit,
+     load_in_1bit=load_in_1bit,
+     optimize_storage=optimize_storage,
+     optimize_ram=optimize_ram,
+     optimize_model_space=optimize_model_space,  # Enable model-space optimization
+     token=token,
+ )
+
+ # Add LoRA adapters
+ model = FastLanguageModel.get_peft_model(
+     model,
+     r=16,
+     target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
+                     "gate_proj", "up_proj", "down_proj"],
+     lora_alpha=16,
+     lora_dropout=0,
+     bias="none",
+     use_gradient_checkpointing="unsloth",
+     random_state=3407,
+     use_rslora=False,
+     loftq_config=None,
+     optimize_1bit=True,  # Enable 1-bit optimization
+ )
+
+ # Storage, RAM, and model-space optimization
+ if optimize_storage or optimize_ram or optimize_model_space:
+     torch.cuda.empty_cache()
+     gc.collect()
+
+ # Remove unused components from the model to optimize space
+ def prune_model(model):
+     layers_to_keep = {"q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"}
+     # delattr only resolves direct (non-dotted) attribute names, so prune direct
+     # children; removing modules the forward pass still needs will break the model.
+     for name, module in list(model.named_children()):
+         if name not in layers_to_keep:
+             delattr(model, name)
+     return model
+
+ if optimize_model_space:
+     model = prune_model(model)
+
+ if optimize_storage:
+     model.save_pretrained("optimized_model", max_shard_size="100MB")
+ if optimize_ram:
+     model.to_disk("optimized_model", device_map="cpu")
+
+ # Data preparation
+ from datasets import load_dataset
+
+ alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+ ### Instruction:
+ {}
+
+ ### Input:
+ {}
+
+ ### Response:
+ {}"""
+
+ EOS_TOKEN = tokenizer.eos_token
+
+ def formatting_prompts_func(examples):
+     instructions = examples["instruction"]
+     inputs = examples["input"]
+     outputs = examples["output"]
+     texts = []
+     for instruction, input, output in zip(instructions, inputs, outputs):
+         text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
+         texts.append(text)
+     return {"text": texts}
+
+ dataset = load_dataset("yahma/alpaca-cleaned", split="train")
+ dataset = dataset.map(formatting_prompts_func, batched=True)
+
+ # Model training
+ from trl import SFTTrainer
+ from transformers import TrainingArguments
+ from unsloth import is_bfloat16_supported
+
+ trainer = SFTTrainer(
+     model=model,
+     tokenizer=tokenizer,
+     train_dataset=dataset,
+     dataset_text_field="text",
+     max_seq_length=max_seq_length,
+     dataset_num_proc=20,
+     packing=False,
+     args=TrainingArguments(
+         per_device_train_batch_size=2,
+         gradient_accumulation_steps=4,
+         warmup_steps=5,
+         max_steps=60,
+         learning_rate=8e-4,
+         fp16=not is_bfloat16_supported(),
+         bf16=is_bfloat16_supported(),
+         logging_steps=1,
+         optim="adamw_8bit",
+         weight_decay=0.01,
+         lr_scheduler_type="linear",
+         seed=3407,
+         output_dir="outputs",
+     ),
+ )
+
+ # Show current memory statistics
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ # Train the model
+ trainer_stats = trainer.train()
+
+ # Show final memory and time statistics
+ used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
+ used_percentage = round(used_memory / max_memory * 100, 3)
+ lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
+ print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
+ print(f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.")
+ print(f"Peak reserved memory = {used_memory} GB.")
+ print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
+ print(f"Peak reserved memory % of max memory = {used_percentage} %.")
+ print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
+
+ # Inference
+ FastLanguageModel.for_inference(model)
+ inputs = tokenizer(
+     [
+         alpaca_prompt.format(
+             "Continue the fibonacci sequence.",
+             "1, 1, 2, 3, 5, 8",
+             "",
+         )
+     ], return_tensors="pt").to("cuda")
+
+ outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
+ print(tokenizer.batch_decode(outputs))
+
+ # Streaming inference using TextStreamer
+ from transformers import TextStreamer
+
+ text_streamer = TextStreamer(tokenizer)
+ inputs = tokenizer(
+     [
+         alpaca_prompt.format(
+             "Continue the fibonacci sequence.",
+             "1, 1, 2, 3, 5, 8",
+             "",
+         )
+     ], return_tensors="pt").to("cuda")
+
+ _ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)
+
+ # Save and reload the fine-tuned model
+ model.save_pretrained("lora_model")
+ tokenizer.save_pretrained("lora_model")
+
+ if True:
+     model, tokenizer = FastLanguageModel.from_pretrained(
+         model_name="lora_model",
+         max_seq_length=max_seq_length,
+         dtype=dtype,
+         load_in_4bit=load_in_4bit,
+         load_in_1bit=load_in_1bit,
+         optimize_storage=optimize_storage,
+         optimize_ram=optimize_ram,
+         optimize_model_space=optimize_model_space,  # Enable model-space optimization
+     )
+     FastLanguageModel.for_inference(model)
+
+ inputs = tokenizer(
+     [
+         alpaca_prompt.format(
+             "What is a famous tall tower in Paris?",
+             "",
+             "",
+         )
+     ], return_tensors="pt").to("cuda")
+
+ outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
+ print(tokenizer.batch_decode(outputs))
+
+ # Save in float16 for vLLM
+ if True: model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")
+ if True: model.push_to_hub_merged("Yjhhh/model", tokenizer, save_method="merged_16bit", token=token)
+
+ # Save in GGUF format
+ if True: model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_0")
+ if True: model.push_to_hub_gguf("Yjhhh/model", tokenizer, quantization_method="q4_0", token=token)
+
+ if True: model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_1")
+ if True: model.push_to_hub_gguf("Yjhhh/model", tokenizer, quantization_method="q4_1", token=token)
+
+ if True: model.save_pretrained_gguf("model", tokenizer, quantization_method="q8")
+ if True: model.push_to_hub_gguf("Yjhhh/model", tokenizer, quantization_method="q8", token=token)
+
+ if True: model.save_pretrained_gguf("model", tokenizer, quantization_method="q8_0")
+ if True: model.push_to_hub_gguf("Yjhhh/model", tokenizer, quantization_method="q8_0", token=token)
+
+ if True: model.save_pretrained_gguf("model", tokenizer, quantization_method="q8_1")
+ if True: model.push_to_hub_gguf("Yjhhh/model", tokenizer, quantization_method="q8_1", token=token)
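
As a quick sanity check of the GGUF exports above, here is a minimal sketch (not part of this commit) that loads one of the saved files with llama-cpp-python and runs the same Alpaca-style prompt. The exact .gguf filename written by save_pretrained_gguf under the "model" directory is an assumption; adjust the path to whatever file actually appears there.

# Smoke test for a GGUF export using llama-cpp-python (sketch, not in main.py).
# Assumption: a q4_0 file exists under "model/"; the filename below is hypothetical.
from llama_cpp import Llama

llm = Llama(model_path="model/unsloth.Q4_0.gguf", n_ctx=2048)  # adjust path as needed

prompt = (
    "### Instruction:\nContinue the fibonacci sequence.\n\n"
    "### Input:\n1, 1, 2, 3, 5, 8\n\n"
    "### Response:\n"
)
out = llm(prompt, max_tokens=64, stop=["###"])
print(out["choices"][0]["text"])

Running the exported artifact through an independent runtime like this is a cheap way to confirm the quantized file loads and generates before pushing it to the Hub.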