Update app.py
app.py CHANGED
@@ -4,10 +4,12 @@ import glob
 import gc
 from transformers import (
     AutoModelForCausalLM,
+    AutoTokenizer,
     BitsAndBytesConfig,
     TrainingArguments,
     Trainer,
-    DataCollatorForLanguageModeling
+    DataCollatorForLanguageModeling,
+    AutoTokenizer
 )
 from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
 from datasets import Dataset
@@ -130,71 +132,71 @@ def prepare_for_dataset(batch):
     return output

 def load_model():
-    clean_memory()  # Start with clean memory
-
     print(f"Loading base model architecture from: {hf_model_repo_id}")

-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.float16,  # Use float16 instead of bfloat16
-        bnb_4bit_use_double_quant=True,
-    )
+    # Get information about GPU with most free memory
+    gpu_id = 0  # Default to first GPU
+    max_free_memory = 0

-    # For 4-bit training, we need to load on a single device
-    # Choose GPU with most available memory
-    free_memory = []
     for i in range(torch.cuda.device_count()):
+        free_memory = torch.cuda.get_device_properties(i).total_memory - torch.cuda.memory_allocated(i)
+        if free_memory > max_free_memory:
+            max_free_memory = free_memory
+            gpu_id = i

-    target_gpu = free_memory.index(max(free_memory))
-    print(f"Loading model on GPU {target_gpu} with {free_memory[target_gpu]:.2f}GB free memory")
+    print(f"Loading model on GPU {gpu_id} with {max_free_memory / 1e9:.2f}GB free memory")

+    # Configure quantization
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16
+    )

+    # Load the model
     model = AutoModelForCausalLM.from_pretrained(
         hf_model_repo_id,
         quantization_config=bnb_config,
-        device_map=
-        use_cache=False,
-        torch_dtype=torch.float16,
-        low_cpu_mem_usage=True,
+        device_map={"": gpu_id},
+        torch_dtype=torch.bfloat16,
     )

+    print(f"Model loaded on device: cuda:{gpu_id}")
+
+    # Load tokenizer as well
+    tokenizer = AutoTokenizer.from_pretrained(hf_model_repo_id)
+    print(f"Loaded model vocab size: {len(tokenizer)}")

-    print(f"Loaded model vocab size: {model.get_input_embeddings().weight.shape[0]}")
+    # Print information about input embeddings
     print(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")

+    # Prepare model for k-bit training
+    model = prepare_model_for_kbit_training(model)
+
+    # Define LoRA configuration
     lora_config = LoraConfig(
         r=16,
         lora_alpha=32,
+        target_modules=[
+            "q_proj",
+            "k_proj",
+            "v_proj",
+            "o_proj",
+            "gate_proj",
+            "up_proj",
+            "down_proj",
+        ],
         lora_dropout=0.05,
         bias="none",
-        task_type=TaskType.CAUSAL_LM
-        target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+        task_type=TaskType.CAUSAL_LM
     )

-    model = prepare_model_for_kbit_training(model)
-
-    # Add LoRA adapters
+    # Apply LoRA to model
     model = get_peft_model(model, lora_config)
-
-    # Log number of trainable parameters
     model.print_trainable_parameters()

-    return model
+    return model, tokenizer  # Return both model and tokenizer

 def load_dataset():
     # --- Download the dataset repository files ---
@@ -275,7 +277,7 @@ def train_model(progress=gr.Progress()):
     clean_memory()

     # Load model with optimized memory settings
-    model = load_model()
+    model, tokenizer = load_model()

     # Load and prepare dataset
     progress(0.1, desc="Loading dataset...")
@@ -302,13 +304,14 @@ def train_model(progress=gr.Progress()):
         logging_first_step=True,  # Force log on first step
     )

-    # Create a simple trainer
+    # Create a simple trainer with the tokenizer
     trainer = Trainer(
         model=model,
         args=training_args,
         train_dataset=train_dataset,
         data_collator=DataCollatorForLanguageModeling(
-            tokenizer=
+            tokenizer=tokenizer,
+            mlm=False
         )
     )
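For reference, the load path the revised load_model() follows can be written as a standalone script. This is a minimal sketch, not the Space's code: base_model_id is a placeholder for hf_model_repo_id, and the target_modules list assumes a LLaMA/Mistral-style attention layout.

# Minimal sketch of the load pattern from the diff above (placeholder model id, not the Space's config).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

base_model_id = "your-org/your-base-model"  # placeholder for hf_model_repo_id

# Pick the CUDA device with the most free memory (total minus currently allocated).
gpu_id, max_free_memory = 0, 0
for i in range(torch.cuda.device_count()):
    free = torch.cuda.get_device_properties(i).total_memory - torch.cuda.memory_allocated(i)
    if free > max_free_memory:
        gpu_id, max_free_memory = i, free

# 4-bit NF4 quantization with nested quantization and bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map={"": gpu_id},  # keep every module on the chosen GPU
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Prepare the quantized weights for training, then attach LoRA adapters.
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the adapter weights are trainable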
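The training side of the change can be exercised with a similarly hedged sketch. It assumes the model and tokenizer come from a loader like the one above; the tiny pre-tokenized dataset, output directory, and pad-token fallback are illustrative assumptions, not taken from the Space.

# Minimal sketch of the Trainer wiring from the last hunk (toy dataset and paths are assumptions).
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # many causal-LM tokenizers ship without a pad token

# Tiny pre-tokenized dataset so the collator has something to batch.
train_dataset = Dataset.from_dict(
    {"input_ids": [tokenizer("hello world")["input_ids"]] * 8}
)

training_args = TrainingArguments(
    output_dir="./lora-out",          # placeholder path
    per_device_train_batch_size=1,
    num_train_epochs=1,
    logging_first_step=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,  # causal LM: labels are the shifted inputs, no masking objective
    ),
)
trainer.train()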