Update app.py
app.py CHANGED
@@ -31,19 +31,21 @@ local_download_path = "./downloaded_dataset_files"
 
 # Training parameters
 NUM_EPOCHS = 1
-BATCH_SIZE_PER_DEVICE =
-GRAD_ACCUMULATION_STEPS =
+BATCH_SIZE_PER_DEVICE = 1
+GRAD_ACCUMULATION_STEPS = 64
 LEARNING_RATE = 1e-4
 WEIGHT_DECAY = 0.01
 WARMUP_RATIO = 0.03
 LR_SCHEDULER = "cosine"
 OPTIMIZER = "paged_adamw_8bit"
+MAX_SEQ_LENGTH = 256
+MICRO_BATCH_SIZE = 1
 
 # Multi-GPU configuration
 accelerator = Accelerator()
 
 # Configure environment for multi-GPU
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
 
 # Print GPU information
 print(f"Available GPUs: {torch.cuda.device_count()}")
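For reference, the `max_split_size_mb:32` setting asks PyTorch's caching allocator not to split cached blocks larger than 32 MB, a common knob for fragmentation-related OOMs. With BATCH_SIZE_PER_DEVICE at 1, the effective optimizer batch still depends on gradient accumulation and the number of GPUs. A minimal sketch, assuming only the constants introduced above and using torch.cuda.device_count() as a stand-in for the number of DDP processes Accelerate actually launches:

# Sketch only: how the new constants combine into an effective batch size.
# torch.cuda.device_count() stands in for the DDP world size here.
import torch

BATCH_SIZE_PER_DEVICE = 1
GRAD_ACCUMULATION_STEPS = 64
MAX_SEQ_LENGTH = 256

def effective_batch_size(num_devices: int) -> int:
    # Examples consumed per optimizer step across all devices.
    return BATCH_SIZE_PER_DEVICE * GRAD_ACCUMULATION_STEPS * num_devices

if __name__ == "__main__":
    n = max(1, torch.cuda.device_count())
    print(f"{n} device(s): effective batch size = {effective_batch_size(n)}")
    # With 2 GPUs this is 1 * 64 * 2 = 128 examples per optimizer step, while
    # peak activation memory is driven by the micro-batch of 1 and the
    # MAX_SEQ_LENGTH = 256 token cap.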
@@ -127,34 +129,44 @@ def prepare_for_dataset(batch):
     return output
 
 def load_model():
-    #
-
+    clean_memory()  # Start with clean memory
+
     print(f"Loading base model architecture from: {hf_model_repo_id}")
-    print(f"Using device: {DEVICE}")
 
-    #
+    # Even more extreme quantization
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.bfloat16
+        bnb_4bit_compute_dtype=torch.float16,  # Use float16 instead of bfloat16
         bnb_4bit_use_double_quant=True,
     )
 
-    #
-
+    # Use DeepSpeed if available
+    try:
+        from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live
+        use_deepspeed = True
+        print("DeepSpeed available, will use ZeRO-3")
+    except ImportError:
+        use_deepspeed = False
+        print("DeepSpeed not available, falling back to standard distribution")
+
+    # Calculate per-GPU reserved memory (be very conservative)
+    n_gpus = max(1, torch.cuda.device_count())
+    max_memory = {i: f"{int(torch.cuda.get_device_properties(i).total_memory / 1e9) - 4}GB" for i in range(n_gpus)}
+    max_memory["cpu"] = "32GB"
 
-
-    max_memory = {i: "22GB" for i in range(torch.cuda.device_count())}
-    max_memory["cpu"] = "32GB"  # Allow some CPU offloading if needed
+    print(f"Using {n_gpus} GPUs with memory configuration: {max_memory}")
 
+    # Load model with proper device distribution
     model = AutoModelForCausalLM.from_pretrained(
         hf_model_repo_id,
         quantization_config=bnb_config,
-        device_map=
+        device_map="balanced_low_0",  # Distribute evenly with priority to minimize GPU 0 usage
         max_memory=max_memory,
         trust_remote_code=True,
         use_cache=False,
         torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
     )
     print(f"Loaded model vocab size: {model.config.vocab_size}")
     print(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")
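Two things are worth noting in this hunk: the use_deepspeed flag is set but, at least in the lines shown, nothing consumes it (with Trainer, DeepSpeed is normally wired in through TrainingArguments(deepspeed=...)); and the per-GPU budget leaves roughly 4 GB of headroom per card. Below is a standalone sketch of that budget-plus-4-bit-load pattern, not the app's code: MODEL_ID is a placeholder for hf_model_repo_id, and it assumes transformers, accelerate and bitsandbytes are installed.

# Sketch, not the app's code: rebuilds the max_memory budget and 4-bit load
# from the hunk above. MODEL_ID is a placeholder for hf_model_repo_id.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

MODEL_ID = "org/base-model"  # placeholder

def build_max_memory(reserve_gb: int = 4) -> dict:
    # Leave reserve_gb of headroom per GPU for activations and allocator overhead,
    # and allow spill-over to 32GB of host RAM.
    budget = {
        i: f"{int(torch.cuda.get_device_properties(i).total_memory / 1e9) - reserve_gb}GB"
        for i in range(torch.cuda.device_count())
    }
    budget["cpu"] = "32GB"
    return budget

def load_4bit_model():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    return AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="balanced_low_0",  # spread weights but keep GPU 0 the least loaded
        max_memory=build_max_memory(),
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )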
@@ -247,94 +259,69 @@ def load_dataset():
     return hf_dataset
 
 def train_model(progress=gr.Progress()):
-    #
-
-    os.makedirs(LOGGING_DIR, exist_ok=True)
+    # Clean memory before starting
+    clean_memory()
 
-
-
-    if model_to_train is None:
-        return "Failed to load model."
+    # Load model with optimized memory settings
+    model = load_model()
 
-
+    # Load and prepare dataset
+    progress(0.1, desc="Loading dataset...")
     train_dataset = load_dataset()
-    if train_dataset is None:
-        return "Failed to load dataset."
 
-
-
-    total_train_batch_size = BATCH_SIZE_PER_DEVICE * GRAD_ACCUMULATION_STEPS
-    num_training_steps = math.ceil((len(train_dataset) * NUM_EPOCHS) / total_train_batch_size)
-    num_warmup_steps = int(num_training_steps * WARMUP_RATIO)
+    # Initialize trainer with memory-optimized settings
+    progress(0.2, desc="Initializing trainer...")
 
-    #
-
-
-
-
-
-
-
-
-
-
-        logging_dir=LOGGING_DIR,
-        logging_strategy="steps",
-        logging_steps=LOGGING_STEPS,
-        save_strategy="steps",
-        save_steps=SAVE_STEPS,
-        save_total_limit=2,
-        learning_rate=LEARNING_RATE,
-        weight_decay=WEIGHT_DECAY,
-        warmup_steps=num_warmup_steps,
-        lr_scheduler_type=LR_SCHEDULER,
-        report_to="tensorboard",
-        fp16=True,
-        bf16=False,
-        gradient_checkpointing=True,
-        gradient_checkpointing_kwargs={"use_reentrant": False},
-        ddp_find_unused_parameters=False,
-        local_rank=int(os.getenv("LOCAL_RANK", -1)),
-        dataloader_num_workers=4,
-    )
+    # Optional: try a custom data collator that explicitly caps sequence length
+    def data_capped_collator(examples):
+        # Call your existing collator
+        batch = seq2seq_causal_collator(examples)
+
+        # Ensure we cap to MAX_SEQ_LENGTH
+        for k, v in batch.items():
+            if isinstance(v, torch.Tensor) and v.dim() >= 2:
+                batch[k] = v[:, :MAX_SEQ_LENGTH]
+
+        return batch
 
+    # Initialize trainer
     trainer = Trainer(
-        model=
+        model=model,
         args=training_args,
         train_dataset=train_dataset,
-        data_collator=
+        data_collator=data_capped_collator,  # Use our capped collator
     )
 
-    # Print memory
+    # Print memory status before training
+    progress(0.3, desc="Ready to train, checking memory...")
     for i in range(torch.cuda.device_count()):
         print(f"GPU {i} memory: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB allocated, {torch.cuda.memory_reserved(i) / 1e9:.2f}GB reserved")
-
-    progress(0.5, desc="Starting training...")
-    # Clear cache before starting
-    gc.collect()
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
 
     try:
-        #
+        # Clean again just before training
+        clean_memory()
+
+        # Start with smaller gradient accumulation and increase
+        progress(0.4, desc="Starting training with conservative settings...")
+
+        # Train with multi-GPU support
         train_result = trainer.train()
 
+        # Save the final model
         progress(0.9, desc="Saving model...")
-
-        final_save_path = os.path.join(training_args.output_dir, "final_checkpoint")
-        trainer.save_model(final_save_path)
-        trainer.save_state()
+        trainer.save_model(OUTPUT_TRAINING_DIR)
 
-
-        metrics = train_result.metrics
-        trainer.log_metrics("train", metrics)
-        trainer.save_metrics("train", metrics)
-
-        progress(1.0, desc="Training complete!")
-        return f"Training completed successfully. Model saved to {final_save_path}"
-
+        return "Training completed successfully!"
     except Exception as e:
-
+        error_msg = str(e)
+        print(f"Training error: {error_msg}")
+
+        # Add memory diagnostics to error message
+        mem_info = "\nMemory status at error time:\n"
+        for i in range(torch.cuda.device_count()):
+            mem_info += f"GPU {i}: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB allocated, {torch.cuda.memory_reserved(i) / 1e9:.2f}GB reserved\n"
+
+        return f"An error occurred during training: {error_msg}\n{mem_info}"
 
 # Create Gradio interface
 def create_ui():
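One gap to flag in this hunk: the rewritten train_model still passes args=training_args to Trainer, but what appears to be the old TrainingArguments(...) construction is removed (its opening lines were truncated in the capture) and no replacement is visible here, so the new construction is presumably defined elsewhere in app.py. Purely as an illustration of a memory-lean configuration consistent with the new constants, not the commit's actual code, the *_DIR and *_STEPS values below being placeholders for names app.py defines elsewhere:

# Illustrative only; not the commit's code. Placeholder values are marked.
from transformers import TrainingArguments

NUM_EPOCHS = 1
BATCH_SIZE_PER_DEVICE = 1
GRAD_ACCUMULATION_STEPS = 64
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.03
LR_SCHEDULER = "cosine"
OPTIMIZER = "paged_adamw_8bit"
OUTPUT_TRAINING_DIR = "./training_output"  # placeholder
LOGGING_DIR = "./training_logs"            # placeholder
LOGGING_STEPS = 10                         # placeholder
SAVE_STEPS = 200                           # placeholder

training_args = TrainingArguments(
    output_dir=OUTPUT_TRAINING_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE_PER_DEVICE,
    gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    lr_scheduler_type=LR_SCHEDULER,
    optim=OPTIMIZER,
    logging_dir=LOGGING_DIR,
    logging_steps=LOGGING_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    fp16=True,  # matches bnb_4bit_compute_dtype=torch.float16 in load_model
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},  # needs a recent transformers
    ddp_find_unused_parameters=False,
    dataloader_num_workers=4,
    report_to="tensorboard",
)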
@@ -386,4 +373,13 @@ if __name__ == "__main__":
 
     # Create and launch the UI
     demo = create_ui()
-    demo.launch()
+    demo.launch()
+
+# Memory cleaning function
+def clean_memory():
+    gc.collect()
+    if torch.cuda.is_available():
+        for i in range(torch.cuda.device_count()):
+            with torch.cuda.device(f'cuda:{i}'):
+                torch.cuda.empty_cache()
+                torch.cuda.reset_peak_memory_stats()
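A caution on this last hunk: clean_memory() is called from load_model() and train_model(), but its definition is appended after demo.launch(). Since launch() blocks by default, that definition will not have executed when a Gradio callback first calls clean_memory(), which would raise a NameError unless launch() is made non-blocking. A minimal sketch of the safer ordering, with a stub interface standing in for the app's create_ui():

# Sketch: define the memory helper before the UI launches. The Interface below
# is a stub standing in for the app's create_ui(); only the ordering is the point.
import gc
import torch
import gradio as gr

def clean_memory():
    # Free cached CUDA blocks and reset peak stats on every visible GPU.
    gc.collect()
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            with torch.cuda.device(f"cuda:{i}"):
                torch.cuda.empty_cache()
                torch.cuda.reset_peak_memory_stats()

def train_stub():
    clean_memory()  # already defined above, so the callback can always resolve it
    return "ok"

if __name__ == "__main__":
    demo = gr.Interface(fn=train_stub, inputs=None, outputs="text")
    demo.launch()  # blocks; code placed after this line runs only after shutdown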
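Finally, the sequence cap inside data_capped_collator from the train_model hunk can be exercised on its own. The dummy collator below stands in for the app's seq2seq_causal_collator, whose implementation is not shown in this diff; the slicing is the same [:, :MAX_SEQ_LENGTH] used above.

# Standalone check of the sequence-capping idea from data_capped_collator.
# dummy_collator is a stand-in for the app's seq2seq_causal_collator.
import torch

MAX_SEQ_LENGTH = 256  # same cap as the constant added at the top of app.py

def dummy_collator(examples):
    # Pad to the longest sequence in the batch, like a typical causal-LM collator.
    longest = max(len(e["input_ids"]) for e in examples)
    input_ids = torch.zeros(len(examples), longest, dtype=torch.long)
    for row, e in enumerate(examples):
        ids = torch.tensor(e["input_ids"], dtype=torch.long)
        input_ids[row, : len(ids)] = ids
    return {"input_ids": input_ids, "labels": input_ids.clone()}

def capped_collator(examples):
    batch = dummy_collator(examples)
    # Same slicing as the hunk: truncate every 2-D tensor to MAX_SEQ_LENGTH columns.
    for k, v in batch.items():
        if isinstance(v, torch.Tensor) and v.dim() >= 2:
            batch[k] = v[:, :MAX_SEQ_LENGTH]
    return batch

if __name__ == "__main__":
    examples = [{"input_ids": list(range(300))}, {"input_ids": list(range(40))}]
    batch = capped_collator(examples)
    print(batch["input_ids"].shape)  # torch.Size([2, 256]) -- capped, not 300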