Update app.py
app.py CHANGED
@@ -280,94 +280,56 @@ def train_model(progress=gr.Progress()):
     progress(0.1, desc="Loading dataset...")
     train_dataset = load_dataset()

-    #
+    # Add verbose logging
+    import logging
+    logging.basicConfig(level=logging.INFO)
+
+    # Initialize trainer with debug flags
     progress(0.2, desc="Initializing trainer...")

-
-
-
-        use_deepspeed = True
-        print("DeepSpeed available, will use ZeRO-3")
-
-        ds_config = {
-            "zero_optimization": {
-                "stage": 3,
-                "offload_optimizer": {
-                    "device": "cpu",
-                    "pin_memory": True
-                },
-                "offload_param": {
-                    "device": "cpu",
-                    "pin_memory": True
-                },
-                "overlap_comm": True,
-                "contiguous_gradients": True,
-                "reduce_bucket_size": 5e7,
-                "stage3_prefetch_bucket_size": 5e7,
-                "stage3_param_persistence_threshold": 1e5
-            },
-            "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE,
-            "gradient_accumulation_steps": GRAD_ACCUMULATION_STEPS,
-            "fp16": {"enabled": True},
-            "zero_allow_untested_optimizer": True,
-            "aio": {"block_size": 1048576, "queue_depth": 8, "thread_count": 1}
-        }
-    except ImportError:
-        use_deepspeed = False
-        print("DeepSpeed not available, falling back to standard distribution")
-        ds_config = None
-
-    # Define training arguments inside the function
+    from transformers import TrainingArguments
+
+    # Ensure we're using the simplest training setup for first success
     training_args = TrainingArguments(
         output_dir=OUTPUT_TRAINING_DIR,
         logging_dir=LOGGING_DIR,
-        num_train_epochs=
-        per_device_train_batch_size=
-        gradient_accumulation_steps=
+        num_train_epochs=1,
+        per_device_train_batch_size=1,
+        gradient_accumulation_steps=16, # Reduced for faster iterations
         learning_rate=LEARNING_RATE,
         weight_decay=WEIGHT_DECAY,
         warmup_ratio=WARMUP_RATIO,
         lr_scheduler_type=LR_SCHEDULER,
         report_to="tensorboard",
         fp16=True,
-        bf16=False,

-        #
-
-        gradient_checkpointing=True,
-        gradient_checkpointing_kwargs={"use_reentrant": False},
-
-        # Explicit model distribution
+        # Simplified training - disable fancy features
+        local_rank=-1, # Disable distributed training for debugging
         ddp_find_unused_parameters=False,
-        deepspeed=
+        deepspeed=None,

-        #
-
-
-
-
-
-
+        # More frequent logging to see progress
+        logging_steps=1, # Log every step
+        save_strategy="no", # Don't save during initial test
+
+        # Other settings
+        optim="adamw_torch", # Use simpler optimizer
+        gradient_checkpointing=True,
+        gradient_checkpointing_kwargs={"use_reentrant": False},
+        dataloader_num_workers=0,
+        group_by_length=False, # Disable grouping for debugging
+        max_grad_norm=1.0,
     )

-    #
-
-        # Call your existing collator
-        batch = seq2seq_causal_collator(examples)
-
-        # Ensure we cap to MAX_SEQ_LENGTH
-        for k, v in batch.items():
-            if isinstance(v, torch.Tensor) and v.dim() >= 2:
-                batch[k] = v[:, :MAX_SEQ_LENGTH]
-
-        return batch
+    # Use a simpler data collator for testing
+    from transformers import default_data_collator

-    # Initialize trainer
+    # Initialize trainer with simplified settings
     trainer = Trainer(
         model=model,
         args=training_args,
         train_dataset=train_dataset,
-        data_collator=
+        data_collator=default_data_collator, # Use default collator for testing
     )

     # Print memory status before training
@@ -376,20 +338,37 @@ def train_model(progress=gr.Progress()):
         print(f"GPU {i} memory: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB allocated, {torch.cuda.memory_reserved(i) / 1e9:.2f}GB reserved")

     try:
+        # Add a timeout mechanism
+        import signal
+
+        class TimeoutException(Exception):
+            pass
+
+        def timeout_handler(signum, frame):
+            raise TimeoutException("Training step is taking too long")
+
+        # Set 30-minute timeout for training (adjust as needed)
+        signal.signal(signal.SIGALRM, timeout_handler)
+        signal.alarm(1800) # 30 minutes in seconds
+
         # Clean again just before training
         clean_memory()

-
-        progress(0.4, desc="Starting training
+        print("Starting training with verbose logging...")
+        progress(0.4, desc="Starting training (this may take a while for the first step)...")

-        #
-
+        # Try training with only a few steps first to test
+        trainer.train(max_steps=3) # Just try 3 steps to see if it works

-        #
-
-        trainer.save_model(OUTPUT_TRAINING_DIR)
+        # Cancel the alarm if training succeeds
+        signal.alarm(0)

-
+        progress(0.9, desc="Initial training successful! You can now run full training.")
+        return "Initial training completed successfully! The system is working. You can now adjust parameters for a full training run."
+
+    except TimeoutException as e:
+        return f"Training timed out: {str(e)}. Try reducing model parameters further or switching to a smaller model like LLaMA 3 3B."
+
     except Exception as e:
         error_msg = str(e)
         print(f"Training error: {error_msg}")
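The SIGALRM timeout is another fragile spot: signal.alarm() exists only on Unix, and signal.signal() must be called from the main thread, whereas Gradio usually runs event handlers such as train_model() in a worker thread, where installing the handler raises ValueError. A thread-safe sketch of the same 30-minute budget using a TrainerCallback (the class name and budget are illustrative, not part of the app):

# Sketch: enforce a wall-clock budget from inside the training loop.
# Works on any OS and in any thread because no signals are involved.
import time

from transformers import TrainerCallback


class WallClockBudgetCallback(TrainerCallback):
    """Ask the Trainer to stop gracefully once max_seconds have elapsed."""

    def __init__(self, max_seconds=1800):  # 30 minutes, same budget as the alarm above
        self.max_seconds = max_seconds
        self.start = None

    def on_train_begin(self, args, state, control, **kwargs):
        self.start = time.monotonic()

    def on_step_end(self, args, state, control, **kwargs):
        if time.monotonic() - self.start > self.max_seconds:
            print(f"Wall-clock budget of {self.max_seconds}s exceeded, stopping training.")
            control.should_training_stop = True
        return control


# Usage: trainer.add_callback(WallClockBudgetCallback(max_seconds=1800))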