Twelve2five committed
Commit c93ea92 · verified · 1 Parent(s): b2842e8

Update app.py

Files changed (1):
  app.py +28 -79
app.py CHANGED
@@ -280,95 +280,44 @@ def train_model(progress=gr.Progress()):
     progress(0.1, desc="Loading dataset...")
     train_dataset = load_dataset()
 
-    # Add verbose logging
-    import logging
-    logging.basicConfig(level=logging.INFO)
-
     # Initialize trainer with debug flags
     progress(0.2, desc="Initializing trainer...")
 
-    from transformers import TrainingArguments
-
-    # Ensure we're using the simplest training setup for first success
-    training_args = TrainingArguments(
-        output_dir=OUTPUT_TRAINING_DIR,
-        logging_dir=LOGGING_DIR,
-        num_train_epochs=1,
-        per_device_train_batch_size=1,
-        gradient_accumulation_steps=16,  # Reduced for faster iterations
-        learning_rate=LEARNING_RATE,
-        weight_decay=WEIGHT_DECAY,
-        warmup_ratio=WARMUP_RATIO,
-        lr_scheduler_type=LR_SCHEDULER,
-        report_to="tensorboard",
-        fp16=True,
-
-        # Simplified training - disable fancy features
-        local_rank=-1,  # Disable distributed training for debugging
-        ddp_find_unused_parameters=False,
-        deepspeed=None,
-
-        # More frequent logging to see progress
-        logging_steps=1,  # Log every step
-        save_strategy="no",  # Don't save during initial test
-
-        # Other settings
-        optim="adamw_torch",  # Use simpler optimizer
-        gradient_checkpointing=True,
-        gradient_checkpointing_kwargs={"use_reentrant": False},
-        dataloader_num_workers=0,
-        group_by_length=False,  # Disable grouping for debugging
-        max_grad_norm=1.0,
-    )
-
-    # Use a simpler data collator for testing
-    from transformers import default_data_collator
-
-    # Initialize trainer with simplified settings
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=train_dataset,
-        data_collator=default_data_collator,  # Use default collator for testing
-    )
-
-    # Print memory status before training
-    progress(0.3, desc="Ready to train, checking memory...")
-    for i in range(torch.cuda.device_count()):
-        print(f"GPU {i} memory: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB allocated, {torch.cuda.memory_reserved(i) / 1e9:.2f}GB reserved")
-
     try:
-        # Add a timeout mechanism
-        import signal
-
-        class TimeoutException(Exception):
-            pass
-
-        def timeout_handler(signum, frame):
-            raise TimeoutException("Training step is taking too long")
-
-        # Set 30-minute timeout for training (adjust as needed)
-        signal.signal(signal.SIGALRM, timeout_handler)
-        signal.alarm(1800)  # 30 minutes in seconds
-
-        # Clean again just before training
-        clean_memory()
-
-        print("Starting training with verbose logging...")
-        progress(0.4, desc="Starting training (this may take a while for the first step)...")
+        # Set up training args with simplified settings
+        training_args = TrainingArguments(
+            output_dir="./results",
+            num_train_epochs=1,  # Just 1 epoch for testing
+            per_device_train_batch_size=1,  # Minimal batch size
+            gradient_accumulation_steps=4,  # Reduce memory pressure
+            warmup_steps=2,
+            logging_steps=1,  # Log every step
+            save_steps=10000,  # Don't save checkpoints during test
+            learning_rate=2e-4,
+            fp16=False,  # Disable mixed precision for stability
+            optim="adamw_torch",
+            report_to="none",  # Disable wandb/tensorboard reporting
+            max_steps=3,  # Just try 3 steps to see if it works
+            logging_first_step=True,  # Force log on first step
+        )
 
-        # Try training with only a few steps first to test
-        trainer.train(max_steps=3)  # Just try 3 steps to see if it works
+        # Create a simple trainer
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=train_dataset,
+            data_collator=transformers.DataCollatorForLanguageModeling(
+                tokenizer=None, mlm=False
+            )
+        )
 
-        # Cancel the alarm if training succeeds
-        signal.alarm(0)
+        # Run training for just 3 steps
+        progress(0.3, desc="Starting training (this may take 5-15 minutes for first step)...")
+        trainer.train()
 
         progress(0.9, desc="Initial training successful! You can now run full training.")
         return "Initial training completed successfully! The system is working. You can now adjust parameters for a full training run."
 
-    except TimeoutException as e:
-        return f"Training timed out: {str(e)}. Try reducing model parameters further or switching to a smaller model like LLaMA 3 3B."
-
     except Exception as e:
        error_msg = str(e)
        print(f"Training error: {error_msg}")
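For reference, a minimal standalone sketch of the smoke-test training path the added lines set up, assembled outside the diff. It assumes model, tokenizer, and train_dataset have already been prepared earlier in app.py; the explicit tokenizer argument is an assumption (the committed code passes None), since DataCollatorForLanguageModeling in transformers uses the tokenizer for padding:

from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Smoke-test configuration mirroring the committed settings:
# 3 optimizer steps, batch size 1, no checkpoints, no external reporting.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    logging_steps=1,
    logging_first_step=True,
    save_steps=10000,
    learning_rate=2e-4,
    fp16=False,
    optim="adamw_torch",
    report_to="none",
    max_steps=3,
)

# Causal-LM collator (mlm=False); tokenizer is assumed to be the model's
# tokenizer loaded elsewhere in app.py, since the collator needs it for padding.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,                  # assumed: model loaded earlier in app.py
    args=training_args,
    train_dataset=train_dataset,  # assumed: output of load_dataset()
    data_collator=data_collator,
)

trainer.train()  # stops after 3 optimizer steps because max_steps=3

Note that max_steps is a TrainingArguments field rather than a parameter of trainer.train(), so the three-step cap belongs in the arguments object as shown.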