Update app.py
app.py
CHANGED
@@ -280,95 +280,44 @@ def train_model(progress=gr.Progress()):
     progress(0.1, desc="Loading dataset...")
     train_dataset = load_dataset()
 
-    # Add verbose logging
-    import logging
-    logging.basicConfig(level=logging.INFO)
-
     # Initialize trainer with debug flags
     progress(0.2, desc="Initializing trainer...")
 
-    from transformers import TrainingArguments
-
-    # Ensure we're using the simplest training setup for first success
-    training_args = TrainingArguments(
-        output_dir=OUTPUT_TRAINING_DIR,
-        logging_dir=LOGGING_DIR,
-        num_train_epochs=1,
-        per_device_train_batch_size=1,
-        gradient_accumulation_steps=16,  # Reduced for faster iterations
-        learning_rate=LEARNING_RATE,
-        weight_decay=WEIGHT_DECAY,
-        warmup_ratio=WARMUP_RATIO,
-        lr_scheduler_type=LR_SCHEDULER,
-        report_to="tensorboard",
-        fp16=True,
-
-        # Simplified training - disable fancy features
-        local_rank=-1,  # Disable distributed training for debugging
-        ddp_find_unused_parameters=False,
-        deepspeed=None,
-
-        # More frequent logging to see progress
-        logging_steps=1,  # Log every step
-        save_strategy="no",  # Don't save during initial test
-
-        # Other settings
-        optim="adamw_torch",  # Use simpler optimizer
-        gradient_checkpointing=True,
-        gradient_checkpointing_kwargs={"use_reentrant": False},
-        dataloader_num_workers=0,
-        group_by_length=False,  # Disable grouping for debugging
-        max_grad_norm=1.0,
-    )
-
-    # Use a simpler data collator for testing
-    from transformers import default_data_collator
-
-    # Initialize trainer with simplified settings
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=train_dataset,
-        data_collator=default_data_collator,  # Use default collator for testing
-    )
-
-    # Print memory status before training
-    progress(0.3, desc="Ready to train, checking memory...")
-    for i in range(torch.cuda.device_count()):
-        print(f"GPU {i} memory: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB allocated, {torch.cuda.memory_reserved(i) / 1e9:.2f}GB reserved")
-
     try:
-        print("Starting training with verbose logging...")
-        progress(0.4, desc="Starting training (this may take a while for the first step)...")
+        # Set up training args with simplified settings
+        training_args = TrainingArguments(
+            output_dir="./results",
+            num_train_epochs=1,  # Just 1 epoch for testing
+            per_device_train_batch_size=1,  # Minimal batch size
+            gradient_accumulation_steps=4,  # Reduce memory pressure
+            warmup_steps=2,
+            logging_steps=1,  # Log every step
+            save_steps=10000,  # Don't save checkpoints during test
+            learning_rate=2e-4,
+            fp16=False,  # Disable mixed precision for stability
+            optim="adamw_torch",
+            report_to="none",  # Disable wandb/tensorboard reporting
+            max_steps=3,  # Just try 3 steps to see if it works
+            logging_first_step=True,  # Force log on first step
+        )
 
-        trainer.train()
+        # Create a simple trainer
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=train_dataset,
+            data_collator=transformers.DataCollatorForLanguageModeling(
+                tokenizer=None, mlm=False
+            )
+        )
 
+        # Run training for just 3 steps
+        progress(0.3, desc="Starting training (this may take 5-15 minutes for first step)...")
+        trainer.train()
 
         progress(0.9, desc="Initial training successful! You can now run full training.")
         return "Initial training completed successfully! The system is working. You can now adjust parameters for a full training run."
 
-    except TimeoutException as e:
-        return f"Training timed out: {str(e)}. Try reducing model parameters further or switching to a smaller model like LLaMA 3 3B."
-
     except Exception as e:
         error_msg = str(e)
         print(f"Training error: {error_msg}")
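
Note on the new data collator: DataCollatorForLanguageModeling needs a real tokenizer to pad batches, so tokenizer=None will most likely raise as soon as the first batch is collated. A minimal sketch of a working setup follows; the checkpoint name is only a placeholder for whatever model app.py actually loads, and the tokenizer should be the one used to build train_dataset.

from transformers import AutoTokenizer, DataCollatorForLanguageModeling

# Placeholder checkpoint -- substitute the tokenizer that built train_dataset
tokenizer = AutoTokenizer.from_pretrained("gpt2")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # GPT-style tokenizers ship without a pad token

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # causal LM objective: labels mirror input_ids, padded positions are ignored
)

Passing this collator to Trainer in place of the tokenizer=None version leaves the rest of the smoke test unchanged; with max_steps=3, gradient_accumulation_steps=4, and batch size 1, the run processes only 12 single-example batches before stopping.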
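
The commit also drops the TimeoutException guard around trainer.train(); its setup lines are not recoverable from this diff view, and the new version relies on max_steps=3 to keep the run short instead. For reference only (a generic pattern, not the author's original code), a Unix-only, main-thread-only way to bound a training call uses signal.alarm:

import signal

class TimeoutException(Exception):
    pass

def _raise_timeout(signum, frame):
    raise TimeoutException("training exceeded the time limit")

signal.signal(signal.SIGALRM, _raise_timeout)
signal.alarm(15 * 60)   # illustrative limit: abort after 15 minutes
try:
    trainer.train()     # `trainer` as constructed in app.py / the sketch above
finally:
    signal.alarm(0)     # always clear the pending alarm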