Julian Bilcke
committed on
Commit
·
d2fbbb1
1
Parent(s):
79aa37d
add more logs
Browse files
vms/ui/project/services/training.py
CHANGED
|
@@ -1495,10 +1495,16 @@ class TrainingService:
|
|
| 1495 |
# Check in lora_weights directory
|
| 1496 |
lora_weights_dir = self.app.output_path / "lora_weights"
|
| 1497 |
if lora_weights_dir.exists():
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1498 |
lora_safetensors = lora_weights_dir / "pytorch_lora_weights.safetensors"
|
| 1499 |
if lora_safetensors.exists():
|
| 1500 |
logger.info(f"Found weights in lora_weights directory: {lora_safetensors}")
|
| 1501 |
return str(lora_safetensors)
|
|
|
|
|
|
|
| 1502 |
|
| 1503 |
# If not found in root or lora_weights, log the issue
|
| 1504 |
logger.warning(f"Model weights not found at expected location: {model_output_safetensors_path}")
|
|
@@ -1509,10 +1515,18 @@ class TrainingService:
|
|
| 1509 |
if checkpoints:
|
| 1510 |
logger.info(f"Found {len(checkpoints)} checkpoint directories, but main weights file is missing")
|
| 1511 |
latest_checkpoint = max(checkpoints, key=lambda x: int(x.name.split("_")[-1]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1512 |
checkpoint_weights = latest_checkpoint / "pytorch_lora_weights.safetensors"
|
| 1513 |
if checkpoint_weights.exists():
|
| 1514 |
logger.info(f"Found weights in latest checkpoint: {checkpoint_weights}")
|
| 1515 |
return str(checkpoint_weights)
|
|
|
|
|
|
|
| 1516 |
|
| 1517 |
return None
|
| 1518 |
|
|
|
|
| 1495 |
# Check in lora_weights directory
|
| 1496 |
lora_weights_dir = self.app.output_path / "lora_weights"
|
| 1497 |
if lora_weights_dir.exists():
|
| 1498 |
+
logger.info(f"Found lora_weights directory: {lora_weights_dir}")
|
| 1499 |
+
lora_weights_contents = list(lora_weights_dir.glob("*"))
|
| 1500 |
+
logger.info(f"Contents of lora_weights directory: {lora_weights_contents}")
|
| 1501 |
+
|
| 1502 |
lora_safetensors = lora_weights_dir / "pytorch_lora_weights.safetensors"
|
| 1503 |
if lora_safetensors.exists():
|
| 1504 |
logger.info(f"Found weights in lora_weights directory: {lora_safetensors}")
|
| 1505 |
return str(lora_safetensors)
|
| 1506 |
+
else:
|
| 1507 |
+
logger.info(f"pytorch_lora_weights.safetensors not found in lora_weights directory")
|
| 1508 |
|
| 1509 |
# If not found in root or lora_weights, log the issue
|
| 1510 |
logger.warning(f"Model weights not found at expected location: {model_output_safetensors_path}")
|
|
|
|
| 1515 |
if checkpoints:
|
| 1516 |
logger.info(f"Found {len(checkpoints)} checkpoint directories, but main weights file is missing")
|
| 1517 |
latest_checkpoint = max(checkpoints, key=lambda x: int(x.name.split("_")[-1]))
|
| 1518 |
+
logger.info(f"Latest checkpoint directory: {latest_checkpoint}")
|
| 1519 |
+
|
| 1520 |
+
# Log contents of latest checkpoint
|
| 1521 |
+
checkpoint_contents = list(latest_checkpoint.glob("*"))
|
| 1522 |
+
logger.info(f"Contents of latest checkpoint {latest_checkpoint.name}: {checkpoint_contents}")
|
| 1523 |
+
|
| 1524 |
checkpoint_weights = latest_checkpoint / "pytorch_lora_weights.safetensors"
|
| 1525 |
if checkpoint_weights.exists():
|
| 1526 |
logger.info(f"Found weights in latest checkpoint: {checkpoint_weights}")
|
| 1527 |
return str(checkpoint_weights)
|
| 1528 |
+
else:
|
| 1529 |
+
logger.info(f"pytorch_lora_weights.safetensors not found in checkpoint directory")
|
| 1530 |
|
| 1531 |
return None
|
| 1532 |
|