Charm_15 / model_4_of_278.safetensors
GeminiFan207's picture
Rename model_4_of_10.safetensors to model_4_of_278.safetensors
ec8f409 verified
raw
history blame
5.42 kB
import torch
from safetensors.torch import load_file, save_file
import logging
from typing import Dict, List, Optional
import time
from pathlib import Path
import sys
# Logging setup: records at INFO level and above are sent to two handlers,
# one writing to stdout and one writing to the file "model_operations.log".
# NOTE: a plain FileHandler is used here, so no log rotation is configured.
_log_handlers = [
    logging.StreamHandler(sys.stdout),
    logging.FileHandler("model_operations.log"),
]
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    handlers=_log_handlers,
)
class ModelHandler:
"""Class to handle model operations with improved efficiency."""
DEFAULT_CHECKPOINT = Path("Model_4_of_10.safetensors")
def __init__(self, checkpoint_path: str | Path = DEFAULT_CHECKPOINT):
self.checkpoint_path = Path(checkpoint_path)
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def _log_time(self, operation: str, start_time: float) -> None:
"""Helper method for consistent timing logging."""
elapsed = time.time() - start_time
logging.info(f"{operation} completed in {elapsed:.2f} seconds")
def load_model(self) -> Dict[str, torch.Tensor]:
"""Loads model with memory-efficient handling."""
start_time = time.time()
try:
logging.info(f"Loading model from {self.checkpoint_path}")
# Load to CPU first to manage memory, then move to target device
model_data = load_file(str(self.checkpoint_path), device="cpu")
for key in model_data:
model_data[key] = model_data[key].to(self.device)
self._log_time("Model loading", start_time)
return model_data
except Exception as e:
logging.error(f"Model loading failed: {str(e)}")
raise RuntimeError(f"Failed to load model: {str(e)}") from e
def save_model(self, model_tensors: Dict[str, torch.Tensor]) -> None:
"""Saves model with validation and error handling."""
start_time = time.time()
try:
logging.info(f"Saving model to {self.checkpoint_path}")
self.checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
save_file(model_tensors, str(self.checkpoint_path))
self._log_time("Model saving", start_time)
except Exception as e:
logging.error(f"Model saving failed: {str(e)}")
raise RuntimeError(f"Failed to save model: {str(e)}") from e
def initialize_model(
self,
layers: List[int] = [8192, 16384, 32768],
dtype: torch.dtype = torch.bfloat16,
seed: Optional[int] = 42
) -> Dict[str, torch.Tensor]:
"""Initializes model with optimized parameters."""
if seed is not None:
torch.manual_seed(seed)
model_tensors = {}
start_time = time.time()
try:
for i, size in enumerate(layers, 1):
layer_name = f"layer_{i}"
logging.info(f"Initializing {layer_name} [{size}x{size}] on {self.device}")
# Scaled initialization for better stability
tensor = torch.randn(size, size, dtype=dtype, device=self.device) * (1.0 / size ** 0.5)
model_tensors[layer_name] = tensor
self._log_time("Model initialization", start_time)
return model_tensors
except Exception as e:
logging.error(f"Model initialization failed: {str(e)}")
raise RuntimeError(f"Failed to initialize model: {str(e)}") from e
def verify_model(
self,
original: Dict[str, torch.Tensor],
loaded: Dict[str, torch.Tensor],
atol: float = 1e-5,
rtol: float = 1e-3
) -> bool:
"""Verifies model integrity with detailed comparison."""
all_match = True
for key in original:
if key not in loaded:
logging.warning(f"Missing tensor: {key}")
all_match = False
continue
orig, load = original[key], loaded[key]
try:
if orig.shape != load.shape:
logging.warning(f"Shape mismatch in {key}: {orig.shape} vs {load.shape}")
all_match = False
continue
if not torch.allclose(orig, load, atol=atol, rtol=rtol):
diff = torch.max(torch.abs(orig - load))
logging.warning(f"Mismatch in {key}: max diff = {diff}")
all_match = False
else:
logging.info(f"Tensor {key} verified (shape: {orig.shape})")
except Exception as e:
logging.error(f"Verification failed for {key}: {str(e)}")
all_match = False
return all_match
def main() -> int:
    """Run the initialize -> save -> load -> verify round trip.

    Returns:
        ``0`` when the reloaded tensors verify against the freshly
        initialized ones; ``1`` when verification fails or any step raises.
    """
    try:
        # Initialize handler
        handler = ModelHandler()
        # Create and save model
        model_data = handler.initialize_model()
        handler.save_model(model_data)
        # Load and verify
        loaded_model_data = handler.load_model()
        is_valid = handler.verify_model(model_data, loaded_model_data)
    except Exception as e:
        logging.error(f"Execution failed: {str(e)}")
        return 1
    logging.info(f"Model verification {'passed' if is_valid else 'failed'}")
    # A failed verification must be visible in the exit status, not only in
    # the log, so callers and shell scripts can react to it.
    return 0 if is_valid else 1
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    exit_code = main()
    sys.exit(exit_code)