Create model_4_of_10.safetensors
Browse files- model_4_of_10.safetensors +140 -0
model_4_of_10.safetensors
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from safetensors.torch import load_file, save_file
|
3 |
+
import logging
|
4 |
+
from typing import Dict, List, Optional
|
5 |
+
import time
|
6 |
+
from pathlib import Path
|
7 |
+
import sys
|
8 |
+
|
9 |
+
# Root logger configuration: records at INFO level and above are written both
# to stdout and to the file "model_operations.log". The file handler is a
# plain FileHandler, so no log rotation takes place.
logging.basicConfig(
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler("model_operations.log"),
    ],
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
|
18 |
+
|
19 |
+
class ModelHandler:
|
20 |
+
"""Class to handle model operations with improved efficiency."""
|
21 |
+
|
22 |
+
DEFAULT_CHECKPOINT = Path("Model_4_of_10.safetensors")
|
23 |
+
|
24 |
+
def __init__(self, checkpoint_path: str | Path = DEFAULT_CHECKPOINT):
|
25 |
+
self.checkpoint_path = Path(checkpoint_path)
|
26 |
+
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
27 |
+
|
28 |
+
def _log_time(self, operation: str, start_time: float) -> None:
|
29 |
+
"""Helper method for consistent timing logging."""
|
30 |
+
elapsed = time.time() - start_time
|
31 |
+
logging.info(f"{operation} completed in {elapsed:.2f} seconds")
|
32 |
+
|
33 |
+
def load_model(self) -> Dict[str, torch.Tensor]:
|
34 |
+
"""Loads model with memory-efficient handling."""
|
35 |
+
start_time = time.time()
|
36 |
+
try:
|
37 |
+
logging.info(f"Loading model from {self.checkpoint_path}")
|
38 |
+
# Load to CPU first to manage memory, then move to target device
|
39 |
+
model_data = load_file(str(self.checkpoint_path), device="cpu")
|
40 |
+
for key in model_data:
|
41 |
+
model_data[key] = model_data[key].to(self.device)
|
42 |
+
self._log_time("Model loading", start_time)
|
43 |
+
return model_data
|
44 |
+
except Exception as e:
|
45 |
+
logging.error(f"Model loading failed: {str(e)}")
|
46 |
+
raise RuntimeError(f"Failed to load model: {str(e)}") from e
|
47 |
+
|
48 |
+
def save_model(self, model_tensors: Dict[str, torch.Tensor]) -> None:
|
49 |
+
"""Saves model with validation and error handling."""
|
50 |
+
start_time = time.time()
|
51 |
+
try:
|
52 |
+
logging.info(f"Saving model to {self.checkpoint_path}")
|
53 |
+
self.checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
|
54 |
+
save_file(model_tensors, str(self.checkpoint_path))
|
55 |
+
self._log_time("Model saving", start_time)
|
56 |
+
except Exception as e:
|
57 |
+
logging.error(f"Model saving failed: {str(e)}")
|
58 |
+
raise RuntimeError(f"Failed to save model: {str(e)}") from e
|
59 |
+
|
60 |
+
def initialize_model(
|
61 |
+
self,
|
62 |
+
layers: List[int] = [8192, 16384, 32768],
|
63 |
+
dtype: torch.dtype = torch.bfloat16,
|
64 |
+
seed: Optional[int] = 42
|
65 |
+
) -> Dict[str, torch.Tensor]:
|
66 |
+
"""Initializes model with optimized parameters."""
|
67 |
+
if seed is not None:
|
68 |
+
torch.manual_seed(seed)
|
69 |
+
|
70 |
+
model_tensors = {}
|
71 |
+
start_time = time.time()
|
72 |
+
try:
|
73 |
+
for i, size in enumerate(layers, 1):
|
74 |
+
layer_name = f"layer_{i}"
|
75 |
+
logging.info(f"Initializing {layer_name} [{size}x{size}] on {self.device}")
|
76 |
+
# Scaled initialization for better stability
|
77 |
+
tensor = torch.randn(size, size, dtype=dtype, device=self.device) * (1.0 / size ** 0.5)
|
78 |
+
model_tensors[layer_name] = tensor
|
79 |
+
self._log_time("Model initialization", start_time)
|
80 |
+
return model_tensors
|
81 |
+
except Exception as e:
|
82 |
+
logging.error(f"Model initialization failed: {str(e)}")
|
83 |
+
raise RuntimeError(f"Failed to initialize model: {str(e)}") from e
|
84 |
+
|
85 |
+
def verify_model(
|
86 |
+
self,
|
87 |
+
original: Dict[str, torch.Tensor],
|
88 |
+
loaded: Dict[str, torch.Tensor],
|
89 |
+
atol: float = 1e-5,
|
90 |
+
rtol: float = 1e-3
|
91 |
+
) -> bool:
|
92 |
+
"""Verifies model integrity with detailed comparison."""
|
93 |
+
all_match = True
|
94 |
+
for key in original:
|
95 |
+
if key not in loaded:
|
96 |
+
logging.warning(f"Missing tensor: {key}")
|
97 |
+
all_match = False
|
98 |
+
continue
|
99 |
+
|
100 |
+
orig, load = original[key], loaded[key]
|
101 |
+
try:
|
102 |
+
if orig.shape != load.shape:
|
103 |
+
logging.warning(f"Shape mismatch in {key}: {orig.shape} vs {load.shape}")
|
104 |
+
all_match = False
|
105 |
+
continue
|
106 |
+
|
107 |
+
if not torch.allclose(orig, load, atol=atol, rtol=rtol):
|
108 |
+
diff = torch.max(torch.abs(orig - load))
|
109 |
+
logging.warning(f"Mismatch in {key}: max diff = {diff}")
|
110 |
+
all_match = False
|
111 |
+
else:
|
112 |
+
logging.info(f"Tensor {key} verified (shape: {orig.shape})")
|
113 |
+
except Exception as e:
|
114 |
+
logging.error(f"Verification failed for {key}: {str(e)}")
|
115 |
+
all_match = False
|
116 |
+
return all_match
|
117 |
+
|
118 |
+
def main():
    """Run an initialize, save, load and verify round trip.

    Returns:
        Process exit status: 0 when the reloaded tensors match the ones that
        were saved, 1 when verification reports a mismatch or any step raises.
    """
    try:
        # Initialize handler
        handler = ModelHandler()

        # Create and save model
        model_data = handler.initialize_model()
        handler.save_model(model_data)

        # Load and verify
        loaded_model_data = handler.load_model()
        is_valid = handler.verify_model(model_data, loaded_model_data)
    except Exception as e:
        logging.error(f"Execution failed: {e}")
        return 1

    # A failed verification must surface as a non-zero exit status; otherwise
    # callers checking the status would treat a corrupted round trip as success.
    if not is_valid:
        logging.error("Model verification failed")
        return 1

    logging.info("Model verification passed")
    return 0
|
138 |
+
|
139 |
+
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the process exit status.
    raise SystemExit(main())
|