GeminiFan207 committed on
Commit
7a92675
·
verified ·
1 Parent(s): e122497

Create model_4_of_10.safetensors

Browse files
Files changed (1) hide show
  1. model_4_of_10.safetensors +140 -0
model_4_of_10.safetensors ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from safetensors.torch import load_file, save_file
3
+ import logging
4
+ from typing import Dict, List, Optional
5
+ import time
6
+ from pathlib import Path
7
+ import sys
8
+
9
# Send log records both to stdout and to a local log file.
# NOTE: a plain FileHandler is used here, so the log file is not rotated.
logging.basicConfig(
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler("model_operations.log"),
    ],
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
18
+
19
+ class ModelHandler:
20
+ """Class to handle model operations with improved efficiency."""
21
+
22
+ DEFAULT_CHECKPOINT = Path("Model_4_of_10.safetensors")
23
+
24
+ def __init__(self, checkpoint_path: str | Path = DEFAULT_CHECKPOINT):
25
+ self.checkpoint_path = Path(checkpoint_path)
26
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
27
+
28
+ def _log_time(self, operation: str, start_time: float) -> None:
29
+ """Helper method for consistent timing logging."""
30
+ elapsed = time.time() - start_time
31
+ logging.info(f"{operation} completed in {elapsed:.2f} seconds")
32
+
33
+ def load_model(self) -> Dict[str, torch.Tensor]:
34
+ """Loads model with memory-efficient handling."""
35
+ start_time = time.time()
36
+ try:
37
+ logging.info(f"Loading model from {self.checkpoint_path}")
38
+ # Load to CPU first to manage memory, then move to target device
39
+ model_data = load_file(str(self.checkpoint_path), device="cpu")
40
+ for key in model_data:
41
+ model_data[key] = model_data[key].to(self.device)
42
+ self._log_time("Model loading", start_time)
43
+ return model_data
44
+ except Exception as e:
45
+ logging.error(f"Model loading failed: {str(e)}")
46
+ raise RuntimeError(f"Failed to load model: {str(e)}") from e
47
+
48
+ def save_model(self, model_tensors: Dict[str, torch.Tensor]) -> None:
49
+ """Saves model with validation and error handling."""
50
+ start_time = time.time()
51
+ try:
52
+ logging.info(f"Saving model to {self.checkpoint_path}")
53
+ self.checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
54
+ save_file(model_tensors, str(self.checkpoint_path))
55
+ self._log_time("Model saving", start_time)
56
+ except Exception as e:
57
+ logging.error(f"Model saving failed: {str(e)}")
58
+ raise RuntimeError(f"Failed to save model: {str(e)}") from e
59
+
60
+ def initialize_model(
61
+ self,
62
+ layers: List[int] = [8192, 16384, 32768],
63
+ dtype: torch.dtype = torch.bfloat16,
64
+ seed: Optional[int] = 42
65
+ ) -> Dict[str, torch.Tensor]:
66
+ """Initializes model with optimized parameters."""
67
+ if seed is not None:
68
+ torch.manual_seed(seed)
69
+
70
+ model_tensors = {}
71
+ start_time = time.time()
72
+ try:
73
+ for i, size in enumerate(layers, 1):
74
+ layer_name = f"layer_{i}"
75
+ logging.info(f"Initializing {layer_name} [{size}x{size}] on {self.device}")
76
+ # Scaled initialization for better stability
77
+ tensor = torch.randn(size, size, dtype=dtype, device=self.device) * (1.0 / size ** 0.5)
78
+ model_tensors[layer_name] = tensor
79
+ self._log_time("Model initialization", start_time)
80
+ return model_tensors
81
+ except Exception as e:
82
+ logging.error(f"Model initialization failed: {str(e)}")
83
+ raise RuntimeError(f"Failed to initialize model: {str(e)}") from e
84
+
85
+ def verify_model(
86
+ self,
87
+ original: Dict[str, torch.Tensor],
88
+ loaded: Dict[str, torch.Tensor],
89
+ atol: float = 1e-5,
90
+ rtol: float = 1e-3
91
+ ) -> bool:
92
+ """Verifies model integrity with detailed comparison."""
93
+ all_match = True
94
+ for key in original:
95
+ if key not in loaded:
96
+ logging.warning(f"Missing tensor: {key}")
97
+ all_match = False
98
+ continue
99
+
100
+ orig, load = original[key], loaded[key]
101
+ try:
102
+ if orig.shape != load.shape:
103
+ logging.warning(f"Shape mismatch in {key}: {orig.shape} vs {load.shape}")
104
+ all_match = False
105
+ continue
106
+
107
+ if not torch.allclose(orig, load, atol=atol, rtol=rtol):
108
+ diff = torch.max(torch.abs(orig - load))
109
+ logging.warning(f"Mismatch in {key}: max diff = {diff}")
110
+ all_match = False
111
+ else:
112
+ logging.info(f"Tensor {key} verified (shape: {orig.shape})")
113
+ except Exception as e:
114
+ logging.error(f"Verification failed for {key}: {str(e)}")
115
+ all_match = False
116
+ return all_match
117
+
118
def main() -> int:
    """Run a full create / save / load / verify round trip.

    Returns:
        Process exit status: ``0`` when the reloaded tensors match the ones
        that were created, ``1`` when verification fails or any step raises.
    """
    try:
        # Initialize handler
        handler = ModelHandler()

        # Create and save model
        model_data = handler.initialize_model()
        handler.save_model(model_data)

        # Load and verify
        loaded_model_data = handler.load_model()
        is_valid = handler.verify_model(model_data, loaded_model_data)
    except Exception as e:
        logging.error(f"Execution failed: {str(e)}")
        return 1

    logging.info(f"Model verification {'passed' if is_valid else 'failed'}")
    # A failed verification must not look like success to the calling shell.
    return 0 if is_valid else 1
138
+
139
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())