ZeppelinCorp
/

Charm_15

@@ -1,67 +1,128 @@
-import os
 import torch
-from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
-from safetensors.torch import save_file
-# Define model and output settings
-model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # Corrected to a real model
-output_dir = "mixtral_8x7b_safetensors"
-max_shard_size = 2 * 1024 * 1024 * 1024  # 2GB per shard in bytes
-dtype = torch.float16  # Half-precision to save space
-# Create output directory
-os.makedirs(output_dir, exist_ok=True)
-try:
-    # Load config and tokenizer first (low memory footprint)
-    config = AutoConfig.from_pretrained(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    # Save config and tokenizer for later use
-    config.save_pretrained(output_dir)
-    tokenizer.save_pretrained(output_dir)
-    # Load model with offloading to avoid OOM (if GPU available)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        torch_dtype=dtype,
-        device_map="auto",  # Auto-distribute across GPU/CPU
-        low_cpu_mem_usage=True  # Reduce RAM usage during load
-    )
-    # Get state dict
-    state_dict = model.state_dict()
-    # Estimate total size and shard dynamically
-    total_size = sum(t.element_size() * t.nelement() for t in state_dict.values())
-    num_shards = max(1, int(total_size / max_shard_size) + 1)  # At least 1 shard
-    # Distribute parameters by size, not count
-    shards = [{} for _ in range(num_shards)]
-    current_size = [0] * num_shards
-    shard_index = 0
-    for key, value in state_dict.items():
-        tensor_size = value.element_size() * value.nelement()
-        # Move to next shard if current one exceeds size limit
-        while current_size[shard_index] + tensor_size > max_shard_size and shard_index < num_shards - 1:
-            shard_index += 1
-        shards[shard_index][key] = value
-        current_size[shard_index] += tensor_size
-    # Save each shard
-    for i, shard in enumerate(shards):
-        if shard:  # Only save non-empty shards
-            shard_path = os.path.join(output_dir, f"model_shard_{i}.safetensors")
-            save_file(shard, shard_path)
-            print(f"Saved shard {i} to {shard_path}")
-    print(f"Model saved to {output_dir} with {len([s for s in shards if s])} shards")
-except Exception as e:
-    print(f"Error occurred: {str(e)}")
-finally:
-    # Clean up memory
-    if 'model' in locals():
-        del model
-        torch.cuda.empty_cache()  # Clear GPU memory if used

+from safetensors.torch import load_file, save_file
 import torch
+from typing import List, Dict, Optional
+import logging
+from tqdm import tqdm
+import os
+import hashlib
+from concurrent.futures import ThreadPoolExecutor, as_completed
+# Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+def calculate_checksum(file_path: str) -> str:
+    """
+    Calculate the SHA-256 checksum of a file.
+    Args:
+        file_path (str): Path to the file.
+    Returns:
+        str: SHA-256 checksum of the file.
+    """
+    sha256 = hashlib.sha256()
+    with open(file_path, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            sha256.update(chunk)
+    return sha256.hexdigest()
+def verify_checksums(model_parts: List[str], expected_checksums: List[str]) -> None:
+    """
+    Verify the checksums of model part files.
+    Args:
+        model_parts (list): List of model part file paths.
+        expected_checksums (list): List of expected checksums for each part.
+    Raises:
+        RuntimeError: If any checksum does not match.
+    """
+    for part, expected_checksum in zip(model_parts, expected_checksums):
+        actual_checksum = calculate_checksum(part)
+        if actual_checksum != expected_checksum:
+            raise RuntimeError(f"Checksum mismatch for {part}: expected {expected_checksum}, got {actual_checksum}")
+def load_part(part: str) -> Dict[str, torch.Tensor]:
+    """
+    Load a single model part.
+    Args:
+        part (str): Path to the model part file.
+    Returns:
+        dict: State dictionary of the model part.
+    """
+    return load_file(part)
+def load_charm_model(model_parts: List[str], expected_checksums: Optional[List[str]] = None) -> Dict[str, torch.Tensor]:
+    """
+    Load and merge multiple .safetensors model files.
+    Args:
+        model_parts (list): List of model part file paths (e.g., ["model-1-of-10.safetensors", ...]).
+        expected_checksums (list, optional): List of expected checksums for each part.
+    Returns:
+        dict: Merged model state dictionary.
+    Raises:
+        FileNotFoundError: If any model part file is missing.
+        RuntimeError: If there is an issue loading or merging the model parts.
+    """
+    merged_state_dict = {}
+    # Check if all model parts exist
+    for part in model_parts:
+        if not os.path.exists(part):
+            raise FileNotFoundError(f"Model part not found: {part}")
+    # Verify checksums if provided
+    if expected_checksums:
+        logger.info("Verifying checksums...")
+        verify_checksums(model_parts, expected_checksums)
+        logger.info("Checksums verified successfully.")
+    # Load and merge model parts in parallel
+    try:
+        logger.info("Loading and merging model parts...")
+        with ThreadPoolExecutor() as executor:
+            futures = {executor.submit(load_part, part): part for part in model_parts}
+            for future in tqdm(as_completed(futures), total=len(futures), desc="Loading model parts"):
+                part = futures[future]
+                try:
+                    state_dict = future.result()
+                    merged_state_dict.update(state_dict)  # Merge parameters
+                    logger.debug(f"Loaded part: {part}")
+                except Exception as e:
+                    logger.error(f"Error loading part {part}: {e}")
+                    raise RuntimeError(f"Failed to load part: {part}")
+        logger.info("Model parts loaded and merged successfully.")
+        return merged_state_dict
+    except Exception as e:
+        logger.error(f"Error loading or merging model parts: {e}")
+        raise RuntimeError("Failed to load or merge model parts.")
+# Example usage
+if __name__ == "__main__":
+    try:
+        # List of model part files
+        model_files = [f"model-{i}-of-10.safetensors" for i in range(1, 11)]
+        # Optional: List of expected checksums for each part
+        expected_checksums = [
+            "checksum_for_model-1-of-10.safetensors",
+            "checksum_for_model-2-of-10.safetensors",
+            # Add checksums for all parts...
+        ]
+        # Load and merge the model
+        charm_model = load_charm_model(model_files, expected_checksums)
+        # Save the merged model as a .safetensors file
+        output_file = "merged_model.safetensors"
+        save_file(charm_model, output_file)
+        logger.info(f"Merged model saved as '{output_file}'.")
+    except Exception as e:
+        logger.error(f"An error occurred: {e}")