# upload_to_hf.py - Script to upload your Mamba Swarm to HuggingFace
import os
import shutil
from huggingface_hub import HfApi, upload_folder
import json
# NOTE: the HuggingFace wrapper classes (MambaSwarmForCausalLM, MambaSwarmConfig) are
# generated by create_modeling_file() below, so they are not imported at module level.

def prepare_model_repo():
    """Prepare model repository structure for HuggingFace"""
    # Create required files for HuggingFace model
    model_files = {
        "README.md": create_model_readme(),
        "config.json": create_model_config(),
        "requirements.txt": create_requirements(),
        "modeling_mamba_swarm.py": create_modeling_file()
    }

    # Create model repo directory
    os.makedirs("hf_model_repo", exist_ok=True)

    # Copy your mamba_swarm code
    shutil.copytree("mamba_swarm", "hf_model_repo/mamba_swarm", dirs_exist_ok=True)

    # Create HuggingFace specific files
    for filename, content in model_files.items():
        with open(f"hf_model_repo/{filename}", "w") as f:
            f.write(content)

    print("Model repository prepared!")
def create_model_readme():
return """---
license: apache-2.0
language:
- en
pipeline_tag: text-generation
tags:
- mamba
- swarm
- routing
- language-model
---
# Mamba Swarm: Dynamic Routing Language Model
A novel architecture combining 100 specialized Mamba encoders with dynamic routing and aggregation for efficient language modeling.
## Architecture
- **100 Mamba Encoders**: Specialized domain experts
- **Dynamic Router**: Selects relevant encoders per input
- **Aggregation Layer**: Combines encoder outputs
- **Mamba Decoder**: Generates final responses
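
At a high level, a forward pass looks roughly like this (illustrative sketch only; the function and module names below are placeholders, not the actual implementation):

```python
def swarm_forward(input_ids, encoders, router, aggregator, decoder, top_k=10):
    scores = router(input_ids)  # one relevance score per encoder
    top = sorted(range(len(encoders)), key=lambda i: scores[i], reverse=True)[:top_k]
    encoder_outputs = [encoders[i](input_ids) for i in top]           # run only the selected encoders
    combined = aggregator(encoder_outputs, [scores[i] for i in top])  # e.g. weighted sum
    return decoder(combined)                                          # decoder produces final logits
```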
## Usage
```python
from transformers import AutoTokenizer
from mamba_swarm import MambaSwarmEngine
# Load the model
model = MambaSwarmEngine.from_pretrained("your-username/mamba-swarm-model")
tokenizer = AutoTokenizer.from_pretrained("your-username/mamba-swarm-model")
# Generate text
input_text = "Explain quantum computing"
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(**inputs, max_length=100)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
```
## Training
This model uses a three-phase training approach:
1. Collective pre-training on general data
2. Domain specialization for encoder groups
3. End-to-end coordination training
## Performance
- **Parameters**: ~7B total (100 × 70M encoders)
- **Domains**: Medical, Legal, Code, Science, General
- **Routing Efficiency**: Only 10-20% of encoders active per query
## Citation
```
@misc{mamba-swarm-2025,
  title={Mamba Swarm: Dynamic Routing for Efficient Language Modeling},
  author={Your Name},
  year={2025}
}
```
"""

def create_model_config():
    config = {
        "model_type": "mamba_swarm",
        "architectures": ["MambaSwarmForCausalLM"],
        "num_encoders": 100,
        "encoder_config": {
            "d_model": 768,
            "n_layer": 24,
            "vocab_size": 50280,
            "ssm_cfg": {},
            "rms_norm": True,
            "residual_in_fp32": True,
            "fused_add_norm": True
        },
        "router_config": {
            "top_k": 10,
            "routing_strategy": "content_based"
        },
        "aggregator_config": {
            "method": "weighted_sum",
            "attention_heads": 8
        },
        "torch_dtype": "float16",
        "use_cache": True
    }
    return json.dumps(config, indent=2)
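
# Optional sanity check (a sketch, not part of the original script): round-trip the
# generated JSON and confirm the routing budget matches the README's 10-20% claim.
def check_generated_config():
    cfg = json.loads(create_model_config())
    active_fraction = cfg["router_config"]["top_k"] / cfg["num_encoders"]
    assert cfg["model_type"] == "mamba_swarm"
    assert 0.10 <= active_fraction <= 0.20, f"unexpected routing budget: {active_fraction:.0%}"
    return cfg
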
def create_requirements():
return """torch>=2.0.0
transformers>=4.35.0
mamba-ssm>=1.2.0
causal-conv1d>=1.2.0
numpy>=1.21.0
scipy>=1.7.0
triton>=2.0.0
einops>=0.6.1
packaging>=20.0
"""

def create_modeling_file():
    return """# modeling_mamba_swarm.py - HuggingFace integration
from transformers import PreTrainedModel, PretrainedConfig
from transformers.modeling_outputs import CausalLMOutputWithPast
import torch
import torch.nn as nn


class MambaSwarmConfig(PretrainedConfig):
    model_type = "mamba_swarm"

    def __init__(
        self,
        num_encoders=100,
        encoder_config=None,
        router_config=None,
        aggregator_config=None,
        **kwargs
    ):
        self.num_encoders = num_encoders
        self.encoder_config = encoder_config or {}
        self.router_config = router_config or {}
        self.aggregator_config = aggregator_config or {}
        super().__init__(**kwargs)


class MambaSwarmForCausalLM(PreTrainedModel):
    config_class = MambaSwarmConfig

    def __init__(self, config):
        super().__init__(config)
        # Import your actual implementation
        from mamba_swarm.system.swarm_engine import MambaSwarmEngine
        self.swarm_engine = MambaSwarmEngine(config)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
        **kwargs
    ):
        # Your forward pass implementation
        outputs = self.swarm_engine(input_ids, attention_mask)

        loss = None
        if labels is not None:
            # Standard causal LM loss: shift so the logits at position i predict token i + 1
            shift_logits = outputs.logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        return CausalLMOutputWithPast(
            loss=loss,
            logits=outputs.logits,
            past_key_values=outputs.past_key_values,
        )

    def generate(self, *args, **kwargs):
        return self.swarm_engine.generate(*args, **kwargs)

    @classmethod
    def from_pretrained(cls, model_name_or_path, *model_args, **kwargs):
        # Custom loading logic if needed
        return super().from_pretrained(model_name_or_path, *model_args, **kwargs)
"""

def upload_model():
    """Upload model code to HuggingFace"""
    api = HfApi()

    # Upload model repository
    upload_folder(
        folder_path="hf_model_repo",
        repo_id="your-username/mamba-swarm-model",  # Replace with your username
        repo_type="model",
        commit_message="Initial upload of Mamba Swarm model"
    )
    print("Model uploaded successfully!")

def upload_weights():
    """Upload model weights separately"""
    # This assumes you have trained weights in checkpoints/
    api = HfApi()

    upload_folder(
        folder_path="checkpoints",
        repo_id="your-username/mamba-swarm-weights",  # Replace with your username
        repo_type="model",
        commit_message="Upload trained model weights"
    )
    print("Weights uploaded successfully!")

if __name__ == "__main__":
    prepare_model_repo()
    upload_model()
    # upload_weights()  # Uncomment when you have trained weights
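
# Suggested workflow (assumes write access to the target repos; not part of the original script):
#   1. Authenticate once, e.g. `huggingface-cli login` or huggingface_hub.login()
#   2. Replace the "your-username/..." repo IDs above with your own namespace
#   3. Run: python upload_to_hf.py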