Spaces:

Debito
/

mamba-encoder-swarm_app

Sleeping

Debito committed on Aug 3

Commit

6db4d44

verified ·

1 Parent(s): 1535ec7

Upload 2 files

Files changed (2) hide show

config.json ADDED Viewed

+{
+  "architectures": ["MambaSwarmForCausalLM"],
+  "auto_map": {
+    "AutoConfig": "configuration_mamba_swarm.MambaSwarmConfig",
+    "AutoModelForCausalLM": "modeling_mamba_swarm.MambaSwarmForCausalLM"
+  },
+  "model_type": "mamba_swarm",
+  "num_mamba_encoders": 5,
+  "max_mamba_encoders": 1000,
+  "d_model": 768,
+  "d_state": 16,
+  "d_conv": 4,
+  "expand_factor": 2,
+  "vocab_size": 50257,
+  "max_sequence_length": 2048,
+  "pad_token_id": 50256,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.36.0",
+  "use_cache": true,
+  "gating_config": {
+    "gating_type": "learned",
+    "top_k": 2,
+    "load_balancing_loss_coef": 0.01
+  },
+  "routing_config": {
+    "routing_strategy": "dynamic",
+    "aggregation_method": "weighted_average"
+  }
+}

config.py ADDED Viewed

+# =============================================================================
+# core/config.py
+# =============================================================================
+import torch
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+# Single configuration object grouping model-architecture, training, swarm,
+# and hardware settings. Two fields (dt_rank, specialist_domains) default to
+# None and are filled in by __post_init__ below.
+@dataclass
+class MambaConfig:
+    # Model architecture
+    vocab_size: int = 50257
+    # NOTE(review): the config.json added in the same commit sets d_model to
+    # 768, while this default is 1024 - confirm which value is intended.
+    d_model: int = 1024
+    n_layers: int = 12
+    # Fixed default (2 * 1024 for the default d_model). It is NOT recomputed
+    # in __post_init__, so overriding d_model alone leaves d_inner at 2048.
+    d_inner: int = 2048
+    d_state: int = 16
+    d_conv: int = 4
+    # None means "derive from d_model" (see __post_init__).
+    dt_rank: Optional[int] = None
+    bias: bool = False
+    conv_bias: bool = True
+    # Training
+    max_seq_len: int = 2048
+    batch_size: int = 8
+    learning_rate: float = 1e-4
+    weight_decay: float = 0.1
+    warmup_steps: int = 1000
+    max_steps: int = 100000
+    # Swarm specific
+    num_specialists: int = 100
+    # NOTE(review): annotated List[str] but the default is None, so the
+    # effective type is Optional[List[str]]. None is used instead of a list
+    # literal because dataclasses reject mutable defaults; __post_init__
+    # replaces it with generated names.
+    specialist_domains: List[str] = None
+    shared_embedding: bool = True
+    hierarchical_sharing: bool = True
+    # Hardware
+    # This default expression runs once, when the class body is executed
+    # (i.e. at import), not each time an instance is created.
+    device: str = "cuda" if torch.cuda.is_available() else "cpu"
+    # NOTE(review): float16 is the default even when device resolves to
+    # "cpu" - confirm that downstream code supports half precision on CPU.
+    dtype: torch.dtype = torch.float16
+    # Runs automatically after the dataclass-generated __init__ and fills in
+    # the fields that were left as None.
+    def __post_init__(self):
+        if self.dt_rank is None:
+            # Integer division of d_model by 16, with a floor of 16.
+            self.dt_rank = max(16, self.d_model // 16)
+        if self.specialist_domains is None:
+            # One placeholder name per specialist: "domain_0", "domain_1", ...
+            # A caller-supplied list is kept as-is; its length is not checked
+            # against num_specialists.
+            self.specialist_domains = [f"domain_{i}" for i in range(self.num_specialists)]