Debito committed on
Commit fcf0a07 · verified · 1 Parent(s): 687ec98

Upload 4 files

system/inference.py ADDED
@@ -0,0 +1,138 @@
1
+ # =============================================================================
2
+ # system/inference.py
3
+ # =============================================================================
4
+ import torch
5
+ from typing import Dict, List, Optional, Union
6
+ import time
7
+
8
+ class MambaInferenceEngine:
9
+ """Optimized inference engine for Mamba swarm"""
10
+
11
+ def __init__(self, swarm_engine):
12
+ self.swarm_engine = swarm_engine
13
+ self.config = swarm_engine.config
14
+
15
+ # Inference optimizations
16
+ self.use_half_precision = True
17
+ self.use_torch_compile = hasattr(torch, 'compile')
18
+
19
+ # Apply optimizations
20
+ self._optimize_models()
21
+
22
+ def _optimize_models(self):
23
+ """Apply inference optimizations"""
24
+ if self.use_half_precision and self.config.device != 'cpu':
25
+ # Convert to half precision for faster inference
26
+ for specialist in self.swarm_engine.tlm_manager.specialists.values():
27
+ specialist.model = specialist.model.half()
28
+ self.swarm_engine.aggregator = self.swarm_engine.aggregator.half()
29
+
30
+ if self.use_torch_compile:
31
+ try:
32
+ # Compile models for faster inference (PyTorch 2.0+)
33
+ for specialist in self.swarm_engine.tlm_manager.specialists.values():
34
+ specialist.model = torch.compile(specialist.model)
35
+ self.swarm_engine.aggregator = torch.compile(self.swarm_engine.aggregator)
36
+ print("Models compiled for faster inference")
37
+ except Exception as e:
38
+ print(f"Could not compile models: {e}")
39
+
40
+ def generate(self, prompt: str, max_tokens: int = 100,
41
+ temperature: float = 0.7, top_k: int = 50) -> Dict:
42
+ """
43
+ Generate text response with advanced sampling
44
+
45
+ Args:
46
+ prompt: Input text prompt
47
+ max_tokens: Maximum tokens to generate
48
+ temperature: Sampling temperature
49
+ top_k: Top-k sampling parameter
50
+
51
+ Returns:
52
+ Dict with generated text and metadata
53
+ """
54
+ start_time = time.time()
55
+
56
+ # Process through swarm
57
+ result = self.swarm_engine.process_request(prompt, max_tokens)
58
+
59
+ if not result['success']:
60
+ return result
61
+
62
+ # Add inference metadata
63
+ result.update({
64
+ 'temperature': temperature,
65
+ 'top_k': top_k,
66
+ 'inference_time': time.time() - start_time,
67
+ 'tokens_per_second': max_tokens / max(time.time() - start_time, 1e-6)  # note: based on requested max_tokens, not the actual generated count
68
+ })
69
+
70
+ return result
71
+
72
+ def stream_generate(self, prompt: str, max_tokens: int = 100):
73
+ """
74
+ Stream generation token by token (placeholder implementation)
75
+ """
76
+ # This would implement streaming generation
77
+ # For now, return the full response
78
+ result = self.generate(prompt, max_tokens)
79
+ yield result['response']
80
+
81
+ def chat_completion(self, messages: List[Dict], max_tokens: int = 100) -> Dict:
82
+ """
83
+ Chat completion interface similar to OpenAI API
84
+
85
+ Args:
86
+ messages: List of message dicts with 'role' and 'content'
87
+ max_tokens: Maximum tokens to generate
88
+
89
+ Returns:
90
+ Chat completion response
91
+ """
92
+ # Convert messages to single prompt
93
+ prompt = self._format_chat_prompt(messages)
94
+
95
+ # Generate response
96
+ result = self.generate(prompt, max_tokens)
97
+
98
+ if result['success']:
99
+ # Format as chat completion
100
+ return {
101
+ 'choices': [{
102
+ 'message': {
103
+ 'role': 'assistant',
104
+ 'content': result['response']
105
+ },
106
+ 'finish_reason': 'stop'
107
+ }],
108
+ 'usage': {
109
+ 'prompt_tokens': len(prompt.split()),
110
+ 'completion_tokens': len(result['response'].split()),
111
+ 'total_tokens': len(prompt.split()) + len(result['response'].split())
112
+ },
113
+ 'model': 'mamba-swarm-70m',
114
+ 'inference_time': result.get('inference_time', 0)
115
+ }
116
+ else:
117
+ return {
118
+ 'error': result.get('error', 'Unknown error'),
119
+ 'success': False
120
+ }
121
+
122
+ def _format_chat_prompt(self, messages: List[Dict]) -> str:
123
+ """Format chat messages into a single prompt"""
124
+ formatted = ""
125
+
126
+ for message in messages:
127
+ role = message.get('role', 'user')
128
+ content = message.get('content', '')
129
+
130
+ if role == 'system':
131
+ formatted += f"System: {content}\n"
132
+ elif role == 'user':
133
+ formatted += f"User: {content}\n"
134
+ elif role == 'assistant':
135
+ formatted += f"Assistant: {content}\n"
136
+
137
+ formatted += "Assistant: "
138
+ return formatted
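
A minimal usage sketch for the engine above. Assumptions: the repository root is on PYTHONPATH so the system.* imports resolve, and the wrapped swarm actually exposes the config, tlm_manager.specialists and aggregator attributes that _optimize_models reaches for (the use_pretrained=False path is the one that builds a tlm_manager).

    # Hypothetical driver, not part of this commit
    from system.mambaSwarm import UnifiedMambaSwarm
    from system.inference import MambaInferenceEngine

    swarm = UnifiedMambaSwarm(tier="demo", use_pretrained=False)
    engine = MambaInferenceEngine(swarm)

    # Plain generation, with the metadata that generate() adds
    result = engine.generate("Explain state-space models", max_tokens=64)
    print(result.get("response"))
    print("tokens/sec:", result.get("tokens_per_second"))

    # OpenAI-style chat interface defined in this file
    chat = engine.chat_completion([
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Summarize Mamba in one sentence."},
    ])
    print(chat["choices"][0]["message"]["content"] if "choices" in chat else chat)
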
system/mambaSwarm.py ADDED
@@ -0,0 +1,816 @@
1
+ # =============================================================================
2
+ # system/mambaSwarm.py - Unified Scalable Mamba Swarm Engine
3
+ # =============================================================================
4
+ import torch
5
+ import time
6
+ import os
7
+ import asyncio
8
+ from typing import Dict, List, Tuple, Optional, Union
9
+ from concurrent.futures import ThreadPoolExecutor
10
+ from transformers import AutoModelForCausalLM, AutoTokenizer
11
+
12
+ # Core imports
13
+ from core.config import MambaConfig, MambaSwarmConfig, auto_detect_tier
14
+ from core.tokenizer import MambaTokenizer
15
+ from core.preprocess import TextPreprocessor
16
+ from core.model import MambaModel
17
+ from core.mamba_swarm_integration import MambaEncoderSwarmModel, create_swarm_from_existing_config
18
+
19
+ # Routing imports
20
+ from routing.router import TopicRouter, ContentBasedRouter
21
+ from routing.tlm_manager import TLMManager
22
+ from routing.aggregator import AttentionAggregator, WeightedAggregator
23
+ from utils.domain_configs import DomainConfigs
24
+
25
+
26
+ class UnifiedMambaSwarm:
27
+ """
28
+ Unified Mamba Swarm Engine combining the best of both architectures:
29
+ - Scalable tier-based system with auto-detection
30
+ - Production-ready async processing and monitoring
31
+ - Graceful fallback to simulation mode
32
+ - Support for both custom and pre-trained models
33
+ """
34
+
35
+ def __init__(self,
36
+ tier: Optional[str] = None,
37
+ config: Optional[Union[MambaConfig, MambaSwarmConfig]] = None,
38
+ use_pretrained: bool = True,
39
+ config_override: Optional[Dict] = None):
40
+ """
41
+ Initialize the unified swarm engine
42
+
43
+ Args:
44
+ tier: Scaling tier (demo/small/medium/large/full) or None for auto-detect
45
+ config: Either MambaConfig for custom models or MambaSwarmConfig for scaling
46
+ use_pretrained: Whether to use HuggingFace pretrained models
47
+ config_override: Dictionary to override config settings
48
+ """
49
+ # Auto-detect tier if not specified
50
+ if tier is None:
51
+ tier = auto_detect_tier()
52
+ print(f"Auto-detected tier: {tier}")
53
+
54
+ self.tier = tier
55
+ self.use_pretrained = use_pretrained
56
+
57
+ # Initialize configuration
58
+ if config is None:
59
+ if use_pretrained:
60
+ self.swarm_config = MambaSwarmConfig(tier=tier)
61
+ if config_override:
62
+ self.swarm_config.config.update(config_override)
63
+ self.config = self._create_legacy_config()
64
+ else:
65
+ # Use custom config for legacy components
66
+ self.config = MambaConfig() # Default config
67
+ self.swarm_config = None
68
+ else:
69
+ if isinstance(config, MambaSwarmConfig):
70
+ self.swarm_config = config
71
+ self.config = self._create_legacy_config()
72
+ else:
73
+ self.config = config
74
+ self.swarm_config = None
75
+
76
+ self.device = getattr(self.config, 'device', 'cuda' if torch.cuda.is_available() else 'cpu')
77
+
78
+ # System properties
79
+ if self.swarm_config:
80
+ self.num_encoders = self.swarm_config.config["num_encoders"]
81
+ self.encoder_size = self.swarm_config.config["encoder_size"]
82
+ else:
83
+ self.num_encoders = getattr(self.config, 'num_specialists', 5)
84
+ self.encoder_size = "130M"
85
+
86
+ # Initialize components
87
+ self.encoders = []
88
+ self.tokenizer = None
89
+ self.preprocessor = None
90
+ self.router = None
91
+ self.aggregator = None
92
+ self.tlm_manager = None
93
+
94
+ # Performance tracking
95
+ self.stats = {
96
+ 'total_requests': 0,
97
+ 'total_tokens_processed': 0,
98
+ 'avg_response_time': 0.0,
99
+ 'specialist_usage': {i: 0 for i in range(self.num_encoders)},
100
+ 'simulation_mode': False,
101
+ 'model_load_errors': 0
102
+ }
103
+
104
+ # Initialize system
105
+ self._initialize_system()
106
+
107
+ print(f"✅ Unified Mamba Swarm initialized: {self.tier} tier, {self.num_encoders} encoders")
108
+
109
+ def _create_legacy_config(self) -> MambaConfig:
110
+ """Create legacy MambaConfig from SwarmConfig for compatibility"""
111
+ legacy_config = MambaConfig()
112
+ if self.swarm_config:
113
+ legacy_config.num_specialists = self.swarm_config.config["num_encoders"]
114
+ legacy_config.device = 'cuda' if torch.cuda.is_available() else 'cpu'
115
+ return legacy_config
116
+
117
+ def _initialize_system(self):
118
+ """Initialize the complete swarm system"""
119
+ try:
120
+ # Initialize tokenizer and preprocessor
121
+ self._initialize_tokenizer()
122
+ self._initialize_preprocessor()
123
+
124
+ # Initialize encoders/specialists
125
+ if self.use_pretrained:
126
+ self._initialize_pretrained_encoders()
127
+ else:
128
+ self._initialize_custom_specialists()
129
+
130
+ # Initialize routing system
131
+ self._initialize_routing()
132
+
133
+ # Initialize aggregation system
134
+ self._initialize_aggregation()
135
+
136
+ print(f"🚀 System initialization complete!")
137
+
138
+ except Exception as e:
139
+ print(f"⚠️ Error during initialization: {e}")
140
+ self._fallback_to_simulation()
141
+
142
+ def _initialize_tokenizer(self):
143
+ """Initialize tokenizer based on mode"""
144
+ if self.use_pretrained:
145
+ base_model_name = self._get_base_model_name()
146
+ try:
147
+ self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
148
+ if self.tokenizer.pad_token is None:
149
+ self.tokenizer.pad_token = self.tokenizer.eos_token
150
+ print(f"📝 Loaded HuggingFace tokenizer: {base_model_name}")
151
+ except Exception:
152
+ print("⚠️ HuggingFace tokenizer failed, using custom tokenizer")
153
+ self.tokenizer = MambaTokenizer(self.config)
154
+ else:
155
+ self.tokenizer = MambaTokenizer(self.config)
156
+
157
+ def _initialize_preprocessor(self):
158
+ """Initialize text preprocessor"""
159
+ self.preprocessor = TextPreprocessor(self.config)
160
+
161
+ def _get_base_model_name(self):
162
+ """Get the appropriate base model for current tier"""
163
+ model_mapping = {
164
+ "130M": "state-spaces/mamba-130m",
165
+ "370M": "state-spaces/mamba-370m",
166
+ "790M": "state-spaces/mamba-790m",
167
+ "1.4B": "state-spaces/mamba-1.4b",
168
+ "2.8B": "state-spaces/mamba-2.8b"
169
+ }
170
+ return model_mapping.get(self.encoder_size, "state-spaces/mamba-130m")
171
+
172
+ def _initialize_pretrained_encoders(self):
173
+ """Initialize pretrained encoder swarm"""
174
+ print(f"🔄 Loading {self.num_encoders} pretrained encoders...")
175
+
176
+ base_model_name = self._get_base_model_name()
177
+
178
+ try:
179
+ # Load base model
180
+ base_model = AutoModelForCausalLM.from_pretrained(
181
+ base_model_name,
182
+ torch_dtype=torch.float16 if self.num_encoders > 5 else torch.float32,
183
+ device_map="auto" if torch.cuda.is_available() else "cpu"
184
+ )
185
+
186
+ # Create encoder instances
187
+ for i in range(self.num_encoders):
188
+ domain_info = self.swarm_config.domain_assignments[i] if self.swarm_config else {
189
+ "domain": f"general_{i}", "specialty": "general"
190
+ }
191
+
192
+ if self.tier == "demo" or self.num_encoders <= 5:
193
+ # Share model instance for smaller configurations
194
+ encoder = {
195
+ "id": i,
196
+ "model": base_model,
197
+ "domain": domain_info["domain"],
198
+ "specialty": domain_info["specialty"],
199
+ "shared": True
200
+ }
201
+ else:
202
+ # Separate instances for larger configurations
203
+ encoder = {
204
+ "id": i,
205
+ "model": AutoModelForCausalLM.from_pretrained(
206
+ base_model_name,
207
+ torch_dtype=torch.float16,
208
+ device_map="auto"
209
+ ),
210
+ "domain": domain_info["domain"],
211
+ "specialty": domain_info["specialty"],
212
+ "shared": False
213
+ }
214
+
215
+ self.encoders.append(encoder)
216
+ print(f" ✓ Encoder {i}: {encoder['domain']} specialist")
217
+
218
+ except Exception as e:
219
+ print(f"❌ Failed to load pretrained models: {e}")
220
+ self.stats['model_load_errors'] += 1
221
+ self._create_simulated_encoders()
222
+
223
+ def _initialize_custom_specialists(self):
224
+ """Initialize custom TLM specialists or native Mamba swarm"""
225
+ try:
226
+ if hasattr(self, 'use_native_swarm') and self.use_native_swarm:
227
+ # Use the native Mamba swarm integration
228
+ self.native_swarm_model = create_swarm_from_existing_config(
229
+ self.config, num_encoders=self.num_encoders
230
+ )
231
+ print(f"✓ Initialized native Mamba swarm with {self.num_encoders} encoders")
232
+ else:
233
+ # Use TLM manager (legacy approach)
234
+ self.tlm_manager = TLMManager(self.config)
235
+ print(f"✓ Initialized {self.num_encoders} custom specialists")
236
+ except Exception as e:
237
+ print(f"⚠️ Custom specialists failed: {e}")
238
+ self._create_simulated_encoders()
239
+
240
+ def _create_simulated_encoders(self):
241
+ """Create simulated encoders for demonstration/fallback"""
242
+ print("🎭 Creating simulated encoders...")
243
+ self.stats['simulation_mode'] = True
244
+
245
+ for i in range(self.num_encoders):
246
+ domain_info = self.swarm_config.domain_assignments[i] if self.swarm_config else {
247
+ "domain": f"general_{i}", "specialty": "general"
248
+ }
249
+
250
+ encoder = {
251
+ "id": i,
252
+ "model": None,
253
+ "domain": domain_info["domain"],
254
+ "specialty": domain_info["specialty"],
255
+ "simulated": True
256
+ }
257
+ self.encoders.append(encoder)
258
+
259
+ def _initialize_routing(self):
260
+ """Initialize routing system"""
261
+ try:
262
+ if self.use_pretrained and self.swarm_config:
263
+ # Use content-based router for pretrained models
264
+ router_config = self.swarm_config.get_router_config()
265
+ self.router = ContentBasedRouter(
266
+ num_encoders=self.num_encoders,
267
+ domain_assignments=self.swarm_config.domain_assignments,
268
+ config=router_config
269
+ )
270
+ else:
271
+ # Use topic router for custom models
272
+ domain_configs = DomainConfigs.get_domain_configs(self.num_encoders)
273
+ self.router = TopicRouter(self.config, domain_configs)
274
+ if hasattr(self.router, 'to'):
275
+ self.router.to(self.device)
276
+
277
+ print("🧭 Router initialized")
278
+
279
+ except Exception as e:
280
+ print(f"⚠️ Router initialization failed: {e}")
281
+ # Create basic fallback router
282
+ self.router = self._create_fallback_router()
283
+
284
+ def _initialize_aggregation(self):
285
+ """Initialize aggregation system"""
286
+ try:
287
+ if self.use_pretrained:
288
+ self.aggregator = WeightedAggregator(
289
+ num_encoders=self.num_encoders,
290
+ hidden_dim=768
291
+ )
292
+ else:
293
+ self.aggregator = AttentionAggregator(self.config)
294
+ if hasattr(self.aggregator, 'to'):
295
+ self.aggregator.to(self.device)
296
+
297
+ print("🔄 Aggregator initialized")
298
+
299
+ except Exception as e:
300
+ print(f"⚠️ Aggregator initialization failed: {e}")
301
+ self.aggregator = None
302
+
303
+ def _create_fallback_router(self):
304
+ """Create a simple fallback router"""
305
+ class FallbackRouter:
306
+ def __init__(self, num_encoders):
307
+ self.num_encoders = num_encoders
308
+
309
+ def route(self, text):
310
+ # Fallback: pick a random subset of encoders
311
+ import random
312
+ num_selected = min(3, self.num_encoders)
313
+ return {
314
+ "selected_encoders": random.sample(range(self.num_encoders), num_selected)
315
+ }
316
+
317
+ def chunk_and_route(self, text):
318
+ return [{"specialists": [(0, 1.0)], "chunk": text}]
319
+
320
+ return FallbackRouter(self.num_encoders)
321
+
322
+ def _fallback_to_simulation(self):
323
+ """Complete fallback to simulation mode"""
324
+ print("🎭 Entering full simulation mode")
325
+ self.stats['simulation_mode'] = True
326
+ self._create_simulated_encoders()
327
+ if not self.router:
328
+ self.router = self._create_fallback_router()
329
+
330
+ # =============================================================================
331
+ # MAIN PROCESSING METHODS
332
+ # =============================================================================
333
+
334
+ def generate(self, prompt: str, max_length: int = 100, temperature: float = 0.7,
335
+ show_routing: bool = True) -> Dict:
336
+ """
337
+ Generate response using the swarm (from swarmEngine2 style)
338
+
339
+ Args:
340
+ prompt: Input text prompt
341
+ max_length: Maximum tokens to generate
342
+ temperature: Sampling temperature
343
+ show_routing: Whether to display routing information
344
+
345
+ Returns:
346
+ Dict with response and metadata
347
+ """
348
+ start_time = time.time()
349
+
350
+ try:
351
+ # Route to appropriate encoders
352
+ if hasattr(self.router, 'route'):
353
+ routing_decision = self.router.route(prompt)
354
+ selected_encoders = routing_decision.get("selected_encoders", [0])
355
+ else:
356
+ # Fallback routing
357
+ selected_encoders = [0]
358
+
359
+ if show_routing:
360
+ print(f"🔀 Routing: Selected {len(selected_encoders)} encoders")
361
+ for enc_id in selected_encoders[:3]:
362
+ if enc_id < len(self.encoders):
363
+ domain = self.encoders[enc_id]["domain"]
364
+ print(f" Encoder {enc_id}: {domain}")
365
+
366
+ # Generate response
367
+ if self.stats['simulation_mode'] or any(enc.get("simulated") for enc in self.encoders):
368
+ response = self._simulate_generation(prompt, selected_encoders, max_length)
369
+ else:
370
+ response = self._real_generation(prompt, selected_encoders, max_length, temperature)
371
+
372
+ # Update statistics
373
+ processing_time = time.time() - start_time
374
+ self._update_stats_simple(prompt, selected_encoders, processing_time)
375
+
376
+ return {
377
+ "response": response,
378
+ "processing_time": processing_time,
379
+ "routing_info": {
380
+ "selected_encoders": selected_encoders,
381
+ "num_active": len(selected_encoders),
382
+ "total_encoders": self.num_encoders,
383
+ "domains": [self.encoders[i]["domain"] for i in selected_encoders
384
+ if i < len(self.encoders)]
385
+ },
386
+ "success": True
387
+ }
388
+
389
+ except Exception as e:
390
+ return {
391
+ "response": f"Error generating response: {str(e)}",
392
+ "processing_time": time.time() - start_time,
393
+ "success": False,
394
+ "error": str(e)
395
+ }
396
+
397
+ def process_request(self, text: str, max_new_tokens: int = 100) -> Dict:
398
+ """
399
+ Process request using traditional pipeline (from swarm_engine style)
400
+
401
+ Args:
402
+ text: Input text to process
403
+ max_new_tokens: Maximum tokens to generate
404
+
405
+ Returns:
406
+ Dict with response and metadata
407
+ """
408
+ start_time = time.time()
409
+
410
+ try:
411
+ # Step 1: Preprocess input
412
+ if self.preprocessor:
413
+ clean_text = self.preprocessor.clean_text(text)
414
+ else:
415
+ clean_text = text
416
+
417
+ # Step 2: Route to specialists
418
+ if hasattr(self.router, 'chunk_and_route'):
419
+ routing_results = self.router.chunk_and_route(clean_text)
420
+ else:
421
+ # Fallback for content-based router
422
+ routing_decision = self.router.route(clean_text)
423
+ routing_results = [{"specialists": [(enc_id, 1.0) for enc_id in routing_decision["selected_encoders"]],
424
+ "chunk": clean_text}]
425
+
426
+ # Step 3: Process chunks
427
+ if self.tlm_manager and not self.stats['simulation_mode']:
428
+ specialist_outputs = self.tlm_manager.encode_parallel(routing_results)
429
+ else:
430
+ # Simulate processing
431
+ specialist_outputs = [{"response": f"Processed chunk: {res['chunk'][:50]}..."}
432
+ for res in routing_results]
433
+
434
+ # Step 4: Aggregate results
435
+ if self.aggregator and not self.stats['simulation_mode']:
436
+ response = self.aggregator.generate_response(specialist_outputs, max_new_tokens)
437
+ else:
438
+ # Simple aggregation fallback
439
+ response = " ".join([out.get("response", "") for out in specialist_outputs])
440
+
441
+ # Update stats
442
+ processing_time = time.time() - start_time
443
+ self._update_stats(text, routing_results, processing_time)
444
+
445
+ return {
446
+ 'response': response,
447
+ 'processing_time': processing_time,
448
+ 'chunks_processed': len(routing_results),
449
+ 'specialists_used': self._get_specialists_used(routing_results),
450
+ 'success': True
451
+ }
452
+
453
+ except Exception as e:
454
+ return {
455
+ 'response': f"Error processing request: {str(e)}",
456
+ 'processing_time': time.time() - start_time,
457
+ 'success': False,
458
+ 'error': str(e)
459
+ }
460
+
461
+ # =============================================================================
462
+ # ASYNC AND BATCH PROCESSING
463
+ # =============================================================================
464
+
465
+ async def process_request_async(self, text: str, max_new_tokens: int = 100) -> Dict:
466
+ """Async version of process_request"""
467
+ loop = asyncio.get_running_loop()
468
+
469
+ with ThreadPoolExecutor() as executor:
470
+ result = await loop.run_in_executor(
471
+ executor, self.process_request, text, max_new_tokens
472
+ )
473
+
474
+ return result
475
+
476
+ async def generate_async(self, prompt: str, max_length: int = 100,
477
+ temperature: float = 0.7) -> Dict:
478
+ """Async version of generate"""
479
+ loop = asyncio.get_running_loop()
480
+
481
+ with ThreadPoolExecutor() as executor:
482
+ result = await loop.run_in_executor(
483
+ executor, self.generate, prompt, max_length, temperature, False
484
+ )
485
+
486
+ return result
487
+
488
+ def batch_process(self, texts: List[str], max_new_tokens: int = 100,
489
+ method: str = "process") -> List[Dict]:
490
+ """
491
+ Process multiple texts in batch
492
+
493
+ Args:
494
+ texts: List of input texts
495
+ max_new_tokens: Maximum tokens to generate
496
+ method: "process" or "generate" for processing method
497
+ """
498
+ results = []
499
+
500
+ for text in texts:
501
+ if method == "generate":
502
+ result = self.generate(text, max_new_tokens, show_routing=False)
503
+ else:
504
+ result = self.process_request(text, max_new_tokens)
505
+ results.append(result)
506
+
507
+ return results
508
+
509
+ # =============================================================================
510
+ # GENERATION METHODS
511
+ # =============================================================================
512
+
513
+ def _simulate_generation(self, prompt: str, selected_encoders: List[int], max_length: int) -> str:
514
+ """Simulate generation for demo/fallback purposes"""
515
+ import random
516
+
517
+ # Determine response type based on selected encoder domains
518
+ domains = [self.encoders[i]["domain"] for i in selected_encoders if i < len(self.encoders)]
519
+
520
+ if any("code" in domain.lower() for domain in domains):
521
+ return f"Here's a solution for '{prompt[:30]}...':\n\n```python\ndef solution():\n # Implementation here\n return result\n```"
522
+ elif any("medical" in domain.lower() for domain in domains):
523
+ return f"Regarding '{prompt[:30]}...': This medical topic requires careful consideration. Please consult healthcare professionals."
524
+ elif any("science" in domain.lower() for domain in domains):
525
+ return f"From a scientific perspective on '{prompt[:30]}...': Current research indicates several key factors..."
526
+ else:
527
+ return f"Thank you for asking about '{prompt[:30]}...'. Based on expertise from {len(selected_encoders)} specialized domains, here's a comprehensive response..."
528
+
529
+ def _real_generation(self, prompt: str, selected_encoders: List[int],
530
+ max_length: int, temperature: float) -> str:
531
+ """Real generation using loaded models"""
532
+ if not selected_encoders or selected_encoders[0] >= len(self.encoders):
533
+ return "No valid encoders available for generation."
534
+
535
+ try:
536
+ # Use primary encoder for generation
537
+ primary_encoder = self.encoders[selected_encoders[0]]
538
+
539
+ if primary_encoder.get("simulated") or not primary_encoder["model"]:
540
+ return self._simulate_generation(prompt, selected_encoders, max_length)
541
+
542
+ # Tokenize input
543
+ if hasattr(self.tokenizer, 'encode'):
544
+ inputs = self.tokenizer(prompt, return_tensors="pt")
545
+ else:
546
+ # Fallback tokenization
547
+ return self._simulate_generation(prompt, selected_encoders, max_length)
548
+
549
+ # Generate with model
550
+ with torch.no_grad():
551
+ outputs = primary_encoder["model"].generate(
552
+ **inputs,
553
+ max_length=max_length,
554
+ temperature=temperature,
555
+ do_sample=True,
556
+ pad_token_id=self.tokenizer.eos_token_id if hasattr(self.tokenizer, 'eos_token_id') else 0
557
+ )
558
+
559
+ response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
560
+ # Remove original prompt from response
561
+ response = response[len(prompt):].strip()
562
+
563
+ return response if response else "Generated response was empty."
564
+
565
+ except Exception as e:
566
+ print(f"⚠️ Real generation failed: {e}")
567
+ return self._simulate_generation(prompt, selected_encoders, max_length)
568
+
569
+ # =============================================================================
570
+ # UTILITY METHODS
571
+ # =============================================================================
572
+
573
+ def _get_specialists_used(self, routing_results: List[Dict]) -> List[int]:
574
+ """Extract specialist IDs used in routing"""
575
+ specialists_used = set()
576
+
577
+ for chunk_info in routing_results:
578
+ if 'specialists' in chunk_info:
579
+ for specialist_id, _ in chunk_info['specialists']:
580
+ specialists_used.add(specialist_id)
581
+
582
+ return list(specialists_used)
583
+
584
+ def _update_stats(self, text: str, routing_results: List[Dict], processing_time: float):
585
+ """Update detailed performance statistics"""
586
+ self.stats['total_requests'] += 1
587
+ self.stats['total_tokens_processed'] += len(text.split())
588
+
589
+ # Update average response time
590
+ prev_avg = self.stats['avg_response_time']
591
+ n = self.stats['total_requests']
592
+ self.stats['avg_response_time'] = (prev_avg * (n-1) + processing_time) / n
593
+
594
+ # Update specialist usage
595
+ specialists_used = self._get_specialists_used(routing_results)
596
+ for specialist_id in specialists_used:
597
+ if specialist_id in self.stats['specialist_usage']:
598
+ self.stats['specialist_usage'][specialist_id] += 1
599
+
600
+ def _update_stats_simple(self, text: str, selected_encoders: List[int], processing_time: float):
601
+ """Update simple statistics for generate method"""
602
+ self.stats['total_requests'] += 1
603
+ self.stats['total_tokens_processed'] += len(text.split())
604
+
605
+ # Update average response time
606
+ prev_avg = self.stats['avg_response_time']
607
+ n = self.stats['total_requests']
608
+ self.stats['avg_response_time'] = (prev_avg * (n-1) + processing_time) / n
609
+
610
+ # Update encoder usage
611
+ for enc_id in selected_encoders:
612
+ if enc_id in self.stats['specialist_usage']:
613
+ self.stats['specialist_usage'][enc_id] += 1
614
+
615
+ # =============================================================================
616
+ # SCALING AND MANAGEMENT
617
+ # =============================================================================
618
+
619
+ def scale_up(self, new_tier: str):
620
+ """Scale up to a higher tier"""
621
+ if new_tier not in ["demo", "small", "medium", "large", "full"]:
622
+ raise ValueError(f"Invalid tier: {new_tier}")
623
+
624
+ print(f"🚀 Scaling from {self.tier} to {new_tier}")
625
+
626
+ # Preserve current stats
627
+ old_stats = self.stats.copy()
628
+
629
+ # Reinitialize with new tier
630
+ self.__init__(tier=new_tier, use_pretrained=self.use_pretrained)
631
+
632
+ # Restore relevant stats
633
+ self.stats['total_requests'] = old_stats['total_requests']
634
+ self.stats['total_tokens_processed'] = old_stats['total_tokens_processed']
635
+ self.stats['avg_response_time'] = old_stats['avg_response_time']
636
+
637
+ def get_system_info(self) -> Dict:
638
+ """Get comprehensive system information"""
639
+ info = {
640
+ "tier": self.tier,
641
+ "num_encoders": self.num_encoders,
642
+ "encoder_size": self.encoder_size,
643
+ "use_pretrained": self.use_pretrained,
644
+ "simulation_mode": self.stats['simulation_mode'],
645
+ "device": self.device,
646
+ "domains": list(set(enc["domain"] for enc in self.encoders)),
647
+ }
648
+
649
+ if self.swarm_config:
650
+ info.update({
651
+ "total_parameters": self.swarm_config.config["total_params"],
652
+ "memory_estimate": self.swarm_config.config["memory_estimate"],
653
+ "hardware_recommendation": self.swarm_config.config["hardware"]
654
+ })
655
+
656
+ return info
657
+
658
+ def get_stats(self) -> Dict:
659
+ """Get current performance statistics"""
660
+ return self.stats.copy()
661
+
662
+ def load_models(self, checkpoint_path: str):
663
+ """Load trained models from checkpoint"""
664
+ if not os.path.exists(checkpoint_path):
665
+ print(f"❌ Checkpoint not found: {checkpoint_path}")
666
+ return
667
+
668
+ try:
669
+ checkpoint = torch.load(checkpoint_path, map_location=self.device)
670
+
671
+ # Load aggregator
672
+ if self.aggregator and 'aggregator_state' in checkpoint:
673
+ self.aggregator.load_state_dict(checkpoint['aggregator_state'])
674
+
675
+ # Load specialists (if using custom models)
676
+ if self.tlm_manager and 'specialist_states' in checkpoint:
677
+ for specialist_id, state_dict in checkpoint['specialist_states'].items():
678
+ if specialist_id in self.tlm_manager.specialists:
679
+ self.tlm_manager.specialists[specialist_id].model.load_state_dict(state_dict)
680
+
681
+ print(f"✅ Models loaded from {checkpoint_path}")
682
+
683
+ except Exception as e:
684
+ print(f"❌ Error loading models: {e}")
685
+
686
+ def set_eval_mode(self):
687
+ """Set all models to evaluation mode"""
688
+ if self.tlm_manager:
689
+ for specialist in self.tlm_manager.specialists.values():
690
+ if hasattr(specialist, 'model'):
691
+ specialist.model.eval()
692
+
693
+ if self.aggregator and hasattr(self.aggregator, 'eval'):
694
+ self.aggregator.eval()
695
+
696
+ if self.router and hasattr(self.router, 'eval'):
697
+ self.router.eval()
698
+
699
+ # Set pretrained encoders to eval mode
700
+ for encoder in self.encoders:
701
+ if encoder.get("model") and hasattr(encoder["model"], 'eval'):
702
+ encoder["model"].eval()
703
+
704
+ def set_train_mode(self):
705
+ """Set all models to training mode"""
706
+ if self.tlm_manager:
707
+ for specialist in self.tlm_manager.specialists.values():
708
+ if hasattr(specialist, 'model'):
709
+ specialist.model.train()
710
+
711
+ if self.aggregator and hasattr(self.aggregator, 'train'):
712
+ self.aggregator.train()
713
+
714
+ if self.router and hasattr(self.router, 'train'):
715
+ self.router.train()
716
+
717
+
718
+ # =============================================================================
719
+ # FACTORY FUNCTIONS
720
+ # =============================================================================
721
+
722
+ def create_mamba_swarm(tier: str = "auto", use_pretrained: bool = True,
723
+ config_override: Optional[Dict] = None) -> UnifiedMambaSwarm:
724
+ """
725
+ Factory function to create appropriately configured swarm
726
+
727
+ Args:
728
+ tier: Scaling tier or "auto" for auto-detection
729
+ use_pretrained: Whether to use pretrained HuggingFace models
730
+ config_override: Dictionary to override default config
731
+
732
+ Returns:
733
+ Configured UnifiedMambaSwarm instance
734
+ """
735
+ if tier == "auto":
736
+ tier = auto_detect_tier()
737
+
738
+ return UnifiedMambaSwarm(
739
+ tier=tier,
740
+ use_pretrained=use_pretrained,
741
+ config_override=config_override
742
+ )
743
+
744
+
745
+ def create_production_swarm(tier: str = "medium") -> UnifiedMambaSwarm:
746
+ """Create production-ready swarm with optimal settings"""
747
+ return UnifiedMambaSwarm(
748
+ tier=tier,
749
+ use_pretrained=True,
750
+ config_override={
751
+ "batch_size": 32,
752
+ "max_sequence_length": 2048
753
+ }
754
+ )
755
+
756
+
757
+ def create_development_swarm() -> UnifiedMambaSwarm:
758
+ """Create development swarm with simulation fallback"""
759
+ return UnifiedMambaSwarm(
760
+ tier="demo",
761
+ use_pretrained=True,
762
+ config_override={
763
+ "simulation_fallback": True
764
+ }
765
+ )
766
+
767
+
768
+ # =============================================================================
769
+ # MAIN EXECUTION
770
+ # =============================================================================
771
+
772
+ if __name__ == "__main__":
773
+ print("🧪 Testing Unified Mamba Swarm...")
774
+
775
+ # Create swarm instance
776
+ swarm = create_mamba_swarm(tier="demo")
777
+
778
+ # Display system info
779
+ print("\n📊 System Information:")
780
+ info = swarm.get_system_info()
781
+ for key, value in info.items():
782
+ print(f" {key}: {value}")
783
+
784
+ # Test both processing methods
785
+ test_prompts = [
786
+ "Write a Python function to calculate fibonacci numbers",
787
+ "Explain the process of photosynthesis",
788
+ "What are the symptoms of diabetes?"
789
+ ]
790
+
791
+ print("\n🧪 Testing generate method:")
792
+ for prompt in test_prompts[:2]:
793
+ result = swarm.generate(prompt, max_length=150)
794
+ print(f"\nPrompt: {prompt}")
795
+ print(f"Response: {result['response'][:100]}...")
796
+ print(f"Processing time: {result['processing_time']:.3f}s")
797
+ print(f"Routing: {result['routing_info']['domains']}")
798
+
799
+ print("\n🧪 Testing process_request method:")
800
+ result = swarm.process_request(test_prompts[2])
801
+ print(f"Response: {result['response'][:100]}...")
802
+ print(f"Success: {result['success']}")
803
+
804
+ # Test batch processing
805
+ print("\n🧪 Testing batch processing:")
806
+ batch_results = swarm.batch_process(test_prompts, method="generate")
807
+ print(f"Processed {len(batch_results)} requests in batch")
808
+
809
+ # Display final stats
810
+ print("\n📈 Final Statistics:")
811
+ stats = swarm.get_stats()
812
+ for key, value in stats.items():
813
+ if key != 'specialist_usage':
814
+ print(f" {key}: {value}")
815
+
816
+ print("\n✅ Testing complete!")
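
A quick smoke-test sketch for the factory functions above. Assumptions: it runs from the repository root so the core.*, routing.* and utils.* imports at the top of this file resolve; if the pretrained checkpoints cannot be loaded, the swarm drops into its simulation fallback and the script still completes.

    # Hypothetical driver, not part of this commit
    import asyncio
    from system.mambaSwarm import create_mamba_swarm

    swarm = create_mamba_swarm(tier="demo", use_pretrained=True)
    print(swarm.get_system_info())

    # Synchronous path
    out = swarm.generate("Write a haiku about routing", max_length=64, show_routing=False)
    print(out["response"])

    # Async path: process_request is pushed onto a thread pool
    async def main():
        result = await swarm.process_request_async("Explain photosynthesis", max_new_tokens=64)
        print(result["success"], result["response"][:80])

    asyncio.run(main())
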
system/memory_manager.py ADDED
@@ -0,0 +1,306 @@
1
+ """
2
+ Memory Manager for Mamba Swarm
3
+ Handles memory optimization, caching, and distributed memory management
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import gc
9
+ import psutil
10
+ import threading
11
+ from typing import Dict, Any, Optional, List, Tuple
12
+ from dataclasses import dataclass
13
+ from collections import OrderedDict
14
+ import numpy as np
15
+ import logging
+ import time  # used for wall-clock timestamps in monitor_memory_usage
16
+
17
+ @dataclass
18
+ class MemoryStats:
19
+ total_memory: float
20
+ used_memory: float
21
+ free_memory: float
22
+ gpu_memory: float
23
+ gpu_free: float
24
+ cache_size: float
25
+
26
+ class LRUCache:
27
+ """Least Recently Used cache for model states and activations"""
28
+
29
+ def __init__(self, max_size: int = 1000):
30
+ self.max_size = max_size
31
+ self.cache = OrderedDict()
32
+ self.lock = threading.Lock()
33
+
34
+ def get(self, key: str) -> Optional[torch.Tensor]:
35
+ with self.lock:
36
+ if key in self.cache:
37
+ # Move to end (most recently used)
38
+ value = self.cache.pop(key)
39
+ self.cache[key] = value
40
+ return value
41
+ return None
42
+
43
+ def put(self, key: str, value: torch.Tensor):
44
+ with self.lock:
45
+ if key in self.cache:
46
+ self.cache.pop(key)
47
+ elif len(self.cache) >= self.max_size:
48
+ # Remove least recently used
49
+ oldest_key = next(iter(self.cache))
50
+ old_value = self.cache.pop(oldest_key)
51
+ del old_value
52
+
53
+ self.cache[key] = value.clone() if isinstance(value, torch.Tensor) else value
54
+
55
+ def clear(self):
56
+ with self.lock:
57
+ self.cache.clear()
58
+ gc.collect()
59
+
60
+ class GradientAccumulator:
61
+ """Manages gradient accumulation across multiple steps"""
62
+
63
+ def __init__(self, accumulation_steps: int = 8):
64
+ self.accumulation_steps = accumulation_steps
65
+ self.current_step = 0
66
+ self.accumulated_gradients = {}
67
+
68
+ def accumulate(self, model: nn.Module):
69
+ """Accumulate gradients from current backward pass"""
70
+ for name, param in model.named_parameters():
71
+ if param.grad is not None:
72
+ if name not in self.accumulated_gradients:
73
+ self.accumulated_gradients[name] = param.grad.clone()
74
+ else:
75
+ self.accumulated_gradients[name] += param.grad
76
+
77
+ self.current_step += 1
78
+
79
+ def should_update(self) -> bool:
80
+ """Check if we should perform optimizer step"""
81
+ return self.current_step % self.accumulation_steps == 0
82
+
83
+ def get_averaged_gradients(self) -> Dict[str, torch.Tensor]:
84
+ """Get accumulated gradients averaged over accumulation steps"""
85
+ averaged = {}
86
+ for name, grad in self.accumulated_gradients.items():
87
+ averaged[name] = grad / self.accumulation_steps
88
+ return averaged
89
+
90
+ def reset(self):
91
+ """Reset accumulator"""
92
+ self.accumulated_gradients.clear()
93
+ self.current_step = 0
94
+
95
+ class MemoryManager:
96
+ """Comprehensive memory management for Mamba Swarm"""
97
+
98
+ def __init__(self,
99
+ max_cache_size: int = 2000,
100
+ gradient_accumulation_steps: int = 8,
101
+ auto_cleanup: bool = True,
102
+ memory_threshold: float = 0.85):
103
+
104
+ self.logger = logging.getLogger(__name__)
105
+ self.max_cache_size = max_cache_size
106
+ self.gradient_accumulation_steps = gradient_accumulation_steps
107
+ self.auto_cleanup = auto_cleanup
108
+ self.memory_threshold = memory_threshold
109
+
110
+ # Initialize components
111
+ self.activation_cache = LRUCache(max_cache_size)
112
+ self.state_cache = LRUCache(max_cache_size // 2)
113
+ self.gradient_accumulator = GradientAccumulator(gradient_accumulation_steps)
114
+
115
+ # Memory tracking
116
+ self.peak_memory_usage = 0.0
117
+ self.memory_history = []
118
+ self.cleanup_threshold = memory_threshold
119
+
120
+ # Device management
121
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
122
+ self.setup_memory_optimization()
123
+
124
+ def setup_memory_optimization(self):
125
+ """Setup memory optimization settings"""
126
+ if torch.cuda.is_available():
127
+ # Enable memory mapping for large tensors
128
+ torch.backends.cuda.matmul.allow_tf32 = True
129
+ torch.backends.cudnn.allow_tf32 = True
130
+
131
+ # Set memory fraction
132
+ if hasattr(torch.cuda, 'set_per_process_memory_fraction'):
133
+ torch.cuda.set_per_process_memory_fraction(0.9)
134
+
135
+ def get_memory_stats(self) -> MemoryStats:
136
+ """Get current memory statistics"""
137
+ # System memory
138
+ memory = psutil.virtual_memory()
139
+ total_memory = memory.total / (1024**3) # GB
140
+ used_memory = memory.used / (1024**3)
141
+ free_memory = memory.available / (1024**3)
142
+
143
+ # GPU memory
144
+ gpu_memory = 0.0
145
+ gpu_free = 0.0
146
+ if torch.cuda.is_available():
147
+ gpu_memory = torch.cuda.memory_allocated() / (1024**3)
148
+ gpu_free = (torch.cuda.memory_reserved() - torch.cuda.memory_allocated()) / (1024**3)
149
+
150
+ # Cache size estimation
151
+ cache_size = (len(self.activation_cache.cache) + len(self.state_cache.cache)) * 0.001 # Rough estimate
152
+
153
+ stats = MemoryStats(
154
+ total_memory=total_memory,
155
+ used_memory=used_memory,
156
+ free_memory=free_memory,
157
+ gpu_memory=gpu_memory,
158
+ gpu_free=gpu_free,
159
+ cache_size=cache_size
160
+ )
161
+
162
+ # Update peak usage
163
+ current_usage = used_memory + gpu_memory
164
+ if current_usage > self.peak_memory_usage:
165
+ self.peak_memory_usage = current_usage
166
+
167
+ return stats
168
+
169
+ def check_memory_pressure(self) -> bool:
170
+ """Check if system is under memory pressure"""
171
+ stats = self.get_memory_stats()
172
+ memory_usage_ratio = stats.used_memory / stats.total_memory
173
+
174
+ if torch.cuda.is_available():
175
+ gpu_usage_ratio = stats.gpu_memory / (stats.gpu_memory + stats.gpu_free + 1e-6)
176
+ return memory_usage_ratio > self.cleanup_threshold or gpu_usage_ratio > self.cleanup_threshold
177
+
178
+ return memory_usage_ratio > self.cleanup_threshold
179
+
180
+ def cleanup_memory(self, aggressive: bool = False):
181
+ """Perform memory cleanup"""
182
+ if aggressive:
183
+ self.activation_cache.clear()
184
+ self.state_cache.clear()
185
+ self.gradient_accumulator.reset()
186
+
187
+ # Python garbage collection
188
+ gc.collect()
189
+
190
+ # GPU memory cleanup
191
+ if torch.cuda.is_available():
192
+ torch.cuda.empty_cache()
193
+ torch.cuda.synchronize()
194
+
195
+ self.logger.info(f"Memory cleanup completed. Aggressive: {aggressive}")
196
+
197
+ def cache_activation(self, key: str, activation: torch.Tensor):
198
+ """Cache activation with memory pressure check"""
199
+ if self.auto_cleanup and self.check_memory_pressure():
200
+ self.cleanup_memory()
201
+
202
+ self.activation_cache.put(key, activation)
203
+
204
+ def get_cached_activation(self, key: str) -> Optional[torch.Tensor]:
205
+ """Retrieve cached activation"""
206
+ return self.activation_cache.get(key)
207
+
208
+ def cache_hidden_state(self, key: str, state: torch.Tensor):
209
+ """Cache hidden state"""
210
+ self.state_cache.put(key, state)
211
+
212
+ def get_cached_state(self, key: str) -> Optional[torch.Tensor]:
213
+ """Retrieve cached hidden state"""
214
+ return self.state_cache.get(key)
215
+
216
+ def manage_gradient_accumulation(self, model: nn.Module) -> bool:
217
+ """Manage gradient accumulation and return if optimizer step should be taken"""
218
+ self.gradient_accumulator.accumulate(model)
219
+
220
+ if self.gradient_accumulator.should_update():
221
+ # Apply accumulated gradients
222
+ averaged_grads = self.gradient_accumulator.get_averaged_gradients()
223
+
224
+ for name, param in model.named_parameters():
225
+ if name in averaged_grads:
226
+ param.grad = averaged_grads[name]
227
+
228
+ self.gradient_accumulator.reset()
229
+ return True
230
+
231
+ return False
232
+
233
+ def optimize_model_memory(self, model: nn.Module):
234
+ """Optimize model memory usage"""
235
+ # Enable gradient checkpointing for large models
236
+ for module in model.modules():
237
+ if hasattr(module, 'gradient_checkpointing'):
238
+ module.gradient_checkpointing = True
239
+
240
+ # Convert to half precision if possible
241
+ if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 7:
242
+ model = model.half()
243
+
244
+ return model
245
+
246
+ def create_memory_efficient_dataloader(self, dataset, batch_size: int, **kwargs):
247
+ """Create memory-efficient dataloader"""
248
+ # Adjust batch size based on available memory
249
+ stats = self.get_memory_stats()
250
+
251
+ if stats.free_memory < 2.0: # Less than 2GB free
252
+ batch_size = max(1, batch_size // 2)
253
+ self.logger.warning(f"Reduced batch size to {batch_size} due to low memory")
254
+
255
+ return torch.utils.data.DataLoader(
256
+ dataset,
257
+ batch_size=batch_size,
258
+ num_workers=min(4, psutil.cpu_count()),
259
+ pin_memory=torch.cuda.is_available(),
260
+ prefetch_factor=2,
261
+ **kwargs
262
+ )
263
+
264
+ def monitor_memory_usage(self):
265
+ """Monitor and log memory usage"""
266
+ stats = self.get_memory_stats()
267
+ self.memory_history.append({
268
+ 'timestamp': time.time(),  # wall-clock time; a CUDA Event is not a usable timestamp
269
+ 'stats': stats
270
+ })
271
+
272
+ # Keep only recent history
273
+ if len(self.memory_history) > 100:
274
+ self.memory_history = self.memory_history[-50:]
275
+
276
+ self.logger.debug(f"Memory - System: {stats.used_memory:.2f}GB/{stats.total_memory:.2f}GB, "
277
+ f"GPU: {stats.gpu_memory:.2f}GB, Cache: {stats.cache_size:.2f}GB")
278
+
279
+ def get_memory_report(self) -> Dict[str, Any]:
280
+ """Generate comprehensive memory report"""
281
+ stats = self.get_memory_stats()
282
+
283
+ return {
284
+ 'current_stats': stats.__dict__,
285
+ 'peak_usage': self.peak_memory_usage,
286
+ 'cache_stats': {
287
+ 'activation_cache_size': len(self.activation_cache.cache),
288
+ 'state_cache_size': len(self.state_cache.cache),
289
+ 'max_cache_size': self.max_cache_size
290
+ },
291
+ 'gradient_accumulation': {
292
+ 'current_step': self.gradient_accumulator.current_step,
293
+ 'accumulation_steps': self.gradient_accumulation_steps,
294
+ 'accumulated_params': len(self.gradient_accumulator.accumulated_gradients)
295
+ },
296
+ 'memory_pressure': self.check_memory_pressure(),
297
+ 'device': str(self.device)
298
+ }
299
+
300
+ def __enter__(self):
301
+ """Context manager entry"""
302
+ return self
303
+
304
+ def __exit__(self, exc_type, exc_val, exc_tb):
305
+ """Context manager exit with cleanup"""
306
+ self.cleanup_memory(aggressive=True)
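
A short sketch of how the manager above is meant to sit inside a training loop. Assumptions: the model and the commented-out optimizer calls are placeholders; only the MemoryManager calls come from the class above.

    # Hypothetical training-loop excerpt, not part of this commit
    import torch.nn as nn
    from system.memory_manager import MemoryManager

    model = nn.Linear(16, 16)  # stand-in model

    with MemoryManager(max_cache_size=512, gradient_accumulation_steps=4) as mm:
        mm.monitor_memory_usage()

        # After loss.backward() on each micro-batch:
        if mm.manage_gradient_accumulation(model):
            pass  # optimizer.step(); optimizer.zero_grad()

        print("under pressure:", mm.get_memory_report()["memory_pressure"])
    # __exit__ runs cleanup_memory(aggressive=True)
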
system/weight_manager.py ADDED
@@ -0,0 +1,168 @@
1
+ # =============================================================================
2
+ # system/weight_manager.py
3
+ # =============================================================================
4
+ import torch
5
+ import torch.nn as nn
6
+ from typing import Dict, List, Optional
7
+ import os
8
+ from pathlib import Path
9
+
10
+ class WeightManager:
11
+ """Manages hierarchical weight sharing and loading/saving"""
12
+
13
+ def __init__(self, config, tlm_manager):
14
+ self.config = config
15
+ self.tlm_manager = tlm_manager
16
+
17
+ # Track shared weights
18
+ self.shared_embeddings = None
19
+ self.shared_foundation_layers = {}
20
+
21
+ def setup_hierarchical_sharing(self):
22
+ """Setup hierarchical weight sharing between specialists"""
23
+ print("Setting up hierarchical weight sharing...")
24
+
25
+ # Create shared embedding if enabled
26
+ if self.config.shared_embedding:
27
+ self.shared_embeddings = nn.Embedding(
28
+ self.config.vocab_size,
29
+ self.config.d_model
30
+ ).to(self.config.device)
31
+
32
+ # Share embedding across all specialists
33
+ for specialist in self.tlm_manager.specialists.values():
34
+ specialist.model.embedding.token_embedding = self.shared_embeddings
35
+
36
+ # Setup foundation layer sharing
37
+ self._setup_foundation_sharing()
38
+
39
+ print("Hierarchical weight sharing setup complete!")
40
+
41
+ def _setup_foundation_sharing(self):
42
+ """Setup sharing of foundation layers"""
43
+ num_shared_layers = self.config.n_layers // 2
44
+
45
+ # Group specialists by domain similarity
46
+ domain_groups = self._group_specialists_by_domain()
47
+
48
+ for group_name, specialist_ids in domain_groups.items():
49
+ if len(specialist_ids) > 1:
50
+ # Create shared foundation layers for this group
51
+ reference_specialist = self.tlm_manager.specialists[specialist_ids[0]]
52
+ shared_layers = reference_specialist.model.layers[:num_shared_layers]
53
+
54
+ # Share with other specialists in the group
55
+ for specialist_id in specialist_ids[1:]:
56
+ specialist = self.tlm_manager.specialists[specialist_id]
57
+ for i in range(num_shared_layers):
58
+ specialist.model.layers[i] = shared_layers[i]
59
+
60
+ self.shared_foundation_layers[group_name] = shared_layers
61
+
62
+ def _group_specialists_by_domain(self) -> Dict[str, List[int]]:
63
+ """Group specialists by domain for weight sharing"""
64
+ domain_groups = {
65
+ 'stem': [],
66
+ 'programming': [],
67
+ 'language': [],
68
+ 'business': [],
69
+ 'general': []
70
+ }
71
+
72
+ for specialist_id, specialist in self.tlm_manager.specialists.items():
73
+ domain_name = specialist.domain_info['name'].lower()
74
+
75
+ if any(x in domain_name for x in ['math', 'physics', 'chemistry', 'biology']):
76
+ domain_groups['stem'].append(specialist_id)
77
+ elif any(x in domain_name for x in ['python', 'javascript', 'systems']):
78
+ domain_groups['programming'].append(specialist_id)
79
+ elif any(x in domain_name for x in ['writing', 'translation']):
80
+ domain_groups['language'].append(specialist_id)
81
+ elif any(x in domain_name for x in ['business', 'legal']):
82
+ domain_groups['business'].append(specialist_id)
83
+ else:
84
+ domain_groups['general'].append(specialist_id)
85
+
86
+ return {k: v for k, v in domain_groups.items() if len(v) > 1}
87
+
88
+ def save_weights(self, save_path: str):
89
+ """Save all weights with hierarchical structure"""
90
+ save_path = Path(save_path)
91
+ save_path.mkdir(parents=True, exist_ok=True)
92
+
93
+ # Save shared embeddings
94
+ if self.shared_embeddings is not None:
95
+ torch.save(
96
+ self.shared_embeddings.state_dict(),
97
+ save_path / "shared_embeddings.pt"
98
+ )
99
+
100
+ # Save shared foundation layers
101
+ for group_name, layers in self.shared_foundation_layers.items():
102
+ group_state = {}
103
+ for i, layer in enumerate(layers):
104
+ group_state[f"layer_{i}"] = layer.state_dict()
105
+ torch.save(group_state, save_path / f"shared_foundation_{group_name}.pt")
106
+
107
+ # Save specialist-specific weights
108
+ specialists_path = save_path / "specialists"
109
+ specialists_path.mkdir(exist_ok=True)
110
+
111
+ for specialist_id, specialist in self.tlm_manager.specialists.items():
112
+ torch.save(
113
+ specialist.model.state_dict(),
114
+ specialists_path / f"specialist_{specialist_id}.pt"
115
+ )
116
+
117
+ print(f"Weights saved to {save_path}")
118
+
119
+ def load_weights(self, load_path: str):
120
+ """Load weights with hierarchical structure"""
121
+ load_path = Path(load_path)
122
+
123
+ if not load_path.exists():
124
+ raise FileNotFoundError(f"Weight path {load_path} not found")
125
+
126
+ # Load shared embeddings
127
+ embeddings_path = load_path / "shared_embeddings.pt"
128
+ if embeddings_path.exists() and self.shared_embeddings is not None:
129
+ self.shared_embeddings.load_state_dict(torch.load(embeddings_path))
130
+
131
+ # Load shared foundation layers
132
+ for group_name in self.shared_foundation_layers.keys():
133
+ foundation_path = load_path / f"shared_foundation_{group_name}.pt"
134
+ if foundation_path.exists():
135
+ group_state = torch.load(foundation_path)
136
+ for i, layer in enumerate(self.shared_foundation_layers[group_name]):
137
+ if f"layer_{i}" in group_state:
138
+ layer.load_state_dict(group_state[f"layer_{i}"])
139
+
140
+ # Load specialist weights
141
+ specialists_path = load_path / "specialists"
142
+ if specialists_path.exists():
143
+ for specialist_id, specialist in self.tlm_manager.specialists.items():
144
+ specialist_path = specialists_path / f"specialist_{specialist_id}.pt"
145
+ if specialist_path.exists():
146
+ specialist.model.load_state_dict(torch.load(specialist_path))
147
+
148
+ print(f"Weights loaded from {load_path}")
149
+
150
+ def get_memory_usage(self) -> Dict[str, int]:
151
+ """Get memory usage breakdown"""
152
+ usage = {}
153
+
154
+ # Shared embedding memory
155
+ if self.shared_embeddings is not None:
156
+ usage['shared_embeddings'] = sum(
157
+ p.numel() * p.element_size()
158
+ for p in self.shared_embeddings.parameters()
159
+ )
160
+
161
+ # Shared foundation layer memory
162
+ total_foundation = 0
163
+ for layers in self.shared_foundation_layers.values():
164
+ for layer in layers:
165
+ total_foundation += sum(
166
+ p.numel() * p.element_size()
167
+ for p in layer.parameters()
168
+ )
+
+ usage['shared_foundation_layers'] = total_foundation
+
+ return usage
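
A usage sketch for the weight manager above. Assumptions: config carries the shared_embedding, vocab_size, d_model, device and n_layers fields the class reads, and tlm_manager is the TLMManager built by the swarm's custom-specialist path; both names here are placeholders.

    # Hypothetical wiring, not part of this commit
    from system.weight_manager import WeightManager

    wm = WeightManager(config, tlm_manager)
    wm.setup_hierarchical_sharing()           # share embeddings + foundation layers across groups
    print(wm.get_memory_usage())              # byte counts per shared component

    wm.save_weights("checkpoints/swarm_v1")   # hierarchical on-disk layout
    wm.load_weights("checkpoints/swarm_v1")
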