Upload 3 files
- routing/aggregator.py +134 -0
- routing/router.py +157 -0
- routing/tlm_manager.py +244 -0
routing/aggregator.py
ADDED
@@ -0,0 +1,134 @@
# =============================================================================
# routing/aggregator.py
# =============================================================================
import torch
import torch.nn as nn
from typing import Dict, List

from core.config import MambaConfig

class AttentionAggregator(nn.Module):
    """Attention-based aggregator for combining specialist outputs."""

    def __init__(self, config: MambaConfig):
        super().__init__()
        self.config = config
        self.d_model = config.d_model
        self.num_specialists = config.num_specialists

        # Attention mechanism for combining specialist outputs
        self.specialist_attention = nn.MultiheadAttention(
            embed_dim=self.d_model,
            num_heads=8,
            dropout=0.1,
            batch_first=True
        )

        # Project scalar specialist confidence scores into model space
        self.confidence_proj = nn.Linear(1, self.d_model)

        # Output layers
        self.output_layers = nn.Sequential(
            nn.Linear(self.d_model, self.d_model * 2),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(self.d_model * 2, self.d_model),
            nn.LayerNorm(self.d_model)
        )

        # Final language modeling head
        self.lm_head = nn.Linear(self.d_model, config.vocab_size, bias=False)

    def forward(self, specialist_outputs: Dict[int, List[Dict]]) -> torch.Tensor:
        """
        Aggregate specialist outputs into a final representation.

        Args:
            specialist_outputs: Dict mapping chunk_id to a list of specialist results

        Returns:
            aggregated_logits: [1, num_chunks, vocab_size]
        """
        batch_outputs = []

        for chunk_id in sorted(specialist_outputs.keys()):
            chunk_results = specialist_outputs[chunk_id]

            if not chunk_results:
                continue

            # Collect specialist encodings and confidences, skipping failed runs
            encodings = []
            confidences = []

            for result in chunk_results:
                if result is not None:
                    encodings.append(result['encoding'])
                    confidences.append(result['confidence'])

            if not encodings:
                continue

            # Stack tensors; each encoding arrives as [1, d_model], so drop the batch dim
            specialist_encodings = torch.stack(
                [e.squeeze(0) for e in encodings]
            )  # [num_specialists, d_model]
            confidence_scores = torch.tensor(confidences, device=encodings[0].device)

            # Project confidence scores
            confidence_embeddings = self.confidence_proj(
                confidence_scores.unsqueeze(-1)
            )  # [num_specialists, d_model]

            # Add confidence information to encodings
            enhanced_encodings = specialist_encodings + confidence_embeddings

            # Self-attention lets the specialists' representations interact
            aggregated, _ = self.specialist_attention(
                enhanced_encodings.unsqueeze(0),  # [1, num_specialists, d_model]
                enhanced_encodings.unsqueeze(0),
                enhanced_encodings.unsqueeze(0)
            )

            # Pool the attended representations
            chunk_representation = aggregated.mean(dim=1)  # [1, d_model]

            # Apply output layers
            chunk_output = self.output_layers(chunk_representation)
            batch_outputs.append(chunk_output)

        if not batch_outputs:
            # Return a dummy output (on the module's device) if no valid results
            device = next(self.parameters()).device
            return torch.zeros(1, 1, self.config.vocab_size, device=device)

        # Concatenate chunk outputs
        final_representation = torch.cat(batch_outputs, dim=0)  # [num_chunks, d_model]

        # Generate logits
        logits = self.lm_head(final_representation)  # [num_chunks, vocab_size]

        return logits.unsqueeze(0)  # [1, num_chunks, vocab_size]

    def generate_response(self, specialist_outputs: Dict[int, List[Dict]],
                          max_tokens: int = 100) -> str:
        """Generate a text response from specialist outputs."""
        # Get aggregated logits
        logits = self.forward(specialist_outputs)

        # Simple greedy decoding; the logits are never re-computed inside the
        # loop, so this is a placeholder rather than true autoregressive decoding
        generated_ids = []
        current_logits = logits[0, -1, :]  # Use the last chunk's logits

        for _ in range(max_tokens):
            # Get next token
            next_token = torch.argmax(current_logits, dim=-1)
            generated_ids.append(next_token.item())

            # Break on EOS token (assuming token id 0 is EOS)
            if next_token.item() == 0:
                break

        # Convert to text (placeholder; integrate a real tokenizer to decode ids)
        response = f"Generated response with {len(generated_ids)} tokens"

        return response
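The aggregator consumes a dict of per-chunk specialist results, where each result carries an 'encoding' of shape [1, d_model] and a scalar 'confidence'. A minimal smoke-test sketch, assuming a default-constructible MambaConfig exposing d_model, num_specialists, and vocab_size (the field names come from the code above; the tensor values here are invented for illustration):

# Minimal smoke test for AttentionAggregator (hypothetical config values).
import torch
from core.config import MambaConfig
from routing.aggregator import AttentionAggregator

config = MambaConfig()  # assumes defaults provide d_model, num_specialists, vocab_size
agg = AttentionAggregator(config)

# Fake outputs from two specialists for a single chunk, shaped like the
# results produced by TLMManager._encode_chunk: 'encoding' is [1, d_model].
fake_results = {
    0: [
        {'encoding': torch.randn(1, config.d_model), 'confidence': 0.8},
        {'encoding': torch.randn(1, config.d_model), 'confidence': 0.2},
    ]
}
logits = agg(fake_results)
print(logits.shape)  # torch.Size([1, 1, vocab_size])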
routing/router.py
ADDED
@@ -0,0 +1,157 @@
# =============================================================================
# routing/router.py
# =============================================================================
import re
from collections import defaultdict
from typing import Dict, List, Optional, Tuple

import torch
import torch.nn as nn

class TopicRouter(nn.Module):
    def __init__(self, config, domain_configs: List[Dict]):
        super().__init__()
        self.config = config
        self.domain_configs = domain_configs
        self.num_specialists = len(domain_configs)

        # Build keyword mappings
        self.keyword_to_domains = defaultdict(list)
        self.domain_keywords = {}

        for domain in domain_configs:
            domain_id = domain["id"]
            keywords = domain["keywords"]
            self.domain_keywords[domain_id] = keywords

            for keyword in keywords:
                self.keyword_to_domains[keyword.lower()].append(domain_id)

        # Neural router for complex routing decisions
        self.neural_router = nn.Sequential(
            nn.Linear(config.d_model, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, self.num_specialists)
        )

        # Minimum combined score for a specialist to be activated
        self.similarity_threshold = 0.1

    def keyword_based_routing(self, text: str) -> Dict[int, float]:
        """Route based on keyword matching."""
        text_lower = text.lower()
        domain_scores = defaultdict(float)

        # Count keyword matches for each domain
        for domain_id, keywords in self.domain_keywords.items():
            for keyword in keywords:
                if keyword in text_lower:
                    # Weight by keyword frequency and length
                    count = text_lower.count(keyword)
                    weight = len(keyword) / 10.0  # Longer keywords get higher weight
                    domain_scores[domain_id] += count * weight

        # Normalize scores to sum to 1
        total_score = sum(domain_scores.values())
        if total_score > 0:
            domain_scores = {k: v / total_score for k, v in domain_scores.items()}

        return dict(domain_scores)

    def neural_routing(self, embeddings: torch.Tensor) -> torch.Tensor:
        """Neural-network-based routing."""
        # Use mean pooling of embeddings
        pooled = embeddings.mean(dim=1)  # [batch, d_model]
        scores = self.neural_router(pooled)  # [batch, num_specialists]
        return torch.softmax(scores, dim=-1)

    def route_text(self, text: str, embeddings: Optional[torch.Tensor] = None,
                   max_specialists: int = 10) -> List[Tuple[int, float]]:
        """
        Route text to appropriate specialists.

        Args:
            text: Input text to route
            embeddings: Text embeddings [1, seq_len, d_model]
            max_specialists: Maximum number of specialists to activate

        Returns:
            List of (specialist_id, confidence) tuples
        """
        # Keyword-based routing
        keyword_scores = self.keyword_based_routing(text)

        # Neural routing (if embeddings provided)
        neural_scores = {}
        if embeddings is not None:
            neural_weights = self.neural_routing(embeddings)
            neural_scores = {i: float(neural_weights[0, i])
                             for i in range(self.num_specialists)}

        # Combine scores with a fixed 70/30 weighting in favor of keywords
        final_scores = {}
        for i in range(self.num_specialists):
            keyword_score = keyword_scores.get(i, 0.0)
            neural_score = neural_scores.get(i, 0.0)
            final_scores[i] = 0.7 * keyword_score + 0.3 * neural_score

        # Sort by score and take the top specialists
        sorted_specialists = sorted(final_scores.items(),
                                    key=lambda x: x[1],
                                    reverse=True)

        # Filter by threshold and limit
        active_specialists = []
        for specialist_id, score in sorted_specialists:
            if score > self.similarity_threshold and len(active_specialists) < max_specialists:
                active_specialists.append((specialist_id, score))

        # Ensure at least one specialist is active
        if not active_specialists and sorted_specialists:
            active_specialists = [sorted_specialists[0]]

        return active_specialists

    def chunk_and_route(self, text: str, chunk_size: int = 512) -> List[Dict]:
        """
        Split text into chunks and route each chunk.

        Returns:
            List of dicts with 'text', 'specialists', 'chunk_id'
        """
        # Simple sentence-based chunking
        sentences = re.split(r'[.!?]+', text)
        chunks = []
        current_chunk = ""
        chunk_id = 0

        for sentence in sentences:
            if len(current_chunk) + len(sentence) > chunk_size and current_chunk:
                # Route the current chunk and start a new one
                specialists = self.route_text(current_chunk)
                chunks.append({
                    'text': current_chunk.strip(),
                    'specialists': specialists,
                    'chunk_id': chunk_id
                })
                current_chunk = sentence + ". "
                chunk_id += 1
            else:
                current_chunk += sentence + ". "

        # Handle the last chunk
        if current_chunk.strip():
            specialists = self.route_text(current_chunk)
            chunks.append({
                'text': current_chunk.strip(),
                'specialists': specialists,
                'chunk_id': chunk_id
            })

        return chunks
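A quick sketch of keyword-only routing (no embeddings passed, so the neural branch is skipped). The two domain configs below are invented for illustration and stand in for whatever utils.domain_configs.DomainConfigs produces; only the "id", "name", and "keywords" fields used by the router are assumed:

# Keyword-only routing example with two invented domains.
from types import SimpleNamespace
from routing.router import TopicRouter

config = SimpleNamespace(d_model=256)  # only d_model is needed to build the neural router
domains = [
    {"id": 0, "name": "Mathematics", "keywords": ["theorem", "integral", "algebra"]},
    {"id": 1, "name": "Python Programming", "keywords": ["python", "function", "list"]},
]
router = TopicRouter(config, domains)

chunks = router.chunk_and_route("Prove the theorem using algebra. Then write a python function.")
for chunk in chunks:
    print(chunk['chunk_id'], chunk['specialists'])
# Each chunk maps to [(specialist_id, confidence), ...] sorted by score.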
routing/tlm_manager.py
ADDED
@@ -0,0 +1,244 @@
# =============================================================================
# routing/tlm_manager.py
# =============================================================================
import torch
import torch.nn as nn
from typing import List, Dict, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed

from core.model import MambaModel
from core.config import MambaConfig
from utils.domain_configs import DomainConfigs

class SpecialistTLM:
    """Individual specialist Mamba TLM."""

    def __init__(self, specialist_id: int, config: MambaConfig, domain_info: Dict):
        self.specialist_id = specialist_id
        self.config = config
        self.domain_info = domain_info
        self.model = MambaModel(config)
        self.device = config.device

        # Move to device
        self.model.to(self.device)

    def encode(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Encode input and return a pooled hidden representation."""
        self.model.eval()
        with torch.no_grad():
            # Get embeddings
            x = self.model.embedding(input_ids)

            # Pass through Mamba layers
            for layer in self.model.layers:
                x = layer(x)

            # Apply final norm
            x = self.model.norm_f(x)

            # Return mean-pooled representation
            return x.mean(dim=1)  # [batch, d_model]

    def get_memory_usage(self) -> int:
        """Get model parameter memory usage in bytes."""
        return sum(p.numel() * p.element_size() for p in self.model.parameters())

class TLMManager:
    """Manages the pool of specialist Mamba TLMs (100 by default)."""

    def __init__(self, config: MambaConfig):
        self.config = config
        self.device = config.device

        # Create domain configurations
        self.domain_configs = DomainConfigs.get_domain_configs(config.num_specialists)

        # Shared components; created before the specialists so that
        # weight sharing during initialization can reference them
        self.shared_embedding = None
        if config.shared_embedding:
            self.shared_embedding = nn.Embedding(config.vocab_size, config.d_model)
            self.shared_embedding.to(self.device)

        # Initialize specialists
        self.specialists = {}
        self._initialize_specialists()

        # Thread pool for parallel processing
        self.executor = ThreadPoolExecutor(max_workers=min(32, config.num_specialists))

    def _initialize_specialists(self):
        """Initialize all specialist TLMs."""
        print(f"Initializing {self.config.num_specialists} specialist TLMs...")

        for domain_config in self.domain_configs:
            specialist_id = domain_config["id"]

            # Create specialist-specific config
            specialist_config = DomainConfigs.create_specialist_config(
                self.config, specialist_id
            )

            # Create specialist TLM
            specialist = SpecialistTLM(
                specialist_id=specialist_id,
                config=specialist_config,
                domain_info=domain_config
            )

            self.specialists[specialist_id] = specialist

            if specialist_id % 10 == 0:
                print(f"Initialized {specialist_id + 1}/{self.config.num_specialists} specialists")

        print("All specialists initialized!")

        # Apply weight sharing if enabled
        if self.config.hierarchical_sharing:
            self._apply_weight_sharing()

    def _apply_weight_sharing(self):
        """Apply hierarchical weight sharing between specialists."""
        print("Applying hierarchical weight sharing...")

        # Share embedding layers
        if self.shared_embedding is not None:
            for specialist in self.specialists.values():
                specialist.model.embedding.token_embedding = self.shared_embedding

        # Group specialists by domain similarity and share lower layers
        domain_groups = self._group_domains_by_similarity()

        for group in domain_groups:
            if len(group) > 1:
                # Use the first specialist's weights as the group's shared lower layers
                reference_specialist = self.specialists[group[0]]
                shared_layers = reference_specialist.model.layers[:self.config.n_layers // 2]

                for specialist_id in group[1:]:
                    specialist = self.specialists[specialist_id]
                    for i, layer in enumerate(shared_layers):
                        specialist.model.layers[i] = layer

    def _group_domains_by_similarity(self) -> List[List[int]]:
        """Group domains by similarity for weight sharing."""
        # Simple grouping based on domain categories
        groups = {
            'stem': [],
            'programming': [],
            'language': [],
            'business': [],
            'other': []
        }

        for domain_config in self.domain_configs:
            domain_name = domain_config["name"].lower()
            specialist_id = domain_config["id"]

            if any(x in domain_name for x in ['math', 'physics', 'chemistry', 'biology']):
                groups['stem'].append(specialist_id)
            elif any(x in domain_name for x in ['python', 'javascript', 'systems']):
                groups['programming'].append(specialist_id)
            elif any(x in domain_name for x in ['writing', 'translation']):
                groups['language'].append(specialist_id)
            elif any(x in domain_name for x in ['business', 'legal']):
                groups['business'].append(specialist_id)
            else:
                groups['other'].append(specialist_id)

        return [group for group in groups.values() if len(group) > 1]

    def encode_parallel(self, routing_results: List[Dict]) -> Dict[int, List[Dict]]:
        """
        Encode chunks in parallel using the appropriate specialists.

        Args:
            routing_results: List of routing results from the router

        Returns:
            Dict mapping chunk_id to the list of specialist outputs for that chunk
        """
        futures = []

        for chunk_info in routing_results:
            chunk_text = chunk_info['text']
            specialists = chunk_info['specialists']
            chunk_id = chunk_info['chunk_id']

            # Create an encoding task for each relevant specialist
            for specialist_id, confidence in specialists:
                if specialist_id in self.specialists:
                    future = self.executor.submit(
                        self._encode_chunk,
                        chunk_text,
                        specialist_id,
                        confidence,
                        chunk_id
                    )
                    futures.append(future)

        # Collect results, skipping failed encodings
        encoded_results = []
        for future in as_completed(futures):
            try:
                result = future.result()
                if result is not None:
                    encoded_results.append(result)
            except Exception as e:
                print(f"Error in specialist encoding: {e}")

        # Group results by chunk_id
        grouped_results = {}
        for result in encoded_results:
            chunk_id = result['chunk_id']
            grouped_results.setdefault(chunk_id, []).append(result)

        return grouped_results

    def _encode_chunk(self, text: str, specialist_id: int, confidence: float,
                      chunk_id: int) -> Optional[Dict]:
        """Encode a single chunk with a specific specialist."""
        try:
            specialist = self.specialists[specialist_id]

            # Tokenize text (placeholder: random ids; integrate a real tokenizer)
            input_ids = torch.randint(0, 1000, (1, 100)).to(self.device)

            # Encode with the specialist
            encoding = specialist.encode(input_ids)

            return {
                'chunk_id': chunk_id,
                'specialist_id': specialist_id,
                'confidence': confidence,
                'encoding': encoding,
                'domain': specialist.domain_info['name']
            }

        except Exception as e:
            print(f"Error encoding chunk {chunk_id} with specialist {specialist_id}: {e}")
            return None

    def get_active_specialists(self) -> List[int]:
        """Get the list of currently active specialist IDs."""
        return list(self.specialists.keys())

    def get_specialist_info(self, specialist_id: int) -> Optional[Dict]:
        """Get information about a specific specialist."""
        if specialist_id in self.specialists:
            specialist = self.specialists[specialist_id]
            return {
                'id': specialist_id,
                'domain': specialist.domain_info,
                'params': specialist.model.get_num_params(),
                'memory': specialist.get_memory_usage()
            }
        return None

    def get_total_parameters(self) -> int:
        """Get total parameters across all specialists."""
        total = 0
        for specialist in self.specialists.values():
            total += specialist.model.get_num_params()
        return total