ddas committed
Commit e2aa9a2 · unverified · 1 Parent(s): 040a4cc

threshold reduced, more aggressive tagger

Files changed (3):
  1. agent.py +1 -1
  2. instruction_classifier.py +177 -87
  3. utils.py +20 -3
agent.py CHANGED

```diff
@@ -549,7 +549,7 @@ Body: {email.body_value}"""
 
 
 # Import the instruction classifier sanitizer
-from instruction_classifier import sanitize_tool_output, sanitize_tool_output_with_annotations
+from instruction_classifier import sanitize_tool_output_with_annotations
 
 
 def extract_tool_calls(text):
```
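With the plain `sanitize_tool_output` path deleted in this commit (see instruction_classifier.py below), `sanitize_tool_output_with_annotations` is the only sanitizer entry point agent.py still imports. A minimal call-site sketch, assuming variable names not shown in this diff:

```python
# Hypothetical usage sketch - not the agent's actual call site.
# The function returns the sanitized text plus a list of annotation dicts;
# callers that only need the text can ignore the second element.
sanitized_output, annotations = sanitize_tool_output_with_annotations(
    raw_tool_output,       # hypothetical name for the raw tool output string
    defense_enabled=True,  # toggle passed through from the agent
)
```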
instruction_classifier.py CHANGED

```diff
@@ -51,6 +51,7 @@ class InstructionClassifierSanitizer:
         model_filename: str = "best_instruction_classifier.pth",
         model_name: str = "xlm-roberta-base",
         threshold: float = 0.01,
+        token_threshold: float = 0.4,
         max_length: int = 512,
         overlap: int = 256,
         use_local_model: bool = False  # Set to False to use HF Hub
@@ -63,13 +64,15 @@ class InstructionClassifierSanitizer:
             model_repo_id: Hugging Face model repository ID (if use_local_model=False)
             model_filename: Filename of the model in the HF repository
             model_name: Base transformer model name
-            threshold: Threshold for instruction detection (proportion of instruction tokens)
+            threshold: Document-level threshold - proportion of tokens that must be INSTRUCTION to classify document as injection
+            token_threshold: Token-level threshold - probability threshold for classifying individual tokens as INSTRUCTION (0.0-1.0, lower = more aggressive)
             max_length: Maximum sequence length for sliding windows
             overlap: Overlap between sliding windows
             use_local_model: Whether to use local model file or download from HF Hub
         """
         self.model_name = model_name
         self.threshold = threshold
+        self.token_threshold = token_threshold
         self.max_length = max_length
         self.overlap = overlap
         self.use_local_model = use_local_model
@@ -181,54 +184,6 @@ class InstructionClassifierSanitizer:
         self.model.to(self.device)  # Keep on CPU during initialization
         self.model.eval()
 
-    @spaces.GPU
-    def sanitize_tool_output(self, tool_output: str) -> str:
-        """
-        Main sanitization function that processes tool output and removes instruction content
-
-        Args:
-            tool_output: The raw tool output string
-
-        Returns:
-            Sanitized tool output with instruction content removed
-        """
-        if not tool_output or not tool_output.strip():
-            return tool_output
-
-        # Move model to target device (GPU) within @spaces.GPU decorated method
-        if self.device != self.target_device:
-            print(f"🚀 Moving model from {self.device} to {self.target_device} within @spaces.GPU context")
-            self.model.to(self.target_device)
-            self.device = self.target_device
-
-        try:
-            # Step 1: Detect if the tool output contains instructions
-            is_injection, confidence_score, tagged_text = self._detect_injection(tool_output)
-
-            print(f"🔍 Instruction detection: injection={is_injection}, confidence={confidence_score:.3f}")
-
-            if not is_injection:
-                print("✅ No injection detected - returning original output")
-                return tool_output
-
-            print(f"🚨 Injection detected! Sanitizing output...")
-            print(f"   Original: {tool_output}")
-            print(f"   Tagged: {tagged_text}")
-
-            # Step 2: Merge close instruction tags
-            merged_tagged_text = self._merge_close_instruction_tags(tagged_text, min_words_between=4)
-            print(f"   After merging: {merged_tagged_text}")
-
-            # Step 3: Remove instruction tags and their content
-            sanitized_output = self._remove_instruction_tags(merged_tagged_text)
-            print(f"   Sanitized: {sanitized_output}")
-
-            return sanitized_output
-
-        except Exception as e:
-            print(f"❌ Error in instruction classifier sanitization: {e}")
-            # Return original output if sanitization fails
-            return tool_output
 
     @spaces.GPU
     def sanitize_with_annotations(self, tool_output: str) -> Tuple[str, List[Dict[str, any]]]:
@@ -261,17 +216,23 @@
             print("✅ No injection detected - returning original output")
             return tool_output, []
 
-            print(f"🚨 Injection detected! Extracting annotations...")
+            print(f"🚨 Injection detected! Processing with extensions and annotations...")
 
-            # Step 2: Extract annotation positions from tagged text
+            # Step 2: Extract annotation positions from original tagged text
             annotations = self._extract_annotations_from_tagged_text(tagged_text, tool_output)
+            print(f"📝 Original tagged text: {tagged_text}")
 
-            # Step 3: Merge close instruction tags
-            merged_tagged_text = self._merge_close_instruction_tags(tagged_text, min_words_between=4)
+            # Step 3: Extend instruction tags by one token on each side
+            extended_tagged_text = self._extend_instruction_tags(tagged_text)
+            print(f"🔄 Extended tagged text: {extended_tagged_text}")
 
-            # Step 4: Remove instruction tags and their content
-            sanitized_output = self._remove_instruction_tags(merged_tagged_text)
+            # Step 4: Merge close instruction tags
+            merged_tagged_text = self._merge_close_instruction_tags(extended_tagged_text, min_words_between=4)
+            print(f"🔗 Merged tagged text: {merged_tagged_text}")
 
+            # Step 5: Remove instruction tags and their content
+            sanitized_output = self._remove_instruction_tags(merged_tagged_text)
+            print(f"🔒 Sanitized output: {sanitized_output}")
             return sanitized_output, annotations
 
         except Exception as e:
@@ -367,11 +328,12 @@
         from utils import predict_instructions
 
         try:
-            # Use the predict_instructions function directly
-            tokens, predictions = predict_instructions(self.model, self.tokenizer, text, self.device)
+            # Use the predict_instructions function directly with token-level threshold
+            tokens, predictions = predict_instructions(self.model, self.tokenizer, text, self.device, self.token_threshold)
             return predictions, tokens
         except Exception as e:
-            print(f"Error in predict_instructions: {e}")
+            print(f"⚠️ FALLBACK TRIGGERED: Error in predict_instructions: {e}")
+            print(f"   Using _simple_predict as fallback (still uses threshold={self.token_threshold})")
             # Fallback to simple tokenization if the complex method fails
             return self._simple_predict(text)
 
@@ -402,7 +364,9 @@
         self.model.eval()
         with torch.no_grad():
             outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
-            predictions = torch.argmax(outputs['logits'], dim=-1)
+            # Use threshold approach (same as main prediction) instead of argmax
+            probs = torch.softmax(outputs['logits'], dim=-1)
+            predictions = (probs[:, :, 1] > self.token_threshold).long()
 
         # Convert back to word-level predictions
         word_ids = encoded.word_ids()
@@ -494,6 +458,160 @@
 
         return result
 
+    def _extend_instruction_tags(self, tagged_text: str) -> str:
+        """
+        Extend each <instruction>...</instruction> block by one word token on each side,
+        but only if the adjacent token is not already instruction-tagged.
+
+        This prevents overlaps between instruction blocks while extending them
+        to capture more context around detected instruction content.
+
+        Args:
+            tagged_text: Text with <instruction>...</instruction> tags
+
+        Returns:
+            Text with extended instruction tags
+        """
+        if not tagged_text.strip():
+            return tagged_text
+
+        # Find all instruction regions first to avoid overlaps
+        instruction_regions = []
+        pattern = r'<instruction>(.*?)</instruction>'
+
+        for match in re.finditer(pattern, tagged_text, re.DOTALL):
+            instruction_regions.append({
+                'start': match.start(),
+                'end': match.end(),
+                'content': match.group(1)
+            })
+
+        if not instruction_regions:
+            return tagged_text
+
+        # Split into words while preserving positions
+        words = tagged_text.split()
+
+        # Build word-to-character position mapping
+        word_positions = []
+        char_pos = 0
+        for word in words:
+            start_pos = tagged_text.find(word, char_pos)
+            end_pos = start_pos + len(word)
+            word_positions.append({
+                'word': word,
+                'start': start_pos,
+                'end': end_pos
+            })
+            char_pos = end_pos
+
+        # Find which words are currently inside instruction tags
+        instruction_word_indices = set()
+
+        for region in instruction_regions:
+            for i, word_info in enumerate(word_positions):
+                # Check if word overlaps with instruction region
+                if (word_info['start'] < region['end'] and word_info['end'] > region['start']):
+                    instruction_word_indices.add(i)
+
+        # Find instruction blocks by consecutive instruction words
+        instruction_blocks = []
+        current_block = None
+
+        for i in range(len(words)):
+            if i in instruction_word_indices:
+                if current_block is None:
+                    current_block = {'start': i, 'end': i}
+                else:
+                    current_block['end'] = i
+            else:
+                if current_block is not None:
+                    instruction_blocks.append(current_block)
+                    current_block = None
+
+        # Don't forget the last block
+        if current_block is not None:
+            instruction_blocks.append(current_block)
+
+        # Plan extensions with proper overlap prevention
+        extensions = []
+        planned_tagged_words = set(instruction_word_indices)  # Start with currently tagged words
+
+        for block in instruction_blocks:
+            start_idx = block['start']
+            end_idx = block['end']
+
+            extend_left = False
+            extend_right = False
+
+            # Try extend left (if not at beginning and previous token not planned to be tagged)
+            if start_idx > 0 and (start_idx - 1) not in planned_tagged_words:
+                extend_left = True
+                planned_tagged_words.add(start_idx - 1)  # Reserve this word
+
+            # Try extend right (if not at end and next token not planned to be tagged)
+            if end_idx < len(words) - 1 and (end_idx + 1) not in planned_tagged_words:
+                extend_right = True
+                planned_tagged_words.add(end_idx + 1)  # Reserve this word
+
+            extensions.append({
+                'original_start': start_idx,
+                'original_end': end_idx,
+                'new_start': start_idx - (1 if extend_left else 0),
+                'new_end': end_idx + (1 if extend_right else 0),
+                'extend_left': extend_left,
+                'extend_right': extend_right
+            })
+
+        # Create a mapping of which extension block each word belongs to
+        word_to_block = {}
+        for block_idx, ext in enumerate(extensions):
+            for i in range(ext['new_start'], ext['new_end'] + 1):
+                word_to_block[i] = block_idx
+
+        # Reconstruct the text with separate instruction blocks
+        result_parts = []
+        current_block = None
+
+        for i, word in enumerate(words):
+            # Clean the word of existing tags
+            clean_word = word.replace('<instruction>', '').replace('</instruction>', '')
+
+            # Skip empty words (from empty instruction tags)
+            if not clean_word.strip():
+                continue
+
+            word_block = word_to_block.get(i, None)
+
+            if word_block is not None and current_block != word_block:
+                # Close previous block if needed
+                if current_block is not None:
+                    result_parts[-1] += '</instruction>'
+
+                # Start new instruction block
+                result_parts.append(f'<instruction>{clean_word}')
+                current_block = word_block
+
+            elif word_block is not None and current_block == word_block:
+                # Continue current instruction block
+                result_parts.append(clean_word)
+
+            elif word_block is None and current_block is not None:
+                # End instruction block and add normal word
+                result_parts[-1] += '</instruction>'
+                result_parts.append(clean_word)
+                current_block = None
+
+            else:
+                # Normal word (not in any instruction block)
+                result_parts.append(clean_word)
+
+        # Close instruction if we ended inside one
+        if current_block is not None:
+            result_parts[-1] += '</instruction>'
+
+        return ' '.join(result_parts)
+
     def _merge_close_instruction_tags(self, text, min_words_between=3):
         """
         Merge <instruction>...</instruction> segments that are separated by less than min_words_between words
@@ -572,34 +690,6 @@ def get_sanitizer():
         return None
     return _sanitizer_instance
 
-def sanitize_tool_output(tool_output, defense_enabled=True):
-    """
-    Main sanitization function that uses the instruction classifier to detect and remove
-    prompt injection attempts from tool outputs.
-
-    Args:
-        tool_output: The raw tool output string
-        defense_enabled: Whether defense is enabled (passed from agent)
-
-    Returns:
-        Sanitized tool output with instruction content removed
-    """
-    print(f"🔍 sanitize_tool_output called with: {tool_output[:100]}...")
-
-    # If defense is disabled globally, return original output
-    if not defense_enabled:
-        print("⚠️ Defense disabled - returning original output without processing")
-        return tool_output
-
-    sanitizer = get_sanitizer()
-    if sanitizer is None:
-        print("⚠️ Instruction classifier not available, returning original output")
-        return tool_output
-
-    print("✅ Sanitizer found, processing...")
-    result = sanitizer.sanitize_tool_output(tool_output)
-    print(f"🔒 Sanitization complete, result: {result[:100]}...")
-    return result
 
 def sanitize_tool_output_with_annotations(tool_output, defense_enabled=True):
     """
```
utils.py CHANGED

```diff
@@ -581,8 +581,20 @@ def collate_fn(batch):
         'window_ends': [item['window_end'] for item in batch]
     }
 
-def predict_instructions(model, tokenizer, text: str, device=None):
-    """Predict instructions in a given text"""
+def predict_instructions(model, tokenizer, text: str, device=None, threshold=0.4):
+    """Predict instructions in a given text
+
+    Args:
+        model: The trained instruction classifier model
+        tokenizer: The tokenizer for the model
+        text: Input text to analyze
+        device: Device to run inference on
+        threshold: Probability threshold for classifying tokens as INSTRUCTION.
+                   Lower values = more aggressive detection (default: 0.4)
+
+    Returns:
+        tuple: (tokens, predictions) where predictions are 0=OTHER, 1=INSTRUCTION
+    """
     # Auto-detect device if not provided
     if device is None:
         if torch.backends.mps.is_available():
@@ -613,7 +625,12 @@ def predict_instructions(model, tokenizer, text: str, device=None):
 
     with torch.no_grad():
         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
-        predictions = torch.argmax(outputs['logits'], dim=-1)
+        # Convert logits to probabilities
+        probs = torch.softmax(outputs['logits'], dim=-1)
+        # Use threshold on probability of class 1 (INSTRUCTION) instead of argmax
+        # This makes the classifier more aggressive - tokens are classified as INSTRUCTION
+        # if their probability of being INSTRUCTION is above the threshold
+        predictions = (probs[:, :, 1] > threshold).long()
 
     # Align predictions with original tokens
     word_ids = encoded.word_ids()
```
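The "more aggressive tagger" in the commit message comes down to this decision-rule change: over two classes, `argmax` is equivalent to a 0.5 probability cutoff, while the new rule flags any token whose INSTRUCTION probability exceeds 0.4. A self-contained sketch with made-up logits:

```python
import torch

# Illustrative logits for 3 tokens over [OTHER, INSTRUCTION];
# values are made up to include one borderline token.
logits = torch.tensor([[[2.0, 0.0],    # P(INSTRUCTION) ~ 0.12
                        [0.3, 0.0],    # P(INSTRUCTION) ~ 0.43  <- borderline
                        [0.0, 1.0]]])  # P(INSTRUCTION) ~ 0.73

probs = torch.softmax(logits, dim=-1)

old_preds = torch.argmax(logits, dim=-1)    # boundary at 0.5 (pre-commit)
new_preds = (probs[:, :, 1] > 0.4).long()   # boundary at 0.4 (this commit)

print(old_preds)  # tensor([[0, 0, 1]]) - borderline token stays OTHER
print(new_preds)  # tensor([[0, 1, 1]]) - borderline token becomes INSTRUCTION
```

Lowering `token_threshold` trades precision for recall at the token level; per the constructor docstring, the document-level `threshold=0.01` then only needs about 1% of tokens flagged to treat the whole output as an injection.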