Spaces: snyk-etso/prompt-injection-instruction-defense-challenge

Running on Zero

App Files Community

ddas committed on Aug 22

Commit

40187f3

unverified ·

1 Parent(s): e2aa9a2

UI tagged preview added

Browse files

Files changed (4) hide show

agent.py +67 -28
app.py +107 -16
instruction_classifier.py +18 -13
utils.py +3 -133

agent.py CHANGED Viewed

@@ -551,26 +551,6 @@ Body: {email.body_value}"""
 # Import the instruction classifier sanitizer
 from instruction_classifier import sanitize_tool_output_with_annotations
-def extract_tool_calls(text):
-    """Extract tool calls from LLM output (legacy function - kept for compatibility)"""
-    tool_calls = []
-    # Patterns to match tool calls
-    patterns = [
-        r'get_emails\(\)',
-        r'search_email\(keyword=[^)]*\)',           # search_email(keyword="UBS")
-        r'search_email\(\s*"[^"]+"\s*\)',       # search_email("UBS")
-        r'send_email\([^)]+\)'
-    ]
-    for pattern in patterns:
-        matches = re.findall(pattern, text)
-        tool_calls.extend(matches)
-    return tool_calls
 def extract_and_parse_tool_calls(text):
     """
     Extract tool calls from LLM output and parse them into structured format
@@ -714,6 +694,38 @@ def create_assistant_message_with_tool_calls(llm_output, parsed_tool_calls, prov
         return {"role": "assistant", "content": llm_output}
 def create_tool_result_message(tool_results, provider):
     """
     Create properly formatted tool result message based on LLM provider
@@ -843,6 +855,9 @@ def tool_agent_loop(user_query, inbox, system_prompt, model_name="gpt-4o-mini",
     # Track annotations for instruction classifier flagged content
     all_annotations = []
     # Initialize conversation with system prompt and user query
     # This will be used for LLM API calls (provider-specific format)
     llm_messages = [
@@ -968,7 +983,17 @@ def tool_agent_loop(user_query, inbox, system_prompt, model_name="gpt-4o-mini",
                 # Conditional sanitization based on defense setting
                 if defense_enabled:
                     # Sanitize tool output with annotations
-                    sanitized_output, annotations = sanitize_tool_output_with_annotations(tool_output, defense_enabled)
                     # Always add raw tool output to trace when defense is enabled
                     raw_tool_message = {
@@ -988,17 +1013,26 @@ def tool_agent_loop(user_query, inbox, system_prompt, model_name="gpt-4o-mini",
                         # Add annotations to our collection
                         all_annotations.extend(annotations)
-                        annotation_msg = f"📝 Found {len(annotations)} instruction flags in tool output"
-                        execution_log.append(annotation_msg)
-                        print(annotation_msg)
-                    # Always show sanitized result in logs when defense is enabled
                     sanitized_msg = f"🔒 Sanitized Result: {sanitized_output}"
                     execution_log.append(sanitized_msg)
                     print(sanitized_msg)
-                    # Always add sanitized tool output to trace when defense is enabled
                     sanitized_tool_message = {
                         "role": "tool",
                         "tool_call_id": tool_call_info['id'],
@@ -1144,4 +1178,9 @@ def tool_agent_loop(user_query, inbox, system_prompt, model_name="gpt-4o-mini",
     final_trace_msg = f"📊 Trace push completed (with {len(all_annotations)} annotations)"
     execution_log.append(final_trace_msg)
-    return "\n".join(execution_log), llm_output

 # Import the instruction classifier sanitizer
 from instruction_classifier import sanitize_tool_output_with_annotations
 def extract_and_parse_tool_calls(text):
     """
     Extract tool calls from LLM output and parse them into structured format
         return {"role": "assistant", "content": llm_output}
+def process_flagged_content(merged_tagged_text):
+    """
+    Process tagged text to find second 'To: [email protected]' and trim before it.
+    Args:
+        merged_tagged_text (str): Text with <instruction> tags from classifier
+    Returns:
+        str: Processed text with content before second occurrence removed
+    """
+    if not merged_tagged_text:
+        return ""
+    # Find second occurrence of "To: [email protected]"
+    target_phrase = "To: [email protected]"
+    first_pos = merged_tagged_text.find(target_phrase)
+    if first_pos != -1:
+        second_pos = merged_tagged_text.find(target_phrase, first_pos + 1)
+        if second_pos != -1:
+            # Remove everything before and including the second occurrence
+            processed_text = merged_tagged_text[second_pos + len(target_phrase):].strip()
+            print(f"🏷️ Found second occurrence at position {second_pos}, processed flagged content: {processed_text[:100]}...")
+            # Insert newline before "Time:" and "Body:" (but not if already at start of line)
+            processed_text = re.sub(r'(?<!\n)(Time:)', r'\n\1', processed_text)
+            processed_text = re.sub(r'(?<!\n)(Body:)', r'\n\1', processed_text)
+            return processed_text
+    # If no second occurrence, return entire text
+    print(f"🏷️ No second occurrence found, returning entire flagged content: {merged_tagged_text[:100]}...")
+    return merged_tagged_text
 def create_tool_result_message(tool_results, provider):
     """
     Create properly formatted tool result message based on LLM provider
     # Track annotations for instruction classifier flagged content
     all_annotations = []
+    # Track flagged content for UI display
+    all_flagged_content = []
     # Initialize conversation with system prompt and user query
     # This will be used for LLM API calls (provider-specific format)
     llm_messages = [
                 # Conditional sanitization based on defense setting
                 if defense_enabled:
                     # Sanitize tool output with annotations
+                    sanitized_output, annotations, merged_tagged_text = sanitize_tool_output_with_annotations(tool_output, defense_enabled)
+                    # Process and collect flagged content for UI display
+                    print(f"🔍 DEBUG: merged_tagged_text: {merged_tagged_text}")
+                    print(f"🔍 DEBUG: has <instruction> tags: {'<instruction>' in merged_tagged_text if merged_tagged_text else 'No text'}")
+                    if merged_tagged_text and merged_tagged_text.strip() and "<instruction>" in merged_tagged_text:
+                        processed_flagged = process_flagged_content(merged_tagged_text)
+                        print(f"🔍 DEBUG: processed_flagged result: {processed_flagged}")
+                        if processed_flagged:
+                            all_flagged_content.append(processed_flagged)
+                            print(f"🔍 DEBUG: Added to all_flagged_content. Total items: {len(all_flagged_content)}")
                     # Always add raw tool output to trace when defense is enabled
                     raw_tool_message = {
                         # Add annotations to our collection
                         all_annotations.extend(annotations)
+                    # Add some spacing before sanitized output for clarity
+                    execution_log.append("")
+                    execution_log.append("--- DEFENSE PROCESSING ---")
+                    execution_log.append("")
+                    # Show sanitized result in logs when defense is enabled
                     sanitized_msg = f"🔒 Sanitized Result: {sanitized_output}"
                     execution_log.append(sanitized_msg)
                     print(sanitized_msg)
+                    # Add spacing separator in trace for clarity
+                    separator_message = {
+                        "role": "system",
+                        "content": "--- DEFENSE SANITIZATION APPLIED ---"
+                    }
+                    trace_messages.append(separator_message)
+                    # Add sanitized tool output to trace when defense is enabled
                     sanitized_tool_message = {
                         "role": "tool",
                         "tool_call_id": tool_call_info['id'],
     final_trace_msg = f"📊 Trace push completed (with {len(all_annotations)} annotations)"
     execution_log.append(final_trace_msg)
+    # Combine all flagged content for UI display
+    combined_flagged_content = "\n\n".join(all_flagged_content) if all_flagged_content else ""
+    print(f"🔍 DEBUG: Final combined_flagged_content: '{combined_flagged_content}'")
+    print(f"🔍 DEBUG: Length: {len(combined_flagged_content)} characters")
+    return "\n".join(execution_log), llm_output, combined_flagged_content

app.py CHANGED Viewed

@@ -526,13 +526,6 @@ def is_likely_gibberish_soft(text):
     return False  # Passes soft gibberish checks
-def validate_english_with_model_loading(text):
-    """
-    Convenience function that handles FastText model loading automatically.
-    """
-    model = load_fasttext_model()  # This will download and load the model if needed
-    return validate_english_only_windowed(text, model)
 def get_fasttext_confidence_scores(text, model=None, top_k=3):
     """
     Get top language confidence scores from FastText without doing validation.
@@ -761,7 +754,7 @@ def submit_attack(from_addr, attack_subject, attack_body, model_name="gpt-4o", d
         }
         # Process the fixed user query with the tool agent loop
-        execution_log, final_output = tool_agent_loop(
             user_query=USER_INPUT,
             inbox=INBOX,
             system_prompt=SYSTEM_PROMPT,
@@ -771,13 +764,13 @@ def submit_attack(from_addr, attack_subject, attack_body, model_name="gpt-4o", d
             fasttext_confidence_scores=fasttext_confidence_scores
         )
-        # Return execution log and final output separately
-        return execution_log, final_output
     except Exception as e:
         error_msg = f"❌ Error processing attack: {str(e)}"
         print(error_msg)
-        return "", error_msg
 def reset_to_initial_state():
     """Reset the inbox to original state and clear all inputs"""
@@ -1175,6 +1168,74 @@ def create_interface():
     .results-card ul { margin: 0; padding-left: 16px; }
     .results-card li { margin: 4px 0; }
     /* Error Modal Popup Styling */
     .error-modal-overlay {
         position: fixed !important;
@@ -1512,6 +1573,14 @@ Satya
                     )
                     # Attack results summary (pretty list)
                     results_display = gr.HTML("", elem_id="attack-results")
                     with gr.Accordion("Show Execution Trace", open=False):
                         trace_display = gr.Textbox(
                             lines=14,
@@ -1705,7 +1774,9 @@ Satya
                     gr.update(),  # email1_display - no change
                     gr.update(),  # email2_display - no change
                     gr.update(),  # email3_display - no change
-                    gr.update(value=modal_html, visible=True)  # error_modal_html
                 )
             print("✅ ALL VALIDATION PASSED - proceeding with attack submission")
@@ -1717,7 +1788,7 @@ Satya
             }
             try:
-                exec_log, final_out = submit_attack(from_addr.strip(), subject, body, model, defense_enabled, user_info.strip(), confidence_scores)
             except Exception as e:
                 # Handle any setup or execution errors with detailed messages
                 error_str = str(e).lower()
@@ -1773,7 +1844,9 @@ Satya
                     gr.update(),  # email1_display - no change
                     gr.update(),  # email2_display - no change
                     gr.update(),  # email3_display - no change
-                    gr.update(value=modal_html, visible=True)  # error_modal_html
                 )
             # Build a formatted results summary extracted from exec_log
@@ -1818,16 +1891,34 @@ Satya
             for i, email in enumerate(emails_to_display):
                 updated_emails.append(format_single_email(email, i + 1))
             # Return results with hidden error modal (validation passed)
             success_timestamp = int(time.time() * 1000)
             print(f"✅ Validation successful at {success_timestamp} - hiding error modal")
             return (final_out, results_html, exec_log, updated_emails[0], updated_emails[1], updated_emails[2],
-                   gr.update(value="", visible=False))  # Hide error modal
         submit_btn.click(
             fn=submit_and_update,
             inputs=[attack_from, attack_subject, attack_body, model_selector, defense_toggle, user_info],
-            outputs=[final_output_display, results_display, trace_display, email1_display, email2_display, email3_display, error_modal_html]
         )
         # Connect dismiss trigger to properly hide the modal

     return False  # Passes soft gibberish checks
 def get_fasttext_confidence_scores(text, model=None, top_k=3):
     """
     Get top language confidence scores from FastText without doing validation.
         }
         # Process the fixed user query with the tool agent loop
+        execution_log, final_output, flagged_content = tool_agent_loop(
             user_query=USER_INPUT,
             inbox=INBOX,
             system_prompt=SYSTEM_PROMPT,
             fasttext_confidence_scores=fasttext_confidence_scores
         )
+        # Return execution log, final output, and flagged content separately
+        return execution_log, final_output, flagged_content
     except Exception as e:
         error_msg = f"❌ Error processing attack: {str(e)}"
         print(error_msg)
+        return "", error_msg, ""
 def reset_to_initial_state():
     """Reset the inbox to original state and clear all inputs"""
     .results-card ul { margin: 0; padding-left: 16px; }
     .results-card li { margin: 4px 0; }
+    /* Accordion content styling for flagged content */
+    .gr-accordion .gr-panel:has([data-testid="HTML"]) {
+        max-height: 300px !important;
+        overflow-y: auto !important;
+        padding: 16px !important;
+        background: white !important;
+        border-radius: 8px !important;
+        font-family: 'Roboto', sans-serif !important;
+        line-height: 1.6 !important;
+        color: #333333 !important;
+        word-wrap: break-word !important;
+        overflow-wrap: break-word !important;
+        scrollbar-width: thin !important;
+    }
+    /* Scrollbar styling for accordion content */
+    .gr-accordion .gr-panel:has([data-testid="HTML"])::-webkit-scrollbar {
+        width: 8px !important;
+    }
+    .gr-accordion .gr-panel:has([data-testid="HTML"])::-webkit-scrollbar-track {
+        background: rgba(0,0,0,0.1) !important;
+        border-radius: 4px !important;
+    }
+    .gr-accordion .gr-panel:has([data-testid="HTML"])::-webkit-scrollbar-thumb {
+        background: rgba(0,0,0,0.3) !important;
+        border-radius: 4px !important;
+    }
+    .gr-accordion .gr-panel:has([data-testid="HTML"])::-webkit-scrollbar-thumb:hover {
+        background: rgba(0,0,0,0.5) !important;
+    }
+    /* Instruction tag styling for light mode */
+    instruction {
+        background-color: #ffebee !important;
+        color: #c62828 !important;
+        padding: 2px 6px !important;
+        border-radius: 4px !important;
+        font-weight: 600 !important;
+        border: 1px solid #ef5350 !important;
+        box-shadow: 0 1px 2px rgba(198, 40, 40, 0.2) !important;
+        display: inline !important;
+        font-family: 'Roboto', sans-serif !important;
+        font-size: 14px !important;
+        line-height: 1.4 !important;
+        margin: 0 2px !important;
+    }
+    /* Instruction tag styling for dark mode */
+    @media (prefers-color-scheme: dark) {
+        instruction {
+            background-color: rgb(84 37 37) !important;
+            color: #ffffff !important;
+            border: 1px solid #d32f2f !important;
+            box-shadow: 0 1px 3px rgba(183, 28, 28, 0.4) !important;
+        }
+        /* Also ensure accordion content has proper dark mode styling */
+        .gr-accordion .gr-panel:has([data-testid="HTML"]) {
+            background: var(--background-fill-primary) !important;
+            color: var(--body-text-color) !important;
+        }
+    }
     /* Error Modal Popup Styling */
     .error-modal-overlay {
         position: fixed !important;
                     )
                     # Attack results summary (pretty list)
                     results_display = gr.HTML("", elem_id="attack-results")
+                    # Flagged content display (only shown when defense enabled and content found)
+                    with gr.Accordion("Show What was Flagged", open=False, visible=False) as flagged_accordion:
+                        flagged_content_display = gr.HTML(
+                            "",
+                            show_label=False
+                        )
                     with gr.Accordion("Show Execution Trace", open=False):
                         trace_display = gr.Textbox(
                             lines=14,
                     gr.update(),  # email1_display - no change
                     gr.update(),  # email2_display - no change
                     gr.update(),  # email3_display - no change
+                    gr.update(value=modal_html, visible=True),  # error_modal_html
+                    gr.update(),  # flagged_accordion - no change
+                    gr.update()  # flagged_content_display - no change
                 )
             print("✅ ALL VALIDATION PASSED - proceeding with attack submission")
             }
             try:
+                exec_log, final_out, flagged_content = submit_attack(from_addr.strip(), subject, body, model, defense_enabled, user_info.strip(), confidence_scores)
             except Exception as e:
                 # Handle any setup or execution errors with detailed messages
                 error_str = str(e).lower()
                     gr.update(),  # email1_display - no change
                     gr.update(),  # email2_display - no change
                     gr.update(),  # email3_display - no change
+                    gr.update(value=modal_html, visible=True),  # error_modal_html
+                    gr.update(),  # flagged_accordion - no change
+                    gr.update()  # flagged_content_display - no change
                 )
             # Build a formatted results summary extracted from exec_log
             for i, email in enumerate(emails_to_display):
                 updated_emails.append(format_single_email(email, i + 1))
+            # Process flagged content for display
+            flagged_display_html = ""
+            flagged_accordion_visible = False
+            flagged_accordion_open = False
+            if defense_enabled and flagged_content and flagged_content.strip():
+                # Convert newlines to HTML line breaks for proper rendering
+                flagged_content_html = flagged_content.replace('\n', '<br>')
+                # Simple HTML structure without extra containers
+                flagged_display_html = flagged_content_html
+                flagged_accordion_visible = True
+                flagged_accordion_open = True  # Open after submit when there's content
+                print(f"🏷️ Flagged content prepared for UI: {len(flagged_content)} characters")
+            else:
+                print("🏷️ No flagged content to display")
             # Return results with hidden error modal (validation passed)
             success_timestamp = int(time.time() * 1000)
             print(f"✅ Validation successful at {success_timestamp} - hiding error modal")
             return (final_out, results_html, exec_log, updated_emails[0], updated_emails[1], updated_emails[2],
+                   gr.update(value="", visible=False),  # Hide error modal
+                   gr.update(visible=flagged_accordion_visible, open=flagged_accordion_open),  # Update flagged accordion
+                   gr.update(value=flagged_display_html))  # Update flagged content
         submit_btn.click(
             fn=submit_and_update,
             inputs=[attack_from, attack_subject, attack_body, model_selector, defense_toggle, user_info],
+            outputs=[final_output_display, results_display, trace_display, email1_display, email2_display, email3_display, error_modal_html, flagged_accordion, flagged_content_display]
         )
         # Connect dismiss trigger to properly hide the modal

instruction_classifier.py CHANGED Viewed

@@ -186,7 +186,7 @@ class InstructionClassifierSanitizer:
     @spaces.GPU
-    def sanitize_with_annotations(self, tool_output: str) -> Tuple[str, List[Dict[str, any]]]:
         """
         Sanitization function that also returns annotation data for flagged content.
@@ -194,11 +194,13 @@ class InstructionClassifierSanitizer:
             tool_output: The raw tool output string
         Returns:
-            Tuple of (sanitized_output, annotations) where annotations contain
-            position information for content that was flagged by the classifier
         """
         if not tool_output or not tool_output.strip():
-            return tool_output, []
         # Move model to target device (GPU) within @spaces.GPU decorated method
         if self.device != self.target_device:
@@ -214,7 +216,7 @@ class InstructionClassifierSanitizer:
             if not is_injection:
                 print("✅ No injection detected - returning original output")
-                return tool_output, []
             print(f"🚨 Injection detected! Processing with extensions and annotations...")
@@ -233,12 +235,13 @@ class InstructionClassifierSanitizer:
             # Step 5: Remove instruction tags and their content
             sanitized_output = self._remove_instruction_tags(merged_tagged_text)
             print(f"🔒 Sanitized output: {sanitized_output}")
-            return sanitized_output, annotations
         except Exception as e:
             print(f"❌ Error in instruction classifier sanitization: {e}")
             # Return original output if sanitization fails
-            return tool_output, []
     def _extract_annotations_from_tagged_text(self, tagged_text: str, original_text: str) -> List[Dict[str, any]]:
         """
@@ -700,23 +703,25 @@ def sanitize_tool_output_with_annotations(tool_output, defense_enabled=True):
         defense_enabled: Whether defense is enabled (passed from agent)
     Returns:
-        Tuple of (sanitized_output, annotations) where annotations contain
-        position information for content that was flagged by the classifier
     """
     print(f"🔍 sanitize_tool_output_with_annotations called with: {tool_output[:100]}...")
     # If defense is disabled globally, return original output with no annotations
     if not defense_enabled:
         print("⚠️ Defense disabled - returning original output without processing")
-        return tool_output, []
     sanitizer = get_sanitizer()
     if sanitizer is None:
         print("⚠️  Instruction classifier not available, returning original output")
-        return tool_output, []
     print("✅ Sanitizer found, processing with annotations...")
-    sanitized_output, annotations = sanitizer.sanitize_with_annotations(tool_output)
     print(f"🔒 Sanitization complete, result: {sanitized_output[:100]}...")
     print(f"📝 Found {len(annotations)} annotations")
-    return sanitized_output, annotations

     @spaces.GPU
+    def sanitize_with_annotations(self, tool_output: str) -> Tuple[str, List[Dict[str, any]], str]:
         """
         Sanitization function that also returns annotation data for flagged content.
             tool_output: The raw tool output string
         Returns:
+            Tuple of (sanitized_output, annotations, merged_tagged_text) where:
+            - sanitized_output: cleaned text with instruction content removed
+            - annotations: position information for content flagged by classifier
+            - merged_tagged_text: text with <instruction> tags showing detected content
         """
         if not tool_output or not tool_output.strip():
+            return tool_output, [], tool_output
         # Move model to target device (GPU) within @spaces.GPU decorated method
         if self.device != self.target_device:
             if not is_injection:
                 print("✅ No injection detected - returning original output")
+                return tool_output, [], tool_output
             print(f"🚨 Injection detected! Processing with extensions and annotations...")
             # Step 5: Remove instruction tags and their content
             sanitized_output = self._remove_instruction_tags(merged_tagged_text)
             print(f"🔒 Sanitized output: {sanitized_output}")
+            print(f"🔍 DEBUG SANITIZER: Returning merged_tagged_text: '{merged_tagged_text}'")
+            return sanitized_output, annotations, merged_tagged_text
         except Exception as e:
             print(f"❌ Error in instruction classifier sanitization: {e}")
             # Return original output if sanitization fails
+            return tool_output, [], tool_output
     def _extract_annotations_from_tagged_text(self, tagged_text: str, original_text: str) -> List[Dict[str, any]]:
         """
         defense_enabled: Whether defense is enabled (passed from agent)
     Returns:
+        Tuple of (sanitized_output, annotations, merged_tagged_text) where:
+        - sanitized_output: cleaned text with instruction content removed
+        - annotations: position information for content flagged by classifier
+        - merged_tagged_text: text with <instruction> tags showing detected content
     """
     print(f"🔍 sanitize_tool_output_with_annotations called with: {tool_output[:100]}...")
     # If defense is disabled globally, return original output with no annotations
     if not defense_enabled:
         print("⚠️ Defense disabled - returning original output without processing")
+        return tool_output, [], tool_output
     sanitizer = get_sanitizer()
     if sanitizer is None:
         print("⚠️  Instruction classifier not available, returning original output")
+        return tool_output, [], tool_output
     print("✅ Sanitizer found, processing with annotations...")
+    sanitized_output, annotations, merged_tagged_text = sanitizer.sanitize_with_annotations(tool_output)
     print(f"🔒 Sanitization complete, result: {sanitized_output[:100]}...")
     print(f"📝 Found {len(annotations)} annotations")
+    return sanitized_output, annotations, merged_tagged_text

utils.py CHANGED Viewed

@@ -1,147 +1,19 @@
 import json
 import torch
 import torch.nn as nn
-from torch.utils.data import Dataset, DataLoader
-from transformers import AutoTokenizer, AutoModel, AutoConfig
-import numpy as np
-from tqdm import tqdm
 import re
-from typing import List, Tuple, Dict, Any
 import warnings
 import logging
 import os
-from datetime import datetime
-from sklearn.utils.class_weight import compute_class_weight
-import torch.nn.functional as F
 # Disable tokenizer parallelism to avoid forking warnings
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 warnings.filterwarnings('ignore')
-def set_random_seeds(seed=42):
-    """Set random seeds for reproducibility"""
-    import random
-    import numpy as np
-    import torch
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)  # For multi-GPU
-    # Make CuDNN deterministic (slower but reproducible)
-    torch.backends.cudnn.deterministic = True
-    torch.backends.cudnn.benchmark = False
-def setup_logging(log_dir='data/logs'):
-    """Setup logging configuration"""
-    # Create logs directory if it doesn't exist
-    os.makedirs(log_dir, exist_ok=True)
-    # Create timestamp for log file
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    log_file = os.path.join(log_dir, f'training_log_{timestamp}.log')
-    # Configure logging
-    logging.basicConfig(
-        level=logging.INFO,  # Back to INFO level
-        format='%(asctime)s - %(levelname)s - %(message)s',
-        handlers=[
-            logging.FileHandler(log_file),
-            logging.StreamHandler()  # Also print to console
-        ]
-    )
-    logger = logging.getLogger(__name__)
-    logger.info(f"Logging initialized. Log file: {log_file}")
-    return logger, log_file
-def check_gpu_availability():
-    """Check and print GPU availability information"""
-    logger = logging.getLogger(__name__)
-    logger.info("=== GPU Availability Check ===")
-    if torch.backends.mps.is_available():
-        logger.info("✓ MPS (Apple Silicon GPU) is available")
-        if torch.backends.mps.is_built():
-            logger.info("✓ MPS is built into PyTorch")
-        else:
-            logger.info("✗ MPS is not built into PyTorch")
-    else:
-        logger.info("✗ MPS (Apple Silicon GPU) is not available")
-    if torch.cuda.is_available():
-        logger.info(f"✓ CUDA is available (GPU count: {torch.cuda.device_count()})")
-    else:
-        logger.info("✗ CUDA is not available")
-    logger.info(f"PyTorch version: {torch.__version__}")
-    logger.info("=" * 50)
-def calculate_class_weights(dataset):
-    """Calculate class weights for imbalanced dataset using BERT paper approach"""
-    logger = logging.getLogger(__name__)
-    # Collect all labels from the dataset (BERT approach: only first subtokens have real labels)
-    all_labels = []
-    for window_data in dataset.processed_data:
-        # Filter out -100 labels (special tokens + subsequent subtokens of same word)
-        # This gives us true word-level class distribution
-        valid_labels = [label for label in window_data['subword_labels'] if label != -100]
-        all_labels.extend(valid_labels)
-    # Convert to numpy array
-    y = np.array(all_labels)
-    # Calculate class weights using sklearn
-    classes = np.unique(y)
-    class_weights = compute_class_weight('balanced', classes=classes, y=y)
-    # Create weight tensor
-    weight_tensor = torch.FloatTensor(class_weights)
-    logger.info(f"Word-level class distribution: {np.bincount(y)}")
-    logger.info(f"Class 0 (Non-instruction words): {np.sum(y == 0)} words ({np.sum(y == 0)/len(y)*100:.1f}%)")
-    logger.info(f"Class 1 (Instruction words): {np.sum(y == 1)} words ({np.sum(y == 1)/len(y)*100:.1f}%)")
-    logger.info(f"Calculated class weights (word-level): {class_weights}")
-    logger.info(f"  Weight for class 0 (Non-instruction): {class_weights[0]:.4f}")
-    logger.info(f"  Weight for class 1 (Instruction): {class_weights[1]:.4f}")
-    return weight_tensor
-class FocalLoss(nn.Module):
-    """Focal Loss for addressing class imbalance"""
-    def __init__(self, alpha=1, gamma=2, ignore_index=-100):
-        super(FocalLoss, self).__init__()
-        self.alpha = alpha
-        self.gamma = gamma
-        self.ignore_index = ignore_index
-    def forward(self, inputs, targets):
-        # Flatten inputs and targets
-        inputs = inputs.view(-1, inputs.size(-1))
-        targets = targets.view(-1)
-        # Create mask for non-ignored indices
-        mask = targets != self.ignore_index
-        targets = targets[mask]
-        inputs = inputs[mask]
-        if len(targets) == 0:
-            return torch.tensor(0.0, requires_grad=True, device=inputs.device)
-        # Calculate cross entropy
-        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
-        # Calculate pt
-        pt = torch.exp(-ce_loss)
-        # Calculate focal loss
-        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
-        return focal_loss.mean()
 class InstructionDataset(Dataset):
     def __init__(self, data_path: str, tokenizer, max_length: int = 512, is_training: bool = True,
                  window_size: int = 512, overlap: int = 100):
@@ -517,8 +389,6 @@ class TransformerInstructionClassifier(nn.Module):
         # Setup loss function based on type
         if loss_type == 'weighted_ce':
             self.loss_fct = nn.CrossEntropyLoss(ignore_index=-100, weight=class_weights)
-        elif loss_type == 'focal':
-            self.loss_fct = FocalLoss(alpha=1, gamma=2, ignore_index=-100)
         else:
             self.loss_fct = nn.CrossEntropyLoss(ignore_index=-100)

 import json
 import torch
 import torch.nn as nn
+from torch.utils.data import Dataset
+from transformers import AutoModel
 import re
+from typing import List, Dict, Any
 import warnings
 import logging
 import os
 # Disable tokenizer parallelism to avoid forking warnings
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 warnings.filterwarnings('ignore')
 class InstructionDataset(Dataset):
     def __init__(self, data_path: str, tokenizer, max_length: int = 512, is_training: bool = True,
                  window_size: int = 512, overlap: int = 100):
         # Setup loss function based on type
         if loss_type == 'weighted_ce':
             self.loss_fct = nn.CrossEntropyLoss(ignore_index=-100, weight=class_weights)
         else:
             self.loss_fct = nn.CrossEntropyLoss(ignore_index=-100)