# Spaces: snyk-etso/prompt-injection-instruction-defense-challenge (Running on Zero)
# File size: 3,558 Bytes
# 82af392

#!/usr/bin/env python3
"""
Upload the instruction classifier model to Hugging Face Model Hub
"""

from huggingface_hub import HfApi, login
import os

def _build_readme(repo_id: str) -> str:
    """Return the README.md (model card) text, with *repo_id* filled into the usage example."""
    return f"""# Instruction Classifier Model

This model is trained to detect instruction-like tokens in text for prompt injection defense.

## Model Details
- Architecture: XLM-RoBERTa base with classification head
- Task: Token classification (instruction vs. other)
- Training: Sliding window approach with diverse datasets
- Size: ~1GB
- Parameters: ~278M

## Usage

```python
from huggingface_hub import hf_hub_download
import torch
from transformers import AutoTokenizer

# You'll need the TransformerInstructionClassifier class from utils.py
# from utils import TransformerInstructionClassifier

# Download model file (returns path, not model object)
model_path = hf_hub_download(
    repo_id="{repo_id}", 
    filename="best_instruction_classifier.pth",
    token="your_hf_token_if_private"  # Only needed for private repos
)

# Create model instance
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransformerInstructionClassifier(
    model_name='xlm-roberta-base',
    num_labels=2,
    dropout=0.1
)

# Load weights from downloaded file
checkpoint = torch.load(model_path, map_location=device)

# Filter out loss function weights if present
model_state_dict = {{}}
for key, value in checkpoint.items():
    if not key.startswith('loss_fct'):
        model_state_dict[key] = value

model.load_state_dict(model_state_dict, strict=False)
model.to(device)
model.eval()

print("✅ Model loaded successfully!")
```

## Direct Usage with Instruction Classifier

```python
from instruction_classifier import sanitize_tool_output

# This will automatically download and use the model
result = sanitize_tool_output("Your text to check for injections")
```

## License
[Specify your license here]
"""


def upload_model(
    repo_id: str = "ddas/instruction-classifier-model",
    model_path: str = "models/best_instruction_classifier.pth",
) -> None:
    """Upload the instruction classifier checkpoint and a README to the Hugging Face Hub.

    The target repository is created as a private model repo if it does not
    exist yet. Progress and failures are reported on stdout; errors are caught
    and printed rather than raised.

    Authentication: if the ``HUGGINGFACE_TOKEN`` environment variable is set
    (and non-empty) it is passed to ``HfApi``; otherwise ``HfApi`` is created
    with ``token=None`` and uses its own default credential lookup (for
    example the token saved by ``huggingface-cli login``).

    Args:
        repo_id: Target repository in ``"<username>/<name>"`` form. Replace
            the default with your own username and repository name.
        model_path: Local path of the checkpoint file to upload. It is always
            stored in the repo as ``best_instruction_classifier.pth``, the
            filename the generated README refers to.

    Returns:
        None.
    """
    # Fail fast, before touching the Hub: otherwise a missing checkpoint would
    # leave an empty repository behind and print unrelated "login" hints.
    if not os.path.isfile(model_path):
        print(f"❌ Error uploading model: checkpoint file not found: {model_path}")
        return

    # Honour the documented HUGGINGFACE_TOKEN variable; an empty value is
    # treated as unset so HfApi falls back to its default credentials.
    token = os.environ.get("HUGGINGFACE_TOKEN") or None
    api = HfApi(token=token)

    # Broad catch is deliberate: this is the script's top-level boundary and
    # any Hub/network failure should end in a readable message plus hints.
    try:
        # Create repository if it doesn't exist (private=True keeps it private)
        api.create_repo(repo_id, repo_type="model", exist_ok=True, private=True)
        print(f"✅ Private repository {repo_id} created/verified")

        # Upload the model file
        api.upload_file(
            path_or_fileobj=model_path,
            path_in_repo="best_instruction_classifier.pth",
            repo_id=repo_id,
            repo_type="model",
        )
        print(f"✅ Model uploaded to {repo_id}")

        # Upload a README for the model (sent as bytes, no temp file needed)
        api.upload_file(
            path_or_fileobj=_build_readme(repo_id).encode(),
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="model",
        )
        print("✅ README uploaded")

        print(f"\n🎉 Model successfully uploaded to: https://huggingface.co/{repo_id}")
        print("\nUpdate your instruction_classifier.py with:")
        print(f'model_path = hf_hub_download(repo_id="{repo_id}", filename="best_instruction_classifier.pth")')

    except Exception as e:
        print(f"❌ Error uploading model: {e}")
        print("\nMake sure to:")
        print("1. Run: huggingface-cli login")
        print("2. Update repo_id with your username")

# Script entry point: perform the upload only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    upload_model()