File size: 3,558 Bytes
82af392
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python3
"""
Upload the instruction classifier model to Hugging Face Model Hub
"""

from huggingface_hub import HfApi, login
import os

def _build_readme(repo_id: str) -> str:
    """Return the model card (README.md) text that points at *repo_id*."""
    # Doubled braces below are f-string escapes for a literal ``{}``.
    return f"""# Instruction Classifier Model

This model is trained to detect instruction-like tokens in text for prompt injection defense.

## Model Details
- Architecture: XLM-RoBERTa base with classification head
- Task: Token classification (instruction vs. other)
- Training: Sliding window approach with diverse datasets
- Size: ~1GB
- Parameters: ~278M

## Usage

```python
from huggingface_hub import hf_hub_download
import torch
from transformers import AutoTokenizer

# You'll need the TransformerInstructionClassifier class from utils.py
# from utils import TransformerInstructionClassifier

# Download model file (returns path, not model object)
model_path = hf_hub_download(
    repo_id="{repo_id}", 
    filename="best_instruction_classifier.pth",
    token="your_hf_token_if_private"  # Only needed for private repos
)

# Create model instance
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransformerInstructionClassifier(
    model_name='xlm-roberta-base',
    num_labels=2,
    dropout=0.1
)

# Load weights from downloaded file
checkpoint = torch.load(model_path, map_location=device)

# Filter out loss function weights if present
model_state_dict = {{}}
for key, value in checkpoint.items():
    if not key.startswith('loss_fct'):
        model_state_dict[key] = value

model.load_state_dict(model_state_dict, strict=False)
model.to(device)
model.eval()

print("✅ Model loaded successfully!")
```

## Direct Usage with Instruction Classifier

```python
from instruction_classifier import sanitize_tool_output

# This will automatically download and use the model
result = sanitize_tool_output("Your text to check for injections")
```

## License
[Specify your license here]
"""


def upload_model(
    repo_id: str = "ddas/instruction-classifier-model",
    model_path: str = "models/best_instruction_classifier.pth",
) -> None:
    """Upload the classifier checkpoint and a README to a private Hub repo.

    Creates (or reuses) the private model repository *repo_id*, uploads the
    checkpoint as ``best_instruction_classifier.pth`` and then uploads a
    generated ``README.md``. Progress and errors are printed; upload errors
    are reported rather than raised.

    Authentication: if the ``HUGGINGFACE_TOKEN`` environment variable is set
    and non-empty, it is passed to ``HfApi``; otherwise ``HfApi`` is created
    without an explicit token (e.g. after ``huggingface-cli login``).

    Args:
        repo_id: Target repository as ``"<username>/<repo-name>"``. Change
            the default to your own namespace, or pass it explicitly.
        model_path: Local path of the checkpoint file to upload.
    """
    # Fail fast before touching the Hub: otherwise a missing checkpoint would
    # leave an empty repository behind and print unrelated login hints.
    if not os.path.isfile(model_path):
        print(f"❌ Model file not found: {model_path}")
        print("\nMake sure the trained checkpoint exists before uploading.")
        return

    # Honor the documented environment variable; an empty value counts as unset.
    token = os.environ.get("HUGGINGFACE_TOKEN") or None
    api = HfApi(token=token)

    try:
        # Create repository if it doesn't exist (private=True for private repo)
        api.create_repo(repo_id, repo_type="model", exist_ok=True, private=True)
        print(f"✅ Private repository {repo_id} created/verified")

        # Upload the model file
        api.upload_file(
            path_or_fileobj=model_path,
            path_in_repo="best_instruction_classifier.pth",
            repo_id=repo_id,
            repo_type="model",
        )
        print(f"✅ Model uploaded to {repo_id}")

        # Upload a README for the model (sent as in-memory bytes)
        api.upload_file(
            path_or_fileobj=_build_readme(repo_id).encode(),
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="model",
        )
        print("✅ README uploaded")

        print(f"\n🎉 Model successfully uploaded to: https://huggingface.co/{repo_id}")
        print("\nUpdate your instruction_classifier.py with:")
        print(f'model_path = hf_hub_download(repo_id="{repo_id}", filename="best_instruction_classifier.pth")')

    except Exception as e:
        # Script-level boundary: report the failure with the usual remedies
        # instead of surfacing a traceback.
        print(f"❌ Error uploading model: {e}")
        print("\nMake sure to:")
        print("1. Run: huggingface-cli login")
        print("2. Update repo_id with your username")

# Run the upload only when this file is executed as a script, not on import.
if __name__ == "__main__":
    upload_model()