Spaces:

snyk-etso
/

prompt-injection-instruction-defense-challenge

Running on Zero

App Files Files Community

prompt-injection-instruction-defense-challenge / upload_model.py

ddas

model hosted externally

82af392 unverified 20 days ago

raw

history blame

3.56 kB

	#!/usr/bin/env python3
	"""
	Upload the instruction classifier model to Hugging Face Model Hub
	"""

	from huggingface_hub import HfApi, login
	import os

	def _build_readme(repo_id: str) -> str:
	    """Return the Markdown model card that is uploaded as ``README.md``.

	    Args:
	        repo_id: Hub repository id interpolated into the usage example.

	    Returns:
	        The README text. Literal braces in the embedded Python example are
	        doubled because the template is an f-string.
	    """
	    return f"""# Instruction Classifier Model

	This model is trained to detect instruction-like tokens in text for prompt injection defense.

	## Model Details
	- Architecture: XLM-RoBERTa base with classification head
	- Task: Token classification (instruction vs. other)
	- Training: Sliding window approach with diverse datasets
	- Size: ~1GB
	- Parameters: ~278M

	## Usage

	```python
	from huggingface_hub import hf_hub_download
	import torch
	from transformers import AutoTokenizer

	# You'll need the TransformerInstructionClassifier class from utils.py
	# from utils import TransformerInstructionClassifier

	# Download model file (returns path, not model object)
	model_path = hf_hub_download(
	    repo_id="{repo_id}",
	    filename="best_instruction_classifier.pth",
	    token="your_hf_token_if_private"  # Only needed for private repos
	)

	# Create model instance
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	model = TransformerInstructionClassifier(
	    model_name='xlm-roberta-base',
	    num_labels=2,
	    dropout=0.1
	)

	# Load weights from downloaded file
	checkpoint = torch.load(model_path, map_location=device)

	# Filter out loss function weights if present
	model_state_dict = {{}}
	for key, value in checkpoint.items():
	    if not key.startswith('loss_fct'):
	        model_state_dict[key] = value

	model.load_state_dict(model_state_dict, strict=False)
	model.to(device)
	model.eval()

	print("✅ Model loaded successfully!")
	```

	## Direct Usage with Instruction Classifier

	```python
	from instruction_classifier import sanitize_tool_output

	# This will automatically download and use the model
	result = sanitize_tool_output("Your text to check for injections")
	```

	## License
	[Specify your license here]
	"""


	def upload_model(
	    repo_id: str = "ddas/instruction-classifier-model",
	    model_path: str = "models/best_instruction_classifier.pth",
	    api: "HfApi | None" = None,
	) -> bool:
	    """Upload the classifier weights and a generated README to the Hub.

	    The repository is created as a private model repo if it does not exist
	    yet (``exist_ok=True`` means an existing repo is reused as-is).

	    Authentication: when no ``api`` is supplied, a token is taken from the
	    ``HUGGINGFACE_TOKEN`` (or ``HF_TOKEN``) environment variable and passed
	    to ``HfApi``. If neither is set, ``HfApi`` receives ``token=None`` and
	    uses whatever credentials ``huggingface-cli login`` stored.

	    Args:
	        repo_id: Target repository as ``"<user>/<name>"``.
	        model_path: Local path of the ``.pth`` checkpoint to upload.
	        api: Optional pre-configured client exposing ``create_repo`` and
	            ``upload_file``; a new ``HfApi`` is built when omitted.

	    Returns:
	        ``True`` when the repo was created/verified and both files were
	        uploaded, ``False`` otherwise (the reason is printed).
	    """
	    # Fail early with a precise message; otherwise a missing file shows up
	    # as a generic upload error followed by unrelated login hints.
	    if not os.path.isfile(model_path):
	        print(f"❌ Model file not found: {model_path}")
	        return False

	    if api is None:
	        # NOTE(review): HUGGINGFACE_TOKEN does not appear among the variables
	        # huggingface_hub reads on its own, so forward it explicitly.
	        token = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN")
	        api = HfApi(token=token)

	    try:
	        # Create repository if it doesn't exist (private by default).
	        api.create_repo(repo_id, repo_type="model", exist_ok=True, private=True)
	        print(f"✅ Private repository {repo_id} created/verified")

	        # Upload the model file.
	        api.upload_file(
	            path_or_fileobj=model_path,
	            path_in_repo="best_instruction_classifier.pth",
	            repo_id=repo_id,
	            repo_type="model",
	        )
	        print(f"✅ Model uploaded to {repo_id}")

	        # Upload the generated model card.
	        api.upload_file(
	            path_or_fileobj=_build_readme(repo_id).encode(),
	            path_in_repo="README.md",
	            repo_id=repo_id,
	            repo_type="model",
	        )
	        print("✅ README uploaded")
	    except Exception as e:
	        # Script-level boundary: report the failure and signal it through
	        # the return value instead of silently finishing "successfully".
	        print(f"❌ Error uploading model: {e}")
	        print("\nMake sure to:")
	        print("1. Run: huggingface-cli login (or set HUGGINGFACE_TOKEN)")
	        print("2. Update repo_id with your username")
	        return False

	    print(f"\n🎉 Model successfully uploaded to: https://huggingface.co/{repo_id}")
	    print("\nUpdate your instruction_classifier.py with:")
	    print(f'model_path = hf_hub_download(repo_id="{repo_id}", filename="best_instruction_classifier.pth")')
	    return True


	if __name__ == "__main__":
	    # Propagate failure to the shell so CI and scripts can detect it.
	    raise SystemExit(0 if upload_model() else 1)