---
license: apache-2.0
base_model:
- answerdotai/ModernBERT-large
datasets:
- codelion/optillm-router-dataset
---
# How to use?
This model is used in [optillm](https://github.com/codelion/optillm) to route between the various approaches based on the prompt.
To use the model with optillm, just prepend `router-` to the model name. For example, setting the model to `router-gpt-4o-mini` makes optillm route each request to an approach while using `gpt-4o-mini` as the base model.
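As a minimal sketch, assuming optillm is running locally as an OpenAI-compatible proxy (the base URL and API key below are placeholders, adjust them to your deployment), the router is selected like any other model:

```python
from openai import OpenAI

# Assumed local optillm endpoint; change base_url and api_key to match your setup
client = OpenAI(base_url="http://localhost:8000/v1", api_key="OPENAI_API_KEY")

response = client.chat.completions.create(
    model="router-gpt-4o-mini",  # "router-" prefix lets optillm pick the approach; gpt-4o-mini is the base model
    messages=[{"role": "user", "content": "Prove that the sum of two even numbers is even."}],
)
print(response.choices[0].message.content)
```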
Otherwise, refer to the code in [router-plugin](https://github.com/codelion/optillm/blob/main/optillm/plugins/router_plugin.py) to see how to use this model for classification.
This model is based on `ModernBERT-large` and outperforms the previous [router model](https://huggingface.co/codelion/optillm-bert-uncased)
that was based on `bert-large-uncased`.
### Router results on AIME 2024 pass@1
| Model | Score |
|-------|-----:|
| router-gpt-4o-mini with codelion/optillm-modernbert-large | 13.33 |
| router-gpt-4o-mini with codelion/optillm-bert-uncased | 6.67 |
| gpt-4o-mini | 3.33 |
# Usage
To use the model directly, you need our `OptILMClassifier` class, since we added extra layers on top of the base model. The additional
`effort_encoder` takes into account the number of tokens a given approach consumes. Also, note
the mapping from the returned index to the `APPROACHES` list, as shown below.
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import hf_hub_download
from safetensors.torch import load_model

# Constants
MAX_LENGTH = 1024
APPROACHES = ["none", "mcts", "bon", "moa", "rto", "z3", "self_consistency", "pvg", "rstar", "cot_reflection", "plansearch", "leap", "re2"]
BASE_MODEL = "answerdotai/ModernBERT-large"
OPTILLM_MODEL_NAME = "codelion/optillm-modernbert-large"


class OptILMClassifier(nn.Module):
    def __init__(self, base_model, num_labels):
        super().__init__()
        self.base_model = base_model
        # Encodes the scalar effort value into a 64-dimensional feature vector
        self.effort_encoder = nn.Sequential(
            nn.Linear(1, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU()
        )
        self.classifier = nn.Linear(base_model.config.hidden_size + 64, num_labels)

    def forward(self, input_ids, attention_mask, effort):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # Shape: (batch_size, hidden_size)
        effort_encoded = self.effort_encoder(effort.unsqueeze(1))  # Shape: (batch_size, 64)
        combined_input = torch.cat((pooled_output, effort_encoded), dim=1)
        logits = self.classifier(combined_input)
        return logits


def load_optillm_model():
    device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
    # Load the base model
    base_model = AutoModel.from_pretrained(BASE_MODEL)
    # Create the OptILMClassifier
    model = OptILMClassifier(base_model, num_labels=len(APPROACHES))
    model.to(device)
    # Download the safetensors file
    safetensors_path = hf_hub_download(repo_id=OPTILLM_MODEL_NAME, filename="model.safetensors")
    # Load the state dict from the safetensors file
    load_model(model, safetensors_path)
    tokenizer = AutoTokenizer.from_pretrained(OPTILLM_MODEL_NAME)
    return model, tokenizer, device


def preprocess_input(tokenizer, system_prompt, initial_query):
    combined_input = f"{system_prompt}\n\nUser: {initial_query}"
    encoding = tokenizer.encode_plus(
        combined_input,
        add_special_tokens=True,
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    return encoding['input_ids'], encoding['attention_mask']


def predict_approach(model, input_ids, attention_mask, device, effort=0.7):
    model.eval()
    with torch.no_grad():
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        effort_tensor = torch.tensor([effort], dtype=torch.float).to(device)
        logits = model(input_ids, attention_mask=attention_mask, effort=effort_tensor)
        probabilities = F.softmax(logits, dim=1)
        predicted_approach_index = torch.argmax(probabilities, dim=1).item()
        confidence = probabilities[0][predicted_approach_index].item()
    return APPROACHES[predicted_approach_index], confidence
```
You can now use the `predict_approach` function to get the predicted approach, as follows:
```python
# Load the trained model
router_model, tokenizer, device = load_optillm_model()

# Example inputs (replace with your own system prompt and user query)
system_prompt = "You are a helpful assistant."
initial_query = "How many prime numbers are there between 1 and 100?"

# Preprocess the input
input_ids, attention_mask = preprocess_input(tokenizer, system_prompt, initial_query)

# Predict the best approach
predicted_approach, _ = predict_approach(router_model, input_ids, attention_mask, device)
print(f"Router predicted approach: {predicted_approach}")
```
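The second return value is the router's confidence in the predicted approach. A minimal sketch of how it could be used, with an assumed threshold (the actual logic in the router plugin may differ):

```python
# Assumed threshold for illustration; tune it for your workload
CONFIDENCE_THRESHOLD = 0.5

predicted_approach, confidence = predict_approach(router_model, input_ids, attention_mask, device)
if confidence < CONFIDENCE_THRESHOLD:
    # Fall back to the plain base model when the router is unsure
    predicted_approach = "none"
print(f"Approach: {predicted_approach} (confidence: {confidence:.2f})")
```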
## Citation
If you use this in your work, please cite:
```bibtex
@software{optillm,
title = {Optillm: Optimizing inference proxy for LLMs},
author = {Asankhaya Sharma},
year = {2024},
publisher = {GitHub},
url = {https://github.com/codelion/optillm}
}
```