from unsloth import FastLanguageModel
from transformers import AutoTokenizer
import torch
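
# Hugging Face Inference Endpoints custom-handler contract: the platform
# instantiates `EndpointHandler` once with the model repository path, then
# calls it with the parsed JSON payload of every incoming request.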
class EndpointHandler:
    def __init__(self, path=""):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.tokenizer = AutoTokenizer.from_pretrained(path, use_fast=True)
        self.model, _ = FastLanguageModel.from_pretrained(
            model_name=path,
            max_seq_length=2048,
            dtype=torch.float16,
            load_in_4bit=True,
        )
        # With load_in_4bit=True the quantized weights are already placed on
        # the GPU at load time, and recent transformers versions raise a
        # ValueError when .to() is called on a bitsandbytes-quantized model,
        # so only move the model when it is not quantized.
        if not getattr(self.model, "is_loaded_in_4bit", False):
            self.model.to(self.device)
        # Switch Unsloth into its optimized inference mode.
        FastLanguageModel.for_inference(self.model)
        self.model.eval()

    def __call__(self, data):
        prompt = data.get("inputs", "")
        if not prompt:
            return {"error": "Missing 'inputs' in request payload."}

        # Sampling defaults; any value supplied in the request payload
        # overrides the corresponding default.
        generation_params = {
            "max_new_tokens": data.get("max_new_tokens", 128),
            "temperature": data.get("temperature", 0.7),
            "top_p": data.get("top_p", 0.9),
            "top_k": data.get("top_k", 50),
            "do_sample": data.get("do_sample", True),
            "repetition_penalty": data.get("repetition_penalty", 1.1),
        }
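        # Example payload (illustrative values, not from the original source):
        # {"inputs": "Tell me a joke.", "temperature": 0.8, "max_new_tokens": 64}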

        # Tokenize the prompt and move the tensors to the model's device.
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                **generation_params,
                # Assumed fallback: reuse EOS as the pad token so generate()
                # does not warn on models that define no pad token.
                pad_token_id=self.tokenizer.eos_token_id,
            )

        # outputs[0] holds the prompt tokens followed by the completion, so
        # the decoded string echoes the prompt before the generated text.
        generated_text = self.tokenizer.decode(
            outputs[0],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )

        return {"generated_text": generated_text}
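
# Minimal local smoke test (a sketch: the model path below is a placeholder,
# not part of the original handler; Inference Endpoints invoke the handler the
# same way, with the parsed request body as `data`).
if __name__ == "__main__":
    handler = EndpointHandler(path="path/to/model")  # hypothetical local path
    print(handler({"inputs": "Hello, world!", "max_new_tokens": 32}))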