from unsloth import FastLanguageModel
import torch


class EndpointHandler:
    """Custom handler for Hugging Face Inference Endpoints that serves an
    Unsloth 4-bit model for text generation."""

    def __init__(self, path=""):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Unsloth returns both the model and its tokenizer, so a separate
        # AutoTokenizer.from_pretrained call is unnecessary.
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=path,
            max_seq_length=2048,
            dtype=torch.float16,
            load_in_4bit=True,
        )
        # Note: 4-bit (bitsandbytes) models cannot be moved with .to(); it
        # raises a ValueError. Unsloth already places the weights on the GPU
        # when one is available, so no explicit device move is needed here.
        FastLanguageModel.for_inference(self.model)  # enable Unsloth's fast inference path
        self.model.eval()

    def __call__(self, data):
        prompt = data.get("inputs", "")
        if not prompt:
            return {"error": "Missing 'inputs' in request payload."}

        # Accept generation options either at the top level or nested under
        # "parameters", the usual Inference Endpoints payload shape:
        # {"inputs": "...", "parameters": {...}}.
        opts = {**data, **(data.get("parameters") or {})}
        generation_params = {
            "max_new_tokens": opts.get("max_new_tokens", 128),
            "temperature": opts.get("temperature", 0.7),
            "top_p": opts.get("top_p", 0.9),
            "top_k": opts.get("top_k", 50),
            "do_sample": opts.get("do_sample", True),
            "repetition_penalty": opts.get("repetition_penalty", 1.1),
            # Silence the "pad_token_id not set" warning on models without one.
            "pad_token_id": (
                self.tokenizer.pad_token_id
                if self.tokenizer.pad_token_id is not None
                else self.tokenizer.eos_token_id
            ),
        }

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model.generate(**inputs, **generation_params)

        # Decodes the full output sequence, so the returned text includes the prompt.
        generated_text = self.tokenizer.decode(
            outputs[0],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        return {"generated_text": generated_text}
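
# --- Minimal local smoke test (a sketch, not part of the Endpoints contract) ---
# Inference Endpoints instantiates EndpointHandler itself and POSTs JSON payloads
# of the form {"inputs": ..., "parameters": {...}}; the block below only mimics
# that locally and requires a CUDA GPU because of load_in_4bit. The model path
# "unsloth/llama-3-8b-bnb-4bit" is a placeholder assumption; substitute your own
# repo ID or local checkpoint directory.
if __name__ == "__main__":
    handler = EndpointHandler("unsloth/llama-3-8b-bnb-4bit")
    payload = {
        "inputs": "Explain what a custom inference handler does.",
        "parameters": {"max_new_tokens": 64, "temperature": 0.5},
    }
    print(handler(payload))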