import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = None
tokenizer = None
device = None


def init():
    """
    The init function is called once at startup to load the model into memory.
    """
    global model, tokenizer, device

    # Replace this with your model repository ID
    model_name_or_path = "0xroyce/NazareAI-Senior-Marketing-Strategist"

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    # Load the model, using half precision on GPU to reduce memory usage
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        low_cpu_mem_usage=True,
    )

    # Move the model to the available device and switch to evaluation mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()


def inference(model_inputs: dict) -> dict:
    """
    This function is called for every request. The input is a dictionary with a
    'prompt' key; the output is a dictionary with a 'generated_text' key.
    """
    global model, tokenizer, device

    # Get the prompt from the input
    prompt = model_inputs.get("prompt", "")
    if not prompt:
        return {"error": "No prompt provided."}

    # Tokenize the prompt and move the tensors to the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Run generation without tracking gradients
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
        )

    # Decode the generated tokens back into text
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return {"generated_text": output_text}
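

# --- Minimal local smoke test (illustrative only) ---
# A sketch of how the two handlers fit together, assuming the deployed service's
# hosting framework calls init() once at startup and inference() once per request.
# Running this file directly simply exercises that same flow on your machine;
# the example prompt below is hypothetical.
if __name__ == "__main__":
    init()
    result = inference({"prompt": "Write a one-line tagline for a coffee brand."})
    print(result)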