import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Module-level globals populated by init() and reused by inference().
model = None
tokenizer = None
device = None

|
def init():
    """
    Called once at startup to load the model and tokenizer into memory.
    """
    global model, tokenizer, device

    model_name_or_path = "0xroyce/NazareAI-Senior-Marketing-Strategist"

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    # Half precision on GPU roughly halves memory use; fall back to full
    # precision on CPU, where float16 inference is poorly supported.
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        low_cpu_mem_usage=True,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    # Note: the `global` statement above already rebinds the module-level
    # names, so no extra globals() assignments are needed.


def inference(model_inputs: dict) -> dict:
    """
    Called for every request. Expects a dict with a 'prompt' key and
    returns a dict with 'generated_text' (or 'error' if no prompt is given).
    """
    prompt = model_inputs.get("prompt", "")
    if not prompt:
        return {"error": "No prompt provided."}

    # Tokenize the prompt and move the tensors to the model's device.
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generation is inference-only; disabling gradient tracking saves
    # memory and time.
    with torch.inference_mode():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,  # avoids a warning for models with no pad token
        )

    # model.generate returns the prompt tokens followed by the new tokens;
    # slice off the prompt so 'generated_text' contains only the completion.
    new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
    output_text = tokenizer.decode(new_tokens, skip_special_tokens=True)

    return {"generated_text": output_text}
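

# Minimal local smoke test (an assumption for convenience, not part of the
# serving framework's contract; the prompt below is purely illustrative).
# Running this file directly loads the model and prints one completion,
# which is handy for checking the handler before deployment.
if __name__ == "__main__":
    init()
    result = inference({"prompt": "Write a tagline for an eco-friendly sneaker brand."})
    print(result["generated_text"])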
|
|