import io

import numpy as np
import torch
import triton_python_backend_utils as pb_utils
from PIL import Image
from prismatic import load
|
|
class TritonPythonModel: |
|
"""Your Python model must use the same class name. Every Python model |
|
that is created must have "TritonPythonModel" as the class name. |
|
""" |
|
|
|

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        """`auto_complete_config` is called only once when loading the model,
        assuming the server was not started with
        `--disable-auto-complete-config`. Implementing this function is
        optional; leaving it unimplemented does nothing. This function can be
        used to set the `max_batch_size`, `input`, and `output` properties of
        the model using `set_max_batch_size`, `add_input`, and `add_output`.
        These properties allow Triton to load the model with a minimal model
        configuration in the absence of a configuration file. This function
        returns the `pb_utils.ModelConfig` object with these properties. You
        can use the `as_dict` function to gain read-only access to the
        `pb_utils.ModelConfig` object. The `pb_utils.ModelConfig` object
        returned from here will be used as the final configuration for the
        model.

        Note: The Python interpreter used to invoke this function will be
        destroyed upon returning from this function, and as a result none of
        the objects created here will be available in the `initialize`,
        `execute`, or `finalize` functions.

        Parameters
        ----------
        auto_complete_model_config : pb_utils.ModelConfig
            An object containing the existing model configuration. You can
            build upon the configuration given by this object when setting
            the properties for this model.

        Returns
        -------
        pb_utils.ModelConfig
            An object containing the auto-completed model configuration
        """
        inputs = [{
            'name': 'PROMPT',
            'data_type': 'TYPE_STRING',
            'dims': [-1]
        }, {
            'name': 'IMAGES',
            'data_type': 'TYPE_STRING',
            'dims': [-1]
        }]

        outputs = [{
            'name': 'RESULTS',
            'data_type': 'TYPE_STRING',
            'dims': [-1]
        }]

        # Collect the names of any inputs/outputs already present in the
        # given configuration so they are not registered twice below.
        config = auto_complete_model_config.as_dict()
        input_names = []
        output_names = []
        for input in config['input']:
            input_names.append(input['name'])
        for output in config['output']:
            output_names.append(output['name'])

        # Register whichever of the expected inputs/outputs are missing.
        for input in inputs:
            if input['name'] not in input_names:
                auto_complete_model_config.add_input(input)
        for output in outputs:
            if output['name'] not in output_names:
                auto_complete_model_config.add_output(output)

        auto_complete_model_config.set_dynamic_batching()

        return auto_complete_model_config
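
    # For reference, serving this model without a config.pbtxt and with
    # auto-complete enabled yields a configuration roughly equivalent to the
    # sketch below. The model name "spacellama" is an illustrative assumption
    # (Triton derives it from the repository directory; it is not set here):
    #
    #   name: "spacellama"
    #   backend: "python"
    #   input [
    #     { name: "PROMPT", data_type: TYPE_STRING, dims: [ -1 ] },
    #     { name: "IMAGES", data_type: TYPE_STRING, dims: [ -1 ] }
    #   ]
    #   output [
    #     { name: "RESULTS", data_type: TYPE_STRING, dims: [ -1 ] }
    #   ]
    #   dynamic_batching { }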

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
            Both keys and values are strings. The dictionary keys and values
            are:
            * model_config: A JSON string containing the model configuration
            * model_instance_kind: A string containing model instance kind
            * model_instance_device_id: A string containing model instance
              device ID
            * model_repository: Model repository path
            * model_version: Model version
            * model_name: Model name
        """
        # Place the model on the device Triton assigned to this instance,
        # falling back to CPU when no GPU is available.
        if args["model_instance_kind"] == "GPU" and torch.cuda.is_available():
            device = torch.device(f"cuda:{args['model_instance_device_id']}")
        else:
            device = torch.device("cpu")

        self.model = load("remyxai/SpaceLlama3.1")
        self.model.to(device, dtype=torch.bfloat16)
        print('Initialized...')

    def run_inference(self, prompt, image_bytes):
        # Decode the raw image bytes sent by the client into an RGB image.
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

        # Wrap the user prompt in the model's chat template.
        prompt_builder = self.model.get_prompt_builder()
        prompt_builder.add_turn(role="human", message=prompt)
        prompt_text = prompt_builder.get_prompt()

        output_string = self.model.generate(
            image,
            prompt_text,
            do_sample=True,
            temperature=0.1,
            max_new_tokens=512,
            min_length=1,
        )
        # Drop anything after the end-of-sequence marker, if present.
        output_string = output_string.split("</s>")[0]
        output_data = np.array([output_string.encode('utf-8')], dtype=object)
        return output_data

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
            A list of pb_utils.InferenceRequest

        Returns
        -------
        list
            A list of pb_utils.InferenceResponse. The length of this list
            must be the same as `requests`
        """
        responses = []

        for request in requests:
            # PROMPT carries UTF-8 text, so decode it into a Python string.
            prompt = [
                t.decode("UTF-8")
                for t in pb_utils.get_input_tensor_by_name(request, "PROMPT")
                .as_numpy()
                .tolist()
            ][0]
            # IMAGES carries raw image bytes (Triton's TYPE_STRING elements
            # are arbitrary byte strings), so pass them along undecoded.
            image = pb_utils.get_input_tensor_by_name(
                request, "IMAGES").as_numpy().tolist()[0]
            results = self.run_inference(prompt, image)

            inference_response = pb_utils.InferenceResponse(output_tensors=[
                pb_utils.Tensor(
                    "RESULTS",
                    results,
                )
            ])
            responses.append(inference_response)

        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')
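
# A minimal client sketch for exercising this model, kept here as a comment
# for reference only. It assumes the model is deployed under the name
# "spacellama" and the server is reachable at localhost:8000; the model name,
# URL, image file, and question are all illustrative assumptions:
#
#   import numpy as np
#   import tritonclient.http as httpclient
#
#   client = httpclient.InferenceServerClient(url="localhost:8000")
#
#   with open("warehouse.jpg", "rb") as f:
#       image_bytes = f.read()
#
#   # TYPE_STRING tensors map to the "BYTES" client datatype; the prompt is
#   # UTF-8 text while IMAGES carries the raw, unencoded image bytes.
#   prompt = np.array([b"How far is the pallet from the forklift?"],
#                     dtype=object)
#   images = np.array([image_bytes], dtype=object)
#
#   inputs = [
#       httpclient.InferInput("PROMPT", prompt.shape, "BYTES"),
#       httpclient.InferInput("IMAGES", images.shape, "BYTES"),
#   ]
#   inputs[0].set_data_from_numpy(prompt)
#   inputs[1].set_data_from_numpy(images)
#
#   result = client.infer(model_name="spacellama", inputs=inputs)
#   print(result.as_numpy("RESULTS")[0].decode("utf-8"))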