import io
import torch
import numpy as np
import triton_python_backend_utils as pb_utils
from prismatic import load
from PIL import Image
class TritonPythonModel:
"""Your Python model must use the same class name. Every Python model
that is created must have "TritonPythonModel" as the class name.
"""
@staticmethod
def auto_complete_config(auto_complete_model_config):
"""`auto_complete_config` is called only once when loading the model
        assuming the server was not started with
        `--disable-auto-complete-config`. Implementing this function is
        optional; if it is not implemented, the existing model configuration
        is left unchanged.

        This function can be used to set the `max_batch_size`, `input`, and
        `output` properties of the model using `set_max_batch_size`,
        `add_input`, and `add_output`. These properties allow Triton to load
        the model with a minimal model configuration in the absence of a
        configuration file. The `pb_utils.ModelConfig` object returned from
        this function is used as the final configuration for the model; use
        its `as_dict` function to gain read-only access to the configuration.

        Note: The Python interpreter used to invoke this function is
        destroyed upon returning from it, so none of the objects created here
        will be available in the `initialize`, `execute`, or `finalize`
        functions.

Parameters
----------
auto_complete_model_config : pb_utils.ModelConfig
An object containing the existing model configuration. You can build
upon the configuration given by this object when setting the
properties for this model.
Returns
-------
pb_utils.ModelConfig
An object containing the auto-completed model configuration
"""
        inputs = [{
            'name': 'PROMPT',
            'data_type': 'TYPE_STRING',
            'dims': [-1]
        }, {
            'name': 'IMAGES',
            'data_type': 'TYPE_STRING',  # raw encoded image bytes (e.g. JPEG/PNG)
            'dims': [-1]  # variable-length array, one element per image
        }]
outputs = [{
'name': 'RESULTS',
'data_type': 'TYPE_STRING',
'dims': [-1]
}]
config = auto_complete_model_config.as_dict()
        input_names = [inp['name'] for inp in config['input']]
        output_names = [out['name'] for out in config['output']]
        # Only add the inputs/outputs that are missing from the existing
        # configuration.
        for inp in inputs:
            if inp['name'] not in input_names:
                auto_complete_model_config.add_input(inp)
        for out in outputs:
            if out['name'] not in output_names:
                auto_complete_model_config.add_output(out)
auto_complete_model_config.set_dynamic_batching()
return auto_complete_model_config
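
    # For reference, the configuration auto-completed above corresponds
    # roughly to this minimal config.pbtxt (a sketch; the model name and
    # max_batch_size are assumptions, not values set by this file):
    #
    #   name: "spacellama"
    #   backend: "python"
    #   max_batch_size: 8
    #   input [
    #     { name: "PROMPT", data_type: TYPE_STRING, dims: [ -1 ] },
    #     { name: "IMAGES", data_type: TYPE_STRING, dims: [ -1 ] }
    #   ]
    #   output [
    #     { name: "RESULTS", data_type: TYPE_STRING, dims: [ -1 ] }
    #   ]
    #   dynamic_batching { }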
def initialize(self, args):
"""`initialize` is called only once when the model is being loaded.
        Implementing the `initialize` function is optional. This function
        allows the model to initialize any state associated with it.
Parameters
----------
args : dict
Both keys and values are strings. The dictionary keys and values are:
* model_config: A JSON string containing the model configuration
* model_instance_kind: A string containing model instance kind
* model_instance_device_id: A string containing model instance device
ID
* model_repository: Model repository path
* model_version: Model version
* model_name: Model name
"""
        # Choose the device assigned to this instance, falling back to CPU.
        device_id = args.get("model_instance_device_id", "0")
        device = (torch.device(f"cuda:{device_id}")
                  if torch.cuda.is_available() else torch.device("cpu"))
        self.model = load("remyxai/SpaceLlama3.1")
        self.model.to(device, dtype=torch.bfloat16)
        print('Initialized...')
    def run_inference(self, prompt, image_bytes):
        # Decode the raw image bytes (e.g. JPEG/PNG) into an RGB PIL image.
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        # Build a single-turn chat prompt in the format the model expects.
        prompt_builder = self.model.get_prompt_builder()
        prompt_builder.add_turn(role="human", message=prompt)
        prompt_text = prompt_builder.get_prompt()
        # Generate the answer with light sampling.
        output_string = self.model.generate(
            image,
            prompt_text,
            do_sample=True,
            temperature=0.1,
            max_new_tokens=512,
            min_length=1,
        )
        # Trim anything after the end-of-sequence token, if present.
        output_string = output_string.split("</s>")[0]
        # Triton TYPE_STRING outputs are numpy object arrays of UTF-8 bytes.
        output_data = np.array([output_string.encode('utf-8')], dtype=object)
        return output_data
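
    # Example round trip (a sketch; `img_bytes` is assumed to hold the raw
    # bytes of an encoded JPEG or PNG):
    #   out = self.run_inference("Describe the scene.", img_bytes)
    #   out[0].decode("utf-8")  # -> the generated answer text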
def execute(self, requests):
"""`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest objects as its
        only argument. It is called when inference is requested for this
        model.
Parameters
----------
requests : list
A list of pb_utils.InferenceRequest
Returns
-------
list
A list of pb_utils.InferenceResponse. The length of this list must
be the same as `requests`
"""
        responses = []
        for request in requests:
            # PROMPT is a TYPE_STRING tensor: a numpy object array of UTF-8
            # byte strings. Take the first (and only) element.
            prompt = pb_utils.get_input_tensor_by_name(
                request, "PROMPT").as_numpy()[0].decode("utf-8")
            # IMAGES carries the raw encoded image bytes; pass them through
            # undecoded so PIL can parse them.
            image = pb_utils.get_input_tensor_by_name(
                request, "IMAGES").as_numpy()[0]
            results = self.run_inference(prompt, image)
            # Package the generated text as the RESULTS output tensor.
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[pb_utils.Tensor("RESULTS", results)])
responses.append(inference_response)
return responses
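
    # Note: instead of raising on failure, a per-request error could be
    # attached to the response (a sketch using the backend's error API):
    #   pb_utils.InferenceResponse(
    #       output_tensors=[], error=pb_utils.TritonError("inference failed"))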
def finalize(self):
"""`finalize` is called only once when the model is being unloaded.
        Implementing the `finalize` function is optional. This function
        allows the model to perform any necessary clean-up before exit.
"""
print('Cleaning up...')
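

# ---------------------------------------------------------------------------
# Example client (a sketch, not part of the backend): one way to query this
# model over HTTP with tritonclient. The model name "spacellama", the server
# address, and the image path are assumptions; adjust them to the deployment.
# The guard below keeps this from running when Triton imports the model.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import tritonclient.http as httpclient

    client = httpclient.InferenceServerClient(url="localhost:8000")

    # TYPE_STRING tensors travel as numpy object arrays of bytes ("BYTES").
    prompt = np.array(["What objects are in this image?".encode("utf-8")],
                      dtype=object)
    with open("image.jpg", "rb") as f:
        image = np.array([f.read()], dtype=object)

    inputs = [
        httpclient.InferInput("PROMPT", list(prompt.shape), "BYTES"),
        httpclient.InferInput("IMAGES", list(image.shape), "BYTES"),
    ]
    inputs[0].set_data_from_numpy(prompt)
    inputs[1].set_data_from_numpy(image)

    result = client.infer(model_name="spacellama", inputs=inputs)
    print(result.as_numpy("RESULTS")[0].decode("utf-8"))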