import io

import numpy as np
import torch
import triton_python_backend_utils as pb_utils
from PIL import Image
from prismatic import load
|
|
class TritonPythonModel: |
|
"""Your Python model must use the same class name. Every Python model |
|
that is created must have "TritonPythonModel" as the class name. |
|
""" |
|
|
|

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        """`auto_complete_config` is called only once when loading the model,
        assuming the server was not started with
        `--disable-auto-complete-config`. Implementing this function is
        optional; leaving it unimplemented does nothing. This function can be
        used to set the `max_batch_size`, `input`, and `output` properties of
        the model using `set_max_batch_size`, `add_input`, and `add_output`.
        These properties allow Triton to load the model with a minimal model
        configuration in the absence of a configuration file. This function
        returns the `pb_utils.ModelConfig` object with these properties. You
        can use the `as_dict` function to gain read-only access to the
        `pb_utils.ModelConfig` object. The `pb_utils.ModelConfig` object
        returned from here will be used as the final configuration for the
        model.

        Note: The Python interpreter used to invoke this function will be
        destroyed upon returning from this function, and as a result none of
        the objects created here will be available in the `initialize`,
        `execute`, or `finalize` functions.

        Parameters
        ----------
        auto_complete_model_config : pb_utils.ModelConfig
            An object containing the existing model configuration. You can
            build upon the configuration given by this object when setting
            the properties for this model.

        Returns
        -------
        pb_utils.ModelConfig
            An object containing the auto-completed model configuration
        """
        inputs = [{
            'name': 'PROMPT',
            'data_type': 'TYPE_STRING',
            'dims': [-1]
        }, {
            'name': 'IMAGES',
            'data_type': 'TYPE_STRING',
            'dims': [-1]
        }]

        outputs = [{
            'name': 'RESULTS',
            'data_type': 'TYPE_STRING',
            'dims': [-1]
        }]

        # Collect the names of any inputs/outputs already present in the
        # given configuration so they are not registered twice below.
        config = auto_complete_model_config.as_dict()
        input_names = []
        output_names = []
        for input in config['input']:
            input_names.append(input['name'])
        for output in config['output']:
            output_names.append(output['name'])

        # Register whichever of the expected inputs/outputs are missing.
        for input in inputs:
            if input['name'] not in input_names:
                auto_complete_model_config.add_input(input)
        for output in outputs:
            if output['name'] not in output_names:
                auto_complete_model_config.add_output(output)

        auto_complete_model_config.set_dynamic_batching()

        return auto_complete_model_config
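
    # For reference, serving this model without a config.pbtxt and with
    # auto-complete enabled yields a configuration roughly equivalent to the
    # sketch below. The model name "spacellama" is an illustrative assumption
    # (Triton derives it from the repository directory; it is not set here):
    #
    #   name: "spacellama"
    #   backend: "python"
    #   input [
    #     { name: "PROMPT", data_type: TYPE_STRING, dims: [ -1 ] },
    #     { name: "IMAGES", data_type: TYPE_STRING, dims: [ -1 ] }
    #   ]
    #   output [
    #     { name: "RESULTS", data_type: TYPE_STRING, dims: [ -1 ] }
    #   ]
    #   dynamic_batching { }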

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
            Both keys and values are strings. The dictionary keys and values
            are:
            * model_config: A JSON string containing the model configuration
            * model_instance_kind: A string containing model instance kind
            * model_instance_device_id: A string containing model instance
              device ID
            * model_repository: Model repository path
            * model_version: Model version
            * model_name: Model name
        """
        # Place the model on the device Triton assigned to this instance,
        # falling back to CPU when no GPU is available.
        if args["model_instance_kind"] == "GPU" and torch.cuda.is_available():
            device = torch.device(f"cuda:{args['model_instance_device_id']}")
        else:
            device = torch.device("cpu")

        self.model = load("remyxai/SpaceLlama3.1")
        self.model.to(device, dtype=torch.bfloat16)
        print('Initialized...')

    def run_inference(self, prompt, image_bytes):
        # Decode the raw image bytes sent by the client into an RGB image.
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

        # Wrap the user prompt in the model's chat template.
        prompt_builder = self.model.get_prompt_builder()
        prompt_builder.add_turn(role="human", message=prompt)
        prompt_text = prompt_builder.get_prompt()

        output_string = self.model.generate(
            image,
            prompt_text,
            do_sample=True,
            temperature=0.1,
            max_new_tokens=512,
            min_length=1,
        )
        # Drop anything after the end-of-sequence marker, if present.
        output_string = output_string.split("</s>")[0]
        output_data = np.array([output_string.encode('utf-8')], dtype=object)
        return output_data

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
            A list of pb_utils.InferenceRequest

        Returns
        -------
        list
            A list of pb_utils.InferenceResponse. The length of this list
            must be the same as `requests`
        """
        responses = []

        for request in requests:
            # PROMPT carries UTF-8 text, so decode it into a Python string.
            prompt = [
                t.decode("UTF-8")
                for t in pb_utils.get_input_tensor_by_name(request, "PROMPT")
                .as_numpy()
                .tolist()
            ][0]
            # IMAGES carries raw image bytes (Triton's TYPE_STRING elements
            # are arbitrary byte strings), so pass them along undecoded.
            image = pb_utils.get_input_tensor_by_name(
                request, "IMAGES").as_numpy().tolist()[0]
            results = self.run_inference(prompt, image)

            inference_response = pb_utils.InferenceResponse(output_tensors=[
                pb_utils.Tensor(
                    "RESULTS",
                    results,
                )
            ])
            responses.append(inference_response)

        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')
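
# A minimal client sketch for exercising this model, kept here as a comment
# for reference only. It assumes the model is deployed under the name
# "spacellama" and the server is reachable at localhost:8000; the model name,
# URL, image file, and question are all illustrative assumptions:
#
#   import numpy as np
#   import tritonclient.http as httpclient
#
#   client = httpclient.InferenceServerClient(url="localhost:8000")
#
#   with open("warehouse.jpg", "rb") as f:
#       image_bytes = f.read()
#
#   # TYPE_STRING tensors map to the "BYTES" client datatype; the prompt is
#   # UTF-8 text while IMAGES carries the raw, unencoded image bytes.
#   prompt = np.array([b"How far is the pallet from the forklift?"],
#                     dtype=object)
#   images = np.array([image_bytes], dtype=object)
#
#   inputs = [
#       httpclient.InferInput("PROMPT", prompt.shape, "BYTES"),
#       httpclient.InferInput("IMAGES", images.shape, "BYTES"),
#   ]
#   inputs[0].set_data_from_numpy(prompt)
#   inputs[1].set_data_from_numpy(images)
#
#   result = client.infer(model_name="spacellama", inputs=inputs)
#   print(result.as_numpy("RESULTS")[0].decode("utf-8"))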