smellslikeml committed
Commit · 6199e94 · 1 Parent(s): 4fa8cde

adding weights

Browse files:
- README.md +1 -1
- config.json +54 -37
- config.yaml +52 -0
- docker/Dockerfile +32 -0
- docker/client.py +61 -0
- docker/models/spacellama3.1/1/model.py +169 -0
- docker/models/spacellama3.1/config.pbtxt +24 -0
- run-metrics.jsonl +1 -0
- run_inference.py +49 -0
- spacellava+llama3-based-224-4epoch+stage-finetune+x7.jsonl +0 -0
README.md CHANGED
@@ -47,4 +47,4 @@ With a pipeline of expert models, we can infer spatial relationships between obj
   booktitle = {International Conference on Machine Learning (ICML)},
   year = {2024},
 }
-```
+```
config.json CHANGED
@@ -1,42 +1,59 @@
 {
-  [removed: the previous model-config entries are truncated in the diff view; only the final keys survive]
-  "use_fused_vision_backbone": true,
-  "vision_backbone_id": "dinosiglip-vit-so-224px",
-  "vocab_size": 32001
+  "dataset": {
+    "align_stage_components": [
+      "/home/ubuntu/spacellava_data/data/dataset.json",
+      "/home/ubuntu/spacellava_data/data"
+    ],
+    "dataset_id": "spacellava",
+    "dataset_root_dir": "/home/ubuntu/spacellava_data/data",
+    "finetune_stage_components": [
+      "/home/ubuntu/spacellava_data/data/dataset.json",
+      "/home/ubuntu/spacellava_data/data"
+    ],
+    "type": "spacellava"
+  },
+  "hf_token": ".hf_token",
+  "model": {
+    "align_epochs": 1,
+    "align_global_batch_size": 4,
+    "align_learning_rate": 0.001,
+    "align_lr_scheduler_type": "linear-warmup+cosine-decay",
+    "align_max_grad_norm": 1.0,
+    "align_max_steps": null,
+    "align_per_device_batch_size": 1,
+    "align_train_strategy": "fsdp-shard-grad-op",
+    "align_warmup_ratio": 0.03,
+    "align_weight_decay": 0.0,
+    "arch_specifier": "no-align+gelu-mlp",
+    "enable_gradient_checkpointing": true,
+    "enable_mixed_precision_training": true,
+    "finetune_epochs": 3,
+    "finetune_global_batch_size": 128,
+    "finetune_learning_rate": 2e-06,
+    "finetune_lr_scheduler_type": "linear-warmup+cosine-decay",
+    "finetune_max_grad_norm": 1.0,
+    "finetune_max_steps": null,
+    "finetune_per_device_batch_size": 4,
+    "finetune_train_strategy": "fsdp-full-shard",
+    "finetune_warmup_ratio": 0.03,
+    "finetune_weight_decay": 0.1,
+    "image_resize_strategy": "letterbox",
+    "llm_backbone_id": "llama3-1-8b-pure",
+    "llm_max_length": 2048,
+    "model_id": "llama3-based-224-4epoch",
+    "reduce_in_full_precision": false,
+    "type": "one-stage+7b",
+    "vision_backbone_id": "dinosiglip-vit-so-224px"
+  },
+  "pretrained_checkpoint": null,
+  "run_id": "spacellava+llama3-based-224-4epoch+stage-finetune+x7",
+  "run_root_dir": "runs",
+  "seed": 7,
+  "stage": "finetune",
+  "trackers": [
+    "jsonl",
+    "wandb"
+  ],
+  "wandb_entity": "smellslikeml",
+  "wandb_project": "prismatic"
 }
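Note that the rewritten config.json no longer holds the old model-style configuration (the removed file carried keys such as "use_fused_vision_backbone" and "vocab_size"); it now records the Prismatic training configuration for this run, mirroring config.yaml below and the "hparams" blob in run-metrics.jsonl.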
config.yaml ADDED
@@ -0,0 +1,52 @@
+dataset:
+  align_stage_components:
+  - /home/ubuntu/spacellava_data/data/dataset.json
+  - /home/ubuntu/spacellava_data/data
+  dataset_id: spacellava
+  dataset_root_dir: /home/ubuntu/spacellava_data/data
+  finetune_stage_components:
+  - /home/ubuntu/spacellava_data/data/dataset.json
+  - /home/ubuntu/spacellava_data/data
+  type: spacellava
+hf_token: .hf_token
+model:
+  align_epochs: 1
+  align_global_batch_size: 4
+  align_learning_rate: 0.001
+  align_lr_scheduler_type: linear-warmup+cosine-decay
+  align_max_grad_norm: 1.0
+  align_max_steps: null
+  align_per_device_batch_size: 1
+  align_train_strategy: fsdp-shard-grad-op
+  align_warmup_ratio: 0.03
+  align_weight_decay: 0.0
+  arch_specifier: no-align+gelu-mlp
+  enable_gradient_checkpointing: true
+  enable_mixed_precision_training: true
+  finetune_epochs: 3
+  finetune_global_batch_size: 128
+  finetune_learning_rate: 2.0e-06
+  finetune_lr_scheduler_type: linear-warmup+cosine-decay
+  finetune_max_grad_norm: 1.0
+  finetune_max_steps: null
+  finetune_per_device_batch_size: 4
+  finetune_train_strategy: fsdp-full-shard
+  finetune_warmup_ratio: 0.03
+  finetune_weight_decay: 0.1
+  image_resize_strategy: letterbox
+  llm_backbone_id: llama3-1-8b-pure
+  llm_max_length: 2048
+  model_id: llama3-based-224-4epoch
+  reduce_in_full_precision: false
+  type: one-stage+7b
+  vision_backbone_id: dinosiglip-vit-so-224px
+pretrained_checkpoint: null
+run_id: spacellava+llama3-based-224-4epoch+stage-finetune+x7
+run_root_dir: runs
+seed: 7
+stage: finetune
+trackers:
+- jsonl
+- wandb
+wandb_entity: smellslikeml
+wandb_project: prismatic
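If you want to inspect the run configuration programmatically, a minimal sketch (assumes PyYAML is installed; any YAML loader works):

    import yaml  # assumes PyYAML

    with open("config.yaml") as f:
        cfg = yaml.safe_load(f)

    # a few of the finetune hyperparameters recorded above
    print(cfg["model"]["finetune_learning_rate"])      # 2e-06
    print(cfg["model"]["finetune_global_batch_size"])  # 128
    print(cfg["dataset"]["dataset_id"])                # spacellava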
docker/Dockerfile ADDED
@@ -0,0 +1,32 @@
+FROM nvcr.io/nvidia/tritonserver:22.11-py3
+
+WORKDIR /workspace
+
+RUN apt-get update && apt-get install cmake -y
+
+RUN pip install --upgrade pip && pip install --upgrade tensorrt
+
+RUN git clone https://github.com/NVIDIA/TensorRT.git -b main --single-branch \
+    && cd TensorRT \
+    && git submodule update --init --recursive
+
+ENV TRT_OSSPATH=/workspace/TensorRT
+WORKDIR ${TRT_OSSPATH}
+
+RUN mkdir -p build \
+    && cd build \
+    && cmake .. -DTRT_OUT_DIR=$PWD/out \
+    && cd plugin \
+    && make -j$(nproc)
+
+ENV PLUGIN_LIBS="${TRT_OSSPATH}/build/out/libnvinfer_plugin.so"
+
+RUN python3 -m pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
+RUN git clone https://github.com/remyxai/prismatic-vlms.git && cd prismatic-vlms && python3 -m pip install .
+RUN python3 -m pip install --upgrade transformers
+
+WORKDIR /models
+COPY ./models/ .
+
+WORKDIR /workspace
+CMD ["tritonserver", "--model-store=/models"]
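Assuming the usual Triton port layout, the image can be built and started with, for example, `docker build -t spacellama-triton .` from the `docker/` directory, then `docker run --gpus all --rm -p 8000:8000 -p 8001:8001 -p 8002:8002 spacellama-triton` (8000 HTTP, 8001 gRPC, 8002 metrics); the `CMD` above serves every model found under `/models`.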
docker/client.py ADDED
@@ -0,0 +1,61 @@
+import argparse
+import time
+import base64
+import numpy as np
+import requests
+import os
+from urllib.parse import urlparse
+from tritonclient.http import InferenceServerClient, InferInput, InferRequestedOutput
+
+def download_image(image_url):
+    parsed_url = urlparse(image_url)
+    filename = os.path.basename(parsed_url.path)
+    response = requests.get(image_url)
+    if response.status_code == 200:
+        with open(filename, 'wb') as img_file:
+            img_file.write(response.content)
+        return filename
+    else:
+        raise Exception("Failed to download image")
+
+def image_to_base64_data_uri(image_input):
+    with open(image_input, "rb") as img_file:
+        base64_data = base64.b64encode(img_file.read()).decode('utf-8')
+        return base64_data
+
+def setup_argparse():
+    parser = argparse.ArgumentParser(description="Client for Triton Inference Server")
+    parser.add_argument("--image_path", type=str, required=True, help="Path to the image or URL of the image to process")
+    parser.add_argument("--prompt", type=str, required=True, help="Prompt to be used for the inference")
+    return parser.parse_args()
+
+if __name__ == "__main__":
+    args = setup_argparse()
+
+    triton_client = InferenceServerClient(url="localhost:8000", verbose=False)
+
+    if args.image_path.startswith('http://') or args.image_path.startswith('https://'):
+        image_path = download_image(args.image_path)
+    else:
+        image_path = args.image_path
+
+    image_data = image_to_base64_data_uri(image_path).encode('utf-8')
+    image_data_np = np.array([image_data], dtype=object)
+    prompt_np = np.array([args.prompt.encode('utf-8')], dtype=object)
+
+    images_in = InferInput(name="IMAGES", shape=[1], datatype="BYTES")
+    images_in.set_data_from_numpy(image_data_np, binary_data=True)
+    prompt_in = InferInput(name="PROMPT", shape=[1], datatype="BYTES")
+    prompt_in.set_data_from_numpy(prompt_np, binary_data=True)
+
+    results_out = InferRequestedOutput(name="RESULTS", binary_data=False)
+
+    start_time = time.time()
+    response = triton_client.infer(model_name="spacellama3.1",
+                                   model_version="1",
+                                   inputs=[prompt_in, images_in],
+                                   outputs=[results_out])
+
+    results = response.get_response()["outputs"][0]["data"][0]
+    print("--- %s seconds ---" % (time.time() - start_time))
+    print(results)
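With the server up, a single request looks like, for example, `python docker/client.py --image_path https://example.com/kitchen.jpg --prompt "How far is the chair from the counter?"` (URL and prompt here are illustrative); the script prints the wall-clock latency followed by the generated answer.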
docker/models/spacellama3.1/1/model.py ADDED
@@ -0,0 +1,169 @@
+import os
+import io
+import base64
+import torch
+import numpy as np
+import triton_python_backend_utils as pb_utils
+from prismatic import load
+from PIL import Image
+
+class TritonPythonModel:
+    """Your Python model must use the same class name. Every Python model
+    that is created must have "TritonPythonModel" as the class name.
+    """
+
+    @staticmethod
+    def auto_complete_config(auto_complete_model_config):
+        """`auto_complete_config` is called only once when loading the model,
+        assuming the server was not started with
+        `--disable-auto-complete-config`. Implementing this function is
+        optional; omitting it leaves the configuration unchanged.
+        This function can be used to set `max_batch_size`, `input` and `output`
+        properties of the model using `set_max_batch_size`, `add_input`, and
+        `add_output`. These properties will allow Triton to load the model with
+        minimal model configuration in absence of a configuration file. This
+        function returns the `pb_utils.ModelConfig` object with these
+        properties. You can use the `as_dict` function to gain read-only access
+        to the `pb_utils.ModelConfig` object. The `pb_utils.ModelConfig` object
+        being returned from here will be used as the final configuration for
+        the model.
+
+        Note: The Python interpreter used to invoke this function will be
+        destroyed upon returning from this function and as a result none of the
+        objects created here will be available in the `initialize`, `execute`,
+        or `finalize` functions.
+
+        Parameters
+        ----------
+        auto_complete_model_config : pb_utils.ModelConfig
+            An object containing the existing model configuration. You can build
+            upon the configuration given by this object when setting the
+            properties for this model.
+
+        Returns
+        -------
+        pb_utils.ModelConfig
+            An object containing the auto-completed model configuration
+        """
+        inputs = [{
+            'name': 'PROMPT',
+            'data_type': 'TYPE_STRING',
+            'dims': [-1]
+        }, {
+            'name': 'IMAGES',
+            'data_type': 'TYPE_STRING',  # changed from TYPE_FP16 to TYPE_STRING
+            'dims': [-1]  # changed to indicate a variable-length array of strings
+        }]
+
+        outputs = [{
+            'name': 'RESULTS',
+            'data_type': 'TYPE_STRING',
+            'dims': [-1]
+        }]
+
+        config = auto_complete_model_config.as_dict()
+        input_names = []
+        output_names = []
+        for input in config['input']:
+            input_names.append(input['name'])
+        for output in config['output']:
+            output_names.append(output['name'])
+
+        for input in inputs:
+            if input['name'] not in input_names:
+                auto_complete_model_config.add_input(input)
+        for output in outputs:
+            if output['name'] not in output_names:
+                auto_complete_model_config.add_output(output)
+
+        auto_complete_model_config.set_dynamic_batching()
+
+        return auto_complete_model_config
+
+    def initialize(self, args):
+        """`initialize` is called only once when the model is being loaded.
+        Implementing `initialize` function is optional. This function allows
+        the model to initialize any state associated with this model.
+
+        Parameters
+        ----------
+        args : dict
+          Both keys and values are strings. The dictionary keys and values are:
+          * model_config: A JSON string containing the model configuration
+          * model_instance_kind: A string containing model instance kind
+          * model_instance_device_id: A string containing model instance device
+            ID
+          * model_repository: Model repository path
+          * model_version: Model version
+          * model_name: Model name
+        """
+        # `device` was previously undefined here (a NameError at load time);
+        # select it the same way run_inference.py does.
+        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+        self.model = load("remyxai/SpaceLlama3.1")
+        self.model.to(device, dtype=torch.bfloat16)
+        print('Initialized...')
+
+    def run_inference(self, prompt, image):
+        # The client sends the image as a base64 string, so decode it back to
+        # raw bytes before handing it to PIL (io.BytesIO needs bytes, not str).
+        image = Image.open(io.BytesIO(base64.b64decode(image))).convert("RGB")
+        prompt_builder = self.model.get_prompt_builder()
+        prompt_builder.add_turn(role="human", message=prompt)
+        prompt_text = prompt_builder.get_prompt()
+
+        output_string = self.model.generate(
+            image,
+            prompt_text,
+            do_sample=True,
+            temperature=0.1,
+            max_new_tokens=512,
+            min_length=1,
+        )
+        output_string = output_string.split("</s>")[0]
+        output_data = np.array([output_string.encode('utf-8')], dtype=object)
+        return output_data
+
+    def execute(self, requests):
+        """`execute` must be implemented in every Python model. `execute`
+        function receives a list of pb_utils.InferenceRequest as the only
+        argument. This function is called when an inference is requested
+        for this model.
+
+        Parameters
+        ----------
+        requests : list
+          A list of pb_utils.InferenceRequest
+
+        Returns
+        -------
+        list
+          A list of pb_utils.InferenceResponse. The length of this list must
+          be the same as `requests`
+        """
+
+        responses = []
+
+        for request in requests:
+            # Perform inference on the request and append it to responses
+            # list...
+            prompt = [
+                t.decode("UTF-8")
+                for t in pb_utils.get_input_tensor_by_name(request, "PROMPT")
+                .as_numpy()
+                .tolist()
+            ][0]
+            image = [
+                t.decode("UTF-8")
+                for t in pb_utils.get_input_tensor_by_name(request, "IMAGES")
+                .as_numpy()
+                .tolist()
+            ][0]
+            results = self.run_inference(prompt, image)
+
+            # Sending results
+            inference_response = pb_utils.InferenceResponse(output_tensors=[
+                pb_utils.Tensor(
+                    "RESULTS",
+                    results,
+                )
+            ])
+
+            responses.append(inference_response)
+
+        return responses
+
+    def finalize(self):
+        """`finalize` is called only once when the model is being unloaded.
+        Implementing `finalize` function is optional. This function allows
+        the model to perform any necessary clean ups before exit.
+        """
+        print('Cleaning up...')
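The only contract between client.py and model.py is the base64 text carried on the IMAGES tensor. A minimal sketch of that round-trip with Triton taken out of the picture (demo.jpg is a stand-in path):

    import base64
    import io
    from PIL import Image

    # client side: raw file bytes -> base64 text
    with open("demo.jpg", "rb") as f:
        payload = base64.b64encode(f.read()).decode("utf-8")

    # server side (run_inference above): base64 text -> bytes -> PIL image
    image = Image.open(io.BytesIO(base64.b64decode(payload))).convert("RGB")
    print(image.size)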
docker/models/spacellama3.1/config.pbtxt ADDED
@@ -0,0 +1,24 @@
+name: "spacellama3.1"
+max_batch_size: 0
+backend: "python"
+
+input [
+  {
+    name: "PROMPT"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  },
+  {
+    name: "IMAGES"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  }
+]
+
+output [
+  {
+    name: "RESULTS"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  }
+]
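Because `max_batch_size` is 0, Triton treats each `dims: [ -1 ]` as the full tensor shape with no implicit batch dimension, so every request carries exactly one prompt/image pair. This explicit config.pbtxt should also take precedence over the defaults that `auto_complete_config` in model.py would fill in; auto-completion only applies where the configuration file is absent or incomplete.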
run-metrics.jsonl ADDED
@@ -0,0 +1 @@
+{"hparams": {"dataset": {"align_stage_components": ["/home/ubuntu/spacellava_data/data/dataset.json", "/home/ubuntu/spacellava_data/data"], "dataset_id": "spacellava", "dataset_root_dir": "/home/ubuntu/spacellava_data/data", "finetune_stage_components": ["/home/ubuntu/spacellava_data/data/dataset.json", "/home/ubuntu/spacellava_data/data"], "type": "spacellava"}, "hf_token": ".hf_token", "model": {"align_epochs": 1, "align_global_batch_size": 4, "align_learning_rate": 0.001, "align_lr_scheduler_type": "linear-warmup+cosine-decay", "align_max_grad_norm": 1.0, "align_max_steps": null, "align_per_device_batch_size": 1, "align_train_strategy": "fsdp-shard-grad-op", "align_warmup_ratio": 0.03, "align_weight_decay": 0.0, "arch_specifier": "no-align+gelu-mlp", "enable_gradient_checkpointing": true, "enable_mixed_precision_training": true, "finetune_epochs": 3, "finetune_global_batch_size": 128, "finetune_learning_rate": 2e-06, "finetune_lr_scheduler_type": "linear-warmup+cosine-decay", "finetune_max_grad_norm": 1.0, "finetune_max_steps": null, "finetune_per_device_batch_size": 4, "finetune_train_strategy": "fsdp-full-shard", "finetune_warmup_ratio": 0.03, "finetune_weight_decay": 0.1, "image_resize_strategy": "letterbox", "llm_backbone_id": "llama3-1-8b-pure", "llm_max_length": 2048, "model_id": "llama3-based-224-4epoch", "reduce_in_full_precision": false, "type": "one-stage+7b", "vision_backbone_id": "dinosiglip-vit-so-224px"}, "pretrained_checkpoint": null, "run_id": "spacellava+llama3-based-224-4epoch+stage-finetune+x7", "run_root_dir": "runs", "seed": 7, "stage": "finetune", "trackers": ["jsonl", "wandb"], "wandb_entity": "smellslikeml", "wandb_project": "prismatic"}, "run_id": "spacellava+llama3-based-224-4epoch+stage-finetune+x7"}
run_inference.py ADDED
@@ -0,0 +1,49 @@
+import argparse
+import requests
+import torch
+from PIL import Image
+from pathlib import Path
+from prismatic import load
+
+def main(model_location, user_prompt, image_source):
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+    # Load a pretrained VLM (either local path, or ID to auto-download from the HF Hub)
+    vlm = load(model_location)
+    vlm.to(device, dtype=torch.bfloat16)
+
+    # Load the image from URL or local path
+    if image_source.startswith("http://") or image_source.startswith("https://"):
+        image = Image.open(requests.get(image_source, stream=True).raw).convert("RGB")
+    else:
+        image = Image.open(image_source).convert("RGB")
+
+    # Build prompt
+    prompt_builder = vlm.get_prompt_builder()
+    prompt_builder.add_turn(role="human", message=user_prompt)
+    prompt_text = prompt_builder.get_prompt()
+
+    # Generate!
+    generated_text = vlm.generate(
+        image,
+        prompt_text,
+        do_sample=True,
+        temperature=0.1,
+        max_new_tokens=512,
+        min_length=1,
+    )
+    generated_text = generated_text.split("</s>")[0]
+
+    print("PROMPT TEXT: ", user_prompt)
+    print("GENERATED TEXT: ", generated_text)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Process an image and prompt with a pretrained VLM model.")
+    parser.add_argument("--model_location", type=str, required=True, help="The location of the pretrained VLM model.")
+    parser.add_argument("--user_prompt", type=str, required=True, help="The prompt to process.")
+    parser.add_argument("--image_source", type=str, required=True, help="The URL or local path of the image.")
+
+    args = parser.parse_args()
+
+    main(args.model_location, args.user_prompt, args.image_source)
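For a local smoke test outside Triton, for example: `python run_inference.py --model_location remyxai/SpaceLlama3.1 --user_prompt "Describe the spatial layout" --image_source ./scene.jpg` (prompt and image path illustrative); the checkpoint is downloaded from the HF Hub on first use.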
spacellava+llama3-based-224-4epoch+stage-finetune+x7.jsonl ADDED
The diff for this file is too large to render. See raw diff.