smellslikeml committed · Commit 6199e94 · Parent(s): 4fa8cde

adding weights

Browse files:
- README.md (+1, -1)
- config.json (+54, -37)
- config.yaml (+52, -0)
- docker/Dockerfile (+32, -0)
- docker/client.py (+61, -0)
- docker/models/spacellama3.1/1/model.py (+169, -0)
- docker/models/spacellama3.1/config.pbtxt (+24, -0)
- run-metrics.jsonl (+1, -0)
- run_inference.py (+49, -0)
- spacellava+llama3-based-224-4epoch+stage-finetune+x7.jsonl (+0, -0)
README.md CHANGED
@@ -47,4 +47,4 @@ With a pipeline of expert models, we can infer spatial relationships between obj
   booktitle = {International Conference on Machine Learning (ICML)},
   year = {2024},
 }
-```
+```
config.json CHANGED
@@ -1,42 +1,59 @@
 {
-  …
+  "dataset": {
+    "align_stage_components": [
+      "/home/ubuntu/spacellava_data/data/dataset.json",
+      "/home/ubuntu/spacellava_data/data"
+    ],
+    "dataset_id": "spacellava",
+    "dataset_root_dir": "/home/ubuntu/spacellava_data/data",
+    "finetune_stage_components": [
+      "/home/ubuntu/spacellava_data/data/dataset.json",
+      "/home/ubuntu/spacellava_data/data"
+    ],
+    "type": "spacellava"
   },
-  …
+  "hf_token": ".hf_token",
+  "model": {
+    "align_epochs": 1,
+    "align_global_batch_size": 4,
+    "align_learning_rate": 0.001,
+    "align_lr_scheduler_type": "linear-warmup+cosine-decay",
+    "align_max_grad_norm": 1.0,
+    "align_max_steps": null,
+    "align_per_device_batch_size": 1,
+    "align_train_strategy": "fsdp-shard-grad-op",
+    "align_warmup_ratio": 0.03,
+    "align_weight_decay": 0.0,
+    "arch_specifier": "no-align+gelu-mlp",
+    "enable_gradient_checkpointing": true,
+    "enable_mixed_precision_training": true,
+    "finetune_epochs": 3,
+    "finetune_global_batch_size": 128,
+    "finetune_learning_rate": 2e-06,
+    "finetune_lr_scheduler_type": "linear-warmup+cosine-decay",
+    "finetune_max_grad_norm": 1.0,
+    "finetune_max_steps": null,
+    "finetune_per_device_batch_size": 4,
+    "finetune_train_strategy": "fsdp-full-shard",
+    "finetune_warmup_ratio": 0.03,
+    "finetune_weight_decay": 0.1,
+    "image_resize_strategy": "letterbox",
+    "llm_backbone_id": "llama3-1-8b-pure",
+    "llm_max_length": 2048,
+    "model_id": "llama3-based-224-4epoch",
+    "reduce_in_full_precision": false,
+    "type": "one-stage+7b",
+    "vision_backbone_id": "dinosiglip-vit-so-224px"
   },
-  …
+  "pretrained_checkpoint": null,
+  "run_id": "spacellava+llama3-based-224-4epoch+stage-finetune+x7",
+  "run_root_dir": "runs",
+  "seed": 7,
+  "stage": "finetune",
+  "trackers": [
+    "jsonl",
+    "wandb"
   ],
-  "use_fused_vision_backbone": true,
-  "vision_backbone_id": "dinosiglip-vit-so-224px",
-  "vocab_size": 32001
+  "wandb_entity": "smellslikeml",
+  "wandb_project": "prismatic"
 }
config.yaml ADDED
@@ -0,0 +1,52 @@
dataset:
  align_stage_components:
  - /home/ubuntu/spacellava_data/data/dataset.json
  - /home/ubuntu/spacellava_data/data
  dataset_id: spacellava
  dataset_root_dir: /home/ubuntu/spacellava_data/data
  finetune_stage_components:
  - /home/ubuntu/spacellava_data/data/dataset.json
  - /home/ubuntu/spacellava_data/data
  type: spacellava
hf_token: .hf_token
model:
  align_epochs: 1
  align_global_batch_size: 4
  align_learning_rate: 0.001
  align_lr_scheduler_type: linear-warmup+cosine-decay
  align_max_grad_norm: 1.0
  align_max_steps: null
  align_per_device_batch_size: 1
  align_train_strategy: fsdp-shard-grad-op
  align_warmup_ratio: 0.03
  align_weight_decay: 0.0
  arch_specifier: no-align+gelu-mlp
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true
  finetune_epochs: 3
  finetune_global_batch_size: 128
  finetune_learning_rate: 2.0e-06
  finetune_lr_scheduler_type: linear-warmup+cosine-decay
  finetune_max_grad_norm: 1.0
  finetune_max_steps: null
  finetune_per_device_batch_size: 4
  finetune_train_strategy: fsdp-full-shard
  finetune_warmup_ratio: 0.03
  finetune_weight_decay: 0.1
  image_resize_strategy: letterbox
  llm_backbone_id: llama3-1-8b-pure
  llm_max_length: 2048
  model_id: llama3-based-224-4epoch
  reduce_in_full_precision: false
  type: one-stage+7b
  vision_backbone_id: dinosiglip-vit-so-224px
pretrained_checkpoint: null
run_id: spacellava+llama3-based-224-4epoch+stage-finetune+x7
run_root_dir: runs
seed: 7
stage: finetune
trackers:
- jsonl
- wandb
wandb_entity: smellslikeml
wandb_project: prismatic
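This training configuration can be read back programmatically. A minimal sketch, assuming PyYAML is installed and config.yaml sits in the working directory:

```
# Recover the training hyperparameters from config.yaml.
import yaml

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["model"]["llm_backbone_id"])     # llama3-1-8b-pure
print(cfg["model"]["vision_backbone_id"])  # dinosiglip-vit-so-224px
print(cfg["stage"], cfg["seed"])           # finetune 7
```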
docker/Dockerfile ADDED
@@ -0,0 +1,32 @@
FROM nvcr.io/nvidia/tritonserver:22.11-py3

WORKDIR /workspace

RUN apt-get update && apt-get install cmake -y

RUN pip install --upgrade pip && pip install --upgrade tensorrt

RUN git clone https://github.com/NVIDIA/TensorRT.git -b main --single-branch \
    && cd TensorRT \
    && git submodule update --init --recursive

ENV TRT_OSSPATH=/workspace/TensorRT
WORKDIR ${TRT_OSSPATH}

RUN mkdir -p build \
    && cd build \
    && cmake .. -DTRT_OUT_DIR=$PWD/out \
    && cd plugin \
    && make -j$(nproc)

ENV PLUGIN_LIBS="${TRT_OSSPATH}/build/out/libnvinfer_plugin.so"

RUN python3 -m pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
RUN git clone https://github.com/remyxai/prismatic-vlms.git && cd prismatic-vlms && python3 -m pip install .
RUN python3 -m pip install --upgrade transformers

WORKDIR /models
COPY ./models/ .

WORKDIR /workspace
CMD ["tritonserver", "--model-store=/models"]
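A quick way to verify the built image is a short check run inside the container; a sketch, not part of the build, where the expected version string follows from the pip pins above:

```
# Hypothetical sanity check, run inside the container: confirms the torch
# pin and the prismatic package installed from remyxai/prismatic-vlms.
import torch
from prismatic import load  # the loader used by model.py and run_inference.py

print(torch.__version__)          # expected: 2.0.1+cu118 per the pip install above
print(torch.cuda.is_available())  # True when the container sees a GPU
print(callable(load))             # True
```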
docker/client.py ADDED
@@ -0,0 +1,61 @@
import argparse
import time
import base64
import numpy as np
import requests
import os
from urllib.parse import urlparse
from tritonclient.http import InferenceServerClient, InferInput, InferRequestedOutput

def download_image(image_url):
    parsed_url = urlparse(image_url)
    filename = os.path.basename(parsed_url.path)
    response = requests.get(image_url)
    if response.status_code == 200:
        with open(filename, 'wb') as img_file:
            img_file.write(response.content)
        return filename
    else:
        raise Exception("Failed to download image")

def image_to_base64_data_uri(image_input):
    with open(image_input, "rb") as img_file:
        base64_data = base64.b64encode(img_file.read()).decode('utf-8')
        return base64_data

def setup_argparse():
    parser = argparse.ArgumentParser(description="Client for Triton Inference Server")
    parser.add_argument("--image_path", type=str, required=True, help="Path to the image or URL of the image to process")
    parser.add_argument("--prompt", type=str, required=True, help="Prompt to be used for the inference")
    return parser.parse_args()

if __name__ == "__main__":
    args = setup_argparse()

    triton_client = InferenceServerClient(url="localhost:8000", verbose=False)

    if args.image_path.startswith('http://') or args.image_path.startswith('https://'):
        image_path = download_image(args.image_path)
    else:
        image_path = args.image_path

    image_data = image_to_base64_data_uri(image_path).encode('utf-8')
    image_data_np = np.array([image_data], dtype=object)
    prompt_np = np.array([args.prompt.encode('utf-8')], dtype=object)

    images_in = InferInput(name="IMAGES", shape=[1], datatype="BYTES")
    images_in.set_data_from_numpy(image_data_np, binary_data=True)
    prompt_in = InferInput(name="PROMPT", shape=[1], datatype="BYTES")
    prompt_in.set_data_from_numpy(prompt_np, binary_data=True)

    results_out = InferRequestedOutput(name="RESULTS", binary_data=False)

    start_time = time.time()
    response = triton_client.infer(model_name="spacellama3.1",
                                   model_version="1",
                                   inputs=[prompt_in, images_in],
                                   outputs=[results_out])

    results = response.get_response()["outputs"][0]["data"][0]
    print("--- %s seconds ---" % (time.time() - start_time))
    print(results)
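The client encodes the image file as base64 text before shipping it as a BYTES tensor, so the server side must reverse that encoding before PIL can read the bytes. A minimal sketch of that round trip, with "example.png" as a hypothetical placeholder image:

```
# Sketch of the client/server encoding contract.
import base64
import io
from PIL import Image

with open("example.png", "rb") as f:
    b64_text = base64.b64encode(f.read()).decode("utf-8")  # what client.py sends

# What the backend must do before inference: decode base64 back to raw bytes.
image = Image.open(io.BytesIO(base64.b64decode(b64_text))).convert("RGB")
print(image.size)
```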
docker/models/spacellama3.1/1/model.py ADDED
@@ -0,0 +1,169 @@
import os
import io
import base64  # the client ships images as base64 text, so we must decode here
import torch
import numpy as np
import triton_python_backend_utils as pb_utils
from prismatic import load
from PIL import Image

class TritonPythonModel:
    """Your Python model must use the same class name. Every Python model
    that is created must have "TritonPythonModel" as the class name.
    """

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        """`auto_complete_config` is called only once when loading the model,
        assuming the server was not started with
        `--disable-auto-complete-config`. Implementing this function is
        optional; leaving it unimplemented does nothing. This function can be
        used to set `max_batch_size`, `input`, and `output` properties of the
        model using `set_max_batch_size`, `add_input`, and `add_output`.
        These properties will allow Triton to load the model with minimal
        model configuration in absence of a configuration file. This function
        returns the `pb_utils.ModelConfig` object with these properties. You
        can use the `as_dict` function to gain read-only access to the
        `pb_utils.ModelConfig` object. The `pb_utils.ModelConfig` object
        being returned from here will be used as the final configuration for
        the model.

        Note: The Python interpreter used to invoke this function will be
        destroyed upon returning from this function, and as a result none of
        the objects created here will be available in the `initialize`,
        `execute`, or `finalize` functions.

        Parameters
        ----------
        auto_complete_model_config : pb_utils.ModelConfig
            An object containing the existing model configuration. You can
            build upon the configuration given by this object when setting
            the properties for this model.

        Returns
        -------
        pb_utils.ModelConfig
            An object containing the auto-completed model configuration
        """
        inputs = [{
            'name': 'PROMPT',
            'data_type': 'TYPE_STRING',
            'dims': [-1]
        }, {
            'name': 'IMAGES',
            'data_type': 'TYPE_STRING',  # changed from TYPE_FP16: images arrive as base64 strings
            'dims': [-1]  # variable-length array of strings
        }]

        outputs = [{
            'name': 'RESULTS',
            'data_type': 'TYPE_STRING',
            'dims': [-1]
        }]

        config = auto_complete_model_config.as_dict()
        input_names = []
        output_names = []
        for input in config['input']:
            input_names.append(input['name'])
        for output in config['output']:
            output_names.append(output['name'])

        # Only add the inputs/outputs that config.pbtxt did not already declare.
        for input in inputs:
            if input['name'] not in input_names:
                auto_complete_model_config.add_input(input)
        for output in outputs:
            if output['name'] not in output_names:
                auto_complete_model_config.add_output(output)

        auto_complete_model_config.set_dynamic_batching()

        return auto_complete_model_config

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        # `device` was previously undefined here; pick the GPU when available.
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self.model = load("remyxai/SpaceLlama3.1")
        self.model.to(device, dtype=torch.bfloat16)
        print('Initialized...')

    def run_inference(self, prompt, image):
        # The IMAGES tensor carries base64 text; decode it back to raw bytes
        # before handing it to PIL (io.BytesIO requires bytes, not str).
        image = Image.open(io.BytesIO(base64.b64decode(image))).convert("RGB")
        prompt_builder = self.model.get_prompt_builder()
        prompt_builder.add_turn(role="human", message=prompt)
        prompt_text = prompt_builder.get_prompt()

        output_string = self.model.generate(
            image,
            prompt_text,
            do_sample=True,
            temperature=0.1,
            max_new_tokens=512,
            min_length=1,
        )
        output_string = output_string.split("</s>")[0]
        output_data = np.array([output_string.encode('utf-8')], dtype=object)
        return output_data

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """

        responses = []

        for request in requests:
            # Perform inference on the request and append it to the responses list.
            prompt = [
                t.decode("UTF-8")
                for t in pb_utils.get_input_tensor_by_name(request, "PROMPT")
                .as_numpy()
                .tolist()
            ][0]
            image = [
                t.decode("UTF-8")
                for t in pb_utils.get_input_tensor_by_name(request, "IMAGES")
                .as_numpy()
                .tolist()
            ][0]
            results = self.run_inference(prompt, image)

            # Send the result back as a TYPE_STRING tensor.
            inference_response = pb_utils.InferenceResponse(output_tensors=[
                pb_utils.Tensor(
                    "RESULTS",
                    results,
                )
            ])

            responses.append(inference_response)

        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')
docker/models/spacellama3.1/config.pbtxt ADDED
@@ -0,0 +1,24 @@
name: "spacellama3.1"
max_batch_size: 0
backend: "python"

input [
  {
    name: "PROMPT"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "IMAGES"
    data_type: TYPE_STRING
    dims: [ -1 ]
  }
]

output [
  {
    name: "RESULTS"
    data_type: TYPE_STRING
    dims: [ -1 ]
  }
]
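The tensor names and dtypes declared here must line up with what client.py sends: TYPE_STRING on the server corresponds to the BYTES datatype on the client. A minimal sketch of that mapping, assuming a server on localhost:8000:

```
# Each InferInput/InferRequestedOutput below mirrors a tensor declared in
# config.pbtxt; "BYTES" is the tritonclient name for TYPE_STRING.
from tritonclient.http import InferenceServerClient, InferInput, InferRequestedOutput

client = InferenceServerClient(url="localhost:8000")                   # assumed server address
prompt_in = InferInput(name="PROMPT", shape=[1], datatype="BYTES")     # input "PROMPT"
images_in = InferInput(name="IMAGES", shape=[1], datatype="BYTES")     # input "IMAGES"
results_out = InferRequestedOutput(name="RESULTS", binary_data=False)  # output "RESULTS"
```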
run-metrics.jsonl ADDED
@@ -0,0 +1 @@
{"hparams": {"dataset": {"align_stage_components": ["/home/ubuntu/spacellava_data/data/dataset.json", "/home/ubuntu/spacellava_data/data"], "dataset_id": "spacellava", "dataset_root_dir": "/home/ubuntu/spacellava_data/data", "finetune_stage_components": ["/home/ubuntu/spacellava_data/data/dataset.json", "/home/ubuntu/spacellava_data/data"], "type": "spacellava"}, "hf_token": ".hf_token", "model": {"align_epochs": 1, "align_global_batch_size": 4, "align_learning_rate": 0.001, "align_lr_scheduler_type": "linear-warmup+cosine-decay", "align_max_grad_norm": 1.0, "align_max_steps": null, "align_per_device_batch_size": 1, "align_train_strategy": "fsdp-shard-grad-op", "align_warmup_ratio": 0.03, "align_weight_decay": 0.0, "arch_specifier": "no-align+gelu-mlp", "enable_gradient_checkpointing": true, "enable_mixed_precision_training": true, "finetune_epochs": 3, "finetune_global_batch_size": 128, "finetune_learning_rate": 2e-06, "finetune_lr_scheduler_type": "linear-warmup+cosine-decay", "finetune_max_grad_norm": 1.0, "finetune_max_steps": null, "finetune_per_device_batch_size": 4, "finetune_train_strategy": "fsdp-full-shard", "finetune_warmup_ratio": 0.03, "finetune_weight_decay": 0.1, "image_resize_strategy": "letterbox", "llm_backbone_id": "llama3-1-8b-pure", "llm_max_length": 2048, "model_id": "llama3-based-224-4epoch", "reduce_in_full_precision": false, "type": "one-stage+7b", "vision_backbone_id": "dinosiglip-vit-so-224px"}, "pretrained_checkpoint": null, "run_id": "spacellava+llama3-based-224-4epoch+stage-finetune+x7", "run_root_dir": "runs", "seed": 7, "stage": "finetune", "trackers": ["jsonl", "wandb"], "wandb_entity": "smellslikeml", "wandb_project": "prismatic"}, "run_id": "spacellava+llama3-based-224-4epoch+stage-finetune+x7"}
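This single JSON line doubles as a machine-readable record of the run. A minimal sketch for inspecting it, assuming the file sits in the working directory:

```
# Parse the one-line run record and pull out a few of the logged fields.
import json

with open("run-metrics.jsonl") as f:
    record = json.loads(f.readline())

print(record["run_id"])                                      # spacellava+llama3-based-224-4epoch+stage-finetune+x7
print(record["hparams"]["model"]["finetune_epochs"])         # 3
print(record["hparams"]["model"]["finetune_learning_rate"])  # 2e-06
```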
run_inference.py ADDED
@@ -0,0 +1,49 @@
import argparse
import requests
import torch
from PIL import Image
from pathlib import Path
from prismatic import load

def main(model_location, user_prompt, image_source):
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Load a pretrained VLM (either local path, or ID to auto-download from the HF Hub)
    vlm = load(model_location)
    vlm.to(device, dtype=torch.bfloat16)

    # Load the image from URL or local path
    if image_source.startswith("http://") or image_source.startswith("https://"):
        image = Image.open(requests.get(image_source, stream=True).raw).convert("RGB")
    else:
        image = Image.open(image_source).convert("RGB")

    # Build prompt
    prompt_builder = vlm.get_prompt_builder()
    prompt_builder.add_turn(role="human", message=user_prompt)
    prompt_text = prompt_builder.get_prompt()

    # Generate!
    generated_text = vlm.generate(
        image,
        prompt_text,
        do_sample=True,
        temperature=0.1,
        max_new_tokens=512,
        min_length=1,
    )
    generated_text = generated_text.split("</s>")[0]

    print("PROMPT TEXT: ", user_prompt)
    print("GENERATED TEXT: ", generated_text)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process an image and prompt with a pretrained VLM model.")
    parser.add_argument("--model_location", type=str, required=True, help="The location of the pretrained VLM model.")
    parser.add_argument("--user_prompt", type=str, required=True, help="The prompt to process.")
    parser.add_argument("--image_source", type=str, required=True, help="The URL or local path of the image.")

    args = parser.parse_args()

    main(args.model_location, args.user_prompt, args.image_source)
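Calling the script's main() directly illustrates the expected arguments; the prompt and image path below are hypothetical placeholders, while the model ID is the one loaded by the Triton backend above:

```
# Hypothetical invocation of run_inference.main(); only the model ID comes
# from this repo, the prompt and image are placeholders.
from run_inference import main

main(
    model_location="remyxai/SpaceLlama3.1",
    user_prompt="How far is the man from the pallet?",  # placeholder prompt
    image_source="example.png",                         # placeholder image path
)
```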
spacellava+llama3-based-224-4epoch+stage-finetune+x7.jsonl ADDED
The diff for this file is too large to render. See the raw diff.