smellslikeml committed
Commit · 6199e94 · 1 Parent(s): 4fa8cde

adding weights

Browse files:
- README.md +1 -1
- config.json +54 -37
- config.yaml +52 -0
- docker/Dockerfile +32 -0
- docker/client.py +61 -0
- docker/models/spacellama3.1/1/model.py +169 -0
- docker/models/spacellama3.1/config.pbtxt +24 -0
- run-metrics.jsonl +1 -0
- run_inference.py +49 -0
- spacellava+llama3-based-224-4epoch+stage-finetune+x7.jsonl +0 -0
README.md CHANGED
@@ -47,4 +47,4 @@ With a pipeline of expert models, we can infer spatial relationships between obj
   booktitle = {International Conference on Machine Learning (ICML)},
   year = {2024},
 }
-```
+```
config.json CHANGED
@@ -1,42 +1,59 @@
 {
-  [removed: the previous model-config entries are truncated in the diff view; only the final keys survive]
-  "use_fused_vision_backbone": true,
-  "vision_backbone_id": "dinosiglip-vit-so-224px",
-  "vocab_size": 32001
+  "dataset": {
+    "align_stage_components": [
+      "/home/ubuntu/spacellava_data/data/dataset.json",
+      "/home/ubuntu/spacellava_data/data"
+    ],
+    "dataset_id": "spacellava",
+    "dataset_root_dir": "/home/ubuntu/spacellava_data/data",
+    "finetune_stage_components": [
+      "/home/ubuntu/spacellava_data/data/dataset.json",
+      "/home/ubuntu/spacellava_data/data"
+    ],
+    "type": "spacellava"
+  },
+  "hf_token": ".hf_token",
+  "model": {
+    "align_epochs": 1,
+    "align_global_batch_size": 4,
+    "align_learning_rate": 0.001,
+    "align_lr_scheduler_type": "linear-warmup+cosine-decay",
+    "align_max_grad_norm": 1.0,
+    "align_max_steps": null,
+    "align_per_device_batch_size": 1,
+    "align_train_strategy": "fsdp-shard-grad-op",
+    "align_warmup_ratio": 0.03,
+    "align_weight_decay": 0.0,
+    "arch_specifier": "no-align+gelu-mlp",
+    "enable_gradient_checkpointing": true,
+    "enable_mixed_precision_training": true,
+    "finetune_epochs": 3,
+    "finetune_global_batch_size": 128,
+    "finetune_learning_rate": 2e-06,
+    "finetune_lr_scheduler_type": "linear-warmup+cosine-decay",
+    "finetune_max_grad_norm": 1.0,
+    "finetune_max_steps": null,
+    "finetune_per_device_batch_size": 4,
+    "finetune_train_strategy": "fsdp-full-shard",
+    "finetune_warmup_ratio": 0.03,
+    "finetune_weight_decay": 0.1,
+    "image_resize_strategy": "letterbox",
+    "llm_backbone_id": "llama3-1-8b-pure",
+    "llm_max_length": 2048,
+    "model_id": "llama3-based-224-4epoch",
+    "reduce_in_full_precision": false,
+    "type": "one-stage+7b",
+    "vision_backbone_id": "dinosiglip-vit-so-224px"
+  },
+  "pretrained_checkpoint": null,
+  "run_id": "spacellava+llama3-based-224-4epoch+stage-finetune+x7",
+  "run_root_dir": "runs",
+  "seed": 7,
+  "stage": "finetune",
+  "trackers": [
+    "jsonl",
+    "wandb"
+  ],
+  "wandb_entity": "smellslikeml",
+  "wandb_project": "prismatic"
 }
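Note that the rewritten config.json no longer holds the old model-style configuration (the removed file carried keys such as "use_fused_vision_backbone" and "vocab_size"); it now records the Prismatic training configuration for this run, mirroring config.yaml below and the "hparams" blob in run-metrics.jsonl.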
config.yaml ADDED
@@ -0,0 +1,52 @@
+dataset:
+  align_stage_components:
+  - /home/ubuntu/spacellava_data/data/dataset.json
+  - /home/ubuntu/spacellava_data/data
+  dataset_id: spacellava
+  dataset_root_dir: /home/ubuntu/spacellava_data/data
+  finetune_stage_components:
+  - /home/ubuntu/spacellava_data/data/dataset.json
+  - /home/ubuntu/spacellava_data/data
+  type: spacellava
+hf_token: .hf_token
+model:
+  align_epochs: 1
+  align_global_batch_size: 4
+  align_learning_rate: 0.001
+  align_lr_scheduler_type: linear-warmup+cosine-decay
+  align_max_grad_norm: 1.0
+  align_max_steps: null
+  align_per_device_batch_size: 1
+  align_train_strategy: fsdp-shard-grad-op
+  align_warmup_ratio: 0.03
+  align_weight_decay: 0.0
+  arch_specifier: no-align+gelu-mlp
+  enable_gradient_checkpointing: true
+  enable_mixed_precision_training: true
+  finetune_epochs: 3
+  finetune_global_batch_size: 128
+  finetune_learning_rate: 2.0e-06
+  finetune_lr_scheduler_type: linear-warmup+cosine-decay
+  finetune_max_grad_norm: 1.0
+  finetune_max_steps: null
+  finetune_per_device_batch_size: 4
+  finetune_train_strategy: fsdp-full-shard
+  finetune_warmup_ratio: 0.03
+  finetune_weight_decay: 0.1
+  image_resize_strategy: letterbox
+  llm_backbone_id: llama3-1-8b-pure
+  llm_max_length: 2048
+  model_id: llama3-based-224-4epoch
+  reduce_in_full_precision: false
+  type: one-stage+7b
+  vision_backbone_id: dinosiglip-vit-so-224px
+pretrained_checkpoint: null
+run_id: spacellava+llama3-based-224-4epoch+stage-finetune+x7
+run_root_dir: runs
+seed: 7
+stage: finetune
+trackers:
+- jsonl
+- wandb
+wandb_entity: smellslikeml
+wandb_project: prismatic
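If you want to inspect the run configuration programmatically, a minimal sketch (assumes PyYAML is installed; any YAML loader works):

    import yaml  # assumes PyYAML

    with open("config.yaml") as f:
        cfg = yaml.safe_load(f)

    # a few of the finetune hyperparameters recorded above
    print(cfg["model"]["finetune_learning_rate"])      # 2e-06
    print(cfg["model"]["finetune_global_batch_size"])  # 128
    print(cfg["dataset"]["dataset_id"])                # spacellava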
docker/Dockerfile ADDED
@@ -0,0 +1,32 @@
+FROM nvcr.io/nvidia/tritonserver:22.11-py3
+
+WORKDIR /workspace
+
+RUN apt-get update && apt-get install cmake -y
+
+RUN pip install --upgrade pip && pip install --upgrade tensorrt
+
+RUN git clone https://github.com/NVIDIA/TensorRT.git -b main --single-branch \
+    && cd TensorRT \
+    && git submodule update --init --recursive
+
+ENV TRT_OSSPATH=/workspace/TensorRT
+WORKDIR ${TRT_OSSPATH}
+
+RUN mkdir -p build \
+    && cd build \
+    && cmake .. -DTRT_OUT_DIR=$PWD/out \
+    && cd plugin \
+    && make -j$(nproc)
+
+ENV PLUGIN_LIBS="${TRT_OSSPATH}/build/out/libnvinfer_plugin.so"
+
+RUN python3 -m pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
+RUN git clone https://github.com/remyxai/prismatic-vlms.git && cd prismatic-vlms && python3 -m pip install .
+RUN python3 -m pip install --upgrade transformers
+
+WORKDIR /models
+COPY ./models/ .
+
+WORKDIR /workspace
+CMD ["tritonserver", "--model-store=/models"]
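Assuming the usual Triton port layout, the image can be built and started with, for example, `docker build -t spacellama-triton .` from the `docker/` directory, then `docker run --gpus all --rm -p 8000:8000 -p 8001:8001 -p 8002:8002 spacellama-triton` (8000 HTTP, 8001 gRPC, 8002 metrics); the `CMD` above serves every model found under `/models`.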
docker/client.py ADDED
@@ -0,0 +1,61 @@
+import argparse
+import time
+import base64
+import numpy as np
+import requests
+import os
+from urllib.parse import urlparse
+from tritonclient.http import InferenceServerClient, InferInput, InferRequestedOutput
+
+def download_image(image_url):
+    parsed_url = urlparse(image_url)
+    filename = os.path.basename(parsed_url.path)
+    response = requests.get(image_url)
+    if response.status_code == 200:
+        with open(filename, 'wb') as img_file:
+            img_file.write(response.content)
+        return filename
+    else:
+        raise Exception("Failed to download image")
+
+def image_to_base64_data_uri(image_input):
+    with open(image_input, "rb") as img_file:
+        base64_data = base64.b64encode(img_file.read()).decode('utf-8')
+        return base64_data
+
+def setup_argparse():
+    parser = argparse.ArgumentParser(description="Client for Triton Inference Server")
+    parser.add_argument("--image_path", type=str, required=True, help="Path to the image or URL of the image to process")
+    parser.add_argument("--prompt", type=str, required=True, help="Prompt to be used for the inference")
+    return parser.parse_args()
+
+if __name__ == "__main__":
+    args = setup_argparse()
+
+    triton_client = InferenceServerClient(url="localhost:8000", verbose=False)
+
+    if args.image_path.startswith('http://') or args.image_path.startswith('https://'):
+        image_path = download_image(args.image_path)
+    else:
+        image_path = args.image_path
+
+    image_data = image_to_base64_data_uri(image_path).encode('utf-8')
+    image_data_np = np.array([image_data], dtype=object)
+    prompt_np = np.array([args.prompt.encode('utf-8')], dtype=object)
+
+    images_in = InferInput(name="IMAGES", shape=[1], datatype="BYTES")
+    images_in.set_data_from_numpy(image_data_np, binary_data=True)
+    prompt_in = InferInput(name="PROMPT", shape=[1], datatype="BYTES")
+    prompt_in.set_data_from_numpy(prompt_np, binary_data=True)
+
+    results_out = InferRequestedOutput(name="RESULTS", binary_data=False)
+
+    start_time = time.time()
+    response = triton_client.infer(model_name="spacellama3.1",
+                                   model_version="1",
+                                   inputs=[prompt_in, images_in],
+                                   outputs=[results_out])
+
+    results = response.get_response()["outputs"][0]["data"][0]
+    print("--- %s seconds ---" % (time.time() - start_time))
+    print(results)
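With the server up, a single request looks like, for example, `python docker/client.py --image_path https://example.com/kitchen.jpg --prompt "How far is the chair from the counter?"` (URL and prompt here are illustrative); the script prints the wall-clock latency followed by the generated answer.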
docker/models/spacellama3.1/1/model.py ADDED
@@ -0,0 +1,169 @@
+import os
+import io
+import base64
+import torch
+import numpy as np
+import triton_python_backend_utils as pb_utils
+from prismatic import load
+from PIL import Image
+
+class TritonPythonModel:
+    """Your Python model must use the same class name. Every Python model
+    that is created must have "TritonPythonModel" as the class name.
+    """
+
+    @staticmethod
+    def auto_complete_config(auto_complete_model_config):
+        """`auto_complete_config` is called only once when loading the model,
+        assuming the server was not started with
+        `--disable-auto-complete-config`. Implementing this function is
+        optional; omitting it leaves the configuration unchanged.
+        This function can be used to set `max_batch_size`, `input` and `output`
+        properties of the model using `set_max_batch_size`, `add_input`, and
+        `add_output`. These properties will allow Triton to load the model with
+        minimal model configuration in absence of a configuration file. This
+        function returns the `pb_utils.ModelConfig` object with these
+        properties. You can use the `as_dict` function to gain read-only access
+        to the `pb_utils.ModelConfig` object. The `pb_utils.ModelConfig` object
+        being returned from here will be used as the final configuration for
+        the model.
+
+        Note: The Python interpreter used to invoke this function will be
+        destroyed upon returning from this function and as a result none of the
+        objects created here will be available in the `initialize`, `execute`,
+        or `finalize` functions.
+
+        Parameters
+        ----------
+        auto_complete_model_config : pb_utils.ModelConfig
+            An object containing the existing model configuration. You can build
+            upon the configuration given by this object when setting the
+            properties for this model.
+
+        Returns
+        -------
+        pb_utils.ModelConfig
+            An object containing the auto-completed model configuration
+        """
+        inputs = [{
+            'name': 'PROMPT',
+            'data_type': 'TYPE_STRING',
+            'dims': [-1]
+        }, {
+            'name': 'IMAGES',
+            'data_type': 'TYPE_STRING',  # changed from TYPE_FP16 to TYPE_STRING
+            'dims': [-1]  # changed to indicate a variable-length array of strings
+        }]
+
+        outputs = [{
+            'name': 'RESULTS',
+            'data_type': 'TYPE_STRING',
+            'dims': [-1]
+        }]
+
+        config = auto_complete_model_config.as_dict()
+        input_names = []
+        output_names = []
+        for input in config['input']:
+            input_names.append(input['name'])
+        for output in config['output']:
+            output_names.append(output['name'])
+
+        for input in inputs:
+            if input['name'] not in input_names:
+                auto_complete_model_config.add_input(input)
+        for output in outputs:
+            if output['name'] not in output_names:
+                auto_complete_model_config.add_output(output)
+
+        auto_complete_model_config.set_dynamic_batching()
+
+        return auto_complete_model_config
+
+    def initialize(self, args):
+        """`initialize` is called only once when the model is being loaded.
+        Implementing `initialize` function is optional. This function allows
+        the model to initialize any state associated with this model.
+
+        Parameters
+        ----------
+        args : dict
+          Both keys and values are strings. The dictionary keys and values are:
+          * model_config: A JSON string containing the model configuration
+          * model_instance_kind: A string containing model instance kind
+          * model_instance_device_id: A string containing model instance device
+            ID
+          * model_repository: Model repository path
+          * model_version: Model version
+          * model_name: Model name
+        """
+        # `device` was previously undefined here (a NameError at load time);
+        # select it the same way run_inference.py does.
+        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+        self.model = load("remyxai/SpaceLlama3.1")
+        self.model.to(device, dtype=torch.bfloat16)
+        print('Initialized...')
+
+    def run_inference(self, prompt, image):
+        # The client sends the image as a base64 string, so decode it back to
+        # raw bytes before handing it to PIL (io.BytesIO needs bytes, not str).
+        image = Image.open(io.BytesIO(base64.b64decode(image))).convert("RGB")
+        prompt_builder = self.model.get_prompt_builder()
+        prompt_builder.add_turn(role="human", message=prompt)
+        prompt_text = prompt_builder.get_prompt()
+
+        output_string = self.model.generate(
+            image,
+            prompt_text,
+            do_sample=True,
+            temperature=0.1,
+            max_new_tokens=512,
+            min_length=1,
+        )
+        output_string = output_string.split("</s>")[0]
+        output_data = np.array([output_string.encode('utf-8')], dtype=object)
+        return output_data
+
+    def execute(self, requests):
+        """`execute` must be implemented in every Python model. `execute`
+        function receives a list of pb_utils.InferenceRequest as the only
+        argument. This function is called when an inference is requested
+        for this model.
+
+        Parameters
+        ----------
+        requests : list
+          A list of pb_utils.InferenceRequest
+
+        Returns
+        -------
+        list
+          A list of pb_utils.InferenceResponse. The length of this list must
+          be the same as `requests`
+        """
+
+        responses = []
+
+        for request in requests:
+            # Perform inference on the request and append it to responses
+            # list...
+            prompt = [
+                t.decode("UTF-8")
+                for t in pb_utils.get_input_tensor_by_name(request, "PROMPT")
+                .as_numpy()
+                .tolist()
+            ][0]
+            image = [
+                t.decode("UTF-8")
+                for t in pb_utils.get_input_tensor_by_name(request, "IMAGES")
+                .as_numpy()
+                .tolist()
+            ][0]
+            results = self.run_inference(prompt, image)
+
+            # Sending results
+            inference_response = pb_utils.InferenceResponse(output_tensors=[
+                pb_utils.Tensor(
+                    "RESULTS",
+                    results,
+                )
+            ])
+
+            responses.append(inference_response)
+
+        return responses
+
+    def finalize(self):
+        """`finalize` is called only once when the model is being unloaded.
+        Implementing `finalize` function is optional. This function allows
+        the model to perform any necessary clean ups before exit.
+        """
+        print('Cleaning up...')
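The only contract between client.py and model.py is the base64 text carried on the IMAGES tensor. A minimal sketch of that round-trip with Triton taken out of the picture (demo.jpg is a stand-in path):

    import base64
    import io
    from PIL import Image

    # client side: raw file bytes -> base64 text
    with open("demo.jpg", "rb") as f:
        payload = base64.b64encode(f.read()).decode("utf-8")

    # server side (run_inference above): base64 text -> bytes -> PIL image
    image = Image.open(io.BytesIO(base64.b64decode(payload))).convert("RGB")
    print(image.size)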
docker/models/spacellama3.1/config.pbtxt ADDED
@@ -0,0 +1,24 @@
+name: "spacellama3.1"
+max_batch_size: 0
+backend: "python"
+
+input [
+  {
+    name: "PROMPT"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  },
+  {
+    name: "IMAGES"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  }
+]
+
+output [
+  {
+    name: "RESULTS"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  }
+]
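Because `max_batch_size` is 0, Triton treats each `dims: [ -1 ]` as the full tensor shape with no implicit batch dimension, so every request carries exactly one prompt/image pair. This explicit config.pbtxt should also take precedence over the defaults that `auto_complete_config` in model.py would fill in; auto-completion only applies where the configuration file is absent or incomplete.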
run-metrics.jsonl ADDED
@@ -0,0 +1 @@
+{"hparams": {"dataset": {"align_stage_components": ["/home/ubuntu/spacellava_data/data/dataset.json", "/home/ubuntu/spacellava_data/data"], "dataset_id": "spacellava", "dataset_root_dir": "/home/ubuntu/spacellava_data/data", "finetune_stage_components": ["/home/ubuntu/spacellava_data/data/dataset.json", "/home/ubuntu/spacellava_data/data"], "type": "spacellava"}, "hf_token": ".hf_token", "model": {"align_epochs": 1, "align_global_batch_size": 4, "align_learning_rate": 0.001, "align_lr_scheduler_type": "linear-warmup+cosine-decay", "align_max_grad_norm": 1.0, "align_max_steps": null, "align_per_device_batch_size": 1, "align_train_strategy": "fsdp-shard-grad-op", "align_warmup_ratio": 0.03, "align_weight_decay": 0.0, "arch_specifier": "no-align+gelu-mlp", "enable_gradient_checkpointing": true, "enable_mixed_precision_training": true, "finetune_epochs": 3, "finetune_global_batch_size": 128, "finetune_learning_rate": 2e-06, "finetune_lr_scheduler_type": "linear-warmup+cosine-decay", "finetune_max_grad_norm": 1.0, "finetune_max_steps": null, "finetune_per_device_batch_size": 4, "finetune_train_strategy": "fsdp-full-shard", "finetune_warmup_ratio": 0.03, "finetune_weight_decay": 0.1, "image_resize_strategy": "letterbox", "llm_backbone_id": "llama3-1-8b-pure", "llm_max_length": 2048, "model_id": "llama3-based-224-4epoch", "reduce_in_full_precision": false, "type": "one-stage+7b", "vision_backbone_id": "dinosiglip-vit-so-224px"}, "pretrained_checkpoint": null, "run_id": "spacellava+llama3-based-224-4epoch+stage-finetune+x7", "run_root_dir": "runs", "seed": 7, "stage": "finetune", "trackers": ["jsonl", "wandb"], "wandb_entity": "smellslikeml", "wandb_project": "prismatic"}, "run_id": "spacellava+llama3-based-224-4epoch+stage-finetune+x7"}
run_inference.py ADDED
@@ -0,0 +1,49 @@
+import argparse
+import requests
+import torch
+from PIL import Image
+from pathlib import Path
+from prismatic import load
+
+def main(model_location, user_prompt, image_source):
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+    # Load a pretrained VLM (either local path, or ID to auto-download from the HF Hub)
+    vlm = load(model_location)
+    vlm.to(device, dtype=torch.bfloat16)
+
+    # Load the image from URL or local path
+    if image_source.startswith("http://") or image_source.startswith("https://"):
+        image = Image.open(requests.get(image_source, stream=True).raw).convert("RGB")
+    else:
+        image = Image.open(image_source).convert("RGB")
+
+    # Build prompt
+    prompt_builder = vlm.get_prompt_builder()
+    prompt_builder.add_turn(role="human", message=user_prompt)
+    prompt_text = prompt_builder.get_prompt()
+
+    # Generate!
+    generated_text = vlm.generate(
+        image,
+        prompt_text,
+        do_sample=True,
+        temperature=0.1,
+        max_new_tokens=512,
+        min_length=1,
+    )
+    generated_text = generated_text.split("</s>")[0]
+
+    print("PROMPT TEXT: ", user_prompt)
+    print("GENERATED TEXT: ", generated_text)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Process an image and prompt with a pretrained VLM model.")
+    parser.add_argument("--model_location", type=str, required=True, help="The location of the pretrained VLM model.")
+    parser.add_argument("--user_prompt", type=str, required=True, help="The prompt to process.")
+    parser.add_argument("--image_source", type=str, required=True, help="The URL or local path of the image.")
+
+    args = parser.parse_args()
+
+    main(args.model_location, args.user_prompt, args.image_source)
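For a local smoke test outside Triton, for example: `python run_inference.py --model_location remyxai/SpaceLlama3.1 --user_prompt "Describe the spatial layout" --image_source ./scene.jpg` (prompt and image path illustrative); the checkpoint is downloaded from the HF Hub on first use.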
spacellava+llama3-based-224-4epoch+stage-finetune+x7.jsonl ADDED
The diff for this file is too large to render. See raw diff.