Commit acbbf71 · committed by alessandro trinca tornidor
1 parent: 4d19eb4
[test] update inference function to also return the output mask, useful for tests (now in the saturncloud test.ipynb notebook)
Changed files:
- README.md +34 -1
- notebooks/test.ipynb +0 -0
- requirements_jupyter.txt +4 -0
- tests/__init__.py +0 -0
- tests/imgs/example1_mask_0.png +3 -0
- tests/test_app_helpers.py +88 -0
- utils/app_helpers.py +26 -23
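The substantive change: the inference closure returned by `get_inference_model_by_args()` now returns three values (overlaid image, raw mask, text) instead of two, so tests can compare the mask directly. A hedged usage sketch; the prompt and image path below are hypothetical placeholders, only the signature comes from this commit:

```python
from utils import app_helpers

args = app_helpers.parse_args([])  # defaults, as exercised by the new tests
inference_fn = app_helpers.get_inference_model_by_args(args)

# hypothetical prompt and image path, for illustration only
output_image, output_mask, output_str = inference_fn(
    "where is the dog in this picture?",
    "/path/to/image.jpg",
)
```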
README.md CHANGED
@@ -7,7 +7,40 @@ sdk: docker
 pinned: false
 ---
 
-
+# exec jupyter on the remote server with port forwarding on localhost
+
+1. checkout the repo, install a venv with jupyter
+2. port forwarding on localhost with the private key: `ssh -i ~/.ssh/id_ecdsa_saturncloud [email protected] -L 8889:localhost:8889 -N -f`
+3. start the jupyter-lab server
+4. connect to the page on localhost
+
+## Commands to work on saturncloud after clone and git lfs install
+```bash
+cd ~/workspace/lisa-on-gpu/
+rm -rf lisa_venv
+python3 -m venv lisa_venv
+ln -s lisa_venv/ venv
+source venv/bin/activate
+pip --version
+which python
+python -m pip install pip wheel --upgrade
+python -m pip install pytest pytest-cov jupyterlab
+python -m pip install -r requirements.txt
+nohup jupyter-lab &
+tail -F nohup.out
+```
+
+# Jupyterlab Howto
+
+To run the `test.ipynb` notebook you should already have:
+- cloned the project https://huggingface.co/spaces/aletrn/lisa-on-gpu with git lfs active
+- created and activated a virtualenv
+- installed the jupyterlab dependencies from requirements_jupyter.txt
+- installed the dependencies from requirements.txt
+
+## Hardware requirements
+- an NVIDIA GPU with 10 or 12 GB of memory (a T4 should suffice)
+- at least 16 GB of system RAM
 
 [](http://103.170.5.190:7860/)
 [](https://openxlab.org.cn/apps/detail/openxlab-app/LISA)
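Before launching the notebook you can sanity-check the GPU requirement from the README above; a minimal sketch assuming torch is already installed via requirements.txt (the check itself is not part of the repo):

```python
import torch

# verify an NVIDIA GPU is visible and report its memory
assert torch.cuda.is_available(), "an NVIDIA GPU is required"
props = torch.cuda.get_device_properties(0)
print(f"GPU: {props.name}, memory: {props.total_memory / 1024 ** 3:.1f} GB")
```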
notebooks/test.ipynb ADDED
The diff for this file is too large to render. See the raw diff.
requirements_jupyter.txt ADDED
@@ -0,0 +1,4 @@
+jupyterlab
+ipywidgets
+pytest
+pytest-cov
tests/__init__.py ADDED
(empty file)
tests/imgs/example1_mask_0.png ADDED
(binary image tracked with Git LFS)
tests/test_app_helpers.py ADDED
@@ -0,0 +1,88 @@
+import logging
+import unittest
+
+
+class TestAppBuilders(unittest.TestCase):
+
+    def test_default_creation(self):
+        from utils import utils
+
+        placeholders = utils.create_placeholder_variables()
+        self.assertIsInstance(placeholders, dict)
+        assert placeholders["no_seg_out"].shape == (512, 512, 3)
+        assert placeholders["error_happened"].shape == (512, 512, 3)
+
+    def test_parse_args(self):
+        from utils import app_helpers
+
+        test_args_parse = app_helpers.parse_args([])
+        assert vars(test_args_parse) == {
+            'version': 'xinlai/LISA-13B-llama2-v1-explanatory',
+            'vis_save_path': './vis_output',
+            'precision': 'fp16',
+            'image_size': 1024,
+            'model_max_length': 512,
+            'lora_r': 8,
+            'vision_tower': 'openai/clip-vit-large-patch14',
+            'local_rank': 0,
+            'load_in_8bit': False,
+            'load_in_4bit': True,
+            'use_mm_start_end': True,
+            'conv_type': 'llava_v1'
+        }
+
+    def test_inference(self):
+        import cv2
+        import numpy as np
+        from utils import app_helpers, constants, utils
+
+        max_diff = 0.02
+
+        logging.info("starting...")
+        logging.warning("Remember: free some memory before running 'get_inference_model_by_args(test_args_parse)' again")
+        test_args_parse = app_helpers.parse_args([])
+        inference_fn = app_helpers.get_inference_model_by_args(test_args_parse)
+        idx_example = 0
+        input_prompt, input_image_path = constants.examples[idx_example]
+        logging.info("running inference function with input prompt '{}'.".format(input_prompt))
+        _, output_mask, output_str = inference_fn(
+            input_prompt,
+            utils.ROOT / input_image_path
+        )
+        logging.info(f"output_str: {output_str}.")
+        expected_mask = cv2.imread(
+            str(utils.ROOT / "tests" / "imgs" / f"example{idx_example}_mask_0.png"),
+            cv2.IMREAD_GRAYSCALE
+        )
+
+        tot = output_mask.size
+        count = np.sum(output_mask != expected_mask)
+        perc = 100 * count / tot
+
+        logging.info(f"percentage of differing pixels: {perc:.2f}%.")
+        try:
+            assert np.array_equal(output_mask, expected_mask)
+        except AssertionError:
+            try:
+                logging.error("failed equality assertion!")
+                logging.info(f"assert now that the percentage diff between ndarrays is less than {max_diff}.")
+                assert perc < max_diff
+            except AssertionError as ae:
+                logging.error("failed all assertions, writing debug files...")
+                import datetime
+                now_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+                output_folder = utils.ROOT / "tests" / "imgs"
+                prefix = f"broken_test_example{idx_example + 1}_{now_str}"
+                cv2.imwrite(
+                    str(output_folder / f"{prefix}.png"),
+                    output_mask
+                )
+                with open(output_folder / f"{prefix}__input_prompt.txt",
+                          "w") as dst:
+                    dst.write(input_prompt)
+                with open(output_folder / f"{prefix}__output_str.txt",
+                          "w") as dst:
+                    dst.write(output_str)
+                logging.info(f"Written files with prefix '{prefix}' in the {output_folder} folder.")
+                raise ae
+        logging.info("end")
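The fallback assertion in `test_inference` above tolerates up to `max_diff` percent of differing pixels between the produced and the expected mask. A self-contained toy version of that check, with illustrative values:

```python
import numpy as np

max_diff = 0.02  # maximum tolerated percentage of differing pixels
expected = np.zeros((512, 512), dtype=np.uint8)
produced = expected.copy()
produced[0, :50] = 255  # flip 50 of the 262144 pixels

perc = 100 * np.sum(produced != expected) / produced.size
print(f"{perc:.4f}% of pixels differ")  # ~0.0191%, within the 0.02% budget
assert perc < max_diff
```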
utils/app_helpers.py CHANGED
@@ -17,7 +17,6 @@ from model.llava import conversation as conversation_lib
 from model.llava.mm_utils import tokenizer_image_token
 from model.segment_anything.utils.transforms import ResizeLongestSide
 
-
 placeholders = utils.create_placeholder_variables()
 
 
@@ -96,10 +95,10 @@ def set_image_precision_by_args(input_image, precision):
 
 @session_logger.set_uuid_logging
 def preprocess(
-
-
-
-
+        x,
+        pixel_mean=torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1),
+        pixel_std=torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1),
+        img_size=1024,
 ) -> torch.Tensor:
     """Normalize pixel values and pad to a square input."""
     logging.info("preprocess started")
@@ -161,7 +160,8 @@ def get_model(args_to_parse):
         }
     )
     _model = LISAForCausalLM.from_pretrained(
-        args_to_parse.version, low_cpu_mem_usage=True, vision_tower=args_to_parse.vision_tower,
+        args_to_parse.version, low_cpu_mem_usage=True, vision_tower=args_to_parse.vision_tower,
+        seg_token_idx=args_to_parse.seg_token_idx, **kwargs
     )
     _model.config.eos_token_id = _tokenizer.eos_token_id
     _model.config.bos_token_id = _tokenizer.bos_token_id
@@ -207,7 +207,6 @@ def get_inference_model_by_args(args_to_parse):
     @session_logger.set_uuid_logging
     def inference(input_str, input_image_pathname):
         ## filter out special chars
-
         input_str = get_cleaned_input(input_str)
         logging.info(f"input_str type: {type(input_str)}, input_image type: {type(input_image_pathname)}.")
         logging.info(f"input_str: {input_str}, input_image: {type(input_image_pathname)}.")
@@ -225,7 +224,7 @@ def get_inference_model_by_args(args_to_parse):
         prompt = utils.DEFAULT_IMAGE_TOKEN + "\n" + prompt
         if args_to_parse.use_mm_start_end:
             replace_token = (
-
+                utils.DEFAULT_IM_START_TOKEN + utils.DEFAULT_IMAGE_TOKEN + utils.DEFAULT_IM_END_TOKEN
             )
             prompt = prompt.replace(utils.DEFAULT_IMAGE_TOKEN, replace_token)
@@ -276,25 +275,28 @@ def get_inference_model_by_args(args_to_parse):
         text_output = text_output.replace("\n", "").replace("  ", " ")
         text_output = text_output.split("ASSISTANT: ")[-1]
 
-        logging.info(
-
+        logging.info(
+            f"found n {len(pred_masks)} prediction masks, "
+            f"text_output type: {type(text_output)}, text_output: {text_output}."
+        )
+        output_image = no_seg_out
+        output_mask = no_seg_out
         for i, pred_mask in enumerate(pred_masks):
-            if pred_mask.shape[0] == 0:
+            if pred_mask.shape[0] == 0 or pred_mask.shape[1] == 0:
                 continue
-
             pred_mask = pred_mask.detach().cpu().numpy()[0]
-
+            pred_mask_bool = pred_mask > 0
+            output_mask = pred_mask_bool.astype(np.uint8) * 255
 
-
-
+            output_image = image_np.copy()
+            output_image[pred_mask_bool] = (
                 image_np * 0.5
-                +
-            )[
+                + pred_mask_bool[:, :, None].astype(np.uint8) * np.array([255, 0, 0]) * 0.5
+            )[pred_mask_bool]
 
-        output_str = f"ASSISTANT: {text_output}"
-        output_image
-
-        return output_image, output_str
+        output_str = f"ASSISTANT: {text_output} ..."
+        logging.info(f"output_image type: {type(output_mask)}.")
+        return output_image, output_mask, output_str
 
     logging.info("prepared inference function!")
     return inference
@@ -303,7 +305,7 @@ def get_inference_model_by_args(args_to_parse):
 @session_logger.set_uuid_logging
 def get_gradio_interface(
         fn_inference: Callable
-
+):
     return gr.Interface(
         fn_inference,
         inputs=[
@@ -311,7 +313,8 @@ def get_gradio_interface(
             gr.Image(type="filepath", label="Input Image")
         ],
         outputs=[
-            gr.Image(type="pil", label="
+            gr.Image(type="pil", label="segmentation Output"),
+            gr.Image(type="pil", label="mask Output"),
             gr.Textbox(lines=1, placeholder=None, label="Text Output")
         ],
         title=constants.title,
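For context, the `preprocess()` defaults introduced in the second hunk are the SAM-style pixel statistics, followed by zero-padding up to a square `img_size`. A minimal standalone sketch of that behavior (an assumed equivalent, not the repo's exact body):

```python
import torch
import torch.nn.functional as F


def preprocess_sketch(
        x: torch.Tensor,  # CHW image tensor with values in [0, 255]
        pixel_mean=torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1),
        pixel_std=torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1),
        img_size=1024,
) -> torch.Tensor:
    x = (x - pixel_mean) / pixel_std  # normalize pixel values
    h, w = x.shape[-2:]
    return F.pad(x, (0, img_size - w, 0, img_size - h))  # pad right/bottom to a square


print(preprocess_sketch(torch.rand(3, 600, 800) * 255).shape)  # torch.Size([3, 1024, 1024])
```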
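The overlay arithmetic in the last inference hunk blends each masked pixel 50/50 with pure red while leaving unmasked pixels untouched. A toy check of that blend on a hypothetical 2x2 gray image:

```python
import numpy as np

image_np = np.full((2, 2, 3), 200, dtype=np.uint8)          # uniform gray image
pred_mask_bool = np.array([[True, False], [False, False]])  # mask selects one pixel

overlay = (
    image_np * 0.5
    + pred_mask_bool[:, :, None].astype(np.uint8) * np.array([255, 0, 0]) * 0.5
)
output_image = image_np.copy()
output_image[pred_mask_bool] = overlay[pred_mask_bool]
print(output_image[0, 0])  # [227 100 100]: half original gray, half pure red
print(output_image[1, 1])  # [200 200 200]: unmasked pixels are unchanged
```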