Files changed
- app.py +170 -50
- clipseg/LICENSE +21 -0
- clipseg/Quickstart.ipynb +107 -0
- clipseg/Readme.md +84 -0
- clipseg/Tables.ipynb +349 -0
- clipseg/Visual_Feature_Engineering.ipynb +366 -0
- clipseg/datasets/coco_wrapper.py +99 -0
- clipseg/datasets/pascal_classes.json +1 -0
- clipseg/datasets/pascal_zeroshot.py +60 -0
- clipseg/datasets/pfe_dataset.py +129 -0
- clipseg/datasets/phrasecut.py +335 -0
- clipseg/datasets/utils.py +68 -0
- clipseg/environment.yml +15 -0
- clipseg/evaluation_utils.py +292 -0
- clipseg/example_image.jpg +0 -0
- clipseg/experiments/ablation.yaml +84 -0
- clipseg/experiments/coco.yaml +101 -0
- clipseg/experiments/pascal_1shot.yaml +101 -0
- clipseg/experiments/phrasecut.yaml +80 -0
- clipseg/general_utils.py +272 -0
- clipseg/metrics.py +271 -0
- clipseg/models/clipseg.py +552 -0
- clipseg/models/vitseg.py +286 -0
- clipseg/overview.png +0 -0
- clipseg/score.py +453 -0
- clipseg/setup.py +30 -0
- clipseg/training.py +266 -0
- clipseg/weights/rd64-uni.pth +3 -0
- init_image.png +0 -0
- inpainting.py +194 -0
- mask_image.png +0 -0
    	
        app.py
    CHANGED
    
@@ -1,54 +1,174 @@
-from diffusers import StableDiffusionInpaintPipeline
 import gradio as gr
-
-import imageio
-from PIL import Image
+
 from io import BytesIO
+import requests
+import PIL
+from PIL import Image
+import numpy as np
 import os
[old lines 8-54 of the previous app.py were removed; their content is not recoverable from this view]
+import uuid
+import torch
+from torch import autocast
+import cv2
+from matplotlib import pyplot as plt
+from inpainting import StableDiffusionInpaintingPipeline
+from torchvision import transforms
+from clipseg.models.clipseg import CLIPDensePredT
+
+auth_token = os.environ.get("API_TOKEN") or True
+
+def download_image(url):
+    response = requests.get(url)
+    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+pipe = StableDiffusionInpaintingPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    revision="fp16",
+    torch_dtype=torch.float16,
+    use_auth_token=auth_token,
+).to(device)
+
+model = CLIPDensePredT(version='ViT-B/16', reduce_dim=64)
+model.eval()
+model.load_state_dict(torch.load('./clipseg/weights/rd64-uni.pth', map_location=torch.device('cuda')), strict=False)
+
+transform = transforms.Compose([
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+    transforms.Resize((512, 512)),
+])
+
+def predict(radio, dict, word_mask, prompt=""):
+    if(radio == "draw a mask above"):
+        with autocast("cuda"):
+            init_image = dict["image"].convert("RGB").resize((512, 512))
+            mask = dict["mask"].convert("RGB").resize((512, 512))
+    else:
+        img = transform(dict["image"]).unsqueeze(0)
+        word_masks = [word_mask]
+        with torch.no_grad():
+            preds = model(img.repeat(len(word_masks),1,1,1), word_masks)[0]
+        init_image = dict['image'].convert('RGB').resize((512, 512))
+        filename = f"{uuid.uuid4()}.png"
+        plt.imsave(filename,torch.sigmoid(preds[0][0]))
+        img2 = cv2.imread(filename)
+        gray_image = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)
+        (thresh, bw_image) = cv2.threshold(gray_image, 100, 255, cv2.THRESH_BINARY)
+        cv2.cvtColor(bw_image, cv2.COLOR_BGR2RGB)
+        mask = Image.fromarray(np.uint8(bw_image)).convert('RGB')
+        os.remove(filename)
+    with autocast("cuda"):
+        images = pipe(prompt = prompt, init_image=init_image, mask_image=mask, strength=0.8)["sample"]
+    return images[0]
+
+# examples = [[dict(image="init_image.png", mask="mask_image.png"), "A panda sitting on a bench"]]
+css = '''
+.container {max-width: 1150px;margin: auto;padding-top: 1.5rem}
+#image_upload{min-height:400px}
+#image_upload [data-testid="image"], #image_upload [data-testid="image"] > div{min-height: 400px}
+#mask_radio .gr-form{background:transparent; border: none}
+#word_mask{margin-top: .75em !important}
+#word_mask textarea:disabled{opacity: 0.3}
+.footer {margin-bottom: 45px;margin-top: 35px;text-align: center;border-bottom: 1px solid #e5e5e5}
+.footer>p {font-size: .8rem; display: inline-block; padding: 0 10px;transform: translateY(10px);background: white}
+.dark .footer {border-color: #303030}
+.dark .footer>p {background: #0b0f19}
+.acknowledgments h4{margin: 1.25em 0 .25em 0;font-weight: bold;font-size: 115%}
+#image_upload .touch-none{display: flex}
+'''
+def swap_word_mask(radio_option):
+    if(radio_option == "type what to mask below"):
+        return gr.update(interactive=True, placeholder="A cat")
+    else:
+        return gr.update(interactive=False, placeholder="Disabled")
+
+image_blocks = gr.Blocks(css=css)
+with image_blocks as demo:
+    gr.HTML(
+        """
+            <div style="text-align: center; max-width: 650px; margin: 0 auto;">
+              <div
+                style="
+                  display: inline-flex;
+                  align-items: center;
+                  gap: 0.8rem;
+                  font-size: 1.75rem;
+                "
+              >
+                <svg
+                  width="0.65em"
+                  height="0.65em"
+                  viewBox="0 0 115 115"
+                  fill="none"
+                  xmlns="http://www.w3.org/2000/svg"
+                >
+                  <rect width="23" height="23" fill="white"></rect>
+                  <rect y="69" width="23" height="23" fill="white"></rect>
+                  <rect x="23" width="23" height="23" fill="#AEAEAE"></rect>
+                  <rect x="23" y="69" width="23" height="23" fill="#AEAEAE"></rect>
+                  <rect x="46" width="23" height="23" fill="white"></rect>
+                  <rect x="46" y="69" width="23" height="23" fill="white"></rect>
+                  <rect x="69" width="23" height="23" fill="black"></rect>
+                  <rect x="69" y="69" width="23" height="23" fill="black"></rect>
+                  <rect x="92" width="23" height="23" fill="#D9D9D9"></rect>
+                  <rect x="92" y="69" width="23" height="23" fill="#AEAEAE"></rect>
+                  <rect x="115" y="46" width="23" height="23" fill="white"></rect>
+                  <rect x="115" y="115" width="23" height="23" fill="white"></rect>
+                  <rect x="115" y="69" width="23" height="23" fill="#D9D9D9"></rect>
+                  <rect x="92" y="46" width="23" height="23" fill="#AEAEAE"></rect>
+                  <rect x="92" y="115" width="23" height="23" fill="#AEAEAE"></rect>
+                  <rect x="92" y="69" width="23" height="23" fill="white"></rect>
+                  <rect x="69" y="46" width="23" height="23" fill="white"></rect>
+                  <rect x="69" y="115" width="23" height="23" fill="white"></rect>
+                  <rect x="69" y="69" width="23" height="23" fill="#D9D9D9"></rect>
+                  <rect x="46" y="46" width="23" height="23" fill="black"></rect>
+                  <rect x="46" y="115" width="23" height="23" fill="black"></rect>
+                  <rect x="46" y="69" width="23" height="23" fill="black"></rect>
+                  <rect x="23" y="46" width="23" height="23" fill="#D9D9D9"></rect>
+                  <rect x="23" y="115" width="23" height="23" fill="#AEAEAE"></rect>
+                  <rect x="23" y="69" width="23" height="23" fill="black"></rect>
+                </svg>
+                <h1 style="font-weight: 900; margin-bottom: 7px;">
+                  Stable Diffusion Multi Inpainting
+                </h1>
+              </div>
+              <p style="margin-bottom: 10px; font-size: 94%">
+                Inpaint Stable Diffusion by either drawing a mask or typing what to replace
+              </p>
+            </div>
+        """
+    )
+    with gr.Row():
+        with gr.Column():
+            image = gr.Image(source='upload', tool='sketch', elem_id="image_upload", type="pil", label="Upload").style(height=400)
+            with gr.Box(elem_id="mask_radio").style(border=False):
+                radio = gr.Radio(["draw a mask above", "type what to mask below"], value="draw a mask above", show_label=False, interactive=True).style(container=False)
+                word_mask = gr.Textbox(label = "What to find in your image", interactive=False, elem_id="word_mask", placeholder="Disabled").style(container=False)
+            prompt = gr.Textbox(label = 'Your prompt (what you want to add in place of what you are removing)')
+            radio.change(fn=swap_word_mask, inputs=radio, outputs=word_mask,show_progress=False)
+            radio.change(None, inputs=[], outputs=image_blocks, _js = """
+            () => {
+                css_style = document.styleSheets[document.styleSheets.length - 1]
+                last_item = css_style.cssRules[css_style.cssRules.length - 1]
+                last_item.style.display = ["flex", ""].includes(last_item.style.display) ? "none" : "flex";
+            }""")
+            btn = gr.Button("Run")
+        with gr.Column():
+            result = gr.Image(label="Result")
+        btn.click(fn=predict, inputs=[radio, image, word_mask, prompt], outputs=result)
+    gr.HTML(
+            """
+                <div class="footer">
+                    <p>Model by <a href="https://huggingface.co/CompVis" style="text-decoration: underline;" target="_blank">CompVis</a> and <a href="https://huggingface.co/stabilityai" style="text-decoration: underline;" target="_blank">Stability AI</a> - Inpainting by <a href="https://github.com/nagolinc" style="text-decoration: underline;" target="_blank">nagolinc</a> and <a href="https://github.com/patil-suraj" style="text-decoration: underline;">patil-suraj</a>, inpainting with words by <a href="https://twitter.com/yvrjsharma/" style="text-decoration: underline;" target="_blank">@yvrjsharma</a> and <a href="https://twitter.com/1littlecoder" style="text-decoration: underline;">@1littlecoder</a> - Gradio Demo by 🤗 Hugging Face
+                    </p>
+                </div>
+                <div class="acknowledgments">
+                    <p><h4>LICENSE</h4>
+The model is licensed with a <a href="https://huggingface.co/spaces/CompVis/stable-diffusion-license" style="text-decoration: underline;" target="_blank">CreativeML Open RAIL-M</a> license. The authors claim no rights on the outputs you generate, you are free to use them and are accountable for their use which must not go against the provisions set in this license. The license forbids you from sharing any content that violates any laws, produce any harm to a person, disseminate any personal information that would be meant for harm, spread misinformation and target vulnerable groups. For the full list of restrictions please <a href="https://huggingface.co/spaces/CompVis/stable-diffusion-license" target="_blank" style="text-decoration: underline;" target="_blank">read the license</a></p>
+                    <p><h4>Biases and content acknowledgment</h4>
+Despite how impressive being able to turn text into image is, beware to the fact that this model may output content that reinforces or exacerbates societal biases, as well as realistic faces, pornography and violence. The model was trained on the <a href="https://laion.ai/blog/laion-5b/" style="text-decoration: underline;" target="_blank">LAION-5B dataset</a>, which scraped non-curated image-text-pairs from the internet (the exception being the removal of illegal content) and is meant for research purposes. You can read more in the <a href="https://huggingface.co/CompVis/stable-diffusion-v1-4" style="text-decoration: underline;" target="_blank">model card</a></p>
+               </div>
+           """
+        )
+demo.launch()
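
Editor's note: the new `predict` branch above turns a text query into an inpainting mask by thresholding the CLIPSeg output, going through a temporary PNG (matplotlib colormap, OpenCV re-read, gray threshold at 100). A minimal in-memory sketch of the same step follows; it is illustrative only and not part of this commit, and the helper name `text_to_mask` and the 0.4 cutoff are arbitrary choices. It reuses `model` and `transform` as defined in app.py.

```python
import numpy as np
import torch
from PIL import Image

def text_to_mask(pil_image, text, cutoff=0.4):
    """Sketch: prompt -> binary PIL mask, thresholding the probability map
    directly instead of round-tripping through a colormapped PNG."""
    img = transform(pil_image.convert("RGB")).unsqueeze(0)   # 1x3x512x512 tensor
    with torch.no_grad():
        preds = model(img, [text])[0]                        # CLIPSeg logits, 1x1xHxW
    prob = torch.sigmoid(preds[0][0]).cpu().numpy()          # HxW map in [0, 1]
    binary = (prob > cutoff).astype(np.uint8) * 255          # 0/255 mask
    return Image.fromarray(binary).convert("RGB").resize((512, 512))
```

Such a mask could then be passed to `pipe(prompt=..., init_image=..., mask_image=...)` exactly as the drawn-mask branch does.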
    	
        clipseg/LICENSE
    ADDED
    
MIT License

This license does not apply to the model weights.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
    	
        clipseg/Quickstart.ipynb
    ADDED
    
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import requests\n",
    "\n",
    "! wget https://owncloud.gwdg.de/index.php/s/ioHbRzFx6th32hn/download -O weights.zip\n",
    "! unzip -d weights -j weights.zip\n",
    "from models.clipseg import CLIPDensePredT\n",
    "from PIL import Image\n",
    "from torchvision import transforms\n",
    "from matplotlib import pyplot as plt\n",
    "\n",
    "# load model\n",
    "model = CLIPDensePredT(version='ViT-B/16', reduce_dim=64)\n",
    "model.eval();\n",
    "\n",
    "# non-strict, because we only stored decoder weights (not CLIP weights)\n",
    "model.load_state_dict(torch.load('weights/rd64-uni.pth', map_location=torch.device('cpu')), strict=False);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load and normalize `example_image.jpg`. You can also load through an URL."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load and normalize image\n",
    "input_image = Image.open('example_image.jpg')\n",
    "\n",
    "# or load from URL...\n",
    "# image_url = 'https://farm5.staticflickr.com/4141/4856248695_03475782dc_z.jpg'\n",
    "# input_image = Image.open(requests.get(image_url, stream=True).raw)\n",
    "\n",
    "transform = transforms.Compose([\n",
    "    transforms.ToTensor(),\n",
    "    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n",
    "    transforms.Resize((352, 352)),\n",
    "])\n",
    "img = transform(input_image).unsqueeze(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Predict and visualize (this might take a few seconds if running without GPU support)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prompts = ['a glass', 'something to fill', 'wood', 'a jar']\n",
    "\n",
    "# predict\n",
    "with torch.no_grad():\n",
    "    preds = model(img.repeat(4,1,1,1), prompts)[0]\n",
    "\n",
    "# visualize prediction\n",
    "_, ax = plt.subplots(1, 5, figsize=(15, 4))\n",
    "[a.axis('off') for a in ax.flatten()]\n",
    "ax[0].imshow(input_image)\n",
    "[ax[i+1].imshow(torch.sigmoid(preds[i][0])) for i in range(4)];\n",
    "[ax[i+1].text(0, -15, prompts[i]) for i in range(4)];"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "800ed241f7db2bd3aa6942aa3be6809cdb30ee6b0a9e773dfecfa9fef1f4c586"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
    	
        clipseg/Readme.md
    ADDED
    
# Image Segmentation Using Text and Image Prompts
This repository contains the code used in the paper ["Image Segmentation Using Text and Image Prompts"](https://arxiv.org/abs/2112.10003).

**The Paper has been accepted to CVPR 2022!**

<img src="overview.png" alt="drawing" height="200em"/>

The system allows you to create segmentation models without training, based on:
- An arbitrary text query
- Or an image with a mask highlighting stuff or an object.

### Quick Start

In the `Quickstart.ipynb` notebook we provide the code for using a pre-trained CLIPSeg model. If you run the notebook locally, make sure you have downloaded the `rd64-uni.pth` weights, either manually or via the git lfs extension.
It can also be used interactively using [MyBinder](https://mybinder.org/v2/gh/timojl/clipseg/HEAD?labpath=Quickstart.ipynb)
(please note that the VM does not use a GPU, thus inference takes a few seconds).


### Dependencies
This code base depends on pytorch, torchvision and clip (`pip install git+https://github.com/openai/CLIP.git`).
Additional dependencies are hidden for double blind review.


### Datasets

* `PhraseCut` and `PhraseCutPlus`: Referring expression dataset
* `PFEPascalWrapper`: Wrapper class for PFENet's Pascal-5i implementation
* `PascalZeroShot`: Wrapper class for PascalZeroShot
* `COCOWrapper`: Wrapper class for COCO.

### Models

* `CLIPDensePredT`: CLIPSeg model with transformer-based decoder.
* `ViTDensePredT`: ViTSeg baseline with transformer-based decoder (plain ViT backbone instead of CLIP).

### Third Party Dependencies
For some of the datasets, third-party dependencies are required. Run the following commands in the `third_party` folder.
```bash
git clone https://github.com/cvlab-yonsei/JoEm
git clone https://github.com/Jia-Research-Lab/PFENet.git
git clone https://github.com/ChenyunWu/PhraseCutDataset.git
git clone https://github.com/juhongm999/hsnet.git
```

### Weights

The MIT license does not apply to these weights.

We provide two model weights, for D=64 (4.1MB) and D=16 (1.1MB).
```
wget https://owncloud.gwdg.de/index.php/s/ioHbRzFx6th32hn/download -O weights.zip
unzip -d weights -j weights.zip
```

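As a loading sketch (editor's illustration, not part of the commit): `reduce_dim` should match the D of the checkpoint you load. The Quickstart uses `reduce_dim=64` with `rd64-uni.pth`; the D=16 checkpoint presumably pairs with `reduce_dim=16`, and its filename `rd16-uni.pth` is assumed here.

```python
import torch
from models.clipseg import CLIPDensePredT

# reduce_dim must match the checkpoint: 64 for rd64-uni.pth,
# presumably 16 for the D=16 weights (filename assumed).
model = CLIPDensePredT(version='ViT-B/16', reduce_dim=64)
model.eval()
# strict=False: only decoder weights are stored, the CLIP backbone is loaded separately
model.load_state_dict(torch.load('weights/rd64-uni.pth', map_location='cpu'), strict=False)
```
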
### Training and Evaluation

To train use the `training.py` script with experiment file and experiment id parameters. E.g. `python training.py phrasecut.yaml 0` will train the first phrasecut experiment, which is defined by `configuration` and the first entry of `individual_configurations`. Model weights will be written in `logs/`.

For evaluation use `score.py`. E.g. `python score.py phrasecut.yaml 0 0` will evaluate the first phrasecut experiment with `test_configuration` and the first configuration in `individual_configurations`.

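For example, the same two commands shown together:

```bash
# train the first PhraseCut configuration; weights are written to logs/
python training.py phrasecut.yaml 0
# evaluate it with the first test_configuration entry
python score.py phrasecut.yaml 0 0
```
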
### Usage of PFENet Wrappers

In order to use the dataset and model wrappers for PFENet, the PFENet repository needs to be cloned to the root folder.
`git clone https://github.com/Jia-Research-Lab/PFENet.git`


### License

The source code files in this repository (excluding model weights) are released under MIT license.

### Citation
```
@InProceedings{lueddecke22_cvpr,
    author    = {L\"uddecke, Timo and Ecker, Alexander},
    title     = {Image Segmentation Using Text and Image Prompts},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    month     = {June},
    year      = {2022},
    pages     = {7086-7096}
}

```
    	
        clipseg/Tables.ipynb
    ADDED
    
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import clip\n",
    "from evaluation_utils import norm, denorm\n",
    "from general_utils import *\n",
    "from datasets.lvis_oneshot3 import LVIS_OneShot3, LVIS_OneShot"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PhraseCut"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pc = experiment('experiments/phrasecut.yaml', nums=':6').dataframe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tab1 = pc[['name', 'pc_miou_best',  'pc_fgiou_best', 'pc_ap']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cols = ['pc_miou_0.3',  'pc_fgiou_0.3', 'pc_ap']\n",
    "tab1 = pc[['name'] + cols]\n",
    "for k in cols:\n",
    "    tab1.loc[:, k] = (100 * tab1.loc[:, k]).round(1)\n",
    "tab1.loc[:, 'name'] = ['CLIPSeg (PC+)', 'CLIPSeg (PC, $D=128$)', 'CLIPSeg (PC)', 'CLIP-Deconv', 'ViTSeg (PC+)', 'ViTSeg (PC)']\n",
    "tab1.insert(1, 't', [0.3]*tab1.shape[0])\n",
    "print(tab1.to_latex(header=False, index=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For 0.1 threshold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cols = ['pc_miou_0.1',  'pc_fgiou_0.1', 'pc_ap']\n",
    "tab1 = pc[['name'] + cols]\n",
    "for k in cols:\n",
    "    tab1.loc[:, k] = (100 * tab1.loc[:, k]).round(1)\n",
    "tab1.loc[:, 'name'] = ['CLIPSeg (PC+)', 'CLIPSeg (PC, $D=128$)', 'CLIPSeg (PC)', 'CLIP-Deconv', 'ViTSeg (PC+)', 'ViTSeg (PC)']\n",
    "tab1.insert(1, 't', [0.1]*tab1.shape[0])\n",
    "print(tab1.to_latex(header=False, index=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# One-shot"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pascal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pas = experiment('experiments/pascal_1shot.yaml', nums=':19').dataframe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pas[['name', 'pas_h2_miou_0.3', 'pas_h2_biniou_0.3', 'pas_h2_ap', 'pas_h2_fgiou_ct']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pas = experiment('experiments/pascal_1shot.yaml', nums=':8').dataframe()\n",
    "tab1 = pas[['pas_h2_miou_0.3', 'pas_h2_biniou_0.3', 'pas_h2_ap']]\n",
    "print('CLIPSeg (PC+) & 0.3 & CLIP & ' + ' & '.join(f'{x*100:.1f}' for x in tab1[0:4].mean(0).values), '\\\\\\\\')\n",
    "print('CLIPSeg (PC)  & 0.3 & CLIP & ' + ' & '.join(f'{x*100:.1f}' for x in tab1[4:8].mean(0).values), '\\\\\\\\')\n",
    "\n",
    "pas = experiment('experiments/pascal_1shot.yaml', nums='12:16').dataframe()\n",
    "tab1 = pas[['pas_h2_miou_0.2', 'pas_h2_biniou_0.2', 'pas_h2_ap']]\n",
    "print('CLIP-Deconv (PC+) & 0.2 & CLIP & ' + ' & '.join(f'{x*100:.1f}' for x in tab1[0:4].mean(0).values), '\\\\\\\\')\n",
    "\n",
    "pas = experiment('experiments/pascal_1shot.yaml', nums='16:20').dataframe()\n",
    "tab1 = pas[['pas_t_miou_0.2', 'pas_t_biniou_0.2', 'pas_t_ap']]\n",
    "print('ViTSeg (PC+) & 0.2 & CLIP & ' + ' & '.join(f'{x*100:.1f}' for x in tab1[0:4].mean(0).values), '\\\\\\\\')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Pascal Zero-shot (in one-shot setting)\n",
    "\n",
    "Using the same setting as one-shot (hence different from the other zero-shot benchmark)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pas = experiment('experiments/pascal_1shot.yaml', nums=':8').dataframe()\n",
    "tab1 = pas[['pas_t_miou_0.3', 'pas_t_biniou_0.3', 'pas_t_ap']]\n",
    "print('CLIPSeg (PC+) & 0.3 & CLIP & ' + ' & '.join(f'{x*100:.1f}' for x in tab1[0:4].mean(0).values), '\\\\\\\\')\n",
    "print('CLIPSeg (PC) & 0.3 & CLIP & ' + ' & '.join(f'{x*100:.1f}' for x in tab1[4:8].mean(0).values), '\\\\\\\\')\n",
    "\n",
    "pas = experiment('experiments/pascal_1shot.yaml', nums='12:16').dataframe()\n",
    "tab1 = pas[['pas_t_miou_0.3', 'pas_t_biniou_0.3', 'pas_t_ap']]\n",
    "print('CLIP-Deconv (PC+) & 0.3 & CLIP & ' + ' & '.join(f'{x*100:.1f}' for x in tab1[0:4].mean(0).values), '\\\\\\\\')\n",
    "\n",
    "pas = experiment('experiments/pascal_1shot.yaml', nums='16:20').dataframe()\n",
    "tab1 = pas[['pas_t_miou_0.2', 'pas_t_biniou_0.2', 'pas_t_ap']]\n",
    "print('ViTSeg (PC+) & 0.2 & CLIP & ' + ' & '.join(f'{x*100:.1f}' for x in tab1[0:4].mean(0).values), '\\\\\\\\')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# without fixed thresholds...\n",
    "\n",
    "pas = experiment('experiments/pascal_1shot.yaml', nums=':8').dataframe()\n",
    "tab1 = pas[['pas_t_best_miou', 'pas_t_best_biniou', 'pas_t_ap']]\n",
    "print('CLIPSeg (PC+) & CLIP & ' + ' & '.join(f'{x*100:.1f}' for x in tab1[0:4].mean(0).values), '\\\\\\\\')\n",
    "print('CLIPSeg (PC) & CLIP & ' + ' & '.join(f'{x*100:.1f}' for x in tab1[4:8].mean(0).values), '\\\\\\\\')\n",
    "\n",
    "pas = experiment('experiments/pascal_1shot.yaml', nums='12:16').dataframe()\n",
    "tab1 = pas[['pas_t_best_miou', 'pas_t_best_biniou', 'pas_t_ap']]\n",
    "print('CLIP-Deconv (PC+) & CLIP & ' + ' & '.join(f'{x*100:.1f}' for x in tab1[0:4].mean(0).values), '\\\\\\\\')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### COCO"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "coco = experiment('experiments/coco.yaml', nums=':29').dataframe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tab1 = coco[['coco_h2_miou_0.1', 'coco_h2_biniou_0.1', 'coco_h2_ap']]\n",
    "tab2 = coco[['coco_h2_miou_0.2', 'coco_h2_biniou_0.2', 'coco_h2_ap']]\n",
    "tab3 = coco[['coco_h2_miou_best', 'coco_h2_biniou_best', 'coco_h2_ap']]\n",
    "print('CLIPSeg (COCO) & 0.1 & CLIP &  ' + ' & '.join(f'{x*100:.1f}' for x in tab1[:4].mean(0).values), '\\\\\\\\')\n",
    "print('CLIPSeg (COCO+N)  & 0.1 & CLIP &  ' + ' & '.join(f'{x*100:.1f}' for x in tab1[4:8].mean(0).values), '\\\\\\\\')\n",
    "print('CLIP-Deconv (COCO+N)  & 0.1 & CLIP &  ' + ' & '.join(f'{x*100:.1f}' for x in tab1[12:16].mean(0).values), '\\\\\\\\')\n",
    "print('ViTSeg (COCO)  & 0.1 & CLIP &  ' + ' & '.join(f'{x*100:.1f}' for x in tab1[8:12].mean(0).values), '\\\\\\\\')"
   ]
  },
  {
            +
               "cell_type": "markdown",
         | 
| 212 | 
            +
               "metadata": {},
         | 
| 213 | 
            +
               "source": [
         | 
| 214 | 
            +
                "# Zero-shot"
         | 
| 215 | 
            +
               ]
         | 
| 216 | 
            +
              },
         | 
| 217 | 
            +
              {
         | 
| 218 | 
            +
               "cell_type": "code",
         | 
| 219 | 
            +
               "execution_count": null,
         | 
| 220 | 
            +
               "metadata": {},
         | 
| 221 | 
            +
               "outputs": [],
         | 
| 222 | 
            +
               "source": [
         | 
| 223 | 
            +
                "zs = experiment('experiments/pascal_0shot.yaml', nums=':11').dataframe()"
         | 
| 224 | 
            +
               ]
         | 
| 225 | 
            +
              },
         | 
| 226 | 
            +
              {
         | 
| 227 | 
            +
               "cell_type": "code",
         | 
| 228 | 
            +
               "execution_count": null,
         | 
| 229 | 
            +
               "metadata": {},
         | 
| 230 | 
            +
               "outputs": [],
         | 
| 231 | 
            +
               "source": [
         | 
| 232 | 
            +
                "\n",
         | 
| 233 | 
            +
                "tab1 = zs[['pas_zs_seen', 'pas_zs_unseen']]\n",
         | 
| 234 | 
            +
                "print('CLIPSeg (PC+) & CLIP &  ' + ' & '.join(f'{x*100:.1f}' for x in tab1[8:9].values[0].tolist() + tab1[10:11].values[0].tolist()), '\\\\\\\\')\n",
         | 
| 235 | 
            +
                "print('CLIP-Deconv & CLIP &  ' + ' & '.join(f'{x*100:.1f}' for x in tab1[2:3].values[0].tolist()  + tab1[3:4].values[0].tolist()), '\\\\\\\\')\n",
         | 
| 236 | 
            +
                "print('ViTSeg & ImageNet-1K &  ' + ' & '.join(f'{x*100:.1f}' for x in tab1[4:5].values[0].tolist()  + tab1[5:6].values[0].tolist()), '\\\\\\\\')"
         | 
| 237 | 
            +
               ]
         | 
| 238 | 
            +
              },
         | 
| 239 | 
            +
              {
         | 
| 240 | 
            +
               "cell_type": "markdown",
         | 
| 241 | 
            +
               "metadata": {},
         | 
| 242 | 
            +
               "source": [
         | 
| 243 | 
            +
                "# Ablation"
         | 
| 244 | 
            +
               ]
         | 
| 245 | 
            +
              },
         | 
| 246 | 
            +
              {
         | 
| 247 | 
            +
               "cell_type": "code",
         | 
| 248 | 
            +
               "execution_count": null,
         | 
| 249 | 
            +
               "metadata": {},
         | 
| 250 | 
            +
               "outputs": [],
         | 
| 251 | 
            +
               "source": [
         | 
| 252 | 
            +
                "ablation = experiment('experiments/ablation.yaml', nums=':8').dataframe()"
         | 
| 253 | 
            +
               ]
         | 
| 254 | 
            +
              },
         | 
| 255 | 
            +
              {
         | 
| 256 | 
            +
               "cell_type": "code",
         | 
| 257 | 
            +
               "execution_count": null,
         | 
| 258 | 
            +
               "metadata": {},
         | 
| 259 | 
            +
               "outputs": [],
         | 
| 260 | 
            +
               "source": [
         | 
| 261 | 
            +
                "tab1 = ablation[['name', 'pc_miou_best', 'pc_ap', 'pc-vis_miou_best', 'pc-vis_ap']]\n",
         | 
| 262 | 
            +
                "for k in ['pc_miou_best', 'pc_ap', 'pc-vis_miou_best', 'pc-vis_ap']:\n",
         | 
| 263 | 
            +
                "    tab1.loc[:, k] = (100 * tab1.loc[:, k]).round(1)\n",
         | 
| 264 | 
            +
                "tab1.loc[:, 'name'] = ['CLIPSeg', 'no CLIP pre-training', 'no-negatives', '50% negatives', 'no visual', '$D=16$', 'only layer 3', 'highlight mask']"
         | 
| 265 | 
            +
               ]
         | 
| 266 | 
            +
              },
         | 
| 267 | 
            +
              {
         | 
| 268 | 
            +
               "cell_type": "code",
         | 
| 269 | 
            +
               "execution_count": null,
         | 
| 270 | 
            +
               "metadata": {},
         | 
| 271 | 
            +
               "outputs": [],
         | 
| 272 | 
            +
               "source": [
         | 
| 273 | 
            +
                "print(tab1.loc[[0,1,4,5,6,7],:].to_latex(header=False, index=False))"
         | 
| 274 | 
            +
               ]
         | 
| 275 | 
            +
              },
         | 
| 276 | 
            +
              {
         | 
| 277 | 
            +
               "cell_type": "code",
         | 
| 278 | 
            +
               "execution_count": null,
         | 
| 279 | 
            +
               "metadata": {},
         | 
| 280 | 
            +
               "outputs": [],
         | 
| 281 | 
            +
               "source": [
         | 
| 282 | 
            +
                "print(tab1.loc[[0,1,4,5,6,7],:].to_latex(header=False, index=False))"
         | 
| 283 | 
            +
               ]
         | 
| 284 | 
            +
              },
         | 
| 285 | 
            +
              {
         | 
| 286 | 
            +
               "cell_type": "markdown",
         | 
| 287 | 
            +
               "metadata": {},
         | 
| 288 | 
            +
               "source": [
         | 
| 289 | 
            +
                "# Generalization"
         | 
| 290 | 
            +
               ]
         | 
| 291 | 
            +
              },
         | 
| 292 | 
            +
              {
         | 
| 293 | 
            +
               "cell_type": "code",
         | 
| 294 | 
            +
               "execution_count": null,
         | 
| 295 | 
            +
               "metadata": {},
         | 
| 296 | 
            +
               "outputs": [],
         | 
| 297 | 
            +
               "source": [
         | 
| 298 | 
            +
                "generalization = experiment('experiments/generalize.yaml').dataframe()"
         | 
| 299 | 
            +
               ]
         | 
| 300 | 
            +
              },
         | 
| 301 | 
            +
              {
         | 
| 302 | 
            +
               "cell_type": "code",
         | 
| 303 | 
            +
               "execution_count": null,
         | 
| 304 | 
            +
               "metadata": {},
         | 
| 305 | 
            +
               "outputs": [],
         | 
| 306 | 
            +
               "source": [
         | 
| 307 | 
            +
                "gen = generalization[['aff_best_fgiou', 'aff_ap', 'ability_best_fgiou', 'ability_ap', 'part_best_fgiou', 'part_ap']].values"
         | 
| 308 | 
            +
               ]
         | 
| 309 | 
            +
              },
         | 
| 310 | 
            +
              {
         | 
| 311 | 
            +
               "cell_type": "code",
         | 
| 312 | 
            +
               "execution_count": null,
         | 
| 313 | 
            +
               "metadata": {},
         | 
| 314 | 
            +
               "outputs": [],
         | 
| 315 | 
            +
               "source": [
         | 
| 316 | 
            +
                "print(\n",
         | 
| 317 | 
            +
                "    'CLIPSeg (PC+) & ' + ' & '.join(f'{x*100:.1f}' for x in gen[1]) + ' \\\\\\\\ \\n' + \\\n",
         | 
| 318 | 
            +
                "    'CLIPSeg (LVIS)  & ' + ' & '.join(f'{x*100:.1f}' for x in gen[0]) + ' \\\\\\\\ \\n' + \\\n",
         | 
| 319 | 
            +
                "    'CLIP-Deconv & ' + ' & '.join(f'{x*100:.1f}' for x in gen[2]) + ' \\\\\\\\ \\n' + \\\n",
         | 
| 320 | 
            +
                "    'VITSeg & ' + ' & '.join(f'{x*100:.1f}' for x in gen[3]) + ' \\\\\\\\'\n",
         | 
| 321 | 
            +
                ")"
         | 
| 322 | 
            +
               ]
         | 
| 323 | 
            +
              }
         | 
| 324 | 
            +
             ],
         | 
| 325 | 
            +
             "metadata": {
         | 
| 326 | 
            +
              "interpreter": {
         | 
| 327 | 
            +
               "hash": "800ed241f7db2bd3aa6942aa3be6809cdb30ee6b0a9e773dfecfa9fef1f4c586"
         | 
| 328 | 
            +
              },
         | 
| 329 | 
            +
              "kernelspec": {
         | 
| 330 | 
            +
               "display_name": "env2",
         | 
| 331 | 
            +
               "language": "python",
         | 
| 332 | 
            +
               "name": "env2"
         | 
| 333 | 
            +
              },
         | 
| 334 | 
            +
              "language_info": {
         | 
| 335 | 
            +
               "codemirror_mode": {
         | 
| 336 | 
            +
                "name": "ipython",
         | 
| 337 | 
            +
                "version": 3
         | 
| 338 | 
            +
               },
         | 
| 339 | 
            +
               "file_extension": ".py",
         | 
| 340 | 
            +
               "mimetype": "text/x-python",
         | 
| 341 | 
            +
               "name": "python",
         | 
| 342 | 
            +
               "nbconvert_exporter": "python",
         | 
| 343 | 
            +
               "pygments_lexer": "ipython3",
         | 
| 344 | 
            +
               "version": "3.8.8"
         | 
| 345 | 
            +
              }
         | 
| 346 | 
            +
             },
         | 
| 347 | 
            +
             "nbformat": 4,
         | 
| 348 | 
            +
             "nbformat_minor": 4
         | 
| 349 | 
            +
            }
         | 
    	
        clipseg/Visual_Feature_Engineering.ipynb
    ADDED
    
    | @@ -0,0 +1,366 @@ | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
             "cells": [
         | 
| 3 | 
            +
              {
         | 
| 4 | 
            +
               "cell_type": "markdown",
         | 
| 5 | 
            +
               "metadata": {},
         | 
| 6 | 
            +
               "source": [
         | 
| 7 | 
            +
                "# Systematic"
         | 
| 8 | 
            +
               ]
         | 
| 9 | 
            +
              },
         | 
| 10 | 
            +
              {
         | 
| 11 | 
            +
               "cell_type": "code",
         | 
| 12 | 
            +
               "execution_count": null,
         | 
| 13 | 
            +
               "metadata": {},
         | 
| 14 | 
            +
               "outputs": [],
         | 
| 15 | 
            +
               "source": [
         | 
| 16 | 
            +
                "%load_ext autoreload\n",
         | 
| 17 | 
            +
                "%autoreload 2\n",
         | 
| 18 | 
            +
                "\n",
         | 
| 19 | 
            +
                "import clip\n",
         | 
| 20 | 
            +
                "from evaluation_utils import norm, denorm\n",
         | 
| 21 | 
            +
                "from general_utils import *\n",
         | 
| 22 | 
            +
                "from datasets.lvis_oneshot3 import LVIS_OneShot3\n",
         | 
| 23 | 
            +
                "\n",
         | 
| 24 | 
            +
                "clip_device = 'cuda'\n",
         | 
| 25 | 
            +
                "clip_model, preprocess = clip.load(\"ViT-B/16\", device=clip_device)\n",
         | 
| 26 | 
            +
                "clip_model.eval();\n",
         | 
| 27 | 
            +
                "\n",
         | 
| 28 | 
            +
                "from models.clipseg import CLIPDensePredTMasked\n",
         | 
| 29 | 
            +
                "\n",
         | 
| 30 | 
            +
                "clip_mask_model = CLIPDensePredTMasked(version='ViT-B/16').to(clip_device)\n",
         | 
| 31 | 
            +
                "clip_mask_model.eval();"
         | 
| 32 | 
            +
               ]
         | 
| 33 | 
            +
              },
         | 
| 34 | 
            +
              {
         | 
| 35 | 
            +
               "cell_type": "code",
         | 
| 36 | 
            +
               "execution_count": null,
         | 
| 37 | 
            +
               "metadata": {},
         | 
| 38 | 
            +
               "outputs": [],
         | 
| 39 | 
            +
               "source": [
         | 
| 40 | 
            +
                "lvis = LVIS_OneShot3('train_fixed', mask='separate', normalize=True, with_class_label=True, add_bar=False, \n",
         | 
| 41 | 
            +
                "                     text_class_labels=True, image_size=352, min_area=0.1,\n",
         | 
| 42 | 
            +
                "                     min_frac_s=0.05, min_frac_q=0.05, fix_find_crop=True)"
         | 
| 43 | 
            +
               ]
         | 
| 44 | 
            +
              },
         | 
| 45 | 
            +
              {
         | 
| 46 | 
            +
               "cell_type": "code",
         | 
| 47 | 
            +
               "execution_count": null,
         | 
| 48 | 
            +
               "metadata": {},
         | 
| 49 | 
            +
               "outputs": [],
         | 
| 50 | 
            +
               "source": [
         | 
| 51 | 
            +
                "plot_data(lvis)"
         | 
| 52 | 
            +
               ]
         | 
| 53 | 
            +
              },
         | 
| 54 | 
            +
              {
         | 
| 55 | 
            +
               "cell_type": "code",
         | 
| 56 | 
            +
               "execution_count": null,
         | 
| 57 | 
            +
               "metadata": {},
         | 
| 58 | 
            +
               "outputs": [],
         | 
| 59 | 
            +
               "source": [
         | 
| 60 | 
            +
                "from collections import defaultdict\n",
         | 
| 61 | 
            +
                "import json\n",
         | 
| 62 | 
            +
                "\n",
         | 
| 63 | 
            +
                "lvis_raw = json.load(open(expanduser('~/datasets/LVIS/lvis_v1_train.json')))\n",
         | 
| 64 | 
            +
                "lvis_val_raw = json.load(open(expanduser('~/datasets/LVIS/lvis_v1_val.json')))\n",
         | 
| 65 | 
            +
                "\n",
         | 
| 66 | 
            +
                "objects_per_image = defaultdict(lambda : set())\n",
         | 
| 67 | 
            +
                "for ann in lvis_raw['annotations']:\n",
         | 
| 68 | 
            +
                "    objects_per_image[ann['image_id']].add(ann['category_id'])\n",
         | 
| 69 | 
            +
                "    \n",
         | 
| 70 | 
            +
                "for ann in lvis_val_raw['annotations']:\n",
         | 
| 71 | 
            +
                "    objects_per_image[ann['image_id']].add(ann['category_id'])    \n",
         | 
| 72 | 
            +
                "    \n",
         | 
| 73 | 
            +
                "objects_per_image = {o: [lvis.category_names[o] for o in v] for o, v in objects_per_image.items()}\n",
         | 
| 74 | 
            +
                "\n",
         | 
| 75 | 
            +
                "del lvis_raw, lvis_val_raw"
         | 
| 76 | 
            +
               ]
         | 
| 77 | 
            +
              },
         | 
| 78 | 
            +
              {
         | 
| 79 | 
            +
               "cell_type": "code",
         | 
| 80 | 
            +
               "execution_count": null,
         | 
| 81 | 
            +
               "metadata": {},
         | 
| 82 | 
            +
               "outputs": [],
         | 
| 83 | 
            +
               "source": [
         | 
| 84 | 
            +
                "#bs = 32\n",
         | 
| 85 | 
            +
                "#batches = [get_batch(lvis, i*bs, (i+1)*bs, cuda=True) for i in range(10)]"
         | 
| 86 | 
            +
               ]
         | 
| 87 | 
            +
              },
         | 
| 88 | 
            +
              {
         | 
| 89 | 
            +
               "cell_type": "code",
         | 
| 90 | 
            +
               "execution_count": null,
         | 
| 91 | 
            +
               "metadata": {},
         | 
| 92 | 
            +
               "outputs": [],
         | 
| 93 | 
            +
               "source": [
         | 
| 94 | 
            +
                "from general_utils import get_batch\n",
         | 
| 95 | 
            +
                "from functools import partial\n",
         | 
| 96 | 
            +
                "from evaluation_utils import img_preprocess\n",
         | 
| 97 | 
            +
                "import torch\n",
         | 
| 98 | 
            +
                "\n",
         | 
| 99 | 
            +
                "def get_similarities(batches_or_dataset, process, mask=lambda x: None, clipmask=False):\n",
         | 
| 100 | 
            +
                "\n",
         | 
| 101 | 
            +
                "    # base_words = [f'a photo of {x}' for x in ['a person', 'an animal', 'a knife', 'a cup']]\n",
         | 
| 102 | 
            +
                "\n",
         | 
| 103 | 
            +
                "    all_prompts = []\n",
         | 
| 104 | 
            +
                "    \n",
         | 
| 105 | 
            +
                "    with torch.no_grad():\n",
         | 
| 106 | 
            +
                "        valid_sims = []\n",
         | 
| 107 | 
            +
                "        torch.manual_seed(571)\n",
         | 
| 108 | 
            +
                "        \n",
         | 
| 109 | 
            +
                "        if type(batches_or_dataset) == list:\n",
         | 
| 110 | 
            +
                "            loader = batches_or_dataset  # already loaded\n",
         | 
| 111 | 
            +
                "            max_iter = float('inf')\n",
         | 
| 112 | 
            +
                "        else:\n",
         | 
| 113 | 
            +
                "            loader = DataLoader(batches_or_dataset, shuffle=False, batch_size=32)\n",
         | 
| 114 | 
            +
                "            max_iter = 50\n",
         | 
| 115 | 
            +
                "        \n",
         | 
| 116 | 
            +
                "        global batch\n",
         | 
| 117 | 
            +
                "        for i_batch, (batch, batch_y) in enumerate(loader):\n",
         | 
| 118 | 
            +
                "            \n",
         | 
| 119 | 
            +
                "            if i_batch >= max_iter: break\n",
         | 
| 120 | 
            +
                "                \n",
         | 
| 121 | 
            +
                "            processed_batch = process(batch)\n",
         | 
| 122 | 
            +
                "            if type(processed_batch) == dict:\n",
         | 
| 123 | 
            +
                "                \n",
         | 
| 124 | 
            +
                "                # processed_batch =  {k: v.to(clip_device) for k, v in processed_batch.items()}\n",
         | 
| 125 | 
            +
                "                image_features = clip_mask_model.visual_forward(**processed_batch)[0].to(clip_device).half()\n",
         | 
| 126 | 
            +
                "            else:\n",
         | 
| 127 | 
            +
                "                processed_batch = process(batch).to(clip_device)\n",
         | 
| 128 | 
            +
                "                processed_batch = nnf.interpolate(processed_batch, (224, 224), mode='bilinear')\n",
         | 
| 129 | 
            +
                "                #image_features = clip_model.encode_image(processed_batch.to(clip_device)) \n",
         | 
| 130 | 
            +
                "                image_features = clip_mask_model.visual_forward(processed_batch)[0].to(clip_device).half()\n",
         | 
| 131 | 
            +
                "                \n",
         | 
| 132 | 
            +
                "            image_features = image_features / image_features.norm(dim=-1, keepdim=True)\n",
         | 
| 133 | 
            +
                "            bs = len(batch[0])\n",
         | 
| 134 | 
            +
                "            for j in range(bs):\n",
         | 
| 135 | 
            +
                "            \n",
         | 
| 136 | 
            +
                "                c, _, sid, qid = lvis.sample_ids[bs * i_batch + j]\n",
         | 
| 137 | 
            +
                "                support_image = basename(lvis.samples[c][sid])\n",
         | 
| 138 | 
            +
                "                \n",
         | 
| 139 | 
            +
                "                img_objs = [o for o in objects_per_image[int(support_image)]]\n",
         | 
| 140 | 
            +
                "                img_objs = [o.replace('_', ' ') for o in img_objs]\n",
         | 
| 141 | 
            +
                "                \n",
         | 
| 142 | 
            +
                "                other_words = [f'a photo of a {o.replace(\"_\", \" \")}' for o in img_objs \n",
         | 
| 143 | 
            +
                "                               if o != batch_y[2][j]]\n",
         | 
| 144 | 
            +
                "            \n",
         | 
| 145 | 
            +
                "                prompts = [f'a photo of a {batch_y[2][j]}'] + other_words\n",
         | 
| 146 | 
            +
                "                all_prompts += [prompts]\n",
         | 
| 147 | 
            +
                "                \n",
         | 
| 148 | 
            +
                "                text_cond = clip_model.encode_text(clip.tokenize(prompts).to(clip_device))\n",
         | 
| 149 | 
            +
                "                text_cond = text_cond / text_cond.norm(dim=-1, keepdim=True)            \n",
         | 
| 150 | 
            +
                "\n",
         | 
| 151 | 
            +
                "                global logits\n",
         | 
| 152 | 
            +
                "                logits = clip_model.logit_scale.exp() * image_features[j] @ text_cond.T\n",
         | 
| 153 | 
            +
                "\n",
         | 
| 154 | 
            +
                "                global sim\n",
         | 
| 155 | 
            +
                "                sim = torch.softmax(logits, dim=-1)\n",
         | 
| 156 | 
            +
                "            \n",
         | 
| 157 | 
            +
                "                valid_sims += [sim]\n",
         | 
| 158 | 
            +
                "                \n",
         | 
| 159 | 
            +
                "        #valid_sims = torch.stack(valid_sims)\n",
         | 
| 160 | 
            +
                "        return valid_sims, all_prompts\n",
         | 
| 161 | 
            +
                "    \n",
         | 
| 162 | 
            +
                "\n",
         | 
| 163 | 
            +
                "def new_img_preprocess(x):\n",
         | 
| 164 | 
            +
                "    return {'x_inp': x[1], 'mask': (11, 'cls_token', x[2])}\n",
         | 
| 165 | 
            +
                "    \n",
         | 
| 166 | 
            +
                "#get_similarities(lvis, partial(img_preprocess, center_context=0.5));\n",
         | 
| 167 | 
            +
                "get_similarities(lvis, lambda x: x[1]);"
         | 
| 168 | 
            +
               ]
         | 
| 169 | 
            +
              },
         | 
| 170 | 
            +
              {
         | 
| 171 | 
            +
               "cell_type": "code",
         | 
| 172 | 
            +
               "execution_count": null,
         | 
| 173 | 
            +
               "metadata": {},
         | 
| 174 | 
            +
               "outputs": [],
         | 
| 175 | 
            +
               "source": [
         | 
| 176 | 
            +
                "preprocessing_functions = [\n",
         | 
| 177 | 
            +
                "#     ['clip mask CLS L11', lambda x: {'x_inp': x[1].cuda(), 'mask': (11, 'cls_token', x[2].cuda())}],\n",
         | 
| 178 | 
            +
                "#     ['clip mask CLS all', lambda x: {'x_inp': x[1].cuda(), 'mask': ('all', 'cls_token', x[2].cuda())}],\n",
         | 
| 179 | 
            +
                "#     ['clip mask all all', lambda x: {'x_inp': x[1].cuda(), 'mask': ('all', 'all', x[2].cuda())}],\n",
         | 
| 180 | 
            +
                "#     ['colorize object red', partial(img_preprocess, colorize=True)],\n",
         | 
| 181 | 
            +
                "#     ['add red outline', partial(img_preprocess, outline=True)],\n",
         | 
| 182 | 
            +
                "    \n",
         | 
| 183 | 
            +
                "#     ['BG brightness 50%', partial(img_preprocess, bg_fac=0.5)],\n",
         | 
| 184 | 
            +
                "#     ['BG brightness 10%', partial(img_preprocess, bg_fac=0.1)],\n",
         | 
| 185 | 
            +
                "#     ['BG brightness 0%', partial(img_preprocess, bg_fac=0.0)],\n",
         | 
| 186 | 
            +
                "#     ['BG blur', partial(img_preprocess, blur=3)],\n",
         | 
| 187 | 
            +
                "#     ['BG blur & intensity 10%', partial(img_preprocess, blur=3, bg_fac=0.1)],\n",
         | 
| 188 | 
            +
                "   \n",
         | 
| 189 | 
            +
                "#     ['crop large context', partial(img_preprocess, center_context=0.5)],\n",
         | 
| 190 | 
            +
                "#     ['crop small context', partial(img_preprocess, center_context=0.1)],\n",
         | 
| 191 | 
            +
                "    ['crop & background blur', partial(img_preprocess, blur=3, center_context=0.5)],\n",
         | 
| 192 | 
            +
                "    ['crop & intensity 10%', partial(img_preprocess, blur=3, bg_fac=0.1)],\n",
         | 
| 193 | 
            +
                "#     ['crop & background blur & intensity 10%', partial(img_preprocess, blur=3, center_context=0.1, bg_fac=0.1)],\n",
         | 
| 194 | 
            +
                "]\n",
         | 
| 195 | 
            +
                "\n",
         | 
| 196 | 
            +
                "preprocessing_functions = preprocessing_functions\n",
         | 
| 197 | 
            +
                "\n",
         | 
| 198 | 
            +
                "base, base_p = get_similarities(lvis, lambda x: x[1])\n",
         | 
| 199 | 
            +
                "outs = [get_similarities(lvis, fun) for _, fun in preprocessing_functions]"
         | 
| 200 | 
            +
               ]
         | 
| 201 | 
            +
              },
         | 
| 202 | 
            +
              {
         | 
| 203 | 
            +
               "cell_type": "code",
         | 
| 204 | 
            +
               "execution_count": null,
         | 
| 205 | 
            +
               "metadata": {},
         | 
| 206 | 
            +
               "outputs": [],
         | 
| 207 | 
            +
               "source": [
         | 
| 208 | 
            +
                "outs2 = [get_similarities(lvis, fun) for _, fun in  [['BG brightness 0%', partial(img_preprocess, bg_fac=0.0)]]]"
         | 
| 209 | 
            +
               ]
         | 
| 210 | 
            +
              },
         | 
| 211 | 
            +
              {
         | 
| 212 | 
            +
               "cell_type": "code",
         | 
| 213 | 
            +
               "execution_count": null,
         | 
| 214 | 
            +
               "metadata": {},
         | 
| 215 | 
            +
               "outputs": [],
         | 
| 216 | 
            +
               "source": [
         | 
| 217 | 
            +
                "for j in range(1):\n",
         | 
| 218 | 
            +
                "    print(np.mean([outs2[j][0][i][0].cpu() - base[i][0].cpu() for i in range(len(base)) if len(base_p[i]) >= 3]))"
         | 
| 219 | 
            +
               ]
         | 
| 220 | 
            +
              },
         | 
| 221 | 
            +
              {
         | 
| 222 | 
            +
               "cell_type": "code",
         | 
| 223 | 
            +
               "execution_count": null,
         | 
| 224 | 
            +
               "metadata": {},
         | 
| 225 | 
            +
               "outputs": [],
         | 
| 226 | 
            +
               "source": [
         | 
| 227 | 
            +
                "from pandas import DataFrame\n",
         | 
| 228 | 
            +
                "tab = dict()\n",
         | 
| 229 | 
            +
                "for j, (name, _) in enumerate(preprocessing_functions):\n",
         | 
| 230 | 
            +
                "    tab[name] =  np.mean([outs[j][0][i][0].cpu() - base[i][0].cpu() for i in range(len(base)) if len(base_p[i]) >= 3])\n",
         | 
| 231 | 
            +
                "    \n",
         | 
| 232 | 
            +
                "    \n",
         | 
| 233 | 
            +
                "print('\\n'.join(f'{k} & {v*100:.2f} \\\\\\\\' for k,v in tab.items()))    "
         | 
| 234 | 
            +
               ]
         | 
| 235 | 
            +
              },
         | 
| 236 | 
            +
              {
         | 
| 237 | 
            +
               "cell_type": "markdown",
         | 
| 238 | 
            +
               "metadata": {},
         | 
| 239 | 
            +
               "source": [
         | 
| 240 | 
            +
                "# Visual"
         | 
| 241 | 
            +
               ]
         | 
| 242 | 
            +
              },
         | 
| 243 | 
            +
              {
         | 
| 244 | 
            +
               "cell_type": "code",
         | 
| 245 | 
            +
               "execution_count": null,
         | 
| 246 | 
            +
               "metadata": {},
         | 
| 247 | 
            +
               "outputs": [],
         | 
| 248 | 
            +
               "source": [
         | 
| 249 | 
            +
                "from evaluation_utils import denorm, norm"
         | 
| 250 | 
            +
               ]
         | 
| 251 | 
            +
              },
         | 
| 252 | 
            +
              {
         | 
| 253 | 
            +
               "cell_type": "code",
         | 
| 254 | 
            +
               "execution_count": null,
         | 
| 255 | 
            +
               "metadata": {},
         | 
| 256 | 
            +
               "outputs": [],
         | 
| 257 | 
            +
               "source": [
         | 
| 258 | 
            +
                "def load_sample(filename, filename2):\n",
         | 
| 259 | 
            +
                "    from os.path import join\n",
         | 
| 260 | 
            +
                "    bp = expanduser('~/cloud/resources/sample_images')\n",
         | 
| 261 | 
            +
                "    tf = transforms.Compose([\n",
         | 
| 262 | 
            +
                "        transforms.ToTensor(),\n",
         | 
| 263 | 
            +
                "        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n",
         | 
| 264 | 
            +
                "        transforms.Resize(224),\n",
         | 
| 265 | 
            +
                "        transforms.CenterCrop(224)\n",
         | 
| 266 | 
            +
                "    ])\n",
         | 
| 267 | 
            +
                "    tf2 = transforms.Compose([\n",
         | 
| 268 | 
            +
                "        transforms.ToTensor(),\n",
         | 
| 269 | 
            +
                "        transforms.Resize(224),\n",
         | 
| 270 | 
            +
                "        transforms.CenterCrop(224)\n",
         | 
| 271 | 
            +
                "    ])\n",
         | 
| 272 | 
            +
                "    inp1 = [None, tf(Image.open(join(bp, filename))), tf2(Image.open(join(bp, filename2)))]\n",
         | 
| 273 | 
            +
                "    inp1[1] = inp1[1].unsqueeze(0)\n",
         | 
| 274 | 
            +
                "    inp1[2] = inp1[2][:1]   \n",
         | 
| 275 | 
            +
                "    return inp1\n",
         | 
| 276 | 
            +
                "\n",
         | 
| 277 | 
            +
                "def all_preprocessing(inp1):\n",
         | 
| 278 | 
            +
                "    return [\n",
         | 
| 279 | 
            +
                "        img_preprocess(inp1),\n",
         | 
| 280 | 
            +
                "        img_preprocess(inp1, colorize=True),\n",
         | 
| 281 | 
            +
                "        img_preprocess(inp1, outline=True),        \n",
         | 
| 282 | 
            +
                "        img_preprocess(inp1, blur=3),\n",
         | 
| 283 | 
            +
                "        img_preprocess(inp1, bg_fac=0.1),\n",
         | 
| 284 | 
            +
                "        #img_preprocess(inp1, bg_fac=0.5),\n",
         | 
| 285 | 
            +
                "        #img_preprocess(inp1, blur=3, bg_fac=0.5),        \n",
         | 
| 286 | 
            +
                "        img_preprocess(inp1, blur=3, bg_fac=0.5, center_context=0.5),\n",
         | 
| 287 | 
            +
                "    ]\n",
         | 
| 288 | 
            +
                "\n"
         | 
| 289 | 
            +
               ]
         | 
| 290 | 
            +
              },
         | 
| 291 | 
            +
              {
         | 
| 292 | 
            +
               "cell_type": "code",
         | 
| 293 | 
            +
               "execution_count": null,
         | 
| 294 | 
            +
               "metadata": {},
         | 
| 295 | 
            +
               "outputs": [],
         | 
| 296 | 
            +
               "source": [
         | 
| 297 | 
            +
                "from torchvision import transforms\n",
         | 
| 298 | 
            +
                "from PIL import Image\n",
         | 
| 299 | 
            +
                "from matplotlib import pyplot as plt\n",
         | 
| 300 | 
            +
                "from evaluation_utils import img_preprocess\n",
         | 
| 301 | 
            +
                "import clip\n",
         | 
| 302 | 
            +
                "\n",
         | 
| 303 | 
            +
                "images_queries = [\n",
         | 
| 304 | 
            +
                "    [load_sample('things1.jpg', 'things1_jar.png'), ['jug', 'knife', 'car', 'animal', 'sieve', 'nothing']],\n",
         | 
| 305 | 
            +
                "    [load_sample('own_photos/IMG_2017s_square.jpg', 'own_photos/IMG_2017s_square_trash_can.png'), ['trash bin', 'house', 'car', 'bike', 'window', 'nothing']],\n",
         | 
| 306 | 
            +
                "]\n",
         | 
| 307 | 
            +
                "\n",
         | 
| 308 | 
            +
                "\n",
         | 
| 309 | 
            +
                "_, ax = plt.subplots(2 * len(images_queries), 6, figsize=(14, 4.5 * len(images_queries)))\n",
         | 
| 310 | 
            +
                "\n",
         | 
| 311 | 
            +
                "for j, (images, objects) in enumerate(images_queries):\n",
         | 
| 312 | 
            +
                "    \n",
         | 
| 313 | 
            +
                "    joint_image = all_preprocessing(images)\n",
         | 
| 314 | 
            +
                "    \n",
         | 
| 315 | 
            +
                "    joint_image = torch.stack(joint_image)[:,0]\n",
         | 
| 316 | 
            +
                "    clip_model, preprocess = clip.load(\"ViT-B/16\", device='cpu')\n",
         | 
| 317 | 
            +
                "    image_features = clip_model.encode_image(joint_image)\n",
         | 
| 318 | 
            +
                "    image_features = image_features / image_features.norm(dim=-1, keepdim=True)\n",
         | 
| 319 | 
            +
                "    \n",
         | 
| 320 | 
            +
                "    prompts = [f'a photo of a {obj}'for obj in objects]\n",
         | 
| 321 | 
            +
                "    text_cond = clip_model.encode_text(clip.tokenize(prompts))\n",
         | 
| 322 | 
            +
                "    text_cond = text_cond / text_cond.norm(dim=-1, keepdim=True)\n",
         | 
| 323 | 
            +
                "    logits = clip_model.logit_scale.exp() * image_features @ text_cond.T\n",
         | 
| 324 | 
            +
                "    sim = torch.softmax(logits, dim=-1).detach().cpu()\n",
         | 
| 325 | 
            +
                "\n",
         | 
| 326 | 
            +
                "    for i, img in enumerate(joint_image):\n",
         | 
| 327 | 
            +
                "        ax[2*j, i].axis('off')\n",
         | 
| 328 | 
            +
                "        \n",
         | 
| 329 | 
            +
                "        ax[2*j, i].imshow(torch.clamp(denorm(joint_image[i]).permute(1,2,0), 0, 1))\n",
         | 
| 330 | 
            +
                "        ax[2*j+ 1, i].grid(True)\n",
         | 
| 331 | 
            +
                "        \n",
         | 
| 332 | 
            +
                "        ax[2*j + 1, i].set_ylim(0,1)\n",
         | 
| 333 | 
            +
                "        ax[2*j + 1, i].set_yticklabels([])\n",
         | 
| 334 | 
            +
                "        ax[2*j + 1, i].set_xticks([])  # set_xticks(range(len(prompts)))\n",
         | 
| 335 | 
            +
                "#         ax[1, i].set_xticklabels(objects, rotation=90)\n",
         | 
| 336 | 
            +
                "        for k in range(len(sim[i])):\n",
         | 
| 337 | 
            +
                "            ax[2*j + 1, i].bar(k, sim[i][k], color=plt.cm.tab20(1) if k!=0 else plt.cm.tab20(3))\n",
         | 
| 338 | 
            +
                "            ax[2*j + 1, i].text(k, 0.07, objects[k], rotation=90, ha='center', fontsize=15)\n",
         | 
| 339 | 
            +
                "\n",
         | 
| 340 | 
            +
                "plt.tight_layout()\n",
         | 
| 341 | 
            +
                "plt.savefig('figures/prompt_engineering.pdf', bbox_inches='tight')"
         | 
| 342 | 
            +
               ]
         | 
| 343 | 
            +
              }
         | 
| 344 | 
            +
             ],
         | 
| 345 | 
            +
             "metadata": {
         | 
| 346 | 
            +
              "kernelspec": {
         | 
| 347 | 
            +
               "display_name": "env2",
         | 
| 348 | 
            +
               "language": "python",
         | 
| 349 | 
            +
               "name": "env2"
         | 
| 350 | 
            +
              },
         | 
| 351 | 
            +
              "language_info": {
         | 
| 352 | 
            +
               "codemirror_mode": {
         | 
| 353 | 
            +
                "name": "ipython",
         | 
| 354 | 
            +
                "version": 3
         | 
| 355 | 
            +
               },
         | 
| 356 | 
            +
               "file_extension": ".py",
         | 
| 357 | 
            +
               "mimetype": "text/x-python",
         | 
| 358 | 
            +
               "name": "python",
         | 
| 359 | 
            +
               "nbconvert_exporter": "python",
         | 
| 360 | 
            +
               "pygments_lexer": "ipython3",
         | 
| 361 | 
            +
               "version": "3.8.8"
         | 
| 362 | 
            +
              }
         | 
| 363 | 
            +
             },
         | 
| 364 | 
            +
             "nbformat": 4,
         | 
| 365 | 
            +
             "nbformat_minor": 4
         | 
| 366 | 
            +
            }
         | 
    	
        clipseg/datasets/coco_wrapper.py
    ADDED
    
    | @@ -0,0 +1,99 @@ | |
| 1 | 
            +
            import pickle
         | 
| 2 | 
            +
            from types import new_class
         | 
| 3 | 
            +
            import torch
         | 
| 4 | 
            +
            import numpy as np
         | 
| 5 | 
            +
            import os
         | 
| 6 | 
            +
            import json
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            from os.path import join, dirname, isdir, isfile, expanduser, realpath, basename
         | 
| 9 | 
            +
            from random import shuffle, seed as set_seed
         | 
| 10 | 
            +
            from PIL import Image
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            from itertools import combinations
         | 
| 13 | 
            +
            from torchvision import transforms
         | 
| 14 | 
            +
            from torchvision.transforms.transforms import Resize
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            from datasets.utils import blend_image_segmentation
         | 
| 17 | 
            +
            from general_utils import get_from_repository
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            COCO_CLASSES = {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'}
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            class COCOWrapper(object):
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                def __init__(self, split, fold=0, image_size=400, aug=None, mask='separate', negative_prob=0,
         | 
| 24 | 
            +
                             with_class_label=False):
         | 
| 25 | 
            +
                    super().__init__()
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                    self.mask = mask
         | 
| 28 | 
            +
                    self.with_class_label = with_class_label
         | 
| 29 | 
            +
                    self.negative_prob = negative_prob
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                    from third_party.hsnet.data.coco import DatasetCOCO
         | 
| 32 | 
            +
             | 
| 33 | 
            +
                    get_from_repository('COCO-20i', ['COCO-20i.tar'])
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                    foldpath = join(dirname(__file__), '../third_party/hsnet/data/splits/coco/%s/fold%d.pkl')
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                    def build_img_metadata_classwise(self):
         | 
| 38 | 
            +
                        with open(foldpath % (self.split, self.fold), 'rb') as f:
         | 
| 39 | 
            +
                            img_metadata_classwise = pickle.load(f)
         | 
| 40 | 
            +
                        return img_metadata_classwise
         | 
| 41 | 
            +
             | 
| 42 | 
            +
             | 
| 43 | 
            +
                    DatasetCOCO.build_img_metadata_classwise = build_img_metadata_classwise
         | 
| 44 | 
            +
                    # DatasetCOCO.read_mask = read_mask
         | 
| 45 | 
            +
                    
         | 
| 46 | 
            +
                    mean = [0.485, 0.456, 0.406]
         | 
| 47 | 
            +
                    std = [0.229, 0.224, 0.225]
         | 
| 48 | 
            +
                    transform = transforms.Compose([
         | 
| 49 | 
            +
                        transforms.Resize((image_size, image_size)),
         | 
| 50 | 
            +
                        transforms.ToTensor(),
         | 
| 51 | 
            +
                        transforms.Normalize(mean, std)
         | 
| 52 | 
            +
                    ])
         | 
| 53 | 
            +
             | 
| 54 | 
            +
                    self.coco = DatasetCOCO(expanduser('~/datasets/COCO-20i/'), fold, transform, split, 1, False)
         | 
| 55 | 
            +
                
         | 
| 56 | 
            +
                    self.all_classes = [self.coco.class_ids]
         | 
| 57 | 
            +
                    self.coco.base_path = join(expanduser('~/datasets/COCO-20i'))
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                def __len__(self):
         | 
| 60 | 
            +
                    return len(self.coco)
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                def __getitem__(self, i):
         | 
| 63 | 
            +
                    sample = self.coco[i]
         | 
| 64 | 
            +
             | 
| 65 | 
            +
                    label_name = COCO_CLASSES[int(sample['class_id'])]
         | 
| 66 | 
            +
             | 
| 67 | 
            +
                    img_s, seg_s = sample['support_imgs'][0], sample['support_masks'][0]
         | 
| 68 | 
            +
             | 
| 69 | 
            +
                    if self.negative_prob > 0 and torch.rand(1).item() < self.negative_prob:
         | 
| 70 | 
            +
                        new_class_id = sample['class_id']
         | 
| 71 | 
            +
                        while new_class_id == sample['class_id']:
         | 
| 72 | 
            +
                            sample2 = self.coco[torch.randint(0, len(self), (1,)).item()]
         | 
| 73 | 
            +
                            new_class_id = sample2['class_id']
         | 
| 74 | 
            +
                        img_s = sample2['support_imgs'][0]
         | 
| 75 | 
            +
                        seg_s = torch.zeros_like(seg_s)
         | 
| 76 | 
            +
             | 
| 77 | 
            +
                    mask = self.mask
         | 
| 78 | 
            +
                    if mask == 'separate':
         | 
| 79 | 
            +
                        supp = (img_s, seg_s)
         | 
| 80 | 
            +
                    elif mask == 'text_label':
         | 
| 81 | 
            +
                        # DEPRECATED
         | 
| 82 | 
            +
                        supp = [int(sample['class_id'])]
         | 
| 83 | 
            +
                    elif mask == 'text':
         | 
| 84 | 
            +
                        supp = [label_name]      
         | 
| 85 | 
            +
                    else:
         | 
| 86 | 
            +
                        if mask.startswith('text_and_'):
         | 
| 87 | 
            +
                            mask = mask[9:]
         | 
| 88 | 
            +
                            label_add = [label_name]
         | 
| 89 | 
            +
                        else:
         | 
| 90 | 
            +
                            label_add = []
         | 
| 91 | 
            +
             | 
| 92 | 
            +
                        supp = label_add + blend_image_segmentation(img_s, seg_s, mode=mask)
         | 
| 93 | 
            +
             | 
| 94 | 
            +
                    if self.with_class_label:
         | 
| 95 | 
            +
                        label = (torch.zeros(0), sample['class_id'],)
         | 
| 96 | 
            +
                    else:
         | 
| 97 | 
            +
                        label = (torch.zeros(0), )
         | 
| 98 | 
            +
             | 
| 99 | 
            +
                    return (sample['query_img'],) + tuple(supp), (sample['query_mask'].unsqueeze(0),) + label
         | 
    	
        clipseg/datasets/pascal_classes.json
    ADDED
    
    | @@ -0,0 +1 @@ | |
| 1 | 
            +
            [{"id": 1, "synonyms": ["aeroplane"]}, {"id": 2, "synonyms": ["bicycle"]}, {"id": 3, "synonyms": ["bird"]}, {"id": 4, "synonyms": ["boat"]}, {"id": 5, "synonyms": ["bottle"]}, {"id": 6, "synonyms": ["bus"]}, {"id": 7, "synonyms": ["car"]}, {"id": 8, "synonyms": ["cat"]}, {"id": 9, "synonyms": ["chair"]}, {"id": 10, "synonyms": ["cow"]}, {"id": 11, "synonyms": ["diningtable"]}, {"id": 12, "synonyms": ["dog"]}, {"id": 13, "synonyms": ["horse"]}, {"id": 14, "synonyms": ["motorbike"]}, {"id": 15, "synonyms": ["person"]}, {"id": 16, "synonyms": ["pottedplant"]}, {"id": 17, "synonyms": ["sheep"]}, {"id": 18, "synonyms": ["sofa"]}, {"id": 19, "synonyms": ["train"]}, {"id": 20, "synonyms": ["tvmonitor"]}]
         | 
    	
        clipseg/datasets/pascal_zeroshot.py
    ADDED
    
@@ -0,0 +1,60 @@
from os.path import expanduser
import torch
import json
import torchvision
from general_utils import get_from_repository
from general_utils import log
from torchvision import transforms

PASCAL_VOC_CLASSES_ZS = [['cattle.n.01', 'motorcycle.n.01'], ['aeroplane.n.01', 'sofa.n.01'],
                         ['cat.n.01', 'television.n.03'], ['train.n.01', 'bottle.n.01'],
                         ['chair.n.01', 'pot_plant.n.01']]


class PascalZeroShot(object):

    def __init__(self, split, n_unseen, image_size=224) -> None:
        super().__init__()

        import sys
        sys.path.append('third_party/JoEm')
        from third_party.JoEm.data_loader.dataset import VOCSegmentation
        from third_party.JoEm.data_loader import get_seen_idx, get_unseen_idx, VOC

        self.pascal_classes = VOC
        self.image_size = image_size

        self.transform = transforms.Compose([
            transforms.Resize((image_size, image_size)),
        ])

        if split == 'train':
            self.voc = VOCSegmentation(get_unseen_idx(n_unseen), get_seen_idx(n_unseen),
                                       split=split, transform=True, transform_args=dict(base_size=312, crop_size=312),
                                       ignore_bg=False, ignore_unseen=False, remv_unseen_img=True)
        elif split == 'val':
            self.voc = VOCSegmentation(get_unseen_idx(n_unseen), get_seen_idx(n_unseen),
                                       split=split, transform=False,
                                       ignore_bg=False, ignore_unseen=False)

        self.unseen_idx = get_unseen_idx(n_unseen)

    def __len__(self):
        return len(self.voc)

    def __getitem__(self, i):

        sample = self.voc[i]
        label = sample['label'].long()
        all_labels = [l for l in torch.where(torch.bincount(label.flatten())>0)[0].numpy().tolist() if l != 255]
        class_indices = [l for l in all_labels]
        class_names = [self.pascal_classes[l] for l in all_labels]

        image = self.transform(sample['image'])

        label = transforms.Resize((self.image_size, self.image_size),
            interpolation=torchvision.transforms.InterpolationMode.NEAREST)(label.unsqueeze(0))[0]

        return (image,), (label, )
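A minimal usage sketch for `PascalZeroShot` (editor's addition, not part of the commit; it assumes the JoEm loader under `third_party/JoEm` and the Pascal VOC data are set up the way the constructor expects):

```python
# Hypothetical usage sketch; requires third_party/JoEm and Pascal VOC to be available locally.
from datasets.pascal_zeroshot import PascalZeroShot, PASCAL_VOC_CLASSES_ZS

ds = PascalZeroShot(split='val', n_unseen=2, image_size=224)
(image,), (label,) = ds[0]   # label: (224, 224) LongTensor of class ids; image: resized query image
print(len(ds), label.shape, PASCAL_VOC_CLASSES_ZS[:1])
```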
    	
        clipseg/datasets/pfe_dataset.py
    ADDED
    
@@ -0,0 +1,129 @@
from os.path import expanduser
import torch
import json
from general_utils import get_from_repository
from datasets.lvis_oneshot3 import blend_image_segmentation
from general_utils import log

PASCAL_CLASSES = {a['id']: a['synonyms'] for a in json.load(open('datasets/pascal_classes.json'))}


class PFEPascalWrapper(object):

    def __init__(self, mode, split, mask='separate', image_size=473, label_support=None, size=None, p_negative=0, aug=None):
        import sys
        # sys.path.append(expanduser('~/projects/new_one_shot'))
        from third_party.PFENet.util.dataset import SemData

        get_from_repository('PascalVOC2012', ['Pascal5i.tar'])

        self.p_negative = p_negative
        self.size = size
        self.mode = mode
        self.image_size = image_size

        if label_support in {True, False}:
            log.warning('label_support argument is deprecated. Use mask instead.')
            #raise ValueError()

        self.mask = mask

        value_scale = 255
        mean = [0.485, 0.456, 0.406]
        mean = [item * value_scale for item in mean]
        std = [0.229, 0.224, 0.225]
        std = [item * value_scale for item in std]

        import third_party.PFENet.util.transform as transform

        if mode == 'val':
            data_list = expanduser('~/projects/old_one_shot/PFENet/lists/pascal/val.txt')

            data_transform = [transform.test_Resize(size=image_size)] if image_size != 'original' else []
            data_transform += [
                transform.ToTensor(),
                transform.Normalize(mean=mean, std=std)
            ]

        elif mode == 'train':
            data_list = expanduser('~/projects/old_one_shot/PFENet/lists/pascal/voc_sbd_merge_noduplicate.txt')

            assert image_size != 'original'

            data_transform = [
                transform.RandScale([0.9, 1.1]),
                transform.RandRotate([-10, 10], padding=mean, ignore_label=255),
                transform.RandomGaussianBlur(),
                transform.RandomHorizontalFlip(),
                transform.Crop((image_size, image_size), crop_type='rand', padding=mean, ignore_label=255),
                transform.ToTensor(),
                transform.Normalize(mean=mean, std=std)
            ]

        data_transform = transform.Compose(data_transform)

        self.dataset = SemData(split=split, mode=mode, data_root=expanduser('~/datasets/PascalVOC2012/VOC2012'),
                               data_list=data_list, shot=1, transform=data_transform, use_coco=False, use_split_coco=False)

        self.class_list = self.dataset.sub_val_list if mode == 'val' else self.dataset.sub_list

        # verify that subcls_list always has length 1
        # assert len(set([len(d[4]) for d in self.dataset])) == 1

        print('actual length', len(self.dataset.data_list))

    def __len__(self):
        if self.mode == 'val':
            return len(self.dataset.data_list)
        else:
            return len(self.dataset.data_list)

    def __getitem__(self, index):
        if self.dataset.mode == 'train':
            image, label, s_x, s_y, subcls_list = self.dataset[index % len(self.dataset.data_list)]
        elif self.dataset.mode == 'val':
            image, label, s_x, s_y, subcls_list, ori_label = self.dataset[index % len(self.dataset.data_list)]
            ori_label = torch.from_numpy(ori_label).unsqueeze(0)

            if self.image_size != 'original':
                longerside = max(ori_label.size(1), ori_label.size(2))
                backmask = torch.ones(ori_label.size(0), longerside, longerside).cuda()*255
                backmask[0, :ori_label.size(1), :ori_label.size(2)] = ori_label
                label = backmask.clone().long()
            else:
                label = label.unsqueeze(0)

            # assert label.shape == (473, 473)

        if self.p_negative > 0:
            if torch.rand(1).item() < self.p_negative:
                while True:
                    idx = torch.randint(0, len(self.dataset.data_list), (1,)).item()
                    _, _, s_x, s_y, subcls_list_tmp, _ = self.dataset[idx]
                    if subcls_list[0] != subcls_list_tmp[0]:
                        break

        s_x = s_x[0]
        s_y = (s_y == 1)[0]
        label_fg = (label == 1).float()
        val_mask = (label != 255).float()

        class_id = self.class_list[subcls_list[0]]

        label_name = PASCAL_CLASSES[class_id][0]
        label_add = ()
        mask = self.mask

        if mask == 'text':
            support = ('a photo of a ' + label_name + '.',)
        elif mask == 'separate':
            support = (s_x, s_y)
        else:
            if mask.startswith('text_and_'):
                label_add = (label_name,)
                mask = mask[9:]

            support = (blend_image_segmentation(s_x, s_y.float(), mask)[0],)

        return (image,) + label_add + support, (label_fg.unsqueeze(0), val_mask.unsqueeze(0), subcls_list[0])
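A usage sketch for `PFEPascalWrapper` (editor's addition, not part of the commit; it assumes PFENet under `third_party/PFENet`, the hard-coded Pascal-5i list files and data paths above, and, for `mode='val'`, a CUDA device because of the `.cuda()` call in `__getitem__`):

```python
# Hypothetical usage sketch; paths and third_party/PFENet must exist as hard-coded above.
from datasets.pfe_dataset import PFEPascalWrapper

ds = PFEPascalWrapper(mode='val', split=0, mask='text', image_size=473)
(query_img, prompt), (label_fg, val_mask, class_idx) = ds[0]
# prompt is 'a photo of a <class name>.'; with mask='separate' the support image and
# boolean support mask are returned in place of the text prompt.
print(prompt, label_fg.shape, val_mask.shape, int(class_idx))
```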
    	
        clipseg/datasets/phrasecut.py
    ADDED
    
@@ -0,0 +1,335 @@

import torch
import numpy as np
import os

from os.path import join, isdir, isfile, expanduser
from PIL import Image

from torchvision import transforms
from torchvision.transforms.transforms import Resize

from torch.nn import functional as nnf
from general_utils import get_from_repository

from skimage.draw import polygon2mask


def random_crop_slices(origin_size, target_size):
    """Gets slices of a random crop. """
    assert origin_size[0] >= target_size[0] and origin_size[1] >= target_size[1], f'actual size: {origin_size}, target size: {target_size}'

    offset_y = torch.randint(0, origin_size[0] - target_size[0] + 1, (1,)).item()  # range: 0 <= value < high
    offset_x = torch.randint(0, origin_size[1] - target_size[1] + 1, (1,)).item()

    return slice(offset_y, offset_y + target_size[0]), slice(offset_x, offset_x + target_size[1])


def find_crop(seg, image_size, iterations=1000, min_frac=None, best_of=None):

    best_crops = []
    best_crop_not_ok = float('-inf'), None, None
    min_sum = 0

    seg = seg.astype('bool')

    if min_frac is not None:
        #min_sum = seg.sum() * min_frac
        min_sum = seg.shape[0] * seg.shape[1] * min_frac

    for iteration in range(iterations):
        sl_y, sl_x = random_crop_slices(seg.shape, image_size)
        seg_ = seg[sl_y, sl_x]
        sum_seg_ = seg_.sum()

        if sum_seg_ > min_sum:

            if best_of is None:
                return sl_y, sl_x, False
            else:
                best_crops += [(sum_seg_, sl_y, sl_x)]
                if len(best_crops) >= best_of:
                    best_crops.sort(key=lambda x: x[0], reverse=True)
                    sl_y, sl_x = best_crops[0][1:]

                    return sl_y, sl_x, False

        else:
            if sum_seg_ > best_crop_not_ok[0]:
                best_crop_not_ok = sum_seg_, sl_y, sl_x

    else:
        # return best segmentation found
        return best_crop_not_ok[1:] + (best_crop_not_ok[0] <= min_sum,)


class PhraseCut(object):

    def __init__(self, split, image_size=400, negative_prob=0, aug=None, aug_color=False, aug_crop=True,
                 min_size=0, remove_classes=None, with_visual=False, only_visual=False, mask=None):
        super().__init__()

        self.negative_prob = negative_prob
        self.image_size = image_size
        self.with_visual = with_visual
        self.only_visual = only_visual
        self.phrase_form = '{}'
        self.mask = mask
        self.aug_crop = aug_crop

        if aug_color:
            self.aug_color = transforms.Compose([
                transforms.ColorJitter(0.5, 0.5, 0.2, 0.05),
            ])
        else:
            self.aug_color = None

        get_from_repository('PhraseCut', ['PhraseCut.tar'], integrity_check=lambda local_dir: all([
            isdir(join(local_dir, 'VGPhraseCut_v0')),
            isdir(join(local_dir, 'VGPhraseCut_v0', 'images')),
            isfile(join(local_dir, 'VGPhraseCut_v0', 'refer_train.json')),
            len(os.listdir(join(local_dir, 'VGPhraseCut_v0', 'images'))) in {108250, 108249}
        ]))

        from third_party.PhraseCutDataset.utils.refvg_loader import RefVGLoader
        self.refvg_loader = RefVGLoader(split=split)

        # img_ids where the size in the annotations does not match actual size
        invalid_img_ids = set([150417, 285665, 498246, 61564, 285743, 498269, 498010, 150516, 150344, 286093, 61530,
                               150333, 286065, 285814, 498187, 285761, 498042])

        mean = [0.485, 0.456, 0.406]
        std = [0.229, 0.224, 0.225]
        self.normalize = transforms.Normalize(mean, std)

        self.sample_ids = [(i, j)
                           for i in self.refvg_loader.img_ids
                           for j in range(len(self.refvg_loader.get_img_ref_data(i)['phrases']))
                           if i not in invalid_img_ids]

        # self.all_phrases = list(set([p for i in self.refvg_loader.img_ids for p in self.refvg_loader.get_img_ref_data(i)['phrases']]))

        from nltk.stem import WordNetLemmatizer
        wnl = WordNetLemmatizer()

        # Filter by class (if remove_classes is set)
        if remove_classes is None:
            pass
        else:
            from datasets.generate_lvis_oneshot import PASCAL_SYNSETS, traverse_lemmas, traverse_lemmas_hypo
            from nltk.corpus import wordnet

            print('remove pascal classes...')

            get_data = self.refvg_loader.get_img_ref_data  # shortcut
            keep_sids = None

            if remove_classes[0] == 'pas5i':
                subset_id = remove_classes[1]
                from datasets.generate_lvis_oneshot import PASCAL_5I_SYNSETS_ORDERED, PASCAL_5I_CLASS_IDS
                avoid = [PASCAL_5I_SYNSETS_ORDERED[i] for i in range(20) if i+1 not in PASCAL_5I_CLASS_IDS[subset_id]]

            elif remove_classes[0] == 'zs':
                stop = remove_classes[1]

                from datasets.pascal_zeroshot import PASCAL_VOC_CLASSES_ZS

                avoid = [c for class_set in PASCAL_VOC_CLASSES_ZS[:stop] for c in class_set]
                print(avoid)

            elif remove_classes[0] == 'aff':
                # avoid = ['drink.v.01', 'sit.v.01', 'ride.v.02']
                # all_lemmas = set(['drink', 'sit', 'ride'])
                avoid = ['drink', 'drinks', 'drinking', 'sit', 'sits', 'sitting',
                         'ride', 'rides', 'riding',
                         'fly', 'flies', 'flying', 'drive', 'drives', 'driving', 'driven',
                         'swim', 'swims', 'swimming',
                         'wheels', 'wheel', 'legs', 'leg', 'ear', 'ears']
                keep_sids = [(i, j) for i, j in self.sample_ids if
                             all(x not in avoid for x in get_data(i)['phrases'][j].split(' '))]

            print('avoid classes:', avoid)

            if keep_sids is None:
                all_lemmas = [s for ps in avoid for s in traverse_lemmas_hypo(wordnet.synset(ps), max_depth=None)]
                all_lemmas = list(set(all_lemmas))
                all_lemmas = [h.replace('_', ' ').lower() for h in all_lemmas]
                all_lemmas = set(all_lemmas)

                # divide into multi word and single word
                all_lemmas_s = set(l for l in all_lemmas if ' ' not in l)
                all_lemmas_m = set(l for l in all_lemmas if l not in all_lemmas_s)

                # new3
                phrases = [get_data(i)['phrases'][j] for i, j in self.sample_ids]
                remove_sids = set((i,j) for (i,j), phrase in zip(self.sample_ids, phrases)
                                  if any(l in phrase for l in all_lemmas_m) or
                                  len(set(wnl.lemmatize(w) for w in phrase.split(' ')).intersection(all_lemmas_s)) > 0
                )
                keep_sids = [(i, j) for i, j in self.sample_ids if (i,j) not in remove_sids]

            print(f'Reduced to {len(keep_sids) / len(self.sample_ids):.3f}')
            removed_ids = set(self.sample_ids) - set(keep_sids)

            print('Examples of removed', len(removed_ids))
            for i, j in list(removed_ids)[:20]:
                print(i, get_data(i)['phrases'][j])

            self.sample_ids = keep_sids

        from itertools import groupby
        samples_by_phrase = [(self.refvg_loader.get_img_ref_data(i)['phrases'][j], (i, j))
                             for i, j in self.sample_ids]
        samples_by_phrase = sorted(samples_by_phrase)
        samples_by_phrase = groupby(samples_by_phrase, key=lambda x: x[0])

        self.samples_by_phrase = {prompt: [s[1] for s in prompt_sample_ids] for prompt, prompt_sample_ids in samples_by_phrase}

        self.all_phrases = list(set(self.samples_by_phrase.keys()))

        if self.only_visual:
            assert self.with_visual
            self.sample_ids = [(i, j) for i, j in self.sample_ids
                               if len(self.samples_by_phrase[self.refvg_loader.get_img_ref_data(i)['phrases'][j]]) > 1]

        # Filter by size (if min_size is set)
        sizes = [self.refvg_loader.get_img_ref_data(i)['gt_boxes'][j] for i, j in self.sample_ids]
        image_sizes = [self.refvg_loader.get_img_ref_data(i)['width'] * self.refvg_loader.get_img_ref_data(i)['height'] for i, j in self.sample_ids]
        #self.sizes = [sum([(s[2] - s[0]) * (s[3] - s[1]) for s in size]) for size in sizes]
        self.sizes = [sum([s[2] * s[3] for s in size]) / img_size for size, img_size in zip(sizes, image_sizes)]

        if min_size:
            print('filter by size')

        self.sample_ids = [self.sample_ids[i] for i in range(len(self.sample_ids)) if self.sizes[i] > min_size]

        self.base_path = join(expanduser('~/datasets/PhraseCut/VGPhraseCut_v0/images/'))

    def __len__(self):
        return len(self.sample_ids)

    def load_sample(self, sample_i, j):

        img_ref_data = self.refvg_loader.get_img_ref_data(sample_i)

        polys_phrase0 = img_ref_data['gt_Polygons'][j]
        phrase = img_ref_data['phrases'][j]
        phrase = self.phrase_form.format(phrase)

        masks = []
        for polys in polys_phrase0:
            for poly in polys:
                poly = [p[::-1] for p in poly]  # swap x,y
                masks += [polygon2mask((img_ref_data['height'], img_ref_data['width']), poly)]

        seg = np.stack(masks).max(0)
        img = np.array(Image.open(join(self.base_path, str(img_ref_data['image_id']) + '.jpg')))

        min_shape = min(img.shape[:2])

        if self.aug_crop:
            sly, slx, exceed = find_crop(seg, (min_shape, min_shape), iterations=50, min_frac=0.05)
        else:
            sly, slx = slice(0, None), slice(0, None)

        seg = seg[sly, slx]
        img = img[sly, slx]

        seg = seg.astype('uint8')
        seg = torch.from_numpy(seg).view(1, 1, *seg.shape)

        if img.ndim == 2:
            img = np.dstack([img] * 3)

        img = torch.from_numpy(img).permute(2,0,1).unsqueeze(0).float()

        seg = nnf.interpolate(seg, (self.image_size, self.image_size), mode='nearest')[0,0]
        img = nnf.interpolate(img, (self.image_size, self.image_size), mode='bilinear', align_corners=True)[0]

        # img = img.permute([2,0, 1])
        img = img / 255.0

        if self.aug_color is not None:
            img = self.aug_color(img)

        img = self.normalize(img)

        return img, seg, phrase

    def __getitem__(self, i):

        sample_i, j = self.sample_ids[i]

        img, seg, phrase = self.load_sample(sample_i, j)

        if self.negative_prob > 0:
            if torch.rand((1,)).item() < self.negative_prob:

                new_phrase = None
                while new_phrase is None or new_phrase == phrase:
                    idx = torch.randint(0, len(self.all_phrases), (1,)).item()
                    new_phrase = self.all_phrases[idx]
                phrase = new_phrase
                seg = torch.zeros_like(seg)

        if self.with_visual:
            # find a corresponding visual image
            if phrase in self.samples_by_phrase and len(self.samples_by_phrase[phrase]) > 1:
                idx = torch.randint(0, len(self.samples_by_phrase[phrase]), (1,)).item()
                other_sample = self.samples_by_phrase[phrase][idx]
                #print(other_sample)
                img_s, seg_s, _ = self.load_sample(*other_sample)

                from datasets.utils import blend_image_segmentation

                if self.mask in {'separate', 'text_and_separate'}:
                    # assert img.shape[1:] == img_s.shape[1:] == seg_s.shape == seg.shape[1:]
                    add_phrase = [phrase] if self.mask == 'text_and_separate' else []
                    vis_s = add_phrase + [img_s, seg_s, True]
                else:
                    if self.mask.startswith('text_and_'):
                        mask_mode = self.mask[9:]
                        label_add = [phrase]
                    else:
                        mask_mode = self.mask
                        label_add = []

                    masked_img_s = torch.from_numpy(blend_image_segmentation(img_s, seg_s, mode=mask_mode, image_size=self.image_size)[0])
                    vis_s = label_add + [masked_img_s, True]

            else:
                # phrase is unique
                vis_s = torch.zeros_like(img)

                if self.mask in {'separate', 'text_and_separate'}:
                    add_phrase = [phrase] if self.mask == 'text_and_separate' else []
                    vis_s = add_phrase + [vis_s, torch.zeros(*vis_s.shape[1:], dtype=torch.uint8), False]
                elif self.mask.startswith('text_and_'):
                    vis_s = [phrase, vis_s, False]
                else:
                    vis_s = [vis_s, False]
        else:
            assert self.mask == 'text'
            vis_s = [phrase]

        seg = seg.unsqueeze(0).float()

        data_x = (img,) + tuple(vis_s)

        return data_x, (seg, torch.zeros(0), i)


class PhraseCutPlus(PhraseCut):

    def __init__(self, split, image_size=400, aug=None, aug_color=False, aug_crop=True, min_size=0, remove_classes=None, only_visual=False, mask=None):
        super().__init__(split, image_size=image_size, negative_prob=0.2, aug=aug, aug_color=aug_color, aug_crop=aug_crop, min_size=min_size,
                         remove_classes=remove_classes, with_visual=True, only_visual=only_visual, mask=mask)
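A usage sketch for the text-only configuration of `PhraseCut` (editor's addition, not part of the commit; it assumes the PhraseCut data archive, the `third_party/PhraseCutDataset` loader and NLTK's WordNet resources are available as the constructor requires):

```python
# Hypothetical usage sketch for the text-only setting (mask='text', no visual prompt).
from datasets.phrasecut import PhraseCut

ds = PhraseCut('train', image_size=352, mask='text', negative_prob=0.2)
(img, phrase), (seg, _, idx) = ds[0]
# img: (3, 352, 352) normalized float tensor, phrase: referring expression string,
# seg: (1, 352, 352) float mask (all zeros when a negative phrase was sampled).
print(phrase, img.shape, seg.shape)
```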
    	
        clipseg/datasets/utils.py
    ADDED
    
@@ -0,0 +1,68 @@

import numpy as np
import torch


def blend_image_segmentation(img, seg, mode, image_size=224):

    if mode in {'blur_highlight', 'blur3_highlight', 'blur3_highlight01', 'blur_highlight_random', 'crop'}:
        if isinstance(img, np.ndarray):
            img = torch.from_numpy(img)

        if isinstance(seg, np.ndarray):
            seg = torch.from_numpy(seg)

    if mode == 'overlay':
        out = img * seg
        out = [out.astype('float32')]
    elif mode == 'highlight':
        out = img * seg[None, :, :] * 0.85 + 0.15 * img
        out = [out.astype('float32')]
    elif mode == 'highlight2':
        img = img / 2
        out = (img+0.1) * seg[None, :, :] + 0.3 * img
        out = [out.astype('float32')]
    elif mode == 'blur_highlight':
        from evaluation_utils import img_preprocess
        out = [img_preprocess((None, [img], [seg]), blur=1, bg_fac=0.5).numpy()[0] - 0.01]
    elif mode == 'blur3_highlight':
        from evaluation_utils import img_preprocess
        out = [img_preprocess((None, [img], [seg]), blur=3, bg_fac=0.5).numpy()[0] - 0.01]
    elif mode == 'blur3_highlight01':
        from evaluation_utils import img_preprocess
        out = [img_preprocess((None, [img], [seg]), blur=3, bg_fac=0.1).numpy()[0] - 0.01]
    elif mode == 'blur_highlight_random':
        from evaluation_utils import img_preprocess
        out = [img_preprocess((None, [img], [seg]), blur=0 + torch.randint(0, 3, (1,)).item(), bg_fac=0.1 + 0.8*torch.rand(1).item()).numpy()[0] - 0.01]
    elif mode == 'crop':
        from evaluation_utils import img_preprocess
        out = [img_preprocess((None, [img], [seg]), blur=1, center_context=0.1, image_size=image_size)[0].numpy()]
    elif mode == 'crop_blur_highlight':
        from evaluation_utils import img_preprocess
        out = [img_preprocess((None, [img], [seg]), blur=3, center_context=0.1, bg_fac=0.1, image_size=image_size)[0].numpy()]
    elif mode == 'crop_blur_highlight352':
        from evaluation_utils import img_preprocess
        out = [img_preprocess((None, [img], [seg]), blur=3, center_context=0.1, bg_fac=0.1, image_size=352)[0].numpy()]
    elif mode == 'shape':
        out = [np.stack([seg[:, :]]*3).astype('float32')]
    elif mode == 'concat':
        out = [np.concatenate([img, seg[None, :, :]]).astype('float32')]
    elif mode == 'image_only':
        out = [img.astype('float32')]
    elif mode == 'image_black':
        out = [img.astype('float32')*0]
    elif mode is None:
        out = [img.astype('float32')]
    elif mode == 'separate':
        out = [img.astype('float32'), seg.astype('int64')]
    elif mode == 'separate_img_black':
        out = [img.astype('float32')*0, seg.astype('int64')]
    elif mode == 'separate_seg_ones':
        out = [img.astype('float32'), np.ones_like(seg).astype('int64')]
    elif mode == 'separate_both_black':
        out = [img.astype('float32')*0, seg.astype('int64')*0]
    else:
        raise ValueError(f'invalid mode: {mode}')

    return out
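A small shape-only sketch of what the simpler, array-based modes of `blend_image_segmentation` produce (editor's addition with random data; it deliberately avoids the modes that depend on `evaluation_utils.img_preprocess`):

```python
# Hypothetical shape check for the array-based modes ('highlight', 'concat', 'separate').
import numpy as np
from datasets.utils import blend_image_segmentation

img = np.random.rand(3, 224, 224).astype('float32')            # CHW image
seg = (np.random.rand(224, 224) > 0.5).astype('float32')       # binary mask

print(blend_image_segmentation(img, seg, 'highlight')[0].shape)   # (3, 224, 224)
print(blend_image_segmentation(img, seg, 'concat')[0].shape)      # (4, 224, 224): image + mask channel
print([o.shape for o in blend_image_segmentation(img, seg, 'separate')])  # [(3, 224, 224), (224, 224)]
```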
    	
        clipseg/environment.yml
    ADDED
    
@@ -0,0 +1,15 @@
name: clipseg-environment
channels:
  - conda-forge
  - pytorch
dependencies:
  - numpy
  - scipy
  - matplotlib-base
  - pip
  - pip:
    - --find-links https://download.pytorch.org/whl/torch_stable.html
    - torch==1.10.0+cpu
    - torchvision==0.11.1+cpu
    - opencv-python
    - git+https://github.com/openai/CLIP.git
    	
        clipseg/evaluation_utils.py
    ADDED
    
    @@ -0,0 +1,292 @@
from torch.functional import Tensor
from general_utils import load_model
from torch.utils.data import DataLoader
import torch
import numpy as np

def denorm(img):

    np_input = False
    if isinstance(img, np.ndarray):
        img = torch.from_numpy(img)
        np_input = True

    mean = torch.Tensor([0.485, 0.456, 0.406])
    std = torch.Tensor([0.229, 0.224, 0.225])

    img_denorm = (img*std[:,None,None]) + mean[:,None,None]

    if np_input:
        img_denorm = np.clip(img_denorm.numpy(), 0, 1)
    else:
        img_denorm = torch.clamp(img_denorm, 0, 1)

    return img_denorm


def norm(img):
    mean = torch.Tensor([0.485, 0.456, 0.406])
    std = torch.Tensor([0.229, 0.224, 0.225])
    return (img - mean[:,None,None]) / std[:,None,None]
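# Illustrative sketch (not part of evaluation_utils.py): norm/denorm apply and undo the
# standard ImageNet normalization, so chaining them is, up to clamping, the identity:
#     x = torch.rand(3, 224, 224)                      # CHW image in [0, 1]
#     assert torch.allclose(denorm(norm(x)), x, atol=1e-6)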

def fast_iou_curve(p, g):

    g = g[p.sort().indices]
    p = torch.sigmoid(p.sort().values)

    scores = []
    vals = np.linspace(0, 1, 50)

    for q in vals:

        n = int(len(g) * q)

        valid = torch.where(p > q)[0]
        if len(valid) > 0:
            n = int(valid[0])
        else:
            n = len(g)

        fn = g[:n].sum()
        tn = n - fn
        tp = g[n:].sum()
        fp = len(g) - n - tp

        iou = tp / (tp + fn + fp)

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)

        scores += [iou]

    return vals, scores


def fast_rp_curve(p, g):

    g = g[p.sort().indices]
    p = torch.sigmoid(p.sort().values)

    precisions, recalls = [], []
    vals = np.linspace(p.min(), p.max(), 250)

    for q in p[::100000]:

        n = int(len(g) * q)

        valid = torch.where(p > q)[0]
        if len(valid) > 0:
            n = int(valid[0])
        else:
            n = len(g)

        fn = g[:n].sum()
        tn = n - fn
        tp = g[n:].sum()
        fp = len(g) - n - tp

        iou = tp / (tp + fn + fp)

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)

        precisions += [precision]
        recalls += [recall]

    return recalls, precisions
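# Illustrative sketch (not part of evaluation_utils.py): both curve helpers expect flat tensors of
# raw logits `p` and binary ground-truth labels `g` of the same length, e.g.:
#     p = torch.randn(200_000)                     # predicted logits
#     g = (torch.rand(200_000) > 0.5).long()       # binary ground truth
#     vals, ious = fast_iou_curve(p, g)
#     recalls, precisions = fast_rp_curve(p, g)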

# Image processing

def img_preprocess(batch, blur=0, grayscale=False, center_context=None, rect=False, rect_color=(255,0,0), rect_width=2,
                   brightness=1.0, bg_fac=1, colorize=False, outline=False, image_size=224):
    import cv2

    rw = rect_width

    out = []
    for img, mask in zip(batch[1], batch[2]):

        img = img.cpu() if isinstance(img, torch.Tensor) else torch.from_numpy(img)
        mask = mask.cpu() if isinstance(mask, torch.Tensor) else torch.from_numpy(mask)

        img *= brightness
        img_bl = img
        if blur > 0: # best 5
            img_bl = torch.from_numpy(cv2.GaussianBlur(img.permute(1,2,0).numpy(), (15, 15), blur)).permute(2,0,1)

        if grayscale:
            img_bl = img_bl[1][None]

        #img_inp = img_ratio*img*mask + (1-img_ratio)*img_bl
        # img_inp = img_ratio*img*mask + (1-img_ratio)*img_bl * (1-mask)
        img_inp = img*mask + (bg_fac) * img_bl * (1-mask)

        if rect:
            _, bbox = crop_mask(img, mask, context=0.1)
            img_inp[:, bbox[2]: bbox[3], max(0, bbox[0]-rw):bbox[0]+rw] = torch.tensor(rect_color)[:,None,None]
            img_inp[:, bbox[2]: bbox[3], max(0, bbox[1]-rw):bbox[1]+rw] = torch.tensor(rect_color)[:,None,None]
            img_inp[:, max(0, bbox[2]-1): bbox[2]+rw, bbox[0]:bbox[1]] = torch.tensor(rect_color)[:,None,None]
            img_inp[:, max(0, bbox[3]-1): bbox[3]+rw, bbox[0]:bbox[1]] = torch.tensor(rect_color)[:,None,None]

        if center_context is not None:
            img_inp = object_crop(img_inp, mask, context=center_context, image_size=image_size)

        if colorize:
            img_gray = denorm(img)
            img_gray = cv2.cvtColor(img_gray.permute(1,2,0).numpy(), cv2.COLOR_RGB2GRAY)
            img_gray = torch.stack([torch.from_numpy(img_gray)]*3)
            img_inp = torch.tensor([1,0.2,0.2])[:,None,None] * img_gray * mask + bg_fac * img_gray * (1-mask)
            img_inp = norm(img_inp)

        if outline:
            cont = cv2.findContours(mask.byte().numpy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            outline_img = np.zeros(mask.shape, dtype=np.uint8)
            cv2.drawContours(outline_img, cont[0], -1, thickness=5, color=(255, 255, 255))
            outline_img = torch.stack([torch.from_numpy(outline_img)]*3).float() / 255.
            img_inp = torch.tensor([1,0,0])[:,None,None] * outline_img + denorm(img_inp) * (1 - outline_img)
            img_inp = norm(img_inp)

        out += [img_inp]

    return torch.stack(out)


def object_crop(img, mask, context=0.0, square=False, image_size=224):
    img_crop, bbox = crop_mask(img, mask, context=context, square=square)
    img_crop = pad_to_square(img_crop, channel_dim=0)
    img_crop = torch.nn.functional.interpolate(img_crop.unsqueeze(0), (image_size, image_size)).squeeze(0)
    return img_crop


def crop_mask(img, mask, context=0.0, square=False):

    assert img.shape[1:] == mask.shape

    bbox = [mask.max(0).values.argmax(), mask.size(0) - mask.max(0).values.flip(0).argmax()]
    bbox += [mask.max(1).values.argmax(), mask.size(1) - mask.max(1).values.flip(0).argmax()]
    bbox = [int(x) for x in bbox]

    width, height = (bbox[3] - bbox[2]), (bbox[1] - bbox[0])

    # square mask
    if square:
        bbox[0] = int(max(0, bbox[0] - context * height))
        bbox[1] = int(min(mask.size(0), bbox[1] + context * height))
        bbox[2] = int(max(0, bbox[2] - context * width))
        bbox[3] = int(min(mask.size(1), bbox[3] + context * width))

        width, height = (bbox[3] - bbox[2]), (bbox[1] - bbox[0])
        if height > width:
            bbox[2] = int(max(0, (bbox[2] - 0.5*height)))
            bbox[3] = bbox[2] + height
        else:
            bbox[0] = int(max(0, (bbox[0] - 0.5*width)))
            bbox[1] = bbox[0] + width
    else:
        bbox[0] = int(max(0, bbox[0] - context * height))
        bbox[1] = int(min(mask.size(0), bbox[1] + context * height))
        bbox[2] = int(max(0, bbox[2] - context * width))
        bbox[3] = int(min(mask.size(1), bbox[3] + context * width))

    width, height = (bbox[3] - bbox[2]), (bbox[1] - bbox[0])
    img_crop = img[:, bbox[2]: bbox[3], bbox[0]: bbox[1]]
    return img_crop, bbox


def pad_to_square(img, channel_dim=2, fill=0):
    """ add padding such that a squared image is returned """

    from torchvision.transforms.functional import pad

    if channel_dim == 2:
        img = img.permute(2, 0, 1)
    elif channel_dim == 0:
        pass
    else:
        raise ValueError('invalid channel_dim')

    h, w = img.shape[1:]
    pady1 = pady2 = padx1 = padx2 = 0

    if h > w:
        padx1 = (h - w) // 2
        padx2 = h - w - padx1
    elif w > h:
        pady1 = (w - h) // 2
        pady2 = w - h - pady1

    img_padded = pad(img, padding=(padx1, pady1, padx2, pady2), padding_mode='constant')

    if channel_dim == 2:
        img_padded = img_padded.permute(1, 2, 0)

    return img_padded
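# Illustrative sketch (not part of evaluation_utils.py): pad_to_square pads the shorter side so
# the result is square; combined with crop_mask it produces the centered object crops used above.
#     img  = torch.rand(3, 200, 300)                                # CHW image
#     mask = torch.zeros(200, 300); mask[50:150, 100:250] = 1       # binary object mask
#     crop = object_crop(img, mask, context=0.1, image_size=224)    # -> tensor of shape (3, 224, 224)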

# qualitative

def split_sentence(inp, limit=9):
    t_new, current_len = [], 0
    for k, t in enumerate(inp.split(' ')):
        current_len += len(t) + 1
        t_new += [t+' ']
        # not last
        if current_len > limit and k != len(inp.split(' ')) - 1:
            current_len = 0
            t_new += ['\n']

    t_new = ''.join(t_new)
    return t_new


from matplotlib import pyplot as plt


def plot(imgs, *preds, labels=None, scale=1, cmap=plt.cm.magma, aps=None, gt_labels=None, vmax=None):

    row_off = 0 if labels is None else 1
    _, ax = plt.subplots(len(imgs) + row_off, 1 + len(preds), figsize=(scale * float(1 + 2*len(preds)), scale * float(len(imgs)*2)))
    [a.axis('off') for a in ax.flatten()]

    if labels is not None:
        for j in range(len(labels)):
            t_new = split_sentence(labels[j], limit=6)
            ax[0, 1+ j].text(0.5, 0.1, t_new, ha='center', fontsize=3+ 10*scale)

    for i in range(len(imgs)):
        ax[i + row_off,0].imshow(imgs[i])
        for j in range(len(preds)):
            img = preds[j][i][0].detach().cpu().numpy()

            if gt_labels is not None and labels[j] == gt_labels[i]:
                print(j, labels[j], gt_labels[i])
                edgecolor = 'red'
                if aps is not None:
                    ax[i + row_off, 1 + j].text(30, 70, f'AP: {aps[i]:.3f}', color='red', fontsize=8)
            else:
                edgecolor = 'k'

            rect = plt.Rectangle([0,0], img.shape[0], img.shape[1], facecolor="none",
                                 edgecolor=edgecolor, linewidth=3)
            ax[i + row_off,1 + j].add_patch(rect)

            if vmax is None:
                this_vmax = 1
            elif vmax == 'per_prompt':
                this_vmax = max([preds[j][_i][0].max() for _i in range(len(imgs))])
            elif vmax == 'per_image':
                this_vmax = max([preds[_j][i][0].max() for _j in range(len(preds))])

            ax[i + row_off,1 + j].imshow(img, vmin=0, vmax=this_vmax, cmap=cmap)

            # ax[i,1 + j].imshow(preds[j][i][0].detach().cpu().numpy(), vmin=preds[j].min(), vmax=preds[j].max())
    plt.tight_layout()
    plt.subplots_adjust(wspace=0.05, hspace=0.05)
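A brief usage sketch for the qualitative helper above (illustration only, not part of the repository; the array shapes are assumptions): `plot` takes a list of HxWx3 images and, per prompt, a tensor of predictions shaped (num_images, 1, H, W):

    import numpy as np, torch
    imgs = [np.random.rand(224, 224, 3)]          # one RGB image in [0, 1]
    heat = torch.rand(1, 1, 224, 224)             # one dummy heatmap per image
    plot(imgs, heat, labels=['a photo of a cat'], scale=2)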
    	
        clipseg/example_image.jpg
    ADDED
    
    	
        clipseg/experiments/ablation.yaml
    ADDED
    
    @@ -0,0 +1,84 @@
configuration:
  batch_size: 64
  optimizer: torch.optim.AdamW

  lr: 0.001

  trainer: experiment_setup.train_loop
  scorer: experiment_setup.score
  model: models.clipseg.CLIPDensePredT

  lr_scheduler: cosine
  T_max: 20000
  eta_min: 0.0001

  max_iterations: 20000    #  <-##########################################
  val_interval: null

  # dataset
  dataset: datasets.phrasecut.PhraseCut   # <-----------------
  split_mode: pascal_test
  split: train
  mask: text_and_crop_blur_highlight352
  image_size: 352
  negative_prob: 0.2
  mix_text_max: 0.5

  # general
  mix: True # <-----------------
  prompt: shuffle+
  norm_cond: True
  mix_text_min: 0.0
  with_visual: True

  # model
  version: 'ViT-B/16'
  extract_layers: [3, 7, 9]
  reduce_dim: 64
  depth: 3
  fix_shift: False            #  <-##########################################

  loss: torch.nn.functional.binary_cross_entropy_with_logits
  amp: True

test_configuration_common:
  normalize: True
  image_size: 352
  batch_size: 32
  sigmoid: True
  split: test
  label_support: True

test_configuration:

  -
    name: pc
    metric: metrics.FixedIntervalMetrics
    test_dataset: phrasecut
    mask: text

  -
    name: pc-vis
    metric: metrics.FixedIntervalMetrics
    test_dataset: phrasecut
    mask: crop_blur_highlight352
    with_visual: True
    visual_only: True


columns: [name,
pc_fgiou_best, pc_miou_best,  pc_fgiou_0.5,
pc-vis_fgiou_best, pc-vis_miou_best,  pc-vis_fgiou_0.5,
duration]


individual_configurations:

- {name: rd64-uni}
- {name: rd64-no-pretrain, not_pretrained: True, lr: 0.0003}
- {name: rd64-no-negatives, negative_prob: 0.0}
- {name: rd64-neg0.5, negative_prob: 0.5}
- {name: rd64-no-visual, with_visual: False, mix: False}
- {name: rd16-uni, reduce_dim: 16}
- {name: rd64-layer3, extract_layers: [3], depth: 1}
- {name: rd64-blur-highlight, mask: text_and_blur_highlight, test_configuration: {mask: blur_highlight}}
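To make the relationship between the blocks above concrete: each entry of individual_configurations is a shallow override of the shared configuration block, merged by the experiment loader in clipseg/general_utils.py (shown further below). A minimal sketch of that merge, with the file path and index as example values:

    import yaml

    cfg = yaml.safe_load(open('clipseg/experiments/ablation.yaml'))
    base = cfg['configuration']
    run = {**base, **cfg['individual_configurations'][1]}    # e.g. rd64-no-pretrain
    print(run['name'], run['lr'])                            # lr: 0.0003 overrides the base 0.001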
    	
        clipseg/experiments/coco.yaml
    ADDED
    
    @@ -0,0 +1,101 @@
configuration:
  batch_size: 64
  optimizer: torch.optim.AdamW

  lr: 0.001

  trainer: experiment_setup.train_loop
  scorer: experiment_setup.score
  model: models.clipseg.CLIPDensePredT

  lr_scheduler: cosine
  T_max: 20000
  eta_min: 0.0001

  max_iterations: 20000
  val_interval: null

  # dataset
  dataset: datasets.coco_wrapper.COCOWrapper
  # split_mode: pascal_test
  split: train
  mask: text_and_blur3_highlight01
  image_size: 352
  normalize: True
  pre_crop_image_size: [sample, 1, 1.5]
  aug: 1new

  # general
  mix: True
  prompt: shuffle+
  norm_cond: True
  mix_text_min: 0.0

  # model
  out: 1
  extract_layers: [3, 7, 9]
  reduce_dim: 64
  depth: 3
  fix_shift: False

  loss: torch.nn.functional.binary_cross_entropy_with_logits
  amp: True

test_configuration_common:
  normalize: True
  image_size: 352
  # max_iterations: 10
  batch_size: 8
  sigmoid: True
  test_dataset: coco
  metric: metrics.FixedIntervalMetrics

test_configuration:

  -
    name: coco_t
    mask: text

  -
    name: coco_h
    mask: blur3_highlight01

  -
    name: coco_h2
    mask: crop_blur_highlight352


columns: [i, name,
coco_t_fgiou_best, coco_t_miou_best,  coco_t_fgiou_0.5,
coco_h_fgiou_best, coco_h_miou_best,  coco_h_fgiou_0.5,
coco_h2_fgiou_best, coco_h2_miou_best,  coco_h2_fgiou_0.5, coco_h2_fgiou_best_t,
train_loss, duration, date
]

individual_configurations:


- {name: rd64-7K-vit16-cbh-coco-0, version: 'ViT-B/16', fold: 0, reduce_dim: 64, mask: text_and_crop_blur_highlight352, T_max: 7000, max_iterations: 7000}
- {name: rd64-7K-vit16-cbh-coco-1, version: 'ViT-B/16', fold: 1, reduce_dim: 64, mask: text_and_crop_blur_highlight352, T_max: 7000, max_iterations: 7000}
- {name: rd64-7K-vit16-cbh-coco-2, version: 'ViT-B/16', fold: 2, reduce_dim: 64, mask: text_and_crop_blur_highlight352, T_max: 7000, max_iterations: 7000}
- {name: rd64-7K-vit16-cbh-coco-3, version: 'ViT-B/16', fold: 3, reduce_dim: 64, mask: text_and_crop_blur_highlight352, T_max: 7000, max_iterations: 7000}


- {name: rd64-7K-vit16-cbh-neg0.2-coco-0, version: 'ViT-B/16', negative_prob: 0.2, fold: 0, reduce_dim: 64, mask: text_and_crop_blur_highlight352, T_max: 7000, max_iterations: 7000}
- {name: rd64-7K-vit16-cbh-neg0.2-coco-1, version: 'ViT-B/16', negative_prob: 0.2, fold: 1, reduce_dim: 64, mask: text_and_crop_blur_highlight352, T_max: 7000, max_iterations: 7000}
- {name: rd64-7K-vit16-cbh-neg0.2-coco-2, version: 'ViT-B/16', negative_prob: 0.2, fold: 2, reduce_dim: 64, mask: text_and_crop_blur_highlight352, T_max: 7000, max_iterations: 7000}
- {name: rd64-7K-vit16-cbh-neg0.2-coco-3, version: 'ViT-B/16', negative_prob: 0.2, fold: 3, reduce_dim: 64, mask: text_and_crop_blur_highlight352, T_max: 7000, max_iterations: 7000}


# ViT
- {name: vit64-7K-vit16-cbh-coco-0, version: 'ViT-B/16', model: models.vitseg.VITDensePredT, fold: 0, reduce_dim: 64, mask: text_and_crop_blur_highlight352, T_max: 7000, max_iterations: 7000, lr: 0.0001}
- {name: vit64-7K-vit16-cbh-coco-1, version: 'ViT-B/16', model: models.vitseg.VITDensePredT, fold: 1, reduce_dim: 64, mask: text_and_crop_blur_highlight352, T_max: 7000, max_iterations: 7000, lr: 0.0001}
- {name: vit64-7K-vit16-cbh-coco-2, version: 'ViT-B/16', model: models.vitseg.VITDensePredT, fold: 2, reduce_dim: 64, mask: text_and_crop_blur_highlight352, T_max: 7000, max_iterations: 7000, lr: 0.0001}
- {name: vit64-7K-vit16-cbh-coco-3, version: 'ViT-B/16', model: models.vitseg.VITDensePredT, fold: 3, reduce_dim: 64, mask: text_and_crop_blur_highlight352, T_max: 7000, max_iterations: 7000, lr: 0.0001}


# BASELINE
- {name: bl64-7K-vit16-cbh-neg0.2-coco-0, model: models.clipseg.CLIPDenseBaseline, reduce2_dim: 64, version: 'ViT-B/16', negative_prob: 0.2, fold: 0, reduce_dim: 64, mask: text_and_crop_blur_highlight352, T_max: 7000, max_iterations: 7000}
- {name: bl64-7K-vit16-cbh-neg0.2-coco-1, model: models.clipseg.CLIPDenseBaseline, reduce2_dim: 64, version: 'ViT-B/16', negative_prob: 0.2, fold: 1, reduce_dim: 64, mask: text_and_crop_blur_highlight352, T_max: 7000, max_iterations: 7000}
- {name: bl64-7K-vit16-cbh-neg0.2-coco-2, model: models.clipseg.CLIPDenseBaseline, reduce2_dim: 64, version: 'ViT-B/16', negative_prob: 0.2, fold: 2, reduce_dim: 64, mask: text_and_crop_blur_highlight352, T_max: 7000, max_iterations: 7000}
- {name: bl64-7K-vit16-cbh-neg0.2-coco-3, model: models.clipseg.CLIPDenseBaseline, reduce2_dim: 64, version: 'ViT-B/16', negative_prob: 0.2, fold: 3, reduce_dim: 64, mask: text_and_crop_blur_highlight352, T_max: 7000, max_iterations: 7000}
    	
        clipseg/experiments/pascal_1shot.yaml
    ADDED
    
    @@ -0,0 +1,101 @@
configuration:
  batch_size: 64
  optimizer: torch.optim.AdamW

  lr: 0.001

  trainer: experiment_setup.train_loop
  scorer: experiment_setup.score
  model: models.clipseg.CLIPDensePredT

  lr_scheduler: cosine
  T_max: 20000
  eta_min: 0.0001

  max_iterations: 20000    #  <-##########################################
  val_interval: null

  # dataset
  dataset: datasets.phrasecut.PhraseCut
  split_mode: pascal_test
  mode: train
  mask: text_and_crop_blur_highlight352
  image_size: 352
  normalize: True
  pre_crop_image_size: [sample, 1, 1.5]
  aug: 1new
  with_visual: True
  split: train

  # general
  mix: True
  prompt: shuffle+
  norm_cond: True
  mix_text_min: 0.0

  # model
  out: 1
  version: 'ViT-B/16'
  extract_layers: [3, 7, 9]
  reduce_dim: 64
  depth: 3

  loss: torch.nn.functional.binary_cross_entropy_with_logits
  amp: True

test_configuration_common:
  normalize: True
  image_size: 352
  metric: metrics.FixedIntervalMetrics
  batch_size: 1
  test_dataset: pascal
  sigmoid: True
  # max_iterations: 250

test_configuration:

  -
    name: pas_t
    mask: text

  -
    name: pas_h
    mask: blur3_highlight01

  -
    name: pas_h2
    mask: crop_blur_highlight352


columns: [name,
pas_t_fgiou_best, pas_t_miou_best,  pas_t_fgiou_ct,
pas_h_fgiou_best, pas_h_miou_best,  pas_h_fgiou_ct,
pas_h2_fgiou_best, pas_h2_miou_best,  pas_h2_fgiou_ct, pas_h2_fgiou_best_t,
train_loss, duration, date
]

individual_configurations:

- {name: rd64-uni-phrasepas5i-0, remove_classes: [pas5i, 0], negative_prob: 0.2, mix_text_max: 0.5, test_configuration: {splits: [0], custom_threshold: 0.24}}
- {name: rd64-uni-phrasepas5i-1, remove_classes: [pas5i, 1], negative_prob: 0.2, mix_text_max: 0.5, test_configuration: {splits: [1], custom_threshold: 0.24}}
- {name: rd64-uni-phrasepas5i-2, remove_classes: [pas5i, 2], negative_prob: 0.2, mix_text_max: 0.5, test_configuration: {splits: [2], custom_threshold: 0.24}}
- {name: rd64-uni-phrasepas5i-3, remove_classes: [pas5i, 3], negative_prob: 0.2, mix_text_max: 0.5, test_configuration: {splits: [3], custom_threshold: 0.24}}


- {name: rd64-phrasepas5i-0, remove_classes: [pas5i, 0], negative_prob: 0.0, test_configuration: {splits: [0], custom_threshold: 0.28}}
- {name: rd64-phrasepas5i-1, remove_classes: [pas5i, 1], negative_prob: 0.0, test_configuration: {splits: [1], custom_threshold: 0.28}}
- {name: rd64-phrasepas5i-2, remove_classes: [pas5i, 2], negative_prob: 0.0, test_configuration: {splits: [2], custom_threshold: 0.28}}
- {name: rd64-phrasepas5i-3, remove_classes: [pas5i, 3], negative_prob: 0.0, test_configuration: {splits: [3], custom_threshold: 0.28}}


# baseline
- {name: bl64-phrasepas5i-0, model: models.clipseg.CLIPDenseBaseline, remove_classes: [pas5i, 0], reduce2_dim: 64, negative_prob: 0.0, test_configuration: {splits: [0], custom_threshold: 0.24}}
- {name: bl64-phrasepas5i-1, model: models.clipseg.CLIPDenseBaseline, remove_classes: [pas5i, 1], reduce2_dim: 64, negative_prob: 0.0, test_configuration: {splits: [1], custom_threshold: 0.24}}
- {name: bl64-phrasepas5i-2, model: models.clipseg.CLIPDenseBaseline, remove_classes: [pas5i, 2], reduce2_dim: 64, negative_prob: 0.0, test_configuration: {splits: [2], custom_threshold: 0.24}}
- {name: bl64-phrasepas5i-3, model: models.clipseg.CLIPDenseBaseline, remove_classes: [pas5i, 3], reduce2_dim: 64, negative_prob: 0.0, test_configuration: {splits: [3], custom_threshold: 0.24}}

# ViT
- {name: vit64-uni-phrasepas5i-0, remove_classes: [pas5i, 0], model: models.vitseg.VITDensePredT, negative_prob: 0.2, mix_text_max: 0.5, lr: 0.0001, test_configuration: {splits: [0], custom_threshold: 0.02}}
- {name: vit64-uni-phrasepas5i-1, remove_classes: [pas5i, 1], model: models.vitseg.VITDensePredT, negative_prob: 0.2, mix_text_max: 0.5, lr: 0.0001, test_configuration: {splits: [1], custom_threshold: 0.02}}
- {name: vit64-uni-phrasepas5i-2, remove_classes: [pas5i, 2], model: models.vitseg.VITDensePredT, negative_prob: 0.2, mix_text_max: 0.5, lr: 0.0001, test_configuration: {splits: [2], custom_threshold: 0.02}}
- {name: vit64-uni-phrasepas5i-3, remove_classes: [pas5i, 3], model: models.vitseg.VITDensePredT, negative_prob: 0.2, mix_text_max: 0.5, lr: 0.0001, test_configuration: {splits: [3], custom_threshold: 0.02}}
    	
        clipseg/experiments/phrasecut.yaml
    ADDED
    
    @@ -0,0 +1,80 @@
configuration:
  batch_size: 64
  optimizer: torch.optim.AdamW

  lr: 0.001

  trainer: experiment_setup.train_loop
  scorer: experiment_setup.score
  model: models.clipseg.CLIPDensePredT

  lr_scheduler: cosine
  T_max: 20000
  eta_min: 0.0001

  max_iterations: 20000
  val_interval: null

  # dataset
  dataset: datasets.phrasecut.PhraseCut   # <-----------------
  split_mode: pascal_test
  split: train
  mask: text_and_crop_blur_highlight352
  image_size: 352
  normalize: True
  pre_crop_image_size: [sample, 1, 1.5]
  aug: 1new

  # general
  mix: False # <-----------------
  prompt: shuffle+
  norm_cond: True
  mix_text_min: 0.0

  # model
  out: 1
  extract_layers: [3, 7, 9]
  reduce_dim: 64
  depth: 3
  fix_shift: False

  loss: torch.nn.functional.binary_cross_entropy_with_logits
  amp: True

test_configuration_common:
  normalize: True
  image_size: 352
  batch_size: 32
  # max_iterations: 5
  # max_iterations: 150

test_configuration:

  -
    name: pc  # old: phrasecut
    metric: metrics.FixedIntervalMetrics
    test_dataset: phrasecut
    split: test
    mask: text
    label_support: True
    sigmoid: True


columns: [i, name, pc_miou_0.3, pc_fgiou_0.3, pc_fgiou_0.5, pc_ap, duration, date]


individual_configurations:

# important ones


- {name: rd64-uni, version: 'ViT-B/16', reduce_dim: 64, with_visual: True, negative_prob: 0.2, mix: True, mix_text_max: 0.5}

# this one was accidentally trained using the old mask
- {name: rd128-vit16-phrasecut, version: 'ViT-B/16', reduce_dim: 128, mask: text_and_blur3_highlight01}
- {name: rd64-uni-novis, version: 'ViT-B/16', reduce_dim: 64, with_visual: False, negative_prob: 0.2, mix: False}
# this one was accidentally trained using the old mask
- {name: baseline3-vit16-phrasecut, model: models.clipseg.CLIPDenseBaseline, version: 'ViT-B/16', reduce_dim: 64, reduce2_dim: 64, mask: text_and_blur3_highlight01}

- {name: vit64-uni, version: 'ViT-B/16', model: models.vitseg.VITDensePredT, reduce_dim: 64, with_visual: True, only_visual: True, negative_prob: 0.2, mask: crop_blur_highlight352, lr: 0.0003}
- {name: vit64-uni-novis, version: 'ViT-B/16', model: models.vitseg.VITDensePredT, with_visual: False, reduce_dim: 64, lr: 0.0001}
    	
        clipseg/general_utils.py
    ADDED
    
    @@ -0,0 +1,272 @@
| 1 | 
            +
            import json
         | 
| 2 | 
            +
            import inspect
         | 
| 3 | 
            +
            import torch
         | 
| 4 | 
            +
            import os
         | 
| 5 | 
            +
            import sys
         | 
| 6 | 
            +
            import yaml
         | 
| 7 | 
            +
            from shutil import copy, copytree
         | 
| 8 | 
            +
            from os.path import join, dirname, realpath, expanduser, isfile, isdir, basename
         | 
| 9 | 
            +
             | 
| 10 | 
            +
             | 
| 11 | 
            +
            class Logger(object):
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                def __getattr__(self, k):
         | 
| 14 | 
            +
                    return print
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            log = Logger()
         | 
| 17 | 
            +
             | 
| 18 | 
            +
            def training_config_from_cli_args():
         | 
| 19 | 
            +
                experiment_name = sys.argv[1]
         | 
| 20 | 
            +
                experiment_id = int(sys.argv[2])
         | 
| 21 | 
            +
             | 
| 22 | 
            +
                yaml_config = yaml.load(open(f'experiments/{experiment_name}'), Loader=yaml.SafeLoader)
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                config = yaml_config['configuration']
         | 
| 25 | 
            +
                config = {**config, **yaml_config['individual_configurations'][experiment_id]}
         | 
| 26 | 
            +
                config = AttributeDict(config)
         | 
| 27 | 
            +
                return config
         | 
| 28 | 
            +
             | 
| 29 | 
            +
             | 
| 30 | 
            +
            def score_config_from_cli_args():
         | 
| 31 | 
            +
                experiment_name = sys.argv[1]
         | 
| 32 | 
            +
                experiment_id = int(sys.argv[2])
         | 
| 33 | 
            +
                
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                yaml_config = yaml.load(open(f'experiments/{experiment_name}'), Loader=yaml.SafeLoader)
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                config = yaml_config['test_configuration_common']
         | 
| 38 | 
            +
             | 
| 39 | 
            +
                if type(yaml_config['test_configuration']) == list:
         | 
| 40 | 
            +
                    test_id = int(sys.argv[3])
         | 
| 41 | 
            +
                    config = {**config, **yaml_config['test_configuration'][test_id]}
         | 
| 42 | 
            +
                else:
         | 
| 43 | 
            +
                    config = {**config, **yaml_config['test_configuration']}
         | 
| 44 | 
            +
             | 
| 45 | 
            +
                if 'test_configuration' in yaml_config['individual_configurations'][experiment_id]:
         | 
| 46 | 
            +
                    config = {**config, **yaml_config['individual_configurations'][experiment_id]['test_configuration']}
         | 
| 47 | 
            +
             | 
| 48 | 
            +
                train_checkpoint_id = yaml_config['individual_configurations'][experiment_id]['name']
         | 
| 49 | 
            +
             | 
| 50 | 
            +
                config = AttributeDict(config)
         | 
| 51 | 
            +
                return config, train_checkpoint_id
         | 
| 52 | 
            +
             | 
| 53 | 
            +
             | 
| 54 | 
            +
            def get_from_repository(local_name, repo_files, integrity_check=None, repo_dir='~/dataset_repository', 
         | 
| 55 | 
            +
                                    local_dir='~/datasets'):
         | 
| 56 | 
            +
                """ copies files from repository to local folder.
         | 
| 57 | 
            +
                
         | 
| 58 | 
            +
                repo_files: list of filenames or list of tuples [filename, target path] 
         | 
| 59 | 
            +
             | 
| 60 | 
            +
    e.g. get_from_repository('MyDataset', [['data/dataset1.tar', 'other/path/ds03.tar']])
         | 
| 61 | 
            +
                will create a folder 'MyDataset' in local_dir, and extract the content of
         | 
| 62 | 
            +
                '<repo_dir>/data/dataset1.tar' to <local_dir>/MyDataset/other/path.
         | 
| 63 | 
            +
                 """
         | 
| 64 | 
            +
             | 
| 65 | 
            +
                local_dir = realpath(join(expanduser(local_dir), local_name))
         | 
| 66 | 
            +
             | 
| 67 | 
            +
                dataset_exists = True
         | 
| 68 | 
            +
             | 
| 69 | 
            +
                # check if folder is available
         | 
| 70 | 
            +
                if not isdir(local_dir):
         | 
| 71 | 
            +
                    dataset_exists = False
         | 
| 72 | 
            +
             | 
| 73 | 
            +
                if integrity_check is not None:
         | 
| 74 | 
            +
                    try:
         | 
| 75 | 
            +
                        integrity_ok = integrity_check(local_dir)
         | 
| 76 | 
            +
                    except BaseException:
         | 
| 77 | 
            +
                        integrity_ok = False
         | 
| 78 | 
            +
             | 
| 79 | 
            +
                    if integrity_ok:
         | 
| 80 | 
            +
                        log.hint('Passed custom integrity check')
         | 
| 81 | 
            +
                    else:
         | 
| 82 | 
            +
                        log.hint('Custom integrity check failed')
         | 
| 83 | 
            +
             | 
| 84 | 
            +
                    dataset_exists = dataset_exists and integrity_ok
         | 
| 85 | 
            +
             | 
| 86 | 
            +
                if not dataset_exists:
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                    repo_dir = realpath(expanduser(repo_dir))
         | 
| 89 | 
            +
             | 
| 90 | 
            +
                    for i, filename in enumerate(repo_files):
         | 
| 91 | 
            +
             | 
| 92 | 
            +
                        if type(filename) == str:
         | 
| 93 | 
            +
                            origin, target = filename, filename
         | 
| 94 | 
            +
                            archive_target = join(local_dir, basename(origin))
         | 
| 95 | 
            +
                            extract_target = join(local_dir)
         | 
| 96 | 
            +
                        else:
         | 
| 97 | 
            +
                            origin, target = filename
         | 
| 98 | 
            +
                            archive_target = join(local_dir, dirname(target), basename(origin))
         | 
| 99 | 
            +
                            extract_target = join(local_dir, dirname(target))
         | 
| 100 | 
            +
                        
         | 
| 101 | 
            +
                        archive_origin = join(repo_dir, origin)
         | 
| 102 | 
            +
             | 
| 103 | 
            +
                        log.hint(f'copy: {archive_origin} to {archive_target}')
         | 
| 104 | 
            +
             | 
| 105 | 
            +
                        # make sure the path exists
         | 
| 106 | 
            +
                        os.makedirs(dirname(archive_target), exist_ok=True)
         | 
| 107 | 
            +
             | 
| 108 | 
            +
                        if os.path.isfile(archive_target):
         | 
| 109 | 
            +
                            # only copy if size differs
         | 
| 110 | 
            +
                            if os.path.getsize(archive_target) != os.path.getsize(archive_origin):
         | 
| 111 | 
            +
                                log.hint(f'file exists but filesize differs: target {os.path.getsize(archive_target)} vs. origin {os.path.getsize(archive_origin)}')
         | 
| 112 | 
            +
                                copy(archive_origin, archive_target)
         | 
| 113 | 
            +
                        else:
         | 
| 114 | 
            +
                            copy(archive_origin, archive_target)
         | 
| 115 | 
            +
             | 
| 116 | 
            +
                        extract_archive(archive_target, extract_target, noarchive_ok=True)
         | 
| 117 | 
            +
             | 
| 118 | 
            +
                        # concurrent processes might have deleted the file
         | 
| 119 | 
            +
                        if os.path.isfile(archive_target):
         | 
| 120 | 
            +
                            os.remove(archive_target)
         | 
| 121 | 
            +
             | 
| 122 | 
            +
             | 
| 123 | 
            +
            def extract_archive(filename, target_folder=None, noarchive_ok=False):
         | 
| 124 | 
            +
                from subprocess import run, PIPE
         | 
| 125 | 
            +
             | 
| 126 | 
            +
                if filename.endswith('.tgz') or filename.endswith('.tar'):
         | 
| 127 | 
            +
                    command = f'tar -xf {filename}'
         | 
| 128 | 
            +
                    command += f' -C {target_folder}' if target_folder is not None else ''
         | 
| 129 | 
            +
                elif filename.endswith('.tar.gz'):
         | 
| 130 | 
            +
                    command = f'tar -xzf {filename}'
         | 
| 131 | 
            +
                    command += f' -C {target_folder}' if target_folder is not None else ''
         | 
| 132 | 
            +
                elif filename.endswith('zip'):
         | 
| 133 | 
            +
                    command = f'unzip {filename}'
         | 
| 134 | 
            +
                    command += f' -d {target_folder}' if target_folder is not None else ''
         | 
| 135 | 
            +
                else:
         | 
| 136 | 
            +
                    if noarchive_ok:
         | 
| 137 | 
            +
                        return
         | 
| 138 | 
            +
                    else:
         | 
| 139 | 
            +
            raise ValueError(f'unsupported file ending of {filename}')
         | 
| 140 | 
            +
             | 
| 141 | 
            +
                log.hint(command)
         | 
| 142 | 
            +
                result = run(command.split(), stdout=PIPE, stderr=PIPE)
         | 
| 143 | 
            +
                if result.returncode != 0:
         | 
| 144 | 
            +
                    print(result.stdout, result.stderr)
         | 
| 145 | 
            +
             | 
| 146 | 
            +
             | 
| 147 | 
            +
            class AttributeDict(dict):
         | 
| 148 | 
            +
                """ 
         | 
| 149 | 
            +
    An extended dictionary that allows access to elements as attributes and counts 
         | 
| 150 | 
            +
                these accesses. This way, we know if some attributes were never used. 
         | 
| 151 | 
            +
                """
         | 
| 152 | 
            +
             | 
| 153 | 
            +
                def __init__(self, *args, **kwargs):
         | 
| 154 | 
            +
                    from collections import Counter
         | 
| 155 | 
            +
                    super().__init__(*args, **kwargs)
         | 
| 156 | 
            +
                    self.__dict__['counter'] = Counter()
         | 
| 157 | 
            +
             | 
| 158 | 
            +
                def __getitem__(self, k):
         | 
| 159 | 
            +
                    self.__dict__['counter'][k] += 1
         | 
| 160 | 
            +
                    return super().__getitem__(k)
         | 
| 161 | 
            +
             | 
| 162 | 
            +
                def __getattr__(self, k):
         | 
| 163 | 
            +
                    self.__dict__['counter'][k] += 1
         | 
| 164 | 
            +
                    return super().get(k)
         | 
| 165 | 
            +
             | 
| 166 | 
            +
                def __setattr__(self, k, v):
         | 
| 167 | 
            +
                    return super().__setitem__(k, v)
         | 
| 168 | 
            +
             | 
| 169 | 
            +
    def __delattr__(self, k):
         | 
| 170 | 
            +
        return super().__delitem__(k)
         | 
| 171 | 
            +
             | 
| 172 | 
            +
                def unused_keys(self, exceptions=()):
         | 
| 173 | 
            +
                    return [k for k in super().keys() if self.__dict__['counter'][k] == 0 and k not in exceptions]
         | 
| 174 | 
            +
             | 
| 175 | 
            +
                def assume_no_unused_keys(self, exceptions=()):
         | 
| 176 | 
            +
                    if len(self.unused_keys(exceptions=exceptions)) > 0:
         | 
| 177 | 
            +
                        log.warning('Unused keys:', self.unused_keys(exceptions=exceptions))
         | 
| 178 | 
            +
             | 
| 179 | 
            +
             | 
| 180 | 
            +
            def get_attribute(name):
         | 
| 181 | 
            +
                import importlib
         | 
| 182 | 
            +
             | 
| 183 | 
            +
                if name is None:
         | 
| 184 | 
            +
                    raise ValueError('The provided attribute is None')
         | 
| 185 | 
            +
                
         | 
| 186 | 
            +
                name_split = name.split('.')
         | 
| 187 | 
            +
                mod = importlib.import_module('.'.join(name_split[:-1]))
         | 
| 188 | 
            +
                return getattr(mod, name_split[-1])
         | 
| 189 | 
            +
             | 
| 190 | 
            +
             | 
| 191 | 
            +
             | 
| 192 | 
            +
            def filter_args(input_args, default_args):
         | 
| 193 | 
            +
             | 
| 194 | 
            +
                updated_args = {k: input_args[k] if k in input_args else v for k, v in default_args.items()}
         | 
| 195 | 
            +
                used_args = {k: v for k, v in input_args.items() if k in default_args}
         | 
| 196 | 
            +
                unused_args = {k: v for k, v in input_args.items() if k not in default_args}
         | 
| 197 | 
            +
             | 
| 198 | 
            +
                return AttributeDict(updated_args), AttributeDict(used_args), AttributeDict(unused_args)
         | 
| 199 | 
            +
             | 
| 200 | 
            +
             | 
| 201 | 
            +
            def load_model(checkpoint_id, weights_file=None, strict=True, model_args='from_config', with_config=False):
         | 
| 202 | 
            +
             | 
| 203 | 
            +
                config = json.load(open(join('logs', checkpoint_id, 'config.json')))
         | 
| 204 | 
            +
             | 
| 205 | 
            +
                if model_args != 'from_config' and type(model_args) != dict:
         | 
| 206 | 
            +
                    raise ValueError('model_args must either be "from_config" or a dictionary of values')
         | 
| 207 | 
            +
             | 
| 208 | 
            +
                model_cls = get_attribute(config['model'])
         | 
| 209 | 
            +
             | 
| 210 | 
            +
                # load model
         | 
| 211 | 
            +
                if model_args == 'from_config':
         | 
| 212 | 
            +
                    _, model_args, _ = filter_args(config, inspect.signature(model_cls).parameters)
         | 
| 213 | 
            +
             | 
| 214 | 
            +
                model = model_cls(**model_args)
         | 
| 215 | 
            +
             | 
| 216 | 
            +
                if weights_file is None:
         | 
| 217 | 
            +
                    weights_file = realpath(join('logs', checkpoint_id, 'weights.pth'))
         | 
| 218 | 
            +
                else:
         | 
| 219 | 
            +
                    weights_file = realpath(join('logs', checkpoint_id, weights_file))
         | 
| 220 | 
            +
             | 
| 221 | 
            +
                if isfile(weights_file):
         | 
| 222 | 
            +
                    weights = torch.load(weights_file)
         | 
| 223 | 
            +
                    for _, w in weights.items():
         | 
| 224 | 
            +
                        assert not torch.any(torch.isnan(w)), 'weights contain NaNs'
         | 
| 225 | 
            +
                    model.load_state_dict(weights, strict=strict)
         | 
| 226 | 
            +
                else:
         | 
| 227 | 
            +
                    raise FileNotFoundError(f'model checkpoint {weights_file} was not found')
         | 
| 228 | 
            +
             | 
| 229 | 
            +
                if with_config:
         | 
| 230 | 
            +
                    return model, config
         | 
| 231 | 
            +
                
         | 
| 232 | 
            +
                return model
         | 
| 233 | 
            +
             | 
| 234 | 
            +
             | 
| 235 | 
            +
            class TrainingLogger(object):
         | 
| 236 | 
            +
             | 
| 237 | 
            +
                def __init__(self, model, log_dir, config=None, *args):
         | 
| 238 | 
            +
                    super().__init__()
         | 
| 239 | 
            +
                    self.model = model
         | 
| 240 | 
            +
                    self.base_path = join(f'logs/{log_dir}') if log_dir is not None else None
         | 
| 241 | 
            +
             | 
| 242 | 
            +
                    os.makedirs('logs/', exist_ok=True)
         | 
| 243 | 
            +
                    os.makedirs(self.base_path, exist_ok=True)
         | 
| 244 | 
            +
             | 
| 245 | 
            +
                    if config is not None:
         | 
| 246 | 
            +
                        json.dump(config, open(join(self.base_path, 'config.json'), 'w'))
         | 
| 247 | 
            +
             | 
| 248 | 
            +
                def iter(self, i, **kwargs):
         | 
| 249 | 
            +
                    if i % 100 == 0 and 'loss' in kwargs:
         | 
| 250 | 
            +
                        loss = kwargs['loss']
         | 
| 251 | 
            +
                        print(f'iteration {i}: loss {loss:.4f}')
         | 
| 252 | 
            +
             | 
| 253 | 
            +
                def save_weights(self, only_trainable=False, weight_file='weights.pth'):
         | 
| 254 | 
            +
                    if self.model is None:
         | 
| 255 | 
            +
            raise AttributeError('You need to provide a model reference when initializing TrainingLogger to save weights.')
         | 
| 256 | 
            +
             | 
| 257 | 
            +
                    weights_path = join(self.base_path, weight_file)
         | 
| 258 | 
            +
             | 
| 259 | 
            +
                    weight_dict = self.model.state_dict()
         | 
| 260 | 
            +
             | 
| 261 | 
            +
                    if only_trainable:
         | 
| 262 | 
            +
                        weight_dict = {n: weight_dict[n] for n, p in self.model.named_parameters() if p.requires_grad}
         | 
| 263 | 
            +
                    
         | 
| 264 | 
            +
                    torch.save(weight_dict, weights_path)
         | 
| 265 | 
            +
                    log.info(f'Saved weights to {weights_path}')
         | 
| 266 | 
            +
             | 
| 267 | 
            +
                def __enter__(self):
         | 
| 268 | 
            +
                    return self
         | 
| 269 | 
            +
             | 
| 270 | 
            +
                def __exit__(self, type, value, traceback):
         | 
| 271 | 
            +
                    """ automatically stop processes if used in a context manager """
         | 
| 272 | 
            +
                    pass        
         | 
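
Usage note: AttributeDict counts every key or attribute access, so unused_keys() reports configuration values a run never read; load_model() likewise assumes a logs/<checkpoint_id>/ folder containing config.json and weights.pth. A minimal sketch of the access counting (names and values are illustrative):

    from general_utils import AttributeDict

    config = AttributeDict({'lr': 0.001, 'batch_size': 32, 'unused_flag': True})
    _ = config.lr                # counted via __getattr__
    _ = config['batch_size']     # counted via __getitem__
    print(config.unused_keys())  # -> ['unused_flag']
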
    	
        clipseg/metrics.py
    ADDED
    
    | @@ -0,0 +1,271 @@ | |
| 1 | 
            +
            from torch.functional import Tensor
         | 
| 2 | 
            +
            from general_utils import log
         | 
| 3 | 
            +
            from collections import defaultdict
         | 
| 4 | 
            +
            import numpy as np
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            import torch
         | 
| 7 | 
            +
            from torch.nn import functional as nnf
         | 
| 8 | 
            +
             | 
| 9 | 
            +
             | 
| 10 | 
            +
            class BaseMetric(object):
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                def __init__(self, metric_names, pred_range=None, gt_index=0, pred_index=0, eval_intermediate=True,
         | 
| 13 | 
            +
                             eval_validation=True):
         | 
| 14 | 
            +
                    self._names = tuple(metric_names)
         | 
| 15 | 
            +
                    self._eval_intermediate = eval_intermediate
         | 
| 16 | 
            +
                    self._eval_validation = eval_validation
         | 
| 17 | 
            +
             | 
| 18 | 
            +
                    self._pred_range = pred_range
         | 
| 19 | 
            +
                    self._pred_index = pred_index
         | 
| 20 | 
            +
                    self._gt_index = gt_index
         | 
| 21 | 
            +
             | 
| 22 | 
            +
                    self.predictions = []
         | 
| 23 | 
            +
                    self.ground_truths = []
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                def eval_intermediate(self):
         | 
| 26 | 
            +
                    return self._eval_intermediate
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                def eval_validation(self):
         | 
| 29 | 
            +
                    return self._eval_validation
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                def names(self):
         | 
| 32 | 
            +
                    return self._names
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                def add(self, predictions, ground_truth):
         | 
| 35 | 
            +
                    raise NotImplementedError
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                def value(self):
         | 
| 38 | 
            +
                    raise NotImplementedError
         | 
| 39 | 
            +
             | 
| 40 | 
            +
                def scores(self):
         | 
| 41 | 
            +
                    # similar to value but returns dict
         | 
| 42 | 
            +
                    value = self.value()
         | 
| 43 | 
            +
                    if type(value) == dict:
         | 
| 44 | 
            +
                        return value
         | 
| 45 | 
            +
                    else:
         | 
| 46 | 
            +
                        assert type(value) in {list, tuple}
         | 
| 47 | 
            +
                        return list(zip(self.names(), self.value()))
         | 
| 48 | 
            +
             | 
| 49 | 
            +
                def _get_pred_gt(self, predictions, ground_truth):
         | 
| 50 | 
            +
                    pred = predictions[self._pred_index]
         | 
| 51 | 
            +
                    gt = ground_truth[self._gt_index]
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                    if self._pred_range is not None:
         | 
| 54 | 
            +
                        pred = pred[:, self._pred_range[0]: self._pred_range[1]]
         | 
| 55 | 
            +
             | 
| 56 | 
            +
                    return pred, gt
         | 
| 57 | 
            +
             | 
| 58 | 
            +
             | 
| 59 | 
            +
            class FixedIntervalMetrics(BaseMetric):
         | 
| 60 | 
            +
             | 
| 61 | 
            +
                def __init__(self, sigmoid=False, ignore_mask=False, resize_to=None, 
         | 
| 62 | 
            +
                             resize_pred=None, n_values=51, custom_threshold=None):
         | 
| 63 | 
            +
             | 
| 64 | 
            +
             | 
| 65 | 
            +
                    super().__init__(('ap', 'best_fgiou', 'best_miou', 'fgiou0.5', 'fgiou0.1', 'mean_iou_0p5', 'mean_iou_0p1', 'best_biniou', 'biniou_0.5', 'fgiou_thresh'))
         | 
| 66 | 
            +
                    self.intersections = []
         | 
| 67 | 
            +
                    self.unions = []
         | 
| 68 | 
            +
                    # self.threshold = threshold
         | 
| 69 | 
            +
                    self.sigmoid = sigmoid
         | 
| 70 | 
            +
                    self.resize_to = resize_to
         | 
| 71 | 
            +
                    self.resize_pred = resize_pred  # resize prediction to match ground truth
         | 
| 72 | 
            +
                    self.class_count = defaultdict(lambda: 0)
         | 
| 73 | 
            +
                    self.per_class = defaultdict(lambda : [0,0])
         | 
| 74 | 
            +
                    self.ignore_mask = ignore_mask
         | 
| 75 | 
            +
                    self.custom_threshold = custom_threshold
         | 
| 76 | 
            +
             | 
| 77 | 
            +
                    self.scores_ap = []
         | 
| 78 | 
            +
                    self.scores_iou = []
         | 
| 79 | 
            +
                    self.gts, self.preds = [], []
         | 
| 80 | 
            +
                    self.classes = []
         | 
| 81 | 
            +
             | 
| 82 | 
            +
                    # [1:-1] ignores 0 and 1
         | 
| 83 | 
            +
                    self.threshold_values = np.linspace(0, 1, n_values)[1:-1]
         | 
| 84 | 
            +
             | 
| 85 | 
            +
                    self.metrics = dict(tp=[], fp=[], fn=[], tn=[])
         | 
| 86 | 
            +
             | 
| 87 | 
            +
                def add(self, pred, gt):
         | 
| 88 | 
            +
                    
         | 
| 89 | 
            +
                    pred_batch = pred[0].cpu()
         | 
| 90 | 
            +
             | 
| 91 | 
            +
                    if self.sigmoid:
         | 
| 92 | 
            +
                        pred_batch = torch.sigmoid(pred_batch)
         | 
| 93 | 
            +
             | 
| 94 | 
            +
                    gt_batch = gt[0].cpu()
         | 
| 95 | 
            +
                    mask_batch = gt[1] if len(gt) > 1 and not self.ignore_mask and gt[1].numel() > 0 else ([None] * len(pred_batch))
         | 
| 96 | 
            +
                    cls_batch = gt[2] if len(gt) > 2 else [None] * len(pred_batch)
         | 
| 97 | 
            +
             | 
| 98 | 
            +
                    if self.resize_to is not None:
         | 
| 99 | 
            +
                        gt_batch = nnf.interpolate(gt_batch, self.resize_to, mode='nearest')
         | 
| 100 | 
            +
                        pred_batch = nnf.interpolate(pred_batch, self.resize_to, mode='bilinear', align_corners=False)
         | 
| 101 | 
            +
                    
         | 
| 102 | 
            +
                    if isinstance(cls_batch, torch.Tensor):
         | 
| 103 | 
            +
                        cls_batch = cls_batch.cpu().numpy().tolist()
         | 
| 104 | 
            +
             | 
| 105 | 
            +
                    assert len(gt_batch) == len(pred_batch) == len(cls_batch), f'{len(gt_batch)} {len(pred_batch)} {len(cls_batch)}'
         | 
| 106 | 
            +
             | 
| 107 | 
            +
                    for predictions, ground_truth, mask, cls in zip(pred_batch, gt_batch, mask_batch, cls_batch):
         | 
| 108 | 
            +
             | 
| 109 | 
            +
                        if self.resize_pred:
         | 
| 110 | 
            +
                            predictions = nnf.interpolate(predictions.unsqueeze(0).float(), size=ground_truth.size()[-2:], mode='bilinear', align_corners=True)
         | 
| 111 | 
            +
             | 
| 112 | 
            +
                        p = predictions.flatten()
         | 
| 113 | 
            +
                        g = ground_truth.flatten()
         | 
| 114 | 
            +
             | 
| 115 | 
            +
                        assert len(p) == len(g)
         | 
| 116 | 
            +
             | 
| 117 | 
            +
                        if mask is not None:
         | 
| 118 | 
            +
                            m = mask.flatten().bool()
         | 
| 119 | 
            +
                            p = p[m]
         | 
| 120 | 
            +
                            g = g[m]
         | 
| 121 | 
            +
             | 
| 122 | 
            +
                        p_sorted = p.sort()
         | 
| 123 | 
            +
                        p = p_sorted.values
         | 
| 124 | 
            +
                        g = g[p_sorted.indices]
         | 
| 125 | 
            +
             | 
| 126 | 
            +
                        tps, fps, fns, tns = [], [], [], []
         | 
| 127 | 
            +
                        for thresh in self.threshold_values:
         | 
| 128 | 
            +
             | 
| 129 | 
            +
                            valid = torch.where(p > thresh)[0]
         | 
| 130 | 
            +
                            if len(valid) > 0:
         | 
| 131 | 
            +
                                n = int(valid[0])
         | 
| 132 | 
            +
                            else:
         | 
| 133 | 
            +
                                n = len(g)
         | 
| 134 | 
            +
             | 
| 135 | 
            +
                            fn = int(g[:n].sum())
         | 
| 136 | 
            +
                            tp = int(g[n:].sum())
         | 
| 137 | 
            +
                            fns += [fn]
         | 
| 138 | 
            +
                            tns += [n - fn]
         | 
| 139 | 
            +
                            tps += [tp]
         | 
| 140 | 
            +
                            fps += [len(g) - n - tp]
         | 
| 141 | 
            +
             | 
| 142 | 
            +
                        self.metrics['tp'] += [tps]
         | 
| 143 | 
            +
                        self.metrics['fp'] += [fps]
         | 
| 144 | 
            +
                        self.metrics['fn'] += [fns]
         | 
| 145 | 
            +
                        self.metrics['tn'] += [tns]
         | 
| 146 | 
            +
             | 
| 147 | 
            +
                        self.classes += [cls.item() if isinstance(cls, torch.Tensor) else cls]
         | 
| 148 | 
            +
             | 
| 149 | 
            +
                def value(self):
         | 
| 150 | 
            +
             | 
| 151 | 
            +
                    import time
         | 
| 152 | 
            +
                    t_start = time.time()   
         | 
| 153 | 
            +
             | 
| 154 | 
            +
                    if set(self.classes) == set([None]):
         | 
| 155 | 
            +
                        all_classes = None
         | 
| 156 | 
            +
                        log.warning('classes were not provided, cannot compute mIoU')
         | 
| 157 | 
            +
                    else:
         | 
| 158 | 
            +
                        all_classes = set(int(c) for c in self.classes)
         | 
| 159 | 
            +
                        # log.info(f'compute metrics for {len(all_classes)} classes')
         | 
| 160 | 
            +
             | 
| 161 | 
            +
                    summed = {k: [sum([self.metrics[k][i][j] 
         | 
| 162 | 
            +
                                       for i in range(len(self.metrics[k]))])
         | 
| 163 | 
            +
                                  for j in range(len(self.threshold_values))]
         | 
| 164 | 
            +
                              for k in self.metrics.keys()}
         | 
| 165 | 
            +
             | 
| 166 | 
            +
                    if all_classes is not None:
         | 
| 167 | 
            +
             | 
| 168 | 
            +
                        assert len(self.classes) == len(self.metrics['tp']) == len(self.metrics['fn'])
         | 
| 169 | 
            +
                        # group by class
         | 
| 170 | 
            +
                        metrics_by_class = {c: {k: [] for k in self.metrics.keys()} for c in all_classes}
         | 
| 171 | 
            +
                        for i in range(len(self.metrics['tp'])):
         | 
| 172 | 
            +
                            for k in self.metrics.keys():
         | 
| 173 | 
            +
                                metrics_by_class[self.classes[i]][k] += [self.metrics[k][i]]
         | 
| 174 | 
            +
                        
         | 
| 175 | 
            +
                        # sum over all instances within the classes
         | 
| 176 | 
            +
                        summed_by_cls = {k: {c: np.array(metrics_by_class[c][k]).sum(0).tolist() for c in all_classes} for k in self.metrics.keys()}
         | 
| 177 | 
            +
             | 
| 178 | 
            +
             | 
| 179 | 
            +
                    # Compute average precision
         | 
| 180 | 
            +
             | 
| 181 | 
            +
        assert (np.array(summed['fp']) + np.array(summed['tp'])).sum(), 'no predictions were made'
         | 
| 182 | 
            +
             | 
| 183 | 
            +
                    # only consider values where a prediction is made
         | 
| 184 | 
            +
                    precisions = [summed['tp'][j] / (1 + summed['tp'][j] + summed['fp'][j]) for j in range(len(self.threshold_values))
         | 
| 185 | 
            +
                                  if summed['tp'][j] + summed['fp'][j] > 0]
         | 
| 186 | 
            +
                    recalls = [summed['tp'][j] / (1 + summed['tp'][j] + summed['fn'][j]) for j in range(len(self.threshold_values))
         | 
| 187 | 
            +
                                       if summed['tp'][j] + summed['fp'][j] > 0]
         | 
| 188 | 
            +
             | 
| 189 | 
            +
                    # remove duplicate recall-precision-pairs (and sort by recall value)
         | 
| 190 | 
            +
                    recalls, precisions = zip(*sorted(list(set(zip(recalls, precisions))), key=lambda x: x[0]))
         | 
| 191 | 
            +
             | 
| 192 | 
            +
                    from scipy.integrate import simps
         | 
| 193 | 
            +
                    ap = simps(precisions, recalls)
         | 
| 194 | 
            +
             | 
| 195 | 
            +
                    # Compute best IoU
         | 
| 196 | 
            +
                    fgiou_scores = [summed['tp'][j] / (1 + summed['tp'][j] + summed['fp'][j] + summed['fn'][j]) for j in range(len(self.threshold_values))]
         | 
| 197 | 
            +
             | 
| 198 | 
            +
                    biniou_scores = [
         | 
| 199 | 
            +
                        0.5*(summed['tp'][j] / (1 + summed['tp'][j] + summed['fp'][j] + summed['fn'][j])) + 
         | 
| 200 | 
            +
                        0.5*(summed['tn'][j] / (1 + summed['tn'][j] + summed['fn'][j] + summed['fp'][j])) 
         | 
| 201 | 
            +
                        for j in range(len(self.threshold_values))
         | 
| 202 | 
            +
                    ]
         | 
| 203 | 
            +
                    
         | 
| 204 | 
            +
                    index_0p5 = self.threshold_values.tolist().index(0.5)
         | 
| 205 | 
            +
                    index_0p1 = self.threshold_values.tolist().index(0.1)
         | 
| 206 | 
            +
                    index_0p2 = self.threshold_values.tolist().index(0.2)
         | 
| 207 | 
            +
                    index_0p3 = self.threshold_values.tolist().index(0.3)
         | 
| 208 | 
            +
             | 
| 209 | 
            +
                    if self.custom_threshold is not None:
         | 
| 210 | 
            +
                        index_ct = self.threshold_values.tolist().index(self.custom_threshold)
         | 
| 211 | 
            +
             | 
| 212 | 
            +
                    if all_classes is not None:
         | 
| 213 | 
            +
                        # mean IoU
         | 
| 214 | 
            +
                        mean_ious = [np.mean([summed_by_cls['tp'][c][j] / (1 + summed_by_cls['tp'][c][j] + summed_by_cls['fp'][c][j] + summed_by_cls['fn'][c][j]) 
         | 
| 215 | 
            +
                                        for c in all_classes])
         | 
| 216 | 
            +
                                    for j in range(len(self.threshold_values))]
         | 
| 217 | 
            +
             | 
| 218 | 
            +
                        mean_iou_dict = {
         | 
| 219 | 
            +
                            'miou_best': max(mean_ious) if all_classes is not None else None,
         | 
| 220 | 
            +
                            'miou_0.5': mean_ious[index_0p5] if all_classes is not None else None,
         | 
| 221 | 
            +
                            'miou_0.1': mean_ious[index_0p1] if all_classes is not None else None,
         | 
| 222 | 
            +
                            'miou_0.2': mean_ious[index_0p2] if all_classes is not None else None,
         | 
| 223 | 
            +
                            'miou_0.3': mean_ious[index_0p3] if all_classes is not None else None,
         | 
| 224 | 
            +
                            'miou_best_t': self.threshold_values[np.argmax(mean_ious)],
         | 
| 225 | 
            +
                            'mean_iou_ct': mean_ious[index_ct] if all_classes is not None and self.custom_threshold is not None else None,
         | 
| 226 | 
            +
                            'mean_iou_scores': mean_ious,
         | 
| 227 | 
            +
                        }
         | 
| 228 | 
            +
             | 
| 229 | 
            +
                    print(f'metric computation on {(len(all_classes) if all_classes is not None else "no")} classes took {time.time() - t_start:.1f}s')
         | 
| 230 | 
            +
             | 
| 231 | 
            +
                    return {
         | 
| 232 | 
            +
                        'ap': ap,
         | 
| 233 | 
            +
             | 
| 234 | 
            +
                        # fgiou
         | 
| 235 | 
            +
                        'fgiou_best': max(fgiou_scores),
         | 
| 236 | 
            +
                        'fgiou_0.5': fgiou_scores[index_0p5],
         | 
| 237 | 
            +
                        'fgiou_0.1': fgiou_scores[index_0p1],
         | 
| 238 | 
            +
                        'fgiou_0.2': fgiou_scores[index_0p2],
         | 
| 239 | 
            +
                        'fgiou_0.3': fgiou_scores[index_0p3],
         | 
| 240 | 
            +
                        'fgiou_best_t': self.threshold_values[np.argmax(fgiou_scores)],
         | 
| 241 | 
            +
             | 
| 242 | 
            +
                        # mean iou
         | 
| 243 | 
            +
             | 
| 244 | 
            +
             | 
| 245 | 
            +
                        # biniou
         | 
| 246 | 
            +
                        'biniou_best': max(biniou_scores),
         | 
| 247 | 
            +
                        'biniou_0.5': biniou_scores[index_0p5],
         | 
| 248 | 
            +
                        'biniou_0.1': biniou_scores[index_0p1],
         | 
| 249 | 
            +
                        'biniou_0.2': biniou_scores[index_0p2],
         | 
| 250 | 
            +
                        'biniou_0.3': biniou_scores[index_0p3],
         | 
| 251 | 
            +
                        'biniou_best_t': self.threshold_values[np.argmax(biniou_scores)],
         | 
| 252 | 
            +
             | 
| 253 | 
            +
                        # custom threshold
         | 
| 254 | 
            +
                        'fgiou_ct': fgiou_scores[index_ct] if self.custom_threshold is not None else None,
         | 
| 255 | 
            +
                        'biniou_ct': biniou_scores[index_ct] if self.custom_threshold is not None else None,
         | 
| 256 | 
            +
                        'ct': self.custom_threshold,
         | 
| 257 | 
            +
             | 
| 258 | 
            +
                        # statistics
         | 
| 259 | 
            +
                        'fgiou_scores': fgiou_scores,
         | 
| 260 | 
            +
                        'biniou_scores': biniou_scores,
         | 
| 261 | 
            +
                        'precision_recall_curve': sorted(list(set(zip(recalls, precisions)))),
         | 
| 262 | 
            +
                        'summed_statistics': summed,
         | 
| 263 | 
            +
            'summed_by_cls_statistics': summed_by_cls if all_classes is not None else None,  # None when no class ids were provided
         | 
| 264 | 
            +
             | 
| 265 | 
            +
            **(mean_iou_dict if all_classes is not None else {})  # mIoU entries only exist when class ids were provided
         | 
| 266 | 
            +
                    }
         | 
| 267 | 
            +
             | 
| 268 | 
            +
                    # ('ap', 'best_fgiou', 'best_miou', 'fgiou0.5', 'fgiou0.1', 'mean_iou_0p5', 'mean_iou_0p1', 'best_biniou', 'biniou_0.5', 'fgiou_thresh'
         | 
| 269 | 
            +
             | 
| 270 | 
            +
                    # return ap, best_fgiou, best_mean_iou, iou_0p5, iou_0p1, mean_iou_0p5, mean_iou_0p1, best_biniou, biniou0p5, best_fgiou_thresh, {'summed': summed, 'summed_by_cls': summed_by_cls}
         | 
| 271 | 
            +
             | 
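
Usage note: FixedIntervalMetrics.add() expects tuples: pred[0] is a batch of logits (passed through a sigmoid when sigmoid=True) and gt holds the binary masks plus an optional loss mask and optional class ids; value() then sweeps thresholds to produce AP, foreground IoU, binary IoU and, when class ids are given, mean IoU. A minimal sketch with random tensors (shapes and class ids are illustrative, and the import assumes the working directory is clipseg/):

    import torch
    from metrics import FixedIntervalMetrics

    metric = FixedIntervalMetrics(sigmoid=True, n_values=51)

    logits = torch.randn(4, 1, 352, 352)                    # raw model outputs
    masks = torch.randint(0, 2, (4, 1, 352, 352)).float()   # binary ground-truth masks
    classes = torch.tensor([0, 1, 0, 1])                    # per-sample class ids (enables mIoU)

    metric.add((logits,), (masks, torch.empty(0), classes))  # empty tensor -> no loss mask
    scores = metric.value()
    print(scores['ap'], scores['fgiou_0.5'], scores['miou_0.5'])
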
    	
        clipseg/models/clipseg.py
    ADDED
    
    | @@ -0,0 +1,552 @@ | |
| 1 | 
            +
            import math
         | 
| 2 | 
            +
            from os.path import basename, dirname, join, isfile
         | 
| 3 | 
            +
            import torch
         | 
| 4 | 
            +
            from torch import nn
         | 
| 5 | 
            +
            from torch.nn import functional as nnf
         | 
| 6 | 
            +
            from torch.nn.modules.activation import ReLU
         | 
| 7 | 
            +
             | 
| 8 | 
            +
             | 
| 9 | 
            +
            def precompute_clip_vectors():
         | 
| 10 | 
            +
             | 
| 11 | 
            +
                from trails.initialization import init_dataset
         | 
| 12 | 
            +
                lvis = init_dataset('LVIS_OneShot3', split='train', mask='text_label', image_size=224, aug=1, normalize=True, 
         | 
| 13 | 
            +
                                                   reduce_factor=None, add_bar=False, negative_prob=0.5)
         | 
| 14 | 
            +
             | 
| 15 | 
            +
                all_names = list(lvis.category_names.values())
         | 
| 16 | 
            +
             | 
| 17 | 
            +
                import clip
         | 
| 18 | 
            +
                from models.clip_prompts import imagenet_templates
         | 
| 19 | 
            +
                clip_model = clip.load("ViT-B/32", device='cuda', jit=False)[0]
         | 
| 20 | 
            +
                prompt_vectors = {}
         | 
| 21 | 
            +
                for name in all_names[:100]:
         | 
| 22 | 
            +
                    with torch.no_grad():
         | 
| 23 | 
            +
                        conditionals = [t.format(name).replace('_', ' ') for t in imagenet_templates]
         | 
| 24 | 
            +
                        text_tokens = clip.tokenize(conditionals).cuda()
         | 
| 25 | 
            +
                        cond = clip_model.encode_text(text_tokens).cpu()
         | 
| 26 | 
            +
                        
         | 
| 27 | 
            +
                        for cond, vec in zip(conditionals, cond):
         | 
| 28 | 
            +
                            prompt_vectors[cond] = vec.cpu()
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                import pickle
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                pickle.dump(prompt_vectors, open('precomputed_prompt_vectors.pickle', 'wb'))
         | 
| 33 | 
            +
             | 
| 34 | 
            +
             | 
| 35 | 
            +
            def get_prompt_list(prompt):
         | 
| 36 | 
            +
                if prompt == 'plain':
         | 
| 37 | 
            +
                    return ['{}']    
         | 
| 38 | 
            +
                elif prompt == 'fixed':
         | 
| 39 | 
            +
                    return ['a photo of a {}.']
         | 
| 40 | 
            +
                elif prompt == 'shuffle':
         | 
| 41 | 
            +
                    return ['a photo of a {}.', 'a photograph of a {}.', 'an image of a {}.', '{}.']
         | 
| 42 | 
            +
                elif prompt == 'shuffle+':
         | 
| 43 | 
            +
                    return ['a photo of a {}.', 'a photograph of a {}.', 'an image of a {}.', '{}.',
         | 
| 44 | 
            +
                                        'a cropped photo of a {}.', 'a good photo of a {}.', 'a photo of one {}.',
         | 
| 45 | 
            +
                                        'a bad photo of a {}.', 'a photo of the {}.']
         | 
| 46 | 
            +
                elif prompt == 'shuffle_clip':
         | 
| 47 | 
            +
                    from models.clip_prompts import imagenet_templates
         | 
| 48 | 
            +
                    return imagenet_templates
         | 
| 49 | 
            +
                else:
         | 
| 50 | 
            +
                    raise ValueError('Invalid value for prompt')        
         | 
| 51 | 
            +
             | 
| 52 | 
            +
             | 
| 53 | 
            +
def forward_multihead_attention(x, b, with_aff=False, attn_mask=None):
    """
    Simplified version of multihead attention (taken from torch source code but without tons of if clauses).
    The mlp and layer norm come from CLIP.
    x: input.
    b: multihead attention module.
    """

    x_ = b.ln_1(x)
    q, k, v = nnf.linear(x_, b.attn.in_proj_weight, b.attn.in_proj_bias).chunk(3, dim=-1)
    tgt_len, bsz, embed_dim = q.size()

    head_dim = embed_dim // b.attn.num_heads
    scaling = float(head_dim) ** -0.5

    q = q.contiguous().view(tgt_len, bsz * b.attn.num_heads, b.attn.head_dim).transpose(0, 1)
    k = k.contiguous().view(-1, bsz * b.attn.num_heads, b.attn.head_dim).transpose(0, 1)
    v = v.contiguous().view(-1, bsz * b.attn.num_heads, b.attn.head_dim).transpose(0, 1)

    q = q * scaling

    attn_output_weights = torch.bmm(q, k.transpose(1, 2))  # n_heads * batch_size, tokens, tokens
    if attn_mask is not None:

        attn_mask_type, attn_mask = attn_mask
        n_heads = attn_output_weights.size(0) // attn_mask.size(0)
        attn_mask = attn_mask.repeat(n_heads, 1)

        if attn_mask_type == 'cls_token':
            # the mask only affects similarities compared to the readout-token.
            attn_output_weights[:, 0, 1:] = attn_output_weights[:, 0, 1:] * attn_mask[None, ...]
            # attn_output_weights[:, 0, 0] = 0 * attn_output_weights[:, 0, 0]

        if attn_mask_type == 'all':
            # print(attn_output_weights.shape, attn_mask[:, None].shape)
            attn_output_weights[:, 1:, 1:] = attn_output_weights[:, 1:, 1:] * attn_mask[:, None]

    attn_output_weights = torch.softmax(attn_output_weights, dim=-1)

    attn_output = torch.bmm(attn_output_weights, v)
    attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
    attn_output = b.attn.out_proj(attn_output)

    x = x + attn_output
    x = x + b.mlp(b.ln_2(x))

    if with_aff:
        return x, attn_output_weights
    else:
        return x

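# Toy sketch of the 'cls_token' masking branch above (illustrative shapes only, not part
# of the model): attention weights are (n_heads * batch, tokens, tokens) and row 0 belongs
# to the readout token, so multiplying that row by a binary mask down-weights masked
# image tokens before the softmax.
import torch
toy_weights = torch.ones(1, 5, 5)                 # 1 head*batch, CLS token + 4 image tokens
toy_mask = torch.tensor([[1., 1., 0., 0.]])       # keep tokens 1-2, suppress tokens 3-4
toy_weights[:, 0, 1:] = toy_weights[:, 0, 1:] * toy_mask
toy_attn = torch.softmax(toy_weights, dim=-1)     # CLS row now favours the unmasked tokens
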
class CLIPDenseBase(nn.Module):

    def __init__(self, version, reduce_cond, reduce_dim, prompt, n_tokens):
        super().__init__()

        import clip

        # prec = torch.FloatTensor
        self.clip_model, _ = clip.load(version, device='cpu', jit=False)
        self.model = self.clip_model.visual

        # if not None, scale conv weights such that we obtain n_tokens.
        self.n_tokens = n_tokens

        for p in self.clip_model.parameters():
            p.requires_grad_(False)

        # conditional
        if reduce_cond is not None:
            self.reduce_cond = nn.Linear(512, reduce_cond)
            for p in self.reduce_cond.parameters():
                p.requires_grad_(False)
        else:
            self.reduce_cond = None

        self.film_mul = nn.Linear(512 if reduce_cond is None else reduce_cond, reduce_dim)
        self.film_add = nn.Linear(512 if reduce_cond is None else reduce_cond, reduce_dim)

        self.reduce = nn.Linear(768, reduce_dim)

        self.prompt_list = get_prompt_list(prompt)

        # precomputed prompts
        import pickle
        if isfile('precomputed_prompt_vectors.pickle'):
            precomp = pickle.load(open('precomputed_prompt_vectors.pickle', 'rb'))
            self.precomputed_prompts = {k: torch.from_numpy(v) for k, v in precomp.items()}
        else:
            self.precomputed_prompts = dict()

    def rescaled_pos_emb(self, new_size):
        assert len(new_size) == 2

        a = self.model.positional_embedding[1:].T.view(1, 768, *self.token_shape)
        b = nnf.interpolate(a, new_size, mode='bicubic', align_corners=False).squeeze(0).view(768, new_size[0]*new_size[1]).T
        return torch.cat([self.model.positional_embedding[:1], b])

    def visual_forward(self, x_inp, extract_layers=(), skip=False, mask=None):

        with torch.no_grad():

            inp_size = x_inp.shape[2:]

            if self.n_tokens is not None:
                stride2 = x_inp.shape[2] // self.n_tokens
                conv_weight2 = nnf.interpolate(self.model.conv1.weight, (stride2, stride2), mode='bilinear', align_corners=True)
                x = nnf.conv2d(x_inp, conv_weight2, bias=self.model.conv1.bias, stride=stride2, dilation=self.model.conv1.dilation)
            else:
                x = self.model.conv1(x_inp)  # shape = [*, width, grid, grid]

            x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
            x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]

            x = torch.cat([self.model.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]

            standard_n_tokens = 50 if self.model.conv1.kernel_size[0] == 32 else 197

            if x.shape[1] != standard_n_tokens:
                new_shape = int(math.sqrt(x.shape[1]-1))
                x = x + self.rescaled_pos_emb((new_shape, new_shape)).to(x.dtype)[None, :, :]
            else:
                x = x + self.model.positional_embedding.to(x.dtype)

            x = self.model.ln_pre(x)

            x = x.permute(1, 0, 2)  # NLD -> LND

            activations, affinities = [], []
            for i, res_block in enumerate(self.model.transformer.resblocks):

                if mask is not None:
                    mask_layer, mask_type, mask_tensor = mask
                    if mask_layer == i or mask_layer == 'all':
                        # import ipdb; ipdb.set_trace()
                        size = int(math.sqrt(x.shape[0] - 1))

                        attn_mask = (mask_type, nnf.interpolate(mask_tensor.unsqueeze(1).float(), (size, size)).view(mask_tensor.shape[0], size * size))

                    else:
                        attn_mask = None
                else:
                    attn_mask = None

                x, aff_per_head = forward_multihead_attention(x, res_block, with_aff=True, attn_mask=attn_mask)

                if i in extract_layers:
                    affinities += [aff_per_head]

                    # if self.n_tokens is not None:
                    #     activations += [nnf.interpolate(x, inp_size, mode='bilinear', align_corners=True)]
                    # else:
                    activations += [x]

                if len(extract_layers) > 0 and i == max(extract_layers) and skip:
                    print('early skip')
                    break

            x = x.permute(1, 0, 2)  # LND -> NLD
            x = self.model.ln_post(x[:, 0, :])

            if self.model.proj is not None:
                x = x @ self.model.proj

            return x, activations, affinities

    def sample_prompts(self, words, prompt_list=None):

        prompt_list = prompt_list if prompt_list is not None else self.prompt_list

        prompt_indices = torch.multinomial(torch.ones(len(prompt_list)), len(words), replacement=True)
        prompts = [prompt_list[i] for i in prompt_indices]
        return [prompt.format(w) for prompt, w in zip(prompts, words)]

    def get_cond_vec(self, conditional, batch_size):
        # compute conditional from a single string
        if conditional is not None and type(conditional) == str:
            cond = self.compute_conditional(conditional)
            cond = cond.repeat(batch_size, 1)

        # compute conditional from string list/tuple
        elif conditional is not None and type(conditional) in {list, tuple} and type(conditional[0]) == str:
            assert len(conditional) == batch_size
            cond = self.compute_conditional(conditional)

        # use conditional directly
        elif conditional is not None and type(conditional) == torch.Tensor and conditional.ndim == 2:
            cond = conditional

        # compute conditional from image
        elif conditional is not None and type(conditional) == torch.Tensor:
            with torch.no_grad():
                cond, _, _ = self.visual_forward(conditional)
        else:
            raise ValueError('invalid conditional')
        return cond

    def compute_conditional(self, conditional):
        import clip

        dev = next(self.parameters()).device

        if type(conditional) in {list, tuple}:
            text_tokens = clip.tokenize(conditional).to(dev)
            cond = self.clip_model.encode_text(text_tokens)
        else:
            if conditional in self.precomputed_prompts:
                cond = self.precomputed_prompts[conditional].float().to(dev)
            else:
                text_tokens = clip.tokenize([conditional]).to(dev)
                cond = self.clip_model.encode_text(text_tokens)[0]

        if self.shift_vector is not None:
            return cond + self.shift_vector
        else:
            return cond

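# Toy FiLM sketch (illustrative dimensions only, no real CLIP weights): the conditional
# vector produced by compute_conditional scales and shifts every token feature, exactly
# the role of the film_mul / film_add layers defined above.
import torch
from torch import nn
reduce_dim_ex, cond_dim_ex, n_tok_ex, bsz_ex = 64, 512, 10, 2
film_mul_ex = nn.Linear(cond_dim_ex, reduce_dim_ex)
film_add_ex = nn.Linear(cond_dim_ex, reduce_dim_ex)
cond_ex = torch.randn(bsz_ex, cond_dim_ex)                # stands in for a CLIP text embedding
feats_ex = torch.randn(n_tok_ex, bsz_ex, reduce_dim_ex)   # reduced transformer activations
feats_ex = film_mul_ex(cond_ex) * feats_ex + film_add_ex(cond_ex)  # broadcasts over the token dim
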
def clip_load_untrained(version):
    assert version == 'ViT-B/16'
    from clip.model import CLIP
    from clip.clip import _MODELS, _download
    model = torch.jit.load(_download(_MODELS['ViT-B/16'])).eval()
    state_dict = model.state_dict()

    vision_width = state_dict["visual.conv1.weight"].shape[0]
    vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
    vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
    grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
    image_resolution = vision_patch_size * grid_size
    embed_dim = state_dict["text_projection"].shape[1]
    context_length = state_dict["positional_embedding"].shape[0]
    vocab_size = state_dict["token_embedding.weight"].shape[0]
    transformer_width = state_dict["ln_final.weight"].shape[0]
    transformer_heads = transformer_width // 64
    transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")))

    return CLIP(embed_dim, image_resolution, vision_layers, vision_width, vision_patch_size,
                context_length, vocab_size, transformer_width, transformer_heads, transformer_layers)

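# Usage sketch (illustrative): the downloaded ViT-B/16 archive is only read for its
# hyper-parameters; the returned CLIP model is randomly initialised.
#   untrained_clip = clip_load_untrained('ViT-B/16')
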
class CLIPDensePredT(CLIPDenseBase):

    def __init__(self, version='ViT-B/32', extract_layers=(3, 6, 9), cond_layer=0, reduce_dim=128, n_heads=4, prompt='fixed',
                 extra_blocks=0, reduce_cond=None, fix_shift=False,
                 learn_trans_conv_only=False, limit_to_clip_only=False, upsample=False,
                 add_calibration=False, rev_activations=False, trans_conv=None, n_tokens=None):

        super().__init__(version, reduce_cond, reduce_dim, prompt, n_tokens)
        # device = 'cpu'

        self.extract_layers = extract_layers
        self.cond_layer = cond_layer
        self.limit_to_clip_only = limit_to_clip_only
        self.process_cond = None
        self.rev_activations = rev_activations

        depth = len(extract_layers)

        if add_calibration:
            self.calibration_conds = 1

        self.upsample_proj = nn.Conv2d(reduce_dim, 1, kernel_size=1) if upsample else None

        self.add_activation1 = True

        self.version = version

        self.token_shape = {'ViT-B/32': (7, 7), 'ViT-B/16': (14, 14)}[version]

        if fix_shift:
            # self.shift_vector = nn.Parameter(torch.load(join(dirname(basename(__file__)), 'clip_text_shift_vector.pth')), requires_grad=False)
            self.shift_vector = nn.Parameter(torch.load(join(dirname(basename(__file__)), 'shift_text_to_vis.pth')), requires_grad=False)
            # self.shift_vector = nn.Parameter(-1*torch.load(join(dirname(basename(__file__)), 'shift2.pth')), requires_grad=False)
        else:
            self.shift_vector = None

        if trans_conv is None:
            trans_conv_ks = {'ViT-B/32': (32, 32), 'ViT-B/16': (16, 16)}[version]
        else:
            # explicitly define transposed conv kernel size
            trans_conv_ks = (trans_conv, trans_conv)

        self.trans_conv = nn.ConvTranspose2d(reduce_dim, 1, trans_conv_ks, stride=trans_conv_ks)

        assert len(self.extract_layers) == depth

        self.reduces = nn.ModuleList([nn.Linear(768, reduce_dim) for _ in range(depth)])
        self.blocks = nn.ModuleList([nn.TransformerEncoderLayer(d_model=reduce_dim, nhead=n_heads) for _ in range(len(self.extract_layers))])
        self.extra_blocks = nn.ModuleList([nn.TransformerEncoderLayer(d_model=reduce_dim, nhead=n_heads) for _ in range(extra_blocks)])

        # refinement and trans conv

        if learn_trans_conv_only:
            for p in self.parameters():
                p.requires_grad_(False)

            for p in self.trans_conv.parameters():
                p.requires_grad_(True)

        self.prompt_list = get_prompt_list(prompt)

    def forward(self, inp_image, conditional=None, return_features=False, mask=None):

        assert type(return_features) == bool

        inp_image = inp_image.to(self.model.positional_embedding.device)

        if mask is not None:
            raise ValueError('mask not supported')

        # x_inp = normalize(inp_image)
        x_inp = inp_image

        bs, dev = inp_image.shape[0], x_inp.device

        cond = self.get_cond_vec(conditional, bs)

        visual_q, activations, _ = self.visual_forward(x_inp, extract_layers=[0] + list(self.extract_layers))

        activation1 = activations[0]
        activations = activations[1:]

        _activations = activations[::-1] if not self.rev_activations else activations

        a = None
        for i, (activation, block, reduce) in enumerate(zip(_activations, self.blocks, self.reduces)):

            if a is not None:
                a = reduce(activation) + a
            else:
                a = reduce(activation)

            if i == self.cond_layer:
                if self.reduce_cond is not None:
                    cond = self.reduce_cond(cond)

                a = self.film_mul(cond) * a + self.film_add(cond)

            a = block(a)

        for block in self.extra_blocks:
            a = a + block(a)

        a = a[1:].permute(1, 2, 0)  # rm cls token and -> BS, Feats, Tokens

        size = int(math.sqrt(a.shape[2]))

        a = a.view(bs, a.shape[1], size, size)

        a = self.trans_conv(a)

        if self.n_tokens is not None:
            a = nnf.interpolate(a, x_inp.shape[2:], mode='bilinear', align_corners=True)

        if self.upsample_proj is not None:
            a = self.upsample_proj(a)
            a = nnf.interpolate(a, x_inp.shape[2:], mode='bilinear')

        if return_features:
            return a, visual_q, cond, [activation1] + activations
        else:
            return a,

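# Usage sketch for CLIPDensePredT (assumptions: the rd64-uni.pth checkpoint shipped in
# clipseg/weights and a normalised 352x352 input, following the repository's Quickstart;
# a random tensor stands in for a real image here).
import torch
example_model = CLIPDensePredT(version='ViT-B/16', reduce_dim=64)
example_model.eval()
example_model.load_state_dict(torch.load('weights/rd64-uni.pth', map_location='cpu'), strict=False)
example_prompts = ['a glass', 'something to drink']
example_img = torch.randn(1, 3, 352, 352)
with torch.no_grad():
    example_preds = example_model(example_img.repeat(len(example_prompts), 1, 1, 1), example_prompts)[0]
example_masks = torch.sigmoid(example_preds)  # one (1, 352, 352) heatmap per prompt
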
class CLIPDensePredTMasked(CLIPDensePredT):

    def __init__(self, version='ViT-B/32', extract_layers=(3, 6, 9), cond_layer=0, reduce_dim=128, n_heads=4,
                 prompt='fixed', extra_blocks=0, reduce_cond=None, fix_shift=False, learn_trans_conv_only=False,
                 refine=None, limit_to_clip_only=False, upsample=False, add_calibration=False, n_tokens=None):

        super().__init__(version=version, extract_layers=extract_layers, cond_layer=cond_layer, reduce_dim=reduce_dim,
                         n_heads=n_heads, prompt=prompt, extra_blocks=extra_blocks, reduce_cond=reduce_cond,
                         fix_shift=fix_shift, learn_trans_conv_only=learn_trans_conv_only,
                         limit_to_clip_only=limit_to_clip_only, upsample=upsample, add_calibration=add_calibration,
                         n_tokens=n_tokens)

    def visual_forward_masked(self, img_s, seg_s):
        return super().visual_forward(img_s, mask=('all', 'cls_token', seg_s))

    def forward(self, img_q, cond_or_img_s, seg_s=None, return_features=False):

        if seg_s is None:
            cond = cond_or_img_s
        else:
            img_s = cond_or_img_s

            with torch.no_grad():
                cond, _, _ = self.visual_forward_masked(img_s, seg_s)

        return super().forward(img_q, cond, return_features=return_features)

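# One-shot usage sketch (illustrative): instead of a text prompt, condition on a support
# image together with its binary mask; img_q, img_s and seg_s are hypothetical tensors of
# shape (B, 3, H, W), (B, 3, H, W) and (B, H, W) respectively.
#   masked_model = CLIPDensePredTMasked(version='ViT-B/16', reduce_dim=64)
#   pred, = masked_model(img_q, img_s, seg_s)
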
class CLIPDenseBaseline(CLIPDenseBase):

    def __init__(self, version='ViT-B/32', cond_layer=0,
                 extract_layer=9, reduce_dim=128, reduce2_dim=None, prompt='fixed',
                 reduce_cond=None, limit_to_clip_only=False, n_tokens=None):

        super().__init__(version, reduce_cond, reduce_dim, prompt, n_tokens)
        device = 'cpu'

        # self.cond_layer = cond_layer
        self.extract_layer = extract_layer
        self.limit_to_clip_only = limit_to_clip_only
        self.shift_vector = None

        self.token_shape = {'ViT-B/32': (7, 7), 'ViT-B/16': (14, 14)}[version]

        assert reduce2_dim is not None

        self.reduce2 = nn.Sequential(
            nn.Linear(reduce_dim, reduce2_dim),
            nn.ReLU(),
            nn.Linear(reduce2_dim, reduce_dim)
        )

        trans_conv_ks = {'ViT-B/32': (32, 32), 'ViT-B/16': (16, 16)}[version]
        self.trans_conv = nn.ConvTranspose2d(reduce_dim, 1, trans_conv_ks, stride=trans_conv_ks)

    def forward(self, inp_image, conditional=None, return_features=False):

        inp_image = inp_image.to(self.model.positional_embedding.device)

        # x_inp = normalize(inp_image)
        x_inp = inp_image

        bs, dev = inp_image.shape[0], x_inp.device

        cond = self.get_cond_vec(conditional, bs)

        visual_q, activations, affinities = self.visual_forward(x_inp, extract_layers=[self.extract_layer])

        a = activations[0]
        a = self.reduce(a)
        a = self.film_mul(cond) * a + self.film_add(cond)

        if self.reduce2 is not None:
            a = self.reduce2(a)

        # the original model would execute a transformer block here

        a = a[1:].permute(1, 2, 0)  # rm cls token and -> BS, Feats, Tokens

        size = int(math.sqrt(a.shape[2]))

        a = a.view(bs, a.shape[1], size, size)
        a = self.trans_conv(a)

        if return_features:
            return a, visual_q, cond, activations
        else:
            return a,

class CLIPSegMultiLabel(nn.Module):

    def __init__(self, model) -> None:
        super().__init__()

        from third_party.JoEm.data_loader import get_seen_idx, get_unseen_idx, VOC

        self.pascal_classes = VOC

        from models.clipseg import CLIPDensePredT
        from general_utils import load_model
        # self.clipseg = load_model('rd64-vit16-neg0.2-phrasecut', strict=False)
        self.clipseg = load_model(model, strict=False)

        self.clipseg.eval()

    def forward(self, x):

        bs = x.shape[0]
        out = torch.ones(21, bs, 352, 352).to(x.device) * -10

        for class_id, class_name in enumerate(self.pascal_classes):

            fac = 3 if class_name == 'background' else 1

            with torch.no_grad():
                pred = torch.sigmoid(self.clipseg(x, class_name)[0][:, 0]) * fac

            out[class_id] += pred

        out = out.permute(1, 0, 2, 3)

        return out

        # construct output tensor

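# Usage sketch (illustrative; the checkpoint name is an assumption and must be one that
# general_utils.load_model can resolve):
#   voc_model = CLIPSegMultiLabel('rd64-uni')
#   logits = voc_model(batch)        # (B, 21, 352, 352), one channel per Pascal VOC class
#   labels = logits.argmax(dim=1)    # per-pixel class index
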
    	
clipseg/models/vitseg.py
ADDED
@@ -0,0 +1,286 @@
import math
from posixpath import basename, dirname, join
# import clip
from clip.model import convert_weights
import torch
import json
from torch import nn
from torch.nn import functional as nnf
from torch.nn.modules import activation
from torch.nn.modules.activation import ReLU
from torchvision import transforms

normalize = transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))

from torchvision.models import ResNet


def process_prompts(conditional, prompt_list, conditional_map):
    # DEPRECATED

    # randomly sample a synonym
    words = [conditional_map[int(i)] for i in conditional]
    words = [syns[torch.multinomial(torch.ones(len(syns)), 1, replacement=True).item()] for syns in words]
    words = [w.replace('_', ' ') for w in words]

    if prompt_list is not None:
        prompt_indices = torch.multinomial(torch.ones(len(prompt_list)), len(words), replacement=True)
        prompts = [prompt_list[i] for i in prompt_indices]
    else:
        prompts = ['a photo of {}'] * (len(words))

    return [prompt.format(w) for prompt, w in zip(prompts, words)]

class VITDenseBase(nn.Module):

    def rescaled_pos_emb(self, new_size):
        assert len(new_size) == 2

        a = self.model.positional_embedding[1:].T.view(1, 768, *self.token_shape)
        b = nnf.interpolate(a, new_size, mode='bicubic', align_corners=False).squeeze(0).view(768, new_size[0]*new_size[1]).T
        return torch.cat([self.model.positional_embedding[:1], b])

    def visual_forward(self, x_inp, extract_layers=(), skip=False, mask=None):

        with torch.no_grad():

            x_inp = nnf.interpolate(x_inp, (384, 384))

            x = self.model.patch_embed(x_inp)
            cls_token = self.model.cls_token.expand(x.shape[0], -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
            if self.model.dist_token is None:
                x = torch.cat((cls_token, x), dim=1)
            else:
                x = torch.cat((cls_token, self.model.dist_token.expand(x.shape[0], -1, -1), x), dim=1)
            x = self.model.pos_drop(x + self.model.pos_embed)

            activations = []
            for i, block in enumerate(self.model.blocks):
                x = block(x)

                if i in extract_layers:
                    # permute to be compatible with CLIP
                    activations += [x.permute(1, 0, 2)]

            x = self.model.norm(x)
            x = self.model.head(self.model.pre_logits(x[:, 0]))

            # again for CLIP compatibility
            # x = x.permute(1, 0, 2)

        return x, activations, None

    def sample_prompts(self, words, prompt_list=None):

        prompt_list = prompt_list if prompt_list is not None else self.prompt_list

        prompt_indices = torch.multinomial(torch.ones(len(prompt_list)), len(words), replacement=True)
        prompts = [prompt_list[i] for i in prompt_indices]
        return [prompt.format(w) for prompt, w in zip(prompts, words)]

    def get_cond_vec(self, conditional, batch_size):
        # compute conditional from a single string
        if conditional is not None and type(conditional) == str:
            cond = self.compute_conditional(conditional)
            cond = cond.repeat(batch_size, 1)

        # compute conditional from string list/tuple
        elif conditional is not None and type(conditional) in {list, tuple} and type(conditional[0]) == str:
            assert len(conditional) == batch_size
            cond = self.compute_conditional(conditional)

        # use conditional directly
        elif conditional is not None and type(conditional) == torch.Tensor and conditional.ndim == 2:
            cond = conditional

        # compute conditional from image
        elif conditional is not None and type(conditional) == torch.Tensor:
            with torch.no_grad():
                cond, _, _ = self.visual_forward(conditional)
        else:
            raise ValueError('invalid conditional')
        return cond

    def compute_conditional(self, conditional):
        import clip

        dev = next(self.parameters()).device

        if type(conditional) in {list, tuple}:
            text_tokens = clip.tokenize(conditional).to(dev)
            cond = self.clip_model.encode_text(text_tokens)
        else:
            if conditional in self.precomputed_prompts:
                cond = self.precomputed_prompts[conditional].float().to(dev)
            else:
                text_tokens = clip.tokenize([conditional]).to(dev)
                cond = self.clip_model.encode_text(text_tokens)[0]

        return cond

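# Backbone sketch (illustrative; requires the timm package): the ViT used by VITDensePredT
# below runs at 384x384, so patch embedding yields 24*24 = 576 tokens of width 768.
import timm
import torch
vit_ex = timm.create_model('vit_base_patch16_384', pretrained=False)
patch_tokens = vit_ex.patch_embed(torch.randn(1, 3, 384, 384))
print(patch_tokens.shape)  # torch.Size([1, 576, 768])
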
class VITDensePredT(VITDenseBase):

    def __init__(self, extract_layers=(3, 6, 9), cond_layer=0, reduce_dim=128, n_heads=4, prompt='fixed',
                 depth=3, extra_blocks=0, reduce_cond=None, fix_shift=False,
                 learn_trans_conv_only=False, refine=None, limit_to_clip_only=False, upsample=False,
                 add_calibration=False, process_cond=None, not_pretrained=False):
        super().__init__()
        # device = 'cpu'

        self.extract_layers = extract_layers
        self.cond_layer = cond_layer
        self.limit_to_clip_only = limit_to_clip_only
        self.process_cond = None

        if add_calibration:
            self.calibration_conds = 1

        self.upsample_proj = nn.Conv2d(reduce_dim, 1, kernel_size=1) if upsample else None

        self.add_activation1 = True

        import timm
        self.model = timm.create_model('vit_base_patch16_384', pretrained=True)
        self.model.head = nn.Linear(768, 512 if reduce_cond is None else reduce_cond)

        for p in self.model.parameters():
            p.requires_grad_(False)

        import clip
        self.clip_model, _ = clip.load('ViT-B/16', device='cpu', jit=False)
        # del self.clip_model.visual

        self.token_shape = (14, 14)

        # conditional
        if reduce_cond is not None:
            self.reduce_cond = nn.Linear(512, reduce_cond)
            for p in self.reduce_cond.parameters():
                p.requires_grad_(False)
        else:
            self.reduce_cond = None

        # self.film = AVAILABLE_BLOCKS['film'](512, 128)
        self.film_mul = nn.Linear(512 if reduce_cond is None else reduce_cond, reduce_dim)
        self.film_add = nn.Linear(512 if reduce_cond is None else reduce_cond, reduce_dim)

        # DEPRECATED
        # self.conditional_map = {c['id']: c['synonyms'] for c in json.load(open(cond_map))}

        assert len(self.extract_layers) == depth

        self.reduces = nn.ModuleList([nn.Linear(768, reduce_dim) for _ in range(depth)])
        self.blocks = nn.ModuleList([nn.TransformerEncoderLayer(d_model=reduce_dim, nhead=n_heads) for _ in range(len(self.extract_layers))])
        self.extra_blocks = nn.ModuleList([nn.TransformerEncoderLayer(d_model=reduce_dim, nhead=n_heads) for _ in range(extra_blocks)])

        trans_conv_ks = (16, 16)
        self.trans_conv = nn.ConvTranspose2d(reduce_dim, 1, trans_conv_ks, stride=trans_conv_ks)

        # refinement and trans conv

        if learn_trans_conv_only:
            for p in self.parameters():
                p.requires_grad_(False)

            for p in self.trans_conv.parameters():
                p.requires_grad_(True)

        if prompt == 'fixed':
            self.prompt_list = ['a photo of a {}.']
        elif prompt == 'shuffle':
            self.prompt_list = ['a photo of a {}.', 'a photograph of a {}.', 'an image of a {}.', '{}.']
        elif prompt == 'shuffle+':
            self.prompt_list = ['a photo of a {}.', 'a photograph of a {}.', 'an image of a {}.', '{}.',
                                'a cropped photo of a {}.', 'a good photo of a {}.', 'a photo of one {}.',
                                'a bad photo of a {}.', 'a photo of the {}.']
        elif prompt == 'shuffle_clip':
            from models.clip_prompts import imagenet_templates
            self.prompt_list = imagenet_templates

        if process_cond is not None:
            if process_cond == 'clamp' or process_cond[0] == 'clamp':

                val = process_cond[1] if type(process_cond) in {list, tuple} else 0.2

                def clamp_vec(x):
                    return torch.clamp(x, -val, val)

                self.process_cond = clamp_vec

            elif process_cond.endswith('.pth'):

                shift = torch.load(process_cond)

                def add_shift(x):
                    return x + shift.to(x.device)

                self.process_cond = add_shift

        import pickle
        precomp = pickle.load(open('precomputed_prompt_vectors.pickle', 'rb'))
        self.precomputed_prompts = {k: torch.from_numpy(v) for k, v in precomp.items()}

    def forward(self, inp_image, conditional=None, return_features=False, mask=None):

        assert type(return_features) == bool

        # inp_image = inp_image.to(self.model.positional_embedding.device)
         | 
| 231 | 
            +
             | 
| 232 | 
            +
                    if mask is not None:
         | 
| 233 | 
            +
                        raise ValueError('mask not supported')
         | 
| 234 | 
            +
             | 
| 235 | 
            +
                    # x_inp = normalize(inp_image)
         | 
| 236 | 
            +
                    x_inp = inp_image
         | 
| 237 | 
            +
             | 
| 238 | 
            +
                    bs, dev = inp_image.shape[0], x_inp.device
         | 
| 239 | 
            +
             | 
| 240 | 
            +
                    inp_image_size = inp_image.shape[2:]
         | 
| 241 | 
            +
             | 
| 242 | 
            +
                    cond = self.get_cond_vec(conditional, bs)
         | 
| 243 | 
            +
             | 
| 244 | 
            +
                    visual_q, activations, _ = self.visual_forward(x_inp, extract_layers=[0] + list(self.extract_layers))
         | 
| 245 | 
            +
             | 
| 246 | 
            +
                    activation1 = activations[0]
         | 
| 247 | 
            +
                    activations = activations[1:]
         | 
| 248 | 
            +
             | 
| 249 | 
            +
                    a = None
         | 
| 250 | 
            +
                    for i, (activation, block, reduce) in enumerate(zip(activations[::-1], self.blocks, self.reduces)):
         | 
| 251 | 
            +
                        
         | 
| 252 | 
            +
                        if a is not None:
         | 
| 253 | 
            +
                            a = reduce(activation) + a
         | 
| 254 | 
            +
                        else:
         | 
| 255 | 
            +
                            a = reduce(activation)
         | 
| 256 | 
            +
             | 
| 257 | 
            +
                        if i == self.cond_layer:
         | 
| 258 | 
            +
                            if self.reduce_cond is not None:
         | 
| 259 | 
            +
                                cond = self.reduce_cond(cond)
         | 
| 260 | 
            +
                            
         | 
| 261 | 
            +
                            a = self.film_mul(cond) * a + self.film_add(cond)
         | 
| 262 | 
            +
             | 
| 263 | 
            +
                        a = block(a)
         | 
| 264 | 
            +
             | 
| 265 | 
            +
                    for block in self.extra_blocks:
         | 
| 266 | 
            +
                        a = a + block(a)
         | 
| 267 | 
            +
             | 
| 268 | 
            +
                    a = a[1:].permute(1, 2, 0) # rm cls token and -> BS, Feats, Tokens
         | 
| 269 | 
            +
             | 
| 270 | 
            +
                    size = int(math.sqrt(a.shape[2]))
         | 
| 271 | 
            +
             | 
| 272 | 
            +
                    a = a.view(bs, a.shape[1], size, size)
         | 
| 273 | 
            +
             | 
| 274 | 
            +
                    if self.trans_conv is not None:
         | 
| 275 | 
            +
                        a = self.trans_conv(a)
         | 
| 276 | 
            +
             | 
| 277 | 
            +
                    if self.upsample_proj is not None:
         | 
| 278 | 
            +
                        a = self.upsample_proj(a)
         | 
| 279 | 
            +
                        a = nnf.interpolate(a, x_inp.shape[2:], mode='bilinear')
         | 
| 280 | 
            +
             | 
| 281 | 
            +
                    a = nnf.interpolate(a, inp_image_size)
         | 
| 282 | 
            +
             | 
| 283 | 
            +
                    if return_features:
         | 
| 284 | 
            +
                        return a, visual_q, cond, [activation1] + activations
         | 
| 285 | 
            +
                    else:
         | 
| 286 | 
            +
                        return a,
         | 
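A minimal usage sketch of the decoder whose forward() ends above. The class name CLIPDensePredT and its constructor arguments are assumptions (the class header sits earlier in this file's diff); the weights path matches clipseg/weights/rd64-uni.pth added in this commit.

    import torch
    from models.clipseg import CLIPDensePredT  # assumed class name

    model = CLIPDensePredT(reduce_dim=64)      # constructor arguments are an assumption
    model.load_state_dict(torch.load('clipseg/weights/rd64-uni.pth', map_location='cpu'), strict=False)
    model.eval()

    image = torch.randn(1, 3, 352, 352)        # normalized RGB batch
    with torch.no_grad():
        pred, = model(image, ['a photo of a dog'])  # forward() returns a one-element tuple
    # pred is a single-channel logit map at the input resolution (see the final interpolate above)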
    	
        clipseg/overview.png
    ADDED
    
    	
        clipseg/score.py
    ADDED
    
    | @@ -0,0 +1,453 @@ | |
| 1 | 
            +
            from torch.functional import Tensor
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            import torch
         | 
| 4 | 
            +
            import inspect
         | 
| 5 | 
            +
            import json
         | 
| 6 | 
            +
            import yaml
         | 
| 7 | 
            +
            import time
         | 
| 8 | 
            +
            import sys
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            from general_utils import log
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            import numpy as np
         | 
| 13 | 
            +
            from os.path import expanduser, join, isfile, realpath
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            from torch.utils.data import DataLoader
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            from metrics import FixedIntervalMetrics
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            from general_utils import load_model, log, score_config_from_cli_args, AttributeDict, get_attribute, filter_args
         | 
| 20 | 
            +
             | 
| 21 | 
            +
             | 
| 22 | 
            +
            DATASET_CACHE = dict()
         | 
| 23 | 
            +
             | 
| 24 | 
            +
            def load_model(checkpoint_id, weights_file=None, strict=True, model_args='from_config', with_config=False, ignore_weights=False):
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                config = json.load(open(join('logs', checkpoint_id, 'config.json')))
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                if model_args != 'from_config' and type(model_args) != dict:
         | 
| 29 | 
            +
                    raise ValueError('model_args must either be "from_config" or a dictionary of values')
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                model_cls = get_attribute(config['model'])
         | 
| 32 | 
            +
             | 
| 33 | 
            +
                # load model
         | 
| 34 | 
            +
                if model_args == 'from_config':
         | 
| 35 | 
            +
                    _, model_args, _ = filter_args(config, inspect.signature(model_cls).parameters)
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                model = model_cls(**model_args)
         | 
| 38 | 
            +
             | 
| 39 | 
            +
                if weights_file is None:
         | 
| 40 | 
            +
                    weights_file = realpath(join('logs', checkpoint_id, 'weights.pth'))
         | 
| 41 | 
            +
                else:
         | 
| 42 | 
            +
                    weights_file = realpath(join('logs', checkpoint_id, weights_file))
         | 
| 43 | 
            +
             | 
| 44 | 
            +
                if isfile(weights_file) and not ignore_weights:
         | 
| 45 | 
            +
                    weights = torch.load(weights_file)
         | 
| 46 | 
            +
                    for _, w in weights.items():
         | 
| 47 | 
            +
                        assert not torch.any(torch.isnan(w)), 'weights contain NaNs'
         | 
| 48 | 
            +
                    model.load_state_dict(weights, strict=strict)
         | 
| 49 | 
            +
                else:
         | 
| 50 | 
            +
                    if not ignore_weights:
         | 
| 51 | 
            +
                        raise FileNotFoundError(f'model checkpoint {weights_file} was not found')
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                if with_config:
         | 
| 54 | 
            +
                    return model, config
         | 
| 55 | 
            +
                
         | 
| 56 | 
            +
                return model
         | 
| 57 | 
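Note that this local load_model shadows the load_model imported from general_utils above; everything below uses this version, which resolves weights relative to logs/<checkpoint_id>/. A hedged usage sketch ('rd64-uni' is a hypothetical checkpoint id):

    model, config = load_model('rd64-uni', with_config=True)
    model.eval()
    model.cuda()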
            +
             | 
| 58 | 
            +
             | 
| 59 | 
            +
            def compute_shift2(model, datasets, seed=123, repetitions=1):
         | 
| 60 | 
            +
                """ computes shift """
         | 
| 61 | 
            +
                
         | 
| 62 | 
            +
                model.eval()
         | 
| 63 | 
            +
                model.cuda()
         | 
| 64 | 
            +
             | 
| 65 | 
            +
                import random
         | 
| 66 | 
            +
                random.seed(seed)
         | 
| 67 | 
            +
             | 
| 68 | 
            +
                preds, gts = [], []
         | 
| 69 | 
            +
                for i_dataset, dataset in enumerate(datasets):
         | 
| 70 | 
            +
             | 
| 71 | 
            +
                    loader = DataLoader(dataset, batch_size=1, num_workers=0, shuffle=False, drop_last=False)
         | 
| 72 | 
            +
             | 
| 73 | 
            +
                    max_iterations = int(repetitions * len(dataset.dataset.data_list))
         | 
| 74 | 
            +
                    
         | 
| 75 | 
            +
                    with torch.no_grad():
         | 
| 76 | 
            +
             | 
| 77 | 
            +
                        i, losses = 0, []
         | 
| 78 | 
            +
                        for i_all, (data_x, data_y) in enumerate(loader):
         | 
| 79 | 
            +
             | 
| 80 | 
            +
                            data_x = [v.cuda(non_blocking=True) if v is not None else v for v in data_x]
         | 
| 81 | 
            +
                            data_y = [v.cuda(non_blocking=True) if v is not None else v for v in data_y]
         | 
| 82 | 
            +
             | 
| 83 | 
            +
                            pred, = model(data_x[0], data_x[1], data_x[2])
         | 
| 84 | 
            +
                            preds += [pred.detach()]
         | 
| 85 | 
            +
                            gts += [data_y]
         | 
| 86 | 
            +
             | 
| 87 | 
            +
                            i += 1
         | 
| 88 | 
            +
                            if max_iterations and i >= max_iterations:
         | 
| 89 | 
            +
                                break
         | 
| 90 | 
            +
                    
         | 
| 91 | 
            +
                from metrics import FixedIntervalMetrics
         | 
| 92 | 
            +
                n_values = 51
         | 
| 93 | 
            +
                thresholds = np.linspace(0, 1, n_values)[1:-1]
         | 
| 94 | 
            +
                metric = FixedIntervalMetrics(resize_pred=True, sigmoid=True, n_values=n_values)
         | 
| 95 | 
            +
             | 
| 96 | 
            +
                for p, y in zip(preds, gts):
         | 
| 97 | 
            +
                    metric.add(p.unsqueeze(1), y)     
         | 
| 98 | 
            +
                        
         | 
| 99 | 
            +
                best_idx = np.argmax(metric.value()['fgiou_scores'])
         | 
| 100 | 
            +
                best_thresh = thresholds[best_idx]
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                return best_thresh
         | 
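compute_shift2 runs the model over the given datasets, sweeps a grid of decision thresholds with FixedIntervalMetrics, and returns the threshold that maximizes foreground IoU. A sketch of the leave-one-split-out call used by the shift == 'auto' branch further down:

    held_out = [get_cached_pascal_pfe(s, config) for s in range(4) if s != split]
    best_threshold = compute_shift2(model, held_out, repetitions=config.compute_shift_fac)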
| 103 | 
            +
             | 
| 104 | 
            +
             | 
| 105 | 
            +
            def get_cached_pascal_pfe(split, config):
         | 
| 106 | 
            +
                from datasets.pfe_dataset import PFEPascalWrapper
         | 
| 107 | 
            +
                try:
         | 
| 108 | 
            +
                    dataset =  DATASET_CACHE[(split, config.image_size, config.label_support, config.mask)]
         | 
| 109 | 
            +
                except KeyError:
         | 
| 110 | 
            +
                    dataset = PFEPascalWrapper(mode='val', split=split, mask=config.mask, image_size=config.image_size, label_support=config.label_support)
         | 
| 111 | 
            +
                    DATASET_CACHE[(split, config.image_size, config.label_support, config.mask)] = dataset
         | 
| 112 | 
            +
                return dataset
         | 
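DATASET_CACHE is keyed on (split, image_size, label_support, mask), so the automatic threshold search and the per-split evaluation below reuse the same PFEPascalWrapper instances instead of rebuilding them:

    ds_a = get_cached_pascal_pfe(0, config)
    ds_b = get_cached_pascal_pfe(0, config)
    assert ds_a is ds_b  # second call hits the cache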
| 113 | 
            +
             | 
| 114 | 
            +
             | 
| 115 | 
            +
             | 
| 116 | 
            +
             | 
| 117 | 
            +
            def main():
         | 
| 118 | 
            +
                config, train_checkpoint_id = score_config_from_cli_args()
         | 
| 119 | 
            +
             | 
| 120 | 
            +
                metrics = score(config, train_checkpoint_id, None)
         | 
| 121 | 
            +
             | 
| 122 | 
            +
                for dataset in metrics.keys():
         | 
| 123 | 
            +
                    for k in metrics[dataset]:
         | 
| 124 | 
            +
                        if type(metrics[dataset][k]) in {float, int}:
         | 
| 125 | 
            +
                            print(dataset, f'{k:<16} {metrics[dataset][k]:.3f}')
         | 
| 126 | 
            +
             | 
| 127 | 
            +
             | 
| 128 | 
            +
            def score(config, train_checkpoint_id, train_config):
         | 
| 129 | 
            +
             | 
| 130 | 
            +
                config = AttributeDict(config)
         | 
| 131 | 
            +
             | 
| 132 | 
            +
                print(config)
         | 
| 133 | 
            +
             | 
| 134 | 
            +
                # use training dataset and loss
         | 
| 135 | 
            +
                train_config = AttributeDict(json.load(open(f'logs/{train_checkpoint_id}/config.json')))
         | 
| 136 | 
            +
             | 
| 137 | 
            +
                cp_str = f'_{config.iteration_cp}' if config.iteration_cp is not None else ''
         | 
| 138 | 
            +
             | 
| 139 | 
            +
             | 
| 140 | 
            +
                model_cls = get_attribute(train_config['model'])
         | 
| 141 | 
            +
             | 
| 142 | 
            +
                _, model_args, _ = filter_args(train_config, inspect.signature(model_cls).parameters)
         | 
| 143 | 
            +
             | 
| 144 | 
            +
                model_args = {**model_args, **{k: config[k] for k in ['process_cond', 'fix_shift'] if k in config}}
         | 
| 145 | 
            +
             | 
| 146 | 
            +
                strict_models = {'ConditionBase4', 'PFENetWrapper'}
         | 
| 147 | 
            +
                model = load_model(train_checkpoint_id, strict=model_cls.__name__ in strict_models, model_args=model_args, 
         | 
| 148 | 
            +
                                    weights_file=f'weights{cp_str}.pth', )
         | 
| 149 | 
            +
                                       
         | 
| 150 | 
            +
             | 
| 151 | 
            +
                model.eval()
         | 
| 152 | 
            +
                model.cuda()
         | 
| 153 | 
            +
             | 
| 154 | 
            +
                metric_args = dict()
         | 
| 155 | 
            +
             | 
| 156 | 
            +
                if 'threshold' in config:
         | 
| 157 | 
            +
                    if config.metric.split('.')[-1] == 'SkLearnMetrics':
         | 
| 158 | 
            +
                        metric_args['threshold'] = config.threshold
         | 
| 159 | 
            +
             | 
| 160 | 
            +
                if 'resize_to' in config:
         | 
| 161 | 
            +
                    metric_args['resize_to'] = config.resize_to
         | 
| 162 | 
            +
             | 
| 163 | 
            +
                if 'sigmoid' in config:
         | 
| 164 | 
            +
                    metric_args['sigmoid'] = config.sigmoid    
         | 
| 165 | 
            +
             | 
| 166 | 
            +
                if 'custom_threshold' in config:
         | 
| 167 | 
            +
                    metric_args['custom_threshold'] = config.custom_threshold     
         | 
| 168 | 
            +
             | 
| 169 | 
            +
                if config.test_dataset == 'pascal':
         | 
| 170 | 
            +
                    
         | 
| 171 | 
            +
                    loss_fn = get_attribute(train_config.loss)
         | 
| 172 | 
            +
                    # if the scoring config does not specify splits, default to testing on all four splits,
         | 
| 173 | 
            +
                    
         | 
| 174 | 
            +
                    if 'splits' in config:
         | 
| 175 | 
            +
                        splits = config.splits 
         | 
| 176 | 
            +
                    else:
         | 
| 177 | 
            +
                        if 'split' in train_config and type(train_config.split) == int:
         | 
| 178 | 
            +
                            # unless train_config fixes a single integer split, in which case evaluate only that split (training must have used mode == 'train')
         | 
| 179 | 
            +
                            splits = [train_config.split]
         | 
| 180 | 
            +
                            assert train_config.mode == 'train'
         | 
| 181 | 
            +
                        else:
         | 
| 182 | 
            +
                            splits = [0,1,2,3]
         | 
| 183 | 
            +
                        
         | 
| 184 | 
            +
                    log.info('Test on these splits', splits)
         | 
| 185 | 
            +
             | 
| 186 | 
            +
                    scores = dict()
         | 
| 187 | 
            +
                    for split in splits:
         | 
| 188 | 
            +
             | 
| 189 | 
            +
                        shift = config.shift if 'shift' in config else 0
         | 
| 190 | 
            +
             | 
| 191 | 
            +
                        # automatic shift
         | 
| 192 | 
            +
                        if shift == 'auto':
         | 
| 193 | 
            +
                            shift_compute_t = time.time()
         | 
| 194 | 
            +
                            shift = compute_shift2(model, [get_cached_pascal_pfe(s, config) for s in range(4) if s != split], repetitions=config.compute_shift_fac)
         | 
| 195 | 
            +
                            log.info(f'Best threshold is {shift}, computed on splits: {[s for s in range(4) if s != split]}, took {time.time() - shift_compute_t:.1f}s')
         | 
| 196 | 
            +
             | 
| 197 | 
            +
                        dataset = get_cached_pascal_pfe(split, config)
         | 
| 198 | 
            +
             | 
| 199 | 
            +
                        eval_start_t = time.time()
         | 
| 200 | 
            +
             | 
| 201 | 
            +
                        loader = DataLoader(dataset, batch_size=1, num_workers=0, shuffle=False, drop_last=False)
         | 
| 202 | 
            +
             | 
| 203 | 
            +
                        assert config.batch_size is None or config.batch_size == 1, 'When PFE Dataset is used, batch size must be 1'
         | 
| 204 | 
            +
             | 
| 205 | 
            +
                        metric = FixedIntervalMetrics(resize_pred=True, sigmoid=True, custom_threshold=shift, **metric_args)
         | 
| 206 | 
            +
             | 
| 207 | 
            +
                        with torch.no_grad():
         | 
| 208 | 
            +
             | 
| 209 | 
            +
                            i, losses = 0, []
         | 
| 210 | 
            +
                            for i_all, (data_x, data_y) in enumerate(loader):
         | 
| 211 | 
            +
             | 
| 212 | 
            +
                                data_x = [v.cuda(non_blocking=True) if isinstance(v, torch.Tensor) else v for v in data_x]
         | 
| 213 | 
            +
                                data_y = [v.cuda(non_blocking=True) if isinstance(v, torch.Tensor) else v for v in data_y]
         | 
| 214 | 
            +
             | 
| 215 | 
            +
                                if config.mask == 'separate':  # for old CondBase model
         | 
| 216 | 
            +
                                    pred, = model(data_x[0], data_x[1], data_x[2])
         | 
| 217 | 
            +
                                else:
         | 
| 218 | 
            +
                                    # assert config.mask in {'text', 'highlight'}
         | 
| 219 | 
            +
                                    pred, _, _, _  = model(data_x[0], data_x[1], return_features=True)
         | 
| 220 | 
            +
             | 
| 221 | 
            +
                                # loss = loss_fn(pred, data_y[0])
         | 
| 222 | 
            +
                                metric.add(pred.unsqueeze(1) + shift, data_y)
         | 
| 223 | 
            +
             | 
| 224 | 
            +
                                # losses += [float(loss)]
         | 
| 225 | 
            +
             | 
| 226 | 
            +
                                i += 1
         | 
| 227 | 
            +
                                if config.max_iterations and i >= config.max_iterations:
         | 
| 228 | 
            +
                                    break
         | 
| 229 | 
            +
             | 
| 230 | 
            +
                        #scores[split] = {m: s for m, s in zip(metric.names(), metric.value())}
         | 
| 231 | 
            +
             | 
| 232 | 
            +
                        log.info(f'Dataset length: {len(dataset)}, took {time.time() - eval_start_t:.1f}s to evaluate.')
         | 
| 233 | 
            +
             | 
| 234 | 
            +
                        print(metric.value()['mean_iou_scores'])
         | 
| 235 | 
            +
             | 
| 236 | 
            +
                        scores[split] = metric.scores()
         | 
| 237 | 
            +
             | 
| 238 | 
            +
                        log.info(f'Completed split {split}')
         | 
| 239 | 
            +
                    
         | 
| 240 | 
            +
                    key_prefix = config['name'] if 'name' in config else 'pas'
         | 
| 241 | 
            +
             | 
| 242 | 
            +
                    all_keys = set.intersection(*[set(v.keys()) for v in scores.values()])
         | 
| 243 | 
            +
             | 
| 244 | 
            +
                    valid_keys = [k for k in all_keys if all(v[k] is not None and isinstance(v[k], (int, float, np.floating)) for v in scores.values())]
         | 
| 245 | 
            +
             | 
| 246 | 
            +
                    return {key_prefix: {k: np.mean([s[k] for s in scores.values()]) for k in valid_keys}}
         | 
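The Pascal branch thus reports, per metric, the mean over the evaluated splits. An illustrative example with made-up numbers:

    scores = {0: {'fgiou': 0.61}, 1: {'fgiou': 0.58}, 2: {'fgiou': 0.63}, 3: {'fgiou': 0.60}}
    # -> {'pas': {'fgiou': 0.605}}, the mean over the four splits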
| 247 | 
            +
             | 
| 248 | 
            +
             | 
| 249 | 
            +
                if config.test_dataset == 'coco':
         | 
| 250 | 
            +
                    from datasets.coco_wrapper import COCOWrapper
         | 
| 251 | 
            +
             | 
| 252 | 
            +
                    coco_dataset = COCOWrapper('test', fold=train_config.fold, image_size=train_config.image_size, mask=config.mask,
         | 
| 253 | 
            +
                                                with_class_label=True)
         | 
| 254 | 
            +
             | 
| 255 | 
            +
                    log.info('Dataset length', len(coco_dataset))
         | 
| 256 | 
            +
                    loader = DataLoader(coco_dataset, batch_size=config.batch_size, num_workers=2, shuffle=False, drop_last=False)
         | 
| 257 | 
            +
                    
         | 
| 258 | 
            +
                    metric = get_attribute(config.metric)(resize_pred=True, **metric_args)
         | 
| 259 | 
            +
             | 
| 260 | 
            +
                    shift = config.shift if 'shift' in config else 0
         | 
| 261 | 
            +
             | 
| 262 | 
            +
                    with torch.no_grad():
         | 
| 263 | 
            +
             | 
| 264 | 
            +
                        i, losses = 0, []
         | 
| 265 | 
            +
                        for i_all, (data_x, data_y) in enumerate(loader):
         | 
| 266 | 
            +
                            data_x = [v.cuda(non_blocking=True) if isinstance(v, torch.Tensor) else v for v in data_x]
         | 
| 267 | 
            +
                            data_y = [v.cuda(non_blocking=True) if isinstance(v, torch.Tensor) else v for v in data_y]
         | 
| 268 | 
            +
             | 
| 269 | 
            +
                            if config.mask == 'separate':  # for old CondBase model
         | 
| 270 | 
            +
                                pred, = model(data_x[0], data_x[1], data_x[2])
         | 
| 271 | 
            +
                            else:
         | 
| 272 | 
            +
                                # assert config.mask in {'text', 'highlight'}
         | 
| 273 | 
            +
                                pred, _, _, _  = model(data_x[0], data_x[1], return_features=True)
         | 
| 274 | 
            +
             | 
| 275 | 
            +
                            metric.add([pred + shift], data_y)
         | 
| 276 | 
            +
             | 
| 277 | 
            +
                            i += 1
         | 
| 278 | 
            +
                            if config.max_iterations and i >= config.max_iterations:
         | 
| 279 | 
            +
                                break                
         | 
| 280 | 
            +
             | 
| 281 | 
            +
                    key_prefix = config['name'] if 'name' in config else 'coco'      
         | 
| 282 | 
            +
                    return {key_prefix: metric.scores()}
         | 
| 283 | 
            +
                    #return {key_prefix: {k: v for k, v in zip(metric.names(), metric.value())}}
         | 
| 284 | 
            +
             | 
| 285 | 
            +
             | 
| 286 | 
            +
                if config.test_dataset == 'phrasecut':
         | 
| 287 | 
            +
                    from datasets.phrasecut import PhraseCut
         | 
| 288 | 
            +
             | 
| 289 | 
            +
                    only_visual = config.only_visual is not None and config.only_visual
         | 
| 290 | 
            +
                    with_visual = config.with_visual is not None and config.with_visual
         | 
| 291 | 
            +
             | 
| 292 | 
            +
                    dataset = PhraseCut('test', 
         | 
| 293 | 
            +
                                        image_size=train_config.image_size,
         | 
| 294 | 
            +
                                        mask=config.mask, 
         | 
| 295 | 
            +
                                        with_visual=with_visual, only_visual=only_visual, aug_crop=False, 
         | 
| 296 | 
            +
                                        aug_color=False)
         | 
| 297 | 
            +
             | 
| 298 | 
            +
                    loader = DataLoader(dataset, batch_size=config.batch_size, num_workers=2, shuffle=False, drop_last=False)
         | 
| 299 | 
            +
                    metric = get_attribute(config.metric)(resize_pred=True, **metric_args)
         | 
| 300 | 
            +
             | 
| 301 | 
            +
                    shift = config.shift if 'shift' in config else 0
         | 
| 302 | 
            +
             | 
| 303 | 
            +
             | 
| 304 | 
            +
                    with torch.no_grad():
         | 
| 305 | 
            +
             | 
| 306 | 
            +
                        i, losses = 0, []
         | 
| 307 | 
            +
                        for i_all, (data_x, data_y) in enumerate(loader):
         | 
| 308 | 
            +
                            data_x = [v.cuda(non_blocking=True) if isinstance(v, torch.Tensor) else v for v in data_x]
         | 
| 309 | 
            +
                            data_y = [v.cuda(non_blocking=True) if isinstance(v, torch.Tensor) else v for v in data_y]
         | 
| 310 | 
            +
             | 
| 311 | 
            +
                            pred, _, _, _  = model(data_x[0], data_x[1], return_features=True)
         | 
| 312 | 
            +
                            metric.add([pred + shift], data_y)
         | 
| 313 | 
            +
             | 
| 314 | 
            +
                            i += 1
         | 
| 315 | 
            +
                            if config.max_iterations and i >= config.max_iterations:
         | 
| 316 | 
            +
                                break                
         | 
| 317 | 
            +
             | 
| 318 | 
            +
                    key_prefix = config['name'] if 'name' in config else 'phrasecut'      
         | 
| 319 | 
            +
                    return {key_prefix: metric.scores()}
         | 
| 320 | 
            +
                    #return {key_prefix: {k: v for k, v in zip(metric.names(), metric.value())}}
         | 
| 321 | 
            +
             | 
| 322 | 
            +
                if config.test_dataset == 'pascal_zs':
         | 
| 323 | 
            +
                    from third_party.JoEm.model.metric import Evaluator
         | 
| 324 | 
            +
                    from third_party.JoEm.data_loader import get_seen_idx, get_unseen_idx, VOC
         | 
| 325 | 
            +
                    from datasets.pascal_zeroshot import PascalZeroShot, PASCAL_VOC_CLASSES_ZS
         | 
| 326 | 
            +
             | 
| 327 | 
            +
                    from models.clipseg import CLIPSegMultiLabel
         | 
| 328 | 
            +
             | 
| 329 | 
            +
                    n_unseen = train_config.remove_classes[1]
         | 
| 330 | 
            +
             | 
| 331 | 
            +
                    pz = PascalZeroShot('val', n_unseen, image_size=352)
         | 
| 332 | 
            +
                    m = CLIPSegMultiLabel(model=train_config.name).cuda()
         | 
| 333 | 
            +
                m.eval()
         | 
| 334 | 
            +
             | 
| 335 | 
            +
                    print(len(pz), n_unseen)
         | 
| 336 | 
            +
                    print('training removed', [c for class_set in PASCAL_VOC_CLASSES_ZS[:n_unseen // 2] for c in class_set])
         | 
| 337 | 
            +
             | 
| 338 | 
            +
                    print('unseen', [VOC[i] for i in get_unseen_idx(n_unseen)])
         | 
| 339 | 
            +
                    print('seen', [VOC[i] for i in get_seen_idx(n_unseen)])
         | 
| 340 | 
            +
             | 
| 341 | 
            +
                    loader = DataLoader(pz, batch_size=8)
         | 
| 342 | 
            +
                    evaluator = Evaluator(21, get_unseen_idx(n_unseen), get_seen_idx(n_unseen))
         | 
| 343 | 
            +
             | 
| 344 | 
            +
                    for i, (data_x, data_y) in enumerate(loader):
         | 
| 345 | 
            +
                        pred = m(data_x[0].cuda())
         | 
| 346 | 
            +
                        evaluator.add_batch(data_y[0].numpy(), pred.argmax(1).cpu().detach().numpy())
         | 
| 347 | 
            +
                        
         | 
| 348 | 
            +
                        if config.max_iter is not None and i > config.max_iter: 
         | 
| 349 | 
            +
                            break
         | 
| 350 | 
            +
                            
         | 
| 351 | 
            +
                    scores = evaluator.Mean_Intersection_over_Union()        
         | 
| 352 | 
            +
                    key_prefix = config['name'] if 'name' in config else 'pas_zs'      
         | 
| 353 | 
            +
             | 
| 354 | 
            +
                    return {key_prefix: {k: scores[k] for k in ['seen', 'unseen', 'harmonic', 'overall']}}
         | 
| 355 | 
            +
             | 
| 356 | 
            +
                elif config.test_dataset in {'same_as_training', 'affordance'}:
         | 
| 357 | 
            +
                    loss_fn = get_attribute(train_config.loss)
         | 
| 358 | 
            +
             | 
| 359 | 
            +
                    metric_cls = get_attribute(config.metric)
         | 
| 360 | 
            +
                    metric = metric_cls(**metric_args)
         | 
| 361 | 
            +
             | 
| 362 | 
            +
                    if config.test_dataset == 'same_as_training':
         | 
| 363 | 
            +
                        dataset_cls = get_attribute(train_config.dataset)
         | 
| 364 | 
            +
                    elif config.test_dataset == 'affordance':
         | 
| 365 | 
            +
                        dataset_cls = get_attribute('datasets.lvis_oneshot3.LVIS_Affordance')
         | 
| 366 | 
            +
                        dataset_name = 'aff'
         | 
| 367 | 
            +
                    else:
         | 
| 368 | 
            +
                        dataset_cls = get_attribute('datasets.lvis_oneshot3.LVIS_OneShot')
         | 
| 369 | 
            +
                        dataset_name = 'lvis'
         | 
| 370 | 
            +
             | 
| 371 | 
            +
                    _, dataset_args, _ = filter_args(config, inspect.signature(dataset_cls).parameters)
         | 
| 372 | 
            +
             | 
| 373 | 
            +
                    dataset_args['image_size'] = train_config.image_size  # explicitly use training image size for evaluation
         | 
| 374 | 
            +
             | 
| 375 | 
            +
                    if model.__class__.__name__ == 'PFENetWrapper':
         | 
| 376 | 
            +
                        dataset_args['image_size'] = config.image_size
         | 
| 377 | 
            +
             | 
| 378 | 
            +
                    log.info('init dataset', str(dataset_cls))
         | 
| 379 | 
            +
                    dataset = dataset_cls(**dataset_args)
         | 
| 380 | 
            +
             | 
| 381 | 
            +
                    log.info(f'Score on {model.__class__.__name__} on {dataset_cls.__name__}')
         | 
| 382 | 
            +
             | 
| 383 | 
            +
                    data_loader = torch.utils.data.DataLoader(dataset, batch_size=config.batch_size, shuffle=config.shuffle)
         | 
| 384 | 
            +
             | 
| 385 | 
            +
                    # explicitly set prompts
         | 
| 386 | 
            +
                    if config.prompt == 'plain':
         | 
| 387 | 
            +
                        model.prompt_list = ['{}']
         | 
| 388 | 
            +
                    elif config.prompt == 'fixed':
         | 
| 389 | 
            +
                        model.prompt_list = ['a photo of a {}.']
         | 
| 390 | 
            +
                    elif config.prompt == 'shuffle':
         | 
| 391 | 
            +
                        model.prompt_list = ['a photo of a {}.', 'a photograph of a {}.', 'an image of a {}.', '{}.']
         | 
| 392 | 
            +
                    elif config.prompt == 'shuffle_clip':
         | 
| 393 | 
            +
                        from models.clip_prompts import imagenet_templates
         | 
| 394 | 
            +
                        model.prompt_list = imagenet_templates
         | 
| 395 | 
            +
             | 
| 396 | 
            +
                    config.assume_no_unused_keys(exceptions=['max_iterations'])
         | 
| 397 | 
            +
             | 
| 398 | 
            +
                    t_start = time.time()
         | 
| 399 | 
            +
             | 
| 400 | 
            +
                    with torch.no_grad():  # TODO: switch to inference_mode (torch 1.9)
         | 
| 401 | 
            +
                        i, losses = 0, []
         | 
| 402 | 
            +
                        for data_x, data_y in data_loader:
         | 
| 403 | 
            +
             | 
| 404 | 
            +
                            data_x = [x.cuda() if isinstance(x, torch.Tensor) else x for x in data_x]
         | 
| 405 | 
            +
                            data_y = [x.cuda() if isinstance(x, torch.Tensor) else x for x in data_y]
         | 
| 406 | 
            +
             | 
| 407 | 
            +
                            if model.__class__.__name__ in {'ConditionBase4', 'PFENetWrapper'}:
         | 
| 408 | 
            +
                                pred, = model(data_x[0], data_x[1], data_x[2])
         | 
| 409 | 
            +
                                visual_q = None
         | 
| 410 | 
            +
                            else:
         | 
| 411 | 
            +
                                pred, visual_q, _, _  = model(data_x[0], data_x[1], return_features=True)
         | 
| 412 | 
            +
             | 
| 413 | 
            +
                            loss = loss_fn(pred, data_y[0])
         | 
| 414 | 
            +
             | 
| 415 | 
            +
                            metric.add([pred], data_y)
         | 
| 416 | 
            +
             | 
| 417 | 
            +
                            losses += [float(loss)]
         | 
| 418 | 
            +
             | 
| 419 | 
            +
                            i += 1
         | 
| 420 | 
            +
                            if config.max_iterations and i >= config.max_iterations:
         | 
| 421 | 
            +
                                break
         | 
| 422 | 
            +
             | 
| 423 | 
            +
                    # scores = {m: s for m, s in zip(metric.names(), metric.value())}
         | 
| 424 | 
            +
                    scores = metric.scores()
         | 
| 425 | 
            +
             | 
| 426 | 
            +
                    keys = set(scores.keys())
         | 
| 427 | 
            +
                    if dataset.negative_prob > 0 and 'mIoU' in keys:
         | 
| 428 | 
            +
                        keys.remove('mIoU')
         | 
| 429 | 
            +
             | 
| 430 | 
            +
                    name_mask = dataset.mask.replace('text_label', 'txt')[:3]
         | 
| 431 | 
            +
                    name_neg = '' if dataset.negative_prob == 0 else '_' + str(dataset.negative_prob)
         | 
| 432 | 
            +
                    
         | 
| 433 | 
            +
                    score_name = config.name if 'name' in config else f'{dataset_name}_{name_mask}{name_neg}'
         | 
| 434 | 
            +
             | 
| 435 | 
            +
                    scores = {score_name: {k: v for k,v in scores.items() if k in keys}}
         | 
| 436 | 
            +
                    scores[score_name].update({'test_loss': np.mean(losses)})
         | 
| 437 | 
            +
             | 
| 438 | 
            +
                    log.info(f'Evaluation took {time.time() - t_start:.1f}s')
         | 
| 439 | 
            +
             | 
| 440 | 
            +
                    return scores
         | 
| 441 | 
            +
                else:
         | 
| 442 | 
            +
                    raise ValueError('invalid test dataset')
         | 
| 443 | 
            +
             | 
| 444 | 
            +
             | 
| 445 | 
            +
             | 
| 446 | 
            +
             | 
| 447 | 
            +
             | 
| 448 | 
            +
             | 
| 449 | 
            +
             | 
| 450 | 
            +
             | 
| 451 | 
            +
             | 
| 452 | 
            +
            if __name__ == '__main__':
         | 
| 453 | 
            +
                main()
         | 
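score.py is a standalone entry point: main() builds the config via score_config_from_cli_args() (defined in general_utils.py, outside this hunk) and prints every scalar metric per dataset. A hedged sketch of calling score() directly, bypassing the CLI; the config keys shown are assumptions about what the 'pascal' branch reads, and 'rd64-uni' is a hypothetical logs/ directory:

    metrics = score({'test_dataset': 'pascal', 'mask': 'text', 'batch_size': 1,
                     'iteration_cp': None, 'max_iterations': 100, 'splits': [0]},
                    train_checkpoint_id='rd64-uni', train_config=None)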
    	
        clipseg/setup.py
    ADDED
    
    | @@ -0,0 +1,30 @@ | |
| 1 | 
            +
            from setuptools import setup
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            with open("README.md", "r", encoding="utf-8") as readme_file:
         | 
| 4 | 
            +
                readme = readme_file.read()
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            requirements = [
         | 
| 7 | 
            +
                "numpy",
         | 
| 8 | 
            +
                "scipy",
         | 
| 9 | 
            +
                "matplotlib",
         | 
| 10 | 
            +
                "torch",
         | 
| 11 | 
            +
                "torchvision",
         | 
| 12 | 
            +
                "opencv-python",
         | 
| 13 | 
            +
                "CLIP @ git+https://github.com/openai/CLIP.git"
         | 
| 14 | 
            +
            ]
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            setup(
         | 
| 17 | 
            +
                name='clipseg',
         | 
| 18 | 
            +
                packages=['clipseg'],
         | 
| 19 | 
            +
                package_dir={'clipseg': 'models'},
         | 
| 20 | 
            +
                package_data={'clipseg': [
         | 
| 21 | 
            +
                    "../weights/*.pth",
         | 
| 22 | 
            +
                ]},
         | 
| 23 | 
            +
                version='0.0.1',
         | 
| 24 | 
            +
                url='https://github.com/timojl/clipseg',
         | 
| 25 | 
            +
                python_requires='>=3.9',
         | 
| 26 | 
            +
                install_requires=requirements,
         | 
| 27 | 
            +
                description='This repository contains the code used in the paper "Image Segmentation Using Text and Image Prompts".',
         | 
| 28 | 
            +
                long_description=readme,
         | 
| 29 | 
            +
                long_description_content_type="text/markdown",
         | 
| 30 | 
            +
            )
         | 
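package_dir maps the 'clipseg' package onto the models/ directory and package_data ships ../weights/*.pth alongside it, so after an install the model code is importable as clipseg.clipseg. Note that the long-description open() targets README.md while this commit adds clipseg/Readme.md; on a case-sensitive filesystem one of the two names has to be adjusted for the build to succeed. A hedged usage sketch after installing from the clipseg/ folder:

    # pip install ./clipseg
    from clipseg.clipseg import CLIPSegMultiLabel  # models/clipseg.py exposed as clipseg.clipseg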
    	
        clipseg/training.py
    ADDED
    
    | @@ -0,0 +1,266 @@ | |
| 1 | 
            +
            import torch
         | 
| 2 | 
            +
            import inspect
         | 
| 3 | 
            +
            import json
         | 
| 4 | 
            +
            import yaml
         | 
| 5 | 
            +
            import math
         | 
| 6 | 
            +
            import os
         | 
| 7 | 
            +
            import sys
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            from general_utils import log
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            import numpy as np
         | 
| 12 | 
            +
            from functools import partial
         | 
| 13 | 
            +
            from os.path import expanduser, join, isfile, basename
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            from torch.cuda.amp import autocast, GradScaler
         | 
| 16 | 
            +
            from torch.optim.lr_scheduler import LambdaLR
         | 
| 17 | 
            +
            from contextlib import nullcontext
         | 
| 18 | 
            +
            from torch.utils.data import DataLoader
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            from general_utils import TrainingLogger, get_attribute, filter_args, log, training_config_from_cli_args
         | 
| 21 | 
            +
             | 
| 22 | 
            +
             | 
| 23 | 
            +
            def cosine_warmup_lr(i, warmup=10, max_iter=90):
         | 
| 24 | 
            +
                """ Cosine LR with Warmup """
         | 
| 25 | 
            +
                if i < warmup:
         | 
| 26 | 
            +
                    return (i+1)/(warmup+1)
         | 
| 27 | 
            +
                else:
         | 
| 28 | 
            +
                    return 0.5 + 0.5*math.cos(math.pi*(((i-warmup)/(max_iter- warmup))))
         | 
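cosine_warmup_lr ramps the learning-rate multiplier linearly over the first warmup steps (reaching 1.0 at i == warmup) and then decays it with a half-cosine to 0.0 at i == max_iter. It is meant to be plugged into the LambdaLR scheduler imported above; a sketch, where the iteration-budget name is an assumption:

    scheduler = LambdaLR(opt, partial(cosine_warmup_lr, warmup=10, max_iter=max_iterations))
    # LambdaLR and partial are already imported at the top of this file;
    # scheduler.step() is then called once per training iteration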
| 29 | 
            +
             | 
| 30 | 
            +
             | 
| 31 | 
            +
            def validate(model, dataset, config):
         | 
| 32 | 
            +
                data_loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=False)
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                metric_class, use_metric = config.val_metric_class, config.use_val_metric
         | 
| 35 | 
            +
                loss_fn = get_attribute(config.loss)
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                model.eval()
         | 
| 38 | 
            +
                model.cuda()
         | 
| 39 | 
            +
             | 
| 40 | 
            +
                if metric_class is not None:
         | 
| 41 | 
            +
                    metric = get_attribute(metric_class)()
         | 
| 42 | 
            +
             | 
| 43 | 
            +
                with torch.no_grad():
         | 
| 44 | 
            +
             | 
| 45 | 
            +
                    i, losses = 0, []
         | 
| 46 | 
            +
                    for data_x, data_y in data_loader:
         | 
| 47 | 
            +
             | 
| 48 | 
            +
                        data_x = [x.cuda() if isinstance(x, torch.Tensor) else x for x in data_x]
         | 
| 49 | 
            +
                        data_y = [x.cuda() if isinstance(x, torch.Tensor) else x for x in data_y]
         | 
| 50 | 
            +
             | 
| 51 | 
            +
                        prompts = model.sample_prompts(data_x[1], prompt_list=('a photo of a {}',))
         | 
| 52 | 
            +
                        pred, visual_q, _, _  = model(data_x[0], prompts, return_features=True)
         | 
| 53 | 
            +
             | 
| 54 | 
            +
                        if metric_class is not None:
         | 
| 55 | 
            +
                            metric.add([pred], data_y)
         | 
| 56 | 
            +
             | 
| 57 | 
            +
                        # pred = model(data_x[0], prompts)
         | 
| 58 | 
            +
                        # loss = loss_fn(pred[0], data_y[0])
         | 
| 59 | 
            +
                        loss = loss_fn(pred, data_y[0])
         | 
| 60 | 
            +
                        losses += [float(loss)]
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                        i += 1
         | 
| 63 | 
            +
             | 
| 64 | 
            +
                        if config.val_max_iterations is not None and i > config.val_max_iterations:
         | 
| 65 | 
            +
                            break
         | 
| 66 | 
            +
             | 
| 67 | 
            +
                if use_metric is None:
         | 
| 68 | 
            +
                    return np.mean(losses), {}, False
         | 
| 69 | 
            +
                else:
         | 
| 70 | 
            +
                    metric_scores = {m: s for m, s in zip(metric.names(), metric.value())} if metric_class is not None else {}
         | 
| 71 | 
            +
                    return np.mean(losses), metric_scores, True
         | 
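            # Note (added): validate() returns (mean validation loss, dict of metric scores, maximize flag).
            # main() uses the flag to decide whether a higher metric value (maximize=True) or a lower
            # validation loss (maximize=False) should trigger saving the current weights.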
| 72 | 
            +
             | 
| 73 | 
            +
             | 
| 74 | 
            +
            def main():
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                config = training_config_from_cli_args()
         | 
| 77 | 
            +
             | 
| 78 | 
            +
                val_interval, best_val_loss, best_val_score = config.val_interval, float('inf'), float('-inf')
         | 
| 79 | 
            +
             | 
| 80 | 
            +
                model_cls = get_attribute(config.model)
         | 
| 81 | 
            +
                _, model_args, _ = filter_args(config, inspect.signature(model_cls).parameters)
         | 
| 82 | 
            +
                model = model_cls(**model_args).cuda()
         | 
| 83 | 
            +
             | 
| 84 | 
            +
                dataset_cls = get_attribute(config.dataset)
         | 
| 85 | 
            +
                _, dataset_args, _ = filter_args(config, inspect.signature(dataset_cls).parameters)
         | 
| 86 | 
            +
             | 
| 87 | 
            +
                dataset = dataset_cls(**dataset_args)
         | 
| 88 | 
            +
             | 
| 89 | 
            +
                log.info(f'Train dataset {dataset.__class__.__name__} (length: {len(dataset)})')
         | 
| 90 | 
            +
             | 
| 91 | 
            +
                if val_interval is not None:
         | 
| 92 | 
            +
                    dataset_val_args = {k[4:]: v for k,v in config.items() if k.startswith('val_') and k != 'val_interval'}
         | 
| 93 | 
            +
                    _, dataset_val_args, _ = filter_args(dataset_val_args, inspect.signature(dataset_cls).parameters)
         | 
| 94 | 
            +
                    print('val args', {**dataset_args, **{'split': 'val', 'aug': 0}, **dataset_val_args})
         | 
| 95 | 
            +
             | 
| 96 | 
            +
                    dataset_val = dataset_cls(**{**dataset_args, **{'split': 'val', 'aug': 0}, **dataset_val_args})
         | 
| 97 | 
            +
             | 
| 98 | 
            +
                # optimizer
         | 
| 99 | 
            +
                opt_cls = get_attribute(config.optimizer)
         | 
| 100 | 
            +
                if config.optimizer == 'torch.optim.SGD':
         | 
| 101 | 
            +
                    opt_args = {'momentum': config.momentum if 'momentum' in config else 0}
         | 
| 102 | 
            +
                else:
         | 
| 103 | 
            +
                    opt_args = {}
         | 
| 104 | 
            +
                opt = opt_cls(model.parameters(), lr=config.lr, **opt_args)
         | 
| 105 | 
            +
             | 
| 106 | 
            +
                if config.lr_scheduler == 'cosine':
         | 
| 107 | 
            +
                    assert config.T_max is not None and config.eta_min is not None
         | 
| 108 | 
            +
                    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, config.T_max, config.eta_min)
         | 
| 109 | 
            +
                elif config.lr_scheduler == 'warmup_cosine':        
         | 
| 110 | 
            +
                    lr_scheduler = LambdaLR(opt, partial(cosine_warmup_lr, max_iter=(config.max_iterations), warmup=config.warmup))
         | 
| 111 | 
            +
                else:
         | 
| 112 | 
            +
                    lr_scheduler = None
         | 
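                # Illustration (added, hypothetical values not taken from the experiment YAMLs):
                # lr_scheduler='warmup_cosine' with warmup=2000 and max_iterations=20000 makes the
                # LambdaLR above scale config.lr by cosine_warmup_lr(i, warmup=2000, max_iter=20000)
                # at every step, while lr_scheduler='cosine' additionally requires T_max and eta_min.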
| 113 | 
            +
             | 
| 114 | 
            +
                batch_size, max_iterations = config.batch_size, config.max_iterations
         | 
| 115 | 
            +
             | 
| 116 | 
            +
                loss_fn = get_attribute(config.loss)
         | 
| 117 | 
            +
             | 
| 118 | 
            +
                if config.amp:
         | 
| 119 | 
            +
                    log.info('Using AMP')
         | 
| 120 | 
            +
                    autocast_fn = autocast
         | 
| 121 | 
            +
                    scaler = GradScaler()
         | 
| 122 | 
            +
                else:
         | 
| 123 | 
            +
                    autocast_fn, scaler = nullcontext, None
         | 
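                # Note (added): with AMP disabled, autocast_fn is a no-op context manager (nullcontext),
                # so the `with autocast_fn():` blocks below do nothing special and scaler=None selects
                # the plain loss.backward() / opt.step() path instead of the GradScaler path.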
| 124 | 
            +
             | 
| 125 | 
            +
             | 
| 126 | 
            +
                save_only_trainable = True
         | 
| 127 | 
            +
                data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=4)
         | 
| 128 | 
            +
             | 
| 129 | 
            +
            # pass no config to the logger during hyperparameter optimization to avoid writing logs.
         | 
| 130 | 
            +
                tracker_config = config if not config.hyperparameter_optimization else None
         | 
| 131 | 
            +
             | 
| 132 | 
            +
                with TrainingLogger(log_dir=config.name, model=model, config=tracker_config) as logger:
         | 
| 133 | 
            +
             | 
| 134 | 
            +
                    i = 0
         | 
| 135 | 
            +
                    while True:
         | 
| 136 | 
            +
                        for data_x, data_y in data_loader:
         | 
| 137 | 
            +
             | 
| 138 | 
            +
                            # idea: auxiliary objective between caption and output feature:
         | 
| 139 | 
            +
                            # 1. Sample random captions
         | 
| 140 | 
            +
                            # 2. Check alignment with CLIP
         | 
| 141 | 
            +
             | 
| 142 | 
            +
                            # randomly mix text and visual support conditionals
         | 
| 143 | 
            +
                            if config.mix:
         | 
| 144 | 
            +
             | 
| 145 | 
            +
                                assert config.mask.startswith('text_and')
         | 
| 146 | 
            +
             | 
| 147 | 
            +
                                with autocast_fn():
         | 
| 148 | 
            +
                                    # data_x[1] = text label
         | 
| 149 | 
            +
                                    prompts = model.sample_prompts(data_x[1])
         | 
| 150 | 
            +
             | 
| 151 | 
            +
                                    # model.clip_model()
         | 
| 152 | 
            +
             | 
| 153 | 
            +
                                    text_cond = model.compute_conditional(prompts)
         | 
| 154 | 
            +
                                    if model.__class__.__name__ == 'CLIPDensePredTMasked':
         | 
| 155 | 
            +
                                        # when mask=='separate'
         | 
| 156 | 
            +
                                        visual_s_cond, _, _ = model.visual_forward_masked(data_x[2].cuda(), data_x[3].cuda())
         | 
| 157 | 
            +
                                    else:
         | 
| 158 | 
            +
                                        # data_x[2] = visual prompt
         | 
| 159 | 
            +
                                        visual_s_cond, _, _ = model.visual_forward(data_x[2].cuda())
         | 
| 160 | 
            +
             | 
| 161 | 
            +
                                max_txt = config.mix_text_max if config.mix_text_max is not None else 1
         | 
| 162 | 
            +
                                batch_size = text_cond.shape[0]
         | 
| 163 | 
            +
             | 
| 164 | 
            +
                                # sample weights for each element in batch
         | 
| 165 | 
            +
                                text_weights = torch.distributions.Uniform(config.mix_text_min, max_txt).sample((batch_size,))[:, None]
         | 
| 166 | 
            +
                                text_weights = text_weights.cuda()
         | 
| 167 | 
            +
             | 
| 168 | 
            +
                                if dataset.__class__.__name__ == 'PhraseCut':
         | 
| 169 | 
            +
                                    # give full weight to text where support_image is invalid
         | 
| 170 | 
            +
                                    visual_is_valid = data_x[4] if model.__class__.__name__ == 'CLIPDensePredTMasked' else data_x[3]
         | 
| 171 | 
            +
                                    text_weights = torch.max(text_weights[:,0], 1 - visual_is_valid.float().cuda()).unsqueeze(1)
         | 
| 172 | 
            +
             | 
| 173 | 
            +
                                cond = text_cond * text_weights + visual_s_cond * (1 - text_weights)
         | 
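                                # Worked example (added): text_weights is drawn per batch element from
                                # Uniform(config.mix_text_min, config.mix_text_max or 1); a weight of 1.0
                                # means purely text conditioning, 0.0 purely visual-support conditioning,
                                # and for PhraseCut samples without a valid support image it is forced to 1.0 above.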
| 174 | 
            +
             | 
| 175 | 
            +
                            else:
         | 
| 176 | 
            +
                                # no mix
         | 
| 177 | 
            +
                                
         | 
| 178 | 
            +
                                if model.__class__.__name__ == 'CLIPDensePredTMasked':
         | 
| 179 | 
            +
                                    # compute conditional vector using CLIP masking
         | 
| 180 | 
            +
                                    with autocast_fn():
         | 
| 181 | 
            +
                                        assert config.mask == 'separate'
         | 
| 182 | 
            +
                                        cond, _, _ = model.visual_forward_masked(data_x[1].cuda(), data_x[2].cuda())
         | 
| 183 | 
            +
                                else:
         | 
| 184 | 
            +
                                    cond = data_x[1]
         | 
| 185 | 
            +
                                    if isinstance(cond, torch.Tensor):
         | 
| 186 | 
            +
                                        cond = cond.cuda()
         | 
| 187 | 
            +
             | 
| 188 | 
            +
                            with autocast_fn():
         | 
| 189 | 
            +
                                visual_q = None
         | 
| 190 | 
            +
             | 
| 191 | 
            +
                                pred, visual_q, _, _  = model(data_x[0].cuda(), cond, return_features=True)
         | 
| 192 | 
            +
             | 
| 193 | 
            +
                                loss = loss_fn(pred, data_y[0].cuda())
         | 
| 194 | 
            +
             | 
| 195 | 
            +
                                if torch.isnan(loss) or torch.isinf(loss):
         | 
| 196 | 
            +
                                # abort training if the loss is nan or inf
         | 
| 197 | 
            +
                                    log.warning('Training stopped due to inf/nan loss.')
         | 
| 198 | 
            +
                                    sys.exit(-1)
         | 
| 199 | 
            +
             | 
| 200 | 
            +
                                extra_loss = 0
         | 
| 201 | 
            +
                                loss += extra_loss
         | 
| 202 | 
            +
             | 
| 203 | 
            +
                            opt.zero_grad()
         | 
| 204 | 
            +
             | 
| 205 | 
            +
                            if scaler is None:
         | 
| 206 | 
            +
                                loss.backward()
         | 
| 207 | 
            +
                                opt.step()
         | 
| 208 | 
            +
                            else:
         | 
| 209 | 
            +
                                scaler.scale(loss).backward()
         | 
| 210 | 
            +
                                scaler.step(opt)
         | 
| 211 | 
            +
                                scaler.update()
         | 
| 212 | 
            +
             | 
| 213 | 
            +
                            if lr_scheduler is not None:
         | 
| 214 | 
            +
                                lr_scheduler.step()
         | 
| 215 | 
            +
                                if i % 2000 == 0:
         | 
| 216 | 
            +
                                    current_lr = [g['lr'] for g in opt.param_groups][0]
         | 
| 217 | 
            +
                                    log.info(f'current lr: {current_lr:.5f} ({len(opt.param_groups)} parameter groups)')
         | 
| 218 | 
            +
             | 
| 219 | 
            +
                            logger.iter(i=i, loss=loss)                    
         | 
| 220 | 
            +
                            i += 1
         | 
| 221 | 
            +
             | 
| 222 | 
            +
                            if i >= max_iterations:
         | 
| 223 | 
            +
             | 
| 224 | 
            +
                                if not isfile(join(logger.base_path, 'weights.pth')):
         | 
| 225 | 
            +
                                    # only write if no weights were already written
         | 
| 226 | 
            +
                                    logger.save_weights(only_trainable=save_only_trainable)
         | 
| 227 | 
            +
                                
         | 
| 228 | 
            +
                                sys.exit(0)
         | 
| 229 | 
            +
             | 
| 230 | 
            +
                                
         | 
| 231 | 
            +
                            if config.checkpoint_iterations is not None and i in config.checkpoint_iterations:
         | 
| 232 | 
            +
                                logger.save_weights(only_trainable=save_only_trainable, weight_file=f'weights_{i}.pth')
         | 
| 233 | 
            +
             | 
| 234 | 
            +
                            
         | 
| 235 | 
            +
                            if val_interval is not None and i % val_interval == val_interval - 1:
         | 
| 236 | 
            +
             | 
| 237 | 
            +
                                val_loss, val_scores, maximize = validate(model, dataset_val, config)
         | 
| 238 | 
            +
                                
         | 
| 239 | 
            +
                                if len(val_scores) > 0:
         | 
| 240 | 
            +
             | 
| 241 | 
            +
                                    score_str = f', scores: ' + ', '.join(f'{k}: {v}' for k, v in val_scores.items())
         | 
| 242 | 
            +
                                    
         | 
| 243 | 
            +
                                    if maximize and val_scores[config.use_val_metric] > best_val_score:
         | 
| 244 | 
            +
                                        logger.save_weights(only_trainable=save_only_trainable)
         | 
| 245 | 
            +
                                        best_val_score = val_scores[config.use_val_metric]
         | 
| 246 | 
            +
             | 
| 247 | 
            +
                                    elif not maximize and val_scores[config.use_val_metric] < best_val_score:
         | 
| 248 | 
            +
                                        logger.save_weights(only_trainable=save_only_trainable)
         | 
| 249 | 
            +
                                        best_val_score = val_scores[config.use_val_metric]
         | 
| 250 | 
            +
             | 
| 251 | 
            +
                                else:
         | 
| 252 | 
            +
                                    score_str = ''
         | 
| 253 | 
            +
                                    # if no score is used, fall back to loss
         | 
| 254 | 
            +
                                    if val_loss < best_val_loss:
         | 
| 255 | 
            +
                                        logger.save_weights(only_trainable=save_only_trainable)
         | 
| 256 | 
            +
                                        best_val_loss = val_loss
         | 
| 257 | 
            +
                                
         | 
| 258 | 
            +
                                log.info(f'Validation loss: {val_loss}' + score_str)
         | 
| 259 | 
            +
                                logger.iter(i=i, val_loss=val_loss, extra_loss=float(extra_loss), **val_scores)
         | 
| 260 | 
            +
                                model.train()
         | 
| 261 | 
            +
             | 
| 262 | 
            +
                        print('epoch complete')
         | 
| 263 | 
            +
             | 
| 264 | 
            +
             | 
| 265 | 
            +
            if __name__ == '__main__':
         | 
| 266 | 
            +
                main()
         | 
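A minimal sketch (added) of how weights written by logger.save_weights(only_trainable=True) could be restored for inference. The checkpoint path, the CLIPDensePredT arguments, and the assumption that logger.base_path resolves to the config.name directory are illustrative, not taken from this commit:

    import torch
    from models.clipseg import CLIPDensePredT  # import path when running from inside clipseg/

    model = CLIPDensePredT(version='ViT-B/16', reduce_dim=64)
    # only the trainable parameters were saved, so load the state dict non-strictly
    state = torch.load('experiment_name/weights.pth', map_location='cpu')
    model.load_state_dict(state, strict=False)
    model.eval()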
    	
        clipseg/weights/rd64-uni.pth
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:13845f6cee4d54ca46f62ee19dd354822094a26e0efccc64e606be93d6a7e26f
         | 
| 3 | 
            +
            size 4306645
         | 
    	
        init_image.png
    ADDED
    
    	
        inpainting.py
    ADDED
    
    | @@ -0,0 +1,194 @@ | |
| 1 | 
            +
            import inspect
         | 
| 2 | 
            +
            from typing import List, Optional, Union
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import numpy as np
         | 
| 5 | 
            +
            import torch
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            import PIL
         | 
| 8 | 
            +
            from diffusers import AutoencoderKL, DDIMScheduler, DiffusionPipeline, PNDMScheduler, UNet2DConditionModel
         | 
| 9 | 
            +
            from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
         | 
| 10 | 
            +
            from tqdm.auto import tqdm
         | 
| 11 | 
            +
            from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
         | 
| 12 | 
            +
             | 
| 13 | 
            +
             | 
| 14 | 
            +
            def preprocess_image(image):
         | 
| 15 | 
            +
                w, h = image.size
         | 
| 16 | 
            +
                w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
         | 
| 17 | 
            +
                image = image.resize((w, h), resample=PIL.Image.LANCZOS)
         | 
| 18 | 
            +
                image = np.array(image).astype(np.float32) / 255.0
         | 
| 19 | 
            +
                image = image[None].transpose(0, 3, 1, 2)
         | 
| 20 | 
            +
                image = torch.from_numpy(image)
         | 
| 21 | 
            +
                return 2.0 * image - 1.0
         | 
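            # Worked example (added): a 513x770 RGB PIL image is snapped down to multiples of 32
            # (512x768), converted to a float tensor of shape (1, 3, 768, 512), and rescaled from
            # [0, 255] to [-1, 1], which is the input range the Stable Diffusion VAE encoder expects.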
| 22 | 
            +
             | 
| 23 | 
            +
             | 
| 24 | 
            +
            def preprocess_mask(mask):
         | 
| 25 | 
            +
                mask = mask.convert("L")
         | 
| 26 | 
            +
                w, h = mask.size
         | 
| 27 | 
            +
                w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
         | 
| 28 | 
            +
                mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
         | 
| 29 | 
            +
                mask = np.array(mask).astype(np.float32) / 255.0
         | 
| 30 | 
            +
                mask = np.tile(mask, (4, 1, 1))
         | 
| 31 | 
            +
                mask = mask[None].transpose(0, 1, 2, 3)  # add a batch dimension (this particular transpose is a no-op)
         | 
| 32 | 
            +
                mask = 1 - mask  # repaint white, keep black
         | 
| 33 | 
            +
                mask = torch.from_numpy(mask)
         | 
| 34 | 
            +
                return mask
         | 
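            # Note (added): the mask is downscaled by a factor of 8 to the VAE latent resolution,
            # tiled to 4 channels to match the latent channels, and inverted: white input pixels
            # (region to repaint) become 0, black pixels (region to keep) become 1. This matches the
            # blend `init_latents_proper * mask + latents * (1 - mask)` used in the denoising loop.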
| 35 | 
            +
             | 
| 36 | 
            +
            class StableDiffusionInpaintingPipeline(DiffusionPipeline):
         | 
| 37 | 
            +
                def __init__(
         | 
| 38 | 
            +
                    self,
         | 
| 39 | 
            +
                    vae: AutoencoderKL,
         | 
| 40 | 
            +
                    text_encoder: CLIPTextModel,
         | 
| 41 | 
            +
                    tokenizer: CLIPTokenizer,
         | 
| 42 | 
            +
                    unet: UNet2DConditionModel,
         | 
| 43 | 
            +
                    scheduler: Union[DDIMScheduler, PNDMScheduler],
         | 
| 44 | 
            +
                    safety_checker: StableDiffusionSafetyChecker,
         | 
| 45 | 
            +
                    feature_extractor: CLIPFeatureExtractor,
         | 
| 46 | 
            +
                ):
         | 
| 47 | 
            +
                    super().__init__()
         | 
| 48 | 
            +
                    scheduler = scheduler.set_format("pt")
         | 
| 49 | 
            +
                    self.register_modules(
         | 
| 50 | 
            +
                        vae=vae,
         | 
| 51 | 
            +
                        text_encoder=text_encoder,
         | 
| 52 | 
            +
                        tokenizer=tokenizer,
         | 
| 53 | 
            +
                        unet=unet,
         | 
| 54 | 
            +
                        scheduler=scheduler,
         | 
| 55 | 
            +
                        safety_checker=safety_checker,
         | 
| 56 | 
            +
                        feature_extractor=feature_extractor,
         | 
| 57 | 
            +
                    )
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                @torch.no_grad()
         | 
| 60 | 
            +
                def __call__(
         | 
| 61 | 
            +
                    self,
         | 
| 62 | 
            +
                    prompt: Union[str, List[str]],
         | 
| 63 | 
            +
                    init_image: torch.FloatTensor,
         | 
| 64 | 
            +
                    mask_image: torch.FloatTensor,
         | 
| 65 | 
            +
                    strength: float = 0.8,
         | 
| 66 | 
            +
                    num_inference_steps: Optional[int] = 50,
         | 
| 67 | 
            +
                    guidance_scale: Optional[float] = 7.5,
         | 
| 68 | 
            +
                    eta: Optional[float] = 0.0,
         | 
| 69 | 
            +
                    generator: Optional[torch.Generator] = None,
         | 
| 70 | 
            +
                    output_type: Optional[str] = "pil",
         | 
| 71 | 
            +
                ):
         | 
| 72 | 
            +
             | 
| 73 | 
            +
                    if isinstance(prompt, str):
         | 
| 74 | 
            +
                        batch_size = 1
         | 
| 75 | 
            +
                    elif isinstance(prompt, list):
         | 
| 76 | 
            +
                        batch_size = len(prompt)
         | 
| 77 | 
            +
                    else:
         | 
| 78 | 
            +
                        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
         | 
| 79 | 
            +
             | 
| 80 | 
            +
                    if strength < 0 or strength > 1:
         | 
| 81 | 
            +
                        raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
         | 
| 82 | 
            +
             | 
| 83 | 
            +
                    # set timesteps
         | 
| 84 | 
            +
                    accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
         | 
| 85 | 
            +
                    extra_set_kwargs = {}
         | 
| 86 | 
            +
                    offset = 0
         | 
| 87 | 
            +
                    if accepts_offset:
         | 
| 88 | 
            +
                        offset = 1
         | 
| 89 | 
            +
                        extra_set_kwargs["offset"] = 1
         | 
| 90 | 
            +
             | 
| 91 | 
            +
                    self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
         | 
| 92 | 
            +
             | 
| 93 | 
            +
                    # preprocess image
         | 
| 94 | 
            +
                    init_image = preprocess_image(init_image).to(self.device)
         | 
| 95 | 
            +
             | 
| 96 | 
            +
                    # encode the init image into latents and scale the latents
         | 
| 97 | 
            +
                    init_latent_dist = self.vae.encode(init_image).latent_dist
         | 
| 98 | 
            +
                    init_latents = init_latent_dist.sample(generator=generator)
         | 
| 99 | 
            +
                    init_latents = 0.18215 * init_latents
         | 
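                    # Note (added): 0.18215 is the Stable Diffusion VAE latent scaling factor;
                    # its inverse (1 / 0.18215) is applied again just before decoding below.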
| 100 | 
            +
             | 
| 101 | 
            +
                    # prepare init_latents noise to latents
         | 
| 102 | 
            +
                    init_latents = torch.cat([init_latents] * batch_size)
         | 
| 103 | 
            +
                    init_latents_orig = init_latents
         | 
| 104 | 
            +
             | 
| 105 | 
            +
                    # preprocess mask
         | 
| 106 | 
            +
                    mask = preprocess_mask(mask_image).to(self.device)
         | 
| 107 | 
            +
                    mask = torch.cat([mask] * batch_size)
         | 
| 108 | 
            +
             | 
| 109 | 
            +
                    # check sizes
         | 
| 110 | 
            +
                    if not mask.shape == init_latents.shape:
         | 
| 111 | 
            +
                        raise ValueError(f"The mask and init_image should be the same size!")
         | 
| 112 | 
            +
             | 
| 113 | 
            +
                    # get the original timestep using init_timestep
         | 
| 114 | 
            +
                    init_timestep = int(num_inference_steps * strength) + offset
         | 
| 115 | 
            +
                    init_timestep = min(init_timestep, num_inference_steps)
         | 
| 116 | 
            +
                    timesteps = self.scheduler.timesteps[-init_timestep]
         | 
| 117 | 
            +
                    timesteps = torch.tensor([timesteps] * batch_size, dtype=torch.long, device=self.device)
         | 
| 118 | 
            +
             | 
| 119 | 
            +
                    # add noise to latents using the timesteps
         | 
| 120 | 
            +
                    noise = torch.randn(init_latents.shape, generator=generator, device=self.device)
         | 
| 121 | 
            +
                    init_latents = self.scheduler.add_noise(init_latents, noise, timesteps)
         | 
| 122 | 
            +
             | 
| 123 | 
            +
                    # get prompt text embeddings
         | 
| 124 | 
            +
                    text_input = self.tokenizer(
         | 
| 125 | 
            +
                        prompt,
         | 
| 126 | 
            +
                        padding="max_length",
         | 
| 127 | 
            +
                        max_length=self.tokenizer.model_max_length,
         | 
| 128 | 
            +
                        truncation=True,
         | 
| 129 | 
            +
                        return_tensors="pt",
         | 
| 130 | 
            +
                    )
         | 
| 131 | 
            +
                    text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
         | 
| 132 | 
            +
             | 
| 133 | 
            +
                    # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
         | 
| 134 | 
            +
                    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         | 
| 135 | 
            +
                    # corresponds to doing no classifier free guidance.
         | 
| 136 | 
            +
                    do_classifier_free_guidance = guidance_scale > 1.0
         | 
| 137 | 
            +
                    # get unconditional embeddings for classifier free guidance
         | 
| 138 | 
            +
                    if do_classifier_free_guidance:
         | 
| 139 | 
            +
                        max_length = text_input.input_ids.shape[-1]
         | 
| 140 | 
            +
                        uncond_input = self.tokenizer(
         | 
| 141 | 
            +
                            [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
         | 
| 142 | 
            +
                        )
         | 
| 143 | 
            +
                        uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
         | 
| 144 | 
            +
             | 
| 145 | 
            +
                        # For classifier free guidance, we need to do two forward passes.
         | 
| 146 | 
            +
                        # Here we concatenate the unconditional and text embeddings into a single batch
         | 
| 147 | 
            +
                        # to avoid doing two forward passes
         | 
| 148 | 
            +
                        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
         | 
| 149 | 
            +
             | 
| 150 | 
            +
                    # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         | 
| 151 | 
            +
                    # eta (η) is only used with the DDIMScheduler; it will be ignored for other schedulers.
         | 
| 152 | 
            +
                    # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
         | 
| 153 | 
            +
                    # and should be between [0, 1]
         | 
| 154 | 
            +
                    accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
         | 
| 155 | 
            +
                    extra_step_kwargs = {}
         | 
| 156 | 
            +
                    if accepts_eta:
         | 
| 157 | 
            +
                        extra_step_kwargs["eta"] = eta
         | 
| 158 | 
            +
             | 
| 159 | 
            +
                    latents = init_latents
         | 
| 160 | 
            +
                    t_start = max(num_inference_steps - init_timestep + offset, 0)
         | 
| 161 | 
            +
                    for i, t in tqdm(enumerate(self.scheduler.timesteps[t_start:])):
         | 
| 162 | 
            +
                        # expand the latents if we are doing classifier free guidance
         | 
| 163 | 
            +
                        latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
         | 
| 164 | 
            +
             | 
| 165 | 
            +
                        # predict the noise residual
         | 
| 166 | 
            +
                        noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]
         | 
| 167 | 
            +
             | 
| 168 | 
            +
                        # perform guidance
         | 
| 169 | 
            +
                        if do_classifier_free_guidance:
         | 
| 170 | 
            +
                            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
         | 
| 171 | 
            +
                            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
         | 
| 172 | 
            +
             | 
| 173 | 
            +
                        # compute the previous noisy sample x_t -> x_t-1
         | 
| 174 | 
            +
                        latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)["prev_sample"]
         | 
| 175 | 
            +
             | 
| 176 | 
            +
                        # masking
         | 
| 177 | 
            +
                        init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, t)
         | 
| 178 | 
            +
                        latents = (init_latents_proper * mask) + (latents * (1 - mask))
         | 
| 179 | 
            +
             | 
| 180 | 
            +
                    # scale and decode the image latents with vae
         | 
| 181 | 
            +
                    latents = 1 / 0.18215 * latents
         | 
| 182 | 
            +
                    image = self.vae.decode(latents).sample
         | 
| 183 | 
            +
             | 
| 184 | 
            +
                    image = (image / 2 + 0.5).clamp(0, 1)
         | 
| 185 | 
            +
                    image = image.cpu().permute(0, 2, 3, 1).numpy()
         | 
| 186 | 
            +
             | 
| 187 | 
            +
                    # run safety checker
         | 
| 188 | 
            +
                    safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(self.device)
         | 
| 189 | 
            +
                    image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_checker_input.pixel_values)
         | 
| 190 | 
            +
             | 
| 191 | 
            +
                    if output_type == "pil":
         | 
| 192 | 
            +
                        image = self.numpy_to_pil(image)
         | 
| 193 | 
            +
             | 
| 194 | 
            +
                    return {"sample": image, "nsfw_content_detected": has_nsfw_concept}
         | 
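A minimal usage sketch (added) for this pipeline. It assumes an SD v1 checkpoint that provides all of the registered components (e.g. CompVis/stable-diffusion-v1-4), an accepted license / auth token, a CUDA device, and the init_image.png / mask_image.png files added in this commit; white mask pixels mark the area to repaint:

    import torch
    from PIL import Image
    from inpainting import StableDiffusionInpaintingPipeline

    pipe = StableDiffusionInpaintingPipeline.from_pretrained(
        "CompVis/stable-diffusion-v1-4",  # assumption: any SD v1 checkpoint with matching components
        use_auth_token=True,
    ).to("cuda")

    init_image = Image.open("init_image.png").convert("RGB").resize((512, 512))
    mask_image = Image.open("mask_image.png").convert("RGB").resize((512, 512))

    result = pipe(prompt="a red sports car", init_image=init_image, mask_image=mask_image,
                  strength=0.75, guidance_scale=7.5, num_inference_steps=50)
    result["sample"][0].save("output.png")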
    	
        mask_image.png
    ADDED
    
 
			
