{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset, load_from_disk, Dataset\n", "import matplotlib.pyplot as plt\n", "from PIL import Image\n", "import numpy as np\n", "from hashlib import sha256\n", "import requests\n", "from io import BytesIO\n", "from tqdm import tqdm\n", "from collections import defaultdict\n", "import json\n", "import os\n", "import random\n", "import math\n", "%matplotlib inline\n", "\n", "def plot(img, points=None):\n", " plt.imshow(img)\n", " if points:\n", " for p in points:\n", " plt.scatter(p[0], p[1], color='red', s=10)\n", " plt.axis('off')\n", " plt.show()\n", "\n", "def plot_masked_image(img: Image.Image, mask: np.ndarray):\n", " \"\"\"\n", " Display only the parts of `img` where `mask` is True.\n", " Parameters:\n", " - img: PIL.Image of size (w, h) or (h, w)\n", " - mask: 2D boolean numpy array of shape (h, w)\n", " \"\"\"\n", " # Ensure image has an alpha channel\n", " img_rgba = img.convert(\"RGBA\")\n", " arr = np.array(img_rgba)\n", "\n", " # Set alpha to 255 where mask=True, 0 otherwise\n", " arr[..., 3] = mask.astype(np.uint8) * 255\n", "\n", " # Convert back and plot\n", " masked_img = Image.fromarray(arr)\n", " plt.figure(figsize=(6, 6))\n", " plt.imshow(masked_img)\n", " plt.axis('off')\n", " plt.show()\n", "\n", "def url2img(url):\n", " response = requests.get(url)\n", " img = Image.open(BytesIO(response.content))\n", " return img\n", "\n", "def denorm(x,y,w,h):\n", " x = (x/100) * w\n", " y = (x/100) * h\n", " return x,y\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### num_bbox: num_images" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' \\nFor pointing subset:\\n num unique images = 155k\\n bbox per image : num_images = {\\n 0: 280k\\n 1: 1.2M\\n 2: 235k\\n 3: 100k\\n 4: 80k\\n 5: 40k\\n 6: 30k....\\n }\\n\\n one_bbox, pointing subset: 1.2M images, 152k unique images\\n\\n 1. train all 1.1 images (best)\\n 3. sample 150k random images (suboptimal) \\n 2. train 150k unique images and sample:\\n a. k random bboxes\\n b. k largest bboxes\\n\\nwrite code : unique image --> bboxes (a or b)\\nfor each 80k images of pixmo --> 80k images of refCOCOg (higher quality)\\n'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\" \n", "For pointing subset:\n", " num unique images = 155k\n", " bbox per image : num_images = {\n", " 0: 280k\n", " 1: 1.2M\n", " 2: 235k\n", " 3: 100k\n", " 4: 80k\n", " 5: 40k\n", " 6: 30k....\n", " }\n", "\n", " one_bbox, pointing subset: 1.2M images, 152k unique images\n", "\n", " 1. train all 1.1 images (best)\n", " 3. sample 150k random images (suboptimal) \n", " 2. train 150k unique images and sample:\n", " a. k random bboxes\n", " b. k largest bboxes\n", "\n", "write code : unique image --> bboxes (a or b)\n", "for each 80k images of pixmo --> 80k images of refCOCOg (higher quality)\n", "\"\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Pointing subset DataFrame (2M samples)\n", "data = /pixmo-train\n", "image_dir = /pixmo_images" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# data = load_dataset(\"allenai/pixmo-points\")['train']\n", "# df = data.to_pandas()\n", "# df.head(3)\n", "\n", "\n", "# pixmoid2url = dict(zip(list(df.pixmo_id),list(df.image_url) ))\n", "# import pickle\n", "# with open(\"/storage/users/manugaur/mllm_inversion/pixmo/pixmoid2url.pkl\", \"wb\") as f:\n", "# pickle.dump(pixmoid2url, f)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# url2pixmoID = {}\n", "# counter = 0\n", "# for url in list(df.image_url):\n", "# if url not in url2pixmoID:\n", "# url2pixmoID[url] = F\"pixmo{counter}\"\n", "# counter+=1\n", "# else:\n", "# continue\n", "# pixmoID = [url2pixmoID[url] for url in list(df['image_url'])] #new column\n", "# df[\"pixmo_id\"] = pixmoID\n", "## df.to_csv(\"/storage/users/manugaur/mllm_inversion/pixmo/pixmo-train.csv\")\n", "## Dataset.from_pandas(df).save_to_disk(\"/storage/users/manugaur/mllm_inversion/pixmo/pixmo-train\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "num image-text pairs : 2057090\n" ] }, { "data": { "text/html": [ "
\n", " | image_url | \n", "image_sha256 | \n", "points | \n", "count | \n", "label | \n", "collection_method | \n", "pixmo_id | \n", "
---|---|---|---|---|---|---|---|
319132 | \n", "https://www.greatplacetowork.com/images/profil... | \n", "fa55de563cfa9562e9afd78a7921753e1f45895d786ed0... | \n", "[{'x': 91.23325516399498, 'y': 31.635033060150... | \n", "NaN | \n", "balloons | \n", "pointing | \n", "pixmo72762 | \n", "