{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text",
        "id": "view-in-github"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/ivelin/donut_ui_refexp/blob/main/Inference_Playground_Donut_UI_RefExp_Gradio.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "x6dFfL0QUr8P",
        "outputId": "58f3b497-f4e8-46bc-a40c-b564f6e14010"
      },
      "outputs": [],
      "source": [
        "#@title Check out source repo if not automatically available\n",
        "# !git clone https://github.com/GuardianUI/ui-refexp\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "RQdzURjDWYco",
        "outputId": "2628c536-780e-4544-8f37-33a7e79ee367"
      },
      "outputs": [],
      "source": [
        "# Go to the hf-space dir if not already there.\n",
        "# Use the %cd magic for this: `!cd` runs in a throwaway subshell and\n",
        "# does not change the working directory of the kernel.\n",
        "# %cd ui-refexp/hf-space\n",
        "\n",
        "# Install dependencies with %pip (not !pip3) so they land in the\n",
        "# environment of the running kernel rather than whichever pip3 is on PATH.\n",
        "%pip install -r requirements.txt"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# from PIL import Image, ImageDraw\n",
        "# from transformers import DonutProcessor, VisionEncoderDecoderModel\n",
        "\n",
        "# pretrained_repo_name = 'ivelin/donut-refexp-click'\n",
        "# pretrained_revision = 'main'\n",
        "# # revision can be git commit hash, branch or tag\n",
        "# # use 'main' for latest revision\n",
        "# print(f\"Loading model checkpoint: {pretrained_repo_name}\")\n",
        "\n",
        "# # Never embed an access token in the notebook. Run `huggingface-cli login` once\n",
        "# # and pass use_auth_token=True so the locally stored credential is used.\n",
        "# proc = DonutProcessor.from_pretrained(\n",
        "#     pretrained_repo_name, revision=pretrained_revision, use_auth_token=True)\n",
        "# proc.image_processor.do_align_long_axis = False\n",
        "# proc.image_processor.do_resize = False\n",
        "# proc.image_processor.do_thumbnail = False\n",
        "# proc.image_processor.do_pad = False\n",
        "# proc.image_processor.do_rescale = False\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {},
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/home/gitpod/.pyenv/versions/3.8.16/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
            "  from .autonotebook import tqdm as notebook_tqdm\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Loading model checkpoint: ivelin/donut-refexp-click\n",
            "processor image size: {'height': 1280, 'width': 960}\n",
            "Running on local URL:  http://127.0.0.1:7860\n",
            "Running on public URL: https://f2beb057-2b06-4a52.gradio.live\n",
            "\n",
            "This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces\n"
          ]
        },
        {
          "data": {
            "text/html": [
              "<div><iframe src=\"https://f2beb057-2b06-4a52.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "(image, prompt): <PIL.Image.Image image mode=RGB size=2719x980 at 0x7F8F12C6E3D0>, click on search button\n",
            "predicted decoder sequence: &lt;s_refexp&gt;&lt;s_prompt&gt; click on search button&lt;/s_prompt&gt;&lt;s_target_center&gt;&lt;s_x&gt; 0.23&lt;/s_x&gt;&lt;s_y&gt; 0.33&lt;/s_y&gt;&lt;/s_target_center&gt;&lt;/s&gt;\n",
            "predicted decoder sequence before token2json: &lt;s_prompt&gt; click on search button&lt;/s_prompt&gt;&lt;s_target_center&gt;&lt;s_x&gt; 0.23&lt;/s_x&gt;&lt;s_y&gt; 0.33&lt;/s_y&gt;&lt;/s_target_center&gt;\n",
            "predicted center_point with text coordinates: {'x': '0.23', 'y': '0.33'}\n",
            "predicted center_point with float coordinates: {'x': 0.23, 'y': 0.33, 'decoder output sequence (before x,y adjustment)': '<s_prompt> click on search button</s_prompt><s_target_center><s_x> 0.23</s_x><s_y> 0.33</s_y></s_target_center>'}\n",
            "input image size: (2719, 980)\n",
            "processed prompt: <s_refexp><s_prompt>click on search button</s_prompt><s_target_center>\n",
            "point={'x': 0.23, 'y': 0.33, 'decoder output sequence (before x,y adjustment)': '<s_prompt> click on search button</s_prompt><s_target_center><s_x> 0.23</s_x><s_y> 0.33</s_y></s_target_center>'}, input_image_size=(2719, 980), output_image_size=(960, 1280)\n",
            ">>> resized_width=960\n",
            ">>> resized_height=346\n",
            "translated point={'x': 0.23, 'y': 1.2208092485549134, 'decoder output sequence (before x,y adjustment)': '<s_prompt> click on search button</s_prompt><s_target_center><s_x> 0.23</s_x><s_y> 0.33</s_y></s_target_center>'}, resized_image_size: (960, 346)\n",
            "to image pixel values: x, y: (625, 1196)\n",
            "(image, prompt): <PIL.Image.Image image mode=RGB size=2719x980 at 0x7F8F12C5C9D0>, click on search names\n",
            "predicted decoder sequence: &lt;s_refexp&gt;&lt;s_prompt&gt; click on search names&lt;/s_prompt&gt;&lt;s_target_center&gt;&lt;s_x&gt; 0.5&lt;/s_x&gt;&lt;s_y&gt; 0.18&lt;/s_y&gt;&lt;/s_target_center&gt;&lt;/s&gt;\n",
            "predicted decoder sequence before token2json: &lt;s_prompt&gt; click on search names&lt;/s_prompt&gt;&lt;s_target_center&gt;&lt;s_x&gt; 0.5&lt;/s_x&gt;&lt;s_y&gt; 0.18&lt;/s_y&gt;&lt;/s_target_center&gt;\n",
            "predicted center_point with text coordinates: {'x': '0.5', 'y': '0.18'}\n",
            "predicted center_point with float coordinates: {'x': 0.5, 'y': 0.18, 'decoder output sequence (before x,y adjustment)': '<s_prompt> click on search names</s_prompt><s_target_center><s_x> 0.5</s_x><s_y> 0.18</s_y></s_target_center>'}\n",
            "input image size: (2719, 980)\n",
            "processed prompt: <s_refexp><s_prompt>click on search names</s_prompt><s_target_center>\n",
            "point={'x': 0.5, 'y': 0.18, 'decoder output sequence (before x,y adjustment)': '<s_prompt> click on search names</s_prompt><s_target_center><s_x> 0.5</s_x><s_y> 0.18</s_y></s_target_center>'}, input_image_size=(2719, 980), output_image_size=(960, 1280)\n",
            ">>> resized_width=960\n",
            ">>> resized_height=346\n",
            "translated point={'x': 0.5, 'y': 0.6658959537572254, 'decoder output sequence (before x,y adjustment)': '<s_prompt> click on search names</s_prompt><s_target_center><s_x> 0.5</s_x><s_y> 0.18</s_y></s_target_center>'}, resized_image_size: (960, 346)\n",
            "to image pixel values: x, y: (1359, 652)\n"
          ]
        }
      ],
      "source": [
        "import app\n",
        "\n",
        "# img = Image.open('val-image-4.jpg')\n",
        "# print(img.size)\n",
        "# display(img)\n",
        "# out_size = (proc.image_processor.size['width'],\n",
        "#             proc.image_processor.size['height'])\n",
        "# oimg = app.prepare_image_for_encoder(img, output_image_size=out_size)\n",
        "# print(oimg.size)\n",
        "# display(oimg)\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# import transformers\n",
        "\n",
        "# turn off normalization so we can see the image;\n",
        "# otherwise the pixel values are tiny [0..1] floats that all look like the color black (0)\n",
        "# proc.image_processor.do_normalize = False\n",
        "\n",
        "# npimg = proc.image_processor.preprocess(oimg)\n",
        "# pimg = transformers.image_transforms.to_pil_image(npimg['pixel_values'][0])\n",
        "# pimg.save('tmp.png')\n",
        "# display(pimg)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "include_colab_link": true,
      "provenance": []
    },
    "gpuClass": "standard",
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.16"
    },
    "vscode": {
      "interpreter": {
        "hash": "9ac03a0a6051494cc606d484d27d20fce22fb7b4d169f583271e11d5ba46a56e"
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}