BoltzmannEntropy committed
Commit fadb9e6 · 0 parent(s)

First commit
Dockerfile ADDED
@@ -0,0 +1,112 @@
# Dockerfile customized for deployment on the HuggingFace Spaces platform

# -- This Dockerfile has been tailored specifically for the HuggingFace Spaces environment.
# -- Certain modifications and optimizations were made with that target platform in mind.

# FROM pytorch/pytorch:2.2.1-cuda12.1-cudnn8-devel
FROM pytorch/pytorch:2.4.0-cuda12.1-cudnn9-devel
# FOR HF

USER root

ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y \
    git \
    cmake \
    python3 \
    python3-pip \
    python3-venv \
    python3-dev \
    python3-numpy \
    gcc \
    build-essential \
    gfortran \
    wget \
    curl \
    pkg-config \
    software-properties-common \
    zip \
    && apt-get clean && rm -rf /tmp/* /var/tmp/*

RUN apt-get update && DEBIAN_FRONTEND=noninteractive \
    apt-get install -y python3.10 python3-pip

RUN apt-get install -y libopenblas-base libopenmpi-dev

ENV TZ=Asia/Dubai
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

RUN useradd -m -u 1000 user

RUN apt-get update && apt-get install -y sudo && \
    echo 'user ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

RUN mkdir $HOME/app
RUN mkdir $HOME/app/test_images

# WORKDIR $HOME/app

RUN chown -R user:user $HOME/app

USER user
WORKDIR $HOME/app

RUN python -m pip install qwen-vl-utils
RUN python -m pip install --pre -U -f https://mlc.ai/wheels mlc-llm-nightly-cu122 mlc-ai-nightly-cu122

RUN python3 -m pip install chromadb db-sqlite3 auto-gptq exllama sqlalchemy
WORKDIR $HOME/app
RUN git clone https://github.com/casper-hansen/AutoAWQ
WORKDIR $HOME/app/AutoAWQ/
RUN python3 -m pip install -e .
WORKDIR $HOME/app
# ENV FLASH_ATTENTION_FORCE_BUILD=TRUE
RUN python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
RUN python -m pip install accelerate diffusers datasets timm flash-attn==2.6.1 gradio

RUN python3 -m pip install --no-deps optimum
# Quote the version specifier so the shell does not treat '>' as a redirection
RUN python3 -m pip install --no-deps "autoawq>=0.1.8"

# This reinstall seems to be a must: Intel Extension for PyTorch 2.4 needs to work with PyTorch 2.4.*, otherwise PyTorch 2.2.2 is found.
RUN python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
RUN python3 -m pip install -U accelerate
RUN python3 -m pip install -U git+https://github.com/huggingface/transformers

WORKDIR $HOME/app
COPY --chown=user:user app.py .
COPY --chown=user:user test_images /home/user/app/test_images
# /home/user/app/
# chown -R user:user /home/user/.cache/

ENV PYTHONUNBUFFERED=1 GRADIO_ALLOW_FLAGGING=never GRADIO_NUM_PORTS=1 GRADIO_SERVER_NAME=0.0.0.0 GRADIO_SERVER_PORT=7860 SYSTEM=spaces
RUN python3 -m pip install pennylane sympy pennylane-qiskit duckdb
WORKDIR $HOME/app

EXPOSE 8097 7842 8501 8000 6666 7860

CMD ["python", "app.py"]

# Known runtime errors and warnings observed on Spaces:
# ERROR! Intel® Extension for PyTorch* needs to work with PyTorch 2.4.*, but PyTorch 2.2.2 is found. Please switch to the matching version and run again.
# `Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
# /home/user/.local/lib/python3.10/site-packages/transformers/modeling_utils.py:4749: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead
#   warnings.warn(
# /home/user/.local/lib/python3.10/site-packages/accelerate/utils/imports.py:336: UserWarning: Intel Extension for PyTorch 2.4 needs to work with PyTorch 2.4.*, but PyTorch 2.2.2 is found. Please switch to the matching version and run again.
#   warnings.warn(
# Error loading model Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4: Found modules on cpu/disk.
# Using Exllama or Exllamav2 backend requires all the modules to be on GPU. You can deactivate the exllama backend by setting `disable_exllama=True` in the quantization config object
# Error loading model Qwen/Qwen2-VL-7B-Instruct: (ReadTimeoutError("HTTPSConnectionPool(host='hf.co', port=443):
# Read timed out. (read timeout=10)"), '(Request ID: b8269a88-9b6b-43e0-942d-1049f173dc00)')

# Error loading model Qwen/Qwen2-VL-7B-Instruct: CUDA out of memory.
# Tried to allocate 130.00 MiB. GPU 0 has a total capacity of 14.58 GiB of which 77.62 MiB is free.

# instruct: FlashAttention only supports Ampere GPUs or newer.
README.md ADDED
@@ -0,0 +1,297 @@
---
title: Vlms
emoji: 🔥
colorFrom: gray
colorTo: green
sdk: docker
pinned: false
license: mit
---

# VLM-Image-Analysis: A Vision-and-Language Modeling Framework

Welcome to the Hugging Face Space for VLM-Image-Analysis. This Space showcases a cutting-edge framework that combines multiple Vision-Language Models (VLMs) and a Large Language Model (LLM) to provide comprehensive image analysis and captioning.

<h1 align="center">
  <img src="static/image.jpg" width="50%">
  <h6>(Adapted from wang2023allseeing: https://huggingface.co/datasets/OpenGVLab/CRPE?row=1)</h6>
</h1>

This repository contains the core code for a multi-model framework that enhances image interpretation through the combined power of several Vision-and-Language Modeling (VLM) systems. VLM-Image-Analysis delivers detailed, multi-faceted analyses of images by leveraging N cutting-edge VLMs, pre-trained on a wide range of datasets to detect diverse visual cues and linguistic patterns.

It is available at the following Hugging Face Space: https://huggingface.co/spaces/BoltzmannEntropy/vlms/

<h1 align="center">
  <img src="static/003.png" width="100%">
</h1>

1. Multiple VLMs analyze the input image independently, providing diverse perspectives.
2. A 'judge' LLM synthesizes these outputs into a comprehensive, coherent description.

## Abstract

This research introduces a novel technique in the realm of automatic image captioning that leverages the collaborative potential between Vision-Language Models (VLMs) and Large Language Models (LLMs). Our method utilizes a two-tiered framework: initially, various VLMs provide diverse verbal descriptions of an image based on unique prompts; subsequently, this array of captions is integrated by a central 'judge' LLM to produce a cohesive and comprehensive caption that encapsulates the essence of the image. The objective of this synergistic approach is to elevate the precision, richness, and contextual appropriateness of image descriptions by pooling together diverse model capabilities. We validate the effectiveness of our dual-model strategy across a spectrum of VLMs (namely Qwen2-VL, Phi-3-vision, and Moondream2) and assess its performance on different datasets. Our empirical results indicate that this ensemble method yields image captions with substantially higher quality and depth than traditional single-model approaches, marking a significant stride in the evolution of computer vision technology.

## Introduction

The field of computer vision has undergone significant transformations in recent years, especially within the niche area of generating descriptive text for images, a task known as image captioning. Vision-Language Models (VLMs) have risen to prominence as instrumental tools that facilitate the seamless integration of visual information with natural-language comprehension. Despite their capabilities, individual VLMs possess unique advantages and constraints, which can sometimes result in descriptions that are either biased or incomplete. This opens up scope for creative approaches that capitalize on the strengths of multiple models to produce more balanced and comprehensive image interpretations.

<h1 align="center">
  <img src="static/005.png" width="100%">
</h1>

To address this issue, we introduce a novel methodology that leverages the combined strengths of multiple VLMs in conjunction with a Large Language Model (LLM) acting as a judge. Our approach is designed to produce more comprehensive, accurate, and contextually rich image captions by following a two-stage process:

1. **Multi-VLM Caption Generation**: In the first stage, we employ several distinct VLMs, including Qwen2-VL, Phi-3-vision, and Moondream2, to generate image descriptions. Each model is prompted with different instructions, encouraging diverse perspectives on the image content. This diversity is crucial, as it allows for a more comprehensive capture of the image's nuances and details.

2. **LLM-based Caption Synthesis**: The second stage involves using a 'judge' LLM to analyse and synthesise the outputs from the first stage. This model is tasked with unifying the various descriptions into a single, coherent caption that captures the essence of all initial responses. The judge LLM's role is not merely to aggregate information but to intelligently combine and refine the insights provided by the VLMs.

Our methodology leverages state-of-the-art models and techniques, including:

- The Qwen2-VL family of models, which offer varying capabilities and model sizes, allowing for a range of perspectives and computational efficiencies.
- Microsoft's Phi-3-vision model, known for its efficiency and performance in visual understanding tasks.
- The Moondream2 model, which brings unique perspectives to image understanding, potentially capturing aspects overlooked by other models.

The implementation utilises advanced libraries such as PyTorch and Hugging Face's Transformers, ensuring efficient processing and easy integration with existing deep learning workflows. We have also incorporated features to handle both individual images and batch processing from datasets or ZIP files, making our approach versatile for various research and practical applications.

By combining multiple VLMs and using an LLM as a judge, we aim to mitigate individual model biases, capture a broader range of image details, and produce captions that are more informative and contextually appropriate. This approach not only improves the quality of image captioning but also opens up new avenues for exploring the synergies between different AI models in multimodal tasks.

It is important to note, however, that while our method shows promise, it is not without limitations. The effectiveness of the approach may vary depending on the specific combination of models used and the nature of the images being captioned. Additionally, the computational resources required for this ensemble approach are significantly higher than those needed for single-model methods, which may limit its applicability in resource-constrained environments.

In the following sections, we detail our methodology, present the experimental setup, and discuss the results and implications of our findings. We believe this work contributes significantly to the field of image captioning and demonstrates the potential of ensemble approaches in vision-language tasks.

## Table of Contents

- [VLM-Image-Analysis: A Vision-and-Language Modeling Framework](#vlm-image-analysis-a-vision-and-language-modeling-framework)
  - [Abstract](#abstract)
  - [Introduction](#introduction)
  - [Table of Contents](#table-of-contents)
  - [Base VLMs](#base-vlms)
  - [Judge VLM](#judge-vlm)
  - [How to Use](#how-to-use)
  - [Example Process](#example-process)
  - [Models](#models)
  - [Setup](#setup)
  - [Inference](#inference)
  - [The Gradio APP](#the-gradio-app)
    - [1. VLM Model Selection](#1-vlm-model-selection)
    - [2. Device Settings](#2-device-settings)
    - [3. Image Selection](#3-image-selection)
  - [Docker Setup for LLMs](#docker-setup-for-llms)
    - [Base System](#base-system)
    - [LLM-related Packages](#llm-related-packages)
    - [Jupyter Configuration](#jupyter-configuration)
    - [Exposed Ports](#exposed-ports)
  - [Prompts](#prompts)
    - [Example of Available Prompts](#example-of-available-prompts)
  - [Citation](#citation)

## Base VLMs

The first phase of the framework isolates each specialized VLM to individually analyse an input image. Each model processes the image independently, resulting in multiple descriptive outputs from different viewpoints. These models are carefully selected based on their unique strengths in visual and textual analysis, ensuring a comprehensive examination of the image.

For instance, a VLM might be prompted with the following directive:
```
"Provide an extensive description of all elements in this image, including objects, people, and activities."
```

This phase enables the generation of varied textual descriptions and captions, offering diverse perspectives on the visual content; a minimal sketch of this stage is shown below.
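
As a minimal sketch of this first stage, the snippet below runs a single base VLM (`vikhyatk/moondream2`, loaded with the same constants and calls used in `app.py`) on one bundled test image with one prompt; the particular image path is illustrative and simply points at one of the files shipped in `test_images/`:

```python
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

# Constants mirrored from app.py
MOON_DREAM_MODEL_ID = "vikhyatk/moondream2"
MOON_DREAM_REVISION = "2024-08-26"

# Load one base VLM (trust_remote_code is required for moondream2's custom modeling code)
model = AutoModelForCausalLM.from_pretrained(
    MOON_DREAM_MODEL_ID, trust_remote_code=True, revision=MOON_DREAM_REVISION
).eval()
tokenizer = AutoTokenizer.from_pretrained(MOON_DREAM_MODEL_ID, revision=MOON_DREAM_REVISION)

# Stage 1: one image + one prompt -> one candidate caption
image = Image.open("test_images/001.png").convert("RGB")
prompt = ("Provide an extensive description of all elements in this image, "
          "including objects, people, and activities.")
enc_image = model.encode_image(image)   # moondream2-specific image encoder
caption = model.answer_question(enc_image, prompt, tokenizer)
print(caption)
```

Running several such models (Qwen2-VL, Phi-3-vision, Moondream2) with several prompts yields the pool of candidate captions that is handed to the judge in the next phase.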

## Judge VLM

In the second phase, the outputs from the base VLMs are passed to a synthesizing "Judge" VLM. This model is tasked with merging the individual outputs into a coherent, unified conclusion that captures the overall context of the image.

The Judge VLM is designed to handle different types of input data, including the textual descriptions produced by each of the base VLMs. Its role is to harmonise and integrate these multiple viewpoints, providing a more comprehensive synthesis of the image's content; an illustrative sketch of this synthesis step follows.
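
The judge step is not wired into `app.py` in this commit, so the following is only a plausible sketch: it prompts the judge model named in the Example Process section (`Qwen/Qwen2-VL-7B-Instruct`) with the original image plus the candidate captions, reusing the same processor and `generate` pattern as `app.py`. The helper name `synthesize_captions` and the judge prompt wording are illustrative assumptions, not the project's API:

```python
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

JUDGE_MODEL = "Qwen/Qwen2-VL-7B-Instruct"  # judge model named in the Example Process section

judge = Qwen2VLForConditionalGeneration.from_pretrained(
    JUDGE_MODEL, torch_dtype=torch.float16, device_map="auto"
).eval()
processor = AutoProcessor.from_pretrained(JUDGE_MODEL)

def synthesize_captions(image: Image.Image, candidate_captions: list[str]) -> str:
    """Hypothetical judge step: merge the base-VLM captions into one unified caption."""
    judge_prompt = (
        "Several vision-language models described this image as follows:\n"
        + "\n".join(f"- {c}" for c in candidate_captions)
        + "\nSynthesize these descriptions into a single, coherent, comprehensive caption."
    )
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": judge_prompt},
        ],
    }]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt").to(judge.device)
    generated = judge.generate(**inputs, max_new_tokens=512)
    # Strip the prompt tokens before decoding, as in app.py
    trimmed = [out[len(inp):] for inp, out in zip(inputs["input_ids"], generated)]
    return processor.batch_decode(trimmed, skip_special_tokens=True)[0]
```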

## How to Use

1. **Access the Space**: Click on the "Open in Spaces" badge at the top of this README or visit [https://huggingface.co/spaces/BoltzmannEntropy/vlms](https://huggingface.co/spaces/BoltzmannEntropy/vlms)

2. **Select a Dataset**: Choose from the available Hugging Face datasets:
   - gokaygokay/panorama_hdr_dataset
   - OpenGVLab/CRPE

3. **Select Models**: Choose which VLMs you want to use for the analysis from the available options.

4. **Choose Prompts**: Select from predefined prompts or enter a custom prompt to guide the analysis.

5. **Run Analysis**: Click the "Run Inference" button to start the image analysis process.

6. **View Results**: The generated descriptions, detected text, and translations will be displayed in the interface.

## Example Process

For instance, when the system processes an image and generates an initial textual output from a model such as "microsoft/Phi-3-vision-128k-instruct," the Judge VLM then engages the "Qwen/Qwen2-VL-7B-Instruct" model for further analysis. The Qwen/Qwen2 model evaluates this initial interpretation and offers a synthesized response that integrates both visual and textual insights, providing a final, nuanced interpretation to the user or downstream system.

## Models

The framework utilises multiple advanced Vision-and-Language Models (VLMs). For more details, refer to the respective model documentation:
- [Microsoft/Phi-3-Vision-128K-Instruct](https://huggingface.co/microsoft/Phi-3-Vision-128K-Instruct)
- [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)

## Setup

To set up this project, follow these steps in a terminal:

1. **Clone the Repository**

   - Clone the repository to your local machine:
   ```bash
   git clone https://github.com/your-repo/vlm-image-analysis.git
   cd vlm-image-analysis
   ```

2. **Build and run the Docker container**, using the commands from `build.bat` and `run-d.bat` shown below.

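The commands below reproduce `build.bat` and `run-d.bat` from this repository; `%cd%` is Windows `cmd` syntax, so substitute `$(pwd)` on Linux/macOS:

```bash
# Build the image (from build.bat)
docker build -t hf-docker .

# Start the container (from run-d.bat)
docker run --gpus all --rm -it --shm-size=8gb --memory="16g" --env="DISPLAY" \
  -p 8077:7842 -p 7860:7860 -p 8501:8501 \
  -v %cd%:/home/user/app -v %cd%:/home/user/sharedfolder -v %cd%/.cache:/home/user/.cache \
  hf-docker:latest bash
```
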
After completing these steps, the setup is complete and you can start using the project.

## Inference

Start the Docker container:
```bash
docker run --gpus all --rm -it --shm-size=8gb --memory="16g" --env="DISPLAY" \
  -p 8077:7842 -p 7860:7860 -p 8501:8501 \
  -v %cd%:/RAG -v %cd%:/root/sharedfolder -v %cd%/.cache:/root/.cache \
  lmdeploy-docker:latest bash
```

You should see something like:
```
==========
== CUDA ==
==========

CUDA Version 12.1.1

Container image Copyright (c) 2016-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

This container image and its contents are governed by the NVIDIA Deep Learning Container License.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license

A copy of this license is made available in this container at /NGC-DL-CONTAINER-LICENSE for your convenience.

root@c6d710b08cde:/RAG/web#
```

To run the application:

```bash
root@c6d710b08cde:/RAG/web# python vlms.py
```

Which should output:
```bash
Running on local URL: http://0.0.0.0:7860
To create a public link, set `share=True` in `launch()`.
```

On first use, the models are downloaded (slowly) into a cache folder that is mapped to the host via the docker `-v` option:
```
-v %cd%/.cache:/root/.cache
```

For instance:
```
root@c6d710b08cde:/RAG/web# ls -ls /root/.cache/huggingface/
0 drwxr-xr-x 1 root root 512 Sep 12 06:15 models--OpenGVLab--InternVL2-1B
0 drwxr-xr-x 1 root root 512 Sep 12 06:27 models--Qwen--Qwen2-VL-7B-Instruct
0 drwxr-xr-x 1 root root 512 Sep 16 11:14 models--Qwen--Qwen2-VL-7B-Instruct-GPTQ-Int8
0 drwxr-xr-x 1 root root 512 Sep 11 11:00 models--microsoft--Phi-3-vision-128k-instruct
0 drwxr-xr-x 1 root root 512 Sep 15 06:02 models--vikhyatk--moondream2
```

## The Gradio APP

The Gradio application provides a user-friendly interface for interacting with the VLM-Image-Analysis framework. It includes three main tabs: **VLM Model Selection**, **Device Settings**, and **Image Selection**. Each tab allows users to configure different aspects of the image analysis process.

### 1. VLM Model Selection

This tab allows users to select the Vision-and-Language Models (VLMs) they want to use, specify the directory of images to be analyzed, and choose prompts for generating textual descriptions. The interface includes:

- **Model Selection**: A checkbox group where users can select from available models such as `Qwen/Qwen2-VL-7B-Instruct`, `microsoft/Phi-3-vision-128k-instruct`, `vikhyatk/moondream2`, and `Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8`.
- **Image Directory Path**: A textbox to input the path to the directory containing images.
- **Prompt Selection**: A checkbox group where users can select from predefined prompts that guide the analysis.
- **Submit Button**: A button to run the inference based on the selected models, prompts, and image directory.

The results of the inference are displayed in HTML format, showing the generated descriptions, detected text, and translations.

### 2. Device Settings

<h1 align="center">
  <img src="static/004.png" width="75%">
</h1>

In this tab, users can configure settings related to the computational resources used for running the models; these choices map directly onto `from_pretrained` arguments, as sketched after the list:

- **Device Map**: A radio button selection to choose the device map (`auto`, `cpu`, `cuda`). This determines whether the inference will use the GPU or CPU.
- **Torch Dtype**: A radio button to select the data type for PyTorch (`torch.float16` or `torch.float32`). This affects the precision and performance of the model.
- **Trust Remote Code**: A checkbox to indicate whether to trust remote code when loading models. This is relevant for models that require remote execution code.

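For reference, here is a condensed sketch (not a separate module in this repository) of how `run_inference` in `app.py` translates these UI choices into `from_pretrained` arguments for the Qwen2-VL family; the helper name `load_qwen2_vl` is illustrative:

```python
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

def load_qwen2_vl(model_name: str, device_map: str, torch_dtype: str, trust_remote_code: bool):
    """Translate the Gradio device-settings values into from_pretrained arguments."""
    dtype = torch.float16 if torch_dtype == "torch.float16" else torch.float32
    # 'auto' resolves to cuda when available, otherwise cpu; explicit choices are honoured
    device = ("cuda" if torch.cuda.is_available() else "cpu") if device_map == "auto" else device_map
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_name, torch_dtype=dtype, device_map=device
    ).eval()
    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=trust_remote_code)
    return model, processor

# Example: model, processor = load_qwen2_vl("Qwen/Qwen2-VL-7B-Instruct", "auto", "torch.float16", True)
```
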
### 3. Image Selection

This tab allows users to process images from the specified HF dataset:
```
# List of available Hugging Face datasets
dataset_options = [
    "gokaygokay/panorama_hdr_dataset",
    "OpenGVLab/CRPE"
]
```

- **HF Image Dataset**: A dropdown to select the HF dataset containing images.
- **Load a ZIP of Images Button**: A button to load images from the specified ZIP file.

## Docker Setup for LLMs

### Base System

The Docker image is based on the `pytorch/pytorch:2.4.0-cuda12.1-cudnn9-devel` image (see the Dockerfile), which provides a PyTorch environment with CUDA 12.1 and cuDNN 9 for GPU acceleration.

### LLM-related Packages

- **MLC LLM**: Installed from a nightly build URL, including `mlc-llm-nightly-cu122` for LLM services.
- **AutoAWQ**: Cloned from GitHub and installed, providing automatic weight quantization (AWQ) support.
- **FlashAttention**: The package `flash-attn==2.6.1` is installed to optimize attention mechanisms in transformer models.
- **Transformers**: The `transformers` package (installed from the Hugging Face GitHub main branch) is used for working with pre-trained transformer models, along with `accelerate`, `diffusers`, `datasets`, and `timm` for various model and dataset utilities.
- **Other Libraries**: `chromadb`, `db-sqlite3`, `auto-gptq`, `exllama`, `sqlalchemy`, `optimum`, and `autoawq` for additional functionalities related to data handling, model optimization, and automatic weight quantization.

### Jupyter Configuration

- **Jupyter**: Jupyter Notebook and related packages are set up, including configuration through `jupyter_notebook_config.py` and a script `run_jupyter.sh`.

### Exposed Ports

The container exposes the following ports for accessing services:
- `8097`, `7842`, `8501`, `8000`, `6666`, `7860`

## Prompts

The framework uses a variety of prompts to guide the Vision-and-Language Modeling (VLM) systems in generating descriptive outputs from images. Each prompt is designed to elicit detailed and specific information from the models.

### Example of Available Prompts

1. **Comprehensive Image Analysis**
   - **Prompt**:
     ```
     Thoroughly analyse the provided image and generate a comprehensive description of its content. This includes identifying and describing all objects, people, and activities visible in the image. Additionally, detect and extract any text present within the image using Optical Character Recognition (OCR), regardless of the language. Present the extracted text in a structured table format, including columns for the original text, its translation into English, and the language of the text.
     ```
   - **Rationale**: This prompt is designed to provide an exhaustive analysis of the image, encompassing both visual and textual information. It aims to capture every detail, making it suitable for applications where a complete understanding of the image is necessary, such as in detailed reporting or comprehensive data annotation.

## Citation

Please consider citing this work if you find it useful for your research:

```
@misc{judge-vlm,
  author = {Shlomo Kashani},
  title  = {VLM-Image-Analysis: A Vision-and-Language Modeling Framework},
  year   = {2024},
  url    = {https://huggingface.co/spaces/BoltzmannEntropy/vlms/}
}
```
app.py ADDED
@@ -0,0 +1,361 @@
import os
import base64
import io
import sqlite3
import torch
import gradio as gr
import pandas as pd
from PIL import Image
import requests
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import hf_hub_download
from datasets import load_dataset
import traceback
from tqdm import tqdm
import zipfile

# Define constants for vikhyatk/moondream2 model
MOON_DREAM_MODEL_ID = "vikhyatk/moondream2"
MOON_DREAM_REVISION = "2024-08-26"

# Define constants for the Qwen2-VL models
QWEN2_VL_MODELS = [
    'Qwen/Qwen2-VL-7B-Instruct',
    'Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4',
    'OpenGVLab/InternVL2-1B',
    'Qwen/Qwen2-VL-72B',
]

# List of models to use (combining unique entries from available models and QWEN2_VL_MODELS)
available_models = [
    *QWEN2_VL_MODELS,  # Expands the QWEN2_VL_MODELS list into available_models
    'microsoft/Phi-3-vision-128k-instruct',
    'vikhyatk/moondream2'
]

# List of available Hugging Face datasets
dataset_options = [
    "gokaygokay/panorama_hdr_dataset",
    "OpenGVLab/CRPE"
]

# List of text prompts to use
text_prompts = [
    "Provide a detailed description of the image contents, including all visible objects, people, activities, and extract any text present within the image using Optical Character Recognition (OCR). Organize the extracted text in a structured table format with columns for original text, its translation into English, and the language it is written in.",
    "Offer a thorough description of all elements within the image, from objects to individuals and their activities. Ensure any legible text seen in the image is extracted using Optical Character Recognition (OCR). Provide an accurate narrative that encapsulates the full content of the image.",
    "Create a four-sentence caption for the image. Start by specifying the style and type, such as painting, photograph, or digital art. In the next sentences, detail the contents and the composition clearly and concisely. Use language suited for prompting a text-to-image model, separating descriptive terms with commas instead of 'or'. Keep the description direct, avoiding interpretive phrases or abstract expressions",
]

# SQLite setup
# def init_db():
#     conn = sqlite3.connect('image_outputs.db')
#     cursor = conn.cursor()
#     cursor.execute('''
#         CREATE TABLE IF NOT EXISTS image_outputs (
#             id INTEGER PRIMARY KEY AUTOINCREMENT,
#             image BLOB,
#             prompt TEXT,
#             output TEXT,
#             model_name TEXT
#         )
#     ''')
#     conn.commit()
#     conn.close()

def image_to_binary(image_path):
    with open(image_path, 'rb') as file:
        return file.read()

# def store_in_db(image_path, prompt, output, model_name):
#     conn = sqlite3.connect('image_outputs.db')
#     cursor = conn.cursor()
#     image_blob = image_to_binary(image_path)
#     cursor.execute('''
#         INSERT INTO image_outputs (image, prompt, output, model_name)
#         VALUES (?, ?, ?, ?)
#     ''', (image_blob, prompt, output, model_name))
#     conn.commit()
#     conn.close()

# Function to encode an image to base64 for HTML display
def encode_image(image):
    img_buffer = io.BytesIO()
    image.save(img_buffer, format="PNG")
    img_str = base64.b64encode(img_buffer.getvalue()).decode("utf-8")
    return f'<img src="data:image/png;base64,{img_str}" style="max-width:500px;"/>'

# Function to load and display images from a Hugging Face dataset
def load_dataset_images(dataset_name, num_images):
    try:
        dataset = load_dataset(dataset_name, split='train')
        images = []
        # Index row by row; slicing a Dataset returns a dict of columns, not rows
        for i in range(min(num_images, len(dataset))):
            item = dataset[i]
            if 'image' in item:
                img = item['image']
                print(type(img))
                encoded_img = encode_image(img)
                metadata = f"Width: {img.width}, Height: {img.height}"
                if 'hdr' in item:
                    metadata += f", HDR: {item['hdr']}"
                images.append(f"<div style='display: inline-block; margin: 10px; text-align: center;'><h3>Image {i+1}</h3>{encoded_img}<p>{metadata}</p></div>")
        if not images:
            return "No images could be loaded from this dataset. Please check the dataset structure."
        return "".join(images)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        traceback.print_exc()
        return f"Error loading dataset: {e}"

# Function to generate output
def generate_output(model, processor, prompt, image, model_name, device):
    try:
        image_bytes = io.BytesIO()
        image.save(image_bytes, format="PNG")
        image_bytes = image_bytes.getvalue()

        if model_name in QWEN2_VL_MODELS:
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image_bytes},
                        {"type": "text", "text": prompt},
                    ]
                }
            ]
            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = processor(
                text=[text],
                images=[Image.open(io.BytesIO(image_bytes))],
                padding=True,
                return_tensors="pt",
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            generated_ids = model.generate(**inputs, max_new_tokens=1024)
            generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)]
            response_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
            return response_text

        elif model_name == 'microsoft/Phi-3-vision-128k-instruct':
            messages = [{"role": "user", "content": f"<|image_1|>\n{prompt}"}]
            prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = processor(prompt, [image], return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}
            generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, max_new_tokens=1024)
            generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
            response_text = processor.batch_decode(generate_ids, skip_special_tokens=True)[0]
            return response_text

        elif model_name == 'vikhyatk/moondream2':
            tokenizer = AutoTokenizer.from_pretrained(MOON_DREAM_MODEL_ID, revision=MOON_DREAM_REVISION)
            enc_image = model.encode_image(image)
            response_text = model.answer_question(enc_image, prompt, tokenizer)
            return response_text
    except Exception as e:
        return f"Error during generation with model {model_name}: {e}"

# Function to list and encode images from a directory
def list_images(directory_path):
    images = []
    for filename in os.listdir(directory_path):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(directory_path, filename)
            # encode_image expects a PIL image, so open the file first
            encoded_img = encode_image(Image.open(image_path))
            images.append({
                "filename": filename,
                "image": encoded_img
            })
    return images

# Function to extract images from a ZIP file
def extract_images_from_zip(zip_file):
    images = []
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            if file_info.filename.lower().endswith(('.png', '.jpg', '.jpeg')):
                with zip_ref.open(file_info) as file:
                    try:
                        img = Image.open(file)
                        img = img.convert("RGB")  # Ensure the image is in RGB mode
                        encoded_img = img
                        images.append({
                            "filename": file_info.filename,
                            "image": encoded_img
                        })
                    except Exception as e:
                        print(f"Error opening image {file_info.filename}: {e}")
    return images

# Gradio interface function for running inference
def run_inference(model_names, dataset_input, num_images_input, prompts, device_map, torch_dtype, trust_remote_code, use_flash_attn, use_zip, zip_file):
    data = []

    torch_dtype_value = torch.float16 if torch_dtype == "torch.float16" else torch.float32
    # 'auto' resolves to cuda when available, otherwise cpu; explicit cpu/cuda choices are honoured
    device_map_value = ("cuda" if torch.cuda.is_available() else "cpu") if device_map == "auto" else device_map

    model_processors = {}
    for model_name in model_names:
        try:
            if model_name in QWEN2_VL_MODELS:
                model = Qwen2VLForConditionalGeneration.from_pretrained(
                    model_name,
                    torch_dtype=torch_dtype_value,
                    device_map=device_map_value
                ).eval()
                processor = AutoProcessor.from_pretrained(model_name)
            elif model_name == 'microsoft/Phi-3-vision-128k-instruct':
                model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    device_map=device_map_value,
                    torch_dtype=torch_dtype_value,
                    trust_remote_code=trust_remote_code,
                    use_flash_attn=use_flash_attn
                ).eval()
                processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=trust_remote_code)
            elif model_name == 'vikhyatk/moondream2':
                model = AutoModelForCausalLM.from_pretrained(
                    MOON_DREAM_MODEL_ID,
                    trust_remote_code=True,
                    revision=MOON_DREAM_REVISION
                ).eval()
                processor = None  # No processor needed for this model

            model_processors[model_name] = (model, processor)

        except Exception as e:
            print(f"Error loading model {model_name}: {e}")

    try:
        # Load images from the ZIP file if use_zip is True
        if use_zip:
            images = extract_images_from_zip(zip_file)
            print("Number of images in zip:", len(images))
            for img in tqdm(images):
                try:
                    img_data = img['image']
                    if not isinstance(img_data, str):
                        # Convert the Image object to a base64-encoded string
                        img_buffer = io.BytesIO()
                        img['image'].save(img_buffer, format="PNG")
                        img_data = base64.b64encode(img_buffer.getvalue()).decode("utf-8")

                    img_data = f'<img src="data:image/png;base64,{img_data}" style="max-width:500px;"/>'

                    row_data = {"Image": img_data}
                    for model_name in model_names:
                        if model_name in model_processors:
                            model, processor = model_processors[model_name]
                            for prompt in prompts:
                                try:
                                    # Ensure image is defined
                                    image = img['image']
                                    response_text = generate_output(model, processor, prompt, image, model_name, device_map_value)
                                    row_data[f"{model_name}_Response_{prompt}"] = response_text
                                except Exception as e:
                                    row_data[f"{model_name}_Response_{prompt}"] = f"Error during generation with model {model_name}: {e}"
                                    traceback.print_exc()

                    data.append(row_data)
                except Exception as e:
                    print(f"Error processing image {img['filename']}: {e}")
                    traceback.print_exc()

        # Load the dataset if use_zip is False
        else:
            dataset = load_dataset(dataset_input, split='train')
            for i in tqdm(range(num_images_input)):
                if dataset_input == "OpenGVLab/CRPE":
                    image = dataset[i]['image']
                elif dataset_input == "gokaygokay/panorama_hdr_dataset":
                    image = dataset[i]['png_image']
                else:
                    image = dataset[i]['image']

                encoded_img = encode_image(image)
                row_data = {"Image": encoded_img}

                for model_name in model_names:
                    if model_name in model_processors:
                        model, processor = model_processors[model_name]
                        for prompt in prompts:
                            try:
                                response_text = generate_output(model, processor, prompt, image, model_name, device_map_value)
                                row_data[f"{model_name}_Response_{prompt}"] = response_text
                            except Exception as e:
                                row_data[f"{model_name}_Response_{prompt}"] = f"Error during generation with model {model_name}: {e}"

                data.append(row_data)

    except Exception as e:
        print(f"Error loading dataset: {e}")
        traceback.print_exc()

    return pd.DataFrame(data).to_html(escape=False)

# Gradio UI setup
def create_gradio_interface():
    css = """
    #output {
        height: 500px;
        overflow: auto;
    }
    """
    with gr.Blocks(css=css) as demo:
        # Title
        gr.Markdown("# VLM-Image-Analysis: A Vision-and-Language Modeling Framework.")
        gr.Markdown("""
        - Handles a batch of images from a ZIP file OR
        - Processes images from an HF dataset
        - Compatible with png, jpg, jpeg, and webp formats
        - Compatible with various AI models: Qwen2-VL-7B-Instruct, Qwen2-VL-2B-Instruct-GPTQ-Int4, InternVL2-1B, Qwen2-VL-72B, Phi-3-vision-128k-instruct and moondream2""")

        gr.Image(value="static/image.jpg", label="HF Image", width=300, height=300)  # Set custom width and height

        with gr.Tab("VLM model and Dataset selection"):
            gr.Markdown("### Dataset Selection: HF or from a ZIP file.")
            with gr.Accordion("Advanced Settings", open=True):
                with gr.Row():
                    # with gr.Column():
                    use_zip_input = gr.Checkbox(label="Use ZIP File", value=False)
                    dataset_input = gr.Dropdown(choices=dataset_options, label="Select Dataset", value=dataset_options[1], visible=True)
                    num_images_input = gr.Radio(choices=[1, 5, 20], label="Number of Images", value=5)
                    zip_file_input = gr.File(label="Upload ZIP File of Images", file_types=[".zip"])
            gr.Markdown("### VLM Model Selection")
            with gr.Row():
                with gr.Column():
                    models_input = gr.CheckboxGroup(choices=available_models, label="Select Models", value=available_models[4])
                    prompts_input = gr.CheckboxGroup(choices=text_prompts, label="Select Prompts", value=text_prompts[2])
                    submit_btn = gr.Button("Run Inference")

            with gr.Row():
                output_display = gr.HTML(label="Results")

        with gr.Tab("GPU Device Settings"):
            device_map_input = gr.Radio(choices=["auto", "cpu", "cuda"], label="Device Map", value="auto")
            torch_dtype_input = gr.Radio(choices=["torch.float16", "torch.float32"], label="Torch Dtype", value="torch.float16")
            trust_remote_code_input = gr.Checkbox(label="Trust Remote Code", value=True)
            use_flash_attn = gr.Checkbox(label="Use flash-attn 2 (Ampere GPUs or newer)", value=False)

        def run_inference_wrapper(model_names, dataset_input, num_images_input, prompts, device_map, torch_dtype, trust_remote_code, use_flash_attn, use_zip, zip_file):
            return run_inference(model_names, dataset_input, num_images_input, prompts, device_map, torch_dtype, trust_remote_code, use_flash_attn, use_zip, zip_file)

        def toggle_dataset_visibility(use_zip):
            return gr.update(visible=not use_zip)

        submit_btn.click(
            fn=run_inference_wrapper,
            inputs=[models_input, dataset_input, num_images_input, prompts_input, device_map_input, torch_dtype_input, trust_remote_code_input, use_flash_attn, use_zip_input, zip_file_input],
            outputs=output_display
        )

        use_zip_input.change(
            fn=toggle_dataset_visibility,
            inputs=use_zip_input,
            outputs=dataset_input
        )

    demo.launch(debug=True, share=False)

if __name__ == "__main__":
    create_gradio_interface()
build.bat ADDED
@@ -0,0 +1,2 @@
docker build -t hf-docker .
hf-db-test.py ADDED
@@ -0,0 +1,22 @@
from datasets import load_dataset
import matplotlib.pyplot as plt

# Specify dataset and number of images to load
dataset_name = "OpenGVLab/CRPE"  # Can switch to "gokaygokay/panorama_hdr_dataset"
num_images = 2  # Specify the number of images you want to load

# Load the dataset
dataset = load_dataset(dataset_name, split='train')

# Check whether the dataset is 'OpenGVLab/CRPE' or 'gokaygokay/panorama_hdr_dataset' and access the image column accordingly
for i in range(num_images):
    if dataset_name == "gokaygokay/panorama_hdr_dataset":
        img = dataset[i]["png_image"]  # Access image for this dataset
    elif dataset_name == "OpenGVLab/CRPE":
        img = dataset[i]["image"]  # Access image for 'OpenGVLab/CRPE'

    # Display the image
    plt.imshow(img)
    plt.title(f"Image {i+1}")
    plt.axis('off')  # Hide axes for better visualization
    plt.show()
run-d.bat ADDED
@@ -0,0 +1 @@
docker run --gpus all --rm -it --shm-size=8gb --memory="16g" --env="DISPLAY" -p 8077:7842 -p 7860:7860 -p 8501:8501 -v %cd%:/home/user/app -v %cd%:/home/user/sharedfolder -v %cd%/.cache:/home/user/.cache hf-docker:latest bash
static/002.png ADDED
static/003.png ADDED
static/004.png ADDED
static/image.jpg ADDED
test_images/001.png ADDED
test_images/002.png ADDED
test_images/003.png ADDED
test_images/004.png ADDED