Spaces: Running
BoltzmannEntropy committed
Commit fadb9e6 · 0 Parent(s)
First commit
Browse files
- Dockerfile +112 -0
- README.md +297 -0
- app.py +361 -0
- build.bat +2 -0
- hf-db-test.py +22 -0
- run-d.bat +1 -0
- static/002.png +0 -0
- static/003.png +0 -0
- static/004.png +0 -0
- static/image.jpg +0 -0
- test_images/001.png +0 -0
- test_images/002.png +0 -0
- test_images/003.png +0 -0
- test_images/004.png +0 -0
Dockerfile
ADDED
@@ -0,0 +1,112 @@
# Dockerfile customized for deployment on the HuggingFace Spaces platform.
# -- The Dockerfile has been tailored specifically for use on HuggingFace Spaces:
# -- it runs the app as a non-root `user`, serves Gradio on port 7860, and sets the Spaces environment variables.

# FROM pytorch/pytorch:2.2.1-cuda12.1-cudnn8-devel
FROM pytorch/pytorch:2.4.0-cuda12.1-cudnn9-devel
# FOR HF

USER root

ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y \
    git \
    cmake \
    python3 \
    python3-pip \
    python3-venv \
    python3-dev \
    python3-numpy \
    gcc \
    build-essential \
    gfortran \
    wget \
    curl \
    pkg-config \
    software-properties-common \
    zip \
    && apt-get clean && rm -rf /tmp/* /var/tmp/*

RUN apt-get update && DEBIAN_FRONTEND=noninteractive \
    apt-get install -y python3.10 python3-pip

RUN apt-get install -y libopenblas-base libopenmpi-dev

ENV TZ=Asia/Dubai
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

RUN useradd -m -u 1000 user

RUN apt-get update && apt-get install -y sudo && \
    echo 'user ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

RUN mkdir $HOME/app
RUN mkdir $HOME/app/test_images

# WORKDIR $HOME/app

RUN chown -R user:user $HOME/app

USER user
WORKDIR $HOME/app

RUN python -m pip install qwen-vl-utils
RUN python -m pip install --pre -U -f https://mlc.ai/wheels mlc-llm-nightly-cu122 mlc-ai-nightly-cu122

RUN python3 -m pip install chromadb db-sqlite3 auto-gptq exllama sqlalchemy
WORKDIR $HOME/app
RUN git clone https://github.com/casper-hansen/AutoAWQ
WORKDIR $HOME/app/AutoAWQ/
RUN python3 -m pip install -e .
WORKDIR $HOME/app
# ENV FLASH_ATTENTION_FORCE_BUILD=TRUE
RUN python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
RUN python -m pip install accelerate diffusers datasets timm flash-attn==2.6.1 gradio

RUN python3 -m pip install --no-deps optimum
RUN python3 -m pip install --no-deps "autoawq>=0.1.8"

# This reinstall seems to be a must: Intel Extension for PyTorch 2.4 needs to work with PyTorch 2.4.*, but PyTorch 2.2.2 is found otherwise.
RUN python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
RUN python3 -m pip install -U accelerate
RUN python3 -m pip install -U git+https://github.com/huggingface/transformers

WORKDIR $HOME/app
COPY --chown=user:user app.py .
COPY --chown=user:user test_images /home/user/app/test_images
# /home/user/app/
# chown -R user:user /home/user/.cache/

ENV PYTHONUNBUFFERED=1 GRADIO_ALLOW_FLAGGING=never GRADIO_NUM_PORTS=1 GRADIO_SERVER_NAME=0.0.0.0 GRADIO_SERVER_PORT=7860 SYSTEM=spaces
RUN python3 -m pip install pennylane sympy pennylane-qiskit duckdb
WORKDIR $HOME/app

EXPOSE 8097 7842 8501 8000 6666 7860

CMD ["python", "app.py"]

# Runtime warnings and errors previously observed on Spaces:
# ERROR! Intel® Extension for PyTorch* needs to work with PyTorch 2.4.*, but PyTorch 2.2.2 is found. Please switch to the matching version and run again.
# `Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
# /home/user/.local/lib/python3.10/site-packages/transformers/modeling_utils.py:4749: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead
# /home/user/.local/lib/python3.10/site-packages/accelerate/utils/imports.py:336: UserWarning: Intel Extension for PyTorch 2.4 needs to work with PyTorch 2.4.*, but PyTorch 2.2.2 is found. Please switch to the matching version and run again.
# Error loading model Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4: Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU. You can deactivate the exllama backend by setting `disable_exllama=True` in the quantization config object.
# Error loading model Qwen/Qwen2-VL-7B-Instruct: (ReadTimeoutError("HTTPSConnectionPool(host='hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: b8269a88-9b6b-43e0-942d-1049f173dc00)')
# Error loading model Qwen/Qwen2-VL-7B-Instruct: CUDA out of memory. Tried to allocate 130.00 MiB. GPU 0 has a total capacity of 14.58 GiB of which 77.62 MiB is free.
# instruct: FlashAttention only supports Ampere GPUs or newer.
README.md
ADDED
@@ -0,0 +1,297 @@
---
title: Vlms
emoji: 🔥
colorFrom: gray
colorTo: green
sdk: docker
pinned: false
license: mit
---

# VLM-Image-Analysis: A Vision-and-Language Modeling Framework

Welcome to the Hugging Face Space for VLM-Image-Analysis. This Space showcases a framework that combines multiple Vision-Language Models (VLMs) with a Large Language Model (LLM) to provide comprehensive image analysis and captioning.

<h1 align="center">
<img src="static/image.jpg" width="50%">
<h6> (Adapted from wang2023allseeing: https://huggingface.co/datasets/OpenGVLab/CRPE?row=1) </h6>
</h1>

This repository contains the core code for a multi-model framework that enhances image interpretation through the combined power of several Vision-and-Language Modeling (VLM) systems. VLM-Image-Analysis delivers detailed, multi-faceted analyses of images by leveraging several state-of-the-art VLMs, pre-trained on a wide range of datasets to detect diverse visual cues and linguistic patterns.

It is available at the following Hugging Face Space: https://huggingface.co/spaces/BoltzmannEntropy/vlms/

<h1 align="center">
<img src="static/003.png" width="100%">
</h1>

1. Multiple VLMs analyze the input image independently, providing diverse perspectives.
2. A 'judge' LLM synthesizes these outputs into a comprehensive, coherent description.

## Abstract

This work introduces a technique for automatic image captioning that exploits the collaborative potential of Vision-Language Models (VLMs) and Large Language Models (LLMs). The method uses a two-tiered framework: first, several VLMs produce diverse descriptions of an image in response to distinct prompts; these captions are then integrated by a central 'judge' LLM into a single cohesive, comprehensive caption that captures the essence of the image. The goal of this synergistic approach is to improve the precision, richness, and contextual appropriateness of image descriptions by pooling the capabilities of different models. We validate the dual-model strategy across a range of VLMs—namely Qwen2-VL, Phi-3-vision, and Moondream2—and assess its performance on different datasets. Our empirical results indicate that this ensemble method yields image captions of substantially higher quality and depth than single-model approaches.

## Introduction

Computer vision has advanced rapidly in recent years, particularly in image captioning—the task of generating descriptive text for images. Vision-Language Models (VLMs) have become the standard tools for integrating visual information with natural-language understanding. Despite their capabilities, individual VLMs have their own strengths and limitations, which can lead to descriptions that are biased or incomplete. This leaves room for approaches that combine the strengths of multiple models to produce more balanced and comprehensive image interpretations.

<h1 align="center">
<img src="static/005.png" width="100%">
</h1>

To address this issue, we introduce a methodology that combines the strengths of multiple VLMs with a Large Language Model (LLM) acting as a judge. The approach is designed to produce more comprehensive, accurate, and contextually rich image captions through a two-stage process (a minimal sketch of the pipeline follows the list below):

1. **Multi-VLM Caption Generation**: In the first stage, we employ several distinct VLMs, including Qwen2-VL, Phi-3-vision, and Moondream2, to generate image descriptions. Each model is prompted with different instructions, encouraging diverse perspectives on the image content. This diversity is crucial, as it allows for a more comprehensive capture of the image's nuances and details.

2. **LLM-based Caption Synthesis**: The second stage involves using a 'judge' LLM to analyse and synthesise the outputs from the first stage. This model is tasked with unifying the various descriptions into a single, coherent caption that captures the essence of all initial responses. The judge LLM's role is not merely to aggregate information but to intelligently combine and refine the insights provided by the VLMs.
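
The sketch below illustrates this two-stage flow at a high level. It is a minimal outline, not the Space's actual implementation (the inference code lives in `app.py`): the `vlm_captioners` callables and the `judge_generate` function are hypothetical placeholders standing in for whatever VLM and judge-LLM inference calls are used.

```python
from typing import Callable, Dict, List

def build_judge_prompt(captions: Dict[str, str]) -> str:
    """Assemble a synthesis prompt for the judge LLM from the per-model captions."""
    lines = ["Several vision-language models described the same image:"]
    for source, caption in captions.items():
        lines.append(f"- {source}: {caption}")
    lines.append(
        "Write a single coherent, comprehensive caption that merges these "
        "descriptions, resolves contradictions, and avoids speculation."
    )
    return "\n".join(lines)

def two_stage_caption(
    image,                                 # e.g. a PIL.Image
    vlm_captioners: Dict[str, Callable],   # model name -> fn(image, prompt) -> caption
    prompts: List[str],                    # the per-VLM instructions
    judge_generate: Callable[[str], str],  # judge LLM: fn(text prompt) -> text
) -> str:
    # Stage 1: every selected VLM captions the image independently for each prompt.
    captions: Dict[str, str] = {}
    for model_name, captioner in vlm_captioners.items():
        for idx, prompt in enumerate(prompts):
            captions[f"{model_name} / prompt {idx + 1}"] = captioner(image, prompt)

    # Stage 2: a 'judge' LLM synthesizes the individual captions into one description.
    return judge_generate(build_judge_prompt(captions))
```

In the committed `app.py`, stage 1 corresponds to the loop over the selected models and prompts inside `run_inference`; the judge stage is described here at the framework level.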

Our methodology leverages state-of-the-art models and techniques, including:

- The Qwen2-VL family of models, which offer varying capabilities and model sizes, allowing for a range of perspectives and computational efficiencies.
- Microsoft's Phi-3-vision model, known for its efficiency and performance in visual understanding tasks.
- The Moondream2 model, which brings unique perspectives to image understanding, potentially capturing aspects overlooked by other models.

The implementation uses PyTorch and Hugging Face's Transformers, ensuring efficient processing and easy integration with existing deep learning workflows. It also handles both individual images and batch processing from datasets or ZIP files, making the approach versatile for research and practical applications.

By combining multiple VLMs and using an LLM as a judge, we aim to mitigate individual model biases, capture a broader range of image details, and produce captions that are more informative and contextually appropriate. This approach not only improves the quality of image captioning but also opens up new avenues for exploring the synergies between different AI models in multimodal tasks.

It is important to note, however, that while the method shows promise, it is not without limitations. Its effectiveness may vary depending on the specific combination of models used and the nature of the images being captioned. Additionally, the computational resources required for this ensemble approach are significantly higher than for single-model methods, which may limit its applicability in resource-constrained environments.

In the following sections, we detail the methodology, present the experimental setup, and discuss the results and implications of our findings. We believe this work contributes to the field of image captioning and demonstrates the potential of ensemble approaches in vision-language tasks.

## Table of Contents

- [VLM-Image-Analysis: A Vision-and-Language Modeling Framework](#vlm-image-analysis-a-vision-and-language-modeling-framework)
  - [Abstract](#abstract)
  - [Introduction](#introduction)
  - [Table of Contents](#table-of-contents)
  - [Base VLMs](#base-vlms)
  - [Judge VLM](#judge-vlm)
  - [How to Use](#how-to-use)
    - [Example Process](#example-process)
  - [Models](#models)
  - [Setup](#setup)
  - [Inference](#inference)
  - [The Gradio APP](#the-gradio-app)
    - [1. VLM Model Selection](#1-vlm-model-selection)
    - [2. Device Settings](#2-device-settings)
    - [3. Image Selection](#3-image-selection)
  - [Docker Setup for LLMs](#docker-setup-for-llms)
    - [Base System](#base-system)
    - [LLM-related Packages](#llm-related-packages)
    - [Jupyter Configuration](#jupyter-configuration)
    - [Exposed Ports](#exposed-ports)
  - [Prompts](#prompts)
    - [Example of Available Prompts](#example-of-available-prompts)
  - [Citation](#citation)

## Base VLMs

In the first phase of the framework, each specialized VLM analyses the input image on its own. Every model processes the image independently, producing multiple descriptive outputs from different viewpoints. The models are selected for their complementary strengths in visual and textual analysis, ensuring a comprehensive examination of the image.

For instance, a VLM might be prompted with the following directive:
```
"Provide an extensive description of all elements in this image, including objects, people, and activities."
```

This phase generates varied textual descriptions and captions, offering diverse perspectives on the visual content.

## Judge VLM

In the second phase, the outputs from the base VLMs are passed to a synthesizing "Judge" VLM. This model is prompted to merge the individual outputs into a coherent, unified conclusion that captures the overall context of the image.

The Judge VLM is designed to handle different kinds of input, including the textual descriptions produced by each of the base VLMs. Its role is to harmonise and integrate these multiple viewpoints, providing a more comprehensive synthesis of the image's content.

## How to Use

1. **Access the Space**: Click on the "Open in Spaces" badge at the top of this README or visit [https://huggingface.co/spaces/BoltzmannEntropy/vlms](https://huggingface.co/spaces/BoltzmannEntropy/vlms)

2. **Select a Dataset**: Choose from the available Hugging Face datasets:
   - gokaygokay/panorama_hdr_dataset
   - OpenGVLab/CRPE

3. **Select Models**: Choose which VLMs you want to use for the analysis from the available options.

4. **Choose Prompts**: Select from predefined prompts or enter a custom prompt to guide the analysis.

5. **Run Analysis**: Click the "Submit" button to start the image analysis process.

6. **View Results**: The generated descriptions, detected text, and translations will be displayed in the interface.

### Example Process

For instance, when the system processes an image and generates an initial textual output from a model such as "microsoft/Phi-3-vision-128k-instruct", the Judge VLM then engages the "Qwen/Qwen2-VL-7B-Instruct" model for further analysis. The Qwen2 model evaluates this initial interpretation and offers a synthesized response that integrates both visual and textual insights, providing a final, nuanced interpretation to the user or downstream system. A sketch of how such a judge call might look is shown below.
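
As an illustration, a judge call along these lines could be expressed with the Qwen2-VL chat template, mirroring how `app.py` drives `Qwen2VLForConditionalGeneration`. This is a hedged sketch rather than the committed code path: the `base_caption` value and the synthesis wording are hypothetical, and `model`/`processor` are assumed to be an already-loaded Qwen2-VL checkpoint and its `AutoProcessor`, as in `run_inference`.

```python
def judge_with_qwen(image, base_caption, model, processor, device="cuda"):
    """Ask Qwen2-VL to reconcile a prior caption with the image itself."""
    synthesis_prompt = (
        "Another vision-language model described this image as follows:\n"
        f"{base_caption}\n"
        "Verify this description against the image and produce one refined, "
        "comprehensive caption."
    )
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": synthesis_prompt},
        ],
    }]
    # Same generate/decode pattern as the Qwen2-VL branch of generate_output() in app.py.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    generated = model.generate(**inputs, max_new_tokens=1024)
    trimmed = [out[len(inp):] for inp, out in zip(inputs["input_ids"], generated)]
    return processor.batch_decode(trimmed, skip_special_tokens=True)[0]
```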

## Models

The framework utilises multiple advanced Vision-and-Language Models (VLMs). For more details, refer to the respective model documentation:
- [Microsoft/Phi-3-Vision-128K-Instruct](https://huggingface.co/microsoft/Phi-3-Vision-128K-Instruct)
- [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)

## Setup

To set up this project, follow these steps in a terminal:

1. **Clone the repository** to your local machine:

   ```bash
   git clone https://github.com/your-repo/vlm-image-analysis.git
   cd vlm-image-analysis
   ```

2. **Run the Docker container** (see the [Inference](#inference) section below for the full `docker run` command).

After completing these steps, the setup is complete and you can start using the project.

## Inference

Start the Docker container:
```bash
docker run --gpus all --rm -it --shm-size=8gb --memory="16g" --env="DISPLAY" -p 8077:7842 -p 7860:7860 -p 8501:8501 -v %cd%:/RAG -v %cd%:/root/sharedfolder -v %cd%/.cache:/root/.cache lmdeploy-docker:latest bash
```

You should see something like:
```
==========
== CUDA ==
==========

CUDA Version 12.1.1

Container image Copyright (c) 2016-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

This container image and its contents are governed by the NVIDIA Deep Learning Container License.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license

A copy of this license is made available in this container at /NGC-DL-CONTAINER-LICENSE for your convenience.

root@c6d710b08cde:/RAG/web#
```

To run the application:

```bash
root@c6d710b08cde:/RAG/web# python vlms.py
```

Which should output:
```bash
Running on local URL: http://0.0.0.0:7860
To create a public link, set `share=True` in `launch()`.
```

On first use, the models are downloaded (which can be slow) into a cache folder that is mapped externally via the Docker `-v` option; a snippet for pre-populating this cache is shown after the listing below:
```
-v %cd%/.cache:/root/.cache
```

For instance:
```
root@c6d710b08cde:/RAG/web# ls -ls /root/.cache/huggingface/
0 drwxr-xr-x 1 root root 512 Sep 12 06:15 models--OpenGVLab--InternVL2-1B
0 drwxr-xr-x 1 root root 512 Sep 12 06:27 models--Qwen--Qwen2-VL-7B-Instruct
0 drwxr-xr-x 1 root root 512 Sep 16 11:14 models--Qwen--Qwen2-VL-7B-Instruct-GPTQ-Int8
0 drwxr-xr-x 1 root root 512 Sep 11 11:00 models--microsoft--Phi-3-vision-128k-instruct
0 drwxr-xr-x 1 root root 512 Sep 15 06:02 models--vikhyatk--moondream2
```
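
Because this cache is volume-mapped, it can also be pre-populated before launching the app, so the first Gradio request does not block on multi-gigabyte downloads. A minimal sketch, assuming `huggingface_hub` is available (it is installed alongside `transformers`); the model list simply mirrors the checkpoints named above.

```python
from huggingface_hub import snapshot_download

# Pre-download the checkpoints into the cache that docker maps to %cd%/.cache.
models = [
    "Qwen/Qwen2-VL-7B-Instruct",
    "microsoft/Phi-3-vision-128k-instruct",
    "vikhyatk/moondream2",
]

for repo_id in models:
    # snapshot_download stores files under ~/.cache/huggingface/ by default,
    # i.e. the directory listed above when run inside the container.
    snapshot_download(repo_id=repo_id)
```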

## The Gradio APP

The Gradio application provides a user-friendly interface for interacting with the VLM-Image-Analysis framework. It includes three main tabs: **VLM Model Selection**, **Device Settings**, and **Image Selection**. Each tab allows users to configure a different aspect of the image analysis process.

### 1. VLM Model Selection

This tab allows users to select the Vision-and-Language Models (VLMs) they want to use, specify the directory of images to be analyzed, and choose prompts for generating textual descriptions. The interface includes:

- **Model Selection**: A checkbox group where users can select from available models such as `Qwen/Qwen2-VL-7B-Instruct`, `microsoft/Phi-3-vision-128k-instruct`, `vikhyatk/moondream2`, and `Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8`.
- **Image Directory Path**: A textbox to input the path to the directory containing images.
- **Prompt Selection**: A checkbox group where users can select from predefined prompts that guide the analysis.
- **Submit Button**: A button to run the inference based on the selected models, prompts, and image directory.

The results of the inference are displayed in HTML format, showing the generated descriptions, detected text, and translations.

### 2. Device Settings

<h1 align="center">
<img src="static/004.png" width="75%">
</h1>

In this tab, users can configure settings related to the computational resources used for running the models (a sketch of how these settings are passed to `from_pretrained` follows the list):

- **Device Map**: A radio button selection to choose the device map (`auto`, `cpu`, `cuda`). This determines whether the inference will use the GPU or CPU.
- **Torch Dtype**: A radio button to select the data type for PyTorch (`torch.float16` or `torch.float32`). This affects the precision and performance of the model.
- **Trust Remote Code**: A checkbox to indicate whether to trust remote code when loading models. This is relevant for models that ship custom modeling code.
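
For reference, these three settings are ultimately forwarded to Hugging Face `from_pretrained` calls, roughly as sketched below. This is a simplified excerpt of what `run_inference` in `app.py` does (Qwen2-VL models are loaded through `Qwen2VLForConditionalGeneration` instead), and the `resolve_device` helper name is ours, not the app's.

```python
import torch
from transformers import AutoModelForCausalLM

def resolve_device(device_map: str) -> str:
    """Map the 'auto' radio choice to a concrete device."""
    if device_map == "auto":
        return "cuda" if torch.cuda.is_available() else "cpu"
    return device_map

def load_model(model_name: str, device_map: str, torch_dtype: str, trust_remote_code: bool):
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=resolve_device(device_map),
        torch_dtype=torch.float16 if torch_dtype == "torch.float16" else torch.float32,
        trust_remote_code=trust_remote_code,  # required by Phi-3-vision and moondream2
    ).eval()
```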

### 3. Image Selection

This tab allows users to process images from the specified HF dataset:
```
# List of available Hugging Face datasets
dataset_options = [
    "gokaygokay/panorama_hdr_dataset",
    "OpenGVLab/CRPE"
]
```

- **HF Image Dataset**: A textbox to input the path of the HF dataset containing images.
- **Load a ZIP of Images Button**: A button to load and process images from the specified ZIP file.

## Docker Setup for LLMs

### Base System

The Docker image is based on the `pytorch/pytorch:2.4.0-cuda12.1-cudnn9-devel` image, which provides a PyTorch environment with CUDA 12.1 and cuDNN 9 for GPU acceleration.

### LLM-related Packages

- **MLC LLM**: Installed from a nightly build URL, including `mlc-llm-nightly-cu122` for LLM services.
- **AutoAWQ**: Cloned from GitHub and installed, providing automatic activation-aware weight quantization.
- **FlashAttention**: The package `flash-attn==2.6.1` is installed to optimize attention mechanisms in transformer models.
- **Transformers**: The `transformers` library (installed from the Hugging Face GitHub repository) is used for working with pre-trained transformer models, along with `accelerate`, `diffusers`, `datasets`, and `timm` for various model and dataset utilities.
- **Other Libraries**: `chromadb`, `db-sqlite3`, `auto-gptq`, `exllama`, `sqlalchemy`, `optimum`, and `autoawq` for additional functionality related to data handling, model optimization, and weight quantization.

### Jupyter Configuration

- **Jupyter**: Jupyter Notebook and related packages are set up, including configuration through `jupyter_notebook_config.py` and a script `run_jupyter.sh`.

### Exposed Ports

The container exposes the following ports for accessing services:
- `8097`, `7842`, `8501`, `8000`, `6666`, `7860`

## Prompts

The framework uses a variety of prompts to guide the Vision-and-Language Modeling (VLM) systems in generating descriptive outputs from images. Each prompt is designed to elicit detailed and specific information from the models.

### Example of Available Prompts

1. **Comprehensive Image Analysis**
   - **Prompt**:
     ```
     Thoroughly analyse the provided image and generate a comprehensive description of its content. This includes identifying and describing all objects, people, and activities visible in the image. Additionally, detect and extract any text present within the image using Optical Character Recognition (OCR), regardless of the language. Present the extracted text in a structured table format, including columns for the original text, its translation into English, and the language of the text.
     ```
   - **Rationale**: This prompt is designed to provide an exhaustive analysis of the image, encompassing both visual and textual information. It aims to capture every detail, making it suitable for applications where a complete understanding of the image is necessary, such as detailed reporting or comprehensive data annotation.

## Citation

Please consider citing this work if you find it useful for your research:

```
@book{judge-vlm,
  author = {Shlomo Kashani},
  title  = {VLM-Image-Analysis: A Vision-and-Language Modeling Framework},
  year   = {2024},
  github = {https://huggingface.co/spaces/BoltzmannEntropy/vlms/}
}
```
app.py
ADDED
@@ -0,0 +1,361 @@
import os
import base64
import io
import sqlite3
import torch
import gradio as gr
import pandas as pd
from PIL import Image
import requests
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import hf_hub_download
from datasets import load_dataset
import traceback
from tqdm import tqdm
import zipfile

# Define constants for the vikhyatk/moondream2 model
MOON_DREAM_MODEL_ID = "vikhyatk/moondream2"
MOON_DREAM_REVISION = "2024-08-26"

# Define constants for the Qwen2-VL models
QWEN2_VL_MODELS = [
    'Qwen/Qwen2-VL-7B-Instruct',
    'Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4',
    'OpenGVLab/InternVL2-1B',
    'Qwen/Qwen2-VL-72B',
]

# List of models to use (combining unique entries from available models and QWEN2_VL_MODELS)
available_models = [
    *QWEN2_VL_MODELS,  # Expands the QWEN2_VL_MODELS list into available_models
    'microsoft/Phi-3-vision-128k-instruct',
    'vikhyatk/moondream2'
]

# List of available Hugging Face datasets
dataset_options = [
    "gokaygokay/panorama_hdr_dataset",
    "OpenGVLab/CRPE"
]

# List of text prompts to use
text_prompts = [
    "Provide a detailed description of the image contents, including all visible objects, people, activities, and extract any text present within the image using Optical Character Recognition (OCR). Organize the extracted text in a structured table format with columns for original text, its translation into English, and the language it is written in.",
    "Offer a thorough description of all elements within the image, from objects to individuals and their activities. Ensure any legible text seen in the image is extracted using Optical Character Recognition (OCR). Provide an accurate narrative that encapsulates the full content of the image.",
    "Create a four-sentence caption for the image. Start by specifying the style and type, such as painting, photograph, or digital art. In the next sentences, detail the contents and the composition clearly and concisely. Use language suited for prompting a text-to-image model, separating descriptive terms with commas instead of 'or'. Keep the description direct, avoiding interpretive phrases or abstract expressions",
]

# SQLite setup (currently unused)
# def init_db():
#     conn = sqlite3.connect('image_outputs.db')
#     cursor = conn.cursor()
#     cursor.execute('''
#         CREATE TABLE IF NOT EXISTS image_outputs (
#             id INTEGER PRIMARY KEY AUTOINCREMENT,
#             image BLOB,
#             prompt TEXT,
#             output TEXT,
#             model_name TEXT
#         )
#     ''')
#     conn.commit()
#     conn.close()

def image_to_binary(image_path):
    with open(image_path, 'rb') as file:
        return file.read()

# def store_in_db(image_path, prompt, output, model_name):
#     conn = sqlite3.connect('image_outputs.db')
#     cursor = conn.cursor()
#     image_blob = image_to_binary(image_path)
#     cursor.execute('''
#         INSERT INTO image_outputs (image, prompt, output, model_name)
#         VALUES (?, ?, ?, ?)
#     ''', (image_blob, prompt, output, model_name))
#     conn.commit()
#     conn.close()

# Function to encode a PIL image to base64 for HTML display
def encode_image(image):
    img_buffer = io.BytesIO()
    image.save(img_buffer, format="PNG")
    img_str = base64.b64encode(img_buffer.getvalue()).decode("utf-8")
    return f'<img src="data:image/png;base64,{img_str}" style="max-width:500px;"/>'

# Function to load and display images from a Hugging Face dataset
def load_dataset_images(dataset_name, num_images):
    try:
        dataset = load_dataset(dataset_name, split='train')
        images = []
        for i, item in enumerate(dataset.select(range(min(num_images, len(dataset))))):
            if 'image' in item:
                img = item['image']
                print(type(img))
                encoded_img = encode_image(img)
                metadata = f"Width: {img.width}, Height: {img.height}"
                if 'hdr' in item:
                    metadata += f", HDR: {item['hdr']}"
                images.append(f"<div style='display: inline-block; margin: 10px; text-align: center;'><h3>Image {i+1}</h3>{encoded_img}<p>{metadata}</p></div>")
        if not images:
            return "No images could be loaded from this dataset. Please check the dataset structure."
        return "".join(images)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        traceback.print_exc()

# Function to generate output for a single (model, prompt, image) combination
def generate_output(model, processor, prompt, image, model_name, device):
    try:
        image_bytes = io.BytesIO()
        image.save(image_bytes, format="PNG")
        image_bytes = image_bytes.getvalue()

        if model_name in QWEN2_VL_MODELS:
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image_bytes},
                        {"type": "text", "text": prompt},
                    ]
                }
            ]
            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = processor(
                text=[text],
                images=[Image.open(io.BytesIO(image_bytes))],
                padding=True,
                return_tensors="pt",
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            generated_ids = model.generate(**inputs, max_new_tokens=1024)
            generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)]
            response_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
            return response_text

        elif model_name == 'microsoft/Phi-3-vision-128k-instruct':
            messages = [{"role": "user", "content": f"<|image_1|>\n{prompt}"}]
            prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = processor(prompt, [image], return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}
            generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, max_new_tokens=1024)
            generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
            response_text = processor.batch_decode(generate_ids, skip_special_tokens=True)[0]
            return response_text

        elif model_name == 'vikhyatk/moondream2':
            tokenizer = AutoTokenizer.from_pretrained(MOON_DREAM_MODEL_ID, revision=MOON_DREAM_REVISION)
            enc_image = model.encode_image(image)
            response_text = model.answer_question(enc_image, prompt, tokenizer)
            return response_text

        return f"Unsupported model: {model_name}"
    except Exception as e:
        return f"Error during generation with model {model_name}: {e}"

# Function to list and encode images from a directory
def list_images(directory_path):
    images = []
    for filename in os.listdir(directory_path):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(directory_path, filename)
            encoded_img = encode_image(Image.open(image_path))
            images.append({
                "filename": filename,
                "image": encoded_img
            })
    return images

# Function to extract images from a ZIP file
def extract_images_from_zip(zip_file):
    images = []
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            if file_info.filename.lower().endswith(('.png', '.jpg', '.jpeg')):
                with zip_ref.open(file_info) as file:
                    try:
                        img = Image.open(file)
                        img = img.convert("RGB")  # Ensure the image is in RGB mode
                        images.append({
                            "filename": file_info.filename,
                            "image": img
                        })
                    except Exception as e:
                        print(f"Error opening image {file_info.filename}: {e}")
    return images

# Gradio interface function for running inference
def run_inference(model_names, dataset_input, num_images_input, prompts, device_map, torch_dtype, trust_remote_code, use_flash_attn, use_zip, zip_file):
    data = []

    torch_dtype_value = torch.float16 if torch_dtype == "torch.float16" else torch.float32
    # Resolve "auto" to the best available device; otherwise honour the user's explicit choice.
    device_map_value = ("cuda" if torch.cuda.is_available() else "cpu") if device_map == "auto" else device_map

    model_processors = {}
    for model_name in model_names:
        try:
            if model_name in QWEN2_VL_MODELS:
                model = Qwen2VLForConditionalGeneration.from_pretrained(
                    model_name,
                    torch_dtype=torch_dtype_value,
                    device_map=device_map_value
                ).eval()
                processor = AutoProcessor.from_pretrained(model_name)
            elif model_name == 'microsoft/Phi-3-vision-128k-instruct':
                model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    device_map=device_map_value,
                    torch_dtype=torch_dtype_value,
                    trust_remote_code=trust_remote_code,
                    use_flash_attn=use_flash_attn
                ).eval()
                processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=trust_remote_code)
            elif model_name == 'vikhyatk/moondream2':
                model = AutoModelForCausalLM.from_pretrained(
                    MOON_DREAM_MODEL_ID,
                    trust_remote_code=True,
                    revision=MOON_DREAM_REVISION
                ).eval()
                processor = None  # No processor needed for this model

            model_processors[model_name] = (model, processor)

        except Exception as e:
            print(f"Error loading model {model_name}: {e}")

    try:
        # Load images from the ZIP file if use_zip is True
        if use_zip:
            images = extract_images_from_zip(zip_file)
            print("Number of images in zip:", len(images))
            for img in tqdm(images):
                try:
                    img_data = img['image']
                    if not isinstance(img_data, str):
                        # Convert the Image object to a base64-encoded string
                        img_buffer = io.BytesIO()
                        img['image'].save(img_buffer, format="PNG")
                        img_data = base64.b64encode(img_buffer.getvalue()).decode("utf-8")

                    img_data = f'<img src="data:image/png;base64,{img_data}" style="max-width:500px;"/>'

                    row_data = {"Image": img_data}
                    for model_name in model_names:
                        if model_name in model_processors:
                            model, processor = model_processors[model_name]
                            for prompt in prompts:
                                try:
                                    image = img['image']
                                    response_text = generate_output(model, processor, prompt, image, model_name, device_map_value)
                                    row_data[f"{model_name}_Response_{prompt}"] = response_text
                                except Exception as e:
                                    row_data[f"{model_name}_Response_{prompt}"] = f"Error during generation with model {model_name}: {e}"
                                    traceback.print_exc()

                    data.append(row_data)
                except Exception as e:
                    print(f"Error processing image {img['filename']}: {e}")
                    traceback.print_exc()

        # Otherwise load images from the selected Hugging Face dataset
        else:
            dataset = load_dataset(dataset_input, split='train')
            for i in tqdm(range(num_images_input)):
                if dataset_input == "OpenGVLab/CRPE":
                    image = dataset[i]['image']
                elif dataset_input == "gokaygokay/panorama_hdr_dataset":
                    image = dataset[i]['png_image']
                else:
                    image = dataset[i]['image']

                encoded_img = encode_image(image)
                row_data = {"Image": encoded_img}

                for model_name in model_names:
                    if model_name in model_processors:
                        model, processor = model_processors[model_name]
                        for prompt in prompts:
                            try:
                                response_text = generate_output(model, processor, prompt, image, model_name, device_map_value)
                                row_data[f"{model_name}_Response_{prompt}"] = response_text
                            except Exception as e:
                                row_data[f"{model_name}_Response_{prompt}"] = f"Error during generation with model {model_name}: {e}"

                data.append(row_data)

    except Exception as e:
        print(f"Error loading dataset: {e}")
        traceback.print_exc()

    return pd.DataFrame(data).to_html(escape=False)

# Gradio UI setup
def create_gradio_interface():
    css = """
    #output {
        height: 500px;
        overflow: auto;
    }
    """
    with gr.Blocks(css=css) as demo:
        # Title
        gr.Markdown("# VLM-Image-Analysis: A Vision-and-Language Modeling Framework.")
        gr.Markdown("""
- Handle a batch of images from a ZIP file OR
- Process images from a Hugging Face dataset
- Compatible with png, jpg, jpeg, and webp formats
- Compatible with several VLMs: Qwen2-VL-7B-Instruct, Qwen2-VL-2B-Instruct-GPTQ-Int4, InternVL2-1B, Qwen2-VL-72B, Phi-3-vision-128k-instruct and moondream2""")

        gr.Image(value="static/image.jpg", label="HF Image", width=300, height=300)  # Set custom width and height

        with gr.Tab("VLM model and Dataset selection"):
            gr.Markdown("### Dataset Selection: HF or from a ZIP file.")
            with gr.Accordion("Advanced Settings", open=True):
                with gr.Row():
                    use_zip_input = gr.Checkbox(label="Use ZIP File", value=False)
                    dataset_input = gr.Dropdown(choices=dataset_options, label="Select Dataset", value=dataset_options[1], visible=True)
                    num_images_input = gr.Radio(choices=[1, 5, 20], label="Number of Images", value=5)
                    zip_file_input = gr.File(label="Upload ZIP File of Images", file_types=[".zip"])
            gr.Markdown("### VLM Model Selection")
            with gr.Row():
                with gr.Column():
                    models_input = gr.CheckboxGroup(choices=available_models, label="Select Models", value=available_models[4])
                    prompts_input = gr.CheckboxGroup(choices=text_prompts, label="Select Prompts", value=text_prompts[2])
                    submit_btn = gr.Button("Run Inference")

            with gr.Row():
                output_display = gr.HTML(label="Results")

        with gr.Tab("GPU Device Settings"):
            device_map_input = gr.Radio(choices=["auto", "cpu", "cuda"], label="Device Map", value="auto")
            torch_dtype_input = gr.Radio(choices=["torch.float16", "torch.float32"], label="Torch Dtype", value="torch.float16")
            trust_remote_code_input = gr.Checkbox(label="Trust Remote Code", value=True)
            use_flash_attn = gr.Checkbox(label="Use flash-attn 2 (Ampere GPUs or newer)", value=False)

        def run_inference_wrapper(model_names, dataset_input, num_images_input, prompts, device_map, torch_dtype, trust_remote_code, use_flash_attn, use_zip, zip_file):
            return run_inference(model_names, dataset_input, num_images_input, prompts, device_map, torch_dtype, trust_remote_code, use_flash_attn, use_zip, zip_file)

        def toggle_dataset_visibility(use_zip):
            return gr.update(visible=not use_zip)

        submit_btn.click(
            fn=run_inference_wrapper,
            inputs=[models_input, dataset_input, num_images_input, prompts_input, device_map_input, torch_dtype_input, trust_remote_code_input, use_flash_attn, use_zip_input, zip_file_input],
            outputs=output_display
        )

        use_zip_input.change(
            fn=toggle_dataset_visibility,
            inputs=use_zip_input,
            outputs=dataset_input
        )

    demo.launch(debug=True, share=False)

if __name__ == "__main__":
    create_gradio_interface()
build.bat
ADDED
@@ -0,0 +1,2 @@
docker build -t hf-docker .
hf-db-test.py
ADDED
@@ -0,0 +1,22 @@
from datasets import load_dataset
import matplotlib.pyplot as plt

# Specify dataset and number of images to load
dataset_name = "OpenGVLab/CRPE"  # Can switch to "gokaygokay/panorama_hdr_dataset"
num_images = 2  # Number of images to load

# Load the dataset
dataset = load_dataset(dataset_name, split='train')

# The image column name differs between the two supported datasets
for i in range(num_images):
    if dataset_name == "gokaygokay/panorama_hdr_dataset":
        img = dataset[i]["png_image"]  # Image column for this dataset
    else:
        img = dataset[i]["image"]  # Image column for 'OpenGVLab/CRPE'

    # Display the image
    plt.imshow(img)
    plt.title(f"Image {i+1}")
    plt.axis('off')  # Hide axes for better visualization
    plt.show()
run-d.bat
ADDED
@@ -0,0 +1 @@
docker run --gpus all --rm -it --shm-size=8gb --memory="16g" --env="DISPLAY" -p 8077:7842 -p 7860:7860 -p 8501:8501 -v %cd%:/home/user/app -v %cd%:/home/user/sharedfolder -v %cd%/.cache:/home/user/.cache hf-docker:latest bash
static/002.png
ADDED
static/003.png
ADDED
static/004.png
ADDED
static/image.jpg
ADDED
test_images/001.png
ADDED
test_images/002.png
ADDED
test_images/003.png
ADDED
test_images/004.png
ADDED