File size: 5,541 Bytes
7e738ef
78a7c54
f9c5a74
 
 
 
 
 
f42f33d
 
78a7c54
 
 
 
 
 
 
 
 
 
 
 
 
f9c5a74
 
 
 
 
 
 
 
 
 
f42f33d
f9c5a74
2f43f91
f9c5a74
 
 
 
2f43f91
f9c5a74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f43f91
f9c5a74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f43f91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9c5a74
 
2f43f91
 
 
 
 
 
 
f9c5a74
 
2f43f91
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import spaces
import os
import gradio as gr
from pdf2image import convert_from_path
from byaldi import RAGMultiModalModel
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import torchvision
import subprocess

# Make sure the Poppler command-line tools (what setup.sh would install) exist.
def install_poppler():
    """Install ``poppler-utils`` with apt-get when ``pdfinfo`` is not on PATH.

    Presence is detected by looking ``pdfinfo`` up on PATH instead of
    executing it, so the probe cannot fail because of the tool's exit status.
    Installation is best-effort: failures are printed and never raised.

    Returns:
        None.
    """
    # Local import keeps this function self-contained.
    import shutil

    if shutil.which("pdfinfo") is not None:
        return

    print("Poppler not found. Installing...")
    apt_commands = (
        ["apt-get", "update"],
        ["apt-get", "install", "-y", "poppler-utils"],
    )
    for command in apt_commands:
        try:
            completed = subprocess.run(command, check=False)
        except OSError as exc:
            # apt-get itself is missing or not executable; nothing more to try.
            print(f"Could not run {' '.join(command)}: {exc}")
            return
        if completed.returncode != 0:
            # Keep going: the original flow also attempted every step.
            print(f"{' '.join(command)} exited with status {completed.returncode}")

# Call the Poppler installation check
install_poppler()

# Install flash-attn at startup.
# The extra variable is merged into the current environment: passing a
# one-entry dict as `env` would replace the whole environment (PATH, HOME,
# virtualenv settings) for the pip subprocess.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)

# Instantiate the ColPali retriever, then the Qwen2-VL generator and its processor.
RAG = RAGMultiModalModel.from_pretrained("vidore/colpali")

# Weights are loaded in bfloat16, then the model is moved to the GPU and
# switched to evaluation mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model = model.cuda().eval()

processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    trust_remote_code=True,
)

@spaces.GPU()
def process_pdf_and_query(pdf_file, user_query):
    """Answer a question about an uploaded PDF using retrieval plus generation.

    The PDF is rendered to page images and indexed with the module-level
    ``RAG`` retriever. The single best-matching page image is then passed,
    together with the query, to the module-level ``model``/``processor`` pair.

    Args:
        pdf_file: The uploaded file from the Gradio ``File`` input. Either an
            object exposing its path as ``.name`` or a plain path string;
            ``None`` when nothing was uploaded.
        user_query: The question to ask about the PDF.

    Returns:
        A ``(text, count)`` tuple: the generated answer (or a status message
        when the request cannot be processed) and the number of page images
        rendered from the PDF (``0`` when the PDF was never opened).
    """
    # Guard clauses: report missing input instead of failing with an
    # AttributeError further down.
    if pdf_file is None:
        return "Please upload a PDF file.", 0
    if not user_query or not user_query.strip():
        return "Please enter a query.", 0

    # Accept both file-like uploads (path in `.name`) and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    images = convert_from_path(pdf_path)
    num_images = len(images)

    RAG.index(
        input_path=pdf_path,
        index_name="image_index",
        store_collection_with_index=False,
        overwrite=True
    )

    # Search the query in the RAG model
    results = RAG.search(user_query, k=1)
    if not results:
        return "No results found.", num_images

    # Convert the reported page number to a 0-based index into `images`
    # (presumably `page_num` is 1-based; verify against byaldi).
    image_index = results[0]["page_num"] - 1
    if not 0 <= image_index < num_images:
        # Avoid IndexError / silent negative-index wraparound.
        return "The retrieved page does not match any page image.", num_images

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": images[image_index],
                },
                {"type": "text", "text": user_query},
            ],
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=50)
    # Drop the prompt tokens so only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return output_text[0], num_images


# Custom CSS handed to gr.Interface below: styles elements carrying the
# "duplicate-button" class and the overall ".gradio-container" wrapper.
css = """
    .duplicate-button {
        background-color: #6272a4;
        color: white;
        font-weight: bold;
        border-radius: 5px;
        margin-top: 20px;
        padding: 10px;
        text-align: center;
    }
    .gradio-container {
        background-color: #282a36;
        color: #f8f8f2;
        font-family: 'Courier New', Courier, monospace;
        padding: 20px;
        border-radius: 10px;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    }
"""

# Markdown description shown under the title of the interface.
explanation = """
### Multimodal RAG with Image Query
This demo showcases the **Multimodal RAG (Retrieval-Augmented Generation)** model. The RAG system integrates retrieval and generation, allowing it to retrieve relevant information from a multimodal database (like PDFs with text and images) and then generate detailed responses. 

We use **ColPali**, a state-of-the-art multimodal retriever, combined with the **Byaldi** library from **answer.ai**, which simplifies using ColPali. The language model used for generating answers is **Qwen/Qwen2-VL-2B-Instruct**, a powerful vision-language model capable of understanding both text and images.
"""

# HTML footer with author and project links, rendered below the interface.
footer = """
<div style="text-align: center; margin-top: 20px;">
    <a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank">LinkedIn</a> |
    <a href="https://github.com/arad1367" target="_blank">GitHub</a> |
    <a href="https://arad1367.pythonanywhere.com/" target="_blank">Live demo of my PhD defense</a> |
    <a href="https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct" target="_blank">Qwen/Qwen2-VL-2B-Instruct</a> |
    <a href="https://github.com/AnswerDotAI/byaldi" target="_blank">Byaldi</a> |
    <a href="https://github.com/illuin-tech/colpali" target="_blank">ColPali</a>
    <br>
    Made with 💖 by Pejman Ebrahimi
</div>
"""

# Input and output components. They are created up front and handed to
# gr.Interface, which lays them out.
pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])  # only PDFs can be processed
query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")
output_text = gr.Textbox(label="Model Answer")
output_images = gr.Textbox(label="Number of Images in PDF")
duplicate_button = gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")

# Build the Gradio app
demo = gr.Interface(
    fn=process_pdf_and_query,
    inputs=[pdf_input, query_input],
    outputs=[output_text, output_images],
    title="Multimodal RAG with Image Query - By Pejman Ebrahimi - Please like the space if it is useful",
    theme='freddyaboulton/dracula_revamped',
    css=css,
    description=explanation,
    allow_flagging="auto"
)

# Append the footer and the duplicate button to the generated layout.
with demo:
    gr.HTML(footer)
    # The button was created outside any Blocks context, so it has to be
    # rendered explicitly; a bare `duplicate_button` expression does nothing.
    duplicate_button.render()

demo.launch(debug=True)