Spaces:

arad1367
/

Multimodal_RAG_Pejman

Runtime error

File size: 5,637 Bytes

7e738ef
78a7c54
f9c5a74
 
 
 
 
 
f42f33d
 
78a7c54
bc890cd
78a7c54
 
 
 
 
 
 
 
 
 
 
f9c5a74
 
 
 
 
 
 
 
 
 
f42f33d
f9c5a74
f21b6d3
bc890cd
f9c5a74
 
f21b6d3
f9c5a74
 
bc890cd
f9c5a74
 
 
 
f21b6d3
f9c5a74
 
 
 
f21b6d3
f9c5a74
 
 
 
 
 
 
 
 
 
 
 
 
 
f21b6d3
f9c5a74
 
 
 
 
 
 
 
 
 
 
f21b6d3
f9c5a74
 
 
 
 
 
 
 
 
 
bc890cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f21b6d3
bc890cd
 
 
 
2f43f91
 
bc890cd
 
 
 
 
2f43f91
 
 
 
 
 
 
 
 
bc890cd
2f43f91
 
 
bc890cd
 
 
 
 
 
 
 
 
 
0a97d2e
bc890cd
0a97d2e
bc890cd
f21b6d3
bc890cd

import spaces
import os
import gradio as gr
from pdf2image import convert_from_path
from byaldi import RAGMultiModalModel
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import torchvision
import subprocess

# Make sure the Poppler command-line tools are available before PDFs are handled.
def install_poppler():
    """Install ``poppler-utils`` through apt-get when ``pdfinfo`` is not on PATH.

    The availability probe only looks the executable up on PATH instead of
    executing it: ``pdfinfo`` called without a file argument exits with a
    non-zero status, so running it under ``check=True`` raised
    ``CalledProcessError`` precisely when Poppler was already installed.

    Installation is best effort. Both apt-get steps are attempted, a failing
    step is reported on stdout, and no exception is raised.
    """
    import shutil  # local import so this block does not depend on file-level edits

    if shutil.which("pdfinfo") is not None:
        return

    print("Poppler not found. Installing...")
    apt_steps = (
        ["apt-get", "update"],
        ["apt-get", "install", "-y", "poppler-utils"],
    )
    for step in apt_steps:
        try:
            completed = subprocess.run(step, check=False)
        except OSError as exc:
            # apt-get itself is missing or not executable; nothing more to try.
            print(f"Could not run {' '.join(step)}: {exc}")
            return
        if completed.returncode != 0:
            print(f"Command {' '.join(step)} exited with status {completed.returncode}")

# Call the Poppler installation check
install_poppler()

# Install flash-attn if not already installed.
# The skip-CUDA-build flag is merged into the current environment: passing a
# bare dict as `env` would replace the whole environment (PATH, HOME, proxy
# settings, ...) for the pip subprocess.
_flash_attn_install = subprocess.run(
    ["pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,
)
if _flash_attn_install.returncode != 0:
    # Best effort, as before: report the failure and carry on.
    print(f"flash-attn installation exited with status {_flash_attn_install.returncode}")

# Retriever: ColPali checkpoint loaded through Byaldi.
RAG = RAGMultiModalModel.from_pretrained("vidore/colpali")

# Generator: Qwen2-VL-2B-Instruct in bfloat16, moved to the GPU and switched
# to evaluation mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model = model.cuda().eval()

# Processor matching the generator checkpoint.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    trust_remote_code=True,
)

@spaces.GPU()
def process_pdf_and_query(pdf_file, user_query):
    """Answer ``user_query`` about an uploaded PDF.

    The PDF is converted to page images and indexed with the module-level
    ``RAG`` retriever. The page of the top search hit is then passed, together
    with the query, to the module-level ``model`` / ``processor`` pair.

    Args:
        pdf_file: Value delivered by the file-upload input. Either an object
            exposing the file path as ``.name`` or a plain path string;
            ``None`` when nothing was uploaded.
        user_query: The question to ask about the document.

    Returns:
        A ``(answer, page_count)`` tuple. ``answer`` is the decoded model
        output, or a short message when the upload or query is missing or the
        search returns nothing. ``page_count`` is the number of page images
        produced from the PDF (``0`` when the PDF was never opened).
    """
    # Guard clauses: without these a missing upload raised AttributeError on
    # ``pdf_file.name`` and a blank query still paid for indexing.
    if pdf_file is None:
        return "Please upload a PDF file.", 0
    if not user_query or not user_query.strip():
        return "Please enter a query.", 0

    # Accept both upload objects (path in ``.name``) and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Convert the PDF to images
    images = convert_from_path(pdf_path)
    num_images = len(images)

    # Indexing the PDF in RAG; the index is rebuilt on every call.
    RAG.index(
        input_path=pdf_path,
        index_name="image_index",
        store_collection_with_index=False,
        overwrite=True,
    )

    # Search the query in the RAG model, keeping only the best hit
    results = RAG.search(user_query, k=1)
    if not results:
        return "No results found.", num_images

    # "page_num" is treated as 1-based here, hence the shift to a list index.
    image_index = results[0]["page_num"] - 1
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": images[image_index],
                },
                {"type": "text", "text": user_query},
            ],
        }
    ]

    # Build the prompt text and vision inputs for the Qwen model
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Generate the output response
    generated_ids = model.generate(**inputs, max_new_tokens=50)
    # Drop the prompt tokens so that only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return output_text[0], num_images

# Custom stylesheet handed to Gradio through the `css` argument.
# It must be raw CSS: wrapped in <style> tags, the opening tag is parsed as
# part of the first selector and the `.title` rule is discarded.
css = """
    .title {
        text-align: center;
        font-size: 32px;
        font-weight: bold;
        margin-bottom: 20px;
    }
    .duplicate-button {
        background-color: #FFD700; /* Yellow */
        color: black;
        border: none;
        padding: 10px 20px;
        cursor: pointer;
        font-size: 16px;
        border-radius: 5px;
    }
    .gr-button {
        background-color: #4CAF50; /* Green */
        color: white;
        border: none;
        padding: 10px 20px;
        cursor: pointer;
        font-size: 16px;
        border-radius: 5px;
    }
"""

# Introductory HTML shown with the interface; passed as its `description`.
description = """
<div style="text-align: center; margin-bottom: 20px;">
    <p>Welcome to the Multimodal RAG interface! This tool allows you to query PDF documents using a combination of image and text information.</p>
    <p>We utilize ColPali as a multimodal retriever, and Byaldi is a new library by Answer.ai that simplifies the use of ColPali. Our system incorporates the Qwen/Qwen2-VL-2B-Instruct LLM for generating insightful responses based on the information retrieved.</p>
</div>
"""

# Input components (PDF upload, query text) and output components (model
# answer, page count) used when the interface is built.
pdf_input = gr.File(label="Upload PDF")  
query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")  
output_text = gr.Textbox(label="Model Answer")  
output_images = gr.Textbox(label="Number of Images in PDF")  

# Footer HTML with author and project links.
footer = """
<div style="text-align: center; margin-top: 20px;">
    <a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank">LinkedIn</a> |
    <a href="https://github.com/arad1367" target="_blank">GitHub</a> |
    <a href="https://arad1367.pythonanywhere.com/" target="_blank">Live demo of my PhD defense</a> |
    <a href="https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct" target="_blank">Qwen/Qwen2-VL-2B-Instruct</a> |
    <a href="https://github.com/AnswerDotAI/byaldi" target="_blank">Byaldi</a> |
    <a href="https://github.com/illuin-tech/colpali" target="_blank">ColPali</a>
    <br>
    Made with 💖 by <a href="https://github.com/arad1367" target="_blank">Pejman Ebrahimi</a>
</div>
"""

# Build and launch the Gradio app.
# gr.Interface offers no `add_component` method, so the duplicate button and
# the footer are laid out in an enclosing gr.Blocks, with the Interface
# rendered inside it. Theme and stylesheet are set on the outer Blocks.
with gr.Blocks(
    theme='freddyaboulton/dracula_revamped',
    css=css,
    title="Multimodal RAG with Image Query",
) as demo:
    gr.Interface(
        fn=process_pdf_and_query,
        inputs=[pdf_input, query_input],
        outputs=[output_text, output_images],
        title="<div class='title'>Multimodal RAG with Image Query</div>",
        description=description,
    )

    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")

    gr.HTML(footer)

demo.launch(debug=True)