arad1367 committed on
Commit
0a97d2e
·
verified ·
1 Parent(s): f21b6d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -50
app.py CHANGED
@@ -9,7 +9,7 @@ import torch
9
  import torchvision
10
  import subprocess
11
 
12
- # Run the commands from setup.sh to install poppler-utils
13
  def install_poppler():
14
  try:
15
  subprocess.run(["pdfinfo"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
@@ -34,7 +34,7 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_rem
34
  @spaces.GPU()
35
  def process_pdf_and_query(pdf_file, user_query):
36
  # Convert the PDF to images
37
- images = convert_from_path(pdf_file.name)
38
  num_images = len(images)
39
 
40
  # Indexing the PDF in RAG
@@ -88,44 +88,9 @@ def process_pdf_and_query(pdf_file, user_query):
88
 
89
  return output_text[0], num_images
90
 
91
-
92
- pdf_input = gr.File(label="Upload PDF")
93
- query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")
94
- output_text = gr.Textbox(label="Model Answer")
95
- output_images = gr.Textbox(label="Number of Images in PDF")
96
-
97
- # CSS styling
98
- css = """
99
- body {
100
- background-color: #282a36;
101
- font-family: Arial, sans-serif;
102
- color: #f8f8f2;
103
- }
104
- h1 {
105
- text-align: center;
106
- font-size: 2.5em;
107
- font-weight: bold;
108
- margin-bottom: 20px;
109
- }
110
- footer {
111
- margin-top: 20px;
112
- }
113
- .duplicate-button {
114
- text-align: center;
115
- background-color: #50fa7b;
116
- color: #282a36;
117
- font-weight: bold;
118
- border: none;
119
- padding: 10px;
120
- cursor: pointer;
121
- }
122
- """
123
-
124
  description = """
125
- ### About Multimodal RAG
126
- Multimodal Retrieval-Augmented Generation (RAG) integrates both images and text to provide more comprehensive and contextually accurate responses to user queries. It uses a retriever model like **ColPali** to search and retrieve relevant data and a large language model (LLM) like **Qwen/Qwen2-VL-2B-Instruct** to generate natural language answers based on the input.
127
-
128
- In this demo, **ColPali** is used as a multimodal retriever, and the **Byaldi** library from answer.ai simplifies the use of ColPali. We are utilizing **Qwen2-VL-2B-Instruct** for text generation, enabling both text and image-based queries.
129
  """
130
 
131
  footer = """
@@ -137,21 +102,24 @@ footer = """
137
  <a href="https://github.com/AnswerDotAI/byaldi" target="_blank">Byaldi</a> |
138
  <a href="https://github.com/illuin-tech/colpali" target="_blank">ColPali</a>
139
  <br>
140
- Made with 💖 by <a href="https://github.com/arad1367" target="_blank">Pejman Ebrahimi</a>
141
  </div>
142
  """
143
 
144
- # Gradio Interface
145
- with gr.Blocks(theme='freddyaboulton/dracula_revamped', css=css) as demo:
146
- gr.Markdown("<h1>Multimodal RAG with Image Query</h1>")
147
  gr.Markdown(description)
148
- with gr.Row():
149
- pdf_input = gr.File(label="Upload PDF")
150
- query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")
151
- output_text = gr.Textbox(label="Model Answer")
152
- output_images = gr.Textbox(label="Number of Images in PDF")
153
 
154
- gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
 
 
 
 
 
 
 
 
 
155
  gr.HTML(footer)
156
 
157
- demo.launch(debug=True)
 
9
  import torchvision
10
  import subprocess
11
 
12
+ # Run the commands from setup.sh to install poppler-utils -- This is necessary --> Noted by Pejman! --> I found this way instead of setup.sh
13
  def install_poppler():
14
  try:
15
  subprocess.run(["pdfinfo"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
34
  @spaces.GPU()
35
  def process_pdf_and_query(pdf_file, user_query):
36
  # Convert the PDF to images
37
+ images = convert_from_path(pdf_file.name)
38
  num_images = len(images)
39
 
40
  # Indexing the PDF in RAG
 
88
 
89
  return output_text[0], num_images
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  description = """
92
+ **Multimodal RAG** is a retrieval-augmented generation (RAG) model that works with multiple modalities, such as text and images, to retrieve relevant information from a knowledge base and generate coherent responses. In this demo, we use **ColPali**, a multimodal retriever capable of efficiently retrieving from large datasets, along with **Qwen2-VL-2B-Instruct**, a powerful large language model for answering questions based on the retrieved information.
93
+ Byaldi, developed by **Answer.ai**, is used to simplify the integration of ColPali into our pipeline.
 
 
94
  """
95
 
96
  footer = """
 
102
  <a href="https://github.com/AnswerDotAI/byaldi" target="_blank">Byaldi</a> |
103
  <a href="https://github.com/illuin-tech/colpali" target="_blank">ColPali</a>
104
  <br>
105
+ Made with 💖 by Pejman Ebrahimi
106
  </div>
107
  """
108
 
109
+ with gr.Blocks(theme='freddyaboulton/dracula_revamped') as demo:
110
+ gr.Markdown("<h1 style='text-align: center; font-weight: bold;'>Multimodal RAG with Image Query - By <a href='https://github.com/arad1367' target='_blank'>Pejman Ebrahimi</a></h1>")
 
111
  gr.Markdown(description)
 
 
 
 
 
112
 
113
+ pdf_input = gr.File(label="Upload PDF")
114
+ query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")
115
+ output_text = gr.Textbox(label="Model Answer")
116
+ output_images = gr.Textbox(label="Number of Images in PDF")
117
+ duplicate_button = gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
118
+
119
+ gr.Row([pdf_input, query_input])
120
+ gr.Row([output_text, output_images])
121
+ gr.Row([duplicate_button])
122
+
123
  gr.HTML(footer)
124
 
125
+ demo.launch(debug=True)