Spaces:

arad1367
/

Multimodal_RAG_Pejman

Runtime error

App Files Community

arad1367 committed on Oct 15, 2024

Commit

f21b6d3

verified ·

1 Parent(s): e50fed9

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -57

app.py CHANGED Viewed

@@ -33,20 +33,24 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_rem
 @spaces.GPU()
 def process_pdf_and_query(pdf_file, user_query):
     images = convert_from_path(pdf_file.name)
     num_images = len(images)
     RAG.index(
         input_path=pdf_file.name,
-        index_name="image_index",
         store_collection_with_index=False,
         overwrite=True
     )
     results = RAG.search(user_query, k=1)
     if not results:
         return "No results found.", num_images
     image_index = results[0]["page_num"] - 1
     messages = [
         {
@@ -61,6 +65,7 @@ def process_pdf_and_query(pdf_file, user_query):
         }
     ]
     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     image_inputs, video_inputs = process_vision_info(messages)
     inputs = processor(
@@ -72,6 +77,7 @@ def process_pdf_and_query(pdf_file, user_query):
     )
     inputs = inputs.to("cuda")
     generated_ids = model.generate(**inputs, max_new_tokens=50)
     generated_ids_trimmed = [
         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
@@ -82,37 +88,44 @@ def process_pdf_and_query(pdf_file, user_query):
     return output_text[0], num_images
 css = """
-    .duplicate-button {
-        background-color: #6272a4;
-        color: white;
-        font-weight: bold;
-        border-radius: 5px;
-        margin-top: 20px;
-        padding: 10px;
-        text-align: center;
-    }
-    .gradio-container {
-        background-color: #282a36;
-        color: #f8f8f2;
-        font-family: 'Courier New', Courier, monospace;
-        padding: 20px;
-        border-radius: 10px;
-        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
-    }
-    .title {
-        font-size: 30px;
-        font-weight: bold;
-        text-align: center;
-        margin-bottom: 20px;
-    }
 """
-explanation = """
-### Multimodal RAG with Image Query
-This demo showcases the **Multimodal RAG (Retriever-Augmented Generation)** model. The RAG system integrates retrieval and generation, allowing it to retrieve relevant information from a multimodal database (like PDFs with text and images) and then generate detailed responses.
-We use **ColPali**, a state-of-the-art multimodal retriever, combined with the **Byaldi** library from **answer.ai**, which simplifies using ColPali. The language model used for generating answers is **Qwen/Qwen2-VL-2B-Instruct**, a powerful vision-language model capable of understanding both text and images.
 """
 footer = """
@@ -124,36 +137,21 @@ footer = """
     <a href="https://github.com/AnswerDotAI/byaldi" target="_blank">Byaldi</a> |
     <a href="https://github.com/illuin-tech/colpali" target="_blank">ColPali</a>
     <br>
-    Made with 💖 by Pejman Ebrahimi
 </div>
 """
-pdf_input = gr.File(label="Upload PDF")  # Single PDF file input
-query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")  # User query input
-output_text = gr.Textbox(label="Model Answer")  # Output for the model's answer
-output_images = gr.Textbox(label="Number of Images in PDF")  # Output for number of images
-duplicate_button = gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
-# Launch the Gradio app
-demo = gr.Interface(
-    fn=process_pdf_and_query,
-    inputs=[pdf_input, query_input],  # List of inputs
-    outputs=[output_text, output_images],  # List of outputs
-    title="",
-    theme='freddyaboulton/dracula_revamped',
-    css=css,
-    description=explanation,
-    allow_flagging="auto"
-)
-with demo:
-    gr.HTML("""
-    <div class='title'>
-        Multimodal RAG with Image Query -
-        <a href="https://github.com/arad1367" target="_blank" style="color: #ff79c6; text-decoration: none;">
-            Pejman Ebrahimi
-        </a>
-    </div>
-    """)
     gr.HTML(footer)
-    duplicate_button

 @spaces.GPU()
 def process_pdf_and_query(pdf_file, user_query):
+    # Convert the PDF to images
     images = convert_from_path(pdf_file.name)
     num_images = len(images)
+    # Indexing the PDF in RAG
     RAG.index(
         input_path=pdf_file.name,
+        index_name="image_index",  # index will be saved at index_root/index_name/
         store_collection_with_index=False,
         overwrite=True
     )
+    # Search the query in the RAG model
     results = RAG.search(user_query, k=1)
     if not results:
         return "No results found.", num_images
+    # Retrieve the page number and process image
     image_index = results[0]["page_num"] - 1
     messages = [
         {
         }
     ]
+    # Generate text with the Qwen model
     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     image_inputs, video_inputs = process_vision_info(messages)
     inputs = processor(
     )
     inputs = inputs.to("cuda")
+    # Generate the output response
     generated_ids = model.generate(**inputs, max_new_tokens=50)
     generated_ids_trimmed = [
         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
     return output_text[0], num_images
+pdf_input = gr.File(label="Upload PDF")
+query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")
+output_text = gr.Textbox(label="Model Answer")
+output_images = gr.Textbox(label="Number of Images in PDF")
+# CSS styling
 css = """
+body {
+    background-color: #282a36;
+    font-family: Arial, sans-serif;
+    color: #f8f8f2;
+}
+h1 {
+    text-align: center;
+    font-size: 2.5em;
+    font-weight: bold;
+    margin-bottom: 20px;
+}
+footer {
+    margin-top: 20px;
+}
+.duplicate-button {
+    text-align: center;
+    background-color: #50fa7b;
+    color: #282a36;
+    font-weight: bold;
+    border: none;
+    padding: 10px;
+    cursor: pointer;
+}
 """
+description = """
+### About Multimodal RAG
+Multimodal Retrieval-Augmented Generation (RAG) integrates both images and text to provide more comprehensive and contextually accurate responses to user queries. It uses a retriever model like **ColPali** to search and retrieve relevant data and a large language model (LLM) like **Qwen/Qwen2-VL-2B-Instruct** to generate natural language answers based on the input.
+In this demo, **ColPali** is used as a multimodal retriever, and the **Byaldi** library from answer.ai simplifies the use of ColPali. We are utilizing **Qwen2-VL-2B-Instruct** for text generation, enabling both text and image-based queries.
 """
 footer = """
     <a href="https://github.com/AnswerDotAI/byaldi" target="_blank">Byaldi</a> |
     <a href="https://github.com/illuin-tech/colpali" target="_blank">ColPali</a>
     <br>
+    Made with 💖 by <a href="https://github.com/arad1367" target="_blank">Pejman Ebrahimi</a>
 </div>
 """
+# Gradio Interface
+with gr.Blocks(theme='freddyaboulton/dracula_revamped', css=css) as demo:
+    gr.Markdown("<h1>Multimodal RAG with Image Query</h1>")
+    gr.Markdown(description)
+    with gr.Row():
+        pdf_input = gr.File(label="Upload PDF")
+        query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")
+    output_text = gr.Textbox(label="Model Answer")
+    output_images = gr.Textbox(label="Number of Images in PDF")
+    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
     gr.HTML(footer)
+    demo.launch(debug=True)