File size: 13,431 Bytes
67a56f6
 
a6c0d87
 
3fdd093
87baec5
3fdd093
87baec5
 
ced2810
a6c0d87
87baec5
a6c0d87
40696fb
 
225229c
87baec5
 
 
 
a6c0d87
87baec5
a6c0d87
 
87baec5
a6c0d87
 
87baec5
a6c0d87
 
 
009e0ad
a6c0d87
 
 
 
87baec5
a6c0d87
87baec5
a6c0d87
 
 
 
 
87baec5
 
a6c0d87
 
 
 
 
 
87baec5
a6c0d87
87baec5
a6c0d87
87baec5
a6c0d87
87baec5
a6c0d87
 
 
 
 
 
 
 
87baec5
a6c0d87
87baec5
 
 
 
 
 
 
a6c0d87
 
 
 
 
87baec5
 
 
 
a6c0d87
 
87baec5
 
a6c0d87
87baec5
 
a6c0d87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7154bdc
a6c0d87
 
 
87baec5
a6c0d87
 
 
87baec5
 
 
 
a6c0d87
87baec5
 
a6c0d87
87baec5
a6c0d87
 
 
 
 
 
87baec5
a6c0d87
 
87baec5
 
 
 
 
a6c0d87
87baec5
a6c0d87
87baec5
a6c0d87
 
 
 
 
 
 
 
 
 
 
 
 
87baec5
a6c0d87
87baec5
 
a6c0d87
 
87baec5
 
 
 
 
 
 
 
a6c0d87
87baec5
a6c0d87
87baec5
a6c0d87
87baec5
 
 
a6c0d87
 
87baec5
a6c0d87
 
 
 
87baec5
 
 
 
a6c0d87
87baec5
 
 
 
 
 
 
 
a6c0d87
 
 
 
 
87baec5
 
 
a6c0d87
87baec5
a6c0d87
 
 
 
 
87baec5
 
 
a6c0d87
87baec5
 
 
 
 
 
 
 
a6c0d87
 
 
 
 
87baec5
a6c0d87
 
 
 
 
 
87baec5
a6c0d87
87baec5
a6c0d87
87baec5
a6c0d87
87baec5
a6c0d87
87baec5
 
 
 
 
 
 
 
a6c0d87
87baec5
 
 
a6c0d87
 
87baec5
a6c0d87
 
 
87baec5
a6c0d87
87baec5
 
 
a6c0d87
 
 
7fdd092
87baec5
 
a6c0d87
87baec5
a6c0d87
 
 
87baec5
 
a6c0d87
 
87baec5
 
a6c0d87
 
87baec5
a6c0d87
 
7fdd092
a6c0d87
d179e57
87baec5
 
a6c0d87
87baec5
 
a6c0d87
87baec5
a6c0d87
 
 
87baec5
 
a6c0d87
 
87baec5
 
 
 
a6c0d87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d179e57
3fdd093
87baec5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
import os
import gradio as gr
import tempfile
from pathlib import Path

# Import vectorstore and embeddings from langchain community package
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
# Text splitter to break large documents into manageable chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
# HF Inference client for running multimodal models
from huggingface_hub import InferenceClient
# Unstructured for PDF processing with image extraction
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.utils.constants import PartitionStrategy

# ── Globals ───────────────────────────────────────────────────────────────────
# Mutable module state; written by process_pdf_multimodal() and reset by
# clear_multimodal_interface().
index = None               # FAISS index storing document embeddings
retriever = None           # Retriever to fetch relevant chunks
current_pdf_name = None    # Name of the currently loaded PDF
extracted_content = None   # Combined text and image descriptions

# ── HF Inference clients ─────────────────────────────────────────────────────
# Text generation client (using a good open model)
text_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
# Vision client for image analysis
vision_client = InferenceClient(model="llava-hf/llava-1.5-7b-hf")

# ── Embeddings ───────────────────────────────────────────────────────────────
# Use BGE embeddings for vectorizing text chunks
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Create temporary directories for processing.
# NOTE(review): mkdtemp() is never cleaned up on process exit — presumably
# fine for a demo Space; confirm for long-running deployments.
temp_dir = tempfile.mkdtemp()
figures_dir = os.path.join(temp_dir, "figures")
os.makedirs(figures_dir, exist_ok=True)

def extract_image_description(image_path):
    """
    Analyze an extracted image with the vision model and return a text description.

    Args:
        image_path: Path to the extracted image file.

    Returns:
        A string of the form "Image content: <description>"; on any failure,
        a fallback string embedding the error message is returned instead of
        raising, so PDF processing can continue past a bad image.
    """
    try:
        # Load the raw image bytes before calling the remote model.
        with open(image_path, "rb") as img_file:
            image_bytes = img_file.read()
        # BUG FIX: InferenceClient has no `text_to_image_generation` method,
        # and text-to-image would *generate* a picture anyway. Describing an
        # existing image is the `image_to_text` (captioning) task.
        response = vision_client.image_to_text(image_bytes)
        # Newer huggingface_hub versions return an ImageToTextOutput object
        # with a `generated_text` field; older ones return a plain string.
        description = getattr(response, "generated_text", response)
        return f"Image content: {description}"
    except Exception as e:
        # Best-effort by design: report the failure inline rather than abort.
        return f"Image content: [Could not analyze image - {str(e)}]"

def process_pdf_multimodal(pdf_file):
    """
    Build a multimodal FAISS index from an uploaded PDF.

    Pipeline:
      1. Extract text plus image/table blocks with unstructured.
      2. Describe each exported figure via the vision model.
      3. Chunk the combined text and descriptions.
      4. Index the chunks in FAISS and expose a retriever.

    Args:
        pdf_file: Uploaded PDF file object (or None when nothing was uploaded).

    Returns:
        Tuple of (display name, status message, gr.update toggling the
        question input's interactivity).
    """
    global current_pdf_name, index, retriever, extracted_content

    if pdf_file is None:
        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

    current_pdf_name = os.path.basename(pdf_file.name)

    try:
        # Drop figures left over from any previously processed document.
        for leftover in os.listdir(figures_dir):
            os.remove(os.path.join(figures_dir, leftover))

        # High-resolution partitioning so images and tables are exported as
        # standalone files into figures_dir rather than inlined payloads.
        elements = partition_pdf(
            pdf_file.name,
            strategy=PartitionStrategy.HI_RES,
            extract_image_block_types=["Image", "Table"],
            extract_image_block_output_dir=figures_dir,
            extract_image_block_to_payload=False
        )

        # Keep the plain-text elements; visual elements are handled below.
        text_elements = [
            el.text for el in elements if el.category not in ["Image", "Table"]
        ]

        # Run every exported figure through the vision model.
        image_descriptions = []
        if os.path.exists(figures_dir):
            image_descriptions = [
                extract_image_description(os.path.join(figures_dir, fname))
                for fname in os.listdir(figures_dir)
                if fname.lower().endswith(('.png', '.jpg', '.jpeg'))
            ]

        # Merge textual and visual content into one searchable body.
        extracted_content = "\n\n".join(text_elements + image_descriptions)

        # Chunk for embedding; overlap preserves context across boundaries.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            add_start_index=True
        )
        chunks = splitter.split_text(extracted_content)

        # Build the vector index and a top-3 retriever over it.
        index = FAISS.from_texts(chunks, embeddings)
        retriever = index.as_retriever(search_kwargs={"k": 3})

        num_images = len(image_descriptions)
        status = f"βœ… Processed '{current_pdf_name}' β€” {len(chunks)} text chunks, {num_images} images analyzed"
        return current_pdf_name, status, gr.update(interactive=True)

    except Exception as e:
        return current_pdf_name, f"❌ Error processing PDF: {str(e)}", gr.update(interactive=False)

def ask_multimodal_question(pdf_name, question):
    """
    Answer a question using both text chunks and image descriptions
    retrieved from the indexed PDF.

    Args:
        pdf_name: Display name of the active PDF (unused).
        question: The user's question.

    Returns:
        The model-generated answer, or an error message string.
    """
    global retriever

    # Guard clauses: need a processed document and a non-blank question.
    if index is None or retriever is None:
        return "❌ Please upload and process a PDF first."

    if not question.strip():
        return "❌ Please enter a question."

    try:
        # Pull the most relevant chunks; these may mix plain text with
        # vision-model descriptions of images/charts/tables.
        relevant_docs = retriever.get_relevant_documents(question)
        context = "\n\n".join(d.page_content for d in relevant_docs)

        # Prompt explicitly tells the model that visual content appears
        # as textual descriptions inside the context.
        prompt = (
            "You are an AI assistant analyzing a document that contains both text and images. "
            "Use the following content (which includes text excerpts and descriptions of images/charts/tables) "
            "to answer the question comprehensively.\n\n"
            f"Document Content:\n{context}\n\n"
            f"Question: {question}\n\n"
            "Provide a detailed answer based on both the textual information and visual elements described above. "
            "If the answer involves data from charts, tables, or images, mention that explicitly.\n"
            "Answer:"
        )

        completion = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=256,
            temperature=0.5
        )
        return completion["choices"][0]["message"]["content"].strip()

    except Exception as e:
        return f"❌ Error generating answer: {str(e)}"

def generate_multimodal_summary():
    """
    Summarize the processed document, covering both its textual content and
    the extracted descriptions of visual elements.

    Returns:
        Summary text from the model, or an error message string.
    """
    if not extracted_content:
        return "❌ Please upload and process a PDF first."

    try:
        # Cap the prompt at the first 3000 characters of extracted content.
        snippet = extracted_content[:3000]

        prompt = (
            "Provide a comprehensive summary of this document that contains both text and visual elements "
            "(images, charts, tables). Mention key textual information as well as important visual content.\n\n"
            f"{snippet}..."
        )

        completion = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=200,
            temperature=0.5
        )
        return completion["choices"][0]["message"]["content"].strip()

    except Exception as e:
        return f"❌ Error generating summary: {str(e)}"

def extract_multimodal_keywords():
    """
    Extract key terms from the processed document, drawing on both the text
    and the descriptions of its visual elements.

    Returns:
        Keyword list text from the model, or an error message string.
    """
    if not extracted_content:
        return "❌ Please upload and process a PDF first."

    try:
        # Same 3000-character preview budget as the summary tool.
        snippet = extracted_content[:3000]

        prompt = (
            "Extract 10-15 key terms and concepts from this document that contains both text and visual elements. "
            "Include important terms from both textual content and visual elements like charts, images, and tables.\n\n"
            f"{snippet}..."
        )

        completion = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=100,
            temperature=0.5
        )
        return completion["choices"][0]["message"]["content"].strip()

    except Exception as e:
        return f"❌ Error extracting keywords: {str(e)}"

def clear_multimodal_interface():
    """
    Reset all global state, delete extracted figures, and clear the UI.

    Returns:
        Tuple of (None for the file input, "" for the PDF display, and a
        gr.update disabling the question input).
    """
    global index, retriever, current_pdf_name, extracted_content

    # Best-effort cleanup of the figures directory. BUG FIX: the original
    # bare `except:` also swallowed KeyboardInterrupt/SystemExit and hid
    # programming errors; only filesystem failures should be ignored here.
    try:
        for file in os.listdir(figures_dir):
            os.remove(os.path.join(figures_dir, file))
    except OSError:
        pass

    # Reset globals so subsequent questions require a fresh upload.
    index = retriever = None
    current_pdf_name = extracted_content = None

    return None, "", gr.update(interactive=False)

# ── Gradio UI ────────────────────────────────────────────────────────────────
# Soft indigo/blue theme shared by all components below.
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")

with gr.Blocks(theme=theme, css="""
    .container { border-radius: 10px; padding: 15px; }
    .pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
    .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
    .main-title {
        text-align: center;
        font-size: 64px;
        font-weight: bold;
        margin-bottom: 20px;
    }
    .multimodal-badge {
        background: linear-gradient(45deg, #6366f1, #8b5cf6);
        color: white;
        padding: 5px 15px;
        border-radius: 20px;
        font-size: 14px;
        display: inline-block;
        margin: 10px auto;
    }
""") as demo:
    
    # Application title with multimodal badge
    gr.Markdown("<div class='main-title'>MultiModal DocQueryAI</div>")
    gr.Markdown("<div style='text-align: center;'><span class='multimodal-badge'>πŸ–ΌοΈ Text + Images + Charts</span></div>")

    # Top row: document upload on the left, Q&A on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("## πŸ“„ Document Input")
            pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
            pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF (with images/charts)")
            upload_button = gr.Button("πŸ”„ Process Document (Extract Text + Images)", variant="primary")
            status_box = gr.Textbox(label="Processing Status", interactive=False)

        with gr.Column():
            gr.Markdown("## ❓ Ask Questions")
            gr.Markdown("*Ask about text content, images, charts, tables, or any visual elements in your PDF*")
            # Starts disabled; process_pdf_multimodal() enables it on success.
            question_input = gr.Textbox(
                lines=3, 
                placeholder="Ask about text, images, charts, or any content in the PDF...",
                interactive=False
            )
            ask_button = gr.Button("πŸ” Ask Question", variant="primary")
            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)

    # Analysis tools: whole-document summary and keyword extraction.
    with gr.Row():
        with gr.Column():
            summary_button = gr.Button("πŸ“‹ Generate Summary", variant="secondary")
            summary_output = gr.Textbox(label="Document Summary", lines=4, interactive=False)
        with gr.Column():
            keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
            keywords_output = gr.Textbox(label="Key Terms", lines=4, interactive=False)

    # Clear button resets globals and the UI via clear_multimodal_interface().
    clear_button = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
    
    gr.Markdown("""
    <div class='footer'>
        Powered by LangChain + Unstructured + Vision AI + FAISS | 
        Supports: Text, Images, Charts, Tables, Diagrams
    </div>
    """)

    # Event bindings: each click maps (inputs) -> handler -> (outputs).
    upload_button.click(
        process_pdf_multimodal, 
        [pdf_file], 
        [pdf_display, status_box, question_input]
    )
    ask_button.click(
        ask_multimodal_question, 
        [pdf_display, question_input], 
        answer_output
    )
    summary_button.click(generate_multimodal_summary, [], summary_output)
    keywords_button.click(extract_multimodal_keywords, [], keywords_output)
    clear_button.click(
        clear_multimodal_interface, 
        [], 
        [pdf_file, pdf_display, question_input]
    )

if __name__ == "__main__":
    # debug=True surfaces tracebacks in the console; share=True requests a
    # public Gradio tunnel URL.
    demo.launch(debug=True, share=True)