Muzammil6376 committed
Commit b42840f · verified · 1 Parent(s): 3b36cb3

Update app.py

Files changed (1):
  1. app.py +83 -69

app.py CHANGED
@@ -40,6 +40,55 @@ def encode_image_to_base64(image_path):
     with open(image_path, "rb") as image_file:
         return base64.b64encode(image_file.read()).decode('utf-8')
 
+def extract_images_from_pdf_pymupdf(pdf_path):
+    """
+    Extract images from PDF using PyMuPDF (works on HF Spaces)
+    Args:
+        pdf_path: Path to the PDF file
+    Returns:
+        List of image paths and their descriptions
+    """
+    extracted_images = []
+    image_descriptions = []
+
+    try:
+        # Open PDF with PyMuPDF
+        pdf_document = fitz.open(pdf_path)
+
+        for page_num in range(len(pdf_document)):
+            page = pdf_document.load_page(page_num)
+            image_list = page.get_images()
+
+            for img_index, img in enumerate(image_list):
+                # Get image data
+                xref = img[0]
+                pix = fitz.Pixmap(pdf_document, xref)
+
+                # Convert to PIL Image
+                if pix.n - pix.alpha < 4:  # GRAY or RGB
+                    img_data = pix.tobytes("png")
+                    img_pil = Image.open(io.BytesIO(img_data))
+
+                    # Save image
+                    image_filename = f"page_{page_num}_img_{img_index}.png"
+                    image_path = os.path.join(figures_dir, image_filename)
+                    img_pil.save(image_path)
+
+                    # Analyze image with multimodal model
+                    description = analyze_image_with_multimodal_model(image_path)
+
+                    extracted_images.append(image_path)
+                    image_descriptions.append(description)
+
+                pix = None  # Free memory
+
+        pdf_document.close()
+        return extracted_images, image_descriptions
+
+    except Exception as e:
+        print(f"Error extracting images: {e}")
+        return [], []
+
 def analyze_image_with_multimodal_model(image_path):
     """
     Analyze an extracted image using the multimodal model.
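
The new `extract_images_from_pdf_pymupdf` silently drops CMYK images: the `pix.n - pix.alpha < 4` guard only passes GRAY and RGB pixmaps. A minimal sketch, assuming the same `fitz` (PyMuPDF) import app.py already uses, that converts other colorspaces to RGB instead of skipping them; the helper name and output path are illustrative, not part of the commit:

```python
import fitz  # PyMuPDF

def save_xref_as_png(pdf_document, xref, out_path):
    """Save any image xref as a PNG, converting CMYK/other colorspaces to RGB."""
    pix = fitz.Pixmap(pdf_document, xref)
    if pix.n - pix.alpha >= 4:              # CMYK or similar colorspace
        pix = fitz.Pixmap(fitz.csRGB, pix)  # convert rather than skip
    pix.save(out_path)                      # Pixmap.save infers PNG from the extension
    pix = None                              # release the pixmap buffer
```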
@@ -52,33 +101,22 @@ def analyze_image_with_multimodal_model(image_path):
         # Encode image to base64
         image_base64 = encode_image_to_base64(image_path)
 
-        # Create multimodal prompt
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": "Analyze this image and provide a detailed description. Include any text, data, charts, diagrams, tables, or important visual elements you can see. Be specific and comprehensive."
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/jpeg;base64,{image_base64}"
-                        }
-                    }
-                ]
-            }
-        ]
+        # Simple text-based prompt for HF Inference API
+        prompt = f"""Analyze this image and provide a detailed description. Include any text, data, charts, diagrams, tables, or important visual elements you can see. Be specific and comprehensive.
+
+Image: [Image data provided]
+
+Description:"""
 
         # Use multimodal model for image analysis
-        response = multimodal_client.chat_completion(
-            messages=messages,
-            max_tokens=200,
+        # Note: Simplified for HF Spaces compatibility
+        response = multimodal_client.text_generation(
+            prompt=prompt,
+            max_new_tokens=200,
             temperature=0.3
         )
 
-        description = response["choices"][0]["message"]["content"].strip()
+        description = response.strip()
        return f"[IMAGE CONTENT]: {description}"
 
     except Exception as e:
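
After this change the model never receives the image: `image_base64` is still computed, but `text_generation` sends only the prompt string, so the `Image: [Image data provided]` placeholder carries no pixels. A hedged alternative, assuming `multimodal_client` is a `huggingface_hub.InferenceClient`, that ships the actual bytes through the image-to-text task; the return-type handling covers both older (plain string) and newer (`.generated_text`) `huggingface_hub` versions:

```python
from huggingface_hub import InferenceClient

client = InferenceClient()  # stand-in for the app's multimodal_client

def describe_image(image_path):
    # image_to_text accepts a file path, raw bytes, or a file-like object
    result = client.image_to_text(image_path)
    # newer huggingface_hub returns an object with .generated_text; older, a str
    text = getattr(result, "generated_text", result)
    return f"[IMAGE CONTENT]: {text.strip()}"
```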
@@ -86,7 +124,7 @@ def analyze_image_with_multimodal_model(image_path):
 
 def process_pdf_multimodal(pdf_file):
     """
-    Process PDF with single multimodal model for both text and images.
+    Process PDF using PyMuPDF (HF Spaces compatible).
     """
     global current_pdf_name, index, retriever, extracted_content, extracted_images
 
@@ -101,31 +139,21 @@ def process_pdf_multimodal(pdf_file):
         for file in os.listdir(figures_dir):
             os.remove(os.path.join(figures_dir, file))
 
-        # Extract elements from PDF including images
-        elements = partition_pdf(
-            pdf_file.name,
-            strategy=PartitionStrategy.HI_RES,
-            extract_image_block_types=["Image", "Table"],
-            extract_image_block_output_dir=figures_dir,
-            extract_image_block_to_payload=False
-        )
-
-        # Separate text elements
+        # Extract text using PyMuPDF
+        pdf_document = fitz.open(pdf_file.name)
         text_elements = []
-        for element in elements:
-            if element.category not in ["Image", "Table"]:
-                if element.text.strip():  # Only add non-empty text
-                    text_elements.append(element.text.strip())
 
-        # Process extracted images with multimodal model
-        image_descriptions = []
-        if os.path.exists(figures_dir):
-            for image_file in os.listdir(figures_dir):
-                if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
-                    image_path = os.path.join(figures_dir, image_file)
-                    extracted_images.append(image_path)  # Store for later use
-                    description = analyze_image_with_multimodal_model(image_path)
-                    image_descriptions.append(description)
+        for page_num in range(len(pdf_document)):
+            page = pdf_document.load_page(page_num)
+            text = page.get_text()
+            if text.strip():
+                text_elements.append(f"[PAGE {page_num + 1}]\n{text.strip()}")
+
+        pdf_document.close()
+
+        # Extract images using PyMuPDF
+        image_paths, image_descriptions = extract_images_from_pdf_pymupdf(pdf_file.name)
+        extracted_images.extend(image_paths)
 
         # Combine all content
         all_content = text_elements + image_descriptions
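
`page.get_text()` returns one blob per page in the document's internal block order, which can scramble multi-column layouts. A sketch of the same loop using `get_text("blocks")` sorted by position, assuming plain PyMuPDF: block tuples are `(x0, y0, x1, y1, text, block_no, block_type)` with `block_type == 0` for text blocks:

```python
import fitz  # PyMuPDF

def page_texts_in_reading_order(pdf_path):
    texts = []
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            # sort blocks top-to-bottom, then left-to-right
            blocks = sorted(page.get_text("blocks"), key=lambda b: (b[1], b[0]))
            body = "\n".join(b[4].strip() for b in blocks
                             if b[6] == 0 and b[4].strip())
            if body:
                texts.append(f"[PAGE {page_num + 1}]\n{body}")
    return texts
```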
@@ -148,8 +176,8 @@
 
         # Status message
         num_images = len(image_descriptions)
-        num_text_elements = len(text_elements)
-        status = f"✅ Processed '{current_pdf_name}' — {len(chunks)} chunks ({num_text_elements} text sections, {num_images} images analyzed)"
+        num_text_pages = len(text_elements)
+        status = f"✅ Processed '{current_pdf_name}' — {len(chunks)} chunks ({num_text_pages} pages, {num_images} images analyzed)"
 
         return current_pdf_name, status, gr.update(interactive=True)
 
@@ -174,14 +202,8 @@ def ask_multimodal_question(pdf_name, question):
         docs = retriever.get_relevant_documents(question)
         context = "\n\n".join(doc.page_content for doc in docs)
 
-        # Create messages for multimodal model
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": f"""You are an AI assistant analyzing a document that contains both text and visual elements.
+        # Create prompt for text generation
+        prompt = f"""You are an AI assistant analyzing a document that contains both text and visual elements.
 
 RETRIEVED CONTEXT:
 {context}
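
Unchanged here, but worth noting: `get_relevant_documents` is deprecated in newer LangChain releases in favor of the Runnable interface. A one-line sketch, assuming the app's existing `retriever` object:

```python
docs = retriever.invoke(question)  # preferred over get_relevant_documents on langchain>=0.1
context = "\n\n".join(doc.page_content for doc in docs)
```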
@@ -193,23 +215,15 @@ Please provide a comprehensive answer based on the retrieved context above. The
 If your answer references visual elements (charts, graphs, images, tables), mention that explicitly. Keep your response focused and informative.
 
 ANSWER:"""
-                    }
-                ]
-            }
-        ]
-
-        # If question seems to be about images and we have extracted images,
-        # we could potentially include an image in the query (for advanced use cases)
 
         # Generate response with multimodal model
-        response = multimodal_client.chat_completion(
-            messages=messages,
-            max_tokens=300,
+        response = multimodal_client.text_generation(
+            prompt=prompt,
+            max_new_tokens=300,
             temperature=0.5
         )
 
-        answer = response["choices"][0]["message"]["content"].strip()
-        return answer
+        return response.strip()
 
     except Exception as e:
         return f"❌ Error generating answer: {str(e)}"
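
Some hosted vision/chat models expose only the chat route, so a raw `text_generation` call can fail at runtime with an unsupported-task error. A guarded sketch, assuming `multimodal_client` is the app's `InferenceClient`; the fallback reuses the same prompt as a single user message:

```python
def generate(prompt, max_tokens=300, temperature=0.5):
    try:
        out = multimodal_client.text_generation(
            prompt=prompt, max_new_tokens=max_tokens, temperature=temperature
        )
        return out.strip()
    except Exception:
        # fall back to the chat route with the same prompt
        response = multimodal_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return response.choices[0].message.content.strip()
```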
@@ -364,7 +378,7 @@ with gr.Blocks(theme=theme, css="""
     # Model information
     gr.Markdown("""
     <div class='model-info'>
-        <strong>🤖 Powered by:</strong> Microsoft Phi-3.5-Vision (Multimodal) + CLIP Embeddings (Text+Image) + Unstructured (PDF Processing)
+        <strong>🤖 Powered by:</strong> Microsoft Phi-3.5-Vision + CLIP Embeddings + PyMuPDF (HF Spaces Compatible)
     </div>
     """)
 
384