Spaces:

Daemontatox
/

Chat_with_PDFS_and_Images

Running on Zero

App Files Community

Daemontatox committed 27 days ago

Commit

0f2aa55

verified ·

1 Parent(s): cd3a11d

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -76

app.py CHANGED Viewed

@@ -35,10 +35,7 @@ class DocumentState:
 doc_state = DocumentState()
 def process_pdf_file(file_path):
-    """
-    Convert PDF to images and extract text using PyMuPDF with improved error handling
-    and image quality settings.
-    """
     try:
         doc = fitz.open(file_path)
         images = []
@@ -47,38 +44,24 @@ def process_pdf_file(file_path):
         for page_num in range(doc.page_count):
             try:
                 page = doc[page_num]
-                # Extract text with better formatting
                 page_text = page.get_text("text")
-                if page_text.strip():  # Only add non-empty pages
                     text += f"Page {page_num + 1}:\n{page_text}\n\n"
-                # Improved image extraction with error handling
-                try:
-                    # Use higher DPI for better quality
-                    zoom = 2  # Increase zoom factor for better resolution
-                    mat = fitz.Matrix(zoom, zoom)
-                    pix = page.get_pixmap(matrix=mat, alpha=False)
-                    # Convert to PIL Image with proper color handling
-                    img_data = pix.tobytes("png")
-                    img = Image.open(io.BytesIO(img_data))
-                    # Ensure RGB mode and reasonable size
-                    img = img.convert("RGB")
-                    # Resize if image is too large (keeping aspect ratio)
-                    max_size = 1600
-                    if max(img.size) > max_size:
-                        ratio = max_size / max(img.size)
-                        new_size = tuple(int(dim * ratio) for dim in img.size)
-                        img = img.resize(new_size, Image.Resampling.LANCZOS)
-                    images.append(img)
-                except Exception as e:
-                    logger.error(f"Error processing page {page_num} image: {str(e)}")
-                    continue
             except Exception as e:
                 logger.error(f"Error processing page {page_num}: {str(e)}")
@@ -95,28 +78,27 @@ def process_pdf_file(file_path):
         logger.error(f"Error processing PDF file: {str(e)}")
         raise
-def process_file(file):
-    """Process either PDF or image file with improved error handling."""
     try:
         doc_state.clear()
-        if isinstance(file, dict):
-            file_path = file["path"]
-        else:
-            file_path = file
-        if file_path.lower().endswith('pdf'):
             doc_state.doc_type = 'pdf'
             try:
                 doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
                 return f"PDF processed successfully. Total pages: {len(doc_state.current_doc_images)}. You can now ask questions about the content."
             except Exception as e:
-                return f"Error processing PDF: {str(e)}. Please try a different PDF file or check if the file is corrupted."
         else:
             doc_state.doc_type = 'image'
             try:
                 img = Image.open(file_path).convert("RGB")
-                # Resize if necessary
                 max_size = 1600
                 if max(img.size) > max_size:
                     ratio = max_size / max(img.size)
@@ -133,28 +115,13 @@ def process_file(file):
 @spaces.GPU()
 def bot_streaming(message, history, max_new_tokens=8192):
     try:
-        txt = message["text"]
         messages = []
-        # Process new file if provided
-        if message.get("files") and len(message["files"]) > 0:
-            result = process_file(message["files"][0])
-            if "Error" in result:
-                yield result
-                return
-        # Process history with better error handling
         for i, msg in enumerate(history):
             try:
-                if isinstance(msg[0], dict):
-                    user_content = [{"type": "text", "text": msg[0]["text"]}]
-                    if "files" in msg[0] and len(msg[0]["files"]) > 0:
-                        user_content.append({"type": "image"})
-                    messages.append({"role": "user", "content": user_content})
-                    messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
-                elif isinstance(msg[0], str):
-                    messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
-                    messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
             except Exception as e:
                 logger.error(f"Error processing history message {i}: {str(e)}")
                 continue
@@ -162,10 +129,10 @@ def bot_streaming(message, history, max_new_tokens=8192):
         # Include document context
         if doc_state.current_doc_images:
             context = f"\nDocument context:\n{doc_state.current_doc_text}" if doc_state.current_doc_text else ""
-            current_msg = f"{txt}{context}"
             messages.append({"role": "user", "content": [{"type": "text", "text": current_msg}, {"type": "image"}]})
         else:
-            messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
         # Process inputs
         texts = processor.apply_chat_template(messages, add_generation_prompt=True)
@@ -210,17 +177,21 @@ with gr.Blocks() as demo:
     gr.Markdown("# Document Analyzer with Chat Support")
     gr.Markdown("Upload a PDF or image and chat about its contents. For PDFs, all pages will be processed for visual analysis.")
     chatbot = gr.ChatInterface(
         fn=bot_streaming,
         title="Document Chat",
-        examples=[
-            [{"text": "Which era does this piece belong to? Give details about the era.", "files":["./examples/rococo.jpg"]}, 200],
-            [{"text": "Where do the droughts happen according to this diagram?", "files":["./examples/weather_events.png"]}, 250],
-            [{"text": "What happens when you take out white cat from this chain?", "files":["./examples/ai2d_test.jpg"]}, 250],
-            [{"text": "How long does it take from invoice date to due date? Be short and concise.", "files":["./examples/invoice.png"]}, 250],
-            [{"text": "Where to find this monument? Can you give me other recommendations around the area?", "files":["./examples/wat_arun.jpg"]}, 250],
-        ],
-        textbox=gr.MultimodalTextbox(),
         additional_inputs=[
             gr.Slider(
                 minimum=10,
@@ -230,16 +201,20 @@ with gr.Blocks() as demo:
                 label="Maximum number of new tokens to generate",
             )
         ],
-        cache_examples=False,
         stop_btn="Stop Generation",
-        fill_height=True,
-        multimodal=True
     )
-    clear_btn = gr.Button("Clear Document Context")
-    clear_btn.click(fn=clear_context)
-    chatbot.textbox.file_types = ["image", "pdf", "text"]
 # Launch the interface
 demo.launch(debug=True)

 doc_state = DocumentState()
 def process_pdf_file(file_path):
+    """Convert PDF to images and extract text using PyMuPDF."""
     try:
         doc = fitz.open(file_path)
         images = []
         for page_num in range(doc.page_count):
             try:
                 page = doc[page_num]
                 page_text = page.get_text("text")
+                if page_text.strip():
                     text += f"Page {page_num + 1}:\n{page_text}\n\n"
+                zoom = 2
+                mat = fitz.Matrix(zoom, zoom)
+                pix = page.get_pixmap(matrix=mat, alpha=False)
+                img_data = pix.tobytes("png")
+                img = Image.open(io.BytesIO(img_data))
+                img = img.convert("RGB")
+                max_size = 1600
+                if max(img.size) > max_size:
+                    ratio = max_size / max(img.size)
+                    new_size = tuple(int(dim * ratio) for dim in img.size)
+                    img = img.resize(new_size, Image.Resampling.LANCZOS)
+                images.append(img)
             except Exception as e:
                 logger.error(f"Error processing page {page_num}: {str(e)}")
         logger.error(f"Error processing PDF file: {str(e)}")
         raise
+def process_uploaded_file(file):
+    """Process uploaded file and update document state."""
     try:
         doc_state.clear()
+        if file is None:
+            return "No file uploaded. Please upload a file."
+        file_path = file.name if isinstance(file, FileData) else file
+        if file_path.lower().endswith('.pdf'):
             doc_state.doc_type = 'pdf'
             try:
                 doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
                 return f"PDF processed successfully. Total pages: {len(doc_state.current_doc_images)}. You can now ask questions about the content."
             except Exception as e:
+                return f"Error processing PDF: {str(e)}. Please try a different PDF file."
         else:
             doc_state.doc_type = 'image'
             try:
                 img = Image.open(file_path).convert("RGB")
                 max_size = 1600
                 if max(img.size) > max_size:
                     ratio = max_size / max(img.size)
 @spaces.GPU()
 def bot_streaming(message, history, max_new_tokens=8192):
     try:
         messages = []
+        # Process history
         for i, msg in enumerate(history):
             try:
+                messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
+                messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
             except Exception as e:
                 logger.error(f"Error processing history message {i}: {str(e)}")
                 continue
         # Include document context
         if doc_state.current_doc_images:
             context = f"\nDocument context:\n{doc_state.current_doc_text}" if doc_state.current_doc_text else ""
+            current_msg = f"{message}{context}"
             messages.append({"role": "user", "content": [{"type": "text", "text": current_msg}, {"type": "image"}]})
         else:
+            messages.append({"role": "user", "content": [{"type": "text", "text": message}]})
         # Process inputs
         texts = processor.apply_chat_template(messages, add_generation_prompt=True)
     gr.Markdown("# Document Analyzer with Chat Support")
     gr.Markdown("Upload a PDF or image and chat about its contents. For PDFs, all pages will be processed for visual analysis.")
+    with gr.Row():
+        file_upload = gr.File(
+            label="Upload Document (PDF or Image)",
+            file_types=["pdf", "image"]
+        )
+        upload_status = gr.Textbox(
+            label="Upload Status",
+            interactive=False
+        )
+    clear_btn = gr.Button("Clear Document Context")
     chatbot = gr.ChatInterface(
         fn=bot_streaming,
         title="Document Chat",
         additional_inputs=[
             gr.Slider(
                 minimum=10,
                 label="Maximum number of new tokens to generate",
             )
         ],
         stop_btn="Stop Generation",
+        fill_height=True
     )
+    file_upload.change(
+        fn=process_uploaded_file,
+        inputs=[file_upload],
+        outputs=[upload_status]
+    )
+    clear_btn.click(
+        fn=clear_context,
+        outputs=[upload_status]
+    )
 # Launch the interface
 demo.launch(debug=True)