Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -11,11 +11,12 @@ import fitz # PyMuPDF
|
|
11 |
import io
|
12 |
import numpy as np
|
13 |
|
|
|
14 |
ckpt = "Daemontatox/DocumentCogito"
|
15 |
-
model = MllamaForConditionalGeneration.from_pretrained(ckpt,
|
16 |
-
torch_dtype=torch.bfloat16).to("cuda")
|
17 |
processor = AutoProcessor.from_pretrained(ckpt)
|
18 |
|
|
|
19 |
class DocumentState:
|
20 |
def __init__(self):
|
21 |
self.current_doc_images = []
|
@@ -29,27 +30,26 @@ class DocumentState:
|
|
29 |
|
30 |
doc_state = DocumentState()
|
31 |
|
|
|
32 |
def process_pdf_file(file_path):
|
33 |
"""Convert PDF to images and extract text using PyMuPDF."""
|
34 |
doc = fitz.open(file_path)
|
35 |
images = []
|
36 |
text = ""
|
37 |
|
38 |
-
#
|
39 |
-
|
40 |
-
page = doc[
|
41 |
-
text
|
42 |
pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
|
43 |
img_data = pix.tobytes("png")
|
44 |
img = Image.open(io.BytesIO(img_data))
|
45 |
images.append(img.convert("RGB"))
|
46 |
-
|
47 |
-
if doc.page_count > 1:
|
48 |
-
text += f"\nTotal pages in document: {doc.page_count}\n"
|
49 |
|
50 |
doc.close()
|
51 |
return images, text
|
52 |
|
|
|
53 |
def process_file(file):
|
54 |
"""Process either PDF or image file and update document state."""
|
55 |
doc_state.clear()
|
@@ -62,14 +62,15 @@ def process_file(file):
|
|
62 |
if file_path.lower().endswith('.pdf'):
|
63 |
doc_state.doc_type = 'pdf'
|
64 |
doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
|
65 |
-
return f"PDF
|
66 |
else:
|
67 |
doc_state.doc_type = 'image'
|
68 |
doc_state.current_doc_images = [Image.open(file_path).convert("RGB")]
|
69 |
return "Image loaded successfully. You can now ask questions about the content."
|
70 |
|
|
|
71 |
@spaces.GPU()
|
72 |
-
def bot_streaming(message, history, max_new_tokens=
|
73 |
txt = message["text"]
|
74 |
messages = []
|
75 |
|
@@ -79,10 +80,13 @@ def bot_streaming(message, history, max_new_tokens=2048):
|
|
79 |
|
80 |
# Process history
|
81 |
for i, msg in enumerate(history):
|
82 |
-
if isinstance(msg[0],
|
83 |
-
|
|
|
|
|
|
|
84 |
messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
|
85 |
-
elif isinstance(msg[0], str):
|
86 |
messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
|
87 |
messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
|
88 |
|
@@ -119,6 +123,7 @@ def bot_streaming(message, history, max_new_tokens=2048):
|
|
119 |
time.sleep(0.01)
|
120 |
yield buffer
|
121 |
|
|
|
122 |
def clear_context():
|
123 |
"""Clear the current document context."""
|
124 |
doc_state.clear()
|
@@ -127,7 +132,7 @@ def clear_context():
|
|
127 |
# Create the Gradio interface
|
128 |
with gr.Blocks() as demo:
|
129 |
gr.Markdown("# Document Analyzer with Chat Support")
|
130 |
-
gr.Markdown("Upload a PDF or image and chat about its contents. For PDFs,
|
131 |
|
132 |
chatbot = gr.ChatInterface(
|
133 |
fn=bot_streaming,
|
|
|
11 |
import io
|
12 |
import numpy as np
|
13 |
|
14 |
# Load model and processor from the document-understanding checkpoint.
ckpt = "Daemontatox/DocumentCogito"
# bfloat16 halves the memory footprint vs fp32; the model is moved to the
# GPU eagerly so the first request does not pay the transfer cost.
model = (
    MllamaForConditionalGeneration
    .from_pretrained(ckpt, torch_dtype=torch.bfloat16)
    .to("cuda")
)
processor = AutoProcessor.from_pretrained(ckpt)
|
18 |
|
19 |
+
# Document state to track uploaded files
|
20 |
class DocumentState:
|
21 |
def __init__(self):
|
22 |
self.current_doc_images = []
|
|
|
30 |
|
31 |
doc_state = DocumentState()
|
32 |
|
33 |
# Function to convert PDF to images and extract text
def process_pdf_file(file_path):
    """Convert a PDF to one RGB image per page and extract its text using PyMuPDF.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        tuple: (list of PIL RGB images, one per page, rendered at 300 DPI;
        concatenated text with a "Page N content:" header per page).
    """
    doc = fitz.open(file_path)
    images = []
    text = ""
    try:
        # Process each page: collect its text and a high-resolution render.
        for page_num in range(doc.page_count):
            page = doc[page_num]
            text += f"Page {page_num + 1} content:\n{page.get_text()}\n"
            # 300/72 scales PyMuPDF's default 72 DPI render up to 300 DPI.
            pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
            img_data = pix.tobytes("png")
            img = Image.open(io.BytesIO(img_data))
            images.append(img.convert("RGB"))
    finally:
        # Close the document even if text extraction or rendering raises,
        # so the underlying file handle is never leaked on malformed PDFs.
        doc.close()
    return images, text
|
51 |
|
52 |
+
# Function to process uploaded files (PDF or image)
|
53 |
def process_file(file):
|
54 |
"""Process either PDF or image file and update document state."""
|
55 |
doc_state.clear()
|
|
|
62 |
if file_path.lower().endswith('.pdf'):
|
63 |
doc_state.doc_type = 'pdf'
|
64 |
doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
|
65 |
+
return f"PDF processed. Total pages: {len(doc_state.current_doc_images)}. You can now ask questions about the content."
|
66 |
else:
|
67 |
doc_state.doc_type = 'image'
|
68 |
doc_state.current_doc_images = [Image.open(file_path).convert("RGB")]
|
69 |
return "Image loaded successfully. You can now ask questions about the content."
|
70 |
|
71 |
+
# Function to handle streaming responses from the model
|
72 |
@spaces.GPU()
|
73 |
+
def bot_streaming(message, history, max_new_tokens=8192):
|
74 |
txt = message["text"]
|
75 |
messages = []
|
76 |
|
|
|
80 |
|
81 |
# Process history
|
82 |
for i, msg in enumerate(history):
|
83 |
+
if isinstance(msg[0], dict): # Multimodal message (text + files)
|
84 |
+
user_content = [{"type": "text", "text": msg[0]["text"]}]
|
85 |
+
if "files" in msg[0] and len(msg[0]["files"]) > 0:
|
86 |
+
user_content.append({"type": "image"})
|
87 |
+
messages.append({"role": "user", "content": user_content})
|
88 |
messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
|
89 |
+
elif isinstance(msg[0], str): # Text-only message
|
90 |
messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
|
91 |
messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
|
92 |
|
|
|
123 |
time.sleep(0.01)
|
124 |
yield buffer
|
125 |
|
126 |
+
# Function to clear document context
|
127 |
def clear_context():
|
128 |
"""Clear the current document context."""
|
129 |
doc_state.clear()
|
|
|
132 |
# Create the Gradio interface
|
133 |
with gr.Blocks() as demo:
|
134 |
gr.Markdown("# Document Analyzer with Chat Support")
|
135 |
+
gr.Markdown("Upload a PDF or image and chat about its contents. For PDFs, all pages will be processed for visual analysis.")
|
136 |
|
137 |
chatbot = gr.ChatInterface(
|
138 |
fn=bot_streaming,
|