Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -11,11 +11,12 @@ import fitz # PyMuPDF
|
|
11 |
import io
|
12 |
import numpy as np
|
13 |
|
|
|
14 |
ckpt = "Daemontatox/DocumentCogito"
|
15 |
-
model = MllamaForConditionalGeneration.from_pretrained(ckpt,
|
16 |
-
torch_dtype=torch.bfloat16).to("cuda")
|
17 |
processor = AutoProcessor.from_pretrained(ckpt)
|
18 |
|
|
|
19 |
class DocumentState:
|
20 |
def __init__(self):
|
21 |
self.current_doc_images = []
|
@@ -29,27 +30,26 @@ class DocumentState:
|
|
29 |
|
30 |
doc_state = DocumentState()
|
31 |
|
|
|
32 |
def process_pdf_file(file_path):
|
33 |
"""Convert PDF to images and extract text using PyMuPDF."""
|
34 |
doc = fitz.open(file_path)
|
35 |
images = []
|
36 |
text = ""
|
37 |
|
38 |
-
#
|
39 |
-
|
40 |
-
page = doc[
|
41 |
-
text
|
42 |
pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
|
43 |
img_data = pix.tobytes("png")
|
44 |
img = Image.open(io.BytesIO(img_data))
|
45 |
images.append(img.convert("RGB"))
|
46 |
-
|
47 |
-
if doc.page_count > 1:
|
48 |
-
text += f"\nTotal pages in document: {doc.page_count}\n"
|
49 |
|
50 |
doc.close()
|
51 |
return images, text
|
52 |
|
|
|
53 |
def process_file(file):
|
54 |
"""Process either PDF or image file and update document state."""
|
55 |
doc_state.clear()
|
@@ -62,14 +62,15 @@ def process_file(file):
|
|
62 |
if file_path.lower().endswith('.pdf'):
|
63 |
doc_state.doc_type = 'pdf'
|
64 |
doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
|
65 |
-
return f"PDF
|
66 |
else:
|
67 |
doc_state.doc_type = 'image'
|
68 |
doc_state.current_doc_images = [Image.open(file_path).convert("RGB")]
|
69 |
return "Image loaded successfully. You can now ask questions about the content."
|
70 |
|
|
|
71 |
@spaces.GPU()
|
72 |
-
def bot_streaming(message, history, max_new_tokens=
|
73 |
txt = message["text"]
|
74 |
messages = []
|
75 |
|
@@ -79,10 +80,13 @@ def bot_streaming(message, history, max_new_tokens=2048):
|
|
79 |
|
80 |
# Process history
|
81 |
for i, msg in enumerate(history):
|
82 |
-
if isinstance(msg[0],
|
83 |
-
|
|
|
|
|
|
|
84 |
messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
|
85 |
-
elif isinstance(msg[0], str):
|
86 |
messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
|
87 |
messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
|
88 |
|
@@ -119,6 +123,7 @@ def bot_streaming(message, history, max_new_tokens=2048):
|
|
119 |
time.sleep(0.01)
|
120 |
yield buffer
|
121 |
|
|
|
122 |
def clear_context():
|
123 |
"""Clear the current document context."""
|
124 |
doc_state.clear()
|
@@ -127,7 +132,7 @@ def clear_context():
|
|
127 |
# Create the Gradio interface
|
128 |
with gr.Blocks() as demo:
|
129 |
gr.Markdown("# Document Analyzer with Chat Support")
|
130 |
-
gr.Markdown("Upload a PDF or image and chat about its contents. For PDFs,
|
131 |
|
132 |
chatbot = gr.ChatInterface(
|
133 |
fn=bot_streaming,
|
|
|
11 |
import io
|
12 |
import numpy as np
|
13 |
|
14 |
# Load model and processor from the document-understanding checkpoint.
ckpt = "Daemontatox/DocumentCogito"
# bfloat16 halves the memory footprint vs fp32; the model is moved to the
# GPU eagerly so the first request does not pay the transfer cost.
model = (
    MllamaForConditionalGeneration
    .from_pretrained(ckpt, torch_dtype=torch.bfloat16)
    .to("cuda")
)
processor = AutoProcessor.from_pretrained(ckpt)
|
18 |
|
19 |
+
# Document state to track uploaded files
|
20 |
class DocumentState:
|
21 |
def __init__(self):
|
22 |
self.current_doc_images = []
|
|
|
30 |
|
31 |
doc_state = DocumentState()
|
32 |
|
33 |
# Function to convert PDF to images and extract text
def process_pdf_file(file_path):
    """Convert a PDF to one RGB image per page and extract its text using PyMuPDF.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        tuple: (list of PIL RGB images, one per page, rendered at 300 DPI;
        concatenated text with a "Page N content:" header per page).
    """
    doc = fitz.open(file_path)
    images = []
    text = ""
    try:
        # Process each page: collect its text and a high-resolution render.
        for page_num in range(doc.page_count):
            page = doc[page_num]
            text += f"Page {page_num + 1} content:\n{page.get_text()}\n"
            # 300/72 scales PyMuPDF's default 72 DPI render up to 300 DPI.
            pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
            img_data = pix.tobytes("png")
            img = Image.open(io.BytesIO(img_data))
            images.append(img.convert("RGB"))
    finally:
        # Close the document even if text extraction or rendering raises,
        # so the underlying file handle is never leaked on malformed PDFs.
        doc.close()
    return images, text
|
51 |
|
52 |
+
# Function to process uploaded files (PDF or image)
|
53 |
def process_file(file):
|
54 |
"""Process either PDF or image file and update document state."""
|
55 |
doc_state.clear()
|
|
|
62 |
if file_path.lower().endswith('.pdf'):
|
63 |
doc_state.doc_type = 'pdf'
|
64 |
doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
|
65 |
+
return f"PDF processed. Total pages: {len(doc_state.current_doc_images)}. You can now ask questions about the content."
|
66 |
else:
|
67 |
doc_state.doc_type = 'image'
|
68 |
doc_state.current_doc_images = [Image.open(file_path).convert("RGB")]
|
69 |
return "Image loaded successfully. You can now ask questions about the content."
|
70 |
|
71 |
+
# Function to handle streaming responses from the model
|
72 |
@spaces.GPU()
|
73 |
+
def bot_streaming(message, history, max_new_tokens=8192):
|
74 |
txt = message["text"]
|
75 |
messages = []
|
76 |
|
|
|
80 |
|
81 |
# Process history
|
82 |
for i, msg in enumerate(history):
|
83 |
+
if isinstance(msg[0], dict): # Multimodal message (text + files)
|
84 |
+
user_content = [{"type": "text", "text": msg[0]["text"]}]
|
85 |
+
if "files" in msg[0] and len(msg[0]["files"]) > 0:
|
86 |
+
user_content.append({"type": "image"})
|
87 |
+
messages.append({"role": "user", "content": user_content})
|
88 |
messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
|
89 |
+
elif isinstance(msg[0], str): # Text-only message
|
90 |
messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
|
91 |
messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
|
92 |
|
|
|
123 |
time.sleep(0.01)
|
124 |
yield buffer
|
125 |
|
126 |
+
# Function to clear document context
|
127 |
def clear_context():
|
128 |
"""Clear the current document context."""
|
129 |
doc_state.clear()
|
|
|
132 |
# Create the Gradio interface
|
133 |
with gr.Blocks() as demo:
|
134 |
gr.Markdown("# Document Analyzer with Chat Support")
|
135 |
+
gr.Markdown("Upload a PDF or image and chat about its contents. For PDFs, all pages will be processed for visual analysis.")
|
136 |
|
137 |
chatbot = gr.ChatInterface(
|
138 |
fn=bot_streaming,
|