Daemontatox committed on
Commit
0f2aa55
·
verified ·
1 Parent(s): cd3a11d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -76
app.py CHANGED
@@ -35,10 +35,7 @@ class DocumentState:
35
  doc_state = DocumentState()
36
 
37
  def process_pdf_file(file_path):
38
- """
39
- Convert PDF to images and extract text using PyMuPDF with improved error handling
40
- and image quality settings.
41
- """
42
  try:
43
  doc = fitz.open(file_path)
44
  images = []
@@ -47,38 +44,24 @@ def process_pdf_file(file_path):
47
  for page_num in range(doc.page_count):
48
  try:
49
  page = doc[page_num]
50
-
51
- # Extract text with better formatting
52
  page_text = page.get_text("text")
53
- if page_text.strip(): # Only add non-empty pages
54
  text += f"Page {page_num + 1}:\n{page_text}\n\n"
55
 
56
- # Improved image extraction with error handling
57
- try:
58
- # Use higher DPI for better quality
59
- zoom = 2 # Increase zoom factor for better resolution
60
- mat = fitz.Matrix(zoom, zoom)
61
- pix = page.get_pixmap(matrix=mat, alpha=False)
62
-
63
- # Convert to PIL Image with proper color handling
64
- img_data = pix.tobytes("png")
65
- img = Image.open(io.BytesIO(img_data))
66
-
67
- # Ensure RGB mode and reasonable size
68
- img = img.convert("RGB")
69
-
70
- # Resize if image is too large (keeping aspect ratio)
71
- max_size = 1600
72
- if max(img.size) > max_size:
73
- ratio = max_size / max(img.size)
74
- new_size = tuple(int(dim * ratio) for dim in img.size)
75
- img = img.resize(new_size, Image.Resampling.LANCZOS)
76
-
77
- images.append(img)
78
-
79
- except Exception as e:
80
- logger.error(f"Error processing page {page_num} image: {str(e)}")
81
- continue
82
 
83
  except Exception as e:
84
  logger.error(f"Error processing page {page_num}: {str(e)}")
@@ -95,28 +78,27 @@ def process_pdf_file(file_path):
95
  logger.error(f"Error processing PDF file: {str(e)}")
96
  raise
97
 
98
- def process_file(file):
99
- """Process either PDF or image file with improved error handling."""
100
  try:
101
  doc_state.clear()
102
 
103
- if isinstance(file, dict):
104
- file_path = file["path"]
105
- else:
106
- file_path = file
107
 
108
- if file_path.lower().endswith('pdf'):
 
 
109
  doc_state.doc_type = 'pdf'
110
  try:
111
  doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
112
  return f"PDF processed successfully. Total pages: {len(doc_state.current_doc_images)}. You can now ask questions about the content."
113
  except Exception as e:
114
- return f"Error processing PDF: {str(e)}. Please try a different PDF file or check if the file is corrupted."
115
  else:
116
  doc_state.doc_type = 'image'
117
  try:
118
  img = Image.open(file_path).convert("RGB")
119
- # Resize if necessary
120
  max_size = 1600
121
  if max(img.size) > max_size:
122
  ratio = max_size / max(img.size)
@@ -133,28 +115,13 @@ def process_file(file):
133
  @spaces.GPU()
134
  def bot_streaming(message, history, max_new_tokens=8192):
135
  try:
136
- txt = message["text"]
137
  messages = []
138
 
139
- # Process new file if provided
140
- if message.get("files") and len(message["files"]) > 0:
141
- result = process_file(message["files"][0])
142
- if "Error" in result:
143
- yield result
144
- return
145
-
146
- # Process history with better error handling
147
  for i, msg in enumerate(history):
148
  try:
149
- if isinstance(msg[0], dict):
150
- user_content = [{"type": "text", "text": msg[0]["text"]}]
151
- if "files" in msg[0] and len(msg[0]["files"]) > 0:
152
- user_content.append({"type": "image"})
153
- messages.append({"role": "user", "content": user_content})
154
- messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
155
- elif isinstance(msg[0], str):
156
- messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
157
- messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
158
  except Exception as e:
159
  logger.error(f"Error processing history message {i}: {str(e)}")
160
  continue
@@ -162,10 +129,10 @@ def bot_streaming(message, history, max_new_tokens=8192):
162
  # Include document context
163
  if doc_state.current_doc_images:
164
  context = f"\nDocument context:\n{doc_state.current_doc_text}" if doc_state.current_doc_text else ""
165
- current_msg = f"{txt}{context}"
166
  messages.append({"role": "user", "content": [{"type": "text", "text": current_msg}, {"type": "image"}]})
167
  else:
168
- messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
169
 
170
  # Process inputs
171
  texts = processor.apply_chat_template(messages, add_generation_prompt=True)
@@ -210,17 +177,21 @@ with gr.Blocks() as demo:
210
  gr.Markdown("# Document Analyzer with Chat Support")
211
  gr.Markdown("Upload a PDF or image and chat about its contents. For PDFs, all pages will be processed for visual analysis.")
212
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  chatbot = gr.ChatInterface(
214
  fn=bot_streaming,
215
  title="Document Chat",
216
- examples=[
217
- [{"text": "Which era does this piece belong to? Give details about the era.", "files":["./examples/rococo.jpg"]}, 200],
218
- [{"text": "Where do the droughts happen according to this diagram?", "files":["./examples/weather_events.png"]}, 250],
219
- [{"text": "What happens when you take out white cat from this chain?", "files":["./examples/ai2d_test.jpg"]}, 250],
220
- [{"text": "How long does it take from invoice date to due date? Be short and concise.", "files":["./examples/invoice.png"]}, 250],
221
- [{"text": "Where to find this monument? Can you give me other recommendations around the area?", "files":["./examples/wat_arun.jpg"]}, 250],
222
- ],
223
- textbox=gr.MultimodalTextbox(),
224
  additional_inputs=[
225
  gr.Slider(
226
  minimum=10,
@@ -230,16 +201,20 @@ with gr.Blocks() as demo:
230
  label="Maximum number of new tokens to generate",
231
  )
232
  ],
233
- cache_examples=False,
234
  stop_btn="Stop Generation",
235
- fill_height=True,
236
- multimodal=True
237
  )
238
 
239
- clear_btn = gr.Button("Clear Document Context")
240
- clear_btn.click(fn=clear_context)
 
 
 
241
 
242
- chatbot.textbox.file_types = ["image", "pdf", "text"]
 
 
 
243
 
244
  # Launch the interface
245
  demo.launch(debug=True)
 
35
  doc_state = DocumentState()
36
 
37
  def process_pdf_file(file_path):
38
+ """Convert PDF to images and extract text using PyMuPDF."""
 
 
 
39
  try:
40
  doc = fitz.open(file_path)
41
  images = []
 
44
  for page_num in range(doc.page_count):
45
  try:
46
  page = doc[page_num]
 
 
47
  page_text = page.get_text("text")
48
+ if page_text.strip():
49
  text += f"Page {page_num + 1}:\n{page_text}\n\n"
50
 
51
+ zoom = 2
52
+ mat = fitz.Matrix(zoom, zoom)
53
+ pix = page.get_pixmap(matrix=mat, alpha=False)
54
+ img_data = pix.tobytes("png")
55
+ img = Image.open(io.BytesIO(img_data))
56
+ img = img.convert("RGB")
57
+
58
+ max_size = 1600
59
+ if max(img.size) > max_size:
60
+ ratio = max_size / max(img.size)
61
+ new_size = tuple(int(dim * ratio) for dim in img.size)
62
+ img = img.resize(new_size, Image.Resampling.LANCZOS)
63
+
64
+ images.append(img)
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  except Exception as e:
67
  logger.error(f"Error processing page {page_num}: {str(e)}")
 
78
  logger.error(f"Error processing PDF file: {str(e)}")
79
  raise
80
 
81
+ def process_uploaded_file(file):
82
+ """Process uploaded file and update document state."""
83
  try:
84
  doc_state.clear()
85
 
86
+ if file is None:
87
+ return "No file uploaded. Please upload a file."
 
 
88
 
89
+ file_path = file.name if isinstance(file, FileData) else file
90
+
91
+ if file_path.lower().endswith('.pdf'):
92
  doc_state.doc_type = 'pdf'
93
  try:
94
  doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
95
  return f"PDF processed successfully. Total pages: {len(doc_state.current_doc_images)}. You can now ask questions about the content."
96
  except Exception as e:
97
+ return f"Error processing PDF: {str(e)}. Please try a different PDF file."
98
  else:
99
  doc_state.doc_type = 'image'
100
  try:
101
  img = Image.open(file_path).convert("RGB")
 
102
  max_size = 1600
103
  if max(img.size) > max_size:
104
  ratio = max_size / max(img.size)
 
115
  @spaces.GPU()
116
  def bot_streaming(message, history, max_new_tokens=8192):
117
  try:
 
118
  messages = []
119
 
120
+ # Process history
 
 
 
 
 
 
 
121
  for i, msg in enumerate(history):
122
  try:
123
+ messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
124
+ messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
 
 
 
 
 
 
 
125
  except Exception as e:
126
  logger.error(f"Error processing history message {i}: {str(e)}")
127
  continue
 
129
  # Include document context
130
  if doc_state.current_doc_images:
131
  context = f"\nDocument context:\n{doc_state.current_doc_text}" if doc_state.current_doc_text else ""
132
+ current_msg = f"{message}{context}"
133
  messages.append({"role": "user", "content": [{"type": "text", "text": current_msg}, {"type": "image"}]})
134
  else:
135
+ messages.append({"role": "user", "content": [{"type": "text", "text": message}]})
136
 
137
  # Process inputs
138
  texts = processor.apply_chat_template(messages, add_generation_prompt=True)
 
177
  gr.Markdown("# Document Analyzer with Chat Support")
178
  gr.Markdown("Upload a PDF or image and chat about its contents. For PDFs, all pages will be processed for visual analysis.")
179
 
180
+ with gr.Row():
181
+ file_upload = gr.File(
182
+ label="Upload Document (PDF or Image)",
183
+ file_types=["pdf", "image"]
184
+ )
185
+ upload_status = gr.Textbox(
186
+ label="Upload Status",
187
+ interactive=False
188
+ )
189
+
190
+ clear_btn = gr.Button("Clear Document Context")
191
+
192
  chatbot = gr.ChatInterface(
193
  fn=bot_streaming,
194
  title="Document Chat",
 
 
 
 
 
 
 
 
195
  additional_inputs=[
196
  gr.Slider(
197
  minimum=10,
 
201
  label="Maximum number of new tokens to generate",
202
  )
203
  ],
 
204
  stop_btn="Stop Generation",
205
+ fill_height=True
 
206
  )
207
 
208
+ file_upload.change(
209
+ fn=process_uploaded_file,
210
+ inputs=[file_upload],
211
+ outputs=[upload_status]
212
+ )
213
 
214
+ clear_btn.click(
215
+ fn=clear_context,
216
+ outputs=[upload_status]
217
+ )
218
 
219
  # Launch the interface
220
  demo.launch(debug=True)