Spaces:
Running
Running
Re Restore
Browse files
app.py
CHANGED
@@ -3,7 +3,7 @@ import base64
|
|
3 |
import gradio as gr
|
4 |
from mistralai import Mistral, ImageURLChunk
|
5 |
from mistralai.models import OCRResponse
|
6 |
-
from typing import Union, List, Tuple
|
7 |
import requests
|
8 |
import shutil
|
9 |
import time
|
@@ -110,10 +110,7 @@ class OCRProcessor:
|
|
110 |
range(pdf_document.page_count)
|
111 |
))
|
112 |
pdf_document.close()
|
113 |
-
|
114 |
-
if not valid_data:
|
115 |
-
logger.warning("No valid images generated from PDF")
|
116 |
-
return valid_data
|
117 |
except Exception as e:
|
118 |
logger.error(f"Error converting PDF to images: {str(e)}")
|
119 |
return []
|
@@ -143,16 +140,11 @@ class OCRProcessor:
|
|
143 |
document=ImageURLChunk(image_url=base64_url),
|
144 |
include_image_base64=True
|
145 |
)
|
146 |
-
logger.info(
|
147 |
-
for page in response.pages:
|
148 |
-
logger.debug(f"Page markdown: {page.markdown}")
|
149 |
return response
|
150 |
except (ConnectionError, Timeout, socket.error) as e:
|
151 |
logger.error(f"Network error during OCR API call: {str(e)}")
|
152 |
raise
|
153 |
-
except Exception as e:
|
154 |
-
logger.error(f"OCR API error: {str(e)}")
|
155 |
-
raise
|
156 |
|
157 |
def ocr_uploaded_pdf(self, pdf_file: Union[str, bytes]) -> Tuple[str, List[str]]:
|
158 |
file_name = getattr(pdf_file, 'name', f"pdf_{int(time.time())}.pdf")
|
@@ -166,7 +158,7 @@ class OCRProcessor:
|
|
166 |
|
167 |
image_data = self._pdf_to_images(pdf_path)
|
168 |
if not image_data:
|
169 |
-
|
170 |
|
171 |
ocr_results = []
|
172 |
image_paths = [path for path, _ in image_data]
|
@@ -175,7 +167,7 @@ class OCRProcessor:
|
|
175 |
markdown_with_images = self._get_combined_markdown_with_images(response, image_paths, i)
|
176 |
ocr_results.append(markdown_with_images)
|
177 |
|
178 |
-
return "\n\n".join(ocr_results)
|
179 |
except Exception as e:
|
180 |
return self._handle_error("uploaded PDF processing", e), []
|
181 |
|
@@ -190,7 +182,7 @@ class OCRProcessor:
|
|
190 |
|
191 |
image_data = self._pdf_to_images(pdf_path)
|
192 |
if not image_data:
|
193 |
-
|
194 |
|
195 |
ocr_results = []
|
196 |
image_paths = [path for path, _ in image_data]
|
@@ -199,11 +191,11 @@ class OCRProcessor:
|
|
199 |
markdown_with_images = self._get_combined_markdown_with_images(response, image_paths, i)
|
200 |
ocr_results.append(markdown_with_images)
|
201 |
|
202 |
-
return "\n\n".join(ocr_results)
|
203 |
except Exception as e:
|
204 |
return self._handle_error("PDF URL processing", e), []
|
205 |
|
206 |
-
def ocr_uploaded_image(self, image_file: Union[str, bytes]) -> Tuple[str,
|
207 |
file_name = getattr(image_file, 'name', f"image_{int(time.time())}.jpg")
|
208 |
logger.info(f"Processing uploaded image: {file_name}")
|
209 |
try:
|
@@ -211,20 +203,17 @@ class OCRProcessor:
|
|
211 |
image_path = self._save_uploaded_file(image_file, file_name)
|
212 |
encoded_image = self._encode_image(image_path)
|
213 |
response = self._call_ocr_api(encoded_image)
|
214 |
-
|
215 |
-
preview_update = gr.Image.update(value=image_path) if image_path else gr.Image.update()
|
216 |
-
return markdown_with_images or "No text detected in image", preview_update
|
217 |
except Exception as e:
|
218 |
-
return self._handle_error("image processing", e),
|
219 |
|
220 |
@staticmethod
|
221 |
def _get_combined_markdown_with_images(response: OCRResponse, image_paths: List[str] = None, page_index: int = None) -> str:
|
222 |
markdown_parts = []
|
223 |
-
logger.info(f"Processing response with {len(response.pages)} pages")
|
224 |
for i, page in enumerate(response.pages):
|
225 |
-
if page.markdown
|
226 |
-
markdown = page.markdown
|
227 |
-
logger.info(f"Page {i} markdown: {markdown
|
228 |
if hasattr(page, 'images') and page.images:
|
229 |
logger.info(f"Found {len(page.images)} images in page {i}")
|
230 |
for img in page.images:
|
@@ -244,8 +233,10 @@ class OCRProcessor:
|
|
244 |
)
|
245 |
else:
|
246 |
logger.warning(f"No images found in page {i}")
|
|
|
247 |
if image_paths and page_index is not None and page_index < len(image_paths):
|
248 |
local_encoded = OCRProcessor._encode_image(image_paths[page_index])
|
|
|
249 |
placeholder = f"img-{i}.jpeg"
|
250 |
if placeholder in markdown:
|
251 |
markdown = markdown.replace(
|
@@ -253,15 +244,14 @@ class OCRProcessor:
|
|
253 |
f""
|
254 |
)
|
255 |
else:
|
|
|
256 |
markdown += f"\n\n"
|
257 |
markdown_parts.append(markdown)
|
258 |
-
else:
|
259 |
-
logger.warning(f"No markdown content in page {i}")
|
260 |
return "\n\n".join(markdown_parts) or "No text or images detected"
|
261 |
|
262 |
@staticmethod
|
263 |
def _handle_error(context: str, error: Exception) -> str:
|
264 |
-
logger.error(f"Error in {context}: {str(
|
265 |
return f"**Error in {context}:** {str(error)}"
|
266 |
|
267 |
def create_interface():
|
@@ -305,9 +295,8 @@ def create_interface():
|
|
305 |
|
306 |
def process_image(processor, image):
|
307 |
if not processor or not image:
|
308 |
-
return "Please set API key and upload an image",
|
309 |
-
|
310 |
-
return result, preview_update
|
311 |
|
312 |
process_image_btn.click(
|
313 |
fn=process_image,
|
@@ -332,19 +321,15 @@ def create_interface():
|
|
332 |
|
333 |
def process_pdf(processor, pdf_file, pdf_url):
|
334 |
if not processor:
|
335 |
-
return "Please set API key first",
|
336 |
logger.info(f"Received inputs - PDF file: {pdf_file}, PDF URL: {pdf_url}")
|
337 |
if pdf_file is not None and hasattr(pdf_file, 'name'):
|
338 |
logger.info(f"Processing as uploaded PDF: {pdf_file.name}")
|
339 |
-
|
340 |
-
gallery = gr.Gallery.update(value=[(p, os.path.basename(p)) for p in image_paths]) if image_paths else gr.Gallery.update()
|
341 |
-
return result, gallery
|
342 |
elif pdf_url and pdf_url.strip():
|
343 |
logger.info(f"Processing as PDF URL: {pdf_url}")
|
344 |
-
|
345 |
-
|
346 |
-
return result, gallery
|
347 |
-
return "Please upload a PDF or provide a valid URL", gr.Gallery.update()
|
348 |
|
349 |
process_pdf_btn.click(
|
350 |
fn=process_pdf,
|
@@ -359,4 +344,5 @@ if __name__ == "__main__":
|
|
359 |
print(f"===== Application Startup at {os.environ['START_TIME']} =====")
|
360 |
create_interface().launch(
|
361 |
share=True,
|
|
|
362 |
)
|
|
|
3 |
import gradio as gr
|
4 |
from mistralai import Mistral, ImageURLChunk
|
5 |
from mistralai.models import OCRResponse
|
6 |
+
from typing import Union, List, Tuple
|
7 |
import requests
|
8 |
import shutil
|
9 |
import time
|
|
|
110 |
range(pdf_document.page_count)
|
111 |
))
|
112 |
pdf_document.close()
|
113 |
+
return [data for data in image_data if data]
|
|
|
|
|
|
|
114 |
except Exception as e:
|
115 |
logger.error(f"Error converting PDF to images: {str(e)}")
|
116 |
return []
|
|
|
140 |
document=ImageURLChunk(image_url=base64_url),
|
141 |
include_image_base64=True
|
142 |
)
|
143 |
+
logger.info("OCR API call successful")
|
|
|
|
|
144 |
return response
|
145 |
except (ConnectionError, Timeout, socket.error) as e:
|
146 |
logger.error(f"Network error during OCR API call: {str(e)}")
|
147 |
raise
|
|
|
|
|
|
|
148 |
|
149 |
def ocr_uploaded_pdf(self, pdf_file: Union[str, bytes]) -> Tuple[str, List[str]]:
|
150 |
file_name = getattr(pdf_file, 'name', f"pdf_{int(time.time())}.pdf")
|
|
|
158 |
|
159 |
image_data = self._pdf_to_images(pdf_path)
|
160 |
if not image_data:
|
161 |
+
raise ValueError("No pages converted from PDF")
|
162 |
|
163 |
ocr_results = []
|
164 |
image_paths = [path for path, _ in image_data]
|
|
|
167 |
markdown_with_images = self._get_combined_markdown_with_images(response, image_paths, i)
|
168 |
ocr_results.append(markdown_with_images)
|
169 |
|
170 |
+
return "\n\n".join(ocr_results), image_paths
|
171 |
except Exception as e:
|
172 |
return self._handle_error("uploaded PDF processing", e), []
|
173 |
|
|
|
182 |
|
183 |
image_data = self._pdf_to_images(pdf_path)
|
184 |
if not image_data:
|
185 |
+
raise ValueError("No pages converted from PDF")
|
186 |
|
187 |
ocr_results = []
|
188 |
image_paths = [path for path, _ in image_data]
|
|
|
191 |
markdown_with_images = self._get_combined_markdown_with_images(response, image_paths, i)
|
192 |
ocr_results.append(markdown_with_images)
|
193 |
|
194 |
+
return "\n\n".join(ocr_results), image_paths
|
195 |
except Exception as e:
|
196 |
return self._handle_error("PDF URL processing", e), []
|
197 |
|
198 |
+
def ocr_uploaded_image(self, image_file: Union[str, bytes]) -> Tuple[str, str]:
|
199 |
file_name = getattr(image_file, 'name', f"image_{int(time.time())}.jpg")
|
200 |
logger.info(f"Processing uploaded image: {file_name}")
|
201 |
try:
|
|
|
203 |
image_path = self._save_uploaded_file(image_file, file_name)
|
204 |
encoded_image = self._encode_image(image_path)
|
205 |
response = self._call_ocr_api(encoded_image)
|
206 |
+
return self._get_combined_markdown_with_images(response), image_path
|
|
|
|
|
207 |
except Exception as e:
|
208 |
+
return self._handle_error("image processing", e), None
|
209 |
|
210 |
@staticmethod
|
211 |
def _get_combined_markdown_with_images(response: OCRResponse, image_paths: List[str] = None, page_index: int = None) -> str:
|
212 |
markdown_parts = []
|
|
|
213 |
for i, page in enumerate(response.pages):
|
214 |
+
if page.markdown.strip():
|
215 |
+
markdown = page.markdown
|
216 |
+
logger.info(f"Page {i} markdown: {markdown}")
|
217 |
if hasattr(page, 'images') and page.images:
|
218 |
logger.info(f"Found {len(page.images)} images in page {i}")
|
219 |
for img in page.images:
|
|
|
233 |
)
|
234 |
else:
|
235 |
logger.warning(f"No images found in page {i}")
|
236 |
+
# Replace known placeholders or append the local image
|
237 |
if image_paths and page_index is not None and page_index < len(image_paths):
|
238 |
local_encoded = OCRProcessor._encode_image(image_paths[page_index])
|
239 |
+
# Replace placeholders like img-0.jpeg
|
240 |
placeholder = f"img-{i}.jpeg"
|
241 |
if placeholder in markdown:
|
242 |
markdown = markdown.replace(
|
|
|
244 |
f""
|
245 |
)
|
246 |
else:
|
247 |
+
# Append the image if no placeholder is found
|
248 |
markdown += f"\n\n"
|
249 |
markdown_parts.append(markdown)
|
|
|
|
|
250 |
return "\n\n".join(markdown_parts) or "No text or images detected"
|
251 |
|
252 |
@staticmethod
|
253 |
def _handle_error(context: str, error: Exception) -> str:
|
254 |
+
logger.error(f"Error in {context}: {str(error)}")
|
255 |
return f"**Error in {context}:** {str(error)}"
|
256 |
|
257 |
def create_interface():
|
|
|
295 |
|
296 |
def process_image(processor, image):
|
297 |
if not processor or not image:
|
298 |
+
return "Please set API key and upload an image", None
|
299 |
+
return processor.ocr_uploaded_image(image)
|
|
|
300 |
|
301 |
process_image_btn.click(
|
302 |
fn=process_image,
|
|
|
321 |
|
322 |
def process_pdf(processor, pdf_file, pdf_url):
|
323 |
if not processor:
|
324 |
+
return "Please set API key first", []
|
325 |
logger.info(f"Received inputs - PDF file: {pdf_file}, PDF URL: {pdf_url}")
|
326 |
if pdf_file is not None and hasattr(pdf_file, 'name'):
|
327 |
logger.info(f"Processing as uploaded PDF: {pdf_file.name}")
|
328 |
+
return processor.ocr_uploaded_pdf(pdf_file)
|
|
|
|
|
329 |
elif pdf_url and pdf_url.strip():
|
330 |
logger.info(f"Processing as PDF URL: {pdf_url}")
|
331 |
+
return processor.ocr_pdf_url(pdf_url)
|
332 |
+
return "Please upload a PDF or provide a valid URL", []
|
|
|
|
|
333 |
|
334 |
process_pdf_btn.click(
|
335 |
fn=process_pdf,
|
|
|
344 |
print(f"===== Application Startup at {os.environ['START_TIME']} =====")
|
345 |
create_interface().launch(
|
346 |
share=True,
|
347 |
+
debug=True,
|
348 |
)
|