Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -2,7 +2,7 @@ import os
|
|
2 |
import base64
|
3 |
import gradio as gr
|
4 |
import json
|
5 |
-
import re
|
6 |
from mistralai import Mistral, DocumentURLChunk, ImageURLChunk, TextChunk
|
7 |
from mistralai.models import OCRResponse
|
8 |
from typing import Union, List, Tuple, Dict
|
@@ -13,8 +13,6 @@ import pymupdf as fitz
|
|
13 |
import logging
|
14 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
15 |
from concurrent.futures import ThreadPoolExecutor
|
16 |
-
import socket
|
17 |
-
from requests.exceptions import ConnectionError, Timeout
|
18 |
from pathlib import Path
|
19 |
from pydantic import BaseModel
|
20 |
import pycountry
|
@@ -37,7 +35,7 @@ logging.basicConfig(
|
|
37 |
)
|
38 |
logger = logging.getLogger(__name__)
|
39 |
|
40 |
-
# Language Enum
|
41 |
languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
|
42 |
|
43 |
class LanguageMeta(Enum.__class__):
|
@@ -80,8 +78,8 @@ class OCRProcessor:
|
|
80 |
raise ValueError(f"API key validation failed: {str(e)}")
|
81 |
|
82 |
@staticmethod
|
83 |
-
def _check_file_size(file_input: Union[str, bytes]) -> None:
|
84 |
-
if isinstance(file_input, str) and os.path.exists(file_input):
|
85 |
size = os.path.getsize(file_input)
|
86 |
elif hasattr(file_input, 'read'):
|
87 |
size = len(file_input.read())
|
@@ -92,18 +90,18 @@ class OCRProcessor:
|
|
92 |
raise ValueError(f"File size exceeds {MAX_FILE_SIZE/1024/1024}MB limit")
|
93 |
|
94 |
@staticmethod
|
95 |
-
def _save_uploaded_file(file_input: Union[str, bytes], filename: str) -> str:
|
96 |
clean_filename = os.path.basename(filename).replace(os.sep, "_")
|
97 |
file_path = os.path.join(UPLOAD_FOLDER, f"{int(time.time())}_{clean_filename}")
|
98 |
|
99 |
try:
|
100 |
-
if isinstance(file_input, str) and file_input.startswith("http"):
|
101 |
logger.info(f"Downloading from URL: {file_input}")
|
102 |
response = requests.get(file_input, timeout=30)
|
103 |
response.raise_for_status()
|
104 |
with open(file_path, 'wb') as f:
|
105 |
f.write(response.content)
|
106 |
-
elif isinstance(file_input, str) and os.path.exists(file_input):
|
107 |
logger.info(f"Copying local file: {file_input}")
|
108 |
shutil.copy2(file_input, file_path)
|
109 |
else:
|
@@ -126,7 +124,7 @@ class OCRProcessor:
|
|
126 |
try:
|
127 |
with open(image_path, "rb") as image_file:
|
128 |
encoded = base64.b64encode(image_file.read()).decode('utf-8')
|
129 |
-
logger.info(f"Encoded image {image_path}
|
130 |
return encoded
|
131 |
except Exception as e:
|
132 |
logger.error(f"Error encoding image {image_path}: {str(e)}")
|
@@ -172,7 +170,7 @@ class OCRProcessor:
|
|
172 |
|
173 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
|
174 |
def _call_ocr_api(self, encoded_image: str) -> OCRResponse:
|
175 |
-
logger.info(
|
176 |
if not isinstance(encoded_image, str):
|
177 |
raise TypeError(f"Expected encoded_image to be a string, got {type(encoded_image)}")
|
178 |
base64_url = f"data:image/png;base64,{encoded_image}"
|
@@ -183,22 +181,16 @@ class OCRProcessor:
|
|
183 |
include_image_base64=True
|
184 |
)
|
185 |
logger.info("OCR API call successful")
|
186 |
-
try:
|
187 |
-
if hasattr(response, 'model_dump_json'):
|
188 |
-
response_dict = json.loads(response.model_dump_json())
|
189 |
-
else:
|
190 |
-
response_dict = {k: v for k, v in response.__dict__.items() if isinstance(v, (str, int, float, list, dict))}
|
191 |
-
logger.info(f"Raw OCR response: {json.dumps(response_dict, default=str, indent=4)}")
|
192 |
-
except Exception as log_err:
|
193 |
-
logger.warning(f"Failed to log raw OCR response: {str(log_err)}")
|
194 |
return response
|
195 |
except Exception as e:
|
196 |
-
|
|
|
|
|
197 |
raise
|
198 |
|
199 |
def _process_pdf_with_ocr(self, pdf_path: str) -> Tuple[str, List[str], List[Dict]]:
|
200 |
try:
|
201 |
-
logger.info(f"Processing PDF
|
202 |
uploaded_file = self.client.files.upload(
|
203 |
file={"file_name": Path(pdf_path).stem, "content": Path(pdf_path).read_bytes()},
|
204 |
purpose="ocr",
|
@@ -214,26 +206,21 @@ class OCRProcessor:
|
|
214 |
json_results = self._convert_to_structured_json(markdown, pdf_path)
|
215 |
image_paths = []
|
216 |
if not any(page.images for page in ocr_response.pages):
|
217 |
-
logger.warning("No images
|
218 |
image_data = self._pdf_to_images(pdf_path)
|
219 |
image_paths = [path for path, _ in image_data]
|
220 |
else:
|
221 |
image_paths = [os.path.join(UPLOAD_FOLDER, f"ocr_page_{i}.png") for i in range(len(ocr_response.pages))]
|
222 |
for i, base64_img in enumerate(base64_images):
|
223 |
-
if base64_img:
|
224 |
try:
|
225 |
img_data = base64.b64decode(base64_img.split(',')[1])
|
226 |
with open(image_paths[i], "wb") as f:
|
227 |
f.write(img_data)
|
228 |
-
if os.path.exists(image_paths[i]):
|
229 |
-
logger.info(f"Image {image_paths[i]} saved and exists")
|
230 |
-
else:
|
231 |
-
logger.error(f"Image {image_paths[i]} saved but does not exist")
|
232 |
except Exception as e:
|
233 |
logger.error(f"Error saving image {i}: {str(e)}")
|
234 |
image_paths[i] = None
|
235 |
image_paths = [path for path in image_paths if path and os.path.exists(path)]
|
236 |
-
logger.info(f"Final image paths: {image_paths}")
|
237 |
return markdown, image_paths, json_results
|
238 |
except Exception as e:
|
239 |
return self._handle_error("PDF OCR processing", e), [], []
|
@@ -245,31 +232,25 @@ class OCRProcessor:
|
|
245 |
image_data = {}
|
246 |
for img in page.images:
|
247 |
if img.image_base64:
|
248 |
-
# Use correct MIME type based on image format (assuming JPEG from logs)
|
249 |
base64_url = f"data:image/jpeg;base64,{img.image_base64}"
|
250 |
image_data[img.id] = base64_url
|
251 |
base64_images.append(base64_url)
|
252 |
-
logger.info(f"Base64 image {img.id} length: {len(img.image_base64)}")
|
253 |
else:
|
254 |
base64_images.append(None)
|
255 |
markdown = page.markdown or "No text detected"
|
256 |
markdown = replace_images_in_markdown(markdown, image_data)
|
257 |
-
logger.info(f"Page {i} markdown (first 200 chars): {markdown[:200]}...")
|
258 |
markdowns.append(markdown)
|
259 |
return "\n\n".join(markdowns), base64_images
|
260 |
|
261 |
def _convert_to_structured_json(self, markdown: str, file_path: str) -> List[Dict]:
|
262 |
try:
|
263 |
text_only_markdown = re.sub(r'!\[.*?\]\(data:image/[^)]+\)', '', markdown)
|
264 |
-
logger.info(f"Text-only markdown length: {len(text_only_markdown)}")
|
265 |
-
logger.info(f"Text-only markdown content: {text_only_markdown[:200]}...")
|
266 |
-
|
267 |
chat_response = self.client.chat.parse(
|
268 |
model="pixtral-12b-latest",
|
269 |
messages=[
|
270 |
{
|
271 |
"role": "user",
|
272 |
-
"content": f"
|
273 |
},
|
274 |
],
|
275 |
response_format=StructuredOCR,
|
@@ -277,13 +258,12 @@ class OCRProcessor:
|
|
277 |
)
|
278 |
structured_result = chat_response.choices[0].message.parsed
|
279 |
json_str = structured_result.model_dump_json()
|
280 |
-
logger.info(f"Structured JSON: {json_str}")
|
281 |
return [json.loads(json_str)]
|
282 |
except Exception as e:
|
283 |
-
logger.error(f"Error converting to
|
284 |
return [{"error": str(e), "file_name": Path(file_path).stem}]
|
285 |
|
286 |
-
def ocr_uploaded_pdf(self, pdf_file: Union[str, bytes]) -> Tuple[str, List[str], List[Dict]]:
|
287 |
file_path = self._save_uploaded_file(pdf_file, getattr(pdf_file, 'name', f"pdf_{int(time.time())}.pdf"))
|
288 |
return self._process_pdf_with_ocr(file_path)
|
289 |
|
@@ -291,10 +271,9 @@ class OCRProcessor:
|
|
291 |
file_path = self._save_uploaded_file(pdf_url, pdf_url.split('/')[-1] or f"pdf_{int(time.time())}.pdf")
|
292 |
return self._process_pdf_with_ocr(file_path)
|
293 |
|
294 |
-
def ocr_uploaded_image(self, image_file: Union[str, bytes]) -> Tuple[str, str, Dict]:
|
295 |
file_path = self._save_uploaded_file(image_file, getattr(image_file, 'name', f"image_{int(time.time())}.jpg"))
|
296 |
encoded_image = self._encode_image(file_path)
|
297 |
-
base64_url = f"data:image/png;base64,{encoded_image}"
|
298 |
response = self._call_ocr_api(encoded_image)
|
299 |
markdown, base64_images = self._get_combined_markdown(response)
|
300 |
json_result = self._convert_to_structured_json(markdown, file_path)[0]
|
@@ -302,7 +281,7 @@ class OCRProcessor:
|
|
302 |
|
303 |
@staticmethod
|
304 |
def _handle_error(context: str, error: Exception) -> str:
|
305 |
-
logger.error(f"Error in {context}: {str(error)}"
|
306 |
return f"**Error in {context}:** {str(error)}"
|
307 |
|
308 |
def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
|
@@ -351,8 +330,10 @@ def create_interface():
|
|
351 |
process_image_btn = gr.Button("Process Image", variant="primary")
|
352 |
|
353 |
def process_image(processor, image):
|
354 |
-
if not processor
|
355 |
-
return "Please set API key
|
|
|
|
|
356 |
markdown, image_path, json_data = processor.ocr_uploaded_image(image)
|
357 |
return markdown, image_path, json_data
|
358 |
|
@@ -380,22 +361,19 @@ def create_interface():
|
|
380 |
|
381 |
def process_pdf(processor, pdf_file, pdf_url):
|
382 |
if not processor:
|
383 |
-
return "Please set API key
|
384 |
-
|
385 |
-
if pdf_file is not None and hasattr(pdf_file, 'name'):
|
386 |
-
logger.info(f"Processing as uploaded PDF: {pdf_file.name}")
|
387 |
markdown, image_paths, json_data = processor.ocr_uploaded_pdf(pdf_file)
|
388 |
elif pdf_url and pdf_url.strip():
|
389 |
-
logger.info(f"Processing as PDF URL: {pdf_url}")
|
390 |
markdown, image_paths, json_data = processor.ocr_pdf_url(pdf_url)
|
391 |
else:
|
392 |
-
return "Please upload a PDF or provide a
|
393 |
-
return markdown, image_paths, json_data
|
394 |
|
395 |
process_pdf_btn.click(
|
396 |
fn=process_pdf,
|
397 |
inputs=[processor_state, pdf_input, pdf_url_input],
|
398 |
-
outputs=[pdf_output, pdf_gallery, pdf_json_output]
|
399 |
)
|
400 |
|
401 |
return demo
|
|
|
2 |
import base64
|
3 |
import gradio as gr
|
4 |
import json
|
5 |
+
import re
|
6 |
from mistralai import Mistral, DocumentURLChunk, ImageURLChunk, TextChunk
|
7 |
from mistralai.models import OCRResponse
|
8 |
from typing import Union, List, Tuple, Dict
|
|
|
13 |
import logging
|
14 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
15 |
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
|
16 |
from pathlib import Path
|
17 |
from pydantic import BaseModel
|
18 |
import pycountry
|
|
|
35 |
)
|
36 |
logger = logging.getLogger(__name__)
|
37 |
|
38 |
+
# Language Enum
|
39 |
languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
|
40 |
|
41 |
class LanguageMeta(Enum.__class__):
|
|
|
78 |
raise ValueError(f"API key validation failed: {str(e)}")
|
79 |
|
80 |
@staticmethod
|
81 |
+
def _check_file_size(file_input: Union[str, bytes, Path]) -> None:
|
82 |
+
if isinstance(file_input, (str, Path)) and os.path.exists(file_input):
|
83 |
size = os.path.getsize(file_input)
|
84 |
elif hasattr(file_input, 'read'):
|
85 |
size = len(file_input.read())
|
|
|
90 |
raise ValueError(f"File size exceeds {MAX_FILE_SIZE/1024/1024}MB limit")
|
91 |
|
92 |
@staticmethod
|
93 |
+
def _save_uploaded_file(file_input: Union[str, bytes, Path], filename: str) -> str:
|
94 |
clean_filename = os.path.basename(filename).replace(os.sep, "_")
|
95 |
file_path = os.path.join(UPLOAD_FOLDER, f"{int(time.time())}_{clean_filename}")
|
96 |
|
97 |
try:
|
98 |
+
if isinstance(file_input, (str, Path)) and str(file_input).startswith("http"):
|
99 |
logger.info(f"Downloading from URL: {file_input}")
|
100 |
response = requests.get(file_input, timeout=30)
|
101 |
response.raise_for_status()
|
102 |
with open(file_path, 'wb') as f:
|
103 |
f.write(response.content)
|
104 |
+
elif isinstance(file_input, (str, Path)) and os.path.exists(file_input):
|
105 |
logger.info(f"Copying local file: {file_input}")
|
106 |
shutil.copy2(file_input, file_path)
|
107 |
else:
|
|
|
124 |
try:
|
125 |
with open(image_path, "rb") as image_file:
|
126 |
encoded = base64.b64encode(image_file.read()).decode('utf-8')
|
127 |
+
logger.info(f"Encoded image {image_path} (length: {len(encoded)})")
|
128 |
return encoded
|
129 |
except Exception as e:
|
130 |
logger.error(f"Error encoding image {image_path}: {str(e)}")
|
|
|
170 |
|
171 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
|
172 |
def _call_ocr_api(self, encoded_image: str) -> OCRResponse:
|
173 |
+
logger.info("Calling OCR API")
|
174 |
if not isinstance(encoded_image, str):
|
175 |
raise TypeError(f"Expected encoded_image to be a string, got {type(encoded_image)}")
|
176 |
base64_url = f"data:image/png;base64,{encoded_image}"
|
|
|
181 |
include_image_base64=True
|
182 |
)
|
183 |
logger.info("OCR API call successful")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
return response
|
185 |
except Exception as e:
|
186 |
+
if "401" in str(e) or "authentication" in str(e).lower():
|
187 |
+
raise ValueError("Authentication failed: Invalid API key")
|
188 |
+
logger.error(f"OCR API error: {str(e)}")
|
189 |
raise
|
190 |
|
191 |
def _process_pdf_with_ocr(self, pdf_path: str) -> Tuple[str, List[str], List[Dict]]:
|
192 |
try:
|
193 |
+
logger.info(f"Processing PDF: {pdf_path}")
|
194 |
uploaded_file = self.client.files.upload(
|
195 |
file={"file_name": Path(pdf_path).stem, "content": Path(pdf_path).read_bytes()},
|
196 |
purpose="ocr",
|
|
|
206 |
json_results = self._convert_to_structured_json(markdown, pdf_path)
|
207 |
image_paths = []
|
208 |
if not any(page.images for page in ocr_response.pages):
|
209 |
+
logger.warning("No images in OCR response; using local conversion")
|
210 |
image_data = self._pdf_to_images(pdf_path)
|
211 |
image_paths = [path for path, _ in image_data]
|
212 |
else:
|
213 |
image_paths = [os.path.join(UPLOAD_FOLDER, f"ocr_page_{i}.png") for i in range(len(ocr_response.pages))]
|
214 |
for i, base64_img in enumerate(base64_images):
|
215 |
+
if base64_img and ',' in base64_img:
|
216 |
try:
|
217 |
img_data = base64.b64decode(base64_img.split(',')[1])
|
218 |
with open(image_paths[i], "wb") as f:
|
219 |
f.write(img_data)
|
|
|
|
|
|
|
|
|
220 |
except Exception as e:
|
221 |
logger.error(f"Error saving image {i}: {str(e)}")
|
222 |
image_paths[i] = None
|
223 |
image_paths = [path for path in image_paths if path and os.path.exists(path)]
|
|
|
224 |
return markdown, image_paths, json_results
|
225 |
except Exception as e:
|
226 |
return self._handle_error("PDF OCR processing", e), [], []
|
|
|
232 |
image_data = {}
|
233 |
for img in page.images:
|
234 |
if img.image_base64:
|
|
|
235 |
base64_url = f"data:image/jpeg;base64,{img.image_base64}"
|
236 |
image_data[img.id] = base64_url
|
237 |
base64_images.append(base64_url)
|
|
|
238 |
else:
|
239 |
base64_images.append(None)
|
240 |
markdown = page.markdown or "No text detected"
|
241 |
markdown = replace_images_in_markdown(markdown, image_data)
|
|
|
242 |
markdowns.append(markdown)
|
243 |
return "\n\n".join(markdowns), base64_images
|
244 |
|
245 |
def _convert_to_structured_json(self, markdown: str, file_path: str) -> List[Dict]:
|
246 |
try:
|
247 |
text_only_markdown = re.sub(r'!\[.*?\]\(data:image/[^)]+\)', '', markdown)
|
|
|
|
|
|
|
248 |
chat_response = self.client.chat.parse(
|
249 |
model="pixtral-12b-latest",
|
250 |
messages=[
|
251 |
{
|
252 |
"role": "user",
|
253 |
+
"content": f"Convert OCR output to JSON with file_name, topics, languages, and ocr_contents.\n\nOCR Output:\n{text_only_markdown}"
|
254 |
},
|
255 |
],
|
256 |
response_format=StructuredOCR,
|
|
|
258 |
)
|
259 |
structured_result = chat_response.choices[0].message.parsed
|
260 |
json_str = structured_result.model_dump_json()
|
|
|
261 |
return [json.loads(json_str)]
|
262 |
except Exception as e:
|
263 |
+
logger.error(f"Error converting to JSON: {str(e)}")
|
264 |
return [{"error": str(e), "file_name": Path(file_path).stem}]
|
265 |
|
266 |
+
def ocr_uploaded_pdf(self, pdf_file: Union[str, bytes, Path]) -> Tuple[str, List[str], List[Dict]]:
|
267 |
file_path = self._save_uploaded_file(pdf_file, getattr(pdf_file, 'name', f"pdf_{int(time.time())}.pdf"))
|
268 |
return self._process_pdf_with_ocr(file_path)
|
269 |
|
|
|
271 |
file_path = self._save_uploaded_file(pdf_url, pdf_url.split('/')[-1] or f"pdf_{int(time.time())}.pdf")
|
272 |
return self._process_pdf_with_ocr(file_path)
|
273 |
|
274 |
+
def ocr_uploaded_image(self, image_file: Union[str, bytes, Path]) -> Tuple[str, str, Dict]:
|
275 |
file_path = self._save_uploaded_file(image_file, getattr(image_file, 'name', f"image_{int(time.time())}.jpg"))
|
276 |
encoded_image = self._encode_image(file_path)
|
|
|
277 |
response = self._call_ocr_api(encoded_image)
|
278 |
markdown, base64_images = self._get_combined_markdown(response)
|
279 |
json_result = self._convert_to_structured_json(markdown, file_path)[0]
|
|
|
281 |
|
282 |
@staticmethod
|
283 |
def _handle_error(context: str, error: Exception) -> str:
|
284 |
+
logger.error(f"Error in {context}: {str(error)}")
|
285 |
return f"**Error in {context}:** {str(error)}"
|
286 |
|
287 |
def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
|
|
|
330 |
process_image_btn = gr.Button("Process Image", variant="primary")
|
331 |
|
332 |
def process_image(processor, image):
|
333 |
+
if not processor:
|
334 |
+
return "Please set API key", None, {}
|
335 |
+
if not image:
|
336 |
+
return "Please upload an image", None, {}
|
337 |
markdown, image_path, json_data = processor.ocr_uploaded_image(image)
|
338 |
return markdown, image_path, json_data
|
339 |
|
|
|
361 |
|
362 |
def process_pdf(processor, pdf_file, pdf_url):
|
363 |
if not processor:
|
364 |
+
return "Please set API key", [], {}, "Please set API key"
|
365 |
+
if pdf_file:
|
|
|
|
|
366 |
markdown, image_paths, json_data = processor.ocr_uploaded_pdf(pdf_file)
|
367 |
elif pdf_url and pdf_url.strip():
|
|
|
368 |
markdown, image_paths, json_data = processor.ocr_pdf_url(pdf_url)
|
369 |
else:
|
370 |
+
return "Please upload a PDF or provide a URL", [], {}, "No input provided"
|
371 |
+
return markdown, image_paths, json_data, "✅ Processing complete"
|
372 |
|
373 |
process_pdf_btn.click(
|
374 |
fn=process_pdf,
|
375 |
inputs=[processor_state, pdf_input, pdf_url_input],
|
376 |
+
outputs=[pdf_output, pdf_gallery, pdf_json_output, status]
|
377 |
)
|
378 |
|
379 |
return demo
|