Svngoku committed on
Commit
130cd2c
·
verified ·
1 Parent(s): dd232fc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -51
app.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import base64
3
  import gradio as gr
4
  import json
5
- import re # Added to fix NameError
6
  from mistralai import Mistral, DocumentURLChunk, ImageURLChunk, TextChunk
7
  from mistralai.models import OCRResponse
8
  from typing import Union, List, Tuple, Dict
@@ -13,8 +13,6 @@ import pymupdf as fitz
13
  import logging
14
  from tenacity import retry, stop_after_attempt, wait_exponential
15
  from concurrent.futures import ThreadPoolExecutor
16
- import socket
17
- from requests.exceptions import ConnectionError, Timeout
18
  from pathlib import Path
19
  from pydantic import BaseModel
20
  import pycountry
@@ -37,7 +35,7 @@ logging.basicConfig(
37
  )
38
  logger = logging.getLogger(__name__)
39
 
40
- # Language Enum for StructuredOCR
41
  languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
42
 
43
  class LanguageMeta(Enum.__class__):
@@ -80,8 +78,8 @@ class OCRProcessor:
80
  raise ValueError(f"API key validation failed: {str(e)}")
81
 
82
  @staticmethod
83
- def _check_file_size(file_input: Union[str, bytes]) -> None:
84
- if isinstance(file_input, str) and os.path.exists(file_input):
85
  size = os.path.getsize(file_input)
86
  elif hasattr(file_input, 'read'):
87
  size = len(file_input.read())
@@ -92,18 +90,18 @@ class OCRProcessor:
92
  raise ValueError(f"File size exceeds {MAX_FILE_SIZE/1024/1024}MB limit")
93
 
94
  @staticmethod
95
- def _save_uploaded_file(file_input: Union[str, bytes], filename: str) -> str:
96
  clean_filename = os.path.basename(filename).replace(os.sep, "_")
97
  file_path = os.path.join(UPLOAD_FOLDER, f"{int(time.time())}_{clean_filename}")
98
 
99
  try:
100
- if isinstance(file_input, str) and file_input.startswith("http"):
101
  logger.info(f"Downloading from URL: {file_input}")
102
  response = requests.get(file_input, timeout=30)
103
  response.raise_for_status()
104
  with open(file_path, 'wb') as f:
105
  f.write(response.content)
106
- elif isinstance(file_input, str) and os.path.exists(file_input):
107
  logger.info(f"Copying local file: {file_input}")
108
  shutil.copy2(file_input, file_path)
109
  else:
@@ -126,7 +124,7 @@ class OCRProcessor:
126
  try:
127
  with open(image_path, "rb") as image_file:
128
  encoded = base64.b64encode(image_file.read()).decode('utf-8')
129
- logger.info(f"Encoded image {image_path} to base64 (length: {len(encoded)})")
130
  return encoded
131
  except Exception as e:
132
  logger.error(f"Error encoding image {image_path}: {str(e)}")
@@ -172,7 +170,7 @@ class OCRProcessor:
172
 
173
  @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
174
  def _call_ocr_api(self, encoded_image: str) -> OCRResponse:
175
- logger.info(f"Calling OCR API with API key: {self.api_key[:4]}...") # Log partial key for debugging
176
  if not isinstance(encoded_image, str):
177
  raise TypeError(f"Expected encoded_image to be a string, got {type(encoded_image)}")
178
  base64_url = f"data:image/png;base64,{encoded_image}"
@@ -183,22 +181,16 @@ class OCRProcessor:
183
  include_image_base64=True
184
  )
185
  logger.info("OCR API call successful")
186
- try:
187
- if hasattr(response, 'model_dump_json'):
188
- response_dict = json.loads(response.model_dump_json())
189
- else:
190
- response_dict = {k: v for k, v in response.__dict__.items() if isinstance(v, (str, int, float, list, dict))}
191
- logger.info(f"Raw OCR response: {json.dumps(response_dict, default=str, indent=4)}")
192
- except Exception as log_err:
193
- logger.warning(f"Failed to log raw OCR response: {str(log_err)}")
194
  return response
195
  except Exception as e:
196
- logger.error(f"OCR API error: {str(e)}", exc_info=True)
 
 
197
  raise
198
 
199
  def _process_pdf_with_ocr(self, pdf_path: str) -> Tuple[str, List[str], List[Dict]]:
200
  try:
201
- logger.info(f"Processing PDF with API key: {self.api_key[:4]}...")
202
  uploaded_file = self.client.files.upload(
203
  file={"file_name": Path(pdf_path).stem, "content": Path(pdf_path).read_bytes()},
204
  purpose="ocr",
@@ -214,26 +206,21 @@ class OCRProcessor:
214
  json_results = self._convert_to_structured_json(markdown, pdf_path)
215
  image_paths = []
216
  if not any(page.images for page in ocr_response.pages):
217
- logger.warning("No images found in OCR response; using local images")
218
  image_data = self._pdf_to_images(pdf_path)
219
  image_paths = [path for path, _ in image_data]
220
  else:
221
  image_paths = [os.path.join(UPLOAD_FOLDER, f"ocr_page_{i}.png") for i in range(len(ocr_response.pages))]
222
  for i, base64_img in enumerate(base64_images):
223
- if base64_img:
224
  try:
225
  img_data = base64.b64decode(base64_img.split(',')[1])
226
  with open(image_paths[i], "wb") as f:
227
  f.write(img_data)
228
- if os.path.exists(image_paths[i]):
229
- logger.info(f"Image {image_paths[i]} saved and exists")
230
- else:
231
- logger.error(f"Image {image_paths[i]} saved but does not exist")
232
  except Exception as e:
233
  logger.error(f"Error saving image {i}: {str(e)}")
234
  image_paths[i] = None
235
  image_paths = [path for path in image_paths if path and os.path.exists(path)]
236
- logger.info(f"Final image paths: {image_paths}")
237
  return markdown, image_paths, json_results
238
  except Exception as e:
239
  return self._handle_error("PDF OCR processing", e), [], []
@@ -245,31 +232,25 @@ class OCRProcessor:
245
  image_data = {}
246
  for img in page.images:
247
  if img.image_base64:
248
- # Use correct MIME type based on image format (assuming JPEG from logs)
249
  base64_url = f"data:image/jpeg;base64,{img.image_base64}"
250
  image_data[img.id] = base64_url
251
  base64_images.append(base64_url)
252
- logger.info(f"Base64 image {img.id} length: {len(img.image_base64)}")
253
  else:
254
  base64_images.append(None)
255
  markdown = page.markdown or "No text detected"
256
  markdown = replace_images_in_markdown(markdown, image_data)
257
- logger.info(f"Page {i} markdown (first 200 chars): {markdown[:200]}...")
258
  markdowns.append(markdown)
259
  return "\n\n".join(markdowns), base64_images
260
 
261
  def _convert_to_structured_json(self, markdown: str, file_path: str) -> List[Dict]:
262
  try:
263
  text_only_markdown = re.sub(r'!\[.*?\]\(data:image/[^)]+\)', '', markdown)
264
- logger.info(f"Text-only markdown length: {len(text_only_markdown)}")
265
- logger.info(f"Text-only markdown content: {text_only_markdown[:200]}...")
266
-
267
  chat_response = self.client.chat.parse(
268
  model="pixtral-12b-latest",
269
  messages=[
270
  {
271
  "role": "user",
272
- "content": f"Given OCR output from a PDF about African history and artifacts, convert to JSON with file_name, topics (e.g., African Artifacts, Tribal History), languages (e.g., English), and ocr_contents (title and list of items with descriptions and image refs).\n\nOCR Output:\n{text_only_markdown}"
273
  },
274
  ],
275
  response_format=StructuredOCR,
@@ -277,13 +258,12 @@ class OCRProcessor:
277
  )
278
  structured_result = chat_response.choices[0].message.parsed
279
  json_str = structured_result.model_dump_json()
280
- logger.info(f"Structured JSON: {json_str}")
281
  return [json.loads(json_str)]
282
  except Exception as e:
283
- logger.error(f"Error converting to structured JSON: {str(e)}", exc_info=True)
284
  return [{"error": str(e), "file_name": Path(file_path).stem}]
285
 
286
- def ocr_uploaded_pdf(self, pdf_file: Union[str, bytes]) -> Tuple[str, List[str], List[Dict]]:
287
  file_path = self._save_uploaded_file(pdf_file, getattr(pdf_file, 'name', f"pdf_{int(time.time())}.pdf"))
288
  return self._process_pdf_with_ocr(file_path)
289
 
@@ -291,10 +271,9 @@ class OCRProcessor:
291
  file_path = self._save_uploaded_file(pdf_url, pdf_url.split('/')[-1] or f"pdf_{int(time.time())}.pdf")
292
  return self._process_pdf_with_ocr(file_path)
293
 
294
- def ocr_uploaded_image(self, image_file: Union[str, bytes]) -> Tuple[str, str, Dict]:
295
  file_path = self._save_uploaded_file(image_file, getattr(image_file, 'name', f"image_{int(time.time())}.jpg"))
296
  encoded_image = self._encode_image(file_path)
297
- base64_url = f"data:image/png;base64,{encoded_image}"
298
  response = self._call_ocr_api(encoded_image)
299
  markdown, base64_images = self._get_combined_markdown(response)
300
  json_result = self._convert_to_structured_json(markdown, file_path)[0]
@@ -302,7 +281,7 @@ class OCRProcessor:
302
 
303
  @staticmethod
304
  def _handle_error(context: str, error: Exception) -> str:
305
- logger.error(f"Error in {context}: {str(error)}", exc_info=True)
306
  return f"**Error in {context}:** {str(error)}"
307
 
308
  def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
@@ -351,8 +330,10 @@ def create_interface():
351
  process_image_btn = gr.Button("Process Image", variant="primary")
352
 
353
  def process_image(processor, image):
354
- if not processor or not image:
355
- return "Please set API key and upload an image", None, {}
 
 
356
  markdown, image_path, json_data = processor.ocr_uploaded_image(image)
357
  return markdown, image_path, json_data
358
 
@@ -380,22 +361,19 @@ def create_interface():
380
 
381
  def process_pdf(processor, pdf_file, pdf_url):
382
  if not processor:
383
- return "Please set API key first", [], {}
384
- logger.info(f"Received inputs - PDF file: {pdf_file}, PDF URL: {pdf_url}")
385
- if pdf_file is not None and hasattr(pdf_file, 'name'):
386
- logger.info(f"Processing as uploaded PDF: {pdf_file.name}")
387
  markdown, image_paths, json_data = processor.ocr_uploaded_pdf(pdf_file)
388
  elif pdf_url and pdf_url.strip():
389
- logger.info(f"Processing as PDF URL: {pdf_url}")
390
  markdown, image_paths, json_data = processor.ocr_pdf_url(pdf_url)
391
  else:
392
- return "Please upload a PDF or provide a valid URL", [], {}
393
- return markdown, image_paths, json_data
394
 
395
  process_pdf_btn.click(
396
  fn=process_pdf,
397
  inputs=[processor_state, pdf_input, pdf_url_input],
398
- outputs=[pdf_output, pdf_gallery, pdf_json_output]
399
  )
400
 
401
  return demo
 
2
  import base64
3
  import gradio as gr
4
  import json
5
+ import re
6
  from mistralai import Mistral, DocumentURLChunk, ImageURLChunk, TextChunk
7
  from mistralai.models import OCRResponse
8
  from typing import Union, List, Tuple, Dict
 
13
  import logging
14
  from tenacity import retry, stop_after_attempt, wait_exponential
15
  from concurrent.futures import ThreadPoolExecutor
 
 
16
  from pathlib import Path
17
  from pydantic import BaseModel
18
  import pycountry
 
35
  )
36
  logger = logging.getLogger(__name__)
37
 
38
+ # Language Enum
39
  languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
40
 
41
  class LanguageMeta(Enum.__class__):
 
78
  raise ValueError(f"API key validation failed: {str(e)}")
79
 
80
  @staticmethod
81
+ def _check_file_size(file_input: Union[str, bytes, Path]) -> None:
82
+ if isinstance(file_input, (str, Path)) and os.path.exists(file_input):
83
  size = os.path.getsize(file_input)
84
  elif hasattr(file_input, 'read'):
85
  size = len(file_input.read())
 
90
  raise ValueError(f"File size exceeds {MAX_FILE_SIZE/1024/1024}MB limit")
91
 
92
  @staticmethod
93
+ def _save_uploaded_file(file_input: Union[str, bytes, Path], filename: str) -> str:
94
  clean_filename = os.path.basename(filename).replace(os.sep, "_")
95
  file_path = os.path.join(UPLOAD_FOLDER, f"{int(time.time())}_{clean_filename}")
96
 
97
  try:
98
+ if isinstance(file_input, (str, Path)) and str(file_input).startswith("http"):
99
  logger.info(f"Downloading from URL: {file_input}")
100
  response = requests.get(file_input, timeout=30)
101
  response.raise_for_status()
102
  with open(file_path, 'wb') as f:
103
  f.write(response.content)
104
+ elif isinstance(file_input, (str, Path)) and os.path.exists(file_input):
105
  logger.info(f"Copying local file: {file_input}")
106
  shutil.copy2(file_input, file_path)
107
  else:
 
124
  try:
125
  with open(image_path, "rb") as image_file:
126
  encoded = base64.b64encode(image_file.read()).decode('utf-8')
127
+ logger.info(f"Encoded image {image_path} (length: {len(encoded)})")
128
  return encoded
129
  except Exception as e:
130
  logger.error(f"Error encoding image {image_path}: {str(e)}")
 
170
 
171
  @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
172
  def _call_ocr_api(self, encoded_image: str) -> OCRResponse:
173
+ logger.info("Calling OCR API")
174
  if not isinstance(encoded_image, str):
175
  raise TypeError(f"Expected encoded_image to be a string, got {type(encoded_image)}")
176
  base64_url = f"data:image/png;base64,{encoded_image}"
 
181
  include_image_base64=True
182
  )
183
  logger.info("OCR API call successful")
 
 
 
 
 
 
 
 
184
  return response
185
  except Exception as e:
186
+ if "401" in str(e) or "authentication" in str(e).lower():
187
+ raise ValueError("Authentication failed: Invalid API key")
188
+ logger.error(f"OCR API error: {str(e)}")
189
  raise
190
 
191
  def _process_pdf_with_ocr(self, pdf_path: str) -> Tuple[str, List[str], List[Dict]]:
192
  try:
193
+ logger.info(f"Processing PDF: {pdf_path}")
194
  uploaded_file = self.client.files.upload(
195
  file={"file_name": Path(pdf_path).stem, "content": Path(pdf_path).read_bytes()},
196
  purpose="ocr",
 
206
  json_results = self._convert_to_structured_json(markdown, pdf_path)
207
  image_paths = []
208
  if not any(page.images for page in ocr_response.pages):
209
+ logger.warning("No images in OCR response; using local conversion")
210
  image_data = self._pdf_to_images(pdf_path)
211
  image_paths = [path for path, _ in image_data]
212
  else:
213
  image_paths = [os.path.join(UPLOAD_FOLDER, f"ocr_page_{i}.png") for i in range(len(ocr_response.pages))]
214
  for i, base64_img in enumerate(base64_images):
215
+ if base64_img and ',' in base64_img:
216
  try:
217
  img_data = base64.b64decode(base64_img.split(',')[1])
218
  with open(image_paths[i], "wb") as f:
219
  f.write(img_data)
 
 
 
 
220
  except Exception as e:
221
  logger.error(f"Error saving image {i}: {str(e)}")
222
  image_paths[i] = None
223
  image_paths = [path for path in image_paths if path and os.path.exists(path)]
 
224
  return markdown, image_paths, json_results
225
  except Exception as e:
226
  return self._handle_error("PDF OCR processing", e), [], []
 
232
  image_data = {}
233
  for img in page.images:
234
  if img.image_base64:
 
235
  base64_url = f"data:image/jpeg;base64,{img.image_base64}"
236
  image_data[img.id] = base64_url
237
  base64_images.append(base64_url)
 
238
  else:
239
  base64_images.append(None)
240
  markdown = page.markdown or "No text detected"
241
  markdown = replace_images_in_markdown(markdown, image_data)
 
242
  markdowns.append(markdown)
243
  return "\n\n".join(markdowns), base64_images
244
 
245
  def _convert_to_structured_json(self, markdown: str, file_path: str) -> List[Dict]:
246
  try:
247
  text_only_markdown = re.sub(r'!\[.*?\]\(data:image/[^)]+\)', '', markdown)
 
 
 
248
  chat_response = self.client.chat.parse(
249
  model="pixtral-12b-latest",
250
  messages=[
251
  {
252
  "role": "user",
253
+ "content": f"Convert OCR output to JSON with file_name, topics, languages, and ocr_contents.\n\nOCR Output:\n{text_only_markdown}"
254
  },
255
  ],
256
  response_format=StructuredOCR,
 
258
  )
259
  structured_result = chat_response.choices[0].message.parsed
260
  json_str = structured_result.model_dump_json()
 
261
  return [json.loads(json_str)]
262
  except Exception as e:
263
+ logger.error(f"Error converting to JSON: {str(e)}")
264
  return [{"error": str(e), "file_name": Path(file_path).stem}]
265
 
266
+ def ocr_uploaded_pdf(self, pdf_file: Union[str, bytes, Path]) -> Tuple[str, List[str], List[Dict]]:
267
  file_path = self._save_uploaded_file(pdf_file, getattr(pdf_file, 'name', f"pdf_{int(time.time())}.pdf"))
268
  return self._process_pdf_with_ocr(file_path)
269
 
 
271
  file_path = self._save_uploaded_file(pdf_url, pdf_url.split('/')[-1] or f"pdf_{int(time.time())}.pdf")
272
  return self._process_pdf_with_ocr(file_path)
273
 
274
+ def ocr_uploaded_image(self, image_file: Union[str, bytes, Path]) -> Tuple[str, str, Dict]:
275
  file_path = self._save_uploaded_file(image_file, getattr(image_file, 'name', f"image_{int(time.time())}.jpg"))
276
  encoded_image = self._encode_image(file_path)
 
277
  response = self._call_ocr_api(encoded_image)
278
  markdown, base64_images = self._get_combined_markdown(response)
279
  json_result = self._convert_to_structured_json(markdown, file_path)[0]
 
281
 
282
  @staticmethod
283
  def _handle_error(context: str, error: Exception) -> str:
284
+ logger.error(f"Error in {context}: {str(error)}")
285
  return f"**Error in {context}:** {str(error)}"
286
 
287
  def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
 
330
  process_image_btn = gr.Button("Process Image", variant="primary")
331
 
332
  def process_image(processor, image):
333
+ if not processor:
334
+ return "Please set API key", None, {}
335
+ if not image:
336
+ return "Please upload an image", None, {}
337
  markdown, image_path, json_data = processor.ocr_uploaded_image(image)
338
  return markdown, image_path, json_data
339
 
 
361
 
362
  def process_pdf(processor, pdf_file, pdf_url):
363
  if not processor:
364
+ return "Please set API key", [], {}, "Please set API key"
365
+ if pdf_file:
 
 
366
  markdown, image_paths, json_data = processor.ocr_uploaded_pdf(pdf_file)
367
  elif pdf_url and pdf_url.strip():
 
368
  markdown, image_paths, json_data = processor.ocr_pdf_url(pdf_url)
369
  else:
370
+ return "Please upload a PDF or provide a URL", [], {}, "No input provided"
371
+ return markdown, image_paths, json_data, "✅ Processing complete"
372
 
373
  process_pdf_btn.click(
374
  fn=process_pdf,
375
  inputs=[processor_state, pdf_input, pdf_url_input],
376
+ outputs=[pdf_output, pdf_gallery, pdf_json_output, status]
377
  )
378
 
379
  return demo