Svngoku committed on
Commit
e057fa4
·
verified ·
1 Parent(s): bba970c
Files changed (1) hide show
  1. app.py +127 -165
app.py CHANGED
@@ -1,11 +1,9 @@
1
  import os
2
  import base64
3
  import gradio as gr
4
- import json
5
- import re
6
- from mistralai import Mistral, DocumentURLChunk, ImageURLChunk, TextChunk
7
  from mistralai.models import OCRResponse
8
- from typing import Union, List, Tuple, Dict
9
  import requests
10
  import shutil
11
  import time
@@ -13,11 +11,8 @@ import pymupdf as fitz
13
  import logging
14
  from tenacity import retry, stop_after_attempt, wait_exponential
15
  from concurrent.futures import ThreadPoolExecutor
16
- from pathlib import Path
17
- from pydantic import BaseModel
18
- import pycountry
19
- from enum import Enum
20
- from PIL import Image
21
 
22
  # Constants
23
  SUPPORTED_IMAGE_TYPES = [".jpg", ".png", ".jpeg"]
@@ -35,36 +30,10 @@ logging.basicConfig(
35
  )
36
  logger = logging.getLogger(__name__)
37
 
38
- # Language Enum
39
- languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
40
-
41
- class LanguageMeta(Enum.__class__):
42
- def __new__(metacls, cls, bases, classdict):
43
- for code, name in languages.items():
44
- classdict[name.upper().replace(' ', '_')] = name
45
- return super().__new__(metacls, cls, bases, classdict)
46
-
47
- class Language(Enum, metaclass=LanguageMeta):
48
- pass
49
-
50
- class StructuredOCR(BaseModel):
51
- file_name: str
52
- topics: list[str]
53
- languages: list[Language]
54
- ocr_contents: dict
55
-
56
- def model_dump_json(self, **kwargs):
57
- data = self.model_dump(exclude_unset=True, by_alias=True, mode='json')
58
- for key, value in data.items():
59
- if isinstance(value, list) and all(isinstance(item, Language) for item in value):
60
- data[key] = [item.value for item in value]
61
- return json.dumps(data, indent=4)
62
-
63
  class OCRProcessor:
64
  def __init__(self, api_key: str):
65
  if not api_key or not isinstance(api_key, str):
66
  raise ValueError("Valid API key must be provided")
67
- self.api_key = api_key
68
  self.client = Mistral(api_key=api_key)
69
  self._validate_client()
70
 
@@ -73,13 +42,12 @@ class OCRProcessor:
73
  models = self.client.models.list()
74
  if not models:
75
  raise ValueError("No models available")
76
- logger.info("API key validated successfully")
77
  except Exception as e:
78
  raise ValueError(f"API key validation failed: {str(e)}")
79
 
80
  @staticmethod
81
- def _check_file_size(file_input: Union[str, bytes, Path]) -> None:
82
- if isinstance(file_input, (str, Path)) and os.path.exists(file_input):
83
  size = os.path.getsize(file_input)
84
  elif hasattr(file_input, 'read'):
85
  size = len(file_input.read())
@@ -90,18 +58,18 @@ class OCRProcessor:
90
  raise ValueError(f"File size exceeds {MAX_FILE_SIZE/1024/1024}MB limit")
91
 
92
  @staticmethod
93
- def _save_uploaded_file(file_input: Union[str, bytes, Path], filename: str) -> str:
94
  clean_filename = os.path.basename(filename).replace(os.sep, "_")
95
  file_path = os.path.join(UPLOAD_FOLDER, f"{int(time.time())}_{clean_filename}")
96
 
97
  try:
98
- if isinstance(file_input, (str, Path)) and str(file_input).startswith("http"):
99
  logger.info(f"Downloading from URL: {file_input}")
100
  response = requests.get(file_input, timeout=30)
101
  response.raise_for_status()
102
  with open(file_path, 'wb') as f:
103
  f.write(response.content)
104
- elif isinstance(file_input, (str, Path)) and os.path.exists(file_input):
105
  logger.info(f"Copying local file: {file_input}")
106
  shutil.copy2(file_input, file_path)
107
  else:
@@ -123,12 +91,10 @@ class OCRProcessor:
123
  def _encode_image(image_path: str) -> str:
124
  try:
125
  with open(image_path, "rb") as image_file:
126
- encoded = base64.b64encode(image_file.read()).decode('utf-8')
127
- logger.info(f"Encoded image {image_path} (length: {len(encoded)})")
128
- return encoded
129
  except Exception as e:
130
  logger.error(f"Error encoding image {image_path}: {str(e)}")
131
- raise ValueError(f"Failed to encode image: {str(e)}")
132
 
133
  @staticmethod
134
  def _pdf_to_images(pdf_path: str) -> List[Tuple[str, str]]:
@@ -144,14 +110,10 @@ class OCRProcessor:
144
  range(pdf_document.page_count)
145
  ))
146
  pdf_document.close()
147
- valid_image_data = [(path, encoded) for path, encoded in image_data if path and encoded]
148
- if not valid_image_data:
149
- raise ValueError("No valid pages converted from PDF")
150
- logger.info(f"Converted {len(valid_image_data)} pages to images")
151
- return valid_image_data
152
  except Exception as e:
153
  logger.error(f"Error converting PDF to images: {str(e)}")
154
- raise
155
 
156
  @staticmethod
157
  def _convert_page(pdf_path: str, page_num: int) -> Tuple[str, str]:
@@ -170,132 +132,135 @@ class OCRProcessor:
170
 
171
  @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
172
  def _call_ocr_api(self, encoded_image: str) -> OCRResponse:
173
- logger.info("Calling OCR API")
174
- if not isinstance(encoded_image, str):
175
- raise TypeError(f"Expected encoded_image to be a string, got {type(encoded_image)}")
176
  base64_url = f"data:image/png;base64,{encoded_image}"
177
  try:
 
178
  response = self.client.ocr.process(
179
- document=ImageURLChunk(image_url=base64_url),
180
  model="mistral-ocr-latest",
 
181
  include_image_base64=True
182
  )
183
  logger.info("OCR API call successful")
184
  return response
185
- except Exception as e:
186
- if "401" in str(e) or "authentication" in str(e).lower():
187
- raise ValueError("Authentication failed: Invalid API key")
188
- logger.error(f"OCR API error: {str(e)}")
189
  raise
190
 
191
- def _process_pdf_with_ocr(self, pdf_path: str) -> Tuple[str, List[str], List[Dict]]:
 
 
192
  try:
193
- logger.info(f"Processing PDF: {pdf_path}")
194
- uploaded_file = self.client.files.upload(
195
- file={"file_name": Path(pdf_path).stem, "content": Path(pdf_path).read_bytes()},
196
- purpose="ocr",
197
- )
198
- signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id, expiry=1).url
199
-
200
- ocr_response = self.client.ocr.process(
201
- document=DocumentURLChunk(document_url=signed_url),
202
- model="mistral-ocr-latest",
203
- include_image_base64=True
204
- )
205
- markdown, base64_images = self._get_combined_markdown(ocr_response)
206
- json_results = self._convert_to_structured_json(markdown, pdf_path)
207
- image_paths = []
208
- if not any(page.images for page in ocr_response.pages):
209
- logger.warning("No images in OCR response; using local conversion")
210
- image_data = self._pdf_to_images(pdf_path)
211
- image_paths = [path for path, _ in image_data]
212
- else:
213
- image_paths = [os.path.join(UPLOAD_FOLDER, f"ocr_page_{i}.png") for i in range(len(ocr_response.pages))]
214
- for i, base64_img in enumerate(base64_images):
215
- if base64_img and ',' in base64_img:
216
- try:
217
- img_data = base64.b64decode(base64_img.split(',')[1])
218
- with open(image_paths[i], "wb") as f:
219
- f.write(img_data)
220
- except Exception as e:
221
- logger.error(f"Error saving image {i}: {str(e)}")
222
- image_paths[i] = None
223
- image_paths = [path for path in image_paths if path and os.path.exists(path)]
224
- return markdown, image_paths, json_results
225
  except Exception as e:
226
- return self._handle_error("PDF OCR processing", e), [], []
227
 
228
- def _get_combined_markdown(self, ocr_response: OCRResponse) -> Tuple[str, List[str]]:
229
- markdowns = []
230
- base64_images = []
231
- for i, page in enumerate(ocr_response.pages):
232
- image_data = {}
233
- for img in page.images:
234
- if img.image_base64:
235
- base64_url = f"data:image/jpeg;base64,{img.image_base64}"
236
- image_data[img.id] = base64_url
237
- base64_images.append(base64_url)
238
- else:
239
- base64_images.append(None)
240
- markdown = page.markdown or "No text detected"
241
- markdown = replace_images_in_markdown(markdown, image_data)
242
- markdowns.append(markdown)
243
- return "\n\n".join(markdowns), base64_images
244
-
245
- def _convert_to_structured_json(self, markdown: str, file_path: str) -> List[Dict]:
246
  try:
247
- text_only_markdown = re.sub(r'!\[.*?\]\(data:image/[^)]+\)', '', markdown)
248
- chat_response = self.client.chat.parse(
249
- model="pixtral-12b-latest",
250
- messages=[
251
- {
252
- "role": "user",
253
- "content": f"Convert OCR output to JSON with file_name, topics, languages, and ocr_contents.\n\nOCR Output:\n{text_only_markdown}"
254
- },
255
- ],
256
- response_format=StructuredOCR,
257
- temperature=0
258
- )
259
- structured_result = chat_response.choices[0].message.parsed
260
- json_str = structured_result.model_dump_json()
261
- return [json.loads(json_str)]
 
 
 
262
  except Exception as e:
263
- logger.error(f"Error converting to JSON: {str(e)}")
264
- return [{"error": str(e), "file_name": Path(file_path).stem}]
265
 
266
- def ocr_uploaded_pdf(self, pdf_file: Union[str, bytes, Path]) -> Tuple[str, List[str], List[Dict]]:
267
- file_path = self._save_uploaded_file(pdf_file, getattr(pdf_file, 'name', f"pdf_{int(time.time())}.pdf"))
268
- return self._process_pdf_with_ocr(file_path)
269
-
270
- def ocr_pdf_url(self, pdf_url: str) -> Tuple[str, List[str], List[Dict]]:
271
- file_path = self._save_uploaded_file(pdf_url, pdf_url.split('/')[-1] or f"pdf_{int(time.time())}.pdf")
272
- return self._process_pdf_with_ocr(file_path)
 
 
 
 
273
 
274
- def ocr_uploaded_image(self, image_file: Union[str, bytes, Path]) -> Tuple[str, str, Dict]:
275
- file_path = self._save_uploaded_file(image_file, getattr(image_file, 'name', f"image_{int(time.time())}.jpg"))
276
- encoded_image = self._encode_image(file_path)
277
- response = self._call_ocr_api(encoded_image)
278
- markdown, base64_images = self._get_combined_markdown(response)
279
- json_result = self._convert_to_structured_json(markdown, file_path)[0]
280
- return markdown, file_path, json_result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
 
282
  @staticmethod
283
  def _handle_error(context: str, error: Exception) -> str:
284
  logger.error(f"Error in {context}: {str(error)}")
285
  return f"**Error in {context}:** {str(error)}"
286
 
287
- def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
288
- for img_name, base64_str in images_dict.items():
289
- markdown_str = markdown_str.replace(f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})")
290
- return markdown_str
291
-
292
  def create_interface():
293
  css = """
294
  .output-markdown {font-size: 14px; max-height: 500px; overflow-y: auto;}
295
  .status {color: #666; font-style: italic;}
296
  """
297
 
298
- with gr.Blocks(title="Mistral OCR API Demo", css=css) as demo:
299
  gr.Markdown("# Mistral OCR App\nUpload images or PDFs, or provide a PDF URL for OCR processing")
300
 
301
  with gr.Row():
@@ -326,21 +291,17 @@ def create_interface():
326
  )
327
  image_preview = gr.Image(label="Preview", height=300)
328
  image_output = gr.Markdown(label="OCR Result", elem_classes="output-markdown")
329
- image_json_output = gr.JSON(label="Structured JSON Output")
330
  process_image_btn = gr.Button("Process Image", variant="primary")
331
 
332
  def process_image(processor, image):
333
- if not processor:
334
- return "Please set API key", None, {}
335
- if not image:
336
- return "Please upload an image", None, {}
337
- markdown, image_path, json_data = processor.ocr_uploaded_image(image)
338
- return markdown, image_path, json_data
339
 
340
  process_image_btn.click(
341
  fn=process_image,
342
  inputs=[processor_state, image_input],
343
- outputs=[image_output, image_preview, image_json_output]
344
  )
345
 
346
  with gr.Tab("PDF OCR"):
@@ -356,24 +317,24 @@ def create_interface():
356
  )
357
  pdf_gallery = gr.Gallery(label="PDF Pages", height=300)
358
  pdf_output = gr.Markdown(label="OCR Result", elem_classes="output-markdown")
359
- pdf_json_output = gr.JSON(label="Structured JSON Output")
360
  process_pdf_btn = gr.Button("Process PDF", variant="primary")
361
 
362
  def process_pdf(processor, pdf_file, pdf_url):
363
  if not processor:
364
- return "Please set API key", [], {}, "Please set API key"
365
- if pdf_file:
366
- markdown, image_paths, json_data = processor.ocr_uploaded_pdf(pdf_file)
 
 
367
  elif pdf_url and pdf_url.strip():
368
- markdown, image_paths, json_data = processor.ocr_pdf_url(pdf_url)
369
- else:
370
- return "Please upload a PDF or provide a URL", [], {}, "No input provided"
371
- return markdown, image_paths, json_data, "✅ Processing complete"
372
 
373
  process_pdf_btn.click(
374
  fn=process_pdf,
375
  inputs=[processor_state, pdf_input, pdf_url_input],
376
- outputs=[pdf_output, pdf_gallery, pdf_json_output, status]
377
  )
378
 
379
  return demo
@@ -383,4 +344,5 @@ if __name__ == "__main__":
383
  print(f"===== Application Startup at {os.environ['START_TIME']} =====")
384
  create_interface().launch(
385
  share=True,
 
386
  )
 
1
  import os
2
  import base64
3
  import gradio as gr
4
+ from mistralai import Mistral, ImageURLChunk
 
 
5
  from mistralai.models import OCRResponse
6
+ from typing import Union, List, Tuple
7
  import requests
8
  import shutil
9
  import time
 
11
  import logging
12
  from tenacity import retry, stop_after_attempt, wait_exponential
13
  from concurrent.futures import ThreadPoolExecutor
14
+ import socket
15
+ from requests.exceptions import ConnectionError, Timeout
 
 
 
16
 
17
  # Constants
18
  SUPPORTED_IMAGE_TYPES = [".jpg", ".png", ".jpeg"]
 
30
  )
31
  logger = logging.getLogger(__name__)
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  class OCRProcessor:
34
  def __init__(self, api_key: str):
35
  if not api_key or not isinstance(api_key, str):
36
  raise ValueError("Valid API key must be provided")
 
37
  self.client = Mistral(api_key=api_key)
38
  self._validate_client()
39
 
 
42
  models = self.client.models.list()
43
  if not models:
44
  raise ValueError("No models available")
 
45
  except Exception as e:
46
  raise ValueError(f"API key validation failed: {str(e)}")
47
 
48
  @staticmethod
49
+ def _check_file_size(file_input: Union[str, bytes]) -> None:
50
+ if isinstance(file_input, str) and os.path.exists(file_input):
51
  size = os.path.getsize(file_input)
52
  elif hasattr(file_input, 'read'):
53
  size = len(file_input.read())
 
58
  raise ValueError(f"File size exceeds {MAX_FILE_SIZE/1024/1024}MB limit")
59
 
60
  @staticmethod
61
+ def _save_uploaded_file(file_input: Union[str, bytes], filename: str) -> str:
62
  clean_filename = os.path.basename(filename).replace(os.sep, "_")
63
  file_path = os.path.join(UPLOAD_FOLDER, f"{int(time.time())}_{clean_filename}")
64
 
65
  try:
66
+ if isinstance(file_input, str) and file_input.startswith("http"):
67
  logger.info(f"Downloading from URL: {file_input}")
68
  response = requests.get(file_input, timeout=30)
69
  response.raise_for_status()
70
  with open(file_path, 'wb') as f:
71
  f.write(response.content)
72
+ elif isinstance(file_input, str) and os.path.exists(file_input):
73
  logger.info(f"Copying local file: {file_input}")
74
  shutil.copy2(file_input, file_path)
75
  else:
 
91
  def _encode_image(image_path: str) -> str:
92
  try:
93
  with open(image_path, "rb") as image_file:
94
+ return base64.b64encode(image_file.read()).decode('utf-8')
 
 
95
  except Exception as e:
96
  logger.error(f"Error encoding image {image_path}: {str(e)}")
97
+ raise ValueError("Failed to encode image")
98
 
99
  @staticmethod
100
  def _pdf_to_images(pdf_path: str) -> List[Tuple[str, str]]:
 
110
  range(pdf_document.page_count)
111
  ))
112
  pdf_document.close()
113
+ return [data for data in image_data if data]
 
 
 
 
114
  except Exception as e:
115
  logger.error(f"Error converting PDF to images: {str(e)}")
116
+ return []
117
 
118
  @staticmethod
119
  def _convert_page(pdf_path: str, page_num: int) -> Tuple[str, str]:
 
132
 
133
  @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
134
  def _call_ocr_api(self, encoded_image: str) -> OCRResponse:
 
 
 
135
  base64_url = f"data:image/png;base64,{encoded_image}"
136
  try:
137
+ logger.info("Calling OCR API")
138
  response = self.client.ocr.process(
 
139
  model="mistral-ocr-latest",
140
+ document=ImageURLChunk(image_url=base64_url),
141
  include_image_base64=True
142
  )
143
  logger.info("OCR API call successful")
144
  return response
145
+ except (ConnectionError, Timeout, socket.error) as e:
146
+ logger.error(f"Network error during OCR API call: {str(e)}")
 
 
147
  raise
148
 
149
+ def ocr_uploaded_pdf(self, pdf_file: Union[str, bytes]) -> Tuple[str, List[str]]:
150
+ file_name = getattr(pdf_file, 'name', f"pdf_{int(time.time())}.pdf")
151
+ logger.info(f"Processing uploaded PDF: {file_name}")
152
  try:
153
+ self._check_file_size(pdf_file)
154
+ pdf_path = self._save_uploaded_file(pdf_file, file_name)
155
+
156
+ if not os.path.exists(pdf_path):
157
+ raise FileNotFoundError(f"Saved PDF not found at: {pdf_path}")
158
+
159
+ image_data = self._pdf_to_images(pdf_path)
160
+ if not image_data:
161
+ raise ValueError("No pages converted from PDF")
162
+
163
+ ocr_results = []
164
+ image_paths = [path for path, _ in image_data]
165
+ for i, (_, encoded) in enumerate(image_data):
166
+ response = self._call_ocr_api(encoded)
167
+ markdown_with_images = self._get_combined_markdown_with_images(response, image_paths, i)
168
+ ocr_results.append(markdown_with_images)
169
+
170
+ return "\n\n".join(ocr_results), image_paths
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  except Exception as e:
172
+ return self._handle_error("uploaded PDF processing", e), []
173
 
174
+ def ocr_pdf_url(self, pdf_url: str) -> Tuple[str, List[str]]:
175
+ logger.info(f"Processing PDF URL: {pdf_url}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  try:
177
+ file_name = pdf_url.split('/')[-1] or f"pdf_{int(time.time())}.pdf"
178
+ pdf_path = self._save_uploaded_file(pdf_url, file_name)
179
+
180
+ if not os.path.exists(pdf_path):
181
+ raise FileNotFoundError(f"Saved PDF not found at: {pdf_path}")
182
+
183
+ image_data = self._pdf_to_images(pdf_path)
184
+ if not image_data:
185
+ raise ValueError("No pages converted from PDF")
186
+
187
+ ocr_results = []
188
+ image_paths = [path for path, _ in image_data]
189
+ for i, (_, encoded) in enumerate(image_data):
190
+ response = self._call_ocr_api(encoded)
191
+ markdown_with_images = self._get_combined_markdown_with_images(response, image_paths, i)
192
+ ocr_results.append(markdown_with_images)
193
+
194
+ return "\n\n".join(ocr_results), image_paths
195
  except Exception as e:
196
+ return self._handle_error("PDF URL processing", e), []
 
197
 
198
+ def ocr_uploaded_image(self, image_file: Union[str, bytes]) -> Tuple[str, str]:
199
+ file_name = getattr(image_file, 'name', f"image_{int(time.time())}.jpg")
200
+ logger.info(f"Processing uploaded image: {file_name}")
201
+ try:
202
+ self._check_file_size(image_file)
203
+ image_path = self._save_uploaded_file(image_file, file_name)
204
+ encoded_image = self._encode_image(image_path)
205
+ response = self._call_ocr_api(encoded_image)
206
+ return self._get_combined_markdown_with_images(response), image_path
207
+ except Exception as e:
208
+ return self._handle_error("image processing", e), None
209
 
210
+ @staticmethod
211
+ def _get_combined_markdown_with_images(response: OCRResponse, image_paths: List[str] = None, page_index: int = None) -> str:
212
+ markdown_parts = []
213
+ for i, page in enumerate(response.pages):
214
+ if page.markdown.strip():
215
+ markdown = page.markdown
216
+ logger.info(f"Page {i} markdown: {markdown}")
217
+ if hasattr(page, 'images') and page.images:
218
+ logger.info(f"Found {len(page.images)} images in page {i}")
219
+ for img in page.images:
220
+ if img.image_base64:
221
+ logger.info(f"Replacing image {img.id} with base64")
222
+ markdown = markdown.replace(
223
+ f"![{img.id}]({img.id})",
224
+ f"![{img.id}](data:image/png;base64,{img.image_base64})"
225
+ )
226
+ else:
227
+ logger.warning(f"No base64 data for image {img.id}")
228
+ if image_paths and page_index is not None and page_index < len(image_paths):
229
+ local_encoded = OCRProcessor._encode_image(image_paths[page_index])
230
+ markdown = markdown.replace(
231
+ f"![{img.id}]({img.id})",
232
+ f"![{img.id}](data:image/png;base64,{local_encoded})"
233
+ )
234
+ else:
235
+ logger.warning(f"No images found in page {i}")
236
+ # Replace known placeholders or append the local image
237
+ if image_paths and page_index is not None and page_index < len(image_paths):
238
+ local_encoded = OCRProcessor._encode_image(image_paths[page_index])
239
+ # Replace placeholders like img-0.jpeg
240
+ placeholder = f"img-{i}.jpeg"
241
+ if placeholder in markdown:
242
+ markdown = markdown.replace(
243
+ placeholder,
244
+ f"![Page {i} Image](data:image/png;base64,{local_encoded})"
245
+ )
246
+ else:
247
+ # Append the image if no placeholder is found
248
+ markdown += f"\n\n![Page {i} Image](data:image/png;base64,{local_encoded})"
249
+ markdown_parts.append(markdown)
250
+ return "\n\n".join(markdown_parts) or "No text or images detected"
251
 
252
  @staticmethod
253
  def _handle_error(context: str, error: Exception) -> str:
254
  logger.error(f"Error in {context}: {str(error)}")
255
  return f"**Error in {context}:** {str(error)}"
256
 
 
 
 
 
 
257
  def create_interface():
258
  css = """
259
  .output-markdown {font-size: 14px; max-height: 500px; overflow-y: auto;}
260
  .status {color: #666; font-style: italic;}
261
  """
262
 
263
+ with gr.Blocks(title="Mistral OCR App", css=css) as demo:
264
  gr.Markdown("# Mistral OCR App\nUpload images or PDFs, or provide a PDF URL for OCR processing")
265
 
266
  with gr.Row():
 
291
  )
292
  image_preview = gr.Image(label="Preview", height=300)
293
  image_output = gr.Markdown(label="OCR Result", elem_classes="output-markdown")
 
294
  process_image_btn = gr.Button("Process Image", variant="primary")
295
 
296
  def process_image(processor, image):
297
+ if not processor or not image:
298
+ return "Please set API key and upload an image", None
299
+ return processor.ocr_uploaded_image(image)
 
 
 
300
 
301
  process_image_btn.click(
302
  fn=process_image,
303
  inputs=[processor_state, image_input],
304
+ outputs=[image_output, image_preview]
305
  )
306
 
307
  with gr.Tab("PDF OCR"):
 
317
  )
318
  pdf_gallery = gr.Gallery(label="PDF Pages", height=300)
319
  pdf_output = gr.Markdown(label="OCR Result", elem_classes="output-markdown")
 
320
  process_pdf_btn = gr.Button("Process PDF", variant="primary")
321
 
322
  def process_pdf(processor, pdf_file, pdf_url):
323
  if not processor:
324
+ return "Please set API key first", []
325
+ logger.info(f"Received inputs - PDF file: {pdf_file}, PDF URL: {pdf_url}")
326
+ if pdf_file is not None and hasattr(pdf_file, 'name'):
327
+ logger.info(f"Processing as uploaded PDF: {pdf_file.name}")
328
+ return processor.ocr_uploaded_pdf(pdf_file)
329
  elif pdf_url and pdf_url.strip():
330
+ logger.info(f"Processing as PDF URL: {pdf_url}")
331
+ return processor.ocr_pdf_url(pdf_url)
332
+ return "Please upload a PDF or provide a valid URL", []
 
333
 
334
  process_pdf_btn.click(
335
  fn=process_pdf,
336
  inputs=[processor_state, pdf_input, pdf_url_input],
337
+ outputs=[pdf_output, pdf_gallery]
338
  )
339
 
340
  return demo
 
344
  print(f"===== Application Startup at {os.environ['START_TIME']} =====")
345
  create_interface().launch(
346
  share=True,
347
+ debug=True,
348
  )