Svngoku committed on
Commit
e3bc0c6
·
verified ·
1 Parent(s): fd84b98

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -62
app.py CHANGED
@@ -8,11 +8,18 @@ from enum import Enum
8
  from pydantic import BaseModel
9
  import pycountry
10
  import json
 
 
 
 
 
 
 
11
 
12
  # Initialize Mistral client with API key
13
  api_key = os.environ.get("MISTRAL_API_KEY")
14
  if not api_key:
15
- raise ValueError("Please set the MISTRAL_API_KEY environment variable.")
16
  client = Mistral(api_key=api_key)
17
 
18
  # Helper function to encode image to base64
@@ -21,68 +28,105 @@ def encode_image(image_path):
21
  with open(image_path, "rb") as image_file:
22
  return base64.b64encode(image_file.read()).decode('utf-8')
23
  except Exception as e:
 
24
  return f"Error encoding image: {str(e)}"
25
 
 
 
 
 
 
 
 
 
 
26
  # OCR with PDF URL
27
  def ocr_pdf_url(pdf_url):
 
28
  try:
29
- ocr_response = client.ocr.process(
30
- model="mistral-ocr-latest",
31
- document={"type": "document_url", "document_url": pdf_url},
32
- include_image_base64=True
33
- )
34
- markdown = ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
35
- return markdown # Return raw markdown for gr.Markdown to render
36
  except Exception as e:
 
37
  return f"**Error:** {str(e)}"
38
 
39
  # OCR with Uploaded PDF
40
  def ocr_uploaded_pdf(pdf_file):
 
 
41
  try:
 
 
 
 
42
  uploaded_pdf = client.files.upload(
43
- file={"file_name": pdf_file.name, "content": open(pdf_file.name, "rb")},
44
  purpose="ocr"
45
  )
46
- signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id, expiry=3600)
47
- ocr_response = client.ocr.process(
48
- model="mistral-ocr-latest",
49
- document={"type": "document_url", "document_url": signed_url.url},
50
- include_image_base64=True
51
- )
52
- markdown = ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
53
  return markdown
54
  except Exception as e:
 
55
  return f"**Error:** {str(e)}"
 
 
 
56
 
57
  # OCR with Image URL
58
  def ocr_image_url(image_url):
 
59
  try:
60
- ocr_response = client.ocr.process(
61
- model="mistral-ocr-latest",
62
- document={"type": "image_url", "image_url": image_url}
63
- )
64
- markdown = ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
 
65
  return markdown
66
  except Exception as e:
 
67
  return f"**Error:** {str(e)}"
68
 
69
  # OCR with Uploaded Image
70
  def ocr_uploaded_image(image_file):
 
 
71
  try:
72
- base64_image = encode_image(image_file.name)
73
- if "Error" in base64_image:
74
- return f"**Error:** {base64_image}"
75
- ocr_response = client.ocr.process(
76
- model="mistral-ocr-latest",
77
- document={"type": "image_url", "image_url": f"data:image/jpeg;base64,{base64_image}"}
78
- )
79
- markdown = ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
 
 
 
 
 
80
  return markdown
81
  except Exception as e:
 
82
  return f"**Error:** {str(e)}"
 
 
 
83
 
84
  # Document Understanding
85
  def document_understanding(doc_url, question):
 
86
  try:
87
  messages = [
88
  {"role": "user", "content": [
@@ -90,12 +134,15 @@ def document_understanding(doc_url, question):
90
  {"type": "document_url", "document_url": doc_url}
91
  ]}
92
  ]
93
- chat_response = client.chat.complete(
94
- model="mistral-small-latest",
95
- messages=messages
96
- )
97
- return chat_response.choices[0].message.content # Plain text output
 
 
98
  except Exception as e:
 
99
  return f"**Error:** {str(e)}"
100
 
101
  # Structured OCR Setup
@@ -117,22 +164,25 @@ class StructuredOCR(BaseModel):
117
  ocr_contents: dict
118
 
119
  def structured_ocr(image_file):
 
 
120
  try:
121
- image_path = Path(image_file.name)
122
- encoded_image = encode_image(image_path)
 
 
 
123
  if "Error" in encoded_image:
124
- return f"**Error:** {encoded_image}"
125
  base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
126
 
127
- # OCR processing
128
- image_response = client.ocr.process(
129
- document={"type": "image_url", "image_url": base64_data_url},
130
- model="mistral-ocr-latest"
131
- )
132
- image_ocr_markdown = image_response.pages[0].markdown
133
 
134
- # Structured output with pixtral-12b-latest
135
- chat_response = client.chat.complete(
136
  model="pixtral-12b-latest",
137
  messages=[{
138
  "role": "user",
@@ -147,60 +197,73 @@ def structured_ocr(image_file):
147
  response_format={"type": "json_object"},
148
  temperature=0
149
  )
150
-
151
- response_dict = json.loads(chat_response.choices[0].message.content)
152
- structured_response = StructuredOCR.parse_obj({
153
- "file_name": image_path.name,
154
- "topics": response_dict.get("topics", []),
155
- "languages": [Language[l] for l in response_dict.get("languages", ["English"]) if l in languages.values()],
156
- "ocr_contents": response_dict.get("ocr_contents", {})
157
- })
158
- # Return as Markdown code block
 
 
 
 
 
 
 
 
 
 
159
  return f"```json\n{json.dumps(structured_response.dict(), indent=4)}\n```"
160
  except Exception as e:
 
161
  return f"**Error:** {str(e)}"
 
 
 
162
 
163
  # Gradio Interface
164
  with gr.Blocks(title="Mistral OCR & Structured Output App") as demo:
165
  gr.Markdown("# Mistral OCR & Structured Output App")
166
- gr.Markdown("Extract text from PDFs and images, ask questions about documents, or get structured JSON output in Markdown format!")
167
 
168
  with gr.Tab("OCR with PDF URL"):
169
  pdf_url_input = gr.Textbox(label="PDF URL", placeholder="e.g., https://arxiv.org/pdf/2201.04234")
170
- pdf_url_output = gr.Markdown(label="OCR Result (Markdown)")
171
  pdf_url_button = gr.Button("Process PDF")
172
  pdf_url_button.click(ocr_pdf_url, inputs=pdf_url_input, outputs=pdf_url_output)
173
 
174
  with gr.Tab("OCR with Uploaded PDF"):
175
  pdf_file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
176
- pdf_file_output = gr.Markdown(label="OCR Result (Markdown)")
177
  pdf_file_button = gr.Button("Process Uploaded PDF")
178
  pdf_file_button.click(ocr_uploaded_pdf, inputs=pdf_file_input, outputs=pdf_file_output)
179
 
180
  with gr.Tab("OCR with Image URL"):
181
  image_url_input = gr.Textbox(label="Image URL", placeholder="e.g., https://example.com/image.jpg")
182
- image_url_output = gr.Markdown(label="OCR Result (Markdown)")
183
  image_url_button = gr.Button("Process Image")
184
  image_url_button.click(ocr_image_url, inputs=image_url_input, outputs=image_url_output)
185
 
186
  with gr.Tab("OCR with Uploaded Image"):
187
  image_file_input = gr.File(label="Upload Image", file_types=[".jpg", ".png"])
188
- image_file_output = gr.Markdown(label="OCR Result (Markdown)")
189
  image_file_button = gr.Button("Process Uploaded Image")
190
  image_file_button.click(ocr_uploaded_image, inputs=image_file_input, outputs=image_file_output)
191
 
192
  with gr.Tab("Document Understanding"):
193
  doc_url_input = gr.Textbox(label="Document URL", placeholder="e.g., https://arxiv.org/pdf/1805.04770")
194
  question_input = gr.Textbox(label="Question", placeholder="e.g., What is the last sentence?")
195
- doc_output = gr.Textbox(label="Answer") # Keep as Textbox for plain text
196
  doc_button = gr.Button("Ask Question")
197
  doc_button.click(document_understanding, inputs=[doc_url_input, question_input], outputs=doc_output)
198
 
199
  with gr.Tab("Structured OCR"):
200
  struct_image_input = gr.File(label="Upload Image", file_types=[".jpg", ".png"])
201
- struct_output = gr.Markdown(label="Structured JSON Output (Markdown)")
202
  struct_button = gr.Button("Get Structured Output")
203
  struct_button.click(structured_ocr, inputs=struct_image_input, outputs=struct_output)
204
 
205
- # Launch the app
206
  demo.launch(share=True, debug=True)
 
8
  from pydantic import BaseModel
9
  import pycountry
10
  import json
11
+ import logging
12
+ from tenacity import retry, stop_after_attempt, wait_fixed
13
+ import tempfile
14
+
15
+ # Set up logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
 
19
  # Initialize Mistral client with API key
20
  api_key = os.environ.get("MISTRAL_API_KEY")
21
  if not api_key:
22
+ raise ValueError("MISTRAL_API_KEY environment variable is not set. Please configure it.")
23
  client = Mistral(api_key=api_key)
24
 
25
  # Helper function to encode image to base64
 
28
  with open(image_path, "rb") as image_file:
29
  return base64.b64encode(image_file.read()).decode('utf-8')
30
  except Exception as e:
31
+ logger.error(f"Error encoding image {image_path}: {str(e)}")
32
  return f"Error encoding image: {str(e)}"
33
 
34
+ # Retry-enabled API call helpers
35
+ @retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
36
+ def call_ocr_api(document):
37
+ return client.ocr.process(model="mistral-ocr-latest", document=document)
38
+
39
+ @retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
40
+ def call_chat_complete(model, messages, **kwargs):
41
+ return client.chat.complete(model=model, messages=messages, **kwargs)
42
+
43
  # OCR with PDF URL
44
  def ocr_pdf_url(pdf_url):
45
+ logger.info(f"Processing PDF URL: {pdf_url}")
46
  try:
47
+ ocr_response = call_ocr_api({"type": "document_url", "document_url": pdf_url})
48
+ try:
49
+ markdown = ocr_response.pages[0].markdown
50
+ except (IndexError, AttributeError):
51
+ markdown = "No text extracted or response invalid."
52
+ logger.info("Successfully processed PDF URL")
53
+ return markdown
54
  except Exception as e:
55
+ logger.error(f"Error processing PDF URL: {str(e)}")
56
  return f"**Error:** {str(e)}"
57
 
58
  # OCR with Uploaded PDF
59
  def ocr_uploaded_pdf(pdf_file):
60
+ logger.info(f"Processing uploaded PDF: {pdf_file.name}")
61
+ temp_path = None
62
  try:
63
+ # Use tempfile to handle uploaded file securely
64
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
65
+ temp_file.write(pdf_file.read())
66
+ temp_path = temp_file.name
67
  uploaded_pdf = client.files.upload(
68
+ file={"file_name": temp_path, "content": open(temp_path, "rb")},
69
  purpose="ocr"
70
  )
71
+ signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id, expiry=7200) # Increased to 2 hours
72
+ ocr_response = call_ocr_api({"type": "document_url", "document_url": signed_url.url})
73
+ try:
74
+ markdown = ocr_response.pages[0].markdown
75
+ except (IndexError, AttributeError):
76
+ markdown = "No text extracted or response invalid."
77
+ logger.info("Successfully processed uploaded PDF")
78
  return markdown
79
  except Exception as e:
80
+ logger.error(f"Error processing uploaded PDF: {str(e)}")
81
  return f"**Error:** {str(e)}"
82
+ finally:
83
+ if temp_path and os.path.exists(temp_path):
84
+ os.remove(temp_path)
85
 
86
  # OCR with Image URL
87
  def ocr_image_url(image_url):
88
+ logger.info(f"Processing image URL: {image_url}")
89
  try:
90
+ ocr_response = call_ocr_api({"type": "image_url", "image_url": image_url})
91
+ try:
92
+ markdown = ocr_response.pages[0].markdown
93
+ except (IndexError, AttributeError):
94
+ markdown = "No text extracted or response invalid."
95
+ logger.info("Successfully processed image URL")
96
  return markdown
97
  except Exception as e:
98
+ logger.error(f"Error processing image URL: {str(e)}")
99
  return f"**Error:** {str(e)}"
100
 
101
  # OCR with Uploaded Image
102
  def ocr_uploaded_image(image_file):
103
+ logger.info(f"Processing uploaded image: {image_file.name}")
104
+ temp_path = None
105
  try:
106
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
107
+ temp_file.write(image_file.read())
108
+ temp_path = temp_file.name
109
+ encoded_image = encode_image(temp_path)
110
+ if "Error" in encoded_image:
111
+ raise ValueError(encoded_image)
112
+ base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
113
+ ocr_response = call_ocr_api({"type": "image_url", "image_url": base64_data_url})
114
+ try:
115
+ markdown = ocr_response.pages[0].markdown
116
+ except (IndexError, AttributeError):
117
+ markdown = "No text extracted or response invalid."
118
+ logger.info("Successfully processed uploaded image")
119
  return markdown
120
  except Exception as e:
121
+ logger.error(f"Error processing uploaded image: {str(e)}")
122
  return f"**Error:** {str(e)}"
123
+ finally:
124
+ if temp_path and os.path.exists(temp_path):
125
+ os.remove(temp_path)
126
 
127
  # Document Understanding
128
  def document_understanding(doc_url, question):
129
+ logger.info(f"Processing document understanding - URL: {doc_url}, Question: {question}")
130
  try:
131
  messages = [
132
  {"role": "user", "content": [
 
134
  {"type": "document_url", "document_url": doc_url}
135
  ]}
136
  ]
137
+ chat_response = call_chat_complete(model="mistral-small-latest", messages=messages)
138
+ try:
139
+ content = chat_response.choices[0].message.content
140
+ except (IndexError, AttributeError):
141
+ content = "No response received from the API."
142
+ logger.info("Successfully processed document understanding")
143
+ return content
144
  except Exception as e:
145
+ logger.error(f"Error in document understanding: {str(e)}")
146
  return f"**Error:** {str(e)}"
147
 
148
  # Structured OCR Setup
 
164
  ocr_contents: dict
165
 
166
  def structured_ocr(image_file):
167
+ logger.info(f"Processing structured OCR for image: {image_file.name}")
168
+ temp_path = None
169
  try:
170
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
171
+ temp_file.write(image_file.read())
172
+ temp_path = temp_file.name
173
+ image_path = Path(temp_path)
174
+ encoded_image = encode_image(temp_path)
175
  if "Error" in encoded_image:
176
+ raise ValueError(encoded_image)
177
  base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
178
 
179
+ image_response = call_ocr_api({"type": "image_url", "image_url": base64_data_url})
180
+ try:
181
+ image_ocr_markdown = image_response.pages[0].markdown
182
+ except (IndexError, AttributeError):
183
+ image_ocr_markdown = "No text extracted."
 
184
 
185
+ chat_response = call_chat_complete(
 
186
  model="pixtral-12b-latest",
187
  messages=[{
188
  "role": "user",
 
197
  response_format={"type": "json_object"},
198
  temperature=0
199
  )
200
+
201
+ try:
202
+ content = chat_response.choices[0].message.content
203
+ response_dict = json.loads(content)
204
+ except (json.JSONDecodeError, IndexError, AttributeError):
205
+ logger.error("Failed to parse structured response")
206
+ return "Failed to parse structured response. Please try again."
207
+
208
+ language_members = {member.value: member for member in Language}
209
+ valid_languages = [l for l in response_dict.get("languages", ["English"]) if l in language_members]
210
+ languages = [language_members[l] for l in valid_languages] if valid_languages else [Language.ENGLISH]
211
+
212
+ structured_response = StructuredOCR(
213
+ file_name=image_path.name,
214
+ topics=response_dict.get("topics", []),
215
+ languages=languages,
216
+ ocr_contents=response_dict.get("ocr_contents", {})
217
+ )
218
+ logger.info("Successfully processed structured OCR")
219
  return f"```json\n{json.dumps(structured_response.dict(), indent=4)}\n```"
220
  except Exception as e:
221
+ logger.error(f"Error processing structured OCR: {str(e)}")
222
  return f"**Error:** {str(e)}"
223
+ finally:
224
+ if temp_path and os.path.exists(temp_path):
225
+ os.remove(temp_path)
226
 
227
  # Gradio Interface
228
  with gr.Blocks(title="Mistral OCR & Structured Output App") as demo:
229
  gr.Markdown("# Mistral OCR & Structured Output App")
230
+ gr.Markdown("Extract text from PDFs and images, ask questions about documents, or get structured JSON output!")
231
 
232
  with gr.Tab("OCR with PDF URL"):
233
  pdf_url_input = gr.Textbox(label="PDF URL", placeholder="e.g., https://arxiv.org/pdf/2201.04234")
234
+ pdf_url_output = gr.Textbox(label="OCR Result (Markdown)")
235
  pdf_url_button = gr.Button("Process PDF")
236
  pdf_url_button.click(ocr_pdf_url, inputs=pdf_url_input, outputs=pdf_url_output)
237
 
238
  with gr.Tab("OCR with Uploaded PDF"):
239
  pdf_file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
240
+ pdf_file_output = gr.Textbox(label="OCR Result (Markdown)")
241
  pdf_file_button = gr.Button("Process Uploaded PDF")
242
  pdf_file_button.click(ocr_uploaded_pdf, inputs=pdf_file_input, outputs=pdf_file_output)
243
 
244
  with gr.Tab("OCR with Image URL"):
245
  image_url_input = gr.Textbox(label="Image URL", placeholder="e.g., https://example.com/image.jpg")
246
+ image_url_output = gr.Textbox(label="OCR Result (Markdown)")
247
  image_url_button = gr.Button("Process Image")
248
  image_url_button.click(ocr_image_url, inputs=image_url_input, outputs=image_url_output)
249
 
250
  with gr.Tab("OCR with Uploaded Image"):
251
  image_file_input = gr.File(label="Upload Image", file_types=[".jpg", ".png"])
252
+ image_file_output = gr.Textbox(label="OCR Result (Markdown)")
253
  image_file_button = gr.Button("Process Uploaded Image")
254
  image_file_button.click(ocr_uploaded_image, inputs=image_file_input, outputs=image_file_output)
255
 
256
  with gr.Tab("Document Understanding"):
257
  doc_url_input = gr.Textbox(label="Document URL", placeholder="e.g., https://arxiv.org/pdf/1805.04770")
258
  question_input = gr.Textbox(label="Question", placeholder="e.g., What is the last sentence?")
259
+ doc_output = gr.Textbox(label="Answer")
260
  doc_button = gr.Button("Ask Question")
261
  doc_button.click(document_understanding, inputs=[doc_url_input, question_input], outputs=doc_output)
262
 
263
  with gr.Tab("Structured OCR"):
264
  struct_image_input = gr.File(label="Upload Image", file_types=[".jpg", ".png"])
265
+ struct_output = gr.Textbox(label="Structured JSON Output")
266
  struct_button = gr.Button("Get Structured Output")
267
  struct_button.click(structured_ocr, inputs=struct_image_input, outputs=struct_output)
268
 
 
269
  demo.launch(share=True, debug=True)