Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -8,11 +8,18 @@ from enum import Enum
|
|
8 |
from pydantic import BaseModel
|
9 |
import pycountry
|
10 |
import json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
# Initialize Mistral client with API key
|
13 |
api_key = os.environ.get("MISTRAL_API_KEY")
|
14 |
if not api_key:
|
15 |
-
raise ValueError("
|
16 |
client = Mistral(api_key=api_key)
|
17 |
|
18 |
# Helper function to encode image to base64
|
@@ -21,68 +28,105 @@ def encode_image(image_path):
|
|
21 |
with open(image_path, "rb") as image_file:
|
22 |
return base64.b64encode(image_file.read()).decode('utf-8')
|
23 |
except Exception as e:
|
|
|
24 |
return f"Error encoding image: {str(e)}"
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
# OCR with PDF URL
|
27 |
def ocr_pdf_url(pdf_url):
|
|
|
28 |
try:
|
29 |
-
ocr_response =
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
return markdown
|
36 |
except Exception as e:
|
|
|
37 |
return f"**Error:** {str(e)}"
|
38 |
|
39 |
# OCR with Uploaded PDF
|
40 |
def ocr_uploaded_pdf(pdf_file):
|
|
|
|
|
41 |
try:
|
|
|
|
|
|
|
|
|
42 |
uploaded_pdf = client.files.upload(
|
43 |
-
file={"file_name":
|
44 |
purpose="ocr"
|
45 |
)
|
46 |
-
signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id, expiry=
|
47 |
-
ocr_response =
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
return markdown
|
54 |
except Exception as e:
|
|
|
55 |
return f"**Error:** {str(e)}"
|
|
|
|
|
|
|
56 |
|
57 |
# OCR with Image URL
|
58 |
def ocr_image_url(image_url):
|
|
|
59 |
try:
|
60 |
-
ocr_response =
|
61 |
-
|
62 |
-
|
63 |
-
)
|
64 |
-
|
|
|
65 |
return markdown
|
66 |
except Exception as e:
|
|
|
67 |
return f"**Error:** {str(e)}"
|
68 |
|
69 |
# OCR with Uploaded Image
|
70 |
def ocr_uploaded_image(image_file):
|
|
|
|
|
71 |
try:
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
|
|
|
|
|
|
|
|
|
|
80 |
return markdown
|
81 |
except Exception as e:
|
|
|
82 |
return f"**Error:** {str(e)}"
|
|
|
|
|
|
|
83 |
|
84 |
# Document Understanding
|
85 |
def document_understanding(doc_url, question):
|
|
|
86 |
try:
|
87 |
messages = [
|
88 |
{"role": "user", "content": [
|
@@ -90,12 +134,15 @@ def document_understanding(doc_url, question):
|
|
90 |
{"type": "document_url", "document_url": doc_url}
|
91 |
]}
|
92 |
]
|
93 |
-
chat_response =
|
94 |
-
|
95 |
-
|
96 |
-
)
|
97 |
-
|
|
|
|
|
98 |
except Exception as e:
|
|
|
99 |
return f"**Error:** {str(e)}"
|
100 |
|
101 |
# Structured OCR Setup
|
@@ -117,22 +164,25 @@ class StructuredOCR(BaseModel):
|
|
117 |
ocr_contents: dict
|
118 |
|
119 |
def structured_ocr(image_file):
|
|
|
|
|
120 |
try:
|
121 |
-
|
122 |
-
|
|
|
|
|
|
|
123 |
if "Error" in encoded_image:
|
124 |
-
|
125 |
base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
|
126 |
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
image_ocr_markdown = image_response.pages[0].markdown
|
133 |
|
134 |
-
|
135 |
-
chat_response = client.chat.complete(
|
136 |
model="pixtral-12b-latest",
|
137 |
messages=[{
|
138 |
"role": "user",
|
@@ -147,60 +197,73 @@ def structured_ocr(image_file):
|
|
147 |
response_format={"type": "json_object"},
|
148 |
temperature=0
|
149 |
)
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
"
|
157 |
-
|
158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
return f"```json\n{json.dumps(structured_response.dict(), indent=4)}\n```"
|
160 |
except Exception as e:
|
|
|
161 |
return f"**Error:** {str(e)}"
|
|
|
|
|
|
|
162 |
|
163 |
# Gradio Interface
|
164 |
with gr.Blocks(title="Mistral OCR & Structured Output App") as demo:
|
165 |
gr.Markdown("# Mistral OCR & Structured Output App")
|
166 |
-
gr.Markdown("Extract text from PDFs and images, ask questions about documents, or get structured JSON output
|
167 |
|
168 |
with gr.Tab("OCR with PDF URL"):
|
169 |
pdf_url_input = gr.Textbox(label="PDF URL", placeholder="e.g., https://arxiv.org/pdf/2201.04234")
|
170 |
-
pdf_url_output = gr.
|
171 |
pdf_url_button = gr.Button("Process PDF")
|
172 |
pdf_url_button.click(ocr_pdf_url, inputs=pdf_url_input, outputs=pdf_url_output)
|
173 |
|
174 |
with gr.Tab("OCR with Uploaded PDF"):
|
175 |
pdf_file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
|
176 |
-
pdf_file_output = gr.
|
177 |
pdf_file_button = gr.Button("Process Uploaded PDF")
|
178 |
pdf_file_button.click(ocr_uploaded_pdf, inputs=pdf_file_input, outputs=pdf_file_output)
|
179 |
|
180 |
with gr.Tab("OCR with Image URL"):
|
181 |
image_url_input = gr.Textbox(label="Image URL", placeholder="e.g., https://example.com/image.jpg")
|
182 |
-
image_url_output = gr.
|
183 |
image_url_button = gr.Button("Process Image")
|
184 |
image_url_button.click(ocr_image_url, inputs=image_url_input, outputs=image_url_output)
|
185 |
|
186 |
with gr.Tab("OCR with Uploaded Image"):
|
187 |
image_file_input = gr.File(label="Upload Image", file_types=[".jpg", ".png"])
|
188 |
-
image_file_output = gr.
|
189 |
image_file_button = gr.Button("Process Uploaded Image")
|
190 |
image_file_button.click(ocr_uploaded_image, inputs=image_file_input, outputs=image_file_output)
|
191 |
|
192 |
with gr.Tab("Document Understanding"):
|
193 |
doc_url_input = gr.Textbox(label="Document URL", placeholder="e.g., https://arxiv.org/pdf/1805.04770")
|
194 |
question_input = gr.Textbox(label="Question", placeholder="e.g., What is the last sentence?")
|
195 |
-
doc_output = gr.Textbox(label="Answer")
|
196 |
doc_button = gr.Button("Ask Question")
|
197 |
doc_button.click(document_understanding, inputs=[doc_url_input, question_input], outputs=doc_output)
|
198 |
|
199 |
with gr.Tab("Structured OCR"):
|
200 |
struct_image_input = gr.File(label="Upload Image", file_types=[".jpg", ".png"])
|
201 |
-
struct_output = gr.
|
202 |
struct_button = gr.Button("Get Structured Output")
|
203 |
struct_button.click(structured_ocr, inputs=struct_image_input, outputs=struct_output)
|
204 |
|
205 |
-
# Launch the app
|
206 |
demo.launch(share=True, debug=True)
|
|
|
8 |
from pydantic import BaseModel
|
9 |
import pycountry
|
10 |
import json
|
11 |
+
import logging
|
12 |
+
from tenacity import retry, stop_after_attempt, wait_fixed
|
13 |
+
import tempfile
|
14 |
+
|
15 |
+
# Logging: one module-level logger, INFO and above.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Mistral client: the API key is read from the environment, and the module
# refuses to load without it so that no request is sent unauthenticated.
api_key = os.getenv("MISTRAL_API_KEY")
if not api_key:
    raise ValueError("MISTRAL_API_KEY environment variable is not set. Please configure it.")
client = Mistral(api_key=api_key)
|
24 |
|
25 |
# Helper function to encode image to base64
|
|
|
28 |
with open(image_path, "rb") as image_file:
|
29 |
return base64.b64encode(image_file.read()).decode('utf-8')
|
30 |
except Exception as e:
|
31 |
+
logger.error(f"Error encoding image {image_path}: {str(e)}")
|
32 |
return f"Error encoding image: {str(e)}"
|
33 |
|
34 |
+
# Retry-enabled API call helpers
|
35 |
+
@retry(stop=stop_after_attempt(3), wait=wait_fixed(2), reraise=True)
def call_ocr_api(document):
    """Run Mistral OCR on ``document``, retrying when the call fails.

    Any exception raised by the API call triggers a retry: at most three
    attempts in total, with a fixed two-second wait between attempts.
    ``reraise=True`` makes the final failure surface as the original
    exception rather than tenacity's ``RetryError`` wrapper, so callers that
    show ``str(e)`` to the user report the real cause.

    Args:
        document: Document descriptor forwarded unchanged to
            ``client.ocr.process``. Callers in this file pass dicts with a
            ``type`` key plus a ``document_url`` or ``image_url`` key.

    Returns:
        Whatever ``client.ocr.process`` returns for the
        ``mistral-ocr-latest`` model.
    """
    return client.ocr.process(model="mistral-ocr-latest", document=document)
|
38 |
+
|
39 |
+
@retry(stop=stop_after_attempt(3), wait=wait_fixed(2), reraise=True)
def call_chat_complete(model, messages, **kwargs):
    """Call the Mistral chat-completion endpoint, retrying when it fails.

    Any exception raised by the API call triggers a retry: at most three
    attempts in total, with a fixed two-second wait between attempts.
    ``reraise=True`` makes the final failure surface as the original
    exception rather than tenacity's ``RetryError`` wrapper, so callers that
    show ``str(e)`` to the user report the real cause.

    Args:
        model: Model name passed to ``client.chat.complete``.
        messages: Chat messages passed to ``client.chat.complete``.
        **kwargs: Extra keyword arguments forwarded unchanged (callers in
            this file pass ``response_format`` and ``temperature``).

    Returns:
        Whatever ``client.chat.complete`` returns.
    """
    return client.chat.complete(model=model, messages=messages, **kwargs)
|
42 |
+
|
43 |
# OCR with PDF URL
|
44 |
def ocr_pdf_url(pdf_url):
    """OCR the PDF at ``pdf_url`` and return its text as Markdown.

    The Markdown of every page in the OCR response is returned, pages
    separated by a blank line. Earlier behavior returned only the first
    page, which silently dropped the rest of a multi-page PDF.

    Args:
        pdf_url: URL of the PDF, sent to the OCR API as a ``document_url``.

    Returns:
        The concatenated page Markdown, a fixed notice when the response
        has no usable pages, or a ``**Error:** ...`` string if the request
        fails. This function never raises.
    """
    logger.info(f"Processing PDF URL: {pdf_url}")
    try:
        ocr_response = call_ocr_api({"type": "document_url", "document_url": pdf_url})
        try:
            # Collect all pages; ``or ""`` keeps the join safe if a page
            # carries no markdown value.
            page_texts = [page.markdown or "" for page in ocr_response.pages]
        except (AttributeError, TypeError):
            # Response lacks ``pages`` (or it is not iterable).
            page_texts = []
        if page_texts:
            markdown = "\n\n".join(page_texts)
        else:
            markdown = "No text extracted or response invalid."
        logger.info("Successfully processed PDF URL")
        return markdown
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        logger.error(f"Error processing PDF URL: {str(e)}")
        return f"**Error:** {str(e)}"
|
57 |
|
58 |
# OCR with Uploaded PDF
|
59 |
def ocr_uploaded_pdf(pdf_file):
    """OCR an uploaded PDF and return its text as Markdown.

    The upload is copied to a private temporary file, sent to Mistral's file
    store, and OCR'd through a signed URL. The Markdown of every page is
    returned, pages separated by a blank line.

    Args:
        pdf_file: Value delivered by the ``gr.File`` input. Handled forms:
            ``None`` (nothing uploaded), a path string, or an object that
            exposes ``read()`` and/or a ``name`` path attribute.
            NOTE(review): which form Gradio delivers depends on its version
            and the component's ``type`` setting; confirm against the
            deployed version.

    Returns:
        The concatenated page Markdown, a fixed notice when the response
        has no usable pages, or a ``**Error:** ...`` string on failure.
        This function never raises.
    """
    if pdf_file is None:
        # Button clicked without an upload; previously this crashed on
        # ``pdf_file.name`` before the try block was entered.
        return "**Error:** No PDF file was uploaded."

    source_path = pdf_file if isinstance(pdf_file, str) else getattr(pdf_file, "name", None)
    logger.info(f"Processing uploaded PDF: {source_path}")
    temp_path = None
    try:
        # Copy the upload to a temp file we own so cleanup is under our control.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_path = temp_file.name
            if hasattr(pdf_file, "read"):
                temp_file.write(pdf_file.read())
            else:
                with open(source_path, "rb") as source_file:
                    temp_file.write(source_file.read())

        # Keep the handle in a ``with`` so it is closed before the temp file
        # is removed (the original left it open).
        with open(temp_path, "rb") as upload_handle:
            uploaded_pdf = client.files.upload(
                file={"file_name": temp_path, "content": upload_handle},
                purpose="ocr"
            )
        signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id, expiry=7200)
        ocr_response = call_ocr_api({"type": "document_url", "document_url": signed_url.url})
        try:
            # Collect all pages, not only the first one.
            page_texts = [page.markdown or "" for page in ocr_response.pages]
        except (AttributeError, TypeError):
            # Response lacks ``pages`` (or it is not iterable).
            page_texts = []
        if page_texts:
            markdown = "\n\n".join(page_texts)
        else:
            markdown = "No text extracted or response invalid."
        logger.info("Successfully processed uploaded PDF")
        return markdown
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        logger.error(f"Error processing uploaded PDF: {str(e)}")
        return f"**Error:** {str(e)}"
    finally:
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
|
85 |
|
86 |
# OCR with Image URL
|
87 |
def ocr_image_url(image_url):
    """OCR the image at ``image_url`` and return the first page's Markdown.

    Args:
        image_url: URL of the image, sent to the OCR API as an ``image_url``.

    Returns:
        The Markdown of the first page in the response, a fixed notice when
        the response has no first page or no ``markdown`` attribute, or a
        ``**Error:** ...`` string if the request fails. Never raises.
    """
    logger.info(f"Processing image URL: {image_url}")
    try:
        request = {"type": "image_url", "image_url": image_url}
        response = call_ocr_api(request)
        try:
            first_page = response.pages[0]
            text = first_page.markdown
        except (IndexError, AttributeError):
            text = "No text extracted or response invalid."
        logger.info("Successfully processed image URL")
        return text
    except Exception as exc:
        message = str(exc)
        logger.error(f"Error processing image URL: {message}")
        return f"**Error:** {message}"
|
100 |
|
101 |
# OCR with Uploaded Image
|
102 |
def ocr_uploaded_image(image_file):
    """OCR an uploaded image and return the first page's Markdown.

    The upload is copied to a private temporary file, base64-encoded with
    ``encode_image`` and sent to the OCR API as a ``data:`` URL.

    Args:
        image_file: Value delivered by the ``gr.File`` input. Handled forms:
            ``None`` (nothing uploaded), a path string, or an object that
            exposes ``read()`` and/or a ``name`` path attribute.
            NOTE(review): which form Gradio delivers depends on its version
            and the component's ``type`` setting; confirm against the
            deployed version.

    Returns:
        The Markdown of the first page, a fixed notice when the response
        has no usable page, or a ``**Error:** ...`` string on failure.
        This function never raises.
    """
    import mimetypes

    if image_file is None:
        # Button clicked without an upload; previously this crashed on
        # ``image_file.name`` before the try block was entered.
        return "**Error:** No image file was uploaded."

    source_path = image_file if isinstance(image_file, str) else getattr(image_file, "name", None)
    logger.info(f"Processing uploaded image: {source_path}")
    temp_path = None
    try:
        source_name = str(source_path or "")
        # Label the data URL with the real image type (.png uploads are
        # accepted by the UI); fall back to JPEG when it cannot be guessed.
        mime_type = mimetypes.guess_type(source_name)[0]
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"
        suffix = os.path.splitext(source_name)[1] or ".jpg"

        # Copy the upload to a temp file we own so cleanup is under our control.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            temp_path = temp_file.name
            if hasattr(image_file, "read"):
                temp_file.write(image_file.read())
            else:
                with open(source_path, "rb") as source_file:
                    temp_file.write(source_file.read())

        encoded_image = encode_image(temp_path)
        # encode_image reports failure as a string with this exact prefix.
        # Match the prefix, not the bare substring "Error": valid base64 can
        # contain those letters, but never a space or a colon.
        if encoded_image.startswith("Error encoding image:"):
            raise ValueError(encoded_image)
        base64_data_url = f"data:{mime_type};base64,{encoded_image}"
        ocr_response = call_ocr_api({"type": "image_url", "image_url": base64_data_url})
        try:
            markdown = ocr_response.pages[0].markdown
        except (IndexError, AttributeError):
            markdown = "No text extracted or response invalid."
        logger.info("Successfully processed uploaded image")
        return markdown
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        logger.error(f"Error processing uploaded image: {str(e)}")
        return f"**Error:** {str(e)}"
    finally:
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
|
126 |
|
127 |
# Document Understanding
|
128 |
def document_understanding(doc_url, question):
|
129 |
+
logger.info(f"Processing document understanding - URL: {doc_url}, Question: {question}")
|
130 |
try:
|
131 |
messages = [
|
132 |
{"role": "user", "content": [
|
|
|
134 |
{"type": "document_url", "document_url": doc_url}
|
135 |
]}
|
136 |
]
|
137 |
+
chat_response = call_chat_complete(model="mistral-small-latest", messages=messages)
|
138 |
+
try:
|
139 |
+
content = chat_response.choices[0].message.content
|
140 |
+
except (IndexError, AttributeError):
|
141 |
+
content = "No response received from the API."
|
142 |
+
logger.info("Successfully processed document understanding")
|
143 |
+
return content
|
144 |
except Exception as e:
|
145 |
+
logger.error(f"Error in document understanding: {str(e)}")
|
146 |
return f"**Error:** {str(e)}"
|
147 |
|
148 |
# Structured OCR Setup
|
|
|
164 |
ocr_contents: dict
|
165 |
|
166 |
def structured_ocr(image_file):
|
167 |
+
logger.info(f"Processing structured OCR for image: {image_file.name}")
|
168 |
+
temp_path = None
|
169 |
try:
|
170 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
|
171 |
+
temp_file.write(image_file.read())
|
172 |
+
temp_path = temp_file.name
|
173 |
+
image_path = Path(temp_path)
|
174 |
+
encoded_image = encode_image(temp_path)
|
175 |
if "Error" in encoded_image:
|
176 |
+
raise ValueError(encoded_image)
|
177 |
base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
|
178 |
|
179 |
+
image_response = call_ocr_api({"type": "image_url", "image_url": base64_data_url})
|
180 |
+
try:
|
181 |
+
image_ocr_markdown = image_response.pages[0].markdown
|
182 |
+
except (IndexError, AttributeError):
|
183 |
+
image_ocr_markdown = "No text extracted."
|
|
|
184 |
|
185 |
+
chat_response = call_chat_complete(
|
|
|
186 |
model="pixtral-12b-latest",
|
187 |
messages=[{
|
188 |
"role": "user",
|
|
|
197 |
response_format={"type": "json_object"},
|
198 |
temperature=0
|
199 |
)
|
200 |
+
|
201 |
+
try:
|
202 |
+
content = chat_response.choices[0].message.content
|
203 |
+
response_dict = json.loads(content)
|
204 |
+
except (json.JSONDecodeError, IndexError, AttributeError):
|
205 |
+
logger.error("Failed to parse structured response")
|
206 |
+
return "Failed to parse structured response. Please try again."
|
207 |
+
|
208 |
+
language_members = {member.value: member for member in Language}
|
209 |
+
valid_languages = [l for l in response_dict.get("languages", ["English"]) if l in language_members]
|
210 |
+
languages = [language_members[l] for l in valid_languages] if valid_languages else [Language.ENGLISH]
|
211 |
+
|
212 |
+
structured_response = StructuredOCR(
|
213 |
+
file_name=image_path.name,
|
214 |
+
topics=response_dict.get("topics", []),
|
215 |
+
languages=languages,
|
216 |
+
ocr_contents=response_dict.get("ocr_contents", {})
|
217 |
+
)
|
218 |
+
logger.info("Successfully processed structured OCR")
|
219 |
return f"```json\n{json.dumps(structured_response.dict(), indent=4)}\n```"
|
220 |
except Exception as e:
|
221 |
+
logger.error(f"Error processing structured OCR: {str(e)}")
|
222 |
return f"**Error:** {str(e)}"
|
223 |
+
finally:
|
224 |
+
if temp_path and os.path.exists(temp_path):
|
225 |
+
os.remove(temp_path)
|
226 |
|
227 |
# Gradio Interface
|
228 |
# One tab per feature; every tab wires a single button to its handler.
with gr.Blocks(title="Mistral OCR & Structured Output App") as demo:
    gr.Markdown("# Mistral OCR & Structured Output App")
    gr.Markdown("Extract text from PDFs and images, ask questions about documents, or get structured JSON output!")

    # PDF referenced by URL -> Markdown text
    with gr.Tab("OCR with PDF URL"):
        pdf_url_input = gr.Textbox(placeholder="e.g., https://arxiv.org/pdf/2201.04234", label="PDF URL")
        pdf_url_output = gr.Textbox(label="OCR Result (Markdown)")
        pdf_url_button = gr.Button("Process PDF")
        pdf_url_button.click(fn=ocr_pdf_url, inputs=pdf_url_input, outputs=pdf_url_output)

    # Uploaded PDF -> Markdown text
    with gr.Tab("OCR with Uploaded PDF"):
        pdf_file_input = gr.File(file_types=[".pdf"], label="Upload PDF")
        pdf_file_output = gr.Textbox(label="OCR Result (Markdown)")
        pdf_file_button = gr.Button("Process Uploaded PDF")
        pdf_file_button.click(fn=ocr_uploaded_pdf, inputs=pdf_file_input, outputs=pdf_file_output)

    # Image referenced by URL -> Markdown text
    with gr.Tab("OCR with Image URL"):
        image_url_input = gr.Textbox(placeholder="e.g., https://example.com/image.jpg", label="Image URL")
        image_url_output = gr.Textbox(label="OCR Result (Markdown)")
        image_url_button = gr.Button("Process Image")
        image_url_button.click(fn=ocr_image_url, inputs=image_url_input, outputs=image_url_output)

    # Uploaded image -> Markdown text
    with gr.Tab("OCR with Uploaded Image"):
        image_file_input = gr.File(file_types=[".jpg", ".png"], label="Upload Image")
        image_file_output = gr.Textbox(label="OCR Result (Markdown)")
        image_file_button = gr.Button("Process Uploaded Image")
        image_file_button.click(fn=ocr_uploaded_image, inputs=image_file_input, outputs=image_file_output)

    # Question answering over a document URL
    with gr.Tab("Document Understanding"):
        doc_url_input = gr.Textbox(placeholder="e.g., https://arxiv.org/pdf/1805.04770", label="Document URL")
        question_input = gr.Textbox(placeholder="e.g., What is the last sentence?", label="Question")
        doc_output = gr.Textbox(label="Answer")
        doc_button = gr.Button("Ask Question")
        doc_button.click(
            fn=document_understanding,
            inputs=[doc_url_input, question_input],
            outputs=doc_output,
        )

    # Uploaded image -> structured JSON
    with gr.Tab("Structured OCR"):
        struct_image_input = gr.File(file_types=[".jpg", ".png"], label="Upload Image")
        struct_output = gr.Textbox(label="Structured JSON Output")
        struct_button = gr.Button("Get Structured Output")
        struct_button.click(fn=structured_ocr, inputs=struct_image_input, outputs=struct_output)

# Start the app with a public share link and debug mode enabled.
demo.launch(share=True, debug=True)
|