Bobholamovic
committed on
Commit
·
b33797d
1
Parent(s):
455679a
[Feat] Another big update
Browse files
app.py
CHANGED
@@ -3,6 +3,7 @@ import base64
|
|
3 |
import io
|
4 |
import json
|
5 |
import os
|
|
|
6 |
import tempfile
|
7 |
import uuid
|
8 |
import zipfile
|
@@ -14,7 +15,7 @@ from PIL import Image
|
|
14 |
|
15 |
# API Configuration
|
16 |
API_URL = "https://cf38vaydqdl2l4p2.aistudio-hub.baidu.com/layout-parsing"
|
17 |
-
TOKEN = os.getenv("API_TOKEN")
|
18 |
|
19 |
LOGO_PATH = Path(__file__).parent / "pp-structurev3.png"
|
20 |
with open(LOGO_PATH, "rb") as image_file:
|
@@ -180,7 +181,56 @@ def embed_images_into_markdown_text(markdown_text, markdown_images):
|
|
180 |
return markdown_text
|
181 |
|
182 |
|
183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
"""Process uploaded file with API"""
|
185 |
try:
|
186 |
if not file_path:
|
@@ -204,7 +254,15 @@ def process_file(file_path):
|
|
204 |
|
205 |
response = requests.post(
|
206 |
API_URL,
|
207 |
-
json={
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
208 |
headers=headers,
|
209 |
timeout=1000,
|
210 |
)
|
@@ -237,6 +295,10 @@ def process_file(file_path):
|
|
237 |
)
|
238 |
markdown_content_list.append(markdown_content)
|
239 |
|
|
|
|
|
|
|
|
|
240 |
return {
|
241 |
"original_file": file_path,
|
242 |
"file_type": file_type,
|
@@ -244,6 +306,7 @@ def process_file(file_path):
|
|
244 |
"markdown_texts": markdown_texts,
|
245 |
"markdown_images": markdown_images,
|
246 |
"markdown_content_list": markdown_content_list,
|
|
|
247 |
"input_images": input_images,
|
248 |
"api_response": result,
|
249 |
}
|
@@ -318,9 +381,31 @@ with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
|
|
318 |
file_types=[".pdf", ".jpg", ".jpeg", ".png"],
|
319 |
type="filepath",
|
320 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
321 |
process_btn = gr.Button("Analyze Document", variant="primary")
|
322 |
gr.Markdown(
|
323 |
-
f"
|
|
|
|
|
|
|
324 |
)
|
325 |
|
326 |
loading_spinner = gr.Column(visible=False, elem_classes=["loader-container"])
|
@@ -335,28 +420,31 @@ with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
|
|
335 |
# Results display section
|
336 |
with gr.Column():
|
337 |
gr.Markdown("### Results")
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
|
|
347 |
)
|
348 |
-
|
349 |
-
markdown_display_list
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
|
|
|
|
|
|
358 |
)
|
359 |
-
)
|
360 |
|
361 |
# Download section
|
362 |
with gr.Column(elem_classes=["download-section"]):
|
@@ -366,43 +454,65 @@ with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
|
|
366 |
|
367 |
# Interaction logic
|
368 |
def toggle_spinner():
|
369 |
-
return gr.
|
370 |
|
371 |
def hide_spinner():
|
372 |
-
return gr.
|
373 |
|
374 |
-
def update_display(results):
|
|
|
|
|
|
|
|
|
|
|
|
|
375 |
ret_img = []
|
376 |
-
|
377 |
-
|
378 |
-
for
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
387 |
return ret_img + ret_cont
|
388 |
|
389 |
process_btn.click(toggle_spinner, outputs=[loading_spinner]).then(
|
390 |
-
process_file,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
391 |
).then(hide_spinner, outputs=[loading_spinner]).then(
|
392 |
update_display,
|
393 |
-
inputs=[results_state],
|
394 |
outputs=layout_ordering_images + markdown_display_list,
|
395 |
)
|
396 |
|
397 |
download_all_btn.click(
|
398 |
export_full_results, inputs=[results_state], outputs=[download_file]
|
399 |
-
).success(lambda: gr.
|
400 |
|
401 |
|
402 |
if __name__ == "__main__":
|
403 |
demo.launch(
|
404 |
server_name="0.0.0.0",
|
405 |
-
server_port=
|
406 |
-
share=True,
|
407 |
favicon_path=LOGO_PATH,
|
408 |
)
|
|
|
3 |
import io
|
4 |
import json
|
5 |
import os
|
6 |
+
import re
|
7 |
import tempfile
|
8 |
import uuid
|
9 |
import zipfile
|
|
|
15 |
|
16 |
# API Configuration
|
17 |
API_URL = "https://cf38vaydqdl2l4p2.aistudio-hub.baidu.com/layout-parsing"
|
18 |
+
TOKEN = os.getenv("API_TOKEN", "")
|
19 |
|
20 |
LOGO_PATH = Path(__file__).parent / "pp-structurev3.png"
|
21 |
with open(LOGO_PATH, "rb") as image_file:
|
|
|
181 |
return markdown_text
|
182 |
|
183 |
|
184 |
+
# HACK: Adapted from PaddleX 3.0.0 code
|
185 |
+
def concatenate_markdown_pages(markdown_list):
    """Join per-page Markdown results into a single Markdown string.

    Each page is attached to the text accumulated so far with one of three
    separators:

    * nothing, for the first non-empty output (no leading separator);
    * a paragraph break ("\\n\\n") when the page starts a new paragraph or
      the previous page ended one;
    * otherwise the page continues the previous paragraph: a single space,
      or no separator at all when either character at the seam is a CJK
      ideograph (U+4E00-U+9FFF).

    Args:
        markdown_list: Iterable of dicts, one per page, each providing
            ``"text"`` (the page Markdown), ``"isStart"`` (the page's first
            element starts a paragraph) and ``"isEnd"`` (the page's last
            element ends a paragraph).

    Returns:
        The concatenated Markdown text; an empty string for no pages.

    Raises:
        KeyError: If a page dict lacks one of the three keys.
    """
    # Compiled once instead of re-resolving the pattern for every page.
    cjk_char = re.compile(r"[\u4e00-\u9fff]")

    pieces = []
    last_char = ""  # last character emitted so far; "" while output is empty
    previous_page_ended_paragraph = True

    for page in markdown_list:
        text = page["text"]
        continues_paragraph = (
            not page["isStart"] and not previous_page_ended_paragraph
        )

        if not last_char:
            # Nothing emitted yet: a separator here would only produce
            # leading blank lines / a leading space in the result.
            separator = ""
        elif continues_paragraph:
            # ``match`` anchors at position 0, so passing the whole page
            # text tests only its first character.
            seam_is_cjk = cjk_char.match(last_char) or cjk_char.match(text)
            separator = "" if seam_is_cjk else " "
        else:
            separator = "\n\n"

        chunk = separator + text
        if chunk:
            pieces.append(chunk)
            last_char = chunk[-1]
        previous_page_ended_paragraph = page["isEnd"]

    return "".join(pieces)
|
224 |
+
|
225 |
+
|
226 |
+
def process_file(
|
227 |
+
file_path,
|
228 |
+
use_formula_recognition,
|
229 |
+
use_chart_recognition,
|
230 |
+
use_doc_orientation_classify,
|
231 |
+
use_doc_unwarping,
|
232 |
+
use_textline_orientation,
|
233 |
+
):
|
234 |
"""Process uploaded file with API"""
|
235 |
try:
|
236 |
if not file_path:
|
|
|
254 |
|
255 |
response = requests.post(
|
256 |
API_URL,
|
257 |
+
json={
|
258 |
+
"file": file_data,
|
259 |
+
"fileType": 0 if file_type == "pdf" else 1,
|
260 |
+
"useFormulaRecognition": use_formula_recognition,
|
261 |
+
"useChartRecognition": use_chart_recognition,
|
262 |
+
"useDocOrientationClassify": use_doc_orientation_classify,
|
263 |
+
"useDocUnwarping": use_doc_unwarping,
|
264 |
+
"useTextlineOrientation": use_textline_orientation,
|
265 |
+
},
|
266 |
headers=headers,
|
267 |
timeout=1000,
|
268 |
)
|
|
|
295 |
)
|
296 |
markdown_content_list.append(markdown_content)
|
297 |
|
298 |
+
concatenated_markdown_content = concatenate_markdown_pages(
|
299 |
+
[res["markdown"] for res in layout_results]
|
300 |
+
)
|
301 |
+
|
302 |
return {
|
303 |
"original_file": file_path,
|
304 |
"file_type": file_type,
|
|
|
306 |
"markdown_texts": markdown_texts,
|
307 |
"markdown_images": markdown_images,
|
308 |
"markdown_content_list": markdown_content_list,
|
309 |
+
"concatenated_markdown_content": concatenated_markdown_content,
|
310 |
"input_images": input_images,
|
311 |
"api_response": result,
|
312 |
}
|
|
|
381 |
file_types=[".pdf", ".jpg", ".jpeg", ".png"],
|
382 |
type="filepath",
|
383 |
)
|
384 |
+
with gr.Row():
|
385 |
+
use_formula_recognition_cb = gr.Checkbox(
|
386 |
+
value=True, label="Use formula recognition"
|
387 |
+
)
|
388 |
+
use_chart_recognition_cb = gr.Checkbox(
|
389 |
+
value=False, label="Use chart recognition"
|
390 |
+
)
|
391 |
+
with gr.Row():
|
392 |
+
use_doc_orientation_classify_cb = gr.Checkbox(
|
393 |
+
value=False, label="Use document image orientation classification"
|
394 |
+
)
|
395 |
+
use_doc_unwarping_cb = gr.Checkbox(
|
396 |
+
value=False, label="Use text image unwarping"
|
397 |
+
)
|
398 |
+
with gr.Row():
|
399 |
+
use_textline_orientation_cb = gr.Checkbox(
|
400 |
+
value=False, label="Use text line orientation classification"
|
401 |
+
)
|
402 |
+
concatenate_pages_cb = gr.Checkbox(value=True, label="Concatenate pages")
|
403 |
process_btn = gr.Button("Analyze Document", variant="primary")
|
404 |
gr.Markdown(
|
405 |
+
f"""
|
406 |
+
1. Only the first {MAX_NUM_PAGES} pages will be processed.
|
407 |
+
2. Some formulas might not display correctly because of renderer limitations.
|
408 |
+
"""
|
409 |
)
|
410 |
|
411 |
loading_spinner = gr.Column(visible=False, elem_classes=["loader-container"])
|
|
|
420 |
# Results display section
|
421 |
with gr.Column():
|
422 |
gr.Markdown("### Results")
|
423 |
+
with gr.Row():
|
424 |
+
with gr.Column():
|
425 |
+
layout_ordering_images = []
|
426 |
+
for i in range(MAX_NUM_PAGES):
|
427 |
+
layout_ordering_images.append(
|
428 |
+
gr.Image(
|
429 |
+
label=f"Layout Ordering Image {i}",
|
430 |
+
show_label=True,
|
431 |
+
visible=False,
|
432 |
+
)
|
433 |
)
|
434 |
+
with gr.Column():
|
435 |
+
markdown_display_list = []
|
436 |
+
for i in range(MAX_NUM_PAGES):
|
437 |
+
markdown_display_list.append(
|
438 |
+
gr.Markdown(
|
439 |
+
visible=False,
|
440 |
+
container=True,
|
441 |
+
show_copy_button=True,
|
442 |
+
latex_delimiters=[
|
443 |
+
{"left": "$$", "right": "$$", "display": True},
|
444 |
+
{"left": "$", "right": "$", "display": False},
|
445 |
+
],
|
446 |
+
)
|
447 |
)
|
|
|
448 |
|
449 |
# Download section
|
450 |
with gr.Column(elem_classes=["download-section"]):
|
|
|
454 |
|
455 |
# Interaction logic
|
456 |
def toggle_spinner():
    """Return a ``gr.Column`` update with ``visible=True`` (shows the spinner)."""
    spinner_update = gr.Column(visible=True)
    return spinner_update
|
458 |
|
459 |
def hide_spinner():
    """Return a ``gr.Column`` update with ``visible=False`` (hides the spinner)."""
    spinner_update = gr.Column(visible=False)
    return spinner_update
|
461 |
|
462 |
+
def update_display(results, concatenate_pages):
    """Build component updates for the layout images and Markdown panes.

    Args:
        results: Result dictionary from processing. Reads
            ``"layout_ordering_images"`` and, depending on the mode,
            ``"concatenated_markdown_content"`` or
            ``"markdown_content_list"``. A falsy value means there is
            nothing to show.
        concatenate_pages: When truthy, the concatenated Markdown is shown
            in the first pane only; otherwise each page gets its own pane.

    Returns:
        ``gr.skip()`` when ``results`` is falsy. Otherwise a list of
        ``gr.Image`` updates followed by ``gr.Markdown`` updates, each group
        padded with hidden components up to ``MAX_NUM_PAGES``.
    """
    if not results:
        return gr.skip()

    page_images = results["layout_ordering_images"]
    assert len(page_images) <= MAX_NUM_PAGES, len(page_images)

    image_updates = [
        gr.Image(value=bytes_to_image(raw), visible=True) for raw in page_images
    ]
    # Pad with hidden components so every output slot receives an update.
    image_updates.extend(
        gr.Image(visible=False) for _ in range(len(page_images), MAX_NUM_PAGES)
    )

    if concatenate_pages:
        # Single document: only the first Markdown pane carries content.
        shown_markdown = [results["concatenated_markdown_content"]]
    else:
        shown_markdown = results["markdown_content_list"]
        assert len(shown_markdown) <= MAX_NUM_PAGES, len(shown_markdown)

    markdown_updates = [
        gr.Markdown(value=body, visible=True) for body in shown_markdown
    ]
    markdown_updates.extend(
        gr.Markdown(visible=False)
        for _ in range(len(shown_markdown), MAX_NUM_PAGES)
    )

    return image_updates + markdown_updates
|
490 |
|
491 |
process_btn.click(toggle_spinner, outputs=[loading_spinner]).then(
|
492 |
+
process_file,
|
493 |
+
inputs=[
|
494 |
+
file_input,
|
495 |
+
use_formula_recognition_cb,
|
496 |
+
use_chart_recognition_cb,
|
497 |
+
use_doc_orientation_classify_cb,
|
498 |
+
use_doc_unwarping_cb,
|
499 |
+
use_textline_orientation_cb,
|
500 |
+
],
|
501 |
+
outputs=[results_state],
|
502 |
).then(hide_spinner, outputs=[loading_spinner]).then(
|
503 |
update_display,
|
504 |
+
inputs=[results_state, concatenate_pages_cb],
|
505 |
outputs=layout_ordering_images + markdown_display_list,
|
506 |
)
|
507 |
|
508 |
download_all_btn.click(
|
509 |
export_full_results, inputs=[results_state], outputs=[download_file]
|
510 |
+
).success(lambda: gr.File(visible=True), outputs=[download_file])
|
511 |
|
512 |
|
513 |
if __name__ == "__main__":
    # Launch settings gathered in one place: listen on every interface, use
    # a fixed port, and serve the project logo as the favicon.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 8860,
        "favicon_path": LOGO_PATH,
    }
    demo.launch(**launch_options)
|