Bobholamovic committed on
Commit
b33797d
·
1 Parent(s): 455679a

[Feat] Another big update

Browse files
Files changed (1) hide show
  1. app.py +153 -43
app.py CHANGED
@@ -3,6 +3,7 @@ import base64
3
  import io
4
  import json
5
  import os
 
6
  import tempfile
7
  import uuid
8
  import zipfile
@@ -14,7 +15,7 @@ from PIL import Image
14
 
15
  # API Configuration
16
  API_URL = "https://cf38vaydqdl2l4p2.aistudio-hub.baidu.com/layout-parsing"
17
- TOKEN = os.getenv("API_TOKEN")
18
 
19
  LOGO_PATH = Path(__file__).parent / "pp-structurev3.png"
20
  with open(LOGO_PATH, "rb") as image_file:
@@ -180,7 +181,56 @@ def embed_images_into_markdown_text(markdown_text, markdown_images):
180
  return markdown_text
181
 
182
 
183
- def process_file(file_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  """Process uploaded file with API"""
185
  try:
186
  if not file_path:
@@ -204,7 +254,15 @@ def process_file(file_path):
204
 
205
  response = requests.post(
206
  API_URL,
207
- json={"file": file_data, "fileType": 0 if file_type == "pdf" else 1},
 
 
 
 
 
 
 
 
208
  headers=headers,
209
  timeout=1000,
210
  )
@@ -237,6 +295,10 @@ def process_file(file_path):
237
  )
238
  markdown_content_list.append(markdown_content)
239
 
 
 
 
 
240
  return {
241
  "original_file": file_path,
242
  "file_type": file_type,
@@ -244,6 +306,7 @@ def process_file(file_path):
244
  "markdown_texts": markdown_texts,
245
  "markdown_images": markdown_images,
246
  "markdown_content_list": markdown_content_list,
 
247
  "input_images": input_images,
248
  "api_response": result,
249
  }
@@ -318,9 +381,31 @@ with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
318
  file_types=[".pdf", ".jpg", ".jpeg", ".png"],
319
  type="filepath",
320
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  process_btn = gr.Button("Analyze Document", variant="primary")
322
  gr.Markdown(
323
- f"*Please note that only the first {MAX_NUM_PAGES} pages will be processed.*"
 
 
 
324
  )
325
 
326
  loading_spinner = gr.Column(visible=False, elem_classes=["loader-container"])
@@ -335,28 +420,31 @@ with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
335
  # Results display section
336
  with gr.Column():
337
  gr.Markdown("### Results")
338
- layout_ordering_images = []
339
- markdown_display_list = []
340
- for i in range(MAX_NUM_PAGES):
341
- with gr.Row():
342
- layout_ordering_images.append(
343
- gr.Image(
344
- label=f"Layout Ordering Image {i}",
345
- show_label=True,
346
- visible=False,
 
347
  )
348
- )
349
- markdown_display_list.append(
350
- gr.Markdown(
351
- visible=False,
352
- container=True,
353
- show_copy_button=True,
354
- latex_delimiters=[
355
- {"left": "$$", "right": "$$", "display": True},
356
- {"left": "$", "right": "$", "display": False},
357
- ],
 
 
 
358
  )
359
- )
360
 
361
  # Download section
362
  with gr.Column(elem_classes=["download-section"]):
@@ -366,43 +454,65 @@ with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
366
 
367
  # Interaction logic
368
  def toggle_spinner():
369
- return gr.update(visible=True)
370
 
371
  def hide_spinner():
372
- return gr.update(visible=False)
373
 
374
- def update_display(results):
 
 
 
 
 
 
375
  ret_img = []
376
- ret_cont = []
377
- cnt = 0
378
- for img, cont in zip(
379
- results["layout_ordering_images"], results["markdown_content_list"]
380
- ):
381
- ret_img.append(gr.update(value=bytes_to_image(img), visible=True))
382
- ret_cont.append(gr.update(value=cont, visible=True))
383
- cnt += 1
384
- for _ in range(cnt, MAX_NUM_PAGES):
385
- ret_img.append(gr.update(visible=False))
386
- ret_cont.append(gr.update(visible=False))
 
 
 
 
 
 
 
 
387
  return ret_img + ret_cont
388
 
389
  process_btn.click(toggle_spinner, outputs=[loading_spinner]).then(
390
- process_file, inputs=[file_input], outputs=[results_state]
 
 
 
 
 
 
 
 
 
391
  ).then(hide_spinner, outputs=[loading_spinner]).then(
392
  update_display,
393
- inputs=[results_state],
394
  outputs=layout_ordering_images + markdown_display_list,
395
  )
396
 
397
  download_all_btn.click(
398
  export_full_results, inputs=[results_state], outputs=[download_file]
399
- ).success(lambda: gr.update(visible=True), outputs=[download_file])
400
 
401
 
402
  if __name__ == "__main__":
403
  demo.launch(
404
  server_name="0.0.0.0",
405
- server_port=7860,
406
- share=True,
407
  favicon_path=LOGO_PATH,
408
  )
 
3
  import io
4
  import json
5
  import os
6
+ import re
7
  import tempfile
8
  import uuid
9
  import zipfile
 
15
 
16
  # API Configuration
17
  API_URL = "https://cf38vaydqdl2l4p2.aistudio-hub.baidu.com/layout-parsing"
18
+ TOKEN = os.getenv("API_TOKEN", "")
19
 
20
  LOGO_PATH = Path(__file__).parent / "pp-structurev3.png"
21
  with open(LOGO_PATH, "rb") as image_file:
 
181
  return markdown_text
182
 
183
 
184
# HACK: Adapted from PaddleX 3.0.0 code
def concatenate_markdown_pages(markdown_list):
    """Join per-page markdown results into a single markdown string.

    Args:
        markdown_list: Iterable of per-page mappings. Each mapping must
            provide ``"text"`` (the page's markdown string) and the flags
            ``"isStart"`` / ``"isEnd"``, which are read here as "the page's
            first element starts a paragraph" and "the page's last element
            ends a paragraph".

    Returns:
        The concatenated markdown. Pages are separated by a blank line,
        except when a page continues the paragraph left open by the previous
        page: then the texts are joined with a single space, or with nothing
        when either character at the seam is a CJK ideograph. No separator
        is emitted before the first piece of text, so the result never
        starts with a spurious paragraph break.

    Raises:
        KeyError: If a page mapping lacks ``"text"``, ``"isStart"`` or
            ``"isEnd"``.
    """
    # Compiled once; the pattern is applied to single characters at the seam.
    cjk_char = re.compile(r"[\u4e00-\u9fff]")

    pieces = []
    last_char = ""  # Last character emitted so far; "" until any text exists.
    previous_page_ended_paragraph = True

    for page in markdown_list:
        text = page["text"]
        page_starts_paragraph = page["isStart"]
        page_ends_paragraph = page["isEnd"]

        if not last_char:
            # Nothing has been emitted yet, so there is nothing to separate.
            separator = ""
        elif page_starts_paragraph or previous_page_ended_paragraph:
            separator = "\n\n"
        elif cjk_char.match(last_char) or cjk_char.match(text[:1]):
            # CJK text is not space-delimited; glue the halves together.
            separator = ""
        else:
            separator = " "

        piece = separator + text
        if piece:
            pieces.append(piece)
            last_char = piece[-1]
        previous_page_ended_paragraph = page_ends_paragraph

    return "".join(pieces)
224
+
225
+
226
+ def process_file(
227
+ file_path,
228
+ use_formula_recognition,
229
+ use_chart_recognition,
230
+ use_doc_orientation_classify,
231
+ use_doc_unwarping,
232
+ use_textline_orientation,
233
+ ):
234
  """Process uploaded file with API"""
235
  try:
236
  if not file_path:
 
254
 
255
  response = requests.post(
256
  API_URL,
257
+ json={
258
+ "file": file_data,
259
+ "fileType": 0 if file_type == "pdf" else 1,
260
+ "useFormulaRecognition": use_formula_recognition,
261
+ "useChartRecognition": use_chart_recognition,
262
+ "useDocOrientationClassify": use_doc_orientation_classify,
263
+ "useDocUnwarping": use_doc_unwarping,
264
+ "useTextlineOrientation": use_textline_orientation,
265
+ },
266
  headers=headers,
267
  timeout=1000,
268
  )
 
295
  )
296
  markdown_content_list.append(markdown_content)
297
 
298
+ concatenated_markdown_content = concatenate_markdown_pages(
299
+ [res["markdown"] for res in layout_results]
300
+ )
301
+
302
  return {
303
  "original_file": file_path,
304
  "file_type": file_type,
 
306
  "markdown_texts": markdown_texts,
307
  "markdown_images": markdown_images,
308
  "markdown_content_list": markdown_content_list,
309
+ "concatenated_markdown_content": concatenated_markdown_content,
310
  "input_images": input_images,
311
  "api_response": result,
312
  }
 
381
  file_types=[".pdf", ".jpg", ".jpeg", ".png"],
382
  type="filepath",
383
  )
384
+ with gr.Row():
385
+ use_formula_recognition_cb = gr.Checkbox(
386
+ value=True, label="Use formula recognition"
387
+ )
388
+ use_chart_recognition_cb = gr.Checkbox(
389
+ value=False, label="Use chart recognition"
390
+ )
391
+ with gr.Row():
392
+ use_doc_orientation_classify_cb = gr.Checkbox(
393
+ value=False, label="Use document image orientation classification"
394
+ )
395
+ use_doc_unwarping_cb = gr.Checkbox(
396
+ value=False, label="Use text image unwarping"
397
+ )
398
+ with gr.Row():
399
+ use_textline_orientation_cb = gr.Checkbox(
400
+ value=False, label="Use text line orientation classification"
401
+ )
402
+ concatenate_pages_cb = gr.Checkbox(value=True, label="Concatenate pages")
403
  process_btn = gr.Button("Analyze Document", variant="primary")
404
  gr.Markdown(
405
+ f"""
406
+ 1. Only the first {MAX_NUM_PAGES} pages will be processed.
407
+ 2. Some formulas might not display correctly because of renderer limitations.
408
+ """
409
  )
410
 
411
  loading_spinner = gr.Column(visible=False, elem_classes=["loader-container"])
 
420
  # Results display section
421
  with gr.Column():
422
  gr.Markdown("### Results")
423
+ with gr.Row():
424
+ with gr.Column():
425
+ layout_ordering_images = []
426
+ for i in range(MAX_NUM_PAGES):
427
+ layout_ordering_images.append(
428
+ gr.Image(
429
+ label=f"Layout Ordering Image {i}",
430
+ show_label=True,
431
+ visible=False,
432
+ )
433
  )
434
+ with gr.Column():
435
+ markdown_display_list = []
436
+ for i in range(MAX_NUM_PAGES):
437
+ markdown_display_list.append(
438
+ gr.Markdown(
439
+ visible=False,
440
+ container=True,
441
+ show_copy_button=True,
442
+ latex_delimiters=[
443
+ {"left": "$$", "right": "$$", "display": True},
444
+ {"left": "$", "right": "$", "display": False},
445
+ ],
446
+ )
447
  )
 
448
 
449
  # Download section
450
  with gr.Column(elem_classes=["download-section"]):
 
454
 
455
  # Interaction logic
456
  def toggle_spinner():
457
+ return gr.Column(visible=True)
458
 
459
  def hide_spinner():
460
+ return gr.Column(visible=False)
461
 
462
def update_display(results, concatenate_pages):
    """Build the component updates that display the analysis results.

    Args:
        results: Result mapping. ``"layout_ordering_images"`` is always
            read; ``"concatenated_markdown_content"`` is read when
            ``concatenate_pages`` is truthy, otherwise
            ``"markdown_content_list"``. A falsy value means there is
            nothing to show.
        concatenate_pages: When truthy, the concatenated markdown goes into
            the first markdown slot; otherwise one slot is used per page.

    Returns:
        ``gr.skip()`` when ``results`` is falsy. Otherwise the image
        updates followed by the markdown updates, each group padded with
        hidden components up to ``MAX_NUM_PAGES``.
    """
    if not results:
        return gr.skip()

    page_images = results["layout_ordering_images"]
    assert len(page_images) <= MAX_NUM_PAGES, len(page_images)

    image_updates = [
        gr.Image(value=bytes_to_image(raw), visible=True) for raw in page_images
    ]
    # Hide every image slot that has no page to show.
    image_updates.extend(
        gr.Image(visible=False) for _ in range(len(page_images), MAX_NUM_PAGES)
    )

    if concatenate_pages:
        # A single merged document occupies only the first markdown slot.
        visible_markdown = [results["concatenated_markdown_content"]]
    else:
        visible_markdown = results["markdown_content_list"]
        assert len(visible_markdown) <= MAX_NUM_PAGES, len(visible_markdown)

    markdown_updates = [
        gr.Markdown(value=content, visible=True) for content in visible_markdown
    ]
    # Hide every markdown slot that is left over.
    markdown_updates.extend(
        gr.Markdown(visible=False)
        for _ in range(len(visible_markdown), MAX_NUM_PAGES)
    )

    return image_updates + markdown_updates
490
 
491
  process_btn.click(toggle_spinner, outputs=[loading_spinner]).then(
492
+ process_file,
493
+ inputs=[
494
+ file_input,
495
+ use_formula_recognition_cb,
496
+ use_chart_recognition_cb,
497
+ use_doc_orientation_classify_cb,
498
+ use_doc_unwarping_cb,
499
+ use_textline_orientation_cb,
500
+ ],
501
+ outputs=[results_state],
502
  ).then(hide_spinner, outputs=[loading_spinner]).then(
503
  update_display,
504
+ inputs=[results_state, concatenate_pages_cb],
505
  outputs=layout_ordering_images + markdown_display_list,
506
  )
507
 
508
  download_all_btn.click(
509
  export_full_results, inputs=[results_state], outputs=[download_file]
510
+ ).success(lambda: gr.File(visible=True), outputs=[download_file])
511
 
512
 
513
  if __name__ == "__main__":
514
  demo.launch(
515
  server_name="0.0.0.0",
516
+ server_port=8860,
 
517
  favicon_path=LOGO_PATH,
518
  )