taprosoft
committed on
Commit
·
3bce890
1
Parent(s):
36add35
feat: add enable visualization flag
Browse files
- app.py +33 -9
- backends/docling.py +4 -2
- backends/marker.py +3 -1
- backends/mineru.py +4 -0
- backends/settings.py +3 -0
- backends/unstructured.py +3 -1
app.py
CHANGED
@@ -11,6 +11,7 @@ from backends import (
|
|
11 |
convert_mineru,
|
12 |
convert_unstructured,
|
13 |
)
|
|
|
14 |
from utils import remove_images_from_markdown, trim_pages
|
15 |
|
16 |
TRIMMED_PDF_PATH = Path("/tmp/gradio/trim")
|
@@ -18,9 +19,9 @@ TRIMMED_PDF_PATH.mkdir(exist_ok=True)
|
|
18 |
|
19 |
|
20 |
def convert_document(path, method, enabled=True):
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
return "", "", []
|
25 |
|
26 |
# benchmarking
|
@@ -84,7 +85,6 @@ with gr.Blocks(
|
|
84 |
output_tabs = []
|
85 |
visualization_sub_tabs = []
|
86 |
first_method = supported_methods[0]
|
87 |
-
num_methods = len(supported_methods)
|
88 |
|
89 |
with gr.Row():
|
90 |
with gr.Column(variant="panel", scale=5):
|
@@ -106,7 +106,9 @@ with gr.Blocks(
|
|
106 |
)
|
107 |
with gr.Row():
|
108 |
visual_checkbox = gr.Checkbox(
|
109 |
-
label="Enable debug visualizations",
|
|
|
|
|
110 |
)
|
111 |
with gr.Row():
|
112 |
convert_btn = gr.Button("Convert", variant="primary", scale=2)
|
@@ -134,7 +136,10 @@ with gr.Blocks(
|
|
134 |
line_breaks=True,
|
135 |
latex_delimiters=latex_delimiters,
|
136 |
)
|
137 |
-
with gr.Tab(
|
|
|
|
|
|
|
138 |
debug_images = gr.Gallery(
|
139 |
show_label=False,
|
140 |
container=False,
|
@@ -159,16 +164,35 @@ with gr.Blocks(
|
|
159 |
)
|
160 |
for idx, method in enumerate(supported_methods):
|
161 |
|
162 |
-
def progress_message(
|
163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
|
165 |
def process_method(input_file, selected_methods, method=method):
|
|
|
|
|
166 |
return convert_document(
|
167 |
input_file, method=method, enabled=method in selected_methods
|
168 |
)
|
169 |
|
170 |
click_event = click_event.then(
|
171 |
-
fn=lambda
|
|
|
172 |
outputs=[progress_status],
|
173 |
).then(
|
174 |
fn=lambda input_file, methods, method=method: process_method(
|
|
|
11 |
convert_mineru,
|
12 |
convert_unstructured,
|
13 |
)
|
14 |
+
from backends.settings import ENABLE_DEBUG_MODE
|
15 |
from utils import remove_images_from_markdown, trim_pages
|
16 |
|
17 |
TRIMMED_PDF_PATH = Path("/tmp/gradio/trim")
|
|
|
19 |
|
20 |
|
21 |
def convert_document(path, method, enabled=True):
|
22 |
+
if enabled:
|
23 |
+
print("Processing file", path, "with method", method)
|
24 |
+
else:
|
25 |
return "", "", []
|
26 |
|
27 |
# benchmarking
|
|
|
85 |
output_tabs = []
|
86 |
visualization_sub_tabs = []
|
87 |
first_method = supported_methods[0]
|
|
|
88 |
|
89 |
with gr.Row():
|
90 |
with gr.Column(variant="panel", scale=5):
|
|
|
106 |
)
|
107 |
with gr.Row():
|
108 |
visual_checkbox = gr.Checkbox(
|
109 |
+
label="Enable debug visualizations",
|
110 |
+
visible=ENABLE_DEBUG_MODE,
|
111 |
+
value=True,
|
112 |
)
|
113 |
with gr.Row():
|
114 |
convert_btn = gr.Button("Convert", variant="primary", scale=2)
|
|
|
136 |
line_breaks=True,
|
137 |
latex_delimiters=latex_delimiters,
|
138 |
)
|
139 |
+
with gr.Tab(
|
140 |
+
"Debug visualizations",
|
141 |
+
visible=ENABLE_DEBUG_MODE,
|
142 |
+
) as visual_sub_tab:
|
143 |
debug_images = gr.Gallery(
|
144 |
show_label=False,
|
145 |
container=False,
|
|
|
164 |
)
|
165 |
for idx, method in enumerate(supported_methods):
|
166 |
|
167 |
+
def progress_message(selected_methods, method=method):
|
168 |
+
selected_methods_indices = [
|
169 |
+
idx
|
170 |
+
for idx, current_method in enumerate(supported_methods)
|
171 |
+
if current_method in selected_methods
|
172 |
+
]
|
173 |
+
try:
|
174 |
+
current_method_idx = selected_methods_indices.index(
|
175 |
+
supported_methods.index(method)
|
176 |
+
)
|
177 |
+
msg = (
|
178 |
+
f"Processing ({current_method_idx + 1} / "
|
179 |
+
f"{len(selected_methods)}) **{method}**...\n\n"
|
180 |
+
)
|
181 |
+
except ValueError:
|
182 |
+
msg = gr.update()
|
183 |
+
|
184 |
+
return msg
|
185 |
|
186 |
def process_method(input_file, selected_methods, method=method):
|
187 |
+
if input_file is None:
|
188 |
+
raise ValueError("Please upload a PDF file first!")
|
189 |
return convert_document(
|
190 |
input_file, method=method, enabled=method in selected_methods
|
191 |
)
|
192 |
|
193 |
click_event = click_event.then(
|
194 |
+
fn=lambda methods, method=method: progress_message(methods, method),
|
195 |
+
inputs=[methods],
|
196 |
outputs=[progress_status],
|
197 |
).then(
|
198 |
fn=lambda input_file, methods, method=method: process_method(
|
backends/docling.py
CHANGED
@@ -10,6 +10,8 @@ from docling.datamodel.settings import settings
|
|
10 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
11 |
from docling_core.types.doc import ImageRefMode
|
12 |
|
|
|
|
|
13 |
DOCLING_DEBUG_PATH = Path("/tmp/docling")
|
14 |
|
15 |
# Docling settings
|
@@ -23,8 +25,8 @@ pipeline_options.images_scale = 2.0
|
|
23 |
|
24 |
# debug visualization settings
|
25 |
settings.debug.debug_output_path = str(DOCLING_DEBUG_PATH)
|
26 |
-
settings.debug.visualize_layout =
|
27 |
-
settings.debug.visualize_tables =
|
28 |
|
29 |
# Docling init
|
30 |
docling_converter = DocumentConverter(
|
|
|
10 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
11 |
from docling_core.types.doc import ImageRefMode
|
12 |
|
13 |
+
from .settings import ENABLE_DEBUG_MODE
|
14 |
+
|
15 |
DOCLING_DEBUG_PATH = Path("/tmp/docling")
|
16 |
|
17 |
# Docling settings
|
|
|
25 |
|
26 |
# debug visualization settings
|
27 |
settings.debug.debug_output_path = str(DOCLING_DEBUG_PATH)
|
28 |
+
settings.debug.visualize_layout = ENABLE_DEBUG_MODE
|
29 |
+
settings.debug.visualize_tables = ENABLE_DEBUG_MODE
|
30 |
|
31 |
# Docling init
|
32 |
docling_converter = DocumentConverter(
|
backends/marker.py
CHANGED
@@ -8,11 +8,13 @@ from marker.models import create_model_dict
|
|
8 |
from marker.output import text_from_rendered
|
9 |
from marker.settings import settings
|
10 |
|
|
|
|
|
11 |
# Marker init
|
12 |
marker_converter = PdfConverter(
|
13 |
artifact_dict=create_model_dict(),
|
14 |
config={
|
15 |
-
"debug_pdf_images":
|
16 |
},
|
17 |
)
|
18 |
|
|
|
8 |
from marker.output import text_from_rendered
|
9 |
from marker.settings import settings
|
10 |
|
11 |
+
from .settings import ENABLE_DEBUG_MODE
|
12 |
+
|
13 |
# Marker init
|
14 |
marker_converter = PdfConverter(
|
15 |
artifact_dict=create_model_dict(),
|
16 |
config={
|
17 |
+
"debug_pdf_images": ENABLE_DEBUG_MODE,
|
18 |
},
|
19 |
)
|
20 |
|
backends/mineru.py
CHANGED
@@ -7,6 +7,8 @@ import pymupdf
|
|
7 |
from magic_pdf.data.data_reader_writer import FileBasedDataReader
|
8 |
from magic_pdf.tools.common import do_parse, prepare_env
|
9 |
|
|
|
|
|
10 |
MINERU_DEBUG_PATH = Path("/tmp/mineru")
|
11 |
MINERU_DEBUG_PATH.mkdir(exist_ok=True)
|
12 |
|
@@ -48,6 +50,8 @@ def do_process_mineru(input_path, output_dir):
|
|
48 |
parse_method,
|
49 |
debug_able=False,
|
50 |
f_dump_orig_pdf=False,
|
|
|
|
|
51 |
formula_enable=False,
|
52 |
table_enable=True,
|
53 |
)
|
|
|
7 |
from magic_pdf.data.data_reader_writer import FileBasedDataReader
|
8 |
from magic_pdf.tools.common import do_parse, prepare_env
|
9 |
|
10 |
+
from .settings import ENABLE_DEBUG_MODE
|
11 |
+
|
12 |
MINERU_DEBUG_PATH = Path("/tmp/mineru")
|
13 |
MINERU_DEBUG_PATH.mkdir(exist_ok=True)
|
14 |
|
|
|
50 |
parse_method,
|
51 |
debug_able=False,
|
52 |
f_dump_orig_pdf=False,
|
53 |
+
f_draw_layout_bbox=ENABLE_DEBUG_MODE,
|
54 |
+
f_draw_char_bbox=ENABLE_DEBUG_MODE,
|
55 |
formula_enable=False,
|
56 |
table_enable=True,
|
57 |
)
|
backends/settings.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
ENABLE_DEBUG_MODE = os.environ.get("ENABLE_DEBUG_MODE", "True").lower() == "true"
|
backends/unstructured.py
CHANGED
@@ -5,6 +5,8 @@ from matplotlib import font_manager
|
|
5 |
from unstructured.partition.pdf import partition_pdf
|
6 |
from unstructured.partition.pdf_image.analysis import bbox_visualisation
|
7 |
|
|
|
|
|
8 |
UNSTRUCTURED_DEBUG_PATH = Path("/tmp/unstructured")
|
9 |
|
10 |
|
@@ -59,7 +61,7 @@ def convert_unstructured(path: str, file_name: str):
|
|
59 |
# extract_images_in_pdf=True,
|
60 |
extract_image_block_types=["Image", "Table"],
|
61 |
extract_image_block_to_payload=True,
|
62 |
-
analysis=
|
63 |
analyzed_image_output_dir_path=UNSTRUCTURED_DEBUG_PATH,
|
64 |
)
|
65 |
text = convert_elements_to_markdown(elements)
|
|
|
5 |
from unstructured.partition.pdf import partition_pdf
|
6 |
from unstructured.partition.pdf_image.analysis import bbox_visualisation
|
7 |
|
8 |
+
from .settings import ENABLE_DEBUG_MODE
|
9 |
+
|
10 |
UNSTRUCTURED_DEBUG_PATH = Path("/tmp/unstructured")
|
11 |
|
12 |
|
|
|
61 |
# extract_images_in_pdf=True,
|
62 |
extract_image_block_types=["Image", "Table"],
|
63 |
extract_image_block_to_payload=True,
|
64 |
+
analysis=ENABLE_DEBUG_MODE,
|
65 |
analyzed_image_output_dir_path=UNSTRUCTURED_DEBUG_PATH,
|
66 |
)
|
67 |
text = convert_elements_to_markdown(elements)
|