taprosoft
committed on
Commit
·
b7d4a95
1
Parent(s):
465d368
fix: add examples
Browse files
- app.py +35 -21
- examples/academic_paper_figure.pdf +0 -0
- examples/academic_paper_formula.pdf +0 -0
- examples/complex_layout.pdf +0 -0
- examples/handwriting_form.pdf +0 -0
- examples/invoice.pdf +0 -0
- examples/magazine_complex_layout.pdf +0 -0
- table.pdf → examples/table.pdf +0 -0
app.py
CHANGED
@@ -70,9 +70,9 @@ def convert_document(path, method, start_page=0, enabled=True):
|
|
70 |
text, debug_image_paths = convert_sycamore(path, file_name)
|
71 |
# elif method == "Zerox":
|
72 |
# text, debug_image_paths = convert_zerox(path, file_name)
|
73 |
-
elif method == "Img2Table":
|
74 |
text, debug_image_paths = convert_img2table(path, file_name)
|
75 |
-
elif method == "GMFT":
|
76 |
text, debug_image_paths = convert_gmft(path, file_name)
|
77 |
else:
|
78 |
raise ValueError(f"Unsupported method: {method}")
|
@@ -148,7 +148,7 @@ latex_delimiters = [
|
|
148 |
|
149 |
# startup test (also for loading models the first time)
|
150 |
start_startup = time.time()
|
151 |
-
WARMUP_PDF_PATH = "table.pdf"
|
152 |
SUPPORTED_METHODS = [
|
153 |
"PyMuPDF",
|
154 |
"Docling",
|
@@ -156,8 +156,8 @@ SUPPORTED_METHODS = [
|
|
156 |
"MinerU",
|
157 |
"Unstructured",
|
158 |
"Gemini (API)",
|
159 |
-
"Img2Table",
|
160 |
-
"GMFT",
|
161 |
"Sycamore",
|
162 |
# "Zerox"
|
163 |
]
|
@@ -188,21 +188,15 @@ with gr.Blocks(
|
|
188 |
".pdf",
|
189 |
],
|
190 |
)
|
191 |
-
with gr.Accordion(
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
value=1,
|
201 |
-
)
|
202 |
-
visual_checkbox = gr.Checkbox(
|
203 |
-
label="Enable debug visualization",
|
204 |
-
visible=ENABLE_DEBUG_MODE,
|
205 |
-
value=True,
|
206 |
)
|
207 |
progress_status = gr.Markdown("", show_label=False, container=False)
|
208 |
output_file = gr.File(
|
@@ -219,6 +213,26 @@ with gr.Blocks(
|
|
219 |
value=SUPPORTED_METHODS[:2],
|
220 |
multiselect=True,
|
221 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
222 |
with gr.Row():
|
223 |
convert_btn = gr.Button("Convert", variant="primary", scale=2)
|
224 |
clear_btn = gr.ClearButton(value="Clear", scale=1)
|
@@ -358,7 +372,7 @@ with gr.Blocks(
|
|
358 |
outputs=visualization_sub_tabs,
|
359 |
)
|
360 |
|
361 |
-
demo.launch(
|
362 |
show_error=True,
|
363 |
max_file_size="50mb",
|
364 |
)
|
|
|
70 |
text, debug_image_paths = convert_sycamore(path, file_name)
|
71 |
# elif method == "Zerox":
|
72 |
# text, debug_image_paths = convert_zerox(path, file_name)
|
73 |
+
elif method == "Img2Table (table-only)":
|
74 |
text, debug_image_paths = convert_img2table(path, file_name)
|
75 |
+
elif method == "GMFT (table-only)":
|
76 |
text, debug_image_paths = convert_gmft(path, file_name)
|
77 |
else:
|
78 |
raise ValueError(f"Unsupported method: {method}")
|
|
|
148 |
|
149 |
# startup test (also for loading models the first time)
|
150 |
start_startup = time.time()
|
151 |
+
WARMUP_PDF_PATH = "examples/table.pdf"
|
152 |
SUPPORTED_METHODS = [
|
153 |
"PyMuPDF",
|
154 |
"Docling",
|
|
|
156 |
"MinerU",
|
157 |
"Unstructured",
|
158 |
"Gemini (API)",
|
159 |
+
"Img2Table (table-only)",
|
160 |
+
"GMFT (table-only)",
|
161 |
"Sycamore",
|
162 |
# "Zerox"
|
163 |
]
|
|
|
188 |
".pdf",
|
189 |
],
|
190 |
)
|
191 |
+
with gr.Accordion("Examples:"):
|
192 |
+
example_root = os.path.join(os.path.dirname(__file__), "examples")
|
193 |
+
gr.Examples(
|
194 |
+
examples=[
|
195 |
+
os.path.join(example_root, _)
|
196 |
+
for _ in os.listdir(example_root)
|
197 |
+
if _.endswith("pdf")
|
198 |
+
],
|
199 |
+
inputs=input_file,
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
)
|
201 |
progress_status = gr.Markdown("", show_label=False, container=False)
|
202 |
output_file = gr.File(
|
|
|
213 |
value=SUPPORTED_METHODS[:2],
|
214 |
multiselect=True,
|
215 |
)
|
216 |
+
with gr.Row():
|
217 |
+
with gr.Accordion(
|
218 |
+
"Advanced settings",
|
219 |
+
open=False,
|
220 |
+
):
|
221 |
+
start_page = gr.Number(
|
222 |
+
label=(
|
223 |
+
"Starting page (only max 5 "
|
224 |
+
"consecutive pages are processed)"
|
225 |
+
),
|
226 |
+
minimum=1,
|
227 |
+
maximum=100,
|
228 |
+
step=1,
|
229 |
+
value=1,
|
230 |
+
)
|
231 |
+
visual_checkbox = gr.Checkbox(
|
232 |
+
label="Enable debug visualization",
|
233 |
+
visible=ENABLE_DEBUG_MODE,
|
234 |
+
value=True,
|
235 |
+
)
|
236 |
with gr.Row():
|
237 |
convert_btn = gr.Button("Convert", variant="primary", scale=2)
|
238 |
clear_btn = gr.ClearButton(value="Clear", scale=1)
|
|
|
372 |
outputs=visualization_sub_tabs,
|
373 |
)
|
374 |
|
375 |
+
demo.queue(default_concurrency_limit=2,).launch(
|
376 |
show_error=True,
|
377 |
max_file_size="50mb",
|
378 |
)
|
examples/academic_paper_figure.pdf
ADDED
Binary file (63.2 kB). View file
|
|
examples/academic_paper_formula.pdf
ADDED
Binary file (42.1 kB). View file
|
|
examples/complex_layout.pdf
ADDED
Binary file (43.1 kB). View file
|
|
examples/handwriting_form.pdf
ADDED
The diff for this file is too large to render.
See raw diff
|
|
examples/invoice.pdf
ADDED
Binary file (561 kB). View file
|
|
examples/magazine_complex_layout.pdf
ADDED
Binary file (391 kB). View file
|
|
table.pdf → examples/table.pdf
RENAMED
File without changes
|