taprosoft
committed on
Commit
·
7e20950
1
Parent(s):
7e604f0
fix: minor update backend
Browse files
- .gitignore +1 -0
- backends/mineru.py +2 -2
- backends/unstructured.py +0 -1
.gitignore
CHANGED
@@ -465,3 +465,4 @@ S.gpg-agent*
|
|
465 |
.vscode/settings.json
|
466 |
examples/example1/assets
|
467 |
storage/*
|
|
|
|
465 |
.vscode/settings.json
|
466 |
examples/example1/assets
|
467 |
storage/*
|
468 |
+
debug_data/*
|
backends/mineru.py
CHANGED
@@ -41,7 +41,7 @@ def do_process_mineru(input_path, output_dir):
|
|
41 |
|
42 |
pdf_data = read_fn(input_path)
|
43 |
parse_method = "auto"
|
44 |
-
|
45 |
do_parse(
|
46 |
output_dir,
|
47 |
file_name,
|
@@ -51,7 +51,7 @@ def do_process_mineru(input_path, output_dir):
|
|
51 |
debug_able=False,
|
52 |
f_dump_orig_pdf=False,
|
53 |
f_draw_layout_bbox=ENABLE_DEBUG_MODE,
|
54 |
-
f_draw_char_bbox=
|
55 |
formula_enable=False,
|
56 |
table_enable=True,
|
57 |
)
|
|
|
41 |
|
42 |
pdf_data = read_fn(input_path)
|
43 |
parse_method = "auto"
|
44 |
+
_, local_md_dir = prepare_env(output_dir, file_name, parse_method)
|
45 |
do_parse(
|
46 |
output_dir,
|
47 |
file_name,
|
|
|
51 |
debug_able=False,
|
52 |
f_dump_orig_pdf=False,
|
53 |
f_draw_layout_bbox=ENABLE_DEBUG_MODE,
|
54 |
+
f_draw_char_bbox=False,
|
55 |
formula_enable=False,
|
56 |
table_enable=True,
|
57 |
)
|
backends/unstructured.py
CHANGED
@@ -58,7 +58,6 @@ def convert_unstructured(path: str, file_name: str):
|
|
58 |
# mandatory to use ``hi_res`` strategy
|
59 |
strategy="hi_res",
|
60 |
infer_table_structure=True,
|
61 |
-
# extract_images_in_pdf=True,
|
62 |
extract_image_block_types=["Image", "Table"],
|
63 |
extract_image_block_to_payload=True,
|
64 |
analysis=ENABLE_DEBUG_MODE,
|
|
|
58 |
# mandatory to use ``hi_res`` strategy
|
59 |
strategy="hi_res",
|
60 |
infer_table_structure=True,
|
|
|
61 |
extract_image_block_types=["Image", "Table"],
|
62 |
extract_image_block_to_payload=True,
|
63 |
analysis=ENABLE_DEBUG_MODE,
|