taprosoft committed on
Commit
77fbded
·
0 Parent(s):

feat: initial commit

Browse files
Files changed (9) hide show
  1. README.md +13 -0
  2. app.py +200 -0
  3. backends/__init__.py +11 -0
  4. backends/docling.py +47 -0
  5. backends/marker.py +24 -0
  6. backends/mineru.py +56 -0
  7. backends/unstructured.py +68 -0
  8. header.html +47 -0
  9. utils.py +29 -0
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: DoclingConverter
3
+ emoji: 🐢
4
+ colorFrom: blue
5
+ colorTo: red
6
+ sdk: gradio
7
+ sdk_version: 5.7.1
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: Convert documents to Markdown or JSON with metadata
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from pathlib import Path
3
+
4
+ import gradio as gr
5
+ import pymupdf4llm
6
+ from gradio_pdf import PDF
7
+
8
+ from backends import (
9
+ convert_docling,
10
+ convert_marker,
11
+ convert_mineru,
12
+ convert_unstructured,
13
+ )
14
+ from utils import remove_images_from_markdown, trim_pages
15
+
16
# Directory that receives the page-trimmed copy of every uploaded PDF
# (passed to trim_pages as its output_path).
TRIMMED_PDF_PATH = Path("/tmp/gradio/trim")
# parents=True: the intermediate /tmp/gradio directory may not exist yet when
# the app starts, in which case a plain mkdir() raises FileNotFoundError.
TRIMMED_PDF_PATH.mkdir(parents=True, exist_ok=True)
18
+
19
+
20
def convert_document(path, method, enabled=True):
    """Convert a PDF to Markdown with one of the supported parser backends.

    Args:
        path: Path of the PDF file to convert.
        method: Backend name; one of "Docling", "Marker", "Unstructured",
            "MinerU" or "PyMuPDF". Any other value yields the placeholder
            text "unknown method".
        enabled: When false the conversion is skipped entirely and empty
            outputs are returned.

    Returns:
        A 3-tuple ``(text, text_without_images, debug_image_paths)`` where
        ``text`` is the Markdown produced by the backend,
        ``text_without_images`` is the same text with image references
        removed, and ``debug_image_paths`` is the list of debug
        visualization images (empty for PyMuPDF and unknown methods).
    """
    print("Processing file", path, "with method", method, "enabled", enabled)

    if not enabled:
        return "", "", []

    # benchmarking: perf_counter() is monotonic, so the measured duration
    # cannot be distorted by system clock adjustments.
    start = time.perf_counter()

    path = trim_pages(path, output_path=TRIMMED_PDF_PATH)
    file_name = Path(path).stem
    debug_image_paths = []
    text = "unknown method"

    # Backends that share the (path, file_name) -> (text, debug images) shape.
    backends = {
        "Docling": convert_docling,
        "Marker": convert_marker,
        "Unstructured": convert_unstructured,
        "MinerU": convert_mineru,
    }

    if method in backends:
        text, debug_image_paths = backends[method](path, file_name)
    elif method == "PyMuPDF":
        # PyMuPDF produces no debug visualizations.
        text = pymupdf4llm.to_markdown(
            path,
            embed_images=True,
        )

    end = time.perf_counter()
    print(f"Conversion with {method} took {end - start} seconds")
    return text, remove_images_from_markdown(text), debug_image_paths
51
+
52
+
53
def show_tabs(selected_methods):
    """Return one visibility update per supported method, in tab order.

    A method's output tab is made visible exactly when the method is part of
    ``selected_methods``.
    """
    return [
        gr.update(visible=name in selected_methods) for name in supported_methods
    ]
59
+
60
+
61
# Math delimiter pairs handed to every gr.Markdown output below via its
# ``latex_delimiters`` argument: "$$...$$" is flagged display=True,
# "$...$" display=False.
latex_delimiters = [
    {"left": "$$", "right": "$$", "display": True},
    {"left": "$", "right": "$", "display": False},
]

# startup test (also for loading models the first time)
start_startup = time.time()
# NOTE(review): developer-local sample file; it is only referenced by the
# disabled warm-up loop below.
test_pdf_path = "/home/tadashi/MinerU/examples/complex_layout.pdf"
# Backend names in display order: one output tab and one step of the
# conversion chain is created per entry.
supported_methods = ["Docling", "Marker", "Unstructured", "MinerU", "PyMuPDF"]

# Warm-up sequence (currently disabled): run every backend once on the sample
# PDF at startup.
# print("Warm-up sequence")
# for method in supported_methods:
#     for _ in range(1):
#         convert_document(test_pdf_path, method)
# print("Start up time", time.time() - start_startup, "seconds")
76
+
77
# Gradio UI: upload + controls on the first row, PDF preview and one result
# tab per backend on the second row.
with gr.Blocks(
    theme=gr.themes.Ocean(),
) as demo:
    # Static banner. Read with an explicit encoding so the result does not
    # depend on the platform's default locale.
    with open("header.html", "r", encoding="utf-8") as file:
        header = file.read()
    gr.HTML(header)

    # Flat list holding three components per method, in supported_methods
    # order: [markdown_render, markdown_text, debug_images, ...].
    output_components = []
    output_tabs = []
    visualization_sub_tabs = []
    first_method = supported_methods[0]
    num_methods = len(supported_methods)

    with gr.Row():
        with gr.Column(variant="panel", scale=5):
            input_file = gr.File(
                label="Upload PDF document",
                file_types=[
                    ".pdf",
                ],
            )
            progress_status = gr.Markdown("", show_label=False, container=False)

        with gr.Column(variant="panel", scale=5):
            with gr.Row():
                methods = gr.Dropdown(
                    supported_methods,
                    label="Conversion methods",
                    # A multiselect dropdown holds a list of choices.
                    value=[first_method],
                    multiselect=True,
                )
            with gr.Row():
                visual_checkbox = gr.Checkbox(
                    label="Enable debug visualizations", value=True
                )
            with gr.Row():
                convert_btn = gr.Button("Convert", variant="primary", scale=2)
                clear_btn = gr.ClearButton(value="Clear", scale=1)

    with gr.Row():
        with gr.Column(variant="panel", scale=5):
            pdf_preview = PDF(
                label="PDF preview",
                interactive=False,
                visible=True,
                height=800,
            )

        with gr.Column(variant="panel", scale=5):
            with gr.Tabs():
                for method in supported_methods:
                    # Tabs start hidden; show_tabs reveals the selected ones
                    # when "Convert" is clicked.
                    with gr.Tab(method, visible=False) as output_tab:
                        with gr.Tabs():
                            with gr.Tab("Markdown rendering"):
                                markdown_render = gr.Markdown(
                                    label="Markdown rendering",
                                    height=900,
                                    show_copy_button=True,
                                    line_breaks=True,
                                    latex_delimiters=latex_delimiters,
                                )
                            with gr.Tab("Debug visualizations") as visual_sub_tab:
                                debug_images = gr.Gallery(
                                    show_label=False,
                                    container=False,
                                    interactive=False,
                                )
                            with gr.Tab("Raw text"):
                                markdown_text = gr.TextArea(
                                    lines=45, show_label=False, container=False
                                )

                    # Order must match convert_document's return tuple:
                    # (text, text without images, debug image paths).
                    output_components.extend(
                        [markdown_render, markdown_text, debug_images]
                    )
                    output_tabs.append(output_tab)
                    visualization_sub_tabs.append(visual_sub_tab)

    # Mirror the uploaded file into the preview pane.
    input_file.change(fn=lambda x: x, inputs=input_file, outputs=pdf_preview)
    click_event = convert_btn.click(
        fn=show_tabs,
        inputs=[methods],
        outputs=output_tabs,
    )
    # Chain one "status message -> conversion" pair per backend so that the
    # backends run sequentially after the click.
    for idx, method in enumerate(supported_methods):

        # Default arguments bind the current loop values (avoids the
        # late-binding closure pitfall), so the functions can be registered
        # directly without extra lambda wrappers.
        def progress_message(idx=idx, method=method):
            return f"Processing ({idx + 1} / {num_methods}) **{method}**...\n\n"

        def process_method(pdf_path, selected_methods, method=method):
            return convert_document(
                pdf_path, method=method, enabled=method in selected_methods
            )

        click_event = click_event.then(
            fn=progress_message,
            outputs=[progress_status],
        ).then(
            fn=process_method,
            inputs=[input_file, methods],
            outputs=output_components[idx * 3 : (idx + 1) * 3],
        )

    click_event.then(
        lambda: "All tasks completed.",
        outputs=[progress_status],
    )

    clear_btn.add(
        [
            input_file,
            pdf_preview,
        ]
        + output_components
    )

    # Show or hide every "Debug visualizations" sub-tab at once.
    visual_checkbox.change(
        fn=lambda state: [gr.update(visible=state)] * len(visualization_sub_tabs),
        inputs=visual_checkbox,
        outputs=visualization_sub_tabs,
    )

demo.launch(show_error=True)
backends/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .docling import convert_docling
2
+ from .marker import convert_marker
3
+ from .mineru import convert_mineru
4
+ from .unstructured import convert_unstructured
5
+
6
+ __all__ = [
7
+ "convert_docling",
8
+ "convert_marker",
9
+ "convert_mineru",
10
+ "convert_unstructured",
11
+ ]
backends/docling.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from docling.datamodel.base_models import InputFormat
4
+ from docling.datamodel.pipeline_options import (
5
+ AcceleratorDevice,
6
+ AcceleratorOptions,
7
+ PdfPipelineOptions,
8
+ )
9
+ from docling.datamodel.settings import settings
10
+ from docling.document_converter import DocumentConverter, PdfFormatOption
11
+ from docling_core.types.doc import ImageRefMode
12
+
13
# Root directory for Docling's debug output; convert_docling reads the PNG
# files found under "<root>/debug_<file_name>".
DOCLING_DEBUG_PATH = Path("/tmp/docling")

# Docling settings
accelerator_options = AcceleratorOptions(num_threads=8, device=AcceleratorDevice.AUTO)
pipeline_options = PdfPipelineOptions()
pipeline_options.accelerator_options = accelerator_options
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.generate_picture_images = True
pipeline_options.images_scale = 2.0

# debug visualization settings (global Docling settings, set once at import)
settings.debug.debug_output_path = str(DOCLING_DEBUG_PATH)
settings.debug.visualize_layout = True
settings.debug.visualize_tables = True

# Docling init: a single module-level converter shared by every
# convert_docling call.
docling_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
        )
    }
)
37
+
38
+
39
def convert_docling(path: str, file_name: str):
    """Convert a PDF with Docling and collect its debug visualizations.

    Args:
        path: Path of the PDF file to convert.
        file_name: Stem of the PDF file name; used to locate the debug
            directory ``DOCLING_DEBUG_PATH / "debug_<file_name>"``.

    Returns:
        ``(text, debug_image_paths)``: the Markdown export with embedded
        images, and the sorted list of ``.png`` files found in the debug
        directory (empty when that directory does not exist).
    """
    result = docling_converter.convert(path)
    text = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)

    debug_image_dir = DOCLING_DEBUG_PATH / f"debug_{file_name}"
    if not debug_image_dir.is_dir():
        # No visualizations were written; still return the converted text
        # instead of failing on iterdir().
        return text, []

    # sorted(): directory listing order is arbitrary, keep the output stable.
    debug_image_paths = sorted(
        image_path
        for image_path in debug_image_dir.iterdir()
        if image_path.suffix == ".png"
    )

    return text, debug_image_paths
backends/marker.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from marker.converters.pdf import PdfConverter
4
+ from marker.models import create_model_dict
5
+ from marker.output import text_from_rendered
6
+
7
# Marker init: a single module-level converter (and model dictionary) shared
# by every convert_marker call.
marker_converter = PdfConverter(
    artifact_dict=create_model_dict(),
    config={
        # NOTE(review): presumably what makes Marker write the per-page debug
        # images that convert_marker collects from "debug_data_path" — confirm.
        "debug_pdf_images": True,
    },
)
14
+
15
+
16
def convert_marker(path: str, file_name: str):
    """Convert a PDF with Marker and collect its debug page images.

    Args:
        path: Path of the PDF file to convert.
        file_name: Unused; kept so that all backends share one signature.

    Returns:
        ``(text, debug_image_paths)``: the text extracted from the rendered
        document, and the sorted list of files whose stem contains
        ``"pdf_page"`` inside the directory named by the
        ``"debug_data_path"`` metadata entry (empty when that entry or
        directory is missing).
    """
    rendered = marker_converter(path)
    text, _, _ = text_from_rendered(rendered)

    debug_data_path = rendered.metadata.get("debug_data_path")
    if not debug_data_path:
        # Path(None) would raise TypeError; treat a missing entry as
        # "no debug images".
        return text, []

    debug_image_dir = Path(debug_data_path)
    if not debug_image_dir.is_dir():
        return text, []

    # sorted(): directory listing order is arbitrary, keep the output stable.
    debug_image_paths = sorted(
        image_path
        for image_path in debug_image_dir.iterdir()
        if "pdf_page" in image_path.stem
    )

    return text, debug_image_paths
backends/mineru.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ import pymupdf
4
+ from magic_pdf.data.data_reader_writer import FileBasedDataReader
5
+ from magic_pdf.tools.common import do_parse, prepare_env
6
+
7
# Root directory for MinerU output; convert_mineru creates one sub-directory
# per converted document underneath it.
MINERU_DEBUG_PATH = Path("/tmp/mineru")
# Created at import time so the per-document sub-directories can be made later.
MINERU_DEBUG_PATH.mkdir(exist_ok=True)
9
+
10
+
11
def read_fn(path):
    """Read the file at ``path`` through MinerU's file-based reader.

    Args:
        path: Path of the file to read.

    Returns:
        Whatever ``FileBasedDataReader.read`` returns for ``path``.
    """
    # The reader's parent directory is a string in MinerU's API; convert the
    # Path explicitly instead of relying on it being accepted as-is.
    disk_rw = FileBasedDataReader(str(MINERU_DEBUG_PATH))
    return disk_rw.read(path)
14
+
15
+
16
def do_process_mineru(input_path, output_dir):
    """Run the MinerU parser on one PDF and report where its output went.

    Args:
        input_path: Path of the PDF file to parse.
        output_dir: Directory that receives the MinerU output.

    Returns:
        ``(local_md_dir, file_name)``: the Markdown directory returned by
        ``prepare_env`` and the stem of ``input_path``.
    """
    parse_method = "auto"
    file_name = Path(input_path).stem
    output_dir = Path(output_dir)

    pdf_data = read_fn(input_path)
    # prepare_env also returns an image directory, which is not needed here.
    _, local_md_dir = prepare_env(output_dir, file_name, parse_method)

    parse_options = {
        "debug_able": False,
        "f_dump_orig_pdf": False,
        "formula_enable": False,
        "table_enable": True,
    }
    do_parse(output_dir, file_name, pdf_data, [], parse_method, **parse_options)

    return local_md_dir, file_name
35
+
36
+
37
def convert_mineru(path: str, file_name: str):
    """Convert a PDF with MinerU and render its layout PDF to debug images.

    Args:
        path: Path of the PDF file to convert.
        file_name: Stem of the PDF file name; names the output sub-directory
            and the ``<file_name>.md`` / ``<file_name>_layout.pdf`` files
            read back from MinerU's Markdown directory.

    Returns:
        ``(text, debug_image_paths)``: the generated Markdown, and the paths
        of the ``page-<n>.png`` images rendered from the layout PDF, in page
        order.
    """
    output_path = MINERU_DEBUG_PATH / file_name
    output_path.mkdir(parents=True, exist_ok=True)

    local_md_dir, _ = do_process_mineru(path, output_path)
    local_md_dir = Path(local_md_dir)

    # Explicit encoding: do not depend on the platform's default locale.
    text = (local_md_dir / f"{file_name}.md").read_text(encoding="utf-8")

    debug_image_paths = []
    debug_pdf = str(local_md_dir / f"{file_name}_layout.pdf")
    # Context manager closes the document (and its file handle) even when
    # rendering a page fails.
    with pymupdf.open(debug_pdf) as doc:
        for page in doc:
            pix = page.get_pixmap()  # render page to an image
            page_debug_path = str(output_path / f"page-{page.number}.png")
            pix.save(page_debug_path)  # store image as a PNG
            debug_image_paths.append(page_debug_path)

    return text, debug_image_paths
backends/unstructured.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ from pathlib import Path
3
+
4
+ from matplotlib import font_manager
5
+ from unstructured.partition.pdf import partition_pdf
6
+ from unstructured.partition.pdf_image.analysis import bbox_visualisation
7
+
8
# Output root passed to partition_pdf as analyzed_image_output_dir_path;
# convert_unstructured reads the images back from
# "<root>/analysis/<file_name>/bboxes".
UNSTRUCTURED_DEBUG_PATH = Path("/tmp/unstructured")
9
+
10
+
11
def convert_elements_to_markdown(elements):
    """Render partitioned document elements as a Markdown string.

    Each element contributes one entry, joined with newlines:

    * ``Title`` -> ``# <text>`` surrounded by blank lines
    * ``ListItem`` -> ``- <text>``
    * ``Table`` -> the table's HTML (``metadata.text_as_html``) surrounded
      by blank lines, falling back to the plain text when no HTML exists
    * ``UncategorizedText`` -> an empty line (the text is dropped)
    * anything else -> the element text unchanged

    Args:
        elements: Iterable of objects exposing ``category``, ``text`` and,
            for tables, ``metadata.text_as_html``.

    Returns:
        The Markdown text; an empty string for no elements.
    """
    lines = []

    for e in elements:
        if e.category == "Title":
            line = f"\n# {e.text}\n"
        elif e.category == "ListItem":
            line = f"- {e.text}"
        elif e.category == "Table":
            table_html = getattr(e.metadata, "text_as_html", None)
            # Without this fallback a table lacking an HTML rendering would
            # show up as the literal string "None".
            table_body = table_html if table_html is not None else e.text
            line = f"\n{table_body}\n"
        elif e.category == "UncategorizedText":
            line = ""
        else:
            line = e.text

        lines.append(line)

    return "\n".join(lines)
30
+
31
+
32
@functools.lru_cache(maxsize=None)
def get_font():
    """Pick a system font file, preferring Arial and then DejaVu Sans.

    The result is cached, so the system font scan runs only once.

    Returns:
        The path of the first system font containing a preferred file name,
        or the first system font found when none of them matches.

    Raises:
        ValueError: If no system fonts are found at all.
    """
    system_fonts = font_manager.findSystemFonts()
    if not system_fonts:
        raise ValueError("No fonts available")

    for wanted in ("Arial.ttf", "DejaVuSans.ttf"):
        match = next(
            (candidate for candidate in system_fonts if wanted in candidate),
            None,
        )
        if match is not None:
            return match

    return system_fonts[0]


# monkey patch: make unstructured's bbox visualisation use the lookup above
bbox_visualisation.get_font = get_font
48
+
49
+
50
def convert_unstructured(path: str, file_name: str):
    """Convert a PDF with Unstructured and collect its bounding-box images.

    Args:
        path: Path of the PDF file to convert.
        file_name: Stem of the PDF file name; used to locate the analysis
            output under ``UNSTRUCTURED_DEBUG_PATH / "analysis" / file_name``.

    Returns:
        ``(text, debug_image_paths)``: the Markdown built from the
        partitioned elements, and the sorted list of files whose stem
        contains ``"od_model"`` in the ``bboxes`` directory (empty when that
        directory does not exist).
    """
    elements = partition_pdf(
        filename=path,
        # mandatory to use ``hi_res`` strategy
        strategy="hi_res",
        infer_table_structure=True,
        # extract_images_in_pdf=True,
        # extract_image_block_types=["Image", "Table"],
        # extract_image_block_to_payload=False,
        analysis=True,
        analyzed_image_output_dir_path=UNSTRUCTURED_DEBUG_PATH,
    )
    text = convert_elements_to_markdown(elements)

    debug_image_dir = UNSTRUCTURED_DEBUG_PATH / "analysis" / file_name / "bboxes"
    if not debug_image_dir.is_dir():
        # No analysis images were written; still return the converted text
        # instead of failing on iterdir().
        return text, []

    # sorted(): directory listing order is arbitrary, keep the output stable.
    debug_image_paths = sorted(
        image_path
        for image_path in debug_image_dir.iterdir()
        if "od_model" in image_path.stem
    )

    return text, debug_image_paths
header.html ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <html>
2
+ <head>
3
+ <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
4
+ </head>
5
+
6
+ <body>
7
+ <div style="
8
+ display: flex;
9
+ flex-direction: column;
10
+ justify-content: center;
11
+ align-items: center;
12
+ text-align: center;
13
+ background: #059669;
14
+ padding: 18px;
15
+ gap: 18px;
16
+ border-radius: 8px;
17
+ ">
18
+ <div style="
19
+ display: flex;
20
+ flex-direction: column;
21
+ align-items: center;
22
+ gap: 12px;
23
+ ">
24
+ <div style="display: flex; flex-direction: column; gap: 8px">
25
+ <h1 style="
26
+ font-size: 48px;
27
+ color: #fafafa;
28
+ margin: 0;
29
+ font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
30
+ 'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
31
+ ">
32
+ PDF Parsers Playground
33
+ </h1>
34
+ </div>
35
+ </div>
36
+
37
+ <p style="
38
+ margin: 0;
39
+ line-height: 1.6rem;
40
+ font-size: 16px;
41
+ color: #fafafa;
42
+ opacity: 0.8;
43
+ ">
44
+ A playground for quick and easy experimentation with many popular open-source PDF parsers.<br>
45
+ </p>
46
+ </div>
47
+ </body></html>
utils.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from pathlib import Path
3
+ from shutil import copy2
4
+
5
+ import pymupdf
6
+
7
+
8
def remove_images_from_markdown(markdown_text):
    """Strip image references from a Markdown string.

    Removes HTML ``<img ...>`` tags (matched case-insensitively, since HTML
    tag names are not case-sensitive) and Markdown ``![alt](target)`` images.
    Everything else, including regular links, is left untouched.

    Args:
        markdown_text: The Markdown source.

    Returns:
        The text with all image references deleted.
    """
    # HTML images: <img ...>, <IMG ...>
    markdown_text = re.sub(r"<img[^>]*>", "", markdown_text, flags=re.IGNORECASE)
    # Markdown images: ![alt](path)
    markdown_text = re.sub(r"!\[[^\]]*\]\([^)]*\)", "", markdown_text)
    return markdown_text
13
+
14
+
15
def trim_pages(pdf_path, output_path, trim_pages=5):
    """Write a copy of a PDF limited to its first ``trim_pages`` pages.

    The copy is stored as ``<output_path>/<parent directory name>.pdf``,
    i.e. it is named after the directory containing ``pdf_path``. Documents
    with at most ``trim_pages`` pages are copied unchanged.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Existing directory that receives the copy.
        trim_pages: Maximum number of leading pages to keep. (The name
            shadows this function inside its body; it is kept for
            backward compatibility with keyword callers.)

    Returns:
        The path of the written file, as a string.
    """
    parent_dir_name = Path(pdf_path).parent.name
    output_file_path = Path(output_path) / f"{parent_dir_name}.pdf"

    # Context manager closes the document (and its file handle) on every
    # path, including errors while saving.
    with pymupdf.open(pdf_path) as doc:
        needs_trim = len(doc) > trim_pages
        if needs_trim:
            to_select = list(range(trim_pages))
            doc.select(to_select)
            doc.ez_save(output_file_path)
            print("Trimmed pdf to with pages", to_select, "path", output_file_path)

    if not needs_trim:
        copy2(pdf_path, str(output_file_path))

    return str(output_file_path)