taprosoft
		
	committed on
		
		
					Commit 
							
							·
						
						acbe414
	
1
								Parent(s):
							
							df456bd
								
feat: add img2table gmft
Browse files- app.py +134 -24
- backends/__init__.py +8 -0
- backends/gemini.py +43 -0
- backends/gmft.py +54 -0
- backends/img2table.py +55 -0
- backends/zerox.py +43 -0
- requirements.txt +3 -1
- utils.py +2 -2
    	
        app.py
    CHANGED
    
    | @@ -1,3 +1,7 @@ | |
|  | |
|  | |
|  | |
|  | |
| 1 | 
             
            from utils import fix_problematic_imports, prepare_env_mineru
         | 
| 2 |  | 
| 3 | 
             
            fix_problematic_imports()  # noqa
         | 
| @@ -13,18 +17,23 @@ from gradio_pdf import PDF | |
| 13 |  | 
| 14 | 
             
            from backends import (
         | 
| 15 | 
             
                convert_docling,
         | 
|  | |
|  | |
|  | |
| 16 | 
             
                convert_marker,
         | 
| 17 | 
             
                convert_mineru,
         | 
| 18 | 
             
                convert_unstructured,
         | 
|  | |
| 19 | 
             
            )
         | 
| 20 | 
             
            from backends.settings import ENABLE_DEBUG_MODE
         | 
| 21 | 
             
            from utils import remove_images_from_markdown, trim_pages
         | 
| 22 |  | 
| 23 | 
             
            TRIMMED_PDF_PATH = Path("/tmp/trimmed_input")
         | 
| 24 | 
             
            TRIMMED_PDF_PATH.mkdir(exist_ok=True)
         | 
|  | |
| 25 |  | 
| 26 |  | 
| 27 | 
            -
            def convert_document(path, method, enabled=True):
         | 
| 28 | 
             
                if enabled:
         | 
| 29 | 
             
                    print("Processing file", path, "with method", method)
         | 
| 30 | 
             
                else:
         | 
| @@ -33,7 +42,11 @@ def convert_document(path, method, enabled=True): | |
| 33 | 
             
                # benchmarking
         | 
| 34 | 
             
                start = time.time()
         | 
| 35 |  | 
| 36 | 
            -
                path = trim_pages( | 
|  | |
|  | |
|  | |
|  | |
| 37 | 
             
                file_name = Path(path).stem
         | 
| 38 | 
             
                debug_image_paths = []
         | 
| 39 | 
             
                text = "unknown method"
         | 
| @@ -51,6 +64,16 @@ def convert_document(path, method, enabled=True): | |
| 51 | 
             
                    )
         | 
| 52 | 
             
                elif method == "MinerU":
         | 
| 53 | 
             
                    text, debug_image_paths = convert_mineru(path, file_name)
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 54 |  | 
| 55 | 
             
                duration = time.time() - start
         | 
| 56 | 
             
                duration_message = f"Conversion with {method} took *{duration:.2f} seconds*"
         | 
| @@ -63,6 +86,51 @@ def convert_document(path, method, enabled=True): | |
| 63 | 
             
                )
         | 
| 64 |  | 
| 65 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 66 | 
             
            def show_tabs(selected_methods):
         | 
| 67 | 
             
                visible_tabs = []
         | 
| 68 | 
             
                for method in SUPPORTED_METHODS:
         | 
| @@ -79,14 +147,25 @@ latex_delimiters = [ | |
| 79 | 
             
            # startup test (also for loading models the first time)
         | 
| 80 | 
             
            start_startup = time.time()
         | 
| 81 | 
             
            WARMUP_PDF_PATH = "table.pdf"
         | 
| 82 | 
            -
            SUPPORTED_METHODS = [ | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 83 |  | 
| 84 | 
            -
             | 
| 85 | 
            -
             | 
| 86 | 
            -
                for  | 
| 87 | 
            -
                     | 
| 88 | 
            -
             | 
| 89 | 
            -
             | 
|  | |
| 90 |  | 
| 91 | 
             
            with gr.Blocks(
         | 
| 92 | 
             
                theme=gr.themes.Ocean(),
         | 
| @@ -106,7 +185,28 @@ with gr.Blocks( | |
| 106 | 
             
                                ".pdf",
         | 
| 107 | 
             
                            ],
         | 
| 108 | 
             
                        )
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 109 | 
             
                        progress_status = gr.Markdown("", show_label=False, container=False)
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 110 |  | 
| 111 | 
             
                    with gr.Column(variant="panel", scale=5):
         | 
| 112 | 
             
                        with gr.Row():
         | 
| @@ -116,12 +216,6 @@ with gr.Blocks( | |
| 116 | 
             
                                value=SUPPORTED_METHODS[:2],
         | 
| 117 | 
             
                                multiselect=True,
         | 
| 118 | 
             
                            )
         | 
| 119 | 
            -
                        with gr.Row():
         | 
| 120 | 
            -
                            visual_checkbox = gr.Checkbox(
         | 
| 121 | 
            -
                                label="Enable debug visualization",
         | 
| 122 | 
            -
                                visible=ENABLE_DEBUG_MODE,
         | 
| 123 | 
            -
                                value=True,
         | 
| 124 | 
            -
                            )
         | 
| 125 | 
             
                        with gr.Row():
         | 
| 126 | 
             
                            convert_btn = gr.Button("Convert", variant="primary", scale=2)
         | 
| 127 | 
             
                            clear_btn = gr.ClearButton(value="Clear", scale=1)
         | 
| @@ -210,11 +304,14 @@ with gr.Blocks( | |
| 210 |  | 
| 211 | 
             
                        return msg
         | 
| 212 |  | 
| 213 | 
            -
                    def process_method(input_file, selected_methods, method=method):
         | 
| 214 | 
             
                        if input_file is None:
         | 
| 215 | 
             
                            raise ValueError("Please upload a PDF file first!")
         | 
| 216 | 
             
                        return convert_document(
         | 
| 217 | 
            -
                            input_file, | 
|  | |
|  | |
|  | |
| 218 | 
             
                        )
         | 
| 219 |  | 
| 220 | 
             
                    click_event = click_event.then(
         | 
| @@ -222,25 +319,35 @@ with gr.Blocks( | |
| 222 | 
             
                        inputs=[methods],
         | 
| 223 | 
             
                        outputs=[progress_status],
         | 
| 224 | 
             
                    ).then(
         | 
| 225 | 
            -
                        fn=lambda input_file, methods, method=method: process_method(
         | 
| 226 | 
            -
                            input_file, methods, method
         | 
| 227 | 
             
                        ),
         | 
| 228 | 
            -
                        inputs=[input_file, methods],
         | 
| 229 | 
             
                        outputs=output_components[idx * 4 : (idx + 1) * 4],
         | 
| 230 | 
             
                    )
         | 
| 231 |  | 
| 232 | 
            -
                click_event.then(
         | 
| 233 | 
            -
                     | 
| 234 | 
            -
                     | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 235 | 
             
                )
         | 
| 236 |  | 
| 237 | 
             
                clear_btn.add(
         | 
| 238 | 
             
                    [
         | 
| 239 | 
             
                        input_file,
         | 
| 240 | 
             
                        pdf_preview,
         | 
|  | |
| 241 | 
             
                    ]
         | 
| 242 | 
             
                    + output_components
         | 
| 243 | 
             
                )
         | 
|  | |
|  | |
|  | |
|  | |
| 244 |  | 
| 245 | 
             
                visual_checkbox.change(
         | 
| 246 | 
             
                    fn=lambda state: [gr.update(visible=state)] * len(visualization_sub_tabs),
         | 
| @@ -248,4 +355,7 @@ with gr.Blocks( | |
| 248 | 
             
                    outputs=visualization_sub_tabs,
         | 
| 249 | 
             
                )
         | 
| 250 |  | 
| 251 | 
            -
                demo.launch( | 
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import os
         | 
| 2 | 
            +
            import zipfile
         | 
| 3 | 
            +
            from collections import defaultdict
         | 
| 4 | 
            +
             | 
| 5 | 
             
            from utils import fix_problematic_imports, prepare_env_mineru
         | 
| 6 |  | 
| 7 | 
             
            fix_problematic_imports()  # noqa
         | 
|  | |
| 17 |  | 
| 18 | 
             
            from backends import (
         | 
| 19 | 
             
                convert_docling,
         | 
| 20 | 
            +
                convert_gemini,
         | 
| 21 | 
            +
                convert_gmft,
         | 
| 22 | 
            +
                convert_img2table,
         | 
| 23 | 
             
                convert_marker,
         | 
| 24 | 
             
                convert_mineru,
         | 
| 25 | 
             
                convert_unstructured,
         | 
| 26 | 
            +
                convert_zerox,
         | 
| 27 | 
             
            )
         | 
| 28 | 
             
            from backends.settings import ENABLE_DEBUG_MODE
         | 
| 29 | 
             
            from utils import remove_images_from_markdown, trim_pages
         | 
| 30 |  | 
| 31 | 
             
            TRIMMED_PDF_PATH = Path("/tmp/trimmed_input")
         | 
| 32 | 
             
            TRIMMED_PDF_PATH.mkdir(exist_ok=True)
         | 
| 33 | 
            +
            DO_WARMUP = os.getenv("DO_WARMUP", "True").lower() == "true"
         | 
| 34 |  | 
| 35 |  | 
| 36 | 
            +
            def convert_document(path, method, start_page=0, enabled=True):
         | 
| 37 | 
             
                if enabled:
         | 
| 38 | 
             
                    print("Processing file", path, "with method", method)
         | 
| 39 | 
             
                else:
         | 
|  | |
| 42 | 
             
                # benchmarking
         | 
| 43 | 
             
                start = time.time()
         | 
| 44 |  | 
| 45 | 
            +
                path = trim_pages(
         | 
| 46 | 
            +
                    path,
         | 
| 47 | 
            +
                    output_path=TRIMMED_PDF_PATH,
         | 
| 48 | 
            +
                    start_page=start_page,
         | 
| 49 | 
            +
                )
         | 
| 50 | 
             
                file_name = Path(path).stem
         | 
| 51 | 
             
                debug_image_paths = []
         | 
| 52 | 
             
                text = "unknown method"
         | 
|  | |
| 64 | 
             
                    )
         | 
| 65 | 
             
                elif method == "MinerU":
         | 
| 66 | 
             
                    text, debug_image_paths = convert_mineru(path, file_name)
         | 
| 67 | 
            +
                elif method == "Gemini (API)":
         | 
| 68 | 
            +
                    text, debug_image_paths = convert_gemini(path, file_name)
         | 
| 69 | 
            +
                elif method == "Zerox":
         | 
| 70 | 
            +
                    text, debug_image_paths = convert_zerox(path, file_name)
         | 
| 71 | 
            +
                elif method == "Img2Table":
         | 
| 72 | 
            +
                    text, debug_image_paths = convert_img2table(path, file_name)
         | 
| 73 | 
            +
                elif method == "GMFT":
         | 
| 74 | 
            +
                    text, debug_image_paths = convert_gmft(path, file_name)
         | 
| 75 | 
            +
                else:
         | 
| 76 | 
            +
                    raise ValueError(f"Unsupported method: {method}")
         | 
| 77 |  | 
| 78 | 
             
                duration = time.time() - start
         | 
| 79 | 
             
                duration_message = f"Conversion with {method} took *{duration:.2f} seconds*"
         | 
|  | |
| 86 | 
             
                )
         | 
| 87 |  | 
| 88 |  | 
| 89 | 
            +
            def to_zip_file(file_path, methods, *output_components):
         | 
| 90 | 
            +
                markdown_text_dict = dict()
         | 
| 91 | 
            +
                debug_images_dict = defaultdict(list)
         | 
| 92 | 
            +
                for idx, method_name in enumerate(SUPPORTED_METHODS):
         | 
| 93 | 
            +
                    if method_name not in methods:
         | 
| 94 | 
            +
                        continue
         | 
| 95 | 
            +
             | 
| 96 | 
            +
                    markdown_text = output_components[idx * 4 + 2]
         | 
| 97 | 
            +
                    debug_images = output_components[idx * 4 + 3]
         | 
| 98 | 
            +
             | 
| 99 | 
            +
                    markdown_text_dict[method_name] = markdown_text
         | 
| 100 | 
            +
                    debug_images_dict[method_name] = debug_images
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                # write outputs next to the uploaded file (its parent directory), not a fresh temp dir
         | 
| 103 | 
            +
                temp_dir = Path(file_path).parent
         | 
| 104 | 
            +
                zip_file_path = temp_dir / "output.zip"
         | 
| 105 | 
            +
             | 
| 106 | 
            +
                markdown_path = temp_dir / f"{method_name}.md"
         | 
| 107 | 
            +
                with open(markdown_path, "w") as f:
         | 
| 108 | 
            +
                    f.write(markdown_text)
         | 
| 109 | 
            +
             | 
| 110 | 
            +
                # create a zip file in write mode
         | 
| 111 | 
            +
                with zipfile.ZipFile(zip_file_path, "w", zipfile.ZIP_DEFLATED) as zipf:
         | 
| 112 | 
            +
                    for method_name, markdown_text in markdown_text_dict.items():
         | 
| 113 | 
            +
                        debug_image_paths = debug_images_dict[method_name]
         | 
| 114 | 
            +
             | 
| 115 | 
            +
                        # write the markdown text to the zip file
         | 
| 116 | 
            +
                        zipf.write(
         | 
| 117 | 
            +
                            markdown_path,
         | 
| 118 | 
            +
                            f"{method_name}/{method_name}.md",
         | 
| 119 | 
            +
                        )
         | 
| 120 | 
            +
                        if debug_image_paths:
         | 
| 121 | 
            +
                            for idx, (debug_image_path, _) in enumerate(debug_image_paths):
         | 
| 122 | 
            +
                                debug_image_name = Path(debug_image_path).name
         | 
| 123 | 
            +
                                zipf.write(
         | 
| 124 | 
            +
                                    debug_image_path,
         | 
| 125 | 
            +
                                    f"{method_name}/{debug_image_name}",
         | 
| 126 | 
            +
                                )
         | 
| 127 | 
            +
             | 
| 128 | 
            +
                return gr.update(
         | 
| 129 | 
            +
                    value=str(zip_file_path),
         | 
| 130 | 
            +
                    visible=True,
         | 
| 131 | 
            +
                )
         | 
| 132 | 
            +
             | 
| 133 | 
            +
             | 
| 134 | 
             
            def show_tabs(selected_methods):
         | 
| 135 | 
             
                visible_tabs = []
         | 
| 136 | 
             
                for method in SUPPORTED_METHODS:
         | 
|  | |
| 147 | 
             
            # startup test (also for loading models the first time)
         | 
| 148 | 
             
            start_startup = time.time()
         | 
| 149 | 
             
            WARMUP_PDF_PATH = "table.pdf"
         | 
| 150 | 
            +
            SUPPORTED_METHODS = [
         | 
| 151 | 
            +
                "PyMuPDF",
         | 
| 152 | 
            +
                "Docling",
         | 
| 153 | 
            +
                "Marker",
         | 
| 154 | 
            +
                "MinerU",
         | 
| 155 | 
            +
                "Unstructured",
         | 
| 156 | 
            +
                "Gemini (API)",
         | 
| 157 | 
            +
                "Img2Table",
         | 
| 158 | 
            +
                "GMFT",
         | 
| 159 | 
            +
                # "Zerox"
         | 
| 160 | 
            +
            ]
         | 
| 161 |  | 
| 162 | 
            +
            if DO_WARMUP:
         | 
| 163 | 
            +
                print("Warm-up sequence")
         | 
| 164 | 
            +
                for method in SUPPORTED_METHODS:
         | 
| 165 | 
            +
                    for _ in range(1):
         | 
| 166 | 
            +
                        convert_document(WARMUP_PDF_PATH, method)
         | 
| 167 | 
            +
                startup_duration = time.time() - start_startup
         | 
| 168 | 
            +
                print(f"Total start-up time: {startup_duration:.2f} seconds")
         | 
| 169 |  | 
| 170 | 
             
            with gr.Blocks(
         | 
| 171 | 
             
                theme=gr.themes.Ocean(),
         | 
|  | |
| 185 | 
             
                                ".pdf",
         | 
| 186 | 
             
                            ],
         | 
| 187 | 
             
                        )
         | 
| 188 | 
            +
                        with gr.Accordion(
         | 
| 189 | 
            +
                            "Advanced settings",
         | 
| 190 | 
            +
                            open=False,
         | 
| 191 | 
            +
                        ):
         | 
| 192 | 
            +
                            start_page = gr.Number(
         | 
| 193 | 
            +
                                label="Starting page (only max 5 consecutive pages are processed)",
         | 
| 194 | 
            +
                                minimum=1,
         | 
| 195 | 
            +
                                maximum=100,
         | 
| 196 | 
            +
                                step=1,
         | 
| 197 | 
            +
                                value=1,
         | 
| 198 | 
            +
                            )
         | 
| 199 | 
            +
                            visual_checkbox = gr.Checkbox(
         | 
| 200 | 
            +
                                label="Enable debug visualization",
         | 
| 201 | 
            +
                                visible=ENABLE_DEBUG_MODE,
         | 
| 202 | 
            +
                                value=True,
         | 
| 203 | 
            +
                            )
         | 
| 204 | 
             
                        progress_status = gr.Markdown("", show_label=False, container=False)
         | 
| 205 | 
            +
                        output_file = gr.File(
         | 
| 206 | 
            +
                            label="Download output",
         | 
| 207 | 
            +
                            interactive=False,
         | 
| 208 | 
            +
                            visible=False,
         | 
| 209 | 
            +
                        )
         | 
| 210 |  | 
| 211 | 
             
                    with gr.Column(variant="panel", scale=5):
         | 
| 212 | 
             
                        with gr.Row():
         | 
|  | |
| 216 | 
             
                                value=SUPPORTED_METHODS[:2],
         | 
| 217 | 
             
                                multiselect=True,
         | 
| 218 | 
             
                            )
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 219 | 
             
                        with gr.Row():
         | 
| 220 | 
             
                            convert_btn = gr.Button("Convert", variant="primary", scale=2)
         | 
| 221 | 
             
                            clear_btn = gr.ClearButton(value="Clear", scale=1)
         | 
|  | |
| 304 |  | 
| 305 | 
             
                        return msg
         | 
| 306 |  | 
| 307 | 
            +
                    def process_method(input_file, start_page, selected_methods, method=method):
         | 
| 308 | 
             
                        if input_file is None:
         | 
| 309 | 
             
                            raise ValueError("Please upload a PDF file first!")
         | 
| 310 | 
             
                        return convert_document(
         | 
| 311 | 
            +
                            input_file,
         | 
| 312 | 
            +
                            method=method,
         | 
| 313 | 
            +
                            start_page=start_page - 1,
         | 
| 314 | 
            +
                            enabled=method in selected_methods,
         | 
| 315 | 
             
                        )
         | 
| 316 |  | 
| 317 | 
             
                    click_event = click_event.then(
         | 
|  | |
| 319 | 
             
                        inputs=[methods],
         | 
| 320 | 
             
                        outputs=[progress_status],
         | 
| 321 | 
             
                    ).then(
         | 
| 322 | 
            +
                        fn=lambda input_file, start_page, methods, method=method: process_method(
         | 
| 323 | 
            +
                            input_file, start_page, methods, method
         | 
| 324 | 
             
                        ),
         | 
| 325 | 
            +
                        inputs=[input_file, start_page, methods],
         | 
| 326 | 
             
                        outputs=output_components[idx * 4 : (idx + 1) * 4],
         | 
| 327 | 
             
                    )
         | 
| 328 |  | 
| 329 | 
            +
                click_event.then(lambda: "All tasks completed.", outputs=[progress_status],).then(
         | 
| 330 | 
            +
                    fn=to_zip_file,
         | 
| 331 | 
            +
                    inputs=[
         | 
| 332 | 
            +
                        input_file,
         | 
| 333 | 
            +
                        methods,
         | 
| 334 | 
            +
                    ]
         | 
| 335 | 
            +
                    + output_components,
         | 
| 336 | 
            +
                    outputs=[output_file],
         | 
| 337 | 
             
                )
         | 
| 338 |  | 
| 339 | 
             
                clear_btn.add(
         | 
| 340 | 
             
                    [
         | 
| 341 | 
             
                        input_file,
         | 
| 342 | 
             
                        pdf_preview,
         | 
| 343 | 
            +
                        output_file,
         | 
| 344 | 
             
                    ]
         | 
| 345 | 
             
                    + output_components
         | 
| 346 | 
             
                )
         | 
| 347 | 
            +
                clear_btn.click(
         | 
| 348 | 
            +
                    fn=lambda: gr.update(visible=False),
         | 
| 349 | 
            +
                    outputs=[output_file],
         | 
| 350 | 
            +
                )
         | 
| 351 |  | 
| 352 | 
             
                visual_checkbox.change(
         | 
| 353 | 
             
                    fn=lambda state: [gr.update(visible=state)] * len(visualization_sub_tabs),
         | 
|  | |
| 355 | 
             
                    outputs=visualization_sub_tabs,
         | 
| 356 | 
             
                )
         | 
| 357 |  | 
| 358 | 
            +
                demo.launch(
         | 
| 359 | 
            +
                    show_error=True,
         | 
| 360 | 
            +
                    max_file_size="50mb",
         | 
| 361 | 
            +
                )
         | 
    	
        backends/__init__.py
    CHANGED
    
    | @@ -1,11 +1,19 @@ | |
| 1 | 
             
            from .docling import convert_docling
         | 
|  | |
|  | |
|  | |
| 2 | 
             
            from .marker import convert_marker
         | 
| 3 | 
             
            from .mineru import convert_mineru
         | 
| 4 | 
             
            from .unstructured import convert_unstructured
         | 
|  | |
| 5 |  | 
| 6 | 
             
            __all__ = [
         | 
| 7 | 
             
                "convert_docling",
         | 
| 8 | 
             
                "convert_marker",
         | 
| 9 | 
             
                "convert_mineru",
         | 
| 10 | 
             
                "convert_unstructured",
         | 
|  | |
|  | |
|  | |
|  | |
| 11 | 
             
            ]
         | 
|  | |
| 1 | 
             
            from .docling import convert_docling
         | 
| 2 | 
            +
            from .gemini import convert_gemini
         | 
| 3 | 
            +
            from .gmft import convert_gmft
         | 
| 4 | 
            +
            from .img2table import convert_img2table
         | 
| 5 | 
             
            from .marker import convert_marker
         | 
| 6 | 
             
            from .mineru import convert_mineru
         | 
| 7 | 
             
            from .unstructured import convert_unstructured
         | 
| 8 | 
            +
            from .zerox import convert_zerox
         | 
| 9 |  | 
| 10 | 
             
            __all__ = [
         | 
| 11 | 
             
                "convert_docling",
         | 
| 12 | 
             
                "convert_marker",
         | 
| 13 | 
             
                "convert_mineru",
         | 
| 14 | 
             
                "convert_unstructured",
         | 
| 15 | 
            +
                "convert_gemini",
         | 
| 16 | 
            +
                "convert_zerox",
         | 
| 17 | 
            +
                "convert_img2table",
         | 
| 18 | 
            +
                "convert_gmft",
         | 
| 19 | 
             
            ]
         | 
    	
        backends/gemini.py
    ADDED
    
    | @@ -0,0 +1,43 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import os
         | 
| 2 | 
            +
            from pathlib import Path
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            from google import genai
         | 
| 5 | 
            +
            from google.genai import types
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            # Create a client
         | 
| 8 | 
            +
            client = genai.Client(api_key=os.getenv("GEMINI_API_KEY", ""))
         | 
| 9 | 
            +
            MODEL_NAME = "gemini-2.0-flash"
         | 
| 10 | 
            +
            PROMPT = """
         | 
| 11 | 
            +
            Convert the following document to markdown, preserving header, table and figure structure as much as possible.
         | 
| 12 | 
            +
            Return only the markdown with no explanation text. Do not include delimiters like ```markdown or ```html.
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            RULES:
         | 
| 15 | 
            +
                - You must include all information on the page. Do not exclude headers, footers, or subtext.
         | 
| 16 | 
            +
                - Return tables in Markdown format.
         | 
| 17 | 
            +
                - Must format headers / sub-headers in Markdown format (#, ##, etc).
         | 
| 18 | 
            +
                - Attempt to merge line-breaks in to coherent paragraphs.
         | 
| 19 | 
            +
                - Charts & infographics must be interpreted to a text-based markdown format. Prefer table format when applicable.
         | 
| 20 | 
            +
                - Do not include any images URL / tag in the markdown.
         | 
| 21 | 
            +
                - Page numbers should be wrapped in brackets. Ex: <page_number>14<page_number> or <page_number>9/22<page_number>
         | 
| 22 | 
            +
                - Prefer using ☐ and ☑ for check boxes.
         | 
| 23 | 
            +
            """  # noqa: E501
         | 
| 24 | 
            +
             | 
| 25 | 
            +
             | 
| 26 | 
            +
            def convert_gemini(path: str, file_name: str):
         | 
| 27 | 
            +
                # Generate a structured response using the Gemini API
         | 
| 28 | 
            +
                generation_config = types.GenerationConfig(
         | 
| 29 | 
            +
                    max_output_tokens=8192,
         | 
| 30 | 
            +
                ).to_json_dict()
         | 
| 31 | 
            +
                response = client.models.generate_content(
         | 
| 32 | 
            +
                    model=MODEL_NAME,
         | 
| 33 | 
            +
                    contents=[
         | 
| 34 | 
            +
                        PROMPT,
         | 
| 35 | 
            +
                        types.Part.from_bytes(
         | 
| 36 | 
            +
                            data=Path(path).read_bytes(),
         | 
| 37 | 
            +
                            mime_type="application/pdf",
         | 
| 38 | 
            +
                        ),
         | 
| 39 | 
            +
                    ],
         | 
| 40 | 
            +
                    config=generation_config,
         | 
| 41 | 
            +
                )
         | 
| 42 | 
            +
                # Return the generated markdown text and an empty list of debug images
         | 
| 43 | 
            +
                return response.text, []
         | 
    	
        backends/gmft.py
    ADDED
    
    | @@ -0,0 +1,54 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            from pathlib import Path
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            from gmft.auto import AutoFormatConfig, AutoTableFormatter, CroppedTable, TableDetector
         | 
| 4 | 
            +
            from gmft.pdf_bindings import PyPDFium2Document
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            from .settings import ENABLE_DEBUG_MODE
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            # Module-level objects built once at import time and shared by every call
            # to ingest_pdf / convert_gmft below.
            detector = TableDetector()
            config = AutoFormatConfig()
            config.semantic_spanning_cells = True  # [Experimental] better spanning cells
            config.enable_multi_header = True  # multi-headers
            formatter = AutoTableFormatter(config)


            # Root directory under which convert_gmft writes per-document debug images.
            # Created as an import side effect; only the leaf is created (no parents).
            GMFT_DEBUG_PATH = Path("/tmp/gmft")
            GMFT_DEBUG_PATH.mkdir(exist_ok=True)
         | 
| 17 | 
            +
             | 
| 18 | 
            +
             | 
| 19 | 
            +
            def ingest_pdf(pdf_path) -> list[CroppedTable]:
                """Run the shared table detector over every page of a PDF.

                Args:
                    pdf_path: Path of the PDF to open with ``PyPDFium2Document``.

                Returns:
                    All tables returned by ``detector.extract``, in page order.
                """
                # NOTE(review): the document is never closed here; the returned tables
                # presumably still need it open for later formatting - confirm.
                document = PyPDFium2Document(pdf_path)
                return [
                    cropped
                    for pdf_page in document
                    for cropped in detector.extract(pdf_page)
                ]
         | 
| 26 | 
            +
             | 
| 27 | 
            +
             | 
| 28 | 
            +
            def convert_gmft(path: str, file_name: str):
                """Extract every table in a PDF with gmft and render each one as HTML.

                Args:
                    path: Location of the PDF on disk.
                    file_name: Name of the per-document sub-directory of
                        ``GMFT_DEBUG_PATH`` that receives debug images.

                Returns:
                    A ``(content, debug_image_paths)`` tuple. ``content`` is the HTML of
                    all tables joined by blank lines; ``debug_image_paths`` lists the
                    table images written to disk and is empty unless
                    ``ENABLE_DEBUG_MODE`` is set.
                """
                tables = ingest_pdf(path)
                formatted_tables = []
                debug_image_paths = []

                debug_path = GMFT_DEBUG_PATH / file_name
                if ENABLE_DEBUG_MODE:
                    # Only touch the filesystem when debug output is wanted, the same
                    # way convert_img2table does. parents=True keeps this working if
                    # the root directory was removed after import.
                    debug_path.mkdir(parents=True, exist_ok=True)

                for idx, table in enumerate(tables):
                    ft = formatter.extract(table, dpi=72 * 2)
                    df = ft.df()
                    if df is not None:
                        # Blank out missing cells so they do not render as "NaN".
                        formatted_tables.append(df.fillna("").to_html(index=False))

                    if ENABLE_DEBUG_MODE:
                        image_path = debug_path / f"table_{idx}.png"
                        ft.image().save(image_path)
                        debug_image_paths.append(image_path)

                content = "\n\n".join(formatted_tables)
                return content, debug_image_paths
         | 
    	
        backends/img2table.py
    ADDED
    
    | @@ -0,0 +1,55 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            from pathlib import Path
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            import cv2
         | 
| 4 | 
            +
            from img2table.document import PDF
         | 
| 5 | 
            +
            from img2table.ocr import SuryaOCR
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            from .settings import ENABLE_DEBUG_MODE
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            # OCR engine shared by every convert_img2table call; constructed once at
            # import time with English as the only configured language.
            ocr = SuryaOCR(
                langs=["en"],
            )
            # Root directory for per-document debug page images; created as an import
            # side effect (leaf only, no parents).
            IMG2TABLE_DEBUG_PATH = Path("/tmp/img2table")
            IMG2TABLE_DEBUG_PATH.mkdir(exist_ok=True)
         | 
| 14 | 
            +
             | 
| 15 | 
            +
             | 
| 16 | 
            +
            def convert_img2table(path: str, file_name: str):
                """Extract tables from a PDF with img2table and return them as HTML.

                Args:
                    path: Location of the PDF on disk.
                    file_name: Name of the per-document sub-directory of
                        ``IMG2TABLE_DEBUG_PATH`` used for debug images.

                Returns:
                    A ``(content, debug_image_paths)`` tuple. ``content`` holds each
                    table's title (or an empty string) followed by its HTML, joined by
                    blank lines. ``debug_image_paths`` lists the annotated page images
                    written when ``ENABLE_DEBUG_MODE`` is set, otherwise it is empty.
                """
                document = PDF(path)
                tables_by_page = document.extract_tables(
                    ocr=ocr,
                    implicit_rows=False,
                    implicit_columns=False,
                    borderless_tables=True,
                    min_confidence=50,
                )
                debug_image_paths = []

                if ENABLE_DEBUG_MODE:
                    out_dir = IMG2TABLE_DEBUG_PATH / file_name
                    out_dir.mkdir(exist_ok=True)

                    page_images = document.images
                    # Use the document's explicit page list when it has one, otherwise
                    # fall back to one index per rendered image.
                    page_numbers = document.pages or range(len(page_images))
                    for position, page_number in enumerate(page_numbers):
                        canvas = page_images[position]
                        page_cells = (
                            cell
                            for table in tables_by_page[page_number]
                            for row in table.content.values()
                            for cell in row
                        )
                        # Outline every detected cell directly on the page image.
                        for cell in page_cells:
                            box = cell.bbox
                            cv2.rectangle(
                                canvas,
                                (box.x1, box.y1),
                                (box.x2, box.y2),
                                (0, 0, 255),
                                2,
                            )
                        target = out_dir / f"page_{position}.png"
                        debug_image_paths.append(target)
                        cv2.imwrite(str(target), canvas)

                content = "\n\n".join(
                    (table.title or "") + "\n\n" + table.html
                    for page_tables in tables_by_page.values()
                    for table in page_tables
                )
                return content, debug_image_paths
         | 
    	
        backends/zerox.py
    ADDED
    
    | @@ -0,0 +1,43 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import asyncio
         | 
| 2 | 
            +
            import re
         | 
| 3 | 
            +
            from pathlib import Path
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            from pyzerox import zerox
         | 
| 6 | 
            +
             | 
| 7 | 
            +
             | 
| 8 | 
            +
            def remove_images_from_markdown(markdown_text):
                """Strip image references from markdown text.

                Removes HTML ``<img ...>`` tags and markdown ``![alt](url)`` images;
                all other text, including ordinary ``[text](url)`` links, is kept.

                Args:
                    markdown_text: The markdown string to clean.

                Returns:
                    The input with every image tag / image link deleted.
                """
                # HTML tag names are case-insensitive, so match <IMG ...> as well.
                markdown_text = re.sub(r"<img[^>]*>", "", markdown_text, flags=re.IGNORECASE)
                markdown_text = re.sub(r"!\[[^\]]*\]\([^)]*\)", "", markdown_text)
                return markdown_text
         | 
| 13 | 
            +
             | 
| 14 | 
            +
             | 
| 15 | 
            +
            # Root directory for zerox output, one sub-directory per document; created
            # as an import side effect (leaf only, no parents).
            ZEROX_DEBUG_PATH = Path("/tmp/zerox_debug")
            ZEROX_DEBUG_PATH.mkdir(exist_ok=True)
            # Model identifier passed to zerox as its ``model`` argument.
            MODEL_NAME = "gemini/gemini-2.0-flash"
         | 
| 18 | 
            +
             | 
| 19 | 
            +
             | 
| 20 | 
            +
            def clean_up_html_code_block(text: str):
                """Delete markdown code-fence markers from ``text``.

                Every "```html" opener is removed first, then every remaining "```";
                the content between the fences is left in place.
                """
                # Order matters: strip the longer "```html" marker before the bare fence.
                for fence in ("```html", "```"):
                    text = text.replace(fence, "")
                return text
         | 
| 25 | 
            +
             | 
| 26 | 
            +
             | 
| 27 | 
            +
            def convert_zerox(path: str, file_name: str):
                """Convert a PDF to markdown with zerox and clean up the result.

                Args:
                    path: Location of the PDF on disk, passed to zerox as ``file_path``.
                    file_name: Name of the per-document sub-directory of
                        ``ZEROX_DEBUG_PATH`` handed to zerox as its output directory.

                Returns:
                    A ``(text, debug_images)`` tuple: the page contents joined by blank
                    lines, with code fences and image references removed, and an
                    always-empty list.
                """
                output_dir = ZEROX_DEBUG_PATH / file_name
                output_dir.mkdir(exist_ok=True)

                async def _run_zerox():
                    result = await zerox(
                        concurrency=4,
                        file_path=path,
                        model=MODEL_NAME,
                        output_dir=output_dir,
                    )
                    return result

                # zerox is awaited inside its own event loop so this function stays synchronous.
                result = asyncio.run(_run_zerox())
                page_texts = [page.content for page in result.pages]
                markdown = "\n\n".join(page_texts)
                markdown = clean_up_html_code_block(markdown)
                return remove_images_from_markdown(markdown), []
         | 
    	
        requirements.txt
    CHANGED
    
    | @@ -14,5 +14,7 @@ unstructured[pdf] | |
| 14 | 
             
            ultralytics>=8.3.48
         | 
| 15 | 
             
            transformers<5.0.0,>=4.45.2
         | 
| 16 | 
             
            struct-eqtable==0.3.2
         | 
| 17 | 
            -
            openai
         | 
| 18 | 
             
            doclayout_yolo==0.0.2b1
         | 
|  | |
|  | |
|  | 
|  | |
| 14 | 
             
            ultralytics>=8.3.48
         | 
| 15 | 
             
            transformers<5.0.0,>=4.45.2
         | 
| 16 | 
             
            struct-eqtable==0.3.2
         | 
|  | |
| 17 | 
             
            doclayout_yolo==0.0.2b1
         | 
| 18 | 
            +
            openai
         | 
| 19 | 
            +
            opencv-contrib-python
         | 
| 20 | 
            +
            gmft
         | 
    	
        utils.py
    CHANGED
    
    | @@ -14,14 +14,14 @@ def remove_images_from_markdown(markdown_text): | |
| 14 |  | 
| 15 |  | 
| 16 | 
             
            @functools.lru_cache(maxsize=None)
         | 
| 17 | 
            -
            def trim_pages(pdf_path, output_path, trim_pages=5):
         | 
| 18 | 
             
                doc = pymupdf.open(pdf_path)
         | 
| 19 | 
             
                parent_dir_name = Path(pdf_path).parent.name
         | 
| 20 | 
             
                output_file_path = Path(output_path) / f"{parent_dir_name}.pdf"
         | 
| 21 |  | 
| 22 | 
             
                num_pages = len(doc)
         | 
| 23 | 
             
                if num_pages > trim_pages:
         | 
| 24 | 
            -
                    to_select = list(range(trim_pages))
         | 
| 25 | 
             
                    doc.select(to_select)
         | 
| 26 | 
             
                    doc.ez_save(output_file_path)
         | 
| 27 | 
             
                    print("Trimmed pdf to with pages", to_select, "path", output_file_path)
         | 
|  | |
| 14 |  | 
| 15 |  | 
| 16 | 
             
            @functools.lru_cache(maxsize=None)
         | 
| 17 | 
            +
            def trim_pages(pdf_path, output_path, start_page=0, trim_pages=5):
         | 
| 18 | 
             
                doc = pymupdf.open(pdf_path)
         | 
| 19 | 
             
                parent_dir_name = Path(pdf_path).parent.name
         | 
| 20 | 
             
                output_file_path = Path(output_path) / f"{parent_dir_name}.pdf"
         | 
| 21 |  | 
| 22 | 
             
                num_pages = len(doc)
         | 
| 23 | 
             
                if num_pages > trim_pages:
         | 
| 24 | 
            +
                    to_select = list(range(start_page, min(start_page + trim_pages, num_pages)))
         | 
| 25 | 
             
                    doc.select(to_select)
         | 
| 26 | 
             
                    doc.ez_save(output_file_path)
         | 
| 27 | 
             
                    print("Trimmed pdf to with pages", to_select, "path", output_file_path)
         |