taprosoft
		
	committed on
		
		
					Commit 
							
							·
						
						188f052
	
1
								Parent(s):
							
							0933b39
								
fix: skip problematic import
Browse files
- .pre-commit-config.yaml +1 -1
- README.md +3 -3
- app.py +28 -6
- requirements.txt +1 -3
- utils.py +18 -0
    	
        .pre-commit-config.yaml
    CHANGED
    
    | @@ -29,7 +29,7 @@ repos: | |
| 29 | 
             
                rev: 4.0.1
         | 
| 30 | 
             
                hooks:
         | 
| 31 | 
             
                  - id: flake8
         | 
| 32 | 
            -
                    args: ["--max-line-length", "88", "--extend-ignore", "E203"]
         | 
| 33 | 
             
              - repo: https://github.com/myint/autoflake
         | 
| 34 | 
             
                rev: v1.4
         | 
| 35 | 
             
                hooks:
         | 
|  | |
| 29 | 
             
                rev: 4.0.1
         | 
| 30 | 
             
                hooks:
         | 
| 31 | 
             
                  - id: flake8
         | 
| 32 | 
            +
                    args: ["--max-line-length", "88", "--extend-ignore", "E203,E402"]
         | 
| 33 | 
             
              - repo: https://github.com/myint/autoflake
         | 
| 34 | 
             
                rev: v1.4
         | 
| 35 | 
             
                hooks:
         | 
    	
        README.md
    CHANGED
    
    | @@ -1,13 +1,13 @@ | |
| 1 | 
             
            ---
         | 
| 2 | 
            -
            title:  | 
| 3 | 
             
            emoji: 🐢
         | 
| 4 | 
             
            colorFrom: blue
         | 
| 5 | 
            -
            colorTo:  | 
| 6 | 
             
            sdk: gradio
         | 
| 7 | 
             
            sdk_version: 5.7.1
         | 
| 8 | 
             
            app_file: app.py
         | 
| 9 | 
             
            pinned: false
         | 
| 10 | 
            -
            short_description: Convert documents to Markdown  | 
| 11 | 
             
            ---
         | 
| 12 |  | 
| 13 | 
             
            Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
         | 
|  | |
| 1 | 
             
            ---
         | 
| 2 | 
            +
            title: PDFParsersPlayground
         | 
| 3 | 
             
            emoji: 🐢
         | 
| 4 | 
             
            colorFrom: blue
         | 
| 5 | 
            +
            colorTo: green
         | 
| 6 | 
             
            sdk: gradio
         | 
| 7 | 
             
            sdk_version: 5.7.1
         | 
| 8 | 
             
            app_file: app.py
         | 
| 9 | 
             
            pinned: false
         | 
| 10 | 
            +
            short_description: Convert PDF documents to Markdown with multiple open-source parsers
         | 
| 11 | 
             
            ---
         | 
| 12 |  | 
| 13 | 
             
            Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
         | 
    	
        app.py
    CHANGED
    
    | @@ -1,3 +1,8 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1 | 
             
            import time
         | 
| 2 | 
             
            from pathlib import Path
         | 
| 3 |  | 
| @@ -46,9 +51,15 @@ def convert_document(path, method, enabled=True): | |
| 46 | 
             
                elif method == "MinerU":
         | 
| 47 | 
             
                    text, debug_image_paths = convert_mineru(path, file_name)
         | 
| 48 |  | 
| 49 | 
            -
                 | 
| 50 | 
            -
                 | 
| 51 | 
            -
                 | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 52 |  | 
| 53 |  | 
| 54 | 
             
            def show_tabs(selected_methods):
         | 
| @@ -73,7 +84,8 @@ print("Warm-up sequence") | |
| 73 | 
             
            for method in SUPPORTED_METHODS:
         | 
| 74 | 
             
                for _ in range(1):
         | 
| 75 | 
             
                    convert_document(WARMUP_PDF_PATH, method)
         | 
| 76 | 
            -
             | 
|  | |
| 77 |  | 
| 78 | 
             
            with gr.Blocks(
         | 
| 79 | 
             
                theme=gr.themes.Ocean(),
         | 
| @@ -149,9 +161,19 @@ with gr.Blocks( | |
| 149 | 
             
                                            markdown_text = gr.TextArea(
         | 
| 150 | 
             
                                                lines=45, show_label=False, container=False
         | 
| 151 | 
             
                                            )
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 152 |  | 
| 153 | 
             
                                output_components.extend(
         | 
| 154 | 
            -
                                    [ | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 155 | 
             
                                )
         | 
| 156 | 
             
                                output_tabs.append(output_tab)
         | 
| 157 | 
             
                                visualization_sub_tabs.append(visual_sub_tab)
         | 
| @@ -199,7 +221,7 @@ with gr.Blocks( | |
| 199 | 
             
                            input_file, methods, method
         | 
| 200 | 
             
                        ),
         | 
| 201 | 
             
                        inputs=[input_file, methods],
         | 
| 202 | 
            -
                        outputs=output_components[idx *  | 
| 203 | 
             
                    )
         | 
| 204 |  | 
| 205 | 
             
                click_event.then(
         | 
|  | |
| 1 | 
            +
            from utils import fix_problematic_imports  # noqa
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            fix_problematic_imports()  # noqa
         | 
| 4 | 
            +
             | 
| 5 | 
            +
             | 
| 6 | 
             
            import time
         | 
| 7 | 
             
            from pathlib import Path
         | 
| 8 |  | 
|  | |
| 51 | 
             
                elif method == "MinerU":
         | 
| 52 | 
             
                    text, debug_image_paths = convert_mineru(path, file_name)
         | 
| 53 |  | 
| 54 | 
            +
                duration = time.time() - start
         | 
| 55 | 
            +
                duration_message = f"Conversion with {method} took *{duration:.2f} seconds*"
         | 
| 56 | 
            +
                print(duration_message)
         | 
| 57 | 
            +
                return (
         | 
| 58 | 
            +
                    duration_message,
         | 
| 59 | 
            +
                    text,
         | 
| 60 | 
            +
                    remove_images_from_markdown(text),
         | 
| 61 | 
            +
                    debug_image_paths,
         | 
| 62 | 
            +
                )
         | 
| 63 |  | 
| 64 |  | 
| 65 | 
             
            def show_tabs(selected_methods):
         | 
|  | |
| 84 | 
             
            for method in SUPPORTED_METHODS:
         | 
| 85 | 
             
                for _ in range(1):
         | 
| 86 | 
             
                    convert_document(WARMUP_PDF_PATH, method)
         | 
| 87 | 
            +
            startup_duration = time.time() - start_startup
         | 
| 88 | 
            +
            print(f"Total start-up time: {startup_duration:.2f} seconds")
         | 
| 89 |  | 
| 90 | 
             
            with gr.Blocks(
         | 
| 91 | 
             
                theme=gr.themes.Ocean(),
         | 
|  | |
| 161 | 
             
                                            markdown_text = gr.TextArea(
         | 
| 162 | 
             
                                                lines=45, show_label=False, container=False
         | 
| 163 | 
             
                                            )
         | 
| 164 | 
            +
                                        with gr.Tab("Reference"):
         | 
| 165 | 
            +
                                            output_description = gr.Markdown(
         | 
| 166 | 
            +
                                                container=False,
         | 
| 167 | 
            +
                                                show_label=False,
         | 
| 168 | 
            +
                                            )
         | 
| 169 |  | 
| 170 | 
             
                                output_components.extend(
         | 
| 171 | 
            +
                                    [
         | 
| 172 | 
            +
                                        output_description,
         | 
| 173 | 
            +
                                        markdown_render,
         | 
| 174 | 
            +
                                        markdown_text,
         | 
| 175 | 
            +
                                        debug_images,
         | 
| 176 | 
            +
                                    ]
         | 
| 177 | 
             
                                )
         | 
| 178 | 
             
                                output_tabs.append(output_tab)
         | 
| 179 | 
             
                                visualization_sub_tabs.append(visual_sub_tab)
         | 
|  | |
| 221 | 
             
                            input_file, methods, method
         | 
| 222 | 
             
                        ),
         | 
| 223 | 
             
                        inputs=[input_file, methods],
         | 
| 224 | 
            +
                        outputs=output_components[idx * 4 : (idx + 1) * 4],
         | 
| 225 | 
             
                    )
         | 
| 226 |  | 
| 227 | 
             
                click_event.then(
         | 
    	
        requirements.txt
    CHANGED
    
    | @@ -1,5 +1,3 @@ | |
| 1 | 
            -
            torch>=2.2.2,<=2.3.1
         | 
| 2 | 
            -
            torchvision>=0.17.2,<=0.18.1
         | 
| 3 | 
             
            paddlepaddle-gpu @ https://paddle-whl.bj.bcebos.com/stable/cu118/paddlepaddle-gpu/paddlepaddle_gpu-3.0.0b1-cp310-cp310-linux_x86_64.whl
         | 
| 4 | 
             
            detectron2 @ https://wheels-1251341229.cos.ap-shanghai.myqcloud.com/assets/whl/detectron2/detectron2-0.6-cp310-cp310-linux_x86_64.whl
         | 
| 5 | 
             
            paddleocr==2.7.3
         | 
| @@ -14,7 +12,7 @@ PyMuPDF>=1.24.9,<1.24.14 | |
| 14 | 
             
            pymupdf4llm
         | 
| 15 | 
             
            unstructured[pdf]
         | 
| 16 | 
             
            ultralytics>=8.3.48
         | 
| 17 | 
            -
            unimernet==0.2.3
         | 
| 18 | 
             
            transformers<5.0.0,>=4.45.2
         | 
| 19 | 
             
            struct-eqtable==0.3.2
         | 
| 20 | 
             
            openai
         | 
|  | 
|  | |
|  | |
|  | |
| 1 | 
             
            paddlepaddle-gpu @ https://paddle-whl.bj.bcebos.com/stable/cu118/paddlepaddle-gpu/paddlepaddle_gpu-3.0.0b1-cp310-cp310-linux_x86_64.whl
         | 
| 2 | 
             
            detectron2 @ https://wheels-1251341229.cos.ap-shanghai.myqcloud.com/assets/whl/detectron2/detectron2-0.6-cp310-cp310-linux_x86_64.whl
         | 
| 3 | 
             
            paddleocr==2.7.3
         | 
|  | |
| 12 | 
             
            pymupdf4llm
         | 
| 13 | 
             
            unstructured[pdf]
         | 
| 14 | 
             
            ultralytics>=8.3.48
         | 
|  | |
| 15 | 
             
            transformers<5.0.0,>=4.45.2
         | 
| 16 | 
             
            struct-eqtable==0.3.2
         | 
| 17 | 
             
            openai
         | 
| 18 | 
            +
            doclayout_yolo==0.0.2b1
         | 
    	
        utils.py
    CHANGED
    
    | @@ -29,3 +29,21 @@ def trim_pages(pdf_path, output_path, trim_pages=5): | |
| 29 | 
             
                    copy2(pdf_path, str(output_file_path))
         | 
| 30 |  | 
| 31 | 
             
                return str(output_file_path)
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 29 | 
             
                    copy2(pdf_path, str(output_file_path))
         | 
| 30 |  | 
| 31 | 
             
                return str(output_file_path)
         | 
| 32 | 
            +
             | 
| 33 | 
            +
             | 
| 34 | 
            +
            def fix_problematic_imports():
         | 
| 35 | 
            +
                import sys
         | 
| 36 | 
            +
                import types
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                # Create a fake 'UnimernetModel' class inside a fake 'Unimernet' module
         | 
| 39 | 
            +
                fake_unimernet_module = types.ModuleType(
         | 
| 40 | 
            +
                    "magic_pdf.model.sub_modules.mfr.unimernet.Unimernet"
         | 
| 41 | 
            +
                )
         | 
| 42 | 
            +
                fake_unimernet_module.UnimernetModel = type(  # type: ignore
         | 
| 43 | 
            +
                    "UnimernetModel", (), {}
         | 
| 44 | 
            +
                )
         | 
| 45 | 
            +
             | 
| 46 | 
            +
                # Register fake module in sys.modules
         | 
| 47 | 
            +
                sys.modules[
         | 
| 48 | 
            +
                    "magic_pdf.model.sub_modules.mfr.unimernet.Unimernet"
         | 
| 49 | 
            +
                ] = fake_unimernet_module
         |