Spaces:

marcosremar2
/

docker_mineru

Sleeping

App Files Community

marcosremar2 committed on May 3

Commit

53a34c2

1 Parent(s): ab599b4

Fix: Use PymuDocDataset in API endpoint

Browse files

Files changed (1) hide show

app.py +69 -27

app.py CHANGED Viewed

@@ -1,26 +1,30 @@
 from fastapi import FastAPI, UploadFile, File, HTTPException
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
-import magic_pdf
 import tempfile
 import os
 import json
 import traceback
-import uvicorn
 from datetime import datetime
 from typing import Dict, List, Any, Optional
 # Application metadata
 app_description = """
 # MinerU PDF Processor API
 This API provides PDF processing capabilities using MinerU's magic-pdf library.
-It extracts text content and tables from PDF documents.
 ## Features:
 - PDF text extraction
-- Table detection and extraction
-- JSON response for easy integration
 """
 app = FastAPI(
@@ -41,6 +45,11 @@ app.add_middleware(
     allow_headers=["*"],  # Allow all headers
 )
 # Health check endpoint
 @app.get("/health", tags=["Health"])
 async def health_check() -> Dict[str, Any]:
@@ -57,13 +66,13 @@ async def health_check() -> Dict[str, Any]:
 @app.post("/extract", tags=["PDF Processing"])
 async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
     """
-    Extract text and tables from a PDF file.
     Parameters:
         file: The PDF file to process
     Returns:
-        A JSON object containing the extracted content with pages, text blocks, and tables
     """
     if not file.filename or not file.filename.lower().endswith('.pdf'):
         raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")
@@ -76,35 +85,66 @@ async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
         with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
             temp_pdf.write(content)
             temp_pdf_path = temp_pdf.name
-        # Process the PDF using magic_pdf.PDF class
-        result = magic_pdf.PDF(temp_pdf_path).parse()
-        # Convert result to dictionary
-        output = {
             "filename": file.filename,
-            "pages": []
         }
-        for page in result.pages:
-            page_data = {
-                "page_num": page.page_num,
-                "text": "\n".join([block.text for block in page.text_blocks]),
-                "tables": []
-            }
-            for table in page.tables:
-                page_data["tables"].append(table.to_markdown())
-            output["pages"].append(page_data)
-        return {"result": output}
     except Exception as e:
         error_detail = str(e)
         error_trace = traceback.format_exc()
-        # Log the error (would be better with a proper logger)
         print(f"Error processing PDF: {error_detail}")
         print(error_trace)
@@ -126,4 +166,6 @@ async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
                 pass
 if __name__ == "__main__":
     uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)

 from fastapi import FastAPI, UploadFile, File, HTTPException
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 import tempfile
 import os
 import json
 import traceback
 from datetime import datetime
 from typing import Dict, List, Any, Optional
+# Import necessary components from magic_pdf based on convert_pdf.py
+from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+from magic_pdf.config.enums import SupportedPdfParseMethod
 # Application metadata
 app_description = """
 # MinerU PDF Processor API
 This API provides PDF processing capabilities using MinerU's magic-pdf library.
+It extracts text content and generates markdown from PDF documents.
 ## Features:
 - PDF text extraction
+- Markdown conversion
+- Layout analysis (via output files)
 """
 app = FastAPI(
     allow_headers=["*"],  # Allow all headers
 )
+# Define output directories (relative to the app's working directory in the container)
+local_image_dir, local_md_dir = "output/images", "output"
+os.makedirs(local_image_dir, exist_ok=True)
+os.makedirs(local_md_dir, exist_ok=True)
 # Health check endpoint
 @app.get("/health", tags=["Health"])
 async def health_check() -> Dict[str, Any]:
 @app.post("/extract", tags=["PDF Processing"])
 async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
     """
+    Process a PDF file using PymuDocDataset and return the extracted markdown content.
     Parameters:
         file: The PDF file to process
     Returns:
+        A JSON object containing the extracted markdown and status.
     """
     if not file.filename or not file.filename.lower().endswith('.pdf'):
         raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")
         with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
             temp_pdf.write(content)
             temp_pdf_path = temp_pdf.name
+        # Clear previous output files (optional, depending on desired behavior)
+        # You might want to handle output naming differently in a multi-user API context
+        # For simplicity, we'll clear the output dir here like in convert_pdf.py
+        for item in os.listdir(local_image_dir):
+            os.remove(os.path.join(local_image_dir, item))
+        for item in os.listdir(local_md_dir):
+             if os.path.isfile(os.path.join(local_md_dir, item)):
+                 os.remove(os.path.join(local_md_dir, item))
+        # Get filename and prepare output paths for magic-pdf
+        pdf_file_name = os.path.basename(temp_pdf_path)
+        name_without_suff = os.path.splitext(pdf_file_name)[0]
+        image_dir_rel_path = str(os.path.basename(local_image_dir)) # Relative path for markdown image links
+        # Setup writers
+        image_writer = FileBasedDataWriter(local_image_dir)
+        md_writer = FileBasedDataWriter(local_md_dir)
+        # Use PymuDocDataset for processing
+        ds = PymuDocDataset(content) # Pass pdf bytes directly
+        # Inference and pipeline based on PDF type
+        if ds.classify() == SupportedPdfParseMethod.OCR:
+            infer_result = ds.apply(doc_analyze, ocr=True)
+            pipe_result = infer_result.pipe_ocr_mode(image_writer)
+        else:
+            infer_result = ds.apply(doc_analyze, ocr=False)
+            pipe_result = infer_result.pipe_txt_mode(image_writer)
+        # Optional: Generate intermediate output files (comment out if not needed for API)
+        infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
+        pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
+        pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
+        pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir_rel_path)
+        pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
+        # Get markdown content
+        md_content = pipe_result.get_markdown(image_dir_rel_path)
+        # Dump markdown to file (optional for API, but useful for debugging/access)
+        md_file_path = f"{name_without_suff}.md"
+        pipe_result.dump_md(md_writer, md_file_path, image_dir_rel_path)
+        print(f"Markdown saved to: {os.path.join(local_md_dir, md_file_path)}")
+        # Return the markdown content in the response
+        return {
             "filename": file.filename,
+            "status": "success",
+            "markdown_content": md_content
+            # You could potentially add links to the generated files here if needed
+            # "output_files": { ... }
         }
     except Exception as e:
         error_detail = str(e)
         error_trace = traceback.format_exc()
+        # Log the error
         print(f"Error processing PDF: {error_detail}")
         print(error_trace)
                 pass
 if __name__ == "__main__":
+    # Keep uvicorn import here for local running
+    import uvicorn
     uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)