Spaces:
Runtime error
Runtime error
# Copyright (c) Opendatalab. All rights reserved. | |
import base64 | |
import json | |
import os | |
import time | |
import zipfile | |
from pathlib import Path | |
import re | |
import uuid | |
import pymupdf | |
from io import BytesIO | |
from fastapi import FastAPI, File, UploadFile | |
from fastapi.responses import JSONResponse | |
import uvicorn | |
# Initialize FastAPI app
app = FastAPI()


def _run_setup_command(command):
    """Run one shell setup command and report a non-zero exit status.

    The command is executed through ``os.system``. A failure is printed but
    does not abort start-up, so the best-effort behaviour of the setup
    sequence is kept; the message only makes a failed step visible.

    Args:
        command: Shell command line to execute.

    Returns:
        The status value returned by ``os.system``.
    """
    status = os.system(command)
    if status != 0:
        print(f"[setup] command exited with status {status}: {command}", flush=True)
    return status


# Setup and installation commands. Order matters: the existing magic-pdf
# package is removed before the dev branch is installed, and the model
# download script is fetched before it is executed.
for _setup_command in (
    'pip uninstall -y magic-pdf',
    'pip install git+https://github.com/opendatalab/MinerU.git@dev',
    'wget https://github.com/opendatalab/MinerU/raw/dev/scripts/download_models_hf.py -O download_models_hf.py',
    'python download_models_hf.py',
):
    _run_setup_command(_setup_command)
# Configure magic-pdf settings: load the JSON config, force CUDA mode,
# optionally enable LLM-aided titles, then write the file back.
_MAGIC_PDF_CONFIG = '/home/user/magic-pdf.json'

with open(_MAGIC_PDF_CONFIG, 'r') as _config_in:
    data = json.load(_config_in)

data['device-mode'] = "cuda"

# The title-aided section is only touched when the 'apikey' environment
# variable is set to a non-empty value.
_api_key = os.getenv('apikey')
if _api_key:
    _title_aided = data['llm-aided-config']['title_aided']
    _title_aided['api_key'] = _api_key
    _title_aided['enable'] = True

with open(_MAGIC_PDF_CONFIG, 'w') as _config_out:
    json.dump(data, _config_out, indent=4)

# Copy the local paddleocr directory to /home/user/.paddleocr.
os.system('cp -r paddleocr /home/user/.paddleocr')
# Import required modules | |
from magic_pdf.data.data_reader_writer import FileBasedDataReader | |
from magic_pdf.libs.hash_utils import compute_sha256 | |
from magic_pdf.tools.common import do_parse, prepare_env | |
from loguru import logger | |
def read_fn(path):
    """Read the file at *path* through a ``FileBasedDataReader``.

    The reader is constructed on the directory part of *path* and is then
    asked to read the final path component.

    Args:
        path: Path of the file to read.

    Returns:
        Whatever ``FileBasedDataReader.read`` returns for that file.
    """
    parent_dir, file_name = os.path.split(path)
    reader = FileBasedDataReader(parent_dir)
    return reader.read(file_name)
# TODO: add the utility functions here (parse_pdf, compress_directory_to_zip, image_to_base64, etc.).
# `to_markdown` and `init_model` are called further down but are not defined in this file as shown; they must be defined or imported before use.
async def process_document(
    file: UploadFile = File(...),
    end_pages: int = 10,
    is_ocr: bool = False,
    layout_mode: str = "doclayout_yolo",
    formula_enable: bool = True,
    table_enable: bool = True,
    language: str = "auto"
):
    """Convert an uploaded document and return the results as JSON.

    The upload is written to ``/tmp``, passed to ``to_markdown`` together
    with the conversion options, and the archive that call produces is
    returned base64-encoded next to the markdown and text content. The
    temporary copy of the upload is removed whether or not conversion
    succeeds.

    NOTE(review): no route decorator is visible on this function here;
    confirm it is registered with ``app``.
    NOTE(review): ``to_markdown`` is called synchronously inside this
    coroutine; confirm that blocking the event loop is acceptable.

    Args:
        file: The uploaded document.
        end_pages: Forwarded unchanged to ``to_markdown``.
        is_ocr: Forwarded unchanged to ``to_markdown``.
        layout_mode: Forwarded unchanged to ``to_markdown``.
        formula_enable: Forwarded unchanged to ``to_markdown``.
        table_enable: Forwarded unchanged to ``to_markdown``.
        language: Forwarded unchanged to ``to_markdown``.

    Returns:
        A ``JSONResponse`` with ``markdown_content``, ``text_content`` and
        ``zip_file_base64`` on success, or a status-500 ``JSONResponse``
        with an ``error`` message when any step raises.
    """
    temp_path = None
    try:
        # The filename is supplied by the client: keep only its final path
        # component so values such as "../../etc/passwd" cannot escape /tmp.
        safe_name = os.path.basename(file.filename or "")
        if safe_name in ("", ".", ".."):
            raise ValueError("uploaded file has no usable filename")

        # Read the upload before creating the file so a failed read leaves
        # nothing behind on disk.
        content = await file.read()

        # Save uploaded file temporarily
        temp_path = f"/tmp/{safe_name}"
        with open(temp_path, "wb") as buffer:
            buffer.write(content)

        # Process file (the fourth returned value is not used here)
        md_content, txt_content, archive_zip_path, _ = to_markdown(
            temp_path,
            end_pages=end_pages,
            is_ocr=is_ocr,
            layout_mode=layout_mode,
            formula_enable=formula_enable,
            table_enable=table_enable,
            language=language
        )

        # Read the zip file as base64
        with open(archive_zip_path, "rb") as zip_file:
            zip_content = base64.b64encode(zip_file.read()).decode()

        return JSONResponse({
            "markdown_content": md_content,
            "text_content": txt_content,
            "zip_file_base64": zip_content
        })
    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={"error": str(e)}
        )
    finally:
        # Clean up on both the success and the error path. Removal is
        # best-effort so a cleanup failure cannot replace the response.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass
# Initialize models: call init_model() once at import time and log the
# value it returned.
model_init = init_model()
logger.info(f"model_init: {model_init}")


def _serve():
    """Run the FastAPI app with uvicorn on host 0.0.0.0, port 7860."""
    uvicorn.run(app, host="0.0.0.0", port=7860)


if __name__ == "__main__":
    _serve()