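# NOTE: the lines below are residue from the page this file was copied from;
# they are not part of the program:
#   dmitrynovikov2121's picture / Update app.py / 76d97a2 verified /
#   raw / history blame / 3.17 kB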
# Copyright (c) Opendatalab. All rights reserved.
import base64
import json
import os
import time
import zipfile
from pathlib import Path
import re
import uuid
import pymupdf
from io import BytesIO
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
import uvicorn
# Create the FastAPI application instance
app = FastAPI()

# Environment bootstrap: replace the installed magic-pdf with MinerU's dev
# branch, then download and run the model download script. The commands run
# in this order through the shell and their exit codes are not inspected.
for _setup_command in (
    'pip uninstall -y magic-pdf',
    'pip install git+https://github.com/opendatalab/MinerU.git@dev',
    'wget https://github.com/opendatalab/MinerU/raw/dev/scripts/download_models_hf.py -O download_models_hf.py',
    'python download_models_hf.py',
):
    os.system(_setup_command)

# Rewrite the magic-pdf configuration in place: force CUDA and, when the
# `apikey` environment variable is set, switch on LLM-aided title handling.
_config_path = '/home/user/magic-pdf.json'
with open(_config_path, 'r') as _config_in:
    data = json.load(_config_in)

data['device-mode'] = "cuda"
_api_key = os.getenv('apikey')
if _api_key:
    _title_aided = data['llm-aided-config']['title_aided']
    _title_aided['api_key'] = _api_key
    _title_aided['enable'] = True

with open(_config_path, 'w') as _config_out:
    json.dump(data, _config_out, indent=4)

# Copy the bundled paddleocr directory into the user's home
os.system('cp -r paddleocr /home/user/.paddleocr')
# Import required modules
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.libs.hash_utils import compute_sha256
from magic_pdf.tools.common import do_parse, prepare_env
from loguru import logger
def read_fn(path):
    """Read the file at *path* through a ``FileBasedDataReader``.

    The reader is constructed on the directory part of *path* and asked to
    read the final path component; whatever ``read`` returns is passed back
    to the caller unchanged.
    """
    parent_dir, file_name = os.path.split(path)
    return FileBasedDataReader(parent_dir).read(file_name)
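# TODO: define the utility functions from the original application here
# (parse_pdf, compress_directory_to_zip, image_to_base64, etc.).
# `to_markdown` and `init_model` are called further down in this file and
# must be defined in this section for the module to run.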
@app.post("/process_document")
async def process_document(
    file: UploadFile = File(...),
    end_pages: int = 10,
    is_ocr: bool = False,
    layout_mode: str = "doclayout_yolo",
    formula_enable: bool = True,
    table_enable: bool = True,
    language: str = "auto"
):
    """Run an uploaded document through ``to_markdown`` and return JSON.

    The upload is written into a private per-request directory under
    ``/tmp``, handed to ``to_markdown`` together with the processing options,
    and the resulting archive is returned base64-encoded next to the
    Markdown and text output.

    Args:
        file: The uploaded document.
        end_pages, is_ocr, layout_mode, formula_enable, table_enable,
        language: Forwarded unchanged to ``to_markdown``; their exact
            semantics are defined there, not in this handler.

    Returns:
        A ``JSONResponse`` with the keys ``markdown_content``,
        ``text_content`` and ``zip_file_base64`` on success, or a
        status-500 ``JSONResponse`` with an ``error`` message if anything
        raises.
    """
    # Function-scope import: tempfile is not among the file's top-level imports.
    import tempfile

    try:
        # file.filename is supplied by the client. Keep only its final path
        # component so a name such as "../../etc/cron.d/x" cannot make us
        # write outside the working directory.
        safe_name = os.path.basename(file.filename or "")
        if safe_name in ("", ".", ".."):
            safe_name = "upload"

        # A fresh directory per request keeps concurrent uploads that share a
        # file name from overwriting each other, and the context manager
        # removes it even when processing fails.
        with tempfile.TemporaryDirectory(dir="/tmp") as work_dir:
            temp_path = os.path.join(work_dir, safe_name)
            with open(temp_path, "wb") as buffer:
                buffer.write(await file.read())

            # NOTE(review): this call runs synchronously inside the coroutine;
            # if it is long-running it will block the event loop - confirm
            # whether it should be moved to a worker thread.
            md_content, txt_content, archive_zip_path, _new_pdf_path = to_markdown(
                temp_path,
                end_pages=end_pages,
                is_ocr=is_ocr,
                layout_mode=layout_mode,
                formula_enable=formula_enable,
                table_enable=table_enable,
                language=language
            )

            # Read the archive before the working directory is removed, in
            # case it was written next to the input file.
            with open(archive_zip_path, "rb") as zip_file:
                zip_content = base64.b64encode(zip_file.read()).decode()

        return JSONResponse({
            "markdown_content": md_content,
            "text_content": txt_content,
            "zip_file_base64": zip_content
        })
    except Exception as e:
        # Top-level request boundary: record the traceback, then report the
        # failure to the client in the same shape as before.
        logger.exception("process_document failed")
        return JSONResponse(
            status_code=500,
            content={"error": str(e)}
        )
# Initialize models
# `init_model` is not defined in the visible part of this file; it is
# expected to come from the utility functions that the placeholder comments
# earlier in the file refer to. The call sits at module level, so it runs
# both when the file is executed as a script and when it is imported.
model_init = init_model()
logger.info(f"model_init: {model_init}")

# Start the uvicorn server only when run as a script; it listens on all
# interfaces on port 7860.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)