dmitrynovikov2121 committed on
Commit
59be4c5
·
verified ·
1 Parent(s): f280e03

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +228 -1
app.py CHANGED
@@ -7,6 +7,225 @@ import torch
7
  from PIL import Image
8
  import numpy as np
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  @app.post("/process_document")
11
  async def process_document(
12
  file: UploadFile = File(...),
@@ -167,4 +386,12 @@ async def process_document(
167
  return JSONResponse(
168
  status_code=500,
169
  content={"error": str(e)}
170
- )
 
 
 
 
 
 
 
 
 
7
  from PIL import Image
8
  import numpy as np
9
 
10
+ # Copyright (c) Opendatalab. All rights reserved.
11
+
12
+ import base64
13
+ import json
14
+ import os
15
+ import time
16
+ import zipfile
17
+ from pathlib import Path
18
+ import re
19
+ import uuid
20
+ import pymupdf
21
+ from io import BytesIO
22
+ from fastapi import FastAPI, File, UploadFile
23
+ from fastapi.responses import JSONResponse
24
+ import uvicorn
25
+
26
+ # Initialize FastAPI app
27
app = FastAPI()

# Setup and installation commands, executed in order at import time:
# remove any installed magic-pdf, install MinerU from its dev branch, then
# download and run the model download script. Exit codes are not checked.
for _setup_command in (
    'pip uninstall -y magic-pdf',
    'pip install git+https://github.com/opendatalab/MinerU.git@dev',
    'wget https://github.com/opendatalab/MinerU/raw/dev/scripts/download_models_hf.py -O download_models_hf.py',
    'python download_models_hf.py',
):
    os.system(_setup_command)
34
+
35
+ # Configure magic-pdf settings
36
# Load the magic-pdf JSON config, force CUDA, optionally enable the
# LLM-aided title feature, and write the file back in place.
_MAGIC_PDF_CONFIG = '/home/user/magic-pdf.json'

with open(_MAGIC_PDF_CONFIG, 'r') as file:
    data = json.load(file)

data['device-mode'] = "cuda"

# The 'apikey' environment variable switches on title_aided when set.
_api_key = os.getenv('apikey')
if _api_key:
    _title_aided = data['llm-aided-config']['title_aided']
    _title_aided['api_key'] = _api_key
    _title_aided['enable'] = True

with open(_MAGIC_PDF_CONFIG, 'w') as file:
    json.dump(data, file, indent=4)

# Copy the bundled paddleocr directory into the user's home directory.
os.system('cp -r paddleocr /home/user/.paddleocr')
48
+
49
+ # Import required modules
50
+ from magic_pdf.data.data_reader_writer import FileBasedDataReader
51
+ from magic_pdf.libs.hash_utils import compute_sha256
52
+ from magic_pdf.tools.common import do_parse, prepare_env
53
+ from loguru import logger
54
+
55
def read_fn(path):
    """Read the file at ``path`` through magic-pdf's ``FileBasedDataReader``.

    The reader is rooted at the file's parent directory and asked for the
    file's base name.

    :param path: path of the file to read
    :return: whatever ``FileBasedDataReader.read`` returns
        (presumably the raw bytes of the file -- TODO confirm)
    """
    parent_dir, base_name = os.path.split(path)
    disk_rw = FileBasedDataReader(parent_dir)
    return disk_rw.read(base_name)
62
+
63
+
64
def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_enable, table_enable, language):
    """Parse one PDF with magic-pdf and report where the results were written.

    :param doc_path: path of the PDF to parse
    :param output_dir: root output directory; created if it does not exist
    :param end_page_id: forwarded to ``do_parse`` as ``end_page_id``
    :param is_ocr: when truthy the parse method is "ocr", otherwise "auto"
    :param layout_mode: forwarded to ``do_parse`` as ``layout_model``
    :param formula_enable: forwarded to ``do_parse`` as ``formula_enable``
    :param table_enable: forwarded to ``do_parse`` as ``table_enable``
    :param language: forwarded to ``do_parse`` as ``lang``
    :return: tuple ``(local_md_dir, file_name)`` where ``local_md_dir`` comes
        from ``prepare_env`` and ``file_name`` is the generated output stem
    :raises Exception: any failure is logged and then re-raised, so callers
        get the real error instead of failing later while unpacking ``None``
    """
    os.makedirs(output_dir, exist_ok=True)

    try:
        # The timestamp suffix keeps repeated runs on the same document apart.
        file_name = f"{Path(doc_path).stem}_{time.time()}"
        pdf_data = read_fn(doc_path)
        parse_method = "ocr" if is_ocr else "auto"
        _local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
        do_parse(
            output_dir,
            file_name,
            pdf_data,
            [],  # presumably an empty precomputed model list -- TODO confirm
            parse_method,
            False,  # presumably the debug flag -- TODO confirm against do_parse
            end_page_id=end_page_id,
            layout_model=layout_mode,
            formula_enable=formula_enable,
            table_enable=table_enable,
            lang=language,
            f_dump_orig_pdf=False,
        )
        return local_md_dir, file_name
    except Exception as e:
        logger.exception(e)
        raise
92
+
93
+
94
def compress_directory_to_zip(directory_path, output_zip_path):
    """
    Compress the given directory into a single ZIP file.

    :param directory_path: path of the directory to compress
    :param output_zip_path: path of the ZIP file to write
    :return: 0 on success, -1 if any exception occurred (it is logged)
    """
    try:
        with zipfile.ZipFile(output_zip_path, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
            # Walk every file in the directory tree, subdirectories included.
            for current_dir, _subdirs, filenames in os.walk(directory_path):
                for filename in filenames:
                    source_path = os.path.join(current_dir, filename)
                    # Store entries relative to the directory being zipped.
                    archive.write(source_path, os.path.relpath(source_path, directory_path))
    except Exception as e:
        logger.exception(e)
        return -1
    return 0
117
+
118
+
119
def image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
122
+
123
+
124
def replace_image_with_base64(markdown_text, image_dir_path):
    """Inline local images referenced in Markdown as base64 ``data:`` URIs.

    Every ``![alt](target)`` tag whose target resolves to an existing file
    under ``image_dir_path`` is rewritten to embed that file. Tags whose
    target is not a local file (remote URLs, missing files) are left exactly
    as they were instead of aborting the whole conversion.

    :param markdown_text: Markdown source to process
    :param image_dir_path: directory that relative image paths are resolved against
    :return: the Markdown text with resolvable images embedded
    """
    # Function-scope import so this block brings its own dependency.
    import mimetypes

    # Match Markdown image tags; group 1 is the link target.
    pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'

    def replace(match):
        relative_path = match.group(1)
        full_path = os.path.join(image_dir_path, relative_path)
        if not os.path.isfile(full_path):
            # Nothing to embed: keep the original tag untouched.
            return match.group(0)
        mime_type, _encoding = mimetypes.guess_type(full_path)
        if not mime_type or not mime_type.startswith("image/"):
            # Unknown extension: fall back to the previously hard-coded type.
            mime_type = "image/jpeg"
        base64_image = image_to_base64(full_path)
        return f"![{relative_path}](data:{mime_type};base64,{base64_image})"

    return re.sub(pattern, replace, markdown_text)
137
+
138
+
139
def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language):
    """Convert a document to Markdown and bundle the parser output.

    The input is first passed through ``to_pdf``, parsed with ``parse_pdf``
    into ``./output``, and the resulting directory is zipped.

    :param file_path: path of the input document
    :param end_pages: number of pages to process; capped at 20
    :param is_ocr: forwarded to ``parse_pdf``
    :param layout_mode: forwarded to ``parse_pdf``
    :param formula_enable: forwarded to ``parse_pdf``
    :param table_enable: forwarded to ``parse_pdf``
    :param language: forwarded to ``parse_pdf``
    :return: tuple ``(md_content, txt_content, archive_zip_path, new_pdf_path)``:
        Markdown with images inlined as base64, the Markdown as written by
        the parser, the ZIP archive path, and the path where the
        ``_layout.pdf`` file is expected (its existence is not checked here)
    """
    pdf_path = to_pdf(file_path)
    page_limit = min(end_pages, 20)

    # Run the parser; it yields the Markdown output directory and file stem.
    local_md_dir, file_name = parse_pdf(
        pdf_path, './output', page_limit - 1, is_ocr,
        layout_mode, formula_enable, table_enable, language,
    )

    archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
    if compress_directory_to_zip(local_md_dir, archive_zip_path) == 0:
        logger.info("压缩成功")  # "compression succeeded"
    else:
        logger.error("压缩失败")  # "compression failed"

    with open(os.path.join(local_md_dir, file_name + ".md"), 'r', encoding='utf-8') as md_file:
        txt_content = md_file.read()
    md_content = replace_image_with_base64(txt_content, local_md_dir)

    # Path of the layout PDF inside the output directory.
    new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")

    return md_content, txt_content, archive_zip_path, new_pdf_path
160
+
161
+
162
# LaTeX delimiter pairs: "$$" is flagged display=True, "$" display=False.
latex_delimiters = [
    dict(left="$$", right="$$", display=True),
    dict(left="$", right="$", display=False),
]
164
+
165
+
166
def init_model():
    """Request both model variants from magic-pdf's ``ModelSingleton``.

    ``get_model`` is called with ``(False, False)`` and then ``(True, False)``;
    the log labels suggest the first flag selects the text vs. OCR model
    (TODO confirm the meaning of the second flag).

    :return: 0 when both calls succeed, -1 if any exception was raised
        (the exception is logged)
    """
    from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
    try:
        model_manager = ModelSingleton()
        for first_flag, label in ((False, "txt_model"), (True, "ocr_model")):
            model_manager.get_model(first_flag, False)
            logger.info(f"{label} init final")
    except Exception as e:
        logger.exception(e)
        return -1
    return 0
178
+
179
+
180
# Initialize the models at import time; init_model() returns 0 or -1.
model_init = init_model()
logger.info(f"model_init: {model_init}")


# Read header.html (resolved against the current working directory).
with open("header.html") as file:
    header = file.read()
186
+
187
+
188
# Language codes grouped by script family.
latin_lang = [
    'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
    'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
    'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
    'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german',
]
arabic_lang = ['ar', 'fa', 'ug', 'ur']
cyrillic_lang = [
    'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
    'dar', 'inh', 'che', 'lbe', 'lez', 'tab',
]
devanagari_lang = [
    'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
    'sa', 'bgc',
]
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']

# Full list: the empty string and 'auto' first, then every group in turn.
all_lang = [
    '', 'auto',
    *other_lang,
    *latin_lang,
    *arabic_lang,
    *cyrillic_lang,
    *devanagari_lang,
]
207
+
208
+
209
def to_pdf(file_path):
    """Return a path to a PDF version of the document at ``file_path``.

    A file that pymupdf reports as a PDF is returned unchanged. Anything else
    is converted with ``convert_to_pdf`` and written to a new, uniquely named
    ``<uuid4>.pdf`` in the same directory as the input.

    :param file_path: path of the input document
    :return: ``file_path`` itself, or the path of the newly written PDF
    """
    with pymupdf.open(file_path) as document:
        if document.is_pdf:
            return file_path

        pdf_bytes = document.convert_to_pdf()

        # Write the converted bytes next to the input under a unique name.
        target_dir = os.path.dirname(file_path)
        tmp_file_path = os.path.join(target_dir, f"{uuid.uuid4()}.pdf")
        with open(tmp_file_path, 'wb') as tmp_pdf_file:
            tmp_pdf_file.write(pdf_bytes)

        return tmp_file_path
227
+
228
+
229
  @app.post("/process_document")
230
  async def process_document(
231
  file: UploadFile = File(...),
 
386
  return JSONResponse(
387
  status_code=500,
388
  content={"error": str(e)}
389
+ )
390
+
391
+
392
# The models are already initialized by the module-level init_model() call
# earlier in this file, so the redundant second call that used to sit here
# has been removed.

if __name__ == "__main__":
    # Serve the FastAPI app on all interfaces, port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)