taprosoft committed on
Commit
188f052
·
1 Parent(s): 0933b39

fix: skip problematic import

Browse files
Files changed (5) hide show
  1. .pre-commit-config.yaml +1 -1
  2. README.md +3 -3
  3. app.py +28 -6
  4. requirements.txt +1 -3
  5. utils.py +18 -0
.pre-commit-config.yaml CHANGED
@@ -29,7 +29,7 @@ repos:
29
  rev: 4.0.1
30
  hooks:
31
  - id: flake8
32
- args: ["--max-line-length", "88", "--extend-ignore", "E203"]
33
  - repo: https://github.com/myint/autoflake
34
  rev: v1.4
35
  hooks:
 
29
  rev: 4.0.1
30
  hooks:
31
  - id: flake8
32
+ args: ["--max-line-length", "88", "--extend-ignore", "E203,E402"]
33
  - repo: https://github.com/myint/autoflake
34
  rev: v1.4
35
  hooks:
README.md CHANGED
@@ -1,13 +1,13 @@
1
  ---
2
- title: DoclingConverter
3
  emoji: 🐢
4
  colorFrom: blue
5
- colorTo: red
6
  sdk: gradio
7
  sdk_version: 5.7.1
8
  app_file: app.py
9
  pinned: false
10
- short_description: Convert documents to Markdown or JSON with metadata
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: PDFParsersPlayground
3
  emoji: 🐢
4
  colorFrom: blue
5
+ colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.7.1
8
  app_file: app.py
9
  pinned: false
10
+ short_description: Convert PDF documents to Markdown with multiple open-source parsers
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  import time
2
  from pathlib import Path
3
 
@@ -46,9 +51,15 @@ def convert_document(path, method, enabled=True):
46
  elif method == "MinerU":
47
  text, debug_image_paths = convert_mineru(path, file_name)
48
 
49
- end = time.time()
50
- print(f"Conversion with {method} took {end - start} seconds")
51
- return text, remove_images_from_markdown(text), debug_image_paths
 
 
 
 
 
 
52
 
53
 
54
  def show_tabs(selected_methods):
@@ -73,7 +84,8 @@ print("Warm-up sequence")
73
  for method in SUPPORTED_METHODS:
74
  for _ in range(1):
75
  convert_document(WARMUP_PDF_PATH, method)
76
- print("Start up time", time.time() - start_startup, "seconds")
 
77
 
78
  with gr.Blocks(
79
  theme=gr.themes.Ocean(),
@@ -149,9 +161,19 @@ with gr.Blocks(
149
  markdown_text = gr.TextArea(
150
  lines=45, show_label=False, container=False
151
  )
 
 
 
 
 
152
 
153
  output_components.extend(
154
- [markdown_render, markdown_text, debug_images]
 
 
 
 
 
155
  )
156
  output_tabs.append(output_tab)
157
  visualization_sub_tabs.append(visual_sub_tab)
@@ -199,7 +221,7 @@ with gr.Blocks(
199
  input_file, methods, method
200
  ),
201
  inputs=[input_file, methods],
202
- outputs=output_components[idx * 3 : (idx + 1) * 3],
203
  )
204
 
205
  click_event.then(
 
1
+ from utils import fix_problematic_imports # noqa
2
+
3
+ fix_problematic_imports() # noqa
4
+
5
+
6
  import time
7
  from pathlib import Path
8
 
 
51
  elif method == "MinerU":
52
  text, debug_image_paths = convert_mineru(path, file_name)
53
 
54
+ duration = time.time() - start
55
+ duration_message = f"Conversion with {method} took *{duration:.2f} seconds*"
56
+ print(duration_message)
57
+ return (
58
+ duration_message,
59
+ text,
60
+ remove_images_from_markdown(text),
61
+ debug_image_paths,
62
+ )
63
 
64
 
65
  def show_tabs(selected_methods):
 
84
  for method in SUPPORTED_METHODS:
85
  for _ in range(1):
86
  convert_document(WARMUP_PDF_PATH, method)
87
+ startup_duration = time.time() - start_startup
88
+ print(f"Total start-up time: {startup_duration:.2f} seconds")
89
 
90
  with gr.Blocks(
91
  theme=gr.themes.Ocean(),
 
161
  markdown_text = gr.TextArea(
162
  lines=45, show_label=False, container=False
163
  )
164
+ with gr.Tab("Reference"):
165
+ output_description = gr.Markdown(
166
+ container=False,
167
+ show_label=False,
168
+ )
169
 
170
  output_components.extend(
171
+ [
172
+ output_description,
173
+ markdown_render,
174
+ markdown_text,
175
+ debug_images,
176
+ ]
177
  )
178
  output_tabs.append(output_tab)
179
  visualization_sub_tabs.append(visual_sub_tab)
 
221
  input_file, methods, method
222
  ),
223
  inputs=[input_file, methods],
224
+ outputs=output_components[idx * 4 : (idx + 1) * 4],
225
  )
226
 
227
  click_event.then(
requirements.txt CHANGED
@@ -1,5 +1,3 @@
1
- torch>=2.2.2,<=2.3.1
2
- torchvision>=0.17.2,<=0.18.1
3
  paddlepaddle-gpu @ https://paddle-whl.bj.bcebos.com/stable/cu118/paddlepaddle-gpu/paddlepaddle_gpu-3.0.0b1-cp310-cp310-linux_x86_64.whl
4
  detectron2 @ https://wheels-1251341229.cos.ap-shanghai.myqcloud.com/assets/whl/detectron2/detectron2-0.6-cp310-cp310-linux_x86_64.whl
5
  paddleocr==2.7.3
@@ -14,7 +12,7 @@ PyMuPDF>=1.24.9,<1.24.14
14
  pymupdf4llm
15
  unstructured[pdf]
16
  ultralytics>=8.3.48
17
- unimernet==0.2.3
18
  transformers<5.0.0,>=4.45.2
19
  struct-eqtable==0.3.2
20
  openai
 
 
 
 
1
  paddlepaddle-gpu @ https://paddle-whl.bj.bcebos.com/stable/cu118/paddlepaddle-gpu/paddlepaddle_gpu-3.0.0b1-cp310-cp310-linux_x86_64.whl
2
  detectron2 @ https://wheels-1251341229.cos.ap-shanghai.myqcloud.com/assets/whl/detectron2/detectron2-0.6-cp310-cp310-linux_x86_64.whl
3
  paddleocr==2.7.3
 
12
  pymupdf4llm
13
  unstructured[pdf]
14
  ultralytics>=8.3.48
 
15
  transformers<5.0.0,>=4.45.2
16
  struct-eqtable==0.3.2
17
  openai
18
+ doclayout_yolo==0.0.2b1
utils.py CHANGED
@@ -29,3 +29,21 @@ def trim_pages(pdf_path, output_path, trim_pages=5):
29
  copy2(pdf_path, str(output_file_path))
30
 
31
  return str(output_file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  copy2(pdf_path, str(output_file_path))
30
 
31
  return str(output_file_path)
32
+
33
+
34
+ def fix_problematic_imports():
35
+ import sys
36
+ import types
37
+
38
+ # Create a fake 'UnimernetModel' class inside a fake 'Unimernet' module
39
+ fake_unimernet_module = types.ModuleType(
40
+ "magic_pdf.model.sub_modules.mfr.unimernet.Unimernet"
41
+ )
42
+ fake_unimernet_module.UnimernetModel = type( # type: ignore
43
+ "UnimernetModel", (), {}
44
+ )
45
+
46
+ # Register fake module in sys.modules
47
+ sys.modules[
48
+ "magic_pdf.model.sub_modules.mfr.unimernet.Unimernet"
49
+ ] = fake_unimernet_module