taprosoft committed on
Commit
0933b39
·
1 Parent(s): 3bce890

fix: add requirements.txt, fix empty debug dir

Browse files
.gitignore ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Created by https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
3
+
4
+ activate*
5
+ activate/*
6
+ .env
7
+
8
+ ### Emacs ###
9
+ # -*- mode: gitignore; -*-
10
+ *~
11
+ \#*\#
12
+ /.emacs.desktop
13
+ /.emacs.desktop.lock
14
+ *.elc
15
+ auto-save-list
16
+ tramp
17
+ .\#*
18
+
19
+ # Org-mode
20
+ .org-id-locations
21
+ *_archive
22
+
23
+ # flymake-mode
24
+ *_flymake.*
25
+
26
+ # eshell files
27
+ /eshell/history
28
+ /eshell/lastdir
29
+
30
+ # elpa packages
31
+ /elpa/
32
+
33
+ # reftex files
34
+ *.rel
35
+
36
+ # AUCTeX auto folder
37
+ /auto/
38
+
39
+ # cask packages
40
+ .cask/
41
+ dist/
42
+
43
+ # Flycheck
44
+ flycheck_*.el
45
+
46
+ # server auth directory
47
+ /server/
48
+
49
+ # projectile files
50
+ .projectile
51
+
52
+ # directory configuration
53
+ .dir-locals.el
54
+
55
+ # network security
56
+ /network-security.data
57
+
58
+ ### Linux ###
59
+
60
+ # temporary files which can be created if a process still has a handle open of a deleted file
61
+ .fuse_hidden*
62
+
63
+ # KDE directory preferences
64
+ .directory
65
+
66
+ # Linux trash folder which might appear on any partition or disk
67
+ .Trash-*
68
+
69
+ # .nfs files are created when an open file is removed but is still being accessed
70
+ .nfs*
71
+
72
+ ### macOS ###
73
+ # General
74
+ .DS_Store
75
+ .AppleDouble
76
+ .LSOverride
77
+
78
+ # Icon must end with two \r
79
+ Icon
80
+
81
+ # Thumbnails
82
+ ._*
83
+
84
+ # Files that might appear in the root of a volume
85
+ .DocumentRevisions-V100
86
+ .fseventsd
87
+ .Spotlight-V100
88
+ .TemporaryItems
89
+ .Trashes
90
+ .VolumeIcon.icns
91
+ .com.apple.timemachine.donotpresent
92
+
93
+ # Directories potentially created on remote AFP share
94
+ .AppleDB
95
+ .AppleDesktop
96
+ Network Trash Folder
97
+ Temporary Items
98
+ .apdisk
99
+
100
+ ### macOS Patch ###
101
+ # iCloud generated files
102
+ *.icloud
103
+
104
+ ### PyCharm ###
105
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
106
+ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
107
+
108
+ # User-specific stuff
109
+ .idea/**/workspace.xml
110
+ .idea/**/tasks.xml
111
+ .idea/**/usage.statistics.xml
112
+ .idea/**/dictionaries
113
+ .idea/**/shelf
114
+
115
+ # AWS User-specific
116
+ .idea/**/aws.xml
117
+
118
+ # Generated files
119
+ .idea/**/contentModel.xml
120
+
121
+ # Sensitive or high-churn files
122
+ .idea/**/dataSources/
123
+ .idea/**/dataSources.ids
124
+ .idea/**/dataSources.local.xml
125
+ .idea/**/sqlDataSources.xml
126
+ .idea/**/dynamic.xml
127
+ .idea/**/uiDesigner.xml
128
+ .idea/**/dbnavigator.xml
129
+
130
+ # Gradle
131
+ .idea/**/gradle.xml
132
+ .idea/**/libraries
133
+
134
+ # Gradle and Maven with auto-import
135
+ # When using Gradle or Maven with auto-import, you should exclude module files,
136
+ # since they will be recreated, and may cause churn. Uncomment if using
137
+ # auto-import.
138
+ # .idea/artifacts
139
+ # .idea/compiler.xml
140
+ # .idea/jarRepositories.xml
141
+ # .idea/modules.xml
142
+ # .idea/*.iml
143
+ # .idea/modules
144
+ # *.iml
145
+ # *.ipr
146
+
147
+ # CMake
148
+ cmake-build-*/
149
+
150
+ # Mongo Explorer plugin
151
+ .idea/**/mongoSettings.xml
152
+
153
+ # File-based project format
154
+ *.iws
155
+
156
+ # IntelliJ
157
+ out/
158
+
159
+ # mpeltonen/sbt-idea plugin
160
+ .idea_modules/
161
+
162
+ # JIRA plugin
163
+ atlassian-ide-plugin.xml
164
+
165
+ # Cursive Clojure plugin
166
+ .idea/replstate.xml
167
+
168
+ # SonarLint plugin
169
+ .idea/sonarlint/
170
+
171
+ # Crashlytics plugin (for Android Studio and IntelliJ)
172
+ com_crashlytics_export_strings.xml
173
+ crashlytics.properties
174
+ crashlytics-build.properties
175
+ fabric.properties
176
+
177
+ # Editor-based Rest Client
178
+ .idea/httpRequests
179
+
180
+ # Android studio 3.1+ serialized cache file
181
+ .idea/caches/build_file_checksums.ser
182
+
183
+ ### PyCharm Patch ###
184
+ # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
185
+
186
+ # *.iml
187
+ # modules.xml
188
+ # .idea/misc.xml
189
+ # *.ipr
190
+
191
+ # Sonarlint plugin
192
+ # https://plugins.jetbrains.com/plugin/7973-sonarlint
193
+ .idea/**/sonarlint/
194
+
195
+ # SonarQube Plugin
196
+ # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
197
+ .idea/**/sonarIssues.xml
198
+
199
+ # Markdown Navigator plugin
200
+ # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
201
+ .idea/**/markdown-navigator.xml
202
+ .idea/**/markdown-navigator-enh.xml
203
+ .idea/**/markdown-navigator/
204
+
205
+ # Cache file creation bug
206
+ # See https://youtrack.jetbrains.com/issue/JBR-2257
207
+ .idea/$CACHE_FILE$
208
+
209
+ # CodeStream plugin
210
+ # https://plugins.jetbrains.com/plugin/12206-codestream
211
+ .idea/codestream.xml
212
+
213
+ # Azure Toolkit for IntelliJ plugin
214
+ # https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
215
+ .idea/**/azureSettings.xml
216
+
217
+ ### Python ###
218
+ # Byte-compiled / optimized / DLL files
219
+ __pycache__/
220
+ *.py[cod]
221
+ *$py.class
222
+
223
+ # C extensions
224
+ *.so
225
+
226
+ # Distribution / packaging
227
+ .Python
228
+ build/
229
+ develop-eggs/
230
+ downloads/
231
+ eggs/
232
+ .eggs/
233
+ lib/
234
+ lib64/
235
+ parts/
236
+ sdist/
237
+ var/
238
+ wheels/
239
+ share/python-wheels/
240
+ *.egg-info/
241
+ .installed.cfg
242
+ *.egg
243
+ MANIFEST
244
+
245
+ # PyInstaller
246
+ # Usually these files are written by a python script from a template
247
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
248
+ *.manifest
249
+ *.spec
250
+
251
+ # Installer logs
252
+ pip-log.txt
253
+ pip-delete-this-directory.txt
254
+
255
+ # Unit test / coverage reports
256
+ htmlcov/
257
+ .tox/
258
+ .nox/
259
+ .coverage
260
+ .coverage.*
261
+ .cache
262
+ nosetests.xml
263
+ coverage.xml
264
+ *.cover
265
+ *.py,cover
266
+ .hypothesis/
267
+ .pytest_cache/
268
+ cover/
269
+
270
+ # Translations
271
+ *.mo
272
+ *.pot
273
+
274
+ # Django stuff:
275
+ *.log
276
+ local_settings.py
277
+ db.sqlite3
278
+ db.sqlite3-journal
279
+
280
+ # Flask stuff:
281
+ instance/
282
+ .webassets-cache
283
+
284
+ # Scrapy stuff:
285
+ .scrapy
286
+
287
+ # Sphinx documentation
288
+ docs/_build/
289
+
290
+ # PyBuilder
291
+ .pybuilder/
292
+ target/
293
+
294
+ # Jupyter Notebook
295
+ .ipynb_checkpoints
296
+
297
+ # IPython
298
+ profile_default/
299
+ ipython_config.py
300
+
301
+ # pyenv
302
+ # For a library or package, you might want to ignore these files since the code is
303
+ # intended to run in multiple environments; otherwise, check them in:
304
+ # .python-version
305
+
306
+ # pipenv
307
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
308
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
309
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
310
+ # install all needed dependencies.
311
+ #Pipfile.lock
312
+
313
+ # poetry
314
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
315
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
316
+ # commonly ignored for libraries.
317
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
318
+ #poetry.lock
319
+
320
+ # pdm
321
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
322
+ #pdm.lock
323
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
324
+ # in version control.
325
+ # https://pdm.fming.dev/#use-with-ide
326
+ .pdm.toml
327
+
328
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
329
+ __pypackages__/
330
+
331
+ # Celery stuff
332
+ celerybeat-schedule
333
+ celerybeat.pid
334
+
335
+ # SageMath parsed files
336
+ *.sage.py
337
+
338
+ # Environments
339
+ .venv
340
+ env/
341
+ venv/
342
+ ENV/
343
+ env.bak/
344
+ venv.bak/
345
+
346
+ # Spyder project settings
347
+ .spyderproject
348
+ .spyproject
349
+
350
+ # Rope project settings
351
+ .ropeproject
352
+
353
+ # mkdocs documentation
354
+ /site
355
+
356
+ # mypy
357
+ .mypy_cache/
358
+ .dmypy.json
359
+ dmypy.json
360
+
361
+ # Pyre type checker
362
+ .pyre/
363
+
364
+ # pytype static type analyzer
365
+ .pytype/
366
+
367
+ # Cython debug symbols
368
+ cython_debug/
369
+
370
+ # PyCharm
371
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
372
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
373
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
374
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
375
+ #.idea/
376
+
377
+ ### Python Patch ###
378
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
379
+ poetry.toml
380
+
381
+ # ruff
382
+ .ruff_cache/
383
+
384
+ # LSP config files
385
+ pyrightconfig.json
386
+
387
+ ### Vim ###
388
+ # Swap
389
+ [._]*.s[a-v][a-z]
390
+ !*.svg # comment out if you don't need vector files
391
+ [._]*.sw[a-p]
392
+ [._]s[a-rt-v][a-z]
393
+ [._]ss[a-gi-z]
394
+ [._]sw[a-p]
395
+
396
+ # Session
397
+ Session.vim
398
+ Sessionx.vim
399
+
400
+ # Temporary
401
+ .netrwhist
402
+ # Auto-generated tag files
403
+ tags
404
+ # Persistent undo
405
+ [._]*.un~
406
+
407
+ ### VisualStudioCode ###
408
+ .vscode/*
409
+ !.vscode/settings.json
410
+ !.vscode/tasks.json
411
+ !.vscode/launch.json
412
+ !.vscode/extensions.json
413
+ !.vscode/*.code-snippets
414
+
415
+ # Local History for Visual Studio Code
416
+ .history/
417
+
418
+ # Built Visual Studio Code Extensions
419
+ *.vsix
420
+
421
+ ### VisualStudioCode Patch ###
422
+ # Ignore all local history of files
423
+ .history
424
+ .ionide
425
+
426
+ ### Windows ###
427
+ # Windows thumbnail cache files
428
+ Thumbs.db
429
+ Thumbs.db:encryptable
430
+ ehthumbs.db
431
+ ehthumbs_vista.db
432
+
433
+ # Dump file
434
+ *.stackdump
435
+
436
+ # Folder config file
437
+ [Dd]esktop.ini
438
+
439
+ # Recycle Bin used on file shares
440
+ $RECYCLE.BIN/
441
+
442
+ # Windows Installer files
443
+ *.cab
444
+ *.msi
445
+ *.msix
446
+ *.msm
447
+ *.msp
448
+
449
+ # Windows shortcuts
450
+ *.lnk
451
+
452
+ # PDF files
453
+ *.pdf
454
+
455
+ # End of https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
456
+ *.py[coid]
457
+
458
+ logs/
459
+ .gitsecret/keys/random_seed
460
+ !*.secret
461
+ .envrc
462
+ .env
463
+
464
+ S.gpg-agent*
465
+ .vscode/settings.json
466
+ examples/example1/assets
467
+ storage/*
app.py CHANGED
@@ -53,7 +53,7 @@ def convert_document(path, method, enabled=True):
53
 
54
  def show_tabs(selected_methods):
55
  visible_tabs = []
56
- for method in supported_methods:
57
  visible_tabs.append(gr.update(visible=method in selected_methods))
58
 
59
  return visible_tabs
@@ -66,14 +66,14 @@ latex_delimiters = [
66
 
67
  # startup test (also for loading models the first time)
68
  start_startup = time.time()
69
- test_pdf_path = "/home/tadashi/MinerU/examples/complex_layout.pdf"
70
- supported_methods = ["Docling", "Marker", "Unstructured", "MinerU", "PyMuPDF"]
71
 
72
- # print("Warm-up sequence")
73
- # for method in supported_methods:
74
- # for _ in range(1):
75
- # convert_document(test_pdf_path, method)
76
- # print("Start up time", time.time() - start_startup, "seconds")
77
 
78
  with gr.Blocks(
79
  theme=gr.themes.Ocean(),
@@ -84,7 +84,7 @@ with gr.Blocks(
84
  output_components = []
85
  output_tabs = []
86
  visualization_sub_tabs = []
87
- first_method = supported_methods[0]
88
 
89
  with gr.Row():
90
  with gr.Column(variant="panel", scale=5):
@@ -99,7 +99,7 @@ with gr.Blocks(
99
  with gr.Column(variant="panel", scale=5):
100
  with gr.Row():
101
  methods = gr.Dropdown(
102
- supported_methods,
103
  label="Conversion methods",
104
  value=first_method,
105
  multiselect=True,
@@ -125,7 +125,7 @@ with gr.Blocks(
125
 
126
  with gr.Column(variant="panel", scale=5):
127
  with gr.Tabs():
128
- for method in supported_methods:
129
  with gr.Tab(method, visible=False) as output_tab:
130
  with gr.Tabs():
131
  with gr.Tab("Markdown rendering"):
@@ -162,17 +162,17 @@ with gr.Blocks(
162
  inputs=[methods],
163
  outputs=output_tabs,
164
  )
165
- for idx, method in enumerate(supported_methods):
166
 
167
  def progress_message(selected_methods, method=method):
168
  selected_methods_indices = [
169
  idx
170
- for idx, current_method in enumerate(supported_methods)
171
  if current_method in selected_methods
172
  ]
173
  try:
174
  current_method_idx = selected_methods_indices.index(
175
- supported_methods.index(method)
176
  )
177
  msg = (
178
  f"Processing ({current_method_idx + 1} / "
 
53
 
54
  def show_tabs(selected_methods):
55
  visible_tabs = []
56
+ for method in SUPPORTED_METHODS:
57
  visible_tabs.append(gr.update(visible=method in selected_methods))
58
 
59
  return visible_tabs
 
66
 
67
  # startup test (also for loading models the first time)
68
  start_startup = time.time()
69
+ WARMUP_PDF_PATH = "/home/tadashi/MinerU/examples/complex_layout.pdf"
70
+ SUPPORTED_METHODS = ["Docling", "Marker", "Unstructured", "MinerU", "PyMuPDF"]
71
 
72
+ print("Warm-up sequence")
73
+ for method in SUPPORTED_METHODS:
74
+ for _ in range(1):
75
+ convert_document(WARMUP_PDF_PATH, method)
76
+ print("Start up time", time.time() - start_startup, "seconds")
77
 
78
  with gr.Blocks(
79
  theme=gr.themes.Ocean(),
 
84
  output_components = []
85
  output_tabs = []
86
  visualization_sub_tabs = []
87
+ first_method = SUPPORTED_METHODS[0]
88
 
89
  with gr.Row():
90
  with gr.Column(variant="panel", scale=5):
 
99
  with gr.Column(variant="panel", scale=5):
100
  with gr.Row():
101
  methods = gr.Dropdown(
102
+ SUPPORTED_METHODS,
103
  label="Conversion methods",
104
  value=first_method,
105
  multiselect=True,
 
125
 
126
  with gr.Column(variant="panel", scale=5):
127
  with gr.Tabs():
128
+ for method in SUPPORTED_METHODS:
129
  with gr.Tab(method, visible=False) as output_tab:
130
  with gr.Tabs():
131
  with gr.Tab("Markdown rendering"):
 
162
  inputs=[methods],
163
  outputs=output_tabs,
164
  )
165
+ for idx, method in enumerate(SUPPORTED_METHODS):
166
 
167
  def progress_message(selected_methods, method=method):
168
  selected_methods_indices = [
169
  idx
170
+ for idx, current_method in enumerate(SUPPORTED_METHODS)
171
  if current_method in selected_methods
172
  ]
173
  try:
174
  current_method_idx = selected_methods_indices.index(
175
+ SUPPORTED_METHODS.index(method)
176
  )
177
  msg = (
178
  f"Processing ({current_method_idx + 1} / "
backends/docling.py CHANGED
@@ -42,8 +42,11 @@ def convert_docling(path: str, file_name: str):
42
  result = docling_converter.convert(path)
43
  text = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
44
  debug_image_dir = DOCLING_DEBUG_PATH / f"debug_{file_name}"
45
- debug_image_paths = [
46
- path for path in debug_image_dir.iterdir() if path.suffix == ".png"
47
- ]
 
 
 
48
 
49
  return text, debug_image_paths
 
42
  result = docling_converter.convert(path)
43
  text = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
44
  debug_image_dir = DOCLING_DEBUG_PATH / f"debug_{file_name}"
45
+ if debug_image_dir.exists():
46
+ debug_image_paths = [
47
+ path for path in debug_image_dir.iterdir() if path.suffix == ".png"
48
+ ]
49
+ else:
50
+ debug_image_paths = []
51
 
52
  return text, debug_image_paths
backends/marker.py CHANGED
@@ -53,8 +53,11 @@ def convert_marker(path: str, file_name: str):
53
  text, _, images = text_from_rendered(rendered)
54
  text = markdown_insert_images(text, images)
55
  debug_image_dir = Path(rendered.metadata.get("debug_data_path"))
56
- debug_image_paths = [
57
- path for path in debug_image_dir.iterdir() if "pdf_page" in path.stem
58
- ]
 
 
 
59
 
60
  return text, debug_image_paths
 
53
  text, _, images = text_from_rendered(rendered)
54
  text = markdown_insert_images(text, images)
55
  debug_image_dir = Path(rendered.metadata.get("debug_data_path"))
56
+ if debug_image_dir.exists():
57
+ debug_image_paths = [
58
+ path for path in debug_image_dir.iterdir() if "pdf_page" in path.stem
59
+ ]
60
+ else:
61
+ debug_image_paths = []
62
 
63
  return text, debug_image_paths
backends/mineru.py CHANGED
@@ -72,11 +72,13 @@ def convert_mineru(path: str, file_name: str):
72
  text = replace_image_with_base64(text, local_md_dir)
73
 
74
  debug_pdf = str(local_md_dir / (file_name + "_layout.pdf"))
75
- doc = pymupdf.open(debug_pdf) # open document
76
- for page in doc: # iterate through the pages
77
- pix = page.get_pixmap() # render page to an image
78
- page_debug_path = str(output_path / ("page-%i.png" % page.number))
79
- debug_image_paths.append(page_debug_path)
80
- pix.save(page_debug_path) # store image as a PNG
 
 
81
 
82
  return text, debug_image_paths
 
72
  text = replace_image_with_base64(text, local_md_dir)
73
 
74
  debug_pdf = str(local_md_dir / (file_name + "_layout.pdf"))
75
+
76
+ if Path(debug_pdf).exists():
77
+ doc = pymupdf.open(debug_pdf) # open document
78
+ for page in doc: # iterate through the pages
79
+ pix = page.get_pixmap() # render page to an image
80
+ page_debug_path = str(output_path / ("page-%i.png" % page.number))
81
+ debug_image_paths.append(page_debug_path)
82
+ pix.save(page_debug_path) # store image as a PNG
83
 
84
  return text, debug_image_paths
backends/unstructured.py CHANGED
@@ -66,8 +66,11 @@ def convert_unstructured(path: str, file_name: str):
66
  )
67
  text = convert_elements_to_markdown(elements)
68
  debug_image_dir = UNSTRUCTURED_DEBUG_PATH / "analysis" / file_name / "bboxes"
69
- debug_image_paths = [
70
- path for path in debug_image_dir.iterdir() if "od_model" in path.stem
71
- ]
 
 
 
72
 
73
  return text, debug_image_paths
 
66
  )
67
  text = convert_elements_to_markdown(elements)
68
  debug_image_dir = UNSTRUCTURED_DEBUG_PATH / "analysis" / file_name / "bboxes"
69
+ if debug_image_dir.exists():
70
+ debug_image_paths = [
71
+ path for path in debug_image_dir.iterdir() if "od_model" in path.stem
72
+ ]
73
+ else:
74
+ debug_image_paths = []
75
 
76
  return text, debug_image_paths
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch>=2.2.2,<=2.3.1
2
+ torchvision>=0.17.2,<=0.18.1
3
+ paddlepaddle-gpu @ https://paddle-whl.bj.bcebos.com/stable/cu118/paddlepaddle-gpu/paddlepaddle_gpu-3.0.0b1-cp310-cp310-linux_x86_64.whl
4
+ detectron2 @ https://wheels-1251341229.cos.ap-shanghai.myqcloud.com/assets/whl/detectron2/detectron2-0.6-cp310-cp310-linux_x86_64.whl
5
+ paddleocr==2.7.3
6
+ rapid-table>=1.0.3,<2.0.0
7
+ rapidocr-paddle
8
+ rapidocr-onnxruntime
9
+ gradio-pdf>=0.0.21
10
+ git+https://github.com/opendatalab/MinerU.git@dev
11
+ git+https://github.com/VikParuchuri/marker
12
+ docling
13
+ PyMuPDF>=1.24.9,<1.24.14
14
+ pymupdf4llm
15
+ unstructured[pdf]
16
+ ultralytics>=8.3.48
17
+ unimernet==0.2.3
18
+ transformers<5.0.0,>=4.45.2
19
+ struct-eqtable==0.3.2
20
+ openai
utils.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import re
2
  from pathlib import Path
3
  from shutil import copy2
@@ -12,6 +13,7 @@ def remove_images_from_markdown(markdown_text):
12
  return markdown_text
13
 
14
 
 
15
  def trim_pages(pdf_path, output_path, trim_pages=5):
16
  doc = pymupdf.open(pdf_path)
17
  parent_dir_name = Path(pdf_path).parent.name
 
1
+ import functools
2
  import re
3
  from pathlib import Path
4
  from shutil import copy2
 
13
  return markdown_text
14
 
15
 
16
+ @functools.lru_cache(maxsize=None)
17
  def trim_pages(pdf_path, output_path, trim_pages=5):
18
  doc = pymupdf.open(pdf_path)
19
  parent_dir_name = Path(pdf_path).parent.name