Spaces:

chunking-ai
/

smoldocling-preview

Paused

smoldocling-preview / utils.py

taprosoft

fix: skip problematic import

188f052 8 months ago

1.45 kB

	import functools
	import re
	from pathlib import Path
	from shutil import copy2

	import pymupdf


	def remove_images_from_markdown(markdown_text: str) -> str:
	    """Strip image references from a markdown string.

	    Two forms are removed:

	    * inline HTML image tags, e.g. ``<img src="a.png">``
	    * markdown image links, e.g. ``![alt text](path/to/a.png)``

	    Ordinary links (``[text](url)``) are left untouched because the
	    markdown pattern requires the leading ``!``.

	    Args:
	        markdown_text: The markdown source to clean.

	    Returns:
	        The input text with every image reference replaced by an empty
	        string. Surrounding whitespace is not collapsed.
	    """
	    # HTML form: everything from "<img" up to the next ">".
	    markdown_text = re.sub(r"<img[^>]*>", "", markdown_text)
	    # Markdown form. The "*" quantifiers are essential: without them the
	    # character classes match exactly one character, so only degenerate
	    # links such as "![a](b)" would ever be removed.
	    markdown_text = re.sub(r"!\[[^\]]*\]\([^)]*\)", "", markdown_text)
	    return markdown_text


	@functools.lru_cache(maxsize=None)
	def trim_pages(pdf_path, output_path, trim_pages=5):
	    """Write a copy of a PDF limited to its first ``trim_pages`` pages.

	    The output file is named after the *parent directory* of ``pdf_path``
	    (``<parent dir name>.pdf``) and is placed inside ``output_path``.
	    When the source has more than ``trim_pages`` pages, only the leading
	    pages are kept; otherwise the file is copied unchanged.

	    Results are memoized by ``functools.lru_cache`` on the argument
	    tuple, so a repeated call with the same arguments returns the cached
	    path without touching the filesystem again. The arguments must
	    therefore be hashable.

	    Args:
	        pdf_path: Path of the source PDF.
	        output_path: Directory in which the output file is written.
	        trim_pages: Maximum number of leading pages to keep. Note that
	            this parameter shadows the function name inside the body.

	    Returns:
	        The output file path as a string.
	    """
	    parent_dir_name = Path(pdf_path).parent.name
	    output_file_path = Path(output_path) / f"{parent_dir_name}.pdf"

	    # Open the document as a context manager so the underlying file
	    # handle is released on every path, including when an exception is
	    # raised while selecting or saving pages.
	    with pymupdf.open(pdf_path) as doc:
	        num_pages = len(doc)
	        if num_pages > trim_pages:
	            to_select = list(range(trim_pages))
	            doc.select(to_select)
	            doc.ez_save(output_file_path)
	            print("Trimmed pdf to with pages", to_select, "path", output_file_path)
	        else:
	            # Short enough already: a plain copy keeps the file as-is.
	            copy2(pdf_path, str(output_file_path))

	    return str(output_file_path)


	def fix_problematic_imports():
	    """Install a stub for the ``Unimernet`` module in ``sys.modules``.

	    A fresh, otherwise empty module object is created under the dotted
	    name ``magic_pdf.model.sub_modules.mfr.unimernet.Unimernet`` and given
	    a single attribute, ``UnimernetModel``, which is an empty placeholder
	    class. The stub is then stored in ``sys.modules`` under that name,
	    replacing any entry already present.

	    NOTE(review): presumably this lets code that imports ``UnimernetModel``
	    load without the real module's dependencies -- confirm against the
	    callers. Parent packages are not stubbed.
	    """
	    import sys
	    import types

	    stub_name = "magic_pdf.model.sub_modules.mfr.unimernet.Unimernet"

	    # Placeholder class: no bases beyond ``object`` and no attributes.
	    placeholder_cls = type("UnimernetModel", (), {})

	    stub_module = types.ModuleType(stub_name)
	    setattr(stub_module, "UnimernetModel", placeholder_cls)

	    # Register the stub so the import system finds it for this exact name.
	    sys.modules[stub_name] = stub_module