# smoldocling-preview/backends/unstructured.py
# Last change: taprosoft, "fix: minor update backend" (commit 7e20950), 2.21 kB
import functools
import re
from pathlib import Path

from matplotlib import font_manager
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.pdf_image.analysis import bbox_visualisation

from .settings import ENABLE_DEBUG_MODE
# Directory handed to partition_pdf as analyzed_image_output_dir_path and used
# as the root when looking up the debug bounding-box images afterwards.
UNSTRUCTURED_DEBUG_PATH = Path("/tmp/unstructured")
def convert_elements_to_markdown(elements):
    """Render partitioned document elements as a single Markdown string.

    Each element contributes one entry, chosen by its ``category``:

    * ``Title`` -> a level-1 heading surrounded by blank lines.
    * ``ListItem`` -> a ``- `` bullet.
    * ``Table`` -> the HTML in ``metadata.text_as_html`` surrounded by blank
      lines, or the element's plain text when no HTML is available.
    * ``UncategorizedText`` -> an empty entry (the text is dropped).
    * ``Image`` -> an inline ``data:`` URI image built from
      ``metadata.image_base64``, or the element's plain text when no payload
      is available.
    * anything else -> the element's text unchanged.

    Args:
        elements: Iterable of objects exposing ``category``, ``text`` and
            ``metadata`` attributes.

    Returns:
        The entries joined with newlines; an empty string for no elements.
    """
    lines = []
    for e in elements:
        category = e.category
        if category == "Title":
            line = f"\n# {e.text}\n"
        elif category == "ListItem":
            line = f"- {e.text}"
        elif category == "Table":
            # Without an HTML rendering, fall back to the plain text instead
            # of emitting the literal string "None".
            table_html = getattr(e.metadata, "text_as_html", None)
            line = f"\n{table_html}\n" if table_html else e.text
        elif category == "UncategorizedText":
            line = ""
        elif category == "Image":
            payload = getattr(e.metadata, "image_base64", None)
            if payload:
                # Use the recorded MIME type when present; JPEG stays the
                # default so existing output is unchanged.
                mime = getattr(e.metadata, "image_mime_type", None) or "image/jpeg"
                line = f"![{e.text}](data:{mime};base64,{payload})"
            else:
                # No payload: a data URI ending in "None" would be a broken
                # image, so keep just the text.
                line = e.text
        else:
            line = e.text
        lines.append(line)
    return "\n".join(lines)
@functools.lru_cache(maxsize=None)
def get_font():
    """Pick a system font file path, favouring Arial and then DejaVu Sans.

    The font list comes from matplotlib's ``font_manager.findSystemFonts()``.
    A successful result is memoised, so the lookup runs once.

    Returns:
        The first discovered font path containing ``Arial.ttf``; failing
        that, the first containing ``DejaVuSans.ttf``; failing that, the
        first discovered font path of any kind.

    Raises:
        ValueError: If no system fonts are found.
    """
    installed = font_manager.findSystemFonts()
    if not installed:
        raise ValueError("No fonts available")

    wanted_names = ("Arial.ttf", "DejaVuSans.ttf")
    # Preference order wins over discovery order: every installed path is
    # checked against the first wanted name before the second is tried.
    hit = next(
        (
            candidate
            for wanted in wanted_names
            for candidate in installed
            if wanted in candidate
        ),
        None,
    )
    return installed[0] if hit is None else hit
# Monkey patch: replace the get_font attribute of unstructured's
# bbox_visualisation module with the cached get_font defined in this file.
# NOTE(review): presumably needed because the library's own font lookup fails
# on hosts that lack the font it expects -- confirm.
bbox_visualisation.get_font = get_font
def _natural_sort_key(image_path):
    """Return a sort key that orders digit runs in the file name numerically.

    Plain string sorting would place ``page10`` before ``page2``; splitting
    the name into text and integer chunks avoids that.
    """
    chunks = re.split(r"(\d+)", image_path.name)
    # With a capturing group, odd positions always hold the digit runs.
    return [int(chunk) if i % 2 else chunk for i, chunk in enumerate(chunks)]


def convert_unstructured(path: str, file_name: str):
    """Convert a PDF to Markdown with unstructured's ``hi_res`` strategy.

    Args:
        path: Location of the PDF, passed to ``partition_pdf`` as ``filename``.
        file_name: Name of the sub-directory under
            ``UNSTRUCTURED_DEBUG_PATH / "analysis"`` in which the debug
            images for this document are looked up.
            NOTE(review): presumably the PDF's base name as used by
            unstructured when writing analysis output -- confirm with caller.

    Returns:
        A ``(text, debug_image_paths)`` tuple: the Markdown rendering of the
        document, and a list of ``Path`` objects for the debug images whose
        stem contains ``"od_model"``. The list is empty when debug mode is
        off or no debug directory exists, and is otherwise sorted in natural
        file-name order.
    """
    elements = partition_pdf(
        filename=path,
        # mandatory to use ``hi_res`` strategy
        strategy="hi_res",
        infer_table_structure=True,
        extract_image_block_types=["Image", "Table"],
        extract_image_block_to_payload=True,
        analysis=ENABLE_DEBUG_MODE,
        analyzed_image_output_dir_path=UNSTRUCTURED_DEBUG_PATH,
    )
    text = convert_elements_to_markdown(elements)

    # Without analysis nothing was written for this run, so anything found on
    # disk would be left over from an earlier run.
    if not ENABLE_DEBUG_MODE:
        return text, []

    debug_image_dir = UNSTRUCTURED_DEBUG_PATH / "analysis" / file_name / "bboxes"
    if not debug_image_dir.exists():
        return text, []

    # iterdir() yields entries in arbitrary order; sort so the images come
    # back in a stable, page-wise order.
    debug_image_paths = sorted(
        (
            image_path
            for image_path in debug_image_dir.iterdir()
            if "od_model" in image_path.stem
        ),
        key=_natural_sort_key,
    )
    return text, debug_image_paths