Spaces:

chunking-ai
/

smoldocling-preview

Paused

File size: 2,211 Bytes

import functools
from pathlib import Path

from matplotlib import font_manager
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.pdf_image.analysis import bbox_visualisation

from .settings import ENABLE_DEBUG_MODE

# Root directory handed to ``partition_pdf`` as ``analyzed_image_output_dir_path``;
# ``convert_unstructured`` later looks for the debug bbox images underneath it.
UNSTRUCTURED_DEBUG_PATH = Path("/tmp/unstructured")


def convert_elements_to_markdown(elements):
    """Render a sequence of partitioned document elements as Markdown.

    Each element contributes exactly one entry, chosen by ``element.category``:

    * ``"Title"``: a level-1 heading surrounded by blank lines.
    * ``"ListItem"``: a ``- `` bullet.
    * ``"Table"``: ``metadata.text_as_html`` surrounded by blank lines, or the
      element's plain text when no HTML rendering is available.
    * ``"UncategorizedText"``: an empty entry (the text itself is dropped).
    * ``"Image"``: an inline ``data:`` URI image using the element text as
      alt text, or just the element text when no base64 payload is attached.
    * anything else: the element text verbatim.

    Args:
        elements: Iterable of objects exposing ``category``, ``text`` and
            ``metadata`` attributes.

    Returns:
        The entries joined with newlines; ``""`` when ``elements`` is empty.
    """
    lines = []

    for element in elements:
        category = element.category
        # Guard against a missing text so the final join never sees ``None``.
        text = element.text or ""
        metadata = getattr(element, "metadata", None)

        if category == "Title":
            line = f"\n# {text}\n"
        elif category == "ListItem":
            line = f"- {text}"
        elif category == "Table":
            # Without an HTML rendering, fall back to the plain text instead
            # of emitting the literal string "None".
            table_html = getattr(metadata, "text_as_html", None)
            line = f"\n{table_html or text}\n"
        elif category == "UncategorizedText":
            line = ""
        elif category == "Image":
            payload = getattr(metadata, "image_base64", None)
            if payload:
                # Prefer the MIME type recorded with the payload; JPEG was the
                # previously hard-coded value and remains the default.
                mime_type = getattr(metadata, "image_mime_type", None) or "image/jpeg"
                line = f"![{text}](data:{mime_type};base64,{payload})"
            else:
                # No payload: keep whatever text the element carries rather
                # than producing a broken data URI.
                line = text
        else:
            line = text

        lines.append(line)

    return "\n".join(lines)


@functools.lru_cache(maxsize=None)
def get_font():
    """Return the path of a system font, favouring Arial, then DejaVu Sans.

    The lookup result is memoised, so the system font scan runs only once.

    Returns:
        The first system font path containing ``Arial.ttf``; failing that,
        the first one containing ``DejaVuSans.ttf``; failing that, the first
        system font reported by matplotlib.

    Raises:
        ValueError: If matplotlib reports no system fonts at all.
    """
    system_fonts = font_manager.findSystemFonts()
    if not system_fonts:
        raise ValueError("No fonts available")

    # Preference order is the outer loop: every Arial match outranks any
    # DejaVu Sans match, regardless of where they sit in ``system_fonts``.
    matches = (
        font_path
        for wanted in ("Arial.ttf", "DejaVuSans.ttf")
        for font_path in system_fonts
        if wanted in font_path
    )
    return next(matches, system_fonts[0])


# Monkey patch: replace the ``get_font`` attribute of unstructured's
# ``bbox_visualisation`` module with the local ``get_font``, which also accepts
# DejaVu Sans and otherwise falls back to the first font matplotlib finds.
# NOTE(review): presumably needed because the library's own lookup does not find
# a usable font in this deployment -- confirm.
bbox_visualisation.get_font = get_font


def convert_unstructured(path: str, file_name: str):
    """Partition a PDF with unstructured and return Markdown plus debug images.

    Args:
        path: Location of the PDF, passed to ``partition_pdf`` as ``filename``.
        file_name: Name of the sub-directory of
            ``UNSTRUCTURED_DEBUG_PATH / "analysis"`` in which the debug images
            for this document are looked up.

    Returns:
        A ``(text, debug_image_paths)`` tuple: ``text`` is the Markdown
        rendering of the partitioned elements, and ``debug_image_paths`` is a
        sorted list of ``Path`` objects for the files in the ``bboxes``
        directory whose stem contains ``"od_model"`` (empty when that
        directory does not exist).
    """
    elements = partition_pdf(
        filename=path,
        # mandatory to use ``hi_res`` strategy
        strategy="hi_res",
        infer_table_structure=True,
        extract_image_block_types=["Image", "Table"],
        extract_image_block_to_payload=True,
        analysis=ENABLE_DEBUG_MODE,
        analyzed_image_output_dir_path=UNSTRUCTURED_DEBUG_PATH,
    )
    text = convert_elements_to_markdown(elements)

    # NOTE(review): this directory is keyed only by ``file_name``, so images
    # left over from an earlier run with the same name would be returned as
    # well -- confirm that is acceptable.
    debug_image_dir = UNSTRUCTURED_DEBUG_PATH / "analysis" / file_name / "bboxes"
    if not debug_image_dir.is_dir():
        return text, []

    # ``iterdir()`` yields entries in arbitrary order; sort so callers get a
    # stable, reproducible listing.
    debug_image_paths = sorted(
        image_path
        for image_path in debug_image_dir.iterdir()
        if "od_model" in image_path.stem
    )
    return text, debug_image_paths