|
import functools |
|
from pathlib import Path |
|
|
|
from matplotlib import font_manager |
|
from unstructured.partition.pdf import partition_pdf |
|
from unstructured.partition.pdf_image.analysis import bbox_visualisation |
|
|
|
from .settings import ENABLE_DEBUG_MODE |
|
|
|
# Root directory handed to partition_pdf as `analyzed_image_output_dir_path`.
# convert_unstructured later looks for debug images under
# <this dir>/analysis/<file_name>/bboxes.
UNSTRUCTURED_DEBUG_PATH = Path("/tmp/unstructured")
|
|
|
|
|
def convert_elements_to_markdown(elements) -> str:
    """Render partitioned document elements as a single Markdown string.

    Every element contributes exactly one entry, selected by its ``category``:

    * ``"Title"``: a level-1 heading wrapped in newlines.
    * ``"ListItem"``: a ``- `` bullet.
    * ``"Table"``: the table HTML from ``metadata.text_as_html`` wrapped in
      newlines; when no HTML is available the element's plain text is used
      instead.
    * ``"UncategorizedText"`` and ``"Image"``: an empty entry.
    * any other category: the element's text, unchanged.

    Args:
        elements: Iterable of objects exposing ``category`` and ``text``, plus
            ``metadata.text_as_html`` for tables.

    Returns:
        The entries joined with ``"\\n"``; ``""`` when ``elements`` is empty.
    """
    lines = []

    for element in elements:
        category = element.category

        if category == "Title":
            line = f"\n# {element.text}\n"
        elif category == "ListItem":
            line = f"- {element.text}"
        elif category == "Table":
            # text_as_html can be None (or empty); interpolating it directly
            # would put the literal text "None" into the output, so fall back
            # to the table's plain text.
            table_body = element.metadata.text_as_html or element.text
            line = f"\n{table_body}\n"
        elif category in ("UncategorizedText", "Image"):
            # These categories contribute an empty entry, which still shows
            # up as a blank line once everything is joined.
            # NOTE(review): the Image branch may once have rendered something
            # (e.g. an inline image) -- confirm that empty output is intended.
            line = ""
        else:
            line = element.text

        lines.append(line)

    return "\n".join(lines)
|
|
|
|
|
@functools.lru_cache(maxsize=None)
def get_font():
    """Choose one of the entries reported by ``font_manager.findSystemFonts()``.

    The preferred file names are tried in order (``Arial.ttf`` first, then
    ``DejaVuSans.ttf``); the first reported entry containing a preferred name
    as a substring is returned. If none matches, the first reported entry is
    returned. The result is memoised by ``lru_cache``, so the lookup body runs
    only on the first call.

    Returns:
        The selected entry from ``findSystemFonts()``.

    Raises:
        ValueError: If ``findSystemFonts()`` reports nothing.
    """
    system_fonts = font_manager.findSystemFonts()
    if not system_fonts:
        raise ValueError("No fonts available")

    for wanted in ("Arial.ttf", "DejaVuSans.ttf"):
        # Substring test against each reported entry, in reported order.
        hit = next(
            (candidate for candidate in system_fonts if wanted in candidate),
            None,
        )
        if hit is not None:
            return hit

    # Nothing preferred is installed: fall back to whatever came first.
    return system_fonts[0]
|
|
|
|
|
|
|
# Monkey-patch: rebind the `get_font` attribute of unstructured's
# bbox_visualisation module to the get_font defined in this file.
# NOTE(review): presumably needed because the library's own font lookup does
# not find a usable font on every host -- confirm before removing.
bbox_visualisation.get_font = get_font
|
|
|
|
|
def convert_unstructured(path: str, file_name: str):
    """Partition a PDF with ``unstructured`` and convert it to Markdown.

    The PDF is partitioned with the ``hi_res`` strategy, with table-structure
    inference enabled and ``Image``/``Table`` blocks extracted into the
    element payload. Layout analysis output is toggled by
    ``ENABLE_DEBUG_MODE`` and written under ``UNSTRUCTURED_DEBUG_PATH``.

    Args:
        path: Path of the PDF file, passed to ``partition_pdf`` as ``filename``.
        file_name: Name of the sub-directory under
            ``UNSTRUCTURED_DEBUG_PATH / "analysis"`` that is searched for
            debug images.
            NOTE(review): presumably this must match the directory name
            ``unstructured`` derives from the PDF -- confirm against callers.

    Returns:
        A ``(text, debug_image_paths)`` tuple: the Markdown produced by
        ``convert_elements_to_markdown`` and a sorted list of ``Path`` objects
        from the ``bboxes`` debug directory whose stem contains ``"od_model"``.
        The list is empty when that directory does not exist.
    """
    elements = partition_pdf(
        filename=path,
        strategy="hi_res",
        infer_table_structure=True,
        extract_image_block_types=["Image", "Table"],
        extract_image_block_to_payload=True,
        analysis=ENABLE_DEBUG_MODE,
        analyzed_image_output_dir_path=UNSTRUCTURED_DEBUG_PATH,
    )
    text = convert_elements_to_markdown(elements)

    debug_image_dir = UNSTRUCTURED_DEBUG_PATH / "analysis" / file_name / "bboxes"
    # NOTE(review): this lookup does not depend on ENABLE_DEBUG_MODE, so images
    # left by an earlier debug run for the same file_name are still returned.
    if not debug_image_dir.exists():
        return text, []

    # Path.iterdir() yields entries in arbitrary order; sort so the images
    # come back in a stable, reproducible sequence. The loop variable is
    # deliberately not called `path`, to avoid shadowing the parameter.
    debug_image_paths = sorted(
        image_path
        for image_path in debug_image_dir.iterdir()
        if "od_model" in image_path.stem
    )
    return text, debug_image_paths
|
|