File size: 2,211 Bytes
77fbded 3bce890 77fbded 36add35 77fbded 36add35 3bce890 77fbded 0933b39 77fbded |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import functools
from pathlib import Path
from matplotlib import font_manager
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.pdf_image.analysis import bbox_visualisation
from .settings import ENABLE_DEBUG_MODE
# Root directory handed to ``partition_pdf`` as ``analyzed_image_output_dir_path``;
# ``convert_unstructured`` later looks beneath it for the generated debug images.
UNSTRUCTURED_DEBUG_PATH = Path("/tmp/unstructured")
def convert_elements_to_markdown(elements):
    """Render partitioned document elements as one Markdown string.

    Every element contributes exactly one entry, selected by its
    ``category`` attribute:

    * ``"Title"``: a level-1 heading surrounded by blank lines.
    * ``"ListItem"``: a ``- `` bullet.
    * ``"Table"``: ``metadata.text_as_html`` surrounded by blank lines,
      or the element's plain text when no HTML rendering is present.
    * ``"UncategorizedText"``: an empty entry (the text is dropped).
    * ``"Image"``: an inline data-URI image when ``metadata.image_base64``
      is present, otherwise an empty entry.
    * anything else: the element's text, unchanged.

    Args:
        elements: Iterable of objects exposing ``category``, ``text`` and
            ``metadata`` attributes.

    Returns:
        The entries joined with newline characters; an empty string when
        ``elements`` is empty.
    """
    lines = []
    for e in elements:
        category = e.category
        if category == "Title":
            line = f"\n# {e.text}\n"
        elif category == "ListItem":
            line = f"- {e.text}"
        elif category == "Table":
            # Without an HTML rendering the f-string would emit the literal
            # text "None"; fall back to the plain text instead.
            html = getattr(e.metadata, "text_as_html", None)
            line = f"\n{html or e.text}\n"
        elif category == "UncategorizedText":
            line = ""
        elif category == "Image":
            # base64 image: embed the payload as a data URI so the picture
            # survives in the Markdown output.
            payload = getattr(e.metadata, "image_base64", None)
            if payload:
                # NOTE(review): JPEG is assumed when no mime type is
                # recorded on the element - confirm against the extractor.
                mime = getattr(e.metadata, "image_mime_type", None) or "image/jpeg"
                line = f"![image](data:{mime};base64,{payload})"
            else:
                line = ""
        else:
            line = e.text
        lines.append(line)
    return "\n".join(lines)
@functools.lru_cache(maxsize=None)
def get_font():
    """Return the path of a system font file, favouring Arial then DejaVu Sans.

    The list from ``font_manager.findSystemFonts()`` is searched for a path
    containing ``Arial.ttf``; failing that, one containing
    ``DejaVuSans.ttf``; failing that, the first reported font is used. The
    result is memoised by ``functools.lru_cache``.

    Raises:
        ValueError: If ``font_manager.findSystemFonts()`` reports no fonts.
    """
    system_fonts = font_manager.findSystemFonts()
    if not system_fonts:
        raise ValueError("No fonts available")

    wanted = ("Arial.ttf", "DejaVuSans.ttf")
    # Preference order is the outer loop, so any Arial hit beats any DejaVu hit.
    matches = (
        candidate
        for name in wanted
        for candidate in system_fonts
        if name in candidate
    )
    return next(matches, system_fonts[0])
# Monkey patch: replace the ``get_font`` attribute of unstructured's
# ``bbox_visualisation`` module with the ``get_font`` defined in this file.
# NOTE(review): presumably needed because the library's own font lookup fails
# on hosts that lack the font it expects - confirm.
bbox_visualisation.get_font = get_font
def _natural_sort_key(path):
    """Return a sort key for ``path.name`` that compares digit runs numerically."""
    import re  # local import so this block does not depend on file-level imports

    return [
        (0, int(chunk), "") if chunk.isdecimal() else (1, 0, chunk)
        for chunk in re.split(r"(\d+)", path.name)
        if chunk
    ]


def convert_unstructured(path: str, file_name: str):
    """Partition a PDF with unstructured and return Markdown plus debug images.

    Args:
        path: Location of the PDF, passed to ``partition_pdf`` as ``filename``.
        file_name: Name of the sub-directory under
            ``UNSTRUCTURED_DEBUG_PATH / "analysis"`` that is searched for
            debug images.
            NOTE(review): presumably this must equal the directory name
            unstructured derives from the input file - confirm.

    Returns:
        A ``(text, debug_image_paths)`` tuple. ``text`` is the output of
        ``convert_elements_to_markdown`` for the partitioned elements.
        ``debug_image_paths`` is a list of ``Path`` objects from the
        ``bboxes`` directory whose stem contains ``"od_model"``, in natural
        name order (embedded numbers compare numerically); it is empty when
        that directory does not exist.
    """
    elements = partition_pdf(
        filename=path,
        # mandatory to use ``hi_res`` strategy
        strategy="hi_res",
        infer_table_structure=True,
        extract_image_block_types=["Image", "Table"],
        extract_image_block_to_payload=True,
        analysis=ENABLE_DEBUG_MODE,
        analyzed_image_output_dir_path=UNSTRUCTURED_DEBUG_PATH,
    )
    text = convert_elements_to_markdown(elements)

    debug_image_dir = UNSTRUCTURED_DEBUG_PATH / "analysis" / file_name / "bboxes"
    # ``is_dir`` (rather than ``exists``) keeps ``iterdir`` from raising when
    # the path happens to be a regular file.
    if not debug_image_dir.is_dir():
        return text, []

    # ``iterdir`` yields entries in arbitrary order; sort so callers get the
    # images in a stable, name-based order.
    # NOTE(review): this function never clears the directory, so images left
    # by an earlier run with the same ``file_name`` are returned as well.
    debug_image_paths = sorted(
        (
            image_path
            for image_path in debug_image_dir.iterdir()
            if "od_model" in image_path.stem
        ),
        key=_natural_sort_key,
    )
    return text, debug_image_paths
|