Spaces:

chunking-ai
/

smoldocling-preview

Paused

App Files Files Community

taprosoft committed on Feb 25

Commit

36add35

1 Parent(s): 2418a0c

feat: add image support

Browse files

Files changed (3) hide show

backends/marker.py +34 -0
backends/mineru.py +22 -0
backends/unstructured.py +5 -2

backends/marker.py CHANGED Viewed

@@ -1,8 +1,12 @@
 from pathlib import Path
 from marker.converters.pdf import PdfConverter
 from marker.models import create_model_dict
 from marker.output import text_from_rendered
 # Marker init
 marker_converter = PdfConverter(
@@ -13,9 +17,39 @@ marker_converter = PdfConverter(
 )
 def convert_marker(path: str, file_name: str):
     rendered = marker_converter(path)
     text, _, images = text_from_rendered(rendered)
     debug_image_dir = Path(rendered.metadata.get("debug_data_path"))
     debug_image_paths = [
         path for path in debug_image_dir.iterdir() if "pdf_page" in path.stem

+import base64
+import io
+import re
 from pathlib import Path
 from marker.converters.pdf import PdfConverter
 from marker.models import create_model_dict
 from marker.output import text_from_rendered
+from marker.settings import settings
 # Marker init
 marker_converter = PdfConverter(
 )
def img_to_html(img, img_alt):
    """Render an image as a self-contained HTML ``<img>`` tag.

    The image is serialized in the format named by
    ``settings.OUTPUT_IMAGE_FORMAT`` and embedded as a base64 ``data:`` URI,
    so the resulting tag does not depend on any file on disk.

    Args:
        img: Image object exposing ``save(fp, format=...)``
            (presumably a PIL image -- confirm against marker's output).
        img_alt: Alternative text for the tag. It is HTML-escaped before
            being placed in the ``alt`` attribute.

    Returns:
        The ``<img>`` element as a string.
    """
    from html import escape  # local import: only this helper needs it

    image_format = settings.OUTPUT_IMAGE_FORMAT
    buffer = io.BytesIO()
    img.save(buffer, format=image_format)
    encoded = base64.b64encode(buffer.getvalue()).decode()
    # The alt text originates from the converted document; escape it so a
    # quote or angle bracket cannot terminate the attribute or inject markup.
    safe_alt = escape(img_alt, quote=True)
    return (
        f'<img src="data:image/{image_format.lower()}'
        f';base64,{encoded}" alt="{safe_alt}" style="max-width: 100%;">'
    )
def markdown_insert_images(markdown, images):
    """Replace Markdown image references with inline HTML ``<img>`` tags.

    Every ``![alt](path)`` or ``![alt](path "title")`` reference whose
    ``path`` is a key of ``images`` is swapped for the tag produced by
    ``img_to_html``. References to unknown paths are left exactly as written.

    Args:
        markdown: Markdown text to scan.
        images: Mapping from image path (as it appears in the Markdown) to
            an image object accepted by ``img_to_html``.

    Returns:
        The Markdown text with all known image references inlined.
    """
    image_pattern = re.compile(
        r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))'
    )
    # Cache rendered tags so an image referenced several times with the same
    # alt text is only encoded once.
    rendered_tags = {}

    def to_html(match):
        image_path = match.group("image_path")
        if image_path not in images:
            return match.group(0)
        image_alt = match.group("image_title")
        cache_key = (image_path, image_alt)
        if cache_key not in rendered_tags:
            rendered_tags[cache_key] = img_to_html(images[image_path], image_alt)
        return rendered_tags[cache_key]

    # A single substitution pass avoids rescanning the (growing, base64-laden)
    # text once per image, which a findall + str.replace loop would do.
    return image_pattern.sub(to_html, markdown)
 def convert_marker(path: str, file_name: str):
     rendered = marker_converter(path)
     text, _, images = text_from_rendered(rendered)
+    text = markdown_insert_images(text, images)
     debug_image_dir = Path(rendered.metadata.get("debug_data_path"))
     debug_image_paths = [
         path for path in debug_image_dir.iterdir() if "pdf_page" in path.stem

backends/mineru.py CHANGED Viewed

@@ -1,3 +1,6 @@
 from pathlib import Path
 import pymupdf
@@ -13,6 +16,23 @@ def read_fn(path):
     return disk_rw.read(path)
 def do_process_mineru(input_path, output_dir):
     file_name = Path(input_path).stem
     output_dir = Path(output_dir)
@@ -45,6 +65,8 @@ def convert_mineru(path: str, file_name: str):
     with open(local_md_dir / f"{file_name}.md", "r") as file:
         text = file.read()
     debug_pdf = str(local_md_dir / (file_name + "_layout.pdf"))
     doc = pymupdf.open(debug_pdf)  # open document
     for page in doc:  # iterate through the pages

+import base64
+import os
+import re
 from pathlib import Path
 import pymupdf
     return disk_rw.read(path)
def image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as base64 text."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def replace_image_with_base64(markdown_text, image_dir_path):
    """Inline locally stored images referenced by Markdown as data URIs.

    Each ``![alt](target)`` reference whose target resolves to an existing
    file under ``image_dir_path`` is rewritten to
    ``![target](data:<mime>;base64,<payload>)``. The MIME type is guessed
    from the file name and falls back to ``image/jpeg`` when it cannot be
    determined. References that do not resolve to a file (remote URLs,
    existing data URIs, missing files) are left unchanged instead of raising.

    Args:
        markdown_text: Markdown text to scan.
        image_dir_path: Directory that relative image targets are resolved
            against.

    Returns:
        The Markdown text with resolvable image references inlined.
    """
    import mimetypes  # local import: only needed for MIME detection here

    pattern = r"\!\[(?:[^\]]*)\]\(([^)]+)\)"

    def replace(match):
        relative_path = match.group(1)
        full_path = os.path.join(image_dir_path, relative_path)
        if not os.path.isfile(full_path):
            # Not a local file: keep the original reference intact.
            return match.group(0)
        mime_type, _ = mimetypes.guess_type(full_path)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"  # historical default of this function
        base64_image = image_to_base64(full_path)
        return f"![{relative_path}](data:{mime_type};base64,{base64_image})"

    return re.sub(pattern, replace, markdown_text)
 def do_process_mineru(input_path, output_dir):
     file_name = Path(input_path).stem
     output_dir = Path(output_dir)
     with open(local_md_dir / f"{file_name}.md", "r") as file:
         text = file.read()
+    text = replace_image_with_base64(text, local_md_dir)
     debug_pdf = str(local_md_dir / (file_name + "_layout.pdf"))
     doc = pymupdf.open(debug_pdf)  # open document
     for page in doc:  # iterate through the pages

backends/unstructured.py CHANGED Viewed

@@ -20,6 +20,9 @@ def convert_elements_to_markdown(elements):
             line = f"\n{e.metadata.text_as_html}\n"
         elif e.category == "UncategorizedText":
             line = ""
         else:
             line = e.text
@@ -54,8 +57,8 @@ def convert_unstructured(path: str, file_name: str):
         strategy="hi_res",
         infer_table_structure=True,
         # extract_images_in_pdf=True,
-        # extract_image_block_types=["Image", "Table"],
-        # extract_image_block_to_payload=False,
         analysis=True,
         analyzed_image_output_dir_path=UNSTRUCTURED_DEBUG_PATH,
     )

             line = f"\n{e.metadata.text_as_html}\n"
         elif e.category == "UncategorizedText":
             line = ""
+        elif e.category == "Image":
+            # base64 image
+            line = f"![{e.text}](data:image/jpeg;base64," f"{e.metadata.image_base64})"
         else:
             line = e.text
         strategy="hi_res",
         infer_table_structure=True,
         # extract_images_in_pdf=True,
+        extract_image_block_types=["Image", "Table"],
+        extract_image_block_to_payload=True,
         analysis=True,
         analyzed_image_output_dir_path=UNSTRUCTURED_DEBUG_PATH,
     )