taprosoft committed on
Commit
36add35
·
1 Parent(s): 2418a0c

feat: add image support

Browse files
backends/marker.py CHANGED
@@ -1,8 +1,12 @@
 
 
 
1
  from pathlib import Path
2
 
3
  from marker.converters.pdf import PdfConverter
4
  from marker.models import create_model_dict
5
  from marker.output import text_from_rendered
 
6
 
7
  # Marker init
8
  marker_converter = PdfConverter(
@@ -13,9 +17,39 @@ marker_converter = PdfConverter(
13
  )
14
 
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def convert_marker(path: str, file_name: str):
17
  rendered = marker_converter(path)
18
  text, _, images = text_from_rendered(rendered)
 
19
  debug_image_dir = Path(rendered.metadata.get("debug_data_path"))
20
  debug_image_paths = [
21
  path for path in debug_image_dir.iterdir() if "pdf_page" in path.stem
 
1
+ import base64
2
+ import io
3
+ import re
4
  from pathlib import Path
5
 
6
  from marker.converters.pdf import PdfConverter
7
  from marker.models import create_model_dict
8
  from marker.output import text_from_rendered
9
+ from marker.settings import settings
10
 
11
  # Marker init
12
  marker_converter = PdfConverter(
 
17
  )
18
 
19
 
20
def img_to_html(img, img_alt):
    """Render an image as a self-contained HTML ``<img>`` tag.

    The image is serialized in ``settings.OUTPUT_IMAGE_FORMAT`` and embedded
    as a base64 ``data:`` URI, so the returned markup needs no external file.

    Args:
        img: Object exposing ``save(fp, format=...)`` (presumably a PIL
            image as returned by marker -- TODO confirm).
        img_alt: Alternative text for the image. It is HTML-escaped before
            being placed in the ``alt`` attribute.

    Returns:
        The ``<img ...>`` tag as a string.
    """
    import html  # local import: only this function needs it

    buffer = io.BytesIO()
    img.save(buffer, format=settings.OUTPUT_IMAGE_FORMAT)
    encoded = base64.b64encode(buffer.getvalue()).decode()

    # Alt text comes from the converted document; escape quotes and angle
    # brackets so it cannot terminate the attribute or inject markup.
    alt_text = "" if img_alt is None else str(img_alt)
    safe_alt = html.escape(alt_text, quote=True)

    return (
        f'<img src="data:image/{settings.OUTPUT_IMAGE_FORMAT.lower()}'
        f';base64,{encoded}" alt="{safe_alt}" style="max-width: 100%;">'
    )
30
+
31
+
32
def markdown_insert_images(markdown, images):
    """Replace Markdown image tags with inline HTML ``<img>`` tags.

    Every ``![alt](path)`` tag whose ``path`` is a key of ``images`` is
    replaced by the output of ``img_to_html`` for the mapped image. Tags
    whose path is not in ``images`` are left untouched.

    Args:
        markdown: Markdown text to process.
        images: Mapping from image path (as written in the Markdown) to the
            image object passed on to ``img_to_html``. May be empty or
            ``None``, in which case ``markdown`` is returned unchanged.

    Returns:
        The Markdown text with known images inlined.
    """
    if not images:
        return markdown

    tag_pattern = (
        r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))'
    )

    def _inline(match):
        image_path = match.group("image_path")
        if image_path not in images:
            # Unknown image: keep the original Markdown tag verbatim.
            return match.group(0)
        return img_to_html(images[image_path], match.group("image_title"))

    # A single substitution pass avoids rescanning the whole document once
    # per tag and never touches HTML that was already inserted.
    return re.sub(tag_pattern, _inline, markdown)
47
+
48
+
49
  def convert_marker(path: str, file_name: str):
50
  rendered = marker_converter(path)
51
  text, _, images = text_from_rendered(rendered)
52
+ text = markdown_insert_images(text, images)
53
  debug_image_dir = Path(rendered.metadata.get("debug_data_path"))
54
  debug_image_paths = [
55
  path for path in debug_image_dir.iterdir() if "pdf_page" in path.stem
backends/mineru.py CHANGED
@@ -1,3 +1,6 @@
 
 
 
1
  from pathlib import Path
2
 
3
  import pymupdf
@@ -13,6 +16,23 @@ def read_fn(path):
13
  return disk_rw.read(path)
14
 
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def do_process_mineru(input_path, output_dir):
17
  file_name = Path(input_path).stem
18
  output_dir = Path(output_dir)
@@ -45,6 +65,8 @@ def convert_mineru(path: str, file_name: str):
45
  with open(local_md_dir / f"{file_name}.md", "r") as file:
46
  text = file.read()
47
 
 
 
48
  debug_pdf = str(local_md_dir / (file_name + "_layout.pdf"))
49
  doc = pymupdf.open(debug_pdf) # open document
50
  for page in doc: # iterate through the pages
 
1
+ import base64
2
+ import os
3
+ import re
4
  from pathlib import Path
5
 
6
  import pymupdf
 
16
  return disk_rw.read(path)
17
 
18
 
19
def image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The base64 encoding of the file's bytes, decoded as a UTF-8 string.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")
22
+
23
+
24
def replace_image_with_base64(markdown_text, image_dir_path):
    """Inline local images referenced in Markdown as base64 ``data:`` URIs.

    Each ``![alt](target)`` tag whose ``target`` resolves to an existing
    file under ``image_dir_path`` is rewritten to
    ``![target](data:<mime>;base64,<payload>)``. The MIME type is guessed
    from the file extension and falls back to ``image/jpeg``.

    Tags that point to ``http(s)://`` URLs, to existing ``data:`` URIs, or
    to files that do not exist are left unchanged instead of raising.

    Args:
        markdown_text: Markdown text to process.
        image_dir_path: Directory against which relative image paths are
            resolved.

    Returns:
        The Markdown text with local images embedded.
    """
    import mimetypes  # local import: only this function needs it

    pattern = r"\!\[(?:[^\]]*)\]\(([^)]+)\)"

    def replace(match):
        relative_path = match.group(1)
        # Remote or already-inlined images cannot be read from disk.
        if relative_path.startswith(("http://", "https://", "data:")):
            return match.group(0)

        full_path = os.path.join(image_dir_path, relative_path)
        if not os.path.isfile(full_path):
            # Keep the original tag rather than aborting the conversion.
            return match.group(0)

        mime_type = mimetypes.guess_type(str(full_path))[0] or "image/jpeg"
        base64_image = image_to_base64(full_path)
        return f"![{relative_path}](data:{mime_type};base64,{base64_image})"

    return re.sub(pattern, replace, markdown_text)
34
+
35
+
36
  def do_process_mineru(input_path, output_dir):
37
  file_name = Path(input_path).stem
38
  output_dir = Path(output_dir)
 
65
  with open(local_md_dir / f"{file_name}.md", "r") as file:
66
  text = file.read()
67
 
68
+ text = replace_image_with_base64(text, local_md_dir)
69
+
70
  debug_pdf = str(local_md_dir / (file_name + "_layout.pdf"))
71
  doc = pymupdf.open(debug_pdf) # open document
72
  for page in doc: # iterate through the pages
backends/unstructured.py CHANGED
@@ -20,6 +20,9 @@ def convert_elements_to_markdown(elements):
20
  line = f"\n{e.metadata.text_as_html}\n"
21
  elif e.category == "UncategorizedText":
22
  line = ""
 
 
 
23
  else:
24
  line = e.text
25
 
@@ -54,8 +57,8 @@ def convert_unstructured(path: str, file_name: str):
54
  strategy="hi_res",
55
  infer_table_structure=True,
56
  # extract_images_in_pdf=True,
57
- # extract_image_block_types=["Image", "Table"],
58
- # extract_image_block_to_payload=False,
59
  analysis=True,
60
  analyzed_image_output_dir_path=UNSTRUCTURED_DEBUG_PATH,
61
  )
 
20
  line = f"\n{e.metadata.text_as_html}\n"
21
  elif e.category == "UncategorizedText":
22
  line = ""
23
+ elif e.category == "Image":
24
+ # base64 image
25
+ line = f"![{e.text}](data:image/jpeg;base64," f"{e.metadata.image_base64})"
26
  else:
27
  line = e.text
28
 
 
57
  strategy="hi_res",
58
  infer_table_structure=True,
59
  # extract_images_in_pdf=True,
60
+ extract_image_block_types=["Image", "Table"],
61
+ extract_image_block_to_payload=True,
62
  analysis=True,
63
  analyzed_image_output_dir_path=UNSTRUCTURED_DEBUG_PATH,
64
  )