File size: 1,830 Bytes
f6539d9 90ca638 f6539d9 90ca638 1e92b0d 90ca638 f6539d9 90ca638 f6539d9 90ca638 f6539d9 22e6cb5 f6539d9 90ca638 f6539d9 90ca638 f6539d9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
# Prerequisites:
# pip install torch
# pip install docling_core
# pip install transformers
# pip install vllm
# pip install img2table
# pip install pillow
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
from img2table.document import PDF
from PIL import Image
from vllm import LLM, SamplingParams
# --- Configuration -----------------------------------------------------------
# Number of leading pages taken from each document.
MAX_PAGES = 1
# Model identifier handed to vLLM.
MODEL_PATH = "ds4sd/SmolDocling-256M-preview"
# Instruction text embedded in every prompt.
PROMPT_TEXT = "Convert page to Docling."

# Single-turn prompt: one <image> placeholder followed by the instruction.
chat_template = f"<|im_start|>User:<image>{PROMPT_TEXT}<end_of_utterance>\nAssistant:"

# --- Engine ------------------------------------------------------------------
# The vLLM engine is created once, when this module is imported.
llm = LLM(
    model=MODEL_PATH,
    limit_mm_per_prompt={"image": 1},
    gpu_memory_utilization=0.4,
)

# Generation settings shared by every request.
sampling_params = SamplingParams(temperature=0.0, max_tokens=4096)
def convert_smoldocling(path: str, file_name: str) -> tuple:
    """Convert the leading pages of a PDF to Markdown with SmolDocling.

    For each of the first ``MAX_PAGES`` page images, the page is downscaled
    to at most 1200 px wide, sent to the module-level vLLM engine together
    with ``chat_template``, and the generated DocTags are loaded into a
    ``DoclingDocument`` whose Markdown export is collected.

    Args:
        path: Source of the PDF; passed unchanged to ``PDF(...)``.
        file_name: Not used by this function; kept so the call signature
            stays unchanged.

    Returns:
        A 2-tuple ``(markdown, extras)``. ``markdown`` is the Markdown of
        every processed page, each followed by a blank line (an empty string
        when no page was processed). ``extras`` is always an empty list.
    """
    # Pages wider than this are shrunk; the aspect ratio is preserved.
    max_width = 1200

    pdf = PDF(path)
    page_markdown = []

    for page_array in pdf.images[:MAX_PAGES]:
        # Pages arrive as ndarrays; convert to a PIL image for resizing and
        # for pairing with the DocTags below.
        page_image = Image.fromarray(page_array)
        if page_image.width > max_width:
            scaled_height = int(max_width * page_image.height / page_image.width)
            page_image = page_image.resize((max_width, scaled_height))

        # One request per page: the fixed prompt plus this page's image.
        llm_input = {"prompt": chat_template, "multi_modal_data": {"image": page_image}}
        output = llm.generate([llm_input], sampling_params=sampling_params)[0]
        doctags = output.outputs[0].text.lstrip()

        # Build a Docling document from the generated DocTags and the image.
        doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
            [doctags], [page_image]
        )
        page_doc = DoclingDocument(name="Document")
        page_doc.load_from_doctags(doctags_doc)

        # Only Markdown is exported here; every page keeps its trailing
        # blank line so the combined output matches page-by-page appends.
        page_markdown.append(page_doc.export_to_markdown() + "\n\n")

    return "".join(page_markdown), []
|