|
|
|
|
|
|
|
|
|
|
|
from docling_core.types.doc import DoclingDocument |
|
from docling_core.types.doc.document import DocTagsDocument |
|
from img2table.document import PDF |
|
from PIL import Image |
|
from vllm import LLM, SamplingParams |
|
|
|
|
|
# Maximum number of PDF pages to convert per document (VLM inference is slow).
MAX_PAGES = 1

# HuggingFace model id of the SmolDocling vision-language model.
MODEL_PATH = "ds4sd/SmolDocling-256M-preview"

# Task instruction sent alongside each page image; SmolDocling emits DocTags.
PROMPT_TEXT = "Convert page to Docling."
|
|
|
|
|
|
|
# Module-level vLLM engine: loaded once at import time.  One image per prompt;
# gpu_memory_utilization=0.4 leaves headroom for other processes on the GPU.
llm = LLM(
    model=MODEL_PATH, limit_mm_per_prompt={"image": 1}, gpu_memory_utilization=0.4
)

# SmolDocling chat format: `<image>` is the multimodal placeholder that vLLM
# substitutes with the page image supplied in `multi_modal_data`.
chat_template = f"<|im_start|>User:<image>{PROMPT_TEXT}<end_of_utterance>\nAssistant:"

# Greedy decoding (temperature=0.0) for deterministic DocTags output; 4096
# tokens caps the per-page generation length.
sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=4096,
)
|
|
|
|
|
def convert_smoldocling(path: str, file_name: str):
    """Convert the first ``MAX_PAGES`` pages of a PDF to Markdown via SmolDocling.

    Each page is rendered to an image, sent through the module-level vLLM
    engine to produce DocTags markup, rebuilt into a ``DoclingDocument``,
    and exported to Markdown.

    Args:
        path: Filesystem path to the input PDF.
        file_name: Unused; kept for interface compatibility with callers.

    Returns:
        A ``(markdown, images)`` tuple: the concatenated Markdown of the
        converted pages, and an always-empty list (placeholder for
        extracted figures, kept for a uniform converter return shape).
    """
    pdf = PDF(path)  # distinct name: do NOT shadow with the per-page DoclingDocument
    output_md = ""

    for page_array in pdf.images[:MAX_PAGES]:
        page_image = Image.fromarray(page_array)

        # Downscale wide pages to bound VLM input size, preserving aspect ratio.
        max_width = 1200
        if page_image.width > max_width:
            page_image = page_image.resize(
                (max_width, int(max_width * page_image.height / page_image.width))
            )

        llm_input = {"prompt": chat_template, "multi_modal_data": {"image": page_image}}
        output = llm.generate([llm_input], sampling_params=sampling_params)[0]

        # Leading whitespace would corrupt DocTags parsing.
        doctags = output.outputs[0].text.lstrip()

        # Rebuild a structured document from the generated DocTags markup.
        doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
            [doctags], [page_image]
        )
        docling_doc = DoclingDocument(name="Document")
        docling_doc.load_from_doctags(doctags_doc)

        output_md += docling_doc.export_to_markdown() + "\n\n"

    return output_md, []
|
|