smoldocling-preview / backends /smoldocling.py
taprosoft
fix: move to VLLM
90ca638
raw
history blame
1.83 kB
# Prerequisites:
# pip install torch
# pip install docling_core
# pip install transformers
# pip install vllm
# pip install img2table
# pip install pillow
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
from img2table.document import PDF
from PIL import Image
from vllm import LLM, SamplingParams
# Configuration
MAX_PAGES = 1  # upper bound on the number of PDF pages converted per call
MODEL_PATH = "ds4sd/SmolDocling-256M-preview"
PROMPT_TEXT = "Convert page to Docling."

# Prompt sent with every page image; PROMPT_TEXT is interpolated once at import.
chat_template = f"<|im_start|>User:<image>{PROMPT_TEXT}<end_of_utterance>\nAssistant:"

# Generation settings shared by all requests (temperature 0.0, up to 4096 tokens).
sampling_params = SamplingParams(temperature=0.0, max_tokens=4096)

# Build the vLLM engine once at import time so every conversion reuses it.
# Each prompt carries a single image, and the engine is given a
# gpu_memory_utilization of 0.4.
llm = LLM(
    model=MODEL_PATH,
    limit_mm_per_prompt={"image": 1},
    gpu_memory_utilization=0.4,
)
def convert_smoldocling(path: str, file_name: str):
    """Convert the leading pages of a PDF to Markdown with SmolDocling.

    The first ``MAX_PAGES`` page images of the PDF are processed one at a
    time: each is shrunk to at most 1200 pixels wide (aspect ratio kept),
    sent to the module-level vLLM engine together with ``chat_template``,
    and the generated DocTags are loaded into a fresh ``DoclingDocument``
    that is exported as Markdown.

    Args:
        path: Location of the PDF, handed to ``img2table``'s ``PDF``.
        file_name: Accepted for the caller's benefit; not used here.

    Returns:
        A ``(markdown, [])`` tuple. ``markdown`` is the Markdown of every
        processed page, each followed by a blank line; the second element
        is always an empty list.
    """
    width_limit = 1200
    pdf = PDF(path)
    page_markdown = []

    for page_array in pdf.images[:MAX_PAGES]:
        # Page images arrive as arrays; PIL is needed for resizing and for vLLM.
        page_image = Image.fromarray(page_array)
        if page_image.width > width_limit:
            scaled_height = int(width_limit * page_image.height / page_image.width)
            page_image = page_image.resize((width_limit, scaled_height))

        # One generate call per page, one image per prompt.
        request = {"prompt": chat_template, "multi_modal_data": {"image": page_image}}
        results = llm.generate([request], sampling_params=sampling_params)
        doctags = results[0].outputs[0].text.lstrip()

        # Pair the DocTags with the image they were generated from, then
        # materialise a Docling document to export from.
        tagged = DocTagsDocument.from_doctags_and_image_pairs([doctags], [page_image])
        page_doc = DoclingDocument(name="Document")
        page_doc.load_from_doctags(tagged)

        page_markdown.append(page_doc.export_to_markdown() + "\n\n")

    return "".join(page_markdown), []