"""Convert PDF pages to Markdown with the SmolDocling model served by vLLM."""
# Prerequisites (the third-party packages imported by this file):
# pip install vllm
# pip install docling_core
# pip install img2table
# pip install pillow

from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
from img2table.document import PDF
from PIL import Image
from vllm import LLM, SamplingParams

# Configuration
MAX_PAGES = 1  # only the first MAX_PAGES pages of each PDF are converted
MAX_IMAGE_WIDTH = 1200  # wider page images are downscaled to this width
MODEL_PATH = "ds4sd/SmolDocling-256M-preview"
PROMPT_TEXT = "Convert page to Docling."

# Initialize the LLM once at import time; every conversion call reuses it.
llm = LLM(
    model=MODEL_PATH,
    limit_mm_per_prompt={"image": 1},
    gpu_memory_utilization=0.4,
)

# NOTE(review): the prompt carries no image placeholder token; confirm against
# the model's chat template that vLLM attaches the page image as intended.
chat_template = f"<|im_start|>User:{PROMPT_TEXT}\nAssistant:"

sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=4096,
)


def convert_smoldocling(path: str, file_name: str):
    """Convert the first ``MAX_PAGES`` pages of a PDF to Markdown.

    Each page image is downscaled to at most ``MAX_IMAGE_WIDTH`` pixels wide
    (aspect ratio preserved), sent to the model with the fixed prompt, and the
    generated DocTags are turned into a Docling document that is exported as
    Markdown.

    Args:
        path: Location of the PDF, passed straight to ``img2table``'s ``PDF``.
        file_name: Not used by this function; kept so the signature stays
            compatible with existing callers.

    Returns:
        A 2-tuple ``(markdown, [])``: the Markdown of every converted page,
        each followed by a blank line, and an always-empty list.
    """
    pdf = PDF(path)
    markdown_parts = []

    for page in pdf.images[:MAX_PAGES]:
        # convert ndarray to Image
        image = Image.fromarray(page)

        # Downscale wide pages, preserving the aspect ratio.
        if image.width > MAX_IMAGE_WIDTH:
            scaled_height = int(MAX_IMAGE_WIDTH * image.height / image.width)
            image = image.resize((MAX_IMAGE_WIDTH, scaled_height))

        # Prepare inputs and run a single-prompt generation.
        llm_input = {"prompt": chat_template, "multi_modal_data": {"image": image}}
        output = llm.generate([llm_input], sampling_params=sampling_params)[0]
        doctags = output.outputs[0].text.lstrip()

        # Populate a Docling document from the generated DocTags.
        doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
        page_doc = DoclingDocument(name="Document")
        # Depending on the docling_core release, load_from_doctags either fills
        # the instance in place or returns a freshly built document. Prefer the
        # returned document when there is one so the export is never empty.
        loaded = page_doc.load_from_doctags(doctags_doc)
        if loaded is not None:
            page_doc = loaded

        markdown_parts.append(page_doc.export_to_markdown() + "\n\n")

    return "".join(markdown_parts), []