|
|
|
|
|
|
|
|
|
|
|
from docling_core.types.doc import DoclingDocument |
|
from docling_core.types.doc.document import DocTagsDocument |
|
from img2table.document import PDF |
|
from PIL import Image |
|
from vllm import LLM, SamplingParams |
|
|
|
|
|
# Maximum number of PDF pages to convert per document (VLM inference is slow).
MAX_PAGES = 1

# HuggingFace model id of the SmolDocling vision-language model.
MODEL_PATH = "ds4sd/SmolDocling-256M-preview"

# Task instruction sent alongside each page image; SmolDocling emits DocTags.
PROMPT_TEXT = "Convert page to Docling."
|
|
|
|
|
|
|
# Module-level vLLM engine: loaded once at import time.  One image per prompt;
# gpu_memory_utilization=0.4 leaves headroom for other processes on the GPU.
llm = LLM(
    model=MODEL_PATH, limit_mm_per_prompt={"image": 1}, gpu_memory_utilization=0.4
)

# SmolDocling chat format: `<image>` is the multimodal placeholder that vLLM
# substitutes with the page image supplied in `multi_modal_data`.
chat_template = f"<|im_start|>User:<image>{PROMPT_TEXT}<end_of_utterance>\nAssistant:"

# Greedy decoding (temperature=0.0) for deterministic DocTags output; 4096
# tokens caps the per-page generation length.
sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=4096,
)
|
|
|
|
|
def convert_smoldocling(path: str, file_name: str):
    """Convert the first ``MAX_PAGES`` pages of a PDF to Markdown via SmolDocling.

    Each page is rendered to an image, sent through the module-level vLLM
    engine to produce DocTags markup, rebuilt into a ``DoclingDocument``,
    and exported to Markdown.

    Args:
        path: Filesystem path to the input PDF.
        file_name: Unused; kept for interface compatibility with callers.

    Returns:
        A ``(markdown, images)`` tuple: the concatenated Markdown of the
        converted pages, and an always-empty list (placeholder for
        extracted figures, kept for a uniform converter return shape).
    """
    pdf = PDF(path)  # distinct name: do NOT shadow with the per-page DoclingDocument
    output_md = ""

    for page_array in pdf.images[:MAX_PAGES]:
        page_image = Image.fromarray(page_array)

        # Downscale wide pages to bound VLM input size, preserving aspect ratio.
        max_width = 1200
        if page_image.width > max_width:
            page_image = page_image.resize(
                (max_width, int(max_width * page_image.height / page_image.width))
            )

        llm_input = {"prompt": chat_template, "multi_modal_data": {"image": page_image}}
        output = llm.generate([llm_input], sampling_params=sampling_params)[0]

        # Leading whitespace would corrupt DocTags parsing.
        doctags = output.outputs[0].text.lstrip()

        # Rebuild a structured document from the generated DocTags markup.
        doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
            [doctags], [page_image]
        )
        docling_doc = DoclingDocument(name="Document")
        docling_doc.load_from_doctags(doctags_doc)

        output_md += docling_doc.export_to_markdown() + "\n\n"

    return output_md, []
|
|