taprosoft committed
Commit 90ca638 · 1 Parent(s): 9c2f030

fix: move to VLLM

Files changed (2):
  1. backends/smoldocling.py +16 -30
  2. requirements.txt +1 -0
backends/smoldocling.py CHANGED
@@ -3,34 +3,28 @@
 # pip install docling_core
 # pip install transformers
 
-import torch
 from docling_core.types.doc import DoclingDocument
 from docling_core.types.doc.document import DocTagsDocument
 from img2table.document import PDF
 from PIL import Image
-from transformers import AutoModelForVision2Seq, AutoProcessor
+from vllm import LLM, SamplingParams
 
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Configuration
 MAX_PAGES = 1
+MODEL_PATH = "ds4sd/SmolDocling-256M-preview"
+PROMPT_TEXT = "Convert page to Docling."
 
 # Initialize processor and model
-processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
-model = AutoModelForVision2Seq.from_pretrained(
-    "ds4sd/SmolDocling-256M-preview",
-    torch_dtype=torch.bfloat16,
-    _attn_implementation="eager",
-).to(DEVICE)
+# Initialize LLM
+llm = LLM(
+    model=MODEL_PATH, limit_mm_per_prompt={"image": 1}, gpu_memory_utilization=0.4
+)
+chat_template = f"<|im_start|>User:<image>{PROMPT_TEXT}<end_of_utterance>\nAssistant:"
 
-# Create input messages
-messages = [
-    {
-        "role": "user",
-        "content": [
-            {"type": "image"},
-            {"type": "text", "text": "Convert this page to docling."},
-        ],
-    },
-]
+sampling_params = SamplingParams(
+    temperature=0.0,
+    max_tokens=4096,
+)
 
 
 def convert_smoldocling(path: str, file_name: str):
@@ -48,18 +42,10 @@ def convert_smoldocling(path: str, file_name: str):
     )
 
     # Prepare inputs
-    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-    inputs = processor(text=prompt, images=[image], return_tensors="pt")
-    inputs = inputs.to(DEVICE)
+    llm_input = {"prompt": chat_template, "multi_modal_data": {"image": image}}
+    output = llm.generate([llm_input], sampling_params=sampling_params)[0]
 
-    # Generate outputs
-    generated_ids = model.generate(**inputs, max_new_tokens=4096)
-    prompt_length = inputs.input_ids.shape[1]
-    trimmed_generated_ids = generated_ids[:, prompt_length:]
-    doctags = processor.batch_decode(
-        trimmed_generated_ids,
-        skip_special_tokens=False,
-    )[0].lstrip()
+    doctags = output.outputs[0].text.lstrip()
 
     # Populate document
     doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
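
For context, this change swaps the Hugging Face transformers generate() loop for vLLM's offline LLM engine: the model, prompt template, and sampling parameters are built once at module load, and each page image is passed as multi_modal_data alongside the DocTags prompt. Below is a minimal, self-contained sketch of the new code path, not the repo's exact backend: it assumes a page image already on disk ("page.png" is a placeholder, whereas the real backend renders PDF pages via img2table), and the final DoclingDocument export follows the docling_core SmolDocling example, so the exact call may differ across docling_core versions.

# Minimal sketch of the vLLM-based SmolDocling path introduced by this commit.
# Assumptions: "page.png" is a placeholder input image, and the DoclingDocument
# export mirrors the SmolDocling model-card example (API may vary by version).
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
from PIL import Image
from vllm import LLM, SamplingParams

MODEL_PATH = "ds4sd/SmolDocling-256M-preview"
PROMPT_TEXT = "Convert page to Docling."

# One image per prompt, as configured in the new backend
llm = LLM(model=MODEL_PATH, limit_mm_per_prompt={"image": 1}, gpu_memory_utilization=0.4)
sampling_params = SamplingParams(temperature=0.0, max_tokens=4096)
prompt = f"<|im_start|>User:<image>{PROMPT_TEXT}<end_of_utterance>\nAssistant:"

# Load a page image and generate DocTags with vLLM
image = Image.open("page.png").convert("RGB")  # placeholder page image
output = llm.generate(
    [{"prompt": prompt, "multi_modal_data": {"image": image}}],
    sampling_params=sampling_params,
)[0]
doctags = output.outputs[0].text.lstrip()

# Rebuild a DoclingDocument from the DocTags and export it
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
doc = DoclingDocument(name="Document")
doc.load_from_doctags(doctags_doc)
print(doc.export_to_markdown())

Dropping torch and the manual DEVICE/dtype handling is expected with this switch, since vLLM manages device placement and batching internally; gpu_memory_utilization=0.4 caps the fraction of GPU memory the engine pre-allocates.
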
requirements.txt CHANGED
@@ -11,3 +11,4 @@ docling_core
 opencv-contrib-python
 huggingface_hub
 spaces
+vllm