prithivMLmods committed on
Commit
154496b
·
verified ·
1 Parent(s): eb05eab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -3
app.py CHANGED
@@ -27,6 +27,7 @@ from transformers import (
27
  AutoModel,
28
  AutoProcessor,
29
  TextIteratorStreamer,
 
30
  )
31
 
32
  from transformers.image_utils import load_image
@@ -103,6 +104,19 @@ model_z = Qwen2_5_VLForConditionalGeneration.from_pretrained(
103
  MODEL_ID_Z, trust_remote_code=True, torch_dtype=torch.float16
104
  ).to(device).eval()
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  # --- PDF Generation and Preview Utility Function ---
107
  def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str):
108
  """
@@ -192,7 +206,17 @@ def process_document_stream(
192
  if not prompt_input or not prompt_input.strip():
193
  yield "Please enter a prompt.", ""
194
  return
195
-
 
 
 
 
 
 
 
 
 
 
196
  if model_name == "LFM2-VL-450M(fast)": processor, model = processor_m, model_m
197
  elif model_name == "LFM2-VL-1.6B(fast)": processor, model = processor_t, model_t
198
  elif model_name == "ShotVL-3B(cinematic)": processor, model = processor_z, model_z
@@ -257,9 +281,9 @@ def create_gradio_interface():
257
  # Left Column (Inputs)
258
  with gr.Column(scale=1):
259
  model_choice = gr.Dropdown(
260
- choices=["LFM2-VL-1.6B(fast)", "LFM2-VL-450M(fast)", "SmolVLM-Instruct-250M(smol)", "ShotVL-3B(cinematic)", "Megalodon-OCR-Sync-0713(ocr)",
261
  "VLAA-Thinker-Qwen2VL-2B(reason)", "MonkeyOCR-pro-1.2B(ocr)", "Nanonets-OCR-s(ocr)"],
262
- label="Select Model", value="LFM2-VL-450M(fast)"
263
  )
264
  prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query", value="Describe the image!")
265
  image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
 
27
  AutoModel,
28
  AutoProcessor,
29
  TextIteratorStreamer,
30
+ AutoTokenizer,
31
  )
32
 
33
  from transformers.image_utils import load_image
 
104
  MODEL_ID_Z, trust_remote_code=True, torch_dtype=torch.float16
105
  ).to(device).eval()
106
 
107
+ # --- Moondream2 Model Loading ---
108
+ MODEL_ID_MD = "vikhyatk/moondream2"
109
+ REVISION_MD = "2025-06-21"
110
+ moondream = AutoModelForCausalLM.from_pretrained(
111
+ MODEL_ID_MD,
112
+ revision=REVISION_MD,
113
+ trust_remote_code=True,
114
+ torch_dtype=torch.float16,
115
+ device_map={"": "cuda"},
116
+ )
117
+ tokenizer_md = AutoTokenizer.from_pretrained(MODEL_ID_MD, revision=REVISION_MD)
118
+
119
+
120
  # --- PDF Generation and Preview Utility Function ---
121
  def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str):
122
  """
 
206
  if not prompt_input or not prompt_input.strip():
207
  yield "Please enter a prompt.", ""
208
  return
209
+
210
+ if model_name == "Moondream2(vision)":
211
+ image_embeds = moondream.encode_image(image)
212
+ answer = moondream.answer_question(
213
+ image_embeds=image_embeds,
214
+ question=prompt_input,
215
+ tokenizer=tokenizer_md
216
+ )
217
+ yield answer, answer
218
+ return
219
+
220
  if model_name == "LFM2-VL-450M(fast)": processor, model = processor_m, model_m
221
  elif model_name == "LFM2-VL-1.6B(fast)": processor, model = processor_t, model_t
222
  elif model_name == "ShotVL-3B(cinematic)": processor, model = processor_z, model_z
 
281
  # Left Column (Inputs)
282
  with gr.Column(scale=1):
283
  model_choice = gr.Dropdown(
284
+ choices=["LFM2-VL-450M(fast)", "LFM2-VL-1.6B(fast)", "SmolVLM-Instruct-250M(smol)", "Moondream2(vision)", "ShotVL-3B(cinematic)", "Megalodon-OCR-Sync-0713(ocr)",
285
  "VLAA-Thinker-Qwen2VL-2B(reason)", "MonkeyOCR-pro-1.2B(ocr)", "Nanonets-OCR-s(ocr)"],
286
+ label="Select Model", value= "LFM2-VL-450M(fast)"
287
  )
288
  prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query", value="Describe the image!")
289
  image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])