prithivMLmods committed on
Commit
69a99ff
·
verified ·
1 Parent(s): 9e03ccb
Files changed (1) hide show
  1. app.py +13 -1
app.py CHANGED
@@ -150,6 +150,16 @@ model_o = AutoModelForVision2Seq.from_pretrained(
150
  MODEL_ID_O, trust_remote_code=True, torch_dtype=torch.float16, _attn_implementation="flash_attention_2"
151
  ).to(device).eval()
152
 
 
 
 
 
 
 
 
 
 
 
153
  # --- PDF Generation and Preview Utility Function ---
154
  def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str):
155
  """
@@ -250,6 +260,7 @@ def process_document_stream(
250
  yield answer, answer
251
  return
252
 
 
253
  if model_name == "LFM2-VL-450M(fast)": processor, model = processor_m, model_m
254
  elif model_name == "LFM2-VL-1.6B(fast)": processor, model = processor_t, model_t
255
  elif model_name == "ShotVL-3B(cinematic)": processor, model = processor_z, model_z
@@ -263,6 +274,7 @@ def process_document_stream(
263
  elif model_name == "TBAC-VLR1-3B(open-r1)": processor, model = processor_g, model_g
264
  elif model_name == "OCRFlux-3B(ocr)": processor, model = processor_v, model_v
265
  elif model_name == "SmolVLM-500M-Instruct(smol)": processor, model = processor_o, model_o
 
266
  else:
267
  yield "Invalid model selected.", ""
268
  return
@@ -321,7 +333,7 @@ def create_gradio_interface():
321
  model_choice = gr.Dropdown(
322
  choices=["LFM2-VL-450M(fast)", "LFM2-VL-1.6B(fast)", "SmolVLM-Instruct-250M(smol)", "Moondream2(vision)", "ShotVL-3B(cinematic)", "Megalodon-OCR-Sync-0713(ocr)",
323
  "VLAA-Thinker-Qwen2VL-2B(reason)", "MonkeyOCR-pro-1.2B(ocr)", "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)", "Nanonets-OCR-s(ocr)",
324
- "LMM-R1-MGT-PerceReason(reason)", "OCRFlux-3B(ocr)", "TBAC-VLR1-3B(open-r1)", "SmolVLM-500M-Instruct(smol)"],
325
  label="Select Model", value= "LFM2-VL-450M(fast)"
326
  )
327
  prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query", value="Describe the image!")
 
150
  MODEL_ID_O, trust_remote_code=True, torch_dtype=torch.float16, _attn_implementation="flash_attention_2"
151
  ).to(device).eval()
152
 
153
+ # --- NEW MODEL: SmolVLM2-500M-Video-Instruct ---
154
+ MODEL_ID_SV = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
155
+ processor_sv = AutoProcessor.from_pretrained(MODEL_ID_SV, trust_remote_code=True)
156
+ model_sv = AutoModelForImageTextToText.from_pretrained(
157
+ MODEL_ID_SV,
158
+ trust_remote_code=True,
159
+ torch_dtype=torch.float16
160
+ ).to(device).eval()
161
+
162
+
163
  # --- PDF Generation and Preview Utility Function ---
164
  def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str):
165
  """
 
260
  yield answer, answer
261
  return
262
 
263
+ # Model and processor selection
264
  if model_name == "LFM2-VL-450M(fast)": processor, model = processor_m, model_m
265
  elif model_name == "LFM2-VL-1.6B(fast)": processor, model = processor_t, model_t
266
  elif model_name == "ShotVL-3B(cinematic)": processor, model = processor_z, model_z
 
274
  elif model_name == "TBAC-VLR1-3B(open-r1)": processor, model = processor_g, model_g
275
  elif model_name == "OCRFlux-3B(ocr)": processor, model = processor_v, model_v
276
  elif model_name == "SmolVLM-500M-Instruct(smol)": processor, model = processor_o, model_o
277
+ elif model_name == "SmolVLM2-500M-Video-Instruct(video)": processor, model = processor_sv, model_sv
278
  else:
279
  yield "Invalid model selected.", ""
280
  return
 
333
  model_choice = gr.Dropdown(
334
  choices=["LFM2-VL-450M(fast)", "LFM2-VL-1.6B(fast)", "SmolVLM-Instruct-250M(smol)", "Moondream2(vision)", "ShotVL-3B(cinematic)", "Megalodon-OCR-Sync-0713(ocr)",
335
  "VLAA-Thinker-Qwen2VL-2B(reason)", "MonkeyOCR-pro-1.2B(ocr)", "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)", "Nanonets-OCR-s(ocr)",
336
+ "LMM-R1-MGT-PerceReason(reason)", "OCRFlux-3B(ocr)", "TBAC-VLR1-3B(open-r1)", "SmolVLM-500M-Instruct(smol)", "SmolVLM2-500M-Video-Instruct(video)"],
337
  label="Select Model", value= "LFM2-VL-450M(fast)"
338
  )
339
  prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query", value="Describe the image!")