prithivMLmods committed
Commit cee682d · verified · 1 parent: 69a99ff

update app models(++) (#13)


- update app models(++) (cb89a3bc952a29dba920d66ee6d7edfbbb42d7e8)

Files changed (1)
  1. app.py +56 -10
app.py CHANGED
@@ -150,14 +150,15 @@ model_o = AutoModelForVision2Seq.from_pretrained(
     MODEL_ID_O, trust_remote_code=True, torch_dtype=torch.float16, _attn_implementation="flash_attention_2"
 ).to(device).eval()
 
-# --- NEW MODEL: SmolVLM2-500M-Video-Instruct ---
-MODEL_ID_SV = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
-processor_sv = AutoProcessor.from_pretrained(MODEL_ID_SV, trust_remote_code=True)
-model_sv = AutoModelForImageTextToText.from_pretrained(
-    MODEL_ID_SV,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
+# --- AIDC-AI/Ovis2-1B Model Loading ---
+MODEL_ID_O2 = 'AIDC-AI/Ovis2-1B'
+model_o2 = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID_O2,
+    torch_dtype=torch.bfloat16,
+    multimodal_max_length=8192,
+    trust_remote_code=True
 ).to(device).eval()
+text_tokenizer_o2 = model_o2.get_text_tokenizer()
 
 
 # --- PDF Generation and Preview Utility Function ---
@@ -250,6 +251,9 @@ def process_document_stream(
         yield "Please enter a prompt.", ""
         return
 
+    # --- Model-specific inference paths ---
+
+    # Moondream2 has a unique generation method
     if model_name == "Moondream2(vision)":
         image_embeds = moondream.encode_image(image)
         answer = moondream.answer_question(
@@ -259,8 +263,51 @@ def process_document_stream(
         )
         yield answer, answer
         return
+
+    # Ovis2-1B has a custom preprocessing pipeline
+    elif model_name == "AIDC-AI/Ovis2-1B(ovis)":
+        conversations = [{"from": "human", "value": f"<image>\n{prompt_input}"}]
+
+        _, input_ids, pixel_values = model_o2.preprocess_inputs(conversations, [image], max_partition=16)
+        attention_mask = torch.ne(input_ids, text_tokenizer_o2.pad_token_id)
+
+        model_inputs = {
+            "inputs": input_ids.unsqueeze(0).to(device=model_o2.device),
+            "attention_mask": attention_mask.unsqueeze(0).to(device=model_o2.device),
+            "pixel_values": [pixel_values.to(dtype=torch.bfloat16, device=model_o2.device)]
+        }
+
+        streamer = TextIteratorStreamer(text_tokenizer_o2, skip_prompt=True, skip_special_tokens=True)
+
+        generation_kwargs = {
+            **model_inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+            "eos_token_id": model_o2.generation_config.eos_token_id,
+            "pad_token_id": text_tokenizer_o2.pad_token_id,
+            "use_cache": True
+        }
+
+        thread = Thread(target=model_o2.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            time.sleep(0.01)
+            yield buffer, buffer
+
+        yield buffer, buffer
+        return
 
-    # Model and processor selection
+    # --- Standardized inference path for most other models ---
+
+    # Select model and processor
     if model_name == "LFM2-VL-450M(fast)": processor, model = processor_m, model_m
     elif model_name == "LFM2-VL-1.6B(fast)": processor, model = processor_t, model_t
     elif model_name == "ShotVL-3B(cinematic)": processor, model = processor_z, model_z
@@ -274,7 +321,6 @@ def process_document_stream(
     elif model_name == "TBAC-VLR1-3B(open-r1)": processor, model = processor_g, model_g
     elif model_name == "OCRFlux-3B(ocr)": processor, model = processor_v, model_v
     elif model_name == "SmolVLM-500M-Instruct(smol)": processor, model = processor_o, model_o
-    elif model_name == "SmolVLM2-500M-Video-Instruct(video)": processor, model = processor_sv, model_sv
     else:
         yield "Invalid model selected.", ""
         return
@@ -333,7 +379,7 @@ def create_gradio_interface():
     model_choice = gr.Dropdown(
         choices=["LFM2-VL-450M(fast)", "LFM2-VL-1.6B(fast)", "SmolVLM-Instruct-250M(smol)", "Moondream2(vision)", "ShotVL-3B(cinematic)", "Megalodon-OCR-Sync-0713(ocr)",
                  "VLAA-Thinker-Qwen2VL-2B(reason)", "MonkeyOCR-pro-1.2B(ocr)", "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)", "Nanonets-OCR-s(ocr)",
-                 "LMM-R1-MGT-PerceReason(reason)", "OCRFlux-3B(ocr)", "TBAC-VLR1-3B(open-r1)", "SmolVLM-500M-Instruct(smol)", "SmolVLM2-500M-Video-Instruct(video)"],
+                 "LMM-R1-MGT-PerceReason(reason)", "OCRFlux-3B(ocr)", "TBAC-VLR1-3B(open-r1)", "SmolVLM-500M-Instruct(smol)", "AIDC-AI/Ovis2-1B(ovis)"],
         label="Select Model", value= "LFM2-VL-450M(fast)"
     )
     prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query", value="Describe the image!")
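
For reference, the Ovis2-1B path this commit adds can be exercised outside the Gradio app. Below is a minimal standalone sketch assembled from the diff above; stream_answer is a hypothetical helper name, and the imports are assumptions based on what app.py already uses (torch, threading.Thread, TextIteratorStreamer). Note that preprocess_inputs and get_text_tokenizer come from the model's trust_remote_code API as shown in the diff, not from standard transformers classes.

    import torch
    from threading import Thread
    from PIL import Image
    from transformers import AutoModelForCausalLM, TextIteratorStreamer

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load Ovis2-1B the same way the commit does: bfloat16 weights, remote code.
    model = AutoModelForCausalLM.from_pretrained(
        "AIDC-AI/Ovis2-1B",
        torch_dtype=torch.bfloat16,
        multimodal_max_length=8192,
        trust_remote_code=True,
    ).to(device).eval()
    tokenizer = model.get_text_tokenizer()

    def stream_answer(image: Image.Image, prompt: str, max_new_tokens: int = 512):
        # Ovis takes a conversation list with an inline <image> placeholder;
        # preprocess_inputs returns (prompt_text, input_ids, pixel_values).
        conversations = [{"from": "human", "value": f"<image>\n{prompt}"}]
        _, input_ids, pixel_values = model.preprocess_inputs(
            conversations, [image], max_partition=16
        )
        attention_mask = torch.ne(input_ids, tokenizer.pad_token_id)

        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = {
            "inputs": input_ids.unsqueeze(0).to(model.device),
            "attention_mask": attention_mask.unsqueeze(0).to(model.device),
            # As in the diff, pixel_values is passed as a list of per-image tensors.
            "pixel_values": [pixel_values.to(dtype=torch.bfloat16, device=model.device)],
            "streamer": streamer,
            "max_new_tokens": max_new_tokens,
            "eos_token_id": model.generation_config.eos_token_id,
            "pad_token_id": tokenizer.pad_token_id,
            "use_cache": True,
        }
        # Run generate on a worker thread so this generator can drain the streamer,
        # mirroring the app's streaming yield loop.
        Thread(target=model.generate, kwargs=generation_kwargs).start()
        for piece in streamer:
            yield piece

    # Usage:
    # for chunk in stream_answer(Image.open("page.png"), "Describe the image!"):
    #     print(chunk, end="", flush=True)

The sampling knobs the app wires in (temperature, top_p, top_k, repetition_penalty) are omitted here, so generation falls back to the model's default config; add them to generation_kwargs as in the diff if you want the same behavior.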