Spaces: Running on Zero

update app models(++) (#13)

- update app models(++) (cb89a3bc952a29dba920d66ee6d7edfbbb42d7e8)

app.py CHANGED
@@ -150,14 +150,15 @@ model_o = AutoModelForVision2Seq.from_pretrained(
     MODEL_ID_O, trust_remote_code=True, torch_dtype=torch.float16, _attn_implementation="flash_attention_2"
 ).to(device).eval()
 
-# ---
-
-
-
-
-
-
+# --- AIDC-AI/Ovis2-1B Model Loading ---
+MODEL_ID_O2 = 'AIDC-AI/Ovis2-1B'
+model_o2 = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID_O2,
+    torch_dtype=torch.bfloat16,
+    multimodal_max_length=8192,
+    trust_remote_code=True
 ).to(device).eval()
+text_tokenizer_o2 = model_o2.get_text_tokenizer()
 
 
 # --- PDF Generation and Preview Utility Function ---
@@ -250,6 +251,9 @@ def process_document_stream(
         yield "Please enter a prompt.", ""
         return
 
+    # --- Model-specific inference paths ---
+
+    # Moondream2 has a unique generation method
     if model_name == "Moondream2(vision)":
         image_embeds = moondream.encode_image(image)
         answer = moondream.answer_question(
@@ -259,8 +263,51 @@ def process_document_stream(
         )
         yield answer, answer
         return
+
+    # Ovis2-1B has a custom preprocessing pipeline
+    elif model_name == "AIDC-AI/Ovis2-1B(ovis)":
+        conversations = [{"from": "human", "value": f"<image>\n{prompt_input}"}]
+
+        _, input_ids, pixel_values = model_o2.preprocess_inputs(conversations, [image], max_partition=16)
+        attention_mask = torch.ne(input_ids, text_tokenizer_o2.pad_token_id)
+
+        model_inputs = {
+            "inputs": input_ids.unsqueeze(0).to(device=model_o2.device),
+            "attention_mask": attention_mask.unsqueeze(0).to(device=model_o2.device),
+            "pixel_values": [pixel_values.to(dtype=torch.bfloat16, device=model_o2.device)]
+        }
+
+        streamer = TextIteratorStreamer(text_tokenizer_o2, skip_prompt=True, skip_special_tokens=True)
+
+        generation_kwargs = {
+            **model_inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+            "eos_token_id": model_o2.generation_config.eos_token_id,
+            "pad_token_id": text_tokenizer_o2.pad_token_id,
+            "use_cache": True
+        }
+
+        thread = Thread(target=model_o2.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            time.sleep(0.01)
+            yield buffer, buffer
+
+        yield buffer, buffer
+        return
 
-    #
+    # --- Standardized inference path for most other models ---
+
+    # Select model and processor
     if model_name == "LFM2-VL-450M(fast)": processor, model = processor_m, model_m
     elif model_name == "LFM2-VL-1.6B(fast)": processor, model = processor_t, model_t
     elif model_name == "ShotVL-3B(cinematic)": processor, model = processor_z, model_z
@@ -274,7 +321,6 @@ def process_document_stream(
     elif model_name == "TBAC-VLR1-3B(open-r1)": processor, model = processor_g, model_g
     elif model_name == "OCRFlux-3B(ocr)": processor, model = processor_v, model_v
     elif model_name == "SmolVLM-500M-Instruct(smol)": processor, model = processor_o, model_o
-    elif model_name == "SmolVLM2-500M-Video-Instruct(video)": processor, model = processor_sv, model_sv
     else:
         yield "Invalid model selected.", ""
         return
@@ -333,7 +379,7 @@ def create_gradio_interface():
         model_choice = gr.Dropdown(
             choices=["LFM2-VL-450M(fast)", "LFM2-VL-1.6B(fast)", "SmolVLM-Instruct-250M(smol)", "Moondream2(vision)", "ShotVL-3B(cinematic)", "Megalodon-OCR-Sync-0713(ocr)",
                      "VLAA-Thinker-Qwen2VL-2B(reason)", "MonkeyOCR-pro-1.2B(ocr)", "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)", "Nanonets-OCR-s(ocr)",
-                     "LMM-R1-MGT-PerceReason(reason)", "OCRFlux-3B(ocr)", "TBAC-VLR1-3B(open-r1)", "SmolVLM-500M-Instruct(smol)", "SmolVLM2-500M-Video-Instruct(video)"],
+                     "LMM-R1-MGT-PerceReason(reason)", "OCRFlux-3B(ocr)", "TBAC-VLR1-3B(open-r1)", "SmolVLM-500M-Instruct(smol)", "AIDC-AI/Ovis2-1B(ovis)"],
             label="Select Model", value= "LFM2-VL-450M(fast)"
         )
         prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query", value="Describe the image!")
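For quick local testing, the Ovis2-1B path added in this commit condenses to the standalone sketch below. This is a minimal sketch assembled from the diff (new lines 153-161 and 266-306 of app.py), not a verbatim excerpt: the import lines and the device fallback are assumptions, sample.jpg is a hypothetical test image, and the fixed sampling values stand in for the Gradio slider inputs that app.py passes at runtime; the UI-pacing time.sleep(0.01) is replaced by printing tokens directly.

# Standalone sketch of the new Ovis2-1B streaming path (assumptions noted above).
from threading import Thread

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, TextIteratorStreamer

device = "cuda" if torch.cuda.is_available() else "cpu"  # assumed; app.py defines its own device

# Model loading, as added at lines 153-161 of app.py
MODEL_ID_O2 = 'AIDC-AI/Ovis2-1B'
model_o2 = AutoModelForCausalLM.from_pretrained(
    MODEL_ID_O2,
    torch_dtype=torch.bfloat16,
    multimodal_max_length=8192,
    trust_remote_code=True
).to(device).eval()
text_tokenizer_o2 = model_o2.get_text_tokenizer()

image = Image.open("sample.jpg").convert("RGB")  # hypothetical test image
prompt_input = "Describe the image!"

# Ovis2 ships its own preprocessing; the <image> tag marks where visual tokens
# are spliced into the token sequence.
conversations = [{"from": "human", "value": f"<image>\n{prompt_input}"}]
_, input_ids, pixel_values = model_o2.preprocess_inputs(conversations, [image], max_partition=16)
attention_mask = torch.ne(input_ids, text_tokenizer_o2.pad_token_id)

streamer = TextIteratorStreamer(text_tokenizer_o2, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = {
    "inputs": input_ids.unsqueeze(0).to(device=model_o2.device),
    "attention_mask": attention_mask.unsqueeze(0).to(device=model_o2.device),
    "pixel_values": [pixel_values.to(dtype=torch.bfloat16, device=model_o2.device)],
    "streamer": streamer,
    "max_new_tokens": 512,        # placeholder; app.py reads these five values from sliders
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
    "top_k": 50,
    "repetition_penalty": 1.1,
    "eos_token_id": model_o2.generation_config.eos_token_id,
    "pad_token_id": text_tokenizer_o2.pad_token_id,
    "use_cache": True,
}

# generate() blocks until decoding finishes, so it runs on a worker thread
# while the main thread drains the streamer token by token.
Thread(target=model_o2.generate, kwargs=generation_kwargs).start()
for new_text in streamer:
    print(new_text, end="", flush=True)

The Thread plus TextIteratorStreamer pairing is what makes the output stream: model_o2.generate() only returns once decoding completes, so it runs off the main thread while the caller consumes partial text as it is produced, which is also why the Gradio handler yields a growing buffer instead of a single final string.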