Spaces: Running on Zero

update app (#15)
Commit 69d5c6df8b9fd0236d1a25e519150d82e46685fb
app.py CHANGED
@@ -28,6 +28,8 @@ from transformers import (
     AutoProcessor,
     TextIteratorStreamer,
     AutoTokenizer,
+    LlavaOnevisionForConditionalGeneration,
+    LlavaOnevisionProcessor,
 )
 
 from transformers.image_utils import load_image
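Review note: both newly imported classes ship only in recent transformers releases. A defensive import guard, my suggestion rather than part of this commit, would fail fast with an actionable message on older installs:

try:
    from transformers import (
        LlavaOnevisionForConditionalGeneration,
        LlavaOnevisionProcessor,
    )
except ImportError as exc:  # older transformers predates the Llava-OneVision classes
    raise ImportError(
        "llava-onevision support requires a newer transformers release; "
        "try: pip install -U transformers"
    ) from exc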
@@ -150,10 +152,13 @@ model_o = AutoModelForVision2Seq.from_pretrained(
     MODEL_ID_O, trust_remote_code=True, torch_dtype=torch.float16, _attn_implementation="flash_attention_2"
 ).to(device).eval()
 
-# ---
-
-
-
+# --- New Model: llava-onevision ---
+MODEL_ID_LO = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
+processor_lo = LlavaOnevisionProcessor.from_pretrained(MODEL_ID_LO)
+model_lo = LlavaOnevisionForConditionalGeneration.from_pretrained(
+    MODEL_ID_LO,
+    torch_dtype=torch.float16
+).to(device).eval()
 
 
 # --- PDF Generation and Preview Utility Function ---
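Review note: a minimal standalone sketch for sanity-checking the newly loaded checkpoint outside the Gradio app. This is my example, not part of the commit; it assumes a CUDA device (matching the fp16 loading above) and a public sample image URL, and the prompt string mirrors the one the diff adds to process_document_stream below:

import torch
from transformers import LlavaOnevisionForConditionalGeneration, LlavaOnevisionProcessor
from transformers.image_utils import load_image

MODEL_ID_LO = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
device = "cuda" if torch.cuda.is_available() else "cpu"

processor_lo = LlavaOnevisionProcessor.from_pretrained(MODEL_ID_LO)
model_lo = LlavaOnevisionForConditionalGeneration.from_pretrained(
    MODEL_ID_LO, torch_dtype=torch.float16
).to(device).eval()

# Any PIL image works here; this URL is just an illustrative sample.
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg")
prompt = "<|im_start|>user <image>\nDescribe the image!<|im_end|><|im_start|>assistant"
inputs = processor_lo(text=prompt, images=image, return_tensors="pt").to(device, torch.float16)

with torch.inference_mode():
    output_ids = model_lo.generate(**inputs, max_new_tokens=128)
print(processor_lo.decode(output_ids[0], skip_special_tokens=True))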
@@ -246,6 +251,7 @@ def process_document_stream(
         yield "Please enter a prompt.", ""
         return
 
+    # --- Special Handling for Moondream2 ---
     if model_name == "Moondream2(vision)":
         image_embeds = moondream.encode_image(image)
         answer = moondream.answer_question(
@@ -255,61 +261,38 @@ def process_document_stream(
         )
         yield answer, answer
         return
-
-    if model_name == "Florence-2-Flux-Large(caption)":
-        processor, model = processor_fl, model_fl
-        task_prompt = "<DESCRIPTION>"
-        if any(keyword in prompt_input.lower() for keyword in ["ocr", "read", "text", "extract"]):
-            task_prompt = "<OCR>"
-
-        prompt = task_prompt + prompt_input
-        if image.mode != "RGB":
-            image = image.convert("RGB")
-
-        inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
-
-        generated_ids = model.generate(
-            input_ids=inputs["input_ids"],
-            pixel_values=inputs["pixel_values"],
-            max_new_tokens=max_new_tokens,
-            do_sample=True,
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-        )
-
-        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
-        parsed_answer = processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.width, image.height))
-        final_answer = parsed_answer.get(task_prompt, "Error: Could not parse model output.")
-
-        if isinstance(final_answer, dict) and 'labels' in final_answer:
-            final_answer = " ".join(final_answer['labels'])
-
-        yield final_answer, final_answer
-        return
 
-
-    if model_name == "LFM2-VL-450M(fast)": processor, model = processor_m, model_m
-    elif model_name == "LFM2-VL-1.6B(fast)": processor, model = processor_t, model_t
-    elif model_name == "ShotVL-3B(cinematic)": processor, model = processor_z, model_z
-    elif model_name == "SmolVLM-Instruct-250M(smol)": processor, model = processor_c, model_c
-    elif model_name == "MonkeyOCR-pro-1.2B(ocr)": processor, model = processor_g, model_g
-    elif model_name == "VLAA-Thinker-Qwen2VL-2B(reason)": processor, model = processor_i, model_i
-    elif model_name == "Nanonets-OCR-s(ocr)": processor, model = processor_a, model_a
-    elif model_name == "Megalodon-OCR-Sync-0713(ocr)": processor, model = processor_x, model_x
-    elif model_name == "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)": processor, model = processor_n, model_n
-    elif model_name == "LMM-R1-MGT-PerceReason(reason)": processor, model = processor_f, model_f
-    elif model_name == "TBAC-VLR1-3B(open-r1)": processor, model = processor_g, model_g
-    elif model_name == "OCRFlux-3B(ocr)": processor, model = processor_v, model_v
-    elif model_name == "SmolVLM-500M-Instruct(smol)": processor, model = processor_o, model_o
+    processor = None
+    model = None
+
+    # --- Special Handling for Llava-OneVision ---
+    if model_name == "llava-onevision-qwen2-0.5b-ov-hf(mini)":
+        processor, model = processor_lo, model_lo
+        prompt = f"<|im_start|>user <image>\n{prompt_input}<|im_end|><|im_start|>assistant"
+        inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch.float16)
+    # --- Generic Handling for all other models ---
     else:
-        yield "Invalid model selected.", ""
-        return
-
-    messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt_input}]}]
-    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
+        if model_name == "LFM2-VL-450M(fast)": processor, model = processor_m, model_m
+        elif model_name == "LFM2-VL-1.6B(fast)": processor, model = processor_t, model_t
+        elif model_name == "ShotVL-3B(cinematic)": processor, model = processor_z, model_z
+        elif model_name == "SmolVLM-Instruct-250M(smol)": processor, model = processor_c, model_c
+        elif model_name == "MonkeyOCR-pro-1.2B(ocr)": processor, model = processor_g, model_g
+        elif model_name == "VLAA-Thinker-Qwen2VL-2B(reason)": processor, model = processor_i, model_i
+        elif model_name == "Nanonets-OCR-s(ocr)": processor, model = processor_a, model_a
+        elif model_name == "Megalodon-OCR-Sync-0713(ocr)": processor, model = processor_x, model_x
+        elif model_name == "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)": processor, model = processor_n, model_n
+        elif model_name == "LMM-R1-MGT-PerceReason(reason)": processor, model = processor_f, model_f
+        elif model_name == "TBAC-VLR1-3B(open-r1)": processor, model = processor_g, model_g
+        elif model_name == "OCRFlux-3B(ocr)": processor, model = processor_v, model_v
+        elif model_name == "SmolVLM-500M-Instruct(smol)": processor, model = processor_o, model_o
+        else:
+            yield "Invalid model selected.", ""
+            return
+
+        messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt_input}]}]
+        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
+
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
 
     generation_kwargs = {
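Review note: both branches above converge on the same contract, a processor, a model, and a tokenized inputs batch handed to the streaming generation further down in app.py (unchanged by this commit). For reference, that pattern looks like the sketch below; stream_generation is a hypothetical name, and processor, model, and inputs are assumed to be the ones prepared above. One thing to flag in the dispatch itself: "MonkeyOCR-pro-1.2B(ocr)" and "TBAC-VLR1-3B(open-r1)" both map to processor_g, model_g, which looks like an unintended duplicate.

from threading import Thread

from transformers import TextIteratorStreamer

def stream_generation(model, processor, inputs, max_new_tokens=512):
    # model.generate runs in a worker thread; the streamer is drained here.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    buffer = ""
    for new_text in streamer:  # blocks until the next decoded chunk arrives
        buffer += new_text
        yield buffer           # emit the accumulated text after every chunk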
@@ -359,10 +342,9 @@ def create_gradio_interface():
         # Left Column (Inputs)
         with gr.Column(scale=1):
             model_choice = gr.Dropdown(
-                choices=["LFM2-VL-450M(fast)", "LFM2-VL-1.6B(fast)", "SmolVLM-Instruct-250M(smol)", "Moondream2(vision)",
-                         "Florence-2-Flux-Large(caption)", "ShotVL-3B(cinematic)", "Megalodon-OCR-Sync-0713(ocr)",
+                choices=["LFM2-VL-450M(fast)", "LFM2-VL-1.6B(fast)", "SmolVLM-Instruct-250M(smol)", "Moondream2(vision)", "ShotVL-3B(cinematic)", "Megalodon-OCR-Sync-0713(ocr)",
                          "VLAA-Thinker-Qwen2VL-2B(reason)", "MonkeyOCR-pro-1.2B(ocr)", "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)", "Nanonets-OCR-s(ocr)",
-                         "LMM-R1-MGT-PerceReason(reason)", "OCRFlux-3B(ocr)", "TBAC-VLR1-3B(open-r1)", "SmolVLM-500M-Instruct(smol)"],
+                         "LMM-R1-MGT-PerceReason(reason)", "OCRFlux-3B(ocr)", "TBAC-VLR1-3B(open-r1)", "SmolVLM-500M-Instruct(smol)", "llava-onevision-qwen2-0.5b-ov-hf(mini)"],
                 label="Select Model", value= "LFM2-VL-450M(fast)"
             )
             prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query", value="Describe the image!")
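Review note: the dropdown choices and the if/elif dispatch in process_document_stream must now be edited in lockstep (this commit removes Florence-2-Flux-Large from both and adds llava-onevision to both). A single registry, a hypothetical refactor on my part where processor_*/model_* are the globals app.py already defines, would keep the two in sync:

MODEL_REGISTRY = {
    "LFM2-VL-450M(fast)": (processor_m, model_m),
    "LFM2-VL-1.6B(fast)": (processor_t, model_t),
    # ... one entry per remaining model ...
    "llava-onevision-qwen2-0.5b-ov-hf(mini)": (processor_lo, model_lo),
}

# The dropdown and the dispatch then both derive from the registry:
#   model_choice = gr.Dropdown(choices=list(MODEL_REGISTRY), ...)
#   processor, model = MODEL_REGISTRY[model_name]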