Spaces:

prithivMLmods
/

Tiny-VLMs-Lab

Running on Zero

App Files Files Community

prithivMLmods committed 9 days ago

Commit

2404d81

verified ·

1 Parent(s): 3108bef

Rename app(2).py to app.py

Browse files

Files changed (1) hide show

app(2).py → app.py +57 -5

app(2).py → app.py RENAMED Viewed

@@ -112,10 +112,20 @@ moondream = AutoModelForCausalLM.from_pretrained(
   revision=REVISION_MD,
   trust_remote_code=True,
   torch_dtype=torch.float16,
-  device_map={"": "cuda"},
 )
 tokenizer_md = AutoTokenizer.from_pretrained(MODEL_ID_MD, revision=REVISION_MD)
 # --- PDF Generation and Preview Utility Function ---
 def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str):
@@ -212,11 +222,52 @@ def process_document_stream(
         answer = moondream.answer_question(
             image_embeds=image_embeds,
             question=prompt_input,
-            tokenizer=tokenizer_md
         )
         yield answer, answer
         return
     if model_name == "LFM2-VL-450M(fast)": processor, model = processor_m, model_m
     elif model_name == "LFM2-VL-1.6B(fast)": processor, model = processor_t, model_t
     elif model_name == "ShotVL-3B(cinematic)": processor, model = processor_z, model_z
@@ -229,7 +280,8 @@ def process_document_stream(
         yield "Invalid model selected.", ""
         return
-    messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt_input}]}]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
@@ -281,9 +333,9 @@ def create_gradio_interface():
             # Left Column (Inputs)
             with gr.Column(scale=1):
                 model_choice = gr.Dropdown(
-                    choices=["LFM2-VL-450M(fast)", "LFM2-VL-1.6B(fast)", "SmolVLM-Instruct-250M(smol)", "Moondream2(vision)", "ShotVL-3B(cinematic)", "Megalodon-OCR-Sync-0713(ocr)",
                              "VLAA-Thinker-Qwen2VL-2B(reason)", "MonkeyOCR-pro-1.2B(ocr)", "Nanonets-OCR-s(ocr)"],
-                    label="Select Model", value= "LFM2-VL-450M(fast)"
                 )
                 prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query", value="Describe the image!")
                 image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])

   revision=REVISION_MD,
   trust_remote_code=True,
   torch_dtype=torch.float16,
+  device_map="auto",
 )
 tokenizer_md = AutoTokenizer.from_pretrained(MODEL_ID_MD, revision=REVISION_MD)
+# --- SmolVLM2 Model Loading ---
+MODEL_ID_S2 = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+processor_s2 = AutoProcessor.from_pretrained(MODEL_ID_S2, trust_remote_code=True)
+model_s2 = AutoModelForImageTextToText.from_pretrained(
+    MODEL_ID_S2,
+    torch_dtype=torch.float16,
+    trust_remote_code=True,
+    device_map="auto"
+).eval()
 # --- PDF Generation and Preview Utility Function ---
 def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str):
         answer = moondream.answer_question(
             image_embeds=image_embeds,
             question=prompt_input,
+            tokenizer=tokenizer_md,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
         )
         yield answer, answer
         return
+    elif model_name == "SmolVLM2-2.2B-Instruct(smol)":
+        processor, model = processor_s2, model_s2
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "text", "text": prompt_input},
+                {"type": "image"},
+            ]
+        }]
+        inputs = processor.apply_chat_template(
+            messages,
+            images=[image],
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(model.device)
+        # Convert float32 tensors to float16 if necessary
+        for k, v in inputs.items():
+            if v.dtype == torch.float32:
+                inputs[k] = v.to(torch.float16)
+        generated_ids = model.generate(
+            **inputs,
+            do_sample=True,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+        )
+        generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
+        final_output = generated_texts[0].split("Assistant:")[-1].strip()
+        yield final_output, final_output
+        return
+    # Assign processor and model for other models
     if model_name == "LFM2-VL-450M(fast)": processor, model = processor_m, model_m
     elif model_name == "LFM2-VL-1.6B(fast)": processor, model = processor_t, model_t
     elif model_name == "ShotVL-3B(cinematic)": processor, model = processor_z, model_z
         yield "Invalid model selected.", ""
         return
+    # Common streaming logic for the rest of the models
+    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_input}]}]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
             # Left Column (Inputs)
             with gr.Column(scale=1):
                 model_choice = gr.Dropdown(
+                    choices=["SmolVLM2-2.2B-Instruct(smol)", "LFM2-VL-450M(fast)", "LFM2-VL-1.6B(fast)", "SmolVLM-Instruct-250M(smol)", "Moondream2(vision)", "ShotVL-3B(cinematic)", "Megalodon-OCR-Sync-0713(ocr)",
                              "VLAA-Thinker-Qwen2VL-2B(reason)", "MonkeyOCR-pro-1.2B(ocr)", "Nanonets-OCR-s(ocr)"],
+                    label="Select Model", value= "SmolVLM2-2.2B-Instruct(smol)"
                 )
                 prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query", value="Describe the image!")
                 image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])