Update app.py
app.py CHANGED
@@ -21,8 +21,6 @@ from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
-    AutoModelForCausalLM,
-    AutoTokenizer
 )
 from qwen_vl_utils import process_vision_info

@@ -69,16 +67,6 @@ model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()

-# Load moondream2
-MODEL_ID_MD = "vikhyatk/moondream2"
-tokenizer_md = AutoTokenizer.from_pretrained(MODEL_ID_MD)
-model_md = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID_MD,
-    revision="2025-06-21",
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
-
 # Helper functions for object detection
 def image_to_base64(image):
     """Convert a PIL image to a base64-encoded string."""
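For reference, the removed moondream2 loader can be reproduced as a self-contained snippet; a minimal sketch is below. The model ID, revision, and dtype come from the removed lines, while the device selection mirrors the app's existing `device` variable and is an assumption here.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Device selection is assumed; the removed code reused the app's existing `device`.
device = "cuda" if torch.cuda.is_available() else "cpu"

MODEL_ID_MD = "vikhyatk/moondream2"
tokenizer_md = AutoTokenizer.from_pretrained(MODEL_ID_MD)
model_md = AutoModelForCausalLM.from_pretrained(
    MODEL_ID_MD,
    revision="2025-06-21",
    trust_remote_code=True,   # moondream2 ships custom modeling code
    torch_dtype=torch.float16,
).to(device).eval()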
@@ -206,25 +194,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "ShotVL-7B":
         processor = processor_s
         model = model_s
-    elif model_name == "moondream2":
-        model = model_md
-        tokenizer = tokenizer_md
-        image_embeds = model.encode_image(image)
-        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-        thread = Thread(target=model.answer_question, kwargs={
-            "image_embeds": image_embeds,
-            "question": text,
-            "tokenizer": tokenizer,
-            "max_new_tokens": max_new_tokens,
-            "streamer": streamer,
-        })
-        thread.start()
-        buffer = ""
-        for new_text in streamer:
-            buffer += new_text
-            time.sleep(0.01)
-            yield buffer, buffer
-        return
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
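The removed branch streams a single-image answer through moondream2's custom API. A minimal standalone sketch of the same pattern is below; `model_md` and `tokenizer_md` are the objects loaded earlier, while the image path, question, and token budget are illustrative assumptions.

from threading import Thread
from PIL import Image
from transformers import TextIteratorStreamer

image = Image.open("example.jpg")            # hypothetical input image
image_embeds = model_md.encode_image(image)  # moondream2 custom method (trust_remote_code)

# Generation runs in a worker thread; the streamer yields text pieces as they are produced.
streamer = TextIteratorStreamer(tokenizer_md, skip_prompt=True, skip_special_tokens=True)
thread = Thread(target=model_md.answer_question, kwargs={
    "image_embeds": image_embeds,
    "question": "Describe this image.",      # illustrative prompt
    "tokenizer": tokenizer_md,
    "max_new_tokens": 512,                   # illustrative budget
    "streamer": streamer,
})
thread.start()

buffer = ""
for new_text in streamer:
    buffer += new_text                       # the app yields this buffer to both outputs
print(buffer)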
@@ -281,31 +250,6 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "ShotVL-7B":
         processor = processor_s
         model = model_s
-    elif model_name == "moondream2":
-        model = model_md
-        tokenizer = tokenizer_md
-        frames = downsample_video(video_path)
-        buffer = ""
-        for frame in frames:
-            image, timestamp = frame
-            image_embeds = model.encode_image(image)
-            streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-            thread = Thread(target=model.answer_question, kwargs={
-                "image_embeds": image_embeds,
-                "question": text,
-                "tokenizer": tokenizer,
-                "max_new_tokens": max_new_tokens,
-                "streamer": streamer,
-            })
-            thread.start()
-            frame_buffer = f"Frame {timestamp}:\n"
-            for new_text in streamer:
-                frame_buffer += new_text
-                buffer += new_text
-                time.sleep(0.01)
-                yield buffer, buffer
-            buffer += "\n\n"
-        return
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
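The removed video branch depends on `downsample_video`, a helper defined elsewhere in app.py that returns (PIL image, timestamp) pairs. A stand-in with the same shape, assuming roughly ten evenly spaced frames, could look like the sketch below; each sampled frame is then pushed through the same streamed `answer_question` call as in the single-image sketch, prefixed with "Frame {timestamp}:" and appended to one running buffer.

import cv2
from PIL import Image

def downsample_video_standin(video_path, num_frames=10):
    """Assumed stand-in for app.py's downsample_video: ~num_frames evenly spaced frames."""
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    frames = []
    step = max(total // num_frames, 1)
    for idx in range(0, max(total, 1), step):
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ok, frame = cap.read()
        if not ok:
            break
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        frames.append((image, round(idx / fps, 2)))  # (PIL image, timestamp in seconds)
    cap.release()
    return frames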
@@ -371,7 +315,7 @@ object_detection_examples = [
     ["Detect Green Car.", "images/11.png"]
 ]

-# CSS
+# Added CSS to style the output area as a "Canvas"
 css = """
 .submit-btn {
     background-color: #2980b9 !important;
@@ -447,17 +391,16 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")

     model_choice = gr.Radio(
-        choices=["Camel-Doc-OCR-062825", "ViLaSR-7B", "OCRFlux-3B", "ShotVL-7B"
+        choices=["Camel-Doc-OCR-062825", "ViLaSR-7B", "OCRFlux-3B", "ShotVL-7B"],
         label="Select Model",
         value="Camel-Doc-OCR-062825"
     )

     gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs-v2-Localization/discussions)")
     gr.Markdown("> [Camel-Doc-OCR-062825](https://huggingface.co/prithivMLmods/Camel-Doc-OCR-062825) : camel-doc-ocr-062825 model is a fine-tuned version of qwen2.5-vl-7b-instruct, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture, this model enhances document comprehension capabilities.")
-    gr.Markdown("> [OCRFlux-3B](https://
+    gr.Markdown("> [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) : ocrflux-3b model that's fine-tuned from qwen2.5-vl-3b-instruct using our private document datasets and some data from olmocr-mix-0225 dataset. optimized for document retrieval, content extraction, and analysis recognition. the best way to use this model is via the ocrflux toolkit.")
     gr.Markdown("> [ViLaSR](https://huggingface.co/AntResearchNLP/ViLaSR) : vilasr-7b model as presented in reinforcing spatial reasoning in vision-language models with interwoven thinking and visual drawing. efficient reasoning capabilities.")
     gr.Markdown("> [ShotVL-7B](https://huggingface.co/Vchitect/ShotVL-7B) : shotvl-7b is a fine-tuned version of qwen2.5-vl-7b-instruct, trained by supervised fine-tuning on the largest and high-quality dataset for cinematic language understanding to date. it currently achieves state-of-the-art performance on shotbench.")
-    gr.Markdown("> [moondream2](https://huggingface.co/vikhyatk/moondream2) : A small vision language model that can be run on edge devices. Capable of captioning, visual querying, object detection, and more.")
     gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")

     image_submit.click(
|