Spaces:

prithivMLmods
/

Multimodal-VLM-v1.0

Running on Zero

App Files Files Community

prithivMLmods committed 1 day ago

Commit

27347ad

verified ·

1 Parent(s): 6a6b031

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -25

app.py CHANGED Viewed

@@ -32,15 +32,6 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
-# Load Qwen2.5-VL-3B-Instruct-abliterated
-MODEL_ID_X = "huihui-ai/Qwen2.5-VL-3B-Instruct-abliterated"
-processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
-model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_X,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
 # Load Megalodon-OCR-Sync-0713
 MODEL_ID_T = "prithivMLmods/Megalodon-OCR-Sync-0713"
 processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
@@ -59,8 +50,8 @@ model_s = Glm4vForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
-# Load NuMarkdown-8B-Thinking
-MODEL_ID_Y = "numind/NuMarkdown-8B-Thinking"
 processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
 model_y = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_Y,
@@ -110,9 +101,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "NuMarkdown-8B-Thinking":
         processor = processor_y
         model = model_y
-    elif model_name == "Qwen2.5-VL-3B-Instruct-abliterated":
-        processor = processor_x
-        model = model_x
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -166,12 +154,9 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "GLM-4.1V-9B-Thinking":
         processor = processor_s
         model = model_s
-    elif model_name == "NuMarkdown-8B-Thinking":
         processor = processor_y
         model = model_y
-    elif model_name == "Qwen2.5-VL-3B-Instruct-abliterated":
-        processor = processor_x
-        model = model_x
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -286,15 +271,12 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                     markdown_output = gr.Markdown(label="(Result.md)")
             model_choice = gr.Radio(
-                choices=["Camel-Doc-OCR-062825", "GLM-4.1V-9B-Thinking", "Megalodon-OCR-Sync-0713", "NuMarkdown-8B-Thinking", "Qwen2.5-VL-3B-Instruct-abliterated"],
                 label="Select Model",
                 value="Camel-Doc-OCR-062825"
             )
-            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
-            gr.Markdown("> Camel-Doc-OCR-062825 and Megalodon-OCR-Sync-0713 are both fine-tuned versions of the Qwen2.5-VL series focused on document retrieval, content extraction, analysis recognition, and excelling in OCR and visual document analysis tasks for structured and unstructured content—Camel-Doc-OCR-062825 leveraging the Qwen2.5-VL-7B-Instruct as its base, while Megalodon-OCR-Sync-0713 uses Qwen2.5-VL-3B-Instruct and is especially trained on diverse captioning datasets.")
-            gr.Markdown("> GLM-4.1V-9B-Thinking is a vision-language model (VLM) based on the GLM-4-9B-0414 foundation, with a strong emphasis on advanced reasoning capabilities, chain-of-thought inference, and robust bilingual (Chinese/English) performance on complex multimodal benchmarks.")
-            gr.Markdown("> DeepEyes-7B stands out for its agentic reinforcement learning approach, focusing on thinking with images for better visual reasoning, math problem-solving, and mitigating hallucination using Qwen2.5-VL-7B-Instruct as its foundation. Finally, Qwen2.5-VL-3B-Instruct-abliterated is part of the Qwen2.5-VL family, known for its versatile vision-language understanding and generation, serving as the foundational architecture for several of these fine-tuned vision-language and OCR models.")
     # Define the submit button actions
     image_submit.click(fn=generate_image,
                        inputs=[
@@ -312,4 +294,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                        outputs=[output, markdown_output])
 if __name__ == "__main__":
-    demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)

     torch_dtype=torch.float16
 ).to(device).eval()
 # Load Megalodon-OCR-Sync-0713
 MODEL_ID_T = "prithivMLmods/Megalodon-OCR-Sync-0713"
 processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
     torch_dtype=torch.float16
 ).to(device).eval()
+# Load ViLaSR
+MODEL_ID_Y = "inclusionAI/ViLaSR"
 processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
 model_y = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_Y,
     elif model_name == "NuMarkdown-8B-Thinking":
         processor = processor_y
         model = model_y
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
     elif model_name == "GLM-4.1V-9B-Thinking":
         processor = processor_s
         model = model_s
+    elif model_name == "ViLaSR":
         processor = processor_y
         model = model_y
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
                     markdown_output = gr.Markdown(label="(Result.md)")
             model_choice = gr.Radio(
+                choices=["Camel-Doc-OCR-062825", "GLM-4.1V-9B-Thinking", "Megalodon-OCR-Sync-0713", "ViLaSR"],
                 label="Select Model",
                 value="Camel-Doc-OCR-062825"
             )
+            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-v1.0/discussions)")
     # Define the submit button actions
     image_submit.click(fn=generate_image,
                        inputs=[
                        outputs=[output, markdown_output])
 if __name__ == "__main__":
+    demo.queue(max_size=40).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)