Spaces:

prithivMLmods
/

Multimodal-VLM-v1.0

Running on Zero

App Files Files Community

prithivMLmods committed 22 days ago

Commit

27577e4

verified ·

1 Parent(s): 8e118cf

Update app.py

Browse files

Files changed (1) hide show

app.py +4 -3

app.py CHANGED Viewed

@@ -282,7 +282,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
                 output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
                 with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown(label="(Result.md)")
@@ -291,9 +290,11 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                 label="Select Model",
                 value="GLM-4.1V-9B-Thinking"
             )
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
     # Define the submit button actions
     image_submit.click(fn=generate_image,
                        inputs=[

             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
                 output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
                 with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown(label="(Result.md)")
                 label="Select Model",
                 value="GLM-4.1V-9B-Thinking"
             )
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
+            gr.Markdown("> Camel-Doc-OCR-062825 and Megalodon-OCR-Sync-0713 are both fine-tuned versions of the Qwen2.5-VL series focused on document retrieval, content extraction, analysis recognition, and excelling in OCR and visual document analysis tasks for structured and unstructured content—Camel-Doc-OCR-062825 leveraging the Qwen2.5-VL-7B-Instruct as its base, while Megalodon-OCR-Sync-0713 uses Qwen2.5-VL-3B-Instruct and is especially trained on diverse captioning datasets. ")
+            gr.Markdown("GLM-4.1V-9B-Thinking is a vision-language model (VLM) based on the GLM-4-9B-0414 foundation, with a strong emphasis on advanced reasoning capabilities, chain-of-thought inference, and robust bilingual (Chinese/English) performance on complex multimodal benchmarks.")
+            gr.Markdown("DeepEyes-7B stands out for its agentic reinforcement learning approach, focusing on “thinking with images” for better visual reasoning, math problem-solving, and mitigating hallucination using Qwen2.5-VL-7B-Instruct as its foundation. Finally, Qwen2.5-VL-3B-Instruct-abliterated is part of the Qwen2.5-VL family, known for its versatile vision-language understanding and generation, serving as the foundational architecture for several of these fine-tuned vision-language and OCR models.")
     # Define the submit button actions
     image_submit.click(fn=generate_image,
                        inputs=[