Spaces:

prithivMLmods
/

Multimodal-VLM-v1.0

Running on Zero

App Files Files Community

prithivMLmods committed about 20 hours ago

Commit

1d3bbac

verified ·

1 Parent(s): d48f554

Update app.py

Browse files

Files changed (1) hide show

app.py +3 -31

app.py CHANGED Viewed

@@ -231,35 +231,6 @@ css = """
     border-radius: 10px;
     padding: 20px;
 }
-.model-choice-reasoning {
-    background-color: #2ecc71 !important; /* Green for reasoning models */
-    color: white !important;
-    padding: 5px 10px;
-    border-radius: 5px;
-}
-.model-choice-ocr {
-    background-color: #3498db !important; /* Blue for OCR models */
-    color: white !important;
-    padding: 5px 10px;
-    border-radius: 5px;
-}
-"""
-# JavaScript to apply classes to radio button labels
-js_script = """
-<script>
-document.addEventListener('DOMContentLoaded', function() {
-    const labels = document.querySelectorAll('.gr-radio label');
-    labels.forEach(label => {
-        const text = label.textContent.trim();
-        if (text === 'GLM-4.1V-9B-Thinking' || text === 'ViLaSR-7B') {
-            label.classList.add('model-choice-reasoning');
-        } else if (text === 'Camel-Doc-OCR-062825' || text === 'Megalodon-OCR-Sync-0713') {
-            label.classList.add('model-choice-ocr');
-        }
-    });
-});
-</script>
 """
 # Create the Gradio Interface
@@ -302,8 +273,9 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                 value="Camel-Doc-OCR-062825"
             )
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-v1.0/discussions)")
-            gr.HTML(js_script)  # Inject JavaScript to apply classes
     # Define the submit button actions
     image_submit.click(fn=generate_image,
                        inputs=[

     border-radius: 10px;
     padding: 20px;
 }
 """
 # Create the Gradio Interface
                 value="Camel-Doc-OCR-062825"
             )
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-v1.0/discussions)")
+            gr.Markdown("> [Camel-Doc-OCR-062825](https://huggingface.co/prithivMLmods/Camel-Doc-OCR-062825), [GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking), [Megalodon-OCR-Sync-0713](https://huggingface.co/prithivMLmods/Megalodon-OCR-Sync-0713), and [ViLaSR-7B](https://huggingface.co/inclusionAI/ViLaSR) are recent vision-language models excelling in document intelligence and multimodal understanding. Camel-Doc-OCR-062825 is a Qwen2.5-VL-7B-Instruct finetune, highly optimized for document retrieval, structured extraction, analysis, and direct Markdown generation from images and PDFs. GLM-4.1V-9B-Thinking offers next-level multimodal reasoning, bringing visual and textual comprehension together for advanced question answering.")
+            gr.Markdown("> Megalodon-OCR-Sync-0713, finetuned from Qwen2.5-VL-3B-Instruct, specializes in context-aware multimodal document extraction and analysis, excelling at retrieval, layout parsing, math, and chart/table recognition, with robust video and long-form comprehension capabilities. ViLaSR-7B focuses on reinforcing spatial reasoning in visual-language tasks by combining interwoven thinking with visual drawing, making it especially suited for spatial reasoning and complex tip-based queries.")
     # Define the submit button actions
     image_submit.click(fn=generate_image,
                        inputs=[