prithivMLmods committed on
Commit
6c355bd
·
verified ·
1 Parent(s): ddb22eb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -6
app.py CHANGED
@@ -17,8 +17,8 @@ from transformers import (
17
  from qwen_vl_utils import process_vision_info
18
 
19
  # Constants for text generation
20
- MAX_MAX_NEW_TOKENS = 4096
21
- DEFAULT_MAX_NEW_TOKENS = 2048
22
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
23
 
24
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@@ -61,7 +61,6 @@ model_s = Glm4vForConditionalGeneration.from_pretrained(
61
  torch_dtype=torch.float16
62
  ).to(device).eval()
63
 
64
-
65
  def downsample_video(video_path):
66
  """
67
  Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
@@ -219,7 +218,7 @@ video_examples = [
219
  ["explain the video in detail.", "videos/2.mp4"]
220
  ]
221
 
222
- # Added CSS to style the output area as a "Canvas"
223
  css = """
224
  .submit-btn {
225
  background-color: #2980b9 !important;
@@ -233,11 +232,17 @@ css = """
233
  border-radius: 10px;
234
  padding: 20px;
235
  }
 
 
 
 
 
 
236
  """
237
 
238
  # Create the Gradio Interface
239
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
240
- gr.Markdown("# **[Multimodal VLM OCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
241
  with gr.Row():
242
  with gr.Column():
243
  with gr.Tabs():
@@ -276,7 +281,8 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
276
  model_choice = gr.Radio(
277
  choices=["Camel-Doc-OCR-062825", "GLM-4.1V-9B-Thinking", "Megalodon-OCR-Sync-0713", "MonkeyOCR-pro-1.2B"],
278
  label="Select Model",
279
- value="Camel-Doc-OCR-062825"
 
280
  )
281
 
282
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
 
17
  from qwen_vl_utils import process_vision_info
18
 
19
  # Constants for text generation
20
+ MAX_MAX_NEW_TOKENS = 2048
21
+ DEFAULT_MAX_NEW_TOKENS = 1024
22
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
23
 
24
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
61
  torch_dtype=torch.float16
62
  ).to(device).eval()
63
 
 
64
  def downsample_video(video_path):
65
  """
66
  Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
 
218
  ["explain the video in detail.", "videos/2.mp4"]
219
  ]
220
 
221
+ # Updated CSS with model choice highlighting
222
  css = """
223
  .submit-btn {
224
  background-color: #2980b9 !important;
 
232
  border-radius: 10px;
233
  padding: 20px;
234
  }
235
+ .model-choice label {
236
+ color: red;
237
+ }
238
+ .model-choice input[value="Camel-Doc-OCR-062825"] + label {
239
+ color: blue;
240
+ }
241
  """
242
 
243
  # Create the Gradio Interface
244
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
245
+ gr.Markdown("# **[Multimodal VLMOCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
246
  with gr.Row():
247
  with gr.Column():
248
  with gr.Tabs():
 
281
  model_choice = gr.Radio(
282
  choices=["Camel-Doc-OCR-062825", "GLM-4.1V-9B-Thinking", "Megalodon-OCR-Sync-0713", "MonkeyOCR-pro-1.2B"],
283
  label="Select Model",
284
+ value="Camel-Doc-OCR-062825",
285
+ elem_classes=["model-choice"]
286
  )
287
 
288
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")