prithivMLmods committed on
Commit
6c355bd
·
verified ·
1 Parent(s): ddb22eb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -6
app.py CHANGED
@@ -17,8 +17,8 @@ from transformers import (
17
  from qwen_vl_utils import process_vision_info
18
 
19
  # Constants for text generation
20
- MAX_MAX_NEW_TOKENS = 4096
21
- DEFAULT_MAX_NEW_TOKENS = 2048
22
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
23
 
24
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@@ -61,7 +61,6 @@ model_s = Glm4vForConditionalGeneration.from_pretrained(
61
  torch_dtype=torch.float16
62
  ).to(device).eval()
63
 
64
-
65
  def downsample_video(video_path):
66
  """
67
  Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
@@ -219,7 +218,7 @@ video_examples = [
219
  ["explain the video in detail.", "videos/2.mp4"]
220
  ]
221
 
222
- # Added CSS to style the output area as a "Canvas"
223
  css = """
224
  .submit-btn {
225
  background-color: #2980b9 !important;
@@ -233,11 +232,17 @@ css = """
233
  border-radius: 10px;
234
  padding: 20px;
235
  }
 
 
 
 
 
 
236
  """
237
 
238
  # Create the Gradio Interface
239
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
240
- gr.Markdown("# **[Multimodal VLM OCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
241
  with gr.Row():
242
  with gr.Column():
243
  with gr.Tabs():
@@ -276,7 +281,8 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
276
  model_choice = gr.Radio(
277
  choices=["Camel-Doc-OCR-062825", "GLM-4.1V-9B-Thinking", "Megalodon-OCR-Sync-0713", "MonkeyOCR-pro-1.2B"],
278
  label="Select Model",
279
- value="Camel-Doc-OCR-062825"
 
280
  )
281
 
282
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
 
17
  from qwen_vl_utils import process_vision_info
18
 
19
  # Constants for text generation
20
+ MAX_MAX_NEW_TOKENS = 2048
21
+ DEFAULT_MAX_NEW_TOKENS = 1024
22
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
23
 
24
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
61
  torch_dtype=torch.float16
62
  ).to(device).eval()
63
 
 
64
  def downsample_video(video_path):
65
  """
66
  Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
 
218
  ["explain the video in detail.", "videos/2.mp4"]
219
  ]
220
 
221
+ # Updated CSS with model choice highlighting
222
  css = """
223
  .submit-btn {
224
  background-color: #2980b9 !important;
 
232
  border-radius: 10px;
233
  padding: 20px;
234
  }
235
+ .model-choice label {
236
+ color: red;
237
+ }
238
+ .model-choice input[value="Camel-Doc-OCR-062825"] + label {
239
+ color: blue;
240
+ }
241
  """
242
 
243
  # Create the Gradio Interface
244
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
245
+ gr.Markdown("# **[Multimodal VLMOCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
246
  with gr.Row():
247
  with gr.Column():
248
  with gr.Tabs():
 
281
  model_choice = gr.Radio(
282
  choices=["Camel-Doc-OCR-062825", "GLM-4.1V-9B-Thinking", "Megalodon-OCR-Sync-0713", "MonkeyOCR-pro-1.2B"],
283
  label="Select Model",
284
+ value="Camel-Doc-OCR-062825",
285
+ elem_classes=["model-choice"]
286
  )
287
 
288
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")