prithivMLmods committed (verified)
Commit d327195 · 1 Parent(s): b6e3398

Update app.py

Files changed (1): app.py (+9, -7)
app.py CHANGED

@@ -10,6 +10,7 @@ import cv2
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     Qwen2VLForConditionalGeneration,
+    Glm4vForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
 )
@@ -31,7 +32,7 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# MonkeyOCR-pro-1.2B
+# Load MonkeyOCR-pro-1.2B
 MODEL_ID_X = "echo840/MonkeyOCR-pro-1.2B"
 SUBFOLDER = "Recognition"
 processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, subfolder=SUBFOLDER, trust_remote_code=True)
@@ -51,15 +52,16 @@ model_t = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load ShotVL-7B
-MODEL_ID_S = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
+# Load GLM-4.1V-9B-Thinking
+MODEL_ID_S = "zai-org/GLM-4.1V-9B-Thinking"
 processor_s = AutoProcessor.from_pretrained(MODEL_ID_S, trust_remote_code=True)
-model_s = Qwen2VLForConditionalGeneration.from_pretrained(
+model_s = Glm4vForConditionalGeneration.from_pretrained(
     MODEL_ID_S,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 
+
 def downsample_video(video_path):
     """
     Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
@@ -99,7 +101,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "Megalodon-OCR-Sync-0713":
         processor = processor_t
         model = model_t
-    elif model_name == "Qwen2-VL-OCR-2B":
+    elif model_name == "GLM-4.1V-9B-Thinking":
         processor = processor_s
         model = model_s
     else:
@@ -155,7 +157,7 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "Megalodon-OCR-Sync-0713":
         processor = processor_t
         model = model_t
-    elif model_name == "Qwen2-VL-OCR-2B":
+    elif model_name == "GLM-4.1V-9B-Thinking":
         processor = processor_s
         model = model_s
     else:
@@ -272,7 +274,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     markdown_output = gr.Markdown(label="(Result.md)")
 
     model_choice = gr.Radio(
-        choices=["Camel-Doc-OCR-062825", "MonkeyOCR-pro-1.2B", "Megalodon-OCR-Sync-0713", "Qwen2-VL-OCR-2B"],
+        choices=["Camel-Doc-OCR-062825", "GLM-4.1V-9B-Thinking", "MonkeyOCR-pro-1.2B", "Megalodon-OCR-Sync-0713"],
         label="Select Model",
         value="Camel-Doc-OCR-062825"
     )
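For context on what this diff wires up: below is a minimal sketch of how the swapped-in `(processor_s, model_s)` pair can be driven for a single image query using the generic transformers chat-template flow. Only the model ID and the `Glm4vForConditionalGeneration`/`AutoProcessor` classes come from the commit; the image path, prompt text, and generation settings are illustrative assumptions.

```python
# Minimal sketch (not from the commit): one image query against the
# newly loaded GLM-4.1V-9B-Thinking processor/model pair.
import torch
from PIL import Image
from transformers import AutoProcessor, Glm4vForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"

MODEL_ID_S = "zai-org/GLM-4.1V-9B-Thinking"
processor_s = AutoProcessor.from_pretrained(MODEL_ID_S, trust_remote_code=True)
model_s = Glm4vForConditionalGeneration.from_pretrained(
    MODEL_ID_S,
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to(device).eval()

image = Image.open("sample_page.png")  # hypothetical input image
messages = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": "Perform OCR on the image and return Markdown."},
    ],
}]
# Render the chat template to a prompt string, then batch it with the image.
prompt = processor_s.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
inputs = processor_s(text=[prompt], images=[image], return_tensors="pt").to(device)

with torch.inference_mode():
    output_ids = model_s.generate(**inputs, max_new_tokens=1024)

# Strip the prompt tokens and decode only the newly generated text.
answer = processor_s.decode(
    output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
)
print(answer)
```

The app also imports `TextIteratorStreamer`, which suggests it streams tokens into the Gradio UI rather than blocking on `generate()` as this sketch does; the blocking call just keeps the example short.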
 
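The third hunk shows only the signature and docstring of `downsample_video` as context. For readers without the full file, here is a sketch of one common way such a helper is implemented with cv2; the `num_frames` default and the FPS fallback are assumptions, not taken from this commit.

```python
# Sketch of an evenly-spaced frame sampler matching the docstring in the diff.
import cv2
import numpy as np
from PIL import Image

def downsample_video(video_path, num_frames=10):
    """Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp."""
    vidcap = cv2.VideoCapture(video_path)
    total = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS) or 30.0  # fall back if FPS metadata is missing
    frames = []
    # Pick num_frames indices spread evenly across the whole clip.
    for idx in np.linspace(0, total - 1, num_frames, dtype=int):
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ok, frame = vidcap.read()
        if not ok:
            continue
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # cv2 decodes as BGR
        frames.append((Image.fromarray(frame), round(idx / fps, 2)))
    vidcap.release()
    return frames
```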