prithivMLmods committed on
Commit
02c1205
·
verified ·
1 Parent(s): b7d7dc4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -1
app.py CHANGED
@@ -11,6 +11,7 @@ from transformers import (
11
  Qwen2_5_VLForConditionalGeneration,
12
  Qwen2VLForConditionalGeneration,
13
  Glm4vForConditionalGeneration,
 
14
  AutoProcessor,
15
  TextIteratorStreamer,
16
  )
@@ -61,6 +62,15 @@ model_s = Glm4vForConditionalGeneration.from_pretrained(
61
  torch_dtype=torch.float16
62
  ).to(device).eval()
63
 
 
 
 
 
 
 
 
 
 
64
  def downsample_video(video_path):
65
  """
66
  Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
@@ -103,6 +113,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
103
  elif model_name == "GLM-4.1V-9B-Thinking":
104
  processor = processor_s
105
  model = model_s
 
 
 
106
  else:
107
  yield "Invalid model selected.", "Invalid model selected."
108
  return
@@ -159,6 +172,9 @@ def generate_video(model_name: str, text: str, video_path: str,
159
  elif model_name == "GLM-4.1V-9B-Thinking":
160
  processor = processor_s
161
  model = model_s
 
 
 
162
  else:
163
  yield "Invalid model selected.", "Invalid model selected."
164
  return
@@ -273,7 +289,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
273
  markdown_output = gr.Markdown(label="(Result.md)")
274
 
275
  model_choice = gr.Radio(
276
- choices=["Camel-Doc-OCR-062825", "GLM-4.1V-9B-Thinking", "Megalodon-OCR-Sync-0713", "MonkeyOCR-pro-1.2B"],
277
  label="Select Model",
278
  value="Camel-Doc-OCR-062825"
279
  )
 
11
  Qwen2_5_VLForConditionalGeneration,
12
  Qwen2VLForConditionalGeneration,
13
  Glm4vForConditionalGeneration,
14
+ AutoModelForVision2Seq,
15
  AutoProcessor,
16
  TextIteratorStreamer,
17
  )
 
62
  torch_dtype=torch.float16
63
  ).to(device).eval()
64
 
65
+ # Load kanana-1.5-v-3b-instruct
66
+ MODEL_ID_F = "kakaocorp/kanana-1.5-v-3b-instruct"
67
+ processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
68
+ model_f = AutoModelForVision2Seq.from_pretrained(
69
+ MODEL_ID_F,
70
+ trust_remote_code=True,
71
+ torch_dtype=torch.float16
72
+ ).to(device).eval()
73
+
74
  def downsample_video(video_path):
75
  """
76
  Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
 
113
  elif model_name == "GLM-4.1V-9B-Thinking":
114
  processor = processor_s
115
  model = model_s
116
+ elif model_name == "kanana-1.5-v-3b":
117
+ processor = processor_f
118
+ model = model_f
119
  else:
120
  yield "Invalid model selected.", "Invalid model selected."
121
  return
 
172
  elif model_name == "GLM-4.1V-9B-Thinking":
173
  processor = processor_s
174
  model = model_s
175
+ elif model_name == "kanana-1.5-v-3b":
176
+ processor = processor_f
177
+ model = model_f
178
  else:
179
  yield "Invalid model selected.", "Invalid model selected."
180
  return
 
289
  markdown_output = gr.Markdown(label="(Result.md)")
290
 
291
  model_choice = gr.Radio(
292
+ choices=["Camel-Doc-OCR-062825", "GLM-4.1V-9B-Thinking", "Megalodon-OCR-Sync-0713", "MonkeyOCR-pro-1.2B", "kanana-1.5-v-3b"],
293
  label="Select Model",
294
  value="Camel-Doc-OCR-062825"
295
  )