prithivMLmods committed on
Commit
27347ad
·
verified ·
1 Parent(s): 6a6b031

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -25
app.py CHANGED
@@ -32,15 +32,6 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
32
  torch_dtype=torch.float16
33
  ).to(device).eval()
34
 
35
- # Load Qwen2.5-VL-3B-Instruct-abliterated
36
- MODEL_ID_X = "huihui-ai/Qwen2.5-VL-3B-Instruct-abliterated"
37
- processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
38
- model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
39
- MODEL_ID_X,
40
- trust_remote_code=True,
41
- torch_dtype=torch.float16
42
- ).to(device).eval()
43
-
44
  # Load Megalodon-OCR-Sync-0713
45
  MODEL_ID_T = "prithivMLmods/Megalodon-OCR-Sync-0713"
46
  processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
@@ -59,8 +50,8 @@ model_s = Glm4vForConditionalGeneration.from_pretrained(
59
  torch_dtype=torch.float16
60
  ).to(device).eval()
61
 
62
- # Load NuMarkdown-8B-Thinking
63
- MODEL_ID_Y = "numind/NuMarkdown-8B-Thinking"
64
  processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
65
  model_y = Qwen2_5_VLForConditionalGeneration.from_pretrained(
66
  MODEL_ID_Y,
@@ -110,9 +101,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
110
  elif model_name == "NuMarkdown-8B-Thinking":
111
  processor = processor_y
112
  model = model_y
113
- elif model_name == "Qwen2.5-VL-3B-Instruct-abliterated":
114
- processor = processor_x
115
- model = model_x
116
  else:
117
  yield "Invalid model selected.", "Invalid model selected."
118
  return
@@ -166,12 +154,9 @@ def generate_video(model_name: str, text: str, video_path: str,
166
  elif model_name == "GLM-4.1V-9B-Thinking":
167
  processor = processor_s
168
  model = model_s
169
- elif model_name == "NuMarkdown-8B-Thinking":
170
  processor = processor_y
171
  model = model_y
172
- elif model_name == "Qwen2.5-VL-3B-Instruct-abliterated":
173
- processor = processor_x
174
- model = model_x
175
  else:
176
  yield "Invalid model selected.", "Invalid model selected."
177
  return
@@ -286,15 +271,12 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
286
  markdown_output = gr.Markdown(label="(Result.md)")
287
 
288
  model_choice = gr.Radio(
289
- choices=["Camel-Doc-OCR-062825", "GLM-4.1V-9B-Thinking", "Megalodon-OCR-Sync-0713", "NuMarkdown-8B-Thinking", "Qwen2.5-VL-3B-Instruct-abliterated"],
290
  label="Select Model",
291
  value="Camel-Doc-OCR-062825"
292
  )
293
- gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
294
- gr.Markdown("> Camel-Doc-OCR-062825 and Megalodon-OCR-Sync-0713 are both fine-tuned versions of the Qwen2.5-VL series focused on document retrieval, content extraction, analysis recognition, and excelling in OCR and visual document analysis tasks for structured and unstructured content—Camel-Doc-OCR-062825 leveraging the Qwen2.5-VL-7B-Instruct as its base, while Megalodon-OCR-Sync-0713 uses Qwen2.5-VL-3B-Instruct and is especially trained on diverse captioning datasets.")
295
- gr.Markdown("> GLM-4.1V-9B-Thinking is a vision-language model (VLM) based on the GLM-4-9B-0414 foundation, with a strong emphasis on advanced reasoning capabilities, chain-of-thought inference, and robust bilingual (Chinese/English) performance on complex multimodal benchmarks.")
296
- gr.Markdown("> DeepEyes-7B stands out for its agentic reinforcement learning approach, focusing on thinking with images for better visual reasoning, math problem-solving, and mitigating hallucination using Qwen2.5-VL-7B-Instruct as its foundation. Finally, Qwen2.5-VL-3B-Instruct-abliterated is part of the Qwen2.5-VL family, known for its versatile vision-language understanding and generation, serving as the foundational architecture for several of these fine-tuned vision-language and OCR models.")
297
-
298
  # Define the submit button actions
299
  image_submit.click(fn=generate_image,
300
  inputs=[
@@ -312,4 +294,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
312
  outputs=[output, markdown_output])
313
 
314
  if __name__ == "__main__":
315
- demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
 
32
  torch_dtype=torch.float16
33
  ).to(device).eval()
34
 
 
 
 
 
 
 
 
 
 
35
  # Load Megalodon-OCR-Sync-0713
36
  MODEL_ID_T = "prithivMLmods/Megalodon-OCR-Sync-0713"
37
  processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
 
50
  torch_dtype=torch.float16
51
  ).to(device).eval()
52
 
53
+ # Load ViLaSR
54
+ MODEL_ID_Y = "inclusionAI/ViLaSR"
55
  processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
56
  model_y = Qwen2_5_VLForConditionalGeneration.from_pretrained(
57
  MODEL_ID_Y,
 
101
  elif model_name == "NuMarkdown-8B-Thinking":
102
  processor = processor_y
103
  model = model_y
 
 
 
104
  else:
105
  yield "Invalid model selected.", "Invalid model selected."
106
  return
 
154
  elif model_name == "GLM-4.1V-9B-Thinking":
155
  processor = processor_s
156
  model = model_s
157
+ elif model_name == "ViLaSR":
158
  processor = processor_y
159
  model = model_y
 
 
 
160
  else:
161
  yield "Invalid model selected.", "Invalid model selected."
162
  return
 
271
  markdown_output = gr.Markdown(label="(Result.md)")
272
 
273
  model_choice = gr.Radio(
274
+ choices=["Camel-Doc-OCR-062825", "GLM-4.1V-9B-Thinking", "Megalodon-OCR-Sync-0713", "ViLaSR"],
275
  label="Select Model",
276
  value="Camel-Doc-OCR-062825"
277
  )
278
+ gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-v1.0/discussions)")
279
+
 
 
 
280
  # Define the submit button actions
281
  image_submit.click(fn=generate_image,
282
  inputs=[
 
294
  outputs=[output, markdown_output])
295
 
296
  if __name__ == "__main__":
297
+ demo.queue(max_size=40).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)