prithivMLmods committed on
Commit 1175275 · verified · 1 Parent(s): b99c8a5

Update app.py

Files changed (1):
  1. app.py +3 -60
app.py CHANGED
@@ -21,8 +21,6 @@ from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
-    AutoModelForCausalLM,
-    AutoTokenizer
 )
 from qwen_vl_utils import process_vision_info
 
@@ -69,16 +67,6 @@ model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load moondream2
-MODEL_ID_MD = "vikhyatk/moondream2"
-tokenizer_md = AutoTokenizer.from_pretrained(MODEL_ID_MD)
-model_md = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID_MD,
-    revision="2025-06-21",
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
-
 # Helper functions for object detection
 def image_to_base64(image):
     """Convert a PIL image to a base64-encoded string."""
@@ -206,25 +194,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "ShotVL-7B":
         processor = processor_s
         model = model_s
-    elif model_name == "moondream2":
-        model = model_md
-        tokenizer = tokenizer_md
-        image_embeds = model.encode_image(image)
-        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-        thread = Thread(target=model.answer_question, kwargs={
-            "image_embeds": image_embeds,
-            "question": text,
-            "tokenizer": tokenizer,
-            "max_new_tokens": max_new_tokens,
-            "streamer": streamer,
-        })
-        thread.start()
-        buffer = ""
-        for new_text in streamer:
-            buffer += new_text
-            time.sleep(0.01)
-            yield buffer, buffer
-        return
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
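The deleted branch streamed through moondream2's `encode_image`/`answer_question` API. The surviving Qwen2.5-VL branches rely on the same `TextIteratorStreamer`-plus-background-thread pattern; a rough sketch of that pattern is below (the `stream_answer` helper and its exact inputs are illustrative assumptions, not code from app.py):

```python
from threading import Thread
from transformers import TextIteratorStreamer

def stream_answer(model, processor, inputs, max_new_tokens):
    # Run generate() in a worker thread and yield partial text as it arrives.
    streamer = TextIteratorStreamer(
        processor.tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    thread = Thread(
        target=model.generate,
        kwargs={**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens},
    )
    thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer
```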
@@ -281,31 +250,6 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "ShotVL-7B":
         processor = processor_s
         model = model_s
-    elif model_name == "moondream2":
-        model = model_md
-        tokenizer = tokenizer_md
-        frames = downsample_video(video_path)
-        buffer = ""
-        for frame in frames:
-            image, timestamp = frame
-            image_embeds = model.encode_image(image)
-            streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-            thread = Thread(target=model.answer_question, kwargs={
-                "image_embeds": image_embeds,
-                "question": text,
-                "tokenizer": tokenizer,
-                "max_new_tokens": max_new_tokens,
-                "streamer": streamer,
-            })
-            thread.start()
-            frame_buffer = f"Frame {timestamp}:\n"
-            for new_text in streamer:
-                frame_buffer += new_text
-                buffer += new_text
-                time.sleep(0.01)
-                yield buffer, buffer
-            buffer += "\n\n"
-        return
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
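The deleted video branch iterated over `downsample_video(video_path)`, which evidently yields `(PIL image, timestamp)` pairs. Its body is not part of this diff; a minimal OpenCV sketch under that assumption:

```python
import cv2
import numpy as np
from PIL import Image

def downsample_video(video_path, num_frames=10):
    """Sample evenly spaced frames as (PIL image, timestamp-in-seconds) pairs."""
    vidcap = cv2.VideoCapture(video_path)
    total = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS) or 30.0  # fall back if FPS is unreadable
    frames = []
    for idx in np.linspace(0, max(total - 1, 0), num_frames, dtype=int):
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ok, frame = vidcap.read()
        if not ok:
            continue
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append((Image.fromarray(rgb), round(idx / fps, 2)))
    vidcap.release()
    return frames
```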
@@ -371,7 +315,7 @@ object_detection_examples = [
     ["Detect Green Car.", "images/11.png"]
 ]
 
-# CSS
+# Added CSS to style the output area as a "Canvas"
 css = """
 .submit-btn {
     background-color: #2980b9 !important;
@@ -447,17 +391,16 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
 
     model_choice = gr.Radio(
-        choices=["Camel-Doc-OCR-062825", "ViLaSR-7B", "OCRFlux-3B", "ShotVL-7B", "moondream2"],
+        choices=["Camel-Doc-OCR-062825", "ViLaSR-7B", "OCRFlux-3B", "ShotVL-7B"],
         label="Select Model",
         value="Camel-Doc-OCR-062825"
     )
 
     gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs-v2-Localization/discussions)")
     gr.Markdown("> [Camel-Doc-OCR-062825](https://huggingface.co/prithivMLmods/Camel-Doc-OCR-062825) : camel-doc-ocr-062825 model is a fine-tuned version of qwen2.5-vl-7b-instruct, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture, this model enhances document comprehension capabilities.")
-    gr.Markdown("> [OCRFlux-3B](https://h темаuggingface.co/ChatDOC/OCRFlux-3B) : ocrflux-3b model that's fine-tuned from qwen2.5-vl-3b-instruct using our private document datasets and some data from olmocr-mix-0225 dataset. optimized for document retrieval, content extraction, and analysis recognition. the best way to use this model is via the ocrflux toolkit.")
+    gr.Markdown("> [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) : ocrflux-3b model that's fine-tuned from qwen2.5-vl-3b-instruct using our private document datasets and some data from olmocr-mix-0225 dataset. optimized for document retrieval, content extraction, and analysis recognition. the best way to use this model is via the ocrflux toolkit.")
     gr.Markdown("> [ViLaSR](https://huggingface.co/AntResearchNLP/ViLaSR) : vilasr-7b model as presented in reinforcing spatial reasoning in vision-language models with interwoven thinking and visual drawing. efficient reasoning capabilities.")
     gr.Markdown("> [ShotVL-7B](https://huggingface.co/Vchitect/ShotVL-7B) : shotvl-7b is a fine-tuned version of qwen2.5-vl-7b-instruct, trained by supervised fine-tuning on the largest and high-quality dataset for cinematic language understanding to date. it currently achieves state-of-the-art performance on shotbench.")
-    gr.Markdown("> [moondream2](https://huggingface.co/vikhyatk/moondream2) : A small vision language model that can be run on edge devices. Capable of captioning, visual querying, object detection, and more.")
     gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
 
     image_submit.click(
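The `choices` list above must stay in sync with the `elif` dispatch in `generate_image`/`generate_video`, or a selection falls through to the "Invalid model selected." branch. One way to make that harder to get wrong, sketched here as a suggestion rather than what app.py does:

```python
import gradio as gr

# One registry drives both the UI choices and the model dispatch, so removing
# a model (as this commit does for moondream2) touches a single place. The
# values would hold (processor, model) pairs at load time; shown as None here.
MODEL_REGISTRY = {
    "Camel-Doc-OCR-062825": None,
    "ViLaSR-7B": None,
    "OCRFlux-3B": None,
    "ShotVL-7B": None,
}

model_choice = gr.Radio(
    choices=list(MODEL_REGISTRY),
    label="Select Model",
    value="Camel-Doc-OCR-062825",
)
```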
 