Update app.py
app.py CHANGED
@@ -21,8 +21,6 @@ from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
-    AutoModelForCausalLM,
-    AutoTokenizer
 )
 from qwen_vl_utils import process_vision_info

@@ -69,16 +67,6 @@ model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()

-# Load moondream2
-MODEL_ID_MD = "vikhyatk/moondream2"
-tokenizer_md = AutoTokenizer.from_pretrained(MODEL_ID_MD)
-model_md = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID_MD,
-    revision="2025-06-21",
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
-
 # Helper functions for object detection
 def image_to_base64(image):
     """Convert a PIL image to a base64-encoded string."""
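For reference, the removed moondream2 loader can be reproduced as a self-contained snippet; a minimal sketch is below. The model ID, revision, and dtype come from the removed lines, while the device selection mirrors the app's existing `device` variable and is an assumption here.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Device selection is assumed; the removed code reused the app's existing `device`.
device = "cuda" if torch.cuda.is_available() else "cpu"

MODEL_ID_MD = "vikhyatk/moondream2"
tokenizer_md = AutoTokenizer.from_pretrained(MODEL_ID_MD)
model_md = AutoModelForCausalLM.from_pretrained(
    MODEL_ID_MD,
    revision="2025-06-21",
    trust_remote_code=True,   # moondream2 ships custom modeling code
    torch_dtype=torch.float16,
).to(device).eval()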
@@ -206,25 +194,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "ShotVL-7B":
         processor = processor_s
         model = model_s
-    elif model_name == "moondream2":
-        model = model_md
-        tokenizer = tokenizer_md
-        image_embeds = model.encode_image(image)
-        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-        thread = Thread(target=model.answer_question, kwargs={
-            "image_embeds": image_embeds,
-            "question": text,
-            "tokenizer": tokenizer,
-            "max_new_tokens": max_new_tokens,
-            "streamer": streamer,
-        })
-        thread.start()
-        buffer = ""
-        for new_text in streamer:
-            buffer += new_text
-            time.sleep(0.01)
-            yield buffer, buffer
-        return
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
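The removed branch streams a single-image answer through moondream2's custom API. A minimal standalone sketch of the same pattern is below; `model_md` and `tokenizer_md` are the objects loaded earlier, while the image path, question, and token budget are illustrative assumptions.

from threading import Thread
from PIL import Image
from transformers import TextIteratorStreamer

image = Image.open("example.jpg")            # hypothetical input image
image_embeds = model_md.encode_image(image)  # moondream2 custom method (trust_remote_code)

# Generation runs in a worker thread; the streamer yields text pieces as they are produced.
streamer = TextIteratorStreamer(tokenizer_md, skip_prompt=True, skip_special_tokens=True)
thread = Thread(target=model_md.answer_question, kwargs={
    "image_embeds": image_embeds,
    "question": "Describe this image.",      # illustrative prompt
    "tokenizer": tokenizer_md,
    "max_new_tokens": 512,                   # illustrative budget
    "streamer": streamer,
})
thread.start()

buffer = ""
for new_text in streamer:
    buffer += new_text                       # the app yields this buffer to both outputs
print(buffer)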
@@ -281,31 +250,6 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "ShotVL-7B":
         processor = processor_s
         model = model_s
-    elif model_name == "moondream2":
-        model = model_md
-        tokenizer = tokenizer_md
-        frames = downsample_video(video_path)
-        buffer = ""
-        for frame in frames:
-            image, timestamp = frame
-            image_embeds = model.encode_image(image)
-            streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-            thread = Thread(target=model.answer_question, kwargs={
-                "image_embeds": image_embeds,
-                "question": text,
-                "tokenizer": tokenizer,
-                "max_new_tokens": max_new_tokens,
-                "streamer": streamer,
-            })
-            thread.start()
-            frame_buffer = f"Frame {timestamp}:\n"
-            for new_text in streamer:
-                frame_buffer += new_text
-                buffer += new_text
-                time.sleep(0.01)
-                yield buffer, buffer
-            buffer += "\n\n"
-        return
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
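The removed video branch depends on `downsample_video`, a helper defined elsewhere in app.py that returns (PIL image, timestamp) pairs. A stand-in with the same shape, assuming roughly ten evenly spaced frames, could look like the sketch below; each sampled frame is then pushed through the same streamed `answer_question` call as in the single-image sketch, prefixed with "Frame {timestamp}:" and appended to one running buffer.

import cv2
from PIL import Image

def downsample_video_standin(video_path, num_frames=10):
    """Assumed stand-in for app.py's downsample_video: ~num_frames evenly spaced frames."""
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    frames = []
    step = max(total // num_frames, 1)
    for idx in range(0, max(total, 1), step):
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ok, frame = cap.read()
        if not ok:
            break
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        frames.append((image, round(idx / fps, 2)))  # (PIL image, timestamp in seconds)
    cap.release()
    return frames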
@@ -371,7 +315,7 @@ object_detection_examples = [
     ["Detect Green Car.", "images/11.png"]
 ]

-# CSS
+# Added CSS to style the output area as a "Canvas"
 css = """
 .submit-btn {
     background-color: #2980b9 !important;
@@ -447,17 +391,16 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")

     model_choice = gr.Radio(
-        choices=["Camel-Doc-OCR-062825", "ViLaSR-7B", "OCRFlux-3B", "ShotVL-7B"
+        choices=["Camel-Doc-OCR-062825", "ViLaSR-7B", "OCRFlux-3B", "ShotVL-7B"],
         label="Select Model",
         value="Camel-Doc-OCR-062825"
     )

     gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs-v2-Localization/discussions)")
     gr.Markdown("> [Camel-Doc-OCR-062825](https://huggingface.co/prithivMLmods/Camel-Doc-OCR-062825) : camel-doc-ocr-062825 model is a fine-tuned version of qwen2.5-vl-7b-instruct, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture, this model enhances document comprehension capabilities.")
-    gr.Markdown("> [OCRFlux-3B](https://
+    gr.Markdown("> [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) : ocrflux-3b model that's fine-tuned from qwen2.5-vl-3b-instruct using our private document datasets and some data from olmocr-mix-0225 dataset. optimized for document retrieval, content extraction, and analysis recognition. the best way to use this model is via the ocrflux toolkit.")
     gr.Markdown("> [ViLaSR](https://huggingface.co/AntResearchNLP/ViLaSR) : vilasr-7b model as presented in reinforcing spatial reasoning in vision-language models with interwoven thinking and visual drawing. efficient reasoning capabilities.")
     gr.Markdown("> [ShotVL-7B](https://huggingface.co/Vchitect/ShotVL-7B) : shotvl-7b is a fine-tuned version of qwen2.5-vl-7b-instruct, trained by supervised fine-tuning on the largest and high-quality dataset for cinematic language understanding to date. it currently achieves state-of-the-art performance on shotbench.")
-    gr.Markdown("> [moondream2](https://huggingface.co/vikhyatk/moondream2) : A small vision language model that can be run on edge devices. Capable of captioning, visual querying, object detection, and more.")
     gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")

     image_submit.click(
|