Spaces:

prithivMLmods
/

DocScope-R1

Running on Zero

App Files Files Community

prithivMLmods committed 5 days ago

Commit

c6a1ef4

verified ·

1 Parent(s): 53f1230

Update app.py

Browse files

Files changed (1) hide show

app.py +151 -84

app.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import os
 import uuid
 import time
 import asyncio
 from threading import Thread
@@ -14,125 +16,190 @@ import cv2
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
-    TextIteratorStreamer
 )
-# Constants
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load multimodal processor and model (Cosmos-Reason1-7B)
 MODEL_ID = "nvidia/Cosmos-Reason1-7B"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
     trust_remote_code=True,
     torch_dtype=torch.float16
-).to(device).eval()
-def downsample_video(video_path: str, num_frames: int = 10):
     vidcap = cv2.VideoCapture(video_path)
-    total = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
-    idxs = np.linspace(0, total - 1, num_frames, dtype=int)
     frames = []
-    for i in idxs:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-        ok, img = vidcap.read()
-        if ok:
-            rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-            pil = Image.fromarray(rgb)
             timestamp = round(i / fps, 2)
-            frames.append((pil, timestamp))
     vidcap.release()
     return frames
 def progress_bar_html(label: str) -> str:
-    return f'''<div style="display:flex; align-items:center;">
-  <span style="margin-right:10px; font-size:14px;">{label}</span>
-  <div style="width:110px; height:5px; background:#B0E0E6; border-radius:2px; overflow:hidden;">
-    <div style="width:100%; height:100%; background:#00FFFF; animation:load 1.5s linear infinite;"></div>
-  </div>
 </div>
-<style>@keyframes load{{0%{{transform:translateX(-100%)}}100%{{transform:translateX(100%)}}}}</style>'''
 @spaces.GPU
-def generate(prompt: str, files: list[str] = None):
-    files = files or []
-    # Determine mode
-    is_video = any(f.lower().endswith(('.mp4', '.avi', '.mov')) for f in files)
-    is_image = any(f.lower().endswith(('.jpg', '.png', '.jpeg', '.bmp')) for f in files)
-    if is_video:
-        yield progress_bar_html("Processing video with cosmos-reason1")
-        video = files[0]
-        frames = downsample_video(video)
-        # Build messages
-        messages = [
-            {"role": "system", "content": [{"type":"text","text":"You are a helpful assistant."}]},
-            {"role": "user", "content": [{"type":"text","text": prompt}]}
-        ]
-        for img, ts in frames:
-            path = f"frame_{uuid.uuid4().hex}.png"
-            img.save(path)
-            messages[1]["content"].extend([
-                {"type":"text","text": f"Frame {ts}:"},
-                {"type":"image","url": path}
-            ])
-        inputs = processor.apply_chat_template(
-            messages, tokenize=True, add_generation_prompt=True,
-            return_dict=True, return_tensors="pt",
-            truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
-        ).to(device)
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        Thread(target=model.generate, kwargs={**inputs, "streamer": streamer}).start()
         buffer = ""
-        for txt in streamer:
-            buffer += txt.replace("<|im_end|>", "")
             time.sleep(0.01)
             yield buffer
-        return
-    if is_image:
-        yield progress_bar_html("Processing image with cosmos-reason1")
-        imgs = [Image.open(f) for f in files]
         messages = [
-            {"role":"user","content":[*[{"type":"image","image":i} for i in imgs],{"type":"text","text":prompt}]}]
-        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(
-            text=[prompt_full], images=imgs,
-            return_tensors="pt", padding=True,
-            truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
-        ).to(device)
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        Thread(target=model.generate, kwargs={**inputs, "streamer": streamer}).start()
-        out = ""
-        for txt in streamer:
-            out += txt.replace("<|im_end|>", "")
             time.sleep(0.01)
-            yield out
-        return
-    # No valid media
-    yield "Please upload at least one image or a video for inference."
-def main():
-    demo = gr.ChatInterface(
         fn=generate,
-        additional_inputs=[
-            gr.File(label="Upload Images/Videos", file_types=["image", "video"], file_count="multiple")
-        ],
-        description="# **cosmos-reason1 by nvidia**",
-        textbox=gr.Textbox(label="Prompt"),
-        cache_examples=False,
-        type="messages",
-        multimodal=True,
-        stop_btn="Stop Generation"
     )
-    demo.queue(max_size=10).launch(share=True)
 if __name__ == "__main__":
-    main()

 import os
+import random
 import uuid
+import json
 import time
 import asyncio
 from threading import Thread
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
+    TextIteratorStreamer,
 )
+from transformers.image_utils import load_image
+# Constants for text generation
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load multimodal processor and model (Cosmos-Reason1-7B)
 MODEL_ID = "nvidia/Cosmos-Reason1-7B"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
     trust_remote_code=True,
     torch_dtype=torch.float16
+).to("cuda").eval()
+def downsample_video(video_path):
+    """
+    Downsamples the video to at most 10 evenly spaced frames; frames that fail to decode are skipped.
+    Each frame is returned as a (PIL image, timestamp in seconds) tuple.
+    """
     vidcap = cv2.VideoCapture(video_path)
+    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
+    # Sample 10 evenly spaced frames.
+    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
+    for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+        success, image = vidcap.read()
+        if success:
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
+            pil_image = Image.fromarray(image)
             timestamp = round(i / fps, 2)
+            frames.append((pil_image, timestamp))
     vidcap.release()
     return frames
 def progress_bar_html(label: str) -> str:
+    """
+    Returns an HTML snippet for a thin progress bar with a label.
+    The progress bar is styled as a light cyan animated bar.
+    """
+    return f'''
+<div style="display: flex; align-items: center;">
+    <span style="margin-right: 10px; font-size: 14px;">{label}</span>
+    <div style="width: 110px; height: 5px; background-color: #B0E0E6; border-radius: 2px; overflow: hidden;">
+        <div style="width: 100%; height: 100%; background-color: #00FFFF; animation: loading 1.5s linear infinite;"></div>
+    </div>
 </div>
+<style>
+@keyframes loading {{
+    0% {{ transform: translateX(-100%); }}
+    100% {{ transform: translateX(100%); }}
+}}
+</style>
+    '''
 @spaces.GPU
+def generate(text: str, files: list,
+             max_new_tokens: int = 1024,
+             temperature: float = 0.6,
+             top_p: float = 0.9,
+             top_k: int = 50,
+             repetition_penalty: float = 1.2):
+    """
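+    Generates responses using the Cosmos-Reason1 model (loaded via the Qwen2.5-VL class) for image and video inputs.
+    - If images are provided, performs image inference.
+    - If videos are provided, performs video inference on the first video only, by downsampling it to frames.
+    - temperature, top_p, top_k and repetition_penalty are applied only on the video path; the image path passes just max_new_tokens.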
+    """
+    if not files:
+        yield "Please upload an image or video for inference."
+        return
+    # Determine if the files are images or videos
+    image_files = [f for f in files if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))]
+    video_files = [f for f in files if f.lower().endswith(('.mp4', '.avi', '.mov', '.mkv'))]
+    if image_files and video_files:
+        yield "Please upload either images or videos, not both."
+        return
+    if image_files:
+        # Image inference
+        images = [load_image(image) for image in image_files]
+        messages = [{
+            "role": "user",
+            "content": [
+                *[{"type": "image", "image": image} for image in images],
+                {"type": "text", "text": text},
+            ]
+        }]
+        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(
+            text=[prompt_full],
+            images=images,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=MAX_INPUT_TOKEN_LENGTH
+        ).to("cuda")
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
+        thread.start()
         buffer = ""
+        yield progress_bar_html("Processing images with cosmos-reasoning")
+        for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
             time.sleep(0.01)
             yield buffer
+    elif video_files:
+        # Video inference
+        video_path = video_files[0]  # Assuming only one video is uploaded
+        frames = downsample_video(video_path)
         messages = [
+            {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+            {"role": "user", "content": [{"type": "text", "text": text}]}
+        ]
+        # Append each frame with its timestamp.
+        for frame in frames:
+            image, timestamp = frame
+            image_path = f"video_frame_{uuid.uuid4().hex}.png"
+            image.save(image_path)
+            messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
+            messages[1]["content"].append({"type": "image", "url": image_path})
+        inputs = processor.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_dict=True,
+            return_tensors="pt",
+            truncation=True,
+            max_length=MAX_INPUT_TOKEN_LENGTH
+        ).to("cuda")
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            **inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+        }
+        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        yield progress_bar_html("Processing video with cosmos-reasoning")
+        for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
             time.sleep(0.01)
+            yield buffer
+    else:
+        yield "Unsupported file type. Please upload images or videos."
+# Create the Gradio Interface
+with gr.Blocks() as demo:
+    gr.Markdown("# **cosmos-reason1 by nvidia**")
+    with gr.Row():
+        with gr.Column():
+            text_input = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+            file_input = gr.File(label="Upload Image or Video", file_types=["image", "video"], file_count="multiple")
+            max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
+            temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
+            top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
+            top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
+            repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+            submit_btn = gr.Button("Submit")
+        with gr.Column():
+            output = gr.Textbox(label="Output", interactive=False)
+    submit_btn.click(
         fn=generate,
+        inputs=[text_input, file_input, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=output
     )
 if __name__ == "__main__":
+    demo.queue(max_size=20).launch(share=True)