Spaces:

prithivMLmods
/

DocScope-R1

Running on Zero

App Files Community

prithivMLmods committed on May 5

Commit

fbaf052

verified ·

1 Parent(s): a2e799a

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -49

app.py CHANGED Viewed

@@ -4,8 +4,6 @@ from threading import Thread
 import time
 import torch
 import spaces
-import cv2
-import numpy as np
 from PIL import Image
 from transformers import (
     Qwen2VLForConditionalGeneration,
@@ -35,30 +33,6 @@ def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_colo
 </style>
     '''
-def downsample_video(video_path):
-    """
-    Downsamples a video file by extracting 10 evenly spaced frames.
-    Returns a list of tuples (PIL.Image, timestamp).
-    """
-    vidcap = cv2.VideoCapture(video_path)
-    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-    fps = vidcap.get(cv2.CAP_PROP_FPS)
-    frames = []
-    if total_frames <= 0 or fps <= 0:
-        vidcap.release()
-        return frames
-    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
-    for i in frame_indices:
-        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-        success, image = vidcap.read()
-        if success:
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(image)
-            timestamp = round(i / fps, 2)
-            frames.append((pil_image, timestamp))
-    vidcap.release()
-    return frames
 # Model and Processor Setup
 QV_MODEL_ID = "prithivMLmods/Qwen2-VL-Ocrtest-2B-Instruct"
 qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
@@ -83,28 +57,19 @@ def model_inference(message, history, use_docscopeocr):
     files = message.get("files", [])
     if not text and not files:
-        yield "Error: Please input a text query or provide files (images or videos)."
         return
-    # Process files: images and videos only
     image_list = []
     for idx, file in enumerate(files):
-        if file.lower().endswith((".mp4", ".avi", ".mov")):
-            frames = downsample_video(file)
-            if not frames:
-                yield "Error: Could not extract frames from the video."
-                return
-            for frame, timestamp in frames:
-                label = f"Video {idx+1} Frame {timestamp}:"
-                image_list.append((label, frame))
-        else:
-            try:
-                img = load_image(file)
-                label = f"Image {idx+1}:"
-                image_list.append((label, img))
-            except Exception as e:
-                yield f"Error loading image: {str(e)}"
-                return
     # Build content list
     content = [{"type": "text", "text": text}]
@@ -147,9 +112,8 @@ def model_inference(message, history, use_docscopeocr):
 # Gradio Interface
 examples = [
-    [{"text": "OCR the Text in the Image", "files": ["example/image.jpg"]}],
-    [{"text": "Explain the video, frame by frame.", "files": ["example/demo1.mp4"]}],
-    [{"text": "Describe the ad in detail.", "files": ["example/demo2.mp4"]}],
 ]
 demo = gr.ChatInterface(
@@ -158,9 +122,9 @@ demo = gr.ChatInterface(
     examples=examples,
     textbox=gr.MultimodalTextbox(
         label="Query Input",
-        file_types=["image", "video"],
         file_count="multiple",
-        placeholder="Input your query and optionally upload image(s) or video(s). Select the model using the checkbox."
     ),
     stop_btn="Stop Generation",
     multimodal=True,

 import time
 import torch
 import spaces
 from PIL import Image
 from transformers import (
     Qwen2VLForConditionalGeneration,
 </style>
     '''
 # Model and Processor Setup
 QV_MODEL_ID = "prithivMLmods/Qwen2-VL-Ocrtest-2B-Instruct"
 qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
     files = message.get("files", [])
     if not text and not files:
+        yield "Error: Please input a text query or provide image files."
         return
+    # Process files: images only
     image_list = []
     for idx, file in enumerate(files):
+        try:
+            img = load_image(file)
+            label = f"Image {idx+1}:"
+            image_list.append((label, img))
+        except Exception as e:
+            yield f"Error loading image: {str(e)}"
+            return
     # Build content list
     content = [{"type": "text", "text": text}]
 # Gradio Interface
 examples = [
+    [{"text": "OCR the text in the image", "files": ["example/image.jpg"]}],
+    [{"text": "Describe the content of the image", "files": ["example/image2.jpg"]}],
 ]
 demo = gr.ChatInterface(
     examples=examples,
     textbox=gr.MultimodalTextbox(
         label="Query Input",
+        file_types=["image"],
         file_count="multiple",
+        placeholder="Input your query and optionally upload image(s). Select the model using the checkbox."
     ),
     stop_btn="Stop Generation",
     multimodal=True,