prithivMLmods committed
Commit fbaf052 · verified · 1 Parent(s): a2e799a

Update app.py

Files changed (1):
  1. app.py +13 -49
app.py CHANGED
@@ -4,8 +4,6 @@ from threading import Thread
 import time
 import torch
 import spaces
-import cv2
-import numpy as np
 from PIL import Image
 from transformers import (
     Qwen2VLForConditionalGeneration,
@@ -35,30 +33,6 @@ def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_colo
     </style>
     '''
 
-def downsample_video(video_path):
-    """
-    Downsamples a video file by extracting 10 evenly spaced frames.
-    Returns a list of tuples (PIL.Image, timestamp).
-    """
-    vidcap = cv2.VideoCapture(video_path)
-    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-    fps = vidcap.get(cv2.CAP_PROP_FPS)
-    frames = []
-    if total_frames <= 0 or fps <= 0:
-        vidcap.release()
-        return frames
-    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
-    for i in frame_indices:
-        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-        success, image = vidcap.read()
-        if success:
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(image)
-            timestamp = round(i / fps, 2)
-            frames.append((pil_image, timestamp))
-    vidcap.release()
-    return frames
-
 # Model and Processor Setup
 QV_MODEL_ID = "prithivMLmods/Qwen2-VL-Ocrtest-2B-Instruct"
 qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
@@ -83,28 +57,19 @@ def model_inference(message, history, use_docscopeocr):
     files = message.get("files", [])
 
     if not text and not files:
-        yield "Error: Please input a text query or provide files (images or videos)."
+        yield "Error: Please input a text query or provide image files."
         return
 
-    # Process files: images and videos only
+    # Process files: images only
    image_list = []
     for idx, file in enumerate(files):
-        if file.lower().endswith((".mp4", ".avi", ".mov")):
-            frames = downsample_video(file)
-            if not frames:
-                yield "Error: Could not extract frames from the video."
-                return
-            for frame, timestamp in frames:
-                label = f"Video {idx+1} Frame {timestamp}:"
-                image_list.append((label, frame))
-        else:
-            try:
-                img = load_image(file)
-                label = f"Image {idx+1}:"
-                image_list.append((label, img))
-            except Exception as e:
-                yield f"Error loading image: {str(e)}"
-                return
+        try:
+            img = load_image(file)
+            label = f"Image {idx+1}:"
+            image_list.append((label, img))
+        except Exception as e:
+            yield f"Error loading image: {str(e)}"
+            return
 
     # Build content list
     content = [{"type": "text", "text": text}]
@@ -147,9 +112,8 @@ def model_inference(message, history, use_docscopeocr):
 
 # Gradio Interface
 examples = [
-    [{"text": "OCR the Text in the Image", "files": ["example/image.jpg"]}],
-    [{"text": "Explain the video, frame by frame.", "files": ["example/demo1.mp4"]}],
-    [{"text": "Describe the ad in detail.", "files": ["example/demo2.mp4"]}],
+    [{"text": "OCR the text in the image", "files": ["example/image.jpg"]}],
+    [{"text": "Describe the content of the image", "files": ["example/image2.jpg"]}],
 ]
 
 demo = gr.ChatInterface(
@@ -158,9 +122,9 @@ demo = gr.ChatInterface(
     examples=examples,
     textbox=gr.MultimodalTextbox(
         label="Query Input",
-        file_types=["image", "video"],
+        file_types=["image"],
         file_count="multiple",
-        placeholder="Input your query and optionally upload image(s) or video(s). Select the model using the checkbox."
+        placeholder="Input your query and optionally upload image(s). Select the model using the checkbox."
     ),
     stop_btn="Stop Generation",
     multimodal=True,
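For reference, a minimal sketch of the transformers load_image helper that every upload now flows through. It accepts a local path, an http(s) URL, or an existing PIL image and returns a PIL.Image; the sample path below is hypothetical.

# Minimal sketch of the load_image call the image-only path relies on.
from transformers.image_utils import load_image

img = load_image("example/image.jpg")  # hypothetical local path; URLs also work
print(img.size, img.mode)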
 
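And a self-contained sketch of the trimmed UI wiring after this commit, with a hypothetical echo function standing in for the Space's model_inference:

# Sketch of the image-only gr.ChatInterface configuration this commit settles on.
import gradio as gr

def echo(message, history):
    # With multimodal=True, messages arrive as {"text": str, "files": [paths]}.
    count = len(message.get("files", []))
    yield f"Received {message['text']!r} and {count} image(s)."  # hypothetical reply

demo = gr.ChatInterface(
    fn=echo,
    textbox=gr.MultimodalTextbox(
        label="Query Input",
        file_types=["image"],  # video uploads are no longer accepted
        file_count="multiple",
        placeholder="Input your query and optionally upload image(s).",
    ),
    stop_btn="Stop Generation",
    multimodal=True,
)

if __name__ == "__main__":
    demo.launch()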