SixOpen committed
Commit 8793261 (verified)
Parent: dbbe955
Files changed (1):
  app.py  +36 -20

app.py CHANGED
@@ -79,11 +79,9 @@ def draw_polygons(image, prediction, fill_mask=False):
         for _polygon in polygons:
             _polygon = np.array(_polygon).reshape(-1, 2)
             if _polygon.shape[0] < 3:
-                print('Invalid polygon:', _polygon)
                 continue
             _polygon = (_polygon * scale).reshape(-1).tolist()
             if len(_polygon) % 2 != 0:
-                print('Invalid polygon:', _polygon)
                 continue
             polygon_points = np.array(_polygon).reshape(-1, 2)
             if fill_mask:
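
Note: the two dropped print() calls were the only diagnostics around these guards; the guards themselves stay. As a standalone illustration (not part of the commit; polygon_is_drawable is a hypothetical helper), this is what they protect matplotlib's Polygon from:

import numpy as np

def polygon_is_drawable(flat_coords):
    """True if flat_coords can be paired into >= 3 (x, y) vertices."""
    if len(flat_coords) % 2 != 0:   # odd length: cannot pair into (x, y)
        return False
    points = np.array(flat_coords).reshape(-1, 2)
    return points.shape[0] >= 3     # a polygon needs at least 3 vertices

print(polygon_is_drawable([0, 0, 10, 0, 10, 10]))  # True: a triangle
print(polygon_is_drawable([0, 0, 10, 0]))          # False: only 2 points
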
@@ -102,10 +100,12 @@ def draw_ocr_bboxes(image, prediction):
     bboxes, labels = prediction['quad_boxes'], prediction['labels']
     for box, label in zip(bboxes, labels):
         color = random.choice(colormap)
-        new_box = (np.array(box) * scale).tolist()
+        new_box = np.array(box) * scale
+        if new_box.ndim == 1:
+            new_box = new_box.reshape(-1, 2)
         polygon = patches.Polygon(new_box, edgecolor=color, fill=False, linewidth=3)
         ax.add_patch(polygon)
-        plt.text(new_box[0], new_box[1], label, color='white', fontsize=8, bbox=dict(facecolor=color, alpha=0.5))
+        plt.text(new_box[0, 0], new_box[0, 1], label, color='white', fontsize=8, bbox=dict(facecolor=color, alpha=0.5))
     ax.axis('off')
     return fig_to_pil(fig)
 
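
Note: patches.Polygon wants an (N, 2) vertex array, while Florence-2-style quad boxes typically arrive as a flat list of 8 numbers; the new ndim check covers the flat case, and the label anchor becomes the first vertex. A minimal sketch of the reshape (example coordinates, not from the commit):

import numpy as np
import matplotlib.patches as patches

quad = [10, 20, 110, 20, 110, 60, 10, 60]  # flat [x1, y1, ..., x4, y4]
new_box = np.array(quad) * 1.0             # scale factor of 1 for the sketch
if new_box.ndim == 1:                      # flat input: pair into vertices
    new_box = new_box.reshape(-1, 2)       # -> shape (4, 2)

polygon = patches.Polygon(new_box, fill=False, linewidth=3)
print(new_box[0, 0], new_box[0, 1])        # label anchor: first vertex
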
@@ -114,15 +114,25 @@ def draw_ocr_bboxes(image, prediction):
 def process_video(input_video_path, task_prompt):
     cap = cv2.VideoCapture(input_video_path)
     if not cap.isOpened():
-        print("Error: Can't open the video file.")
-        return
+        return None
 
     frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
     frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
     fps = cap.get(cv2.CAP_PROP_FPS)
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+    if frame_width <= 0 or frame_height <= 0 or fps <= 0 or total_frames <= 0:
+        cap.release()
+        return None
+
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
     out = cv2.VideoWriter("output_vid.mp4", fourcc, fps, (frame_width, frame_height))
 
+    if not out.isOpened():
+        cap.release()
+        return None
+
+    processed_frames = 0
     while cap.isOpened():
         ret, frame = cap.read()
         if not ret:
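
Note: cap.isOpened() can be True for a file OpenCV parses but cannot decode, in which case properties such as the frame size or FPS come back as 0; the new guard catches that before a zero-sized VideoWriter is created. A minimal standalone sketch of the same check (capture_is_usable and the path are hypothetical):

import cv2

def capture_is_usable(cap):
    # Mirrors the commit's guard: zeroed metadata means the file is unusable.
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    return width > 0 and height > 0 and fps > 0 and frames > 0

cap = cv2.VideoCapture("input.mp4")
if cap.isOpened() and capture_is_usable(cap):
    print("Safe to construct a VideoWriter from these properties.")
cap.release()
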
@@ -133,19 +143,25 @@ def process_video(input_video_path, task_prompt):
 
         result = run_example(task_prompt, pil_image)
 
+        processed_image = pil_image
         if task_prompt == "<OD>":
-            processed_image = plot_bbox(pil_image, result['<OD>'])
+            if "<OD>" in result and "bboxes" in result["<OD>"] and "labels" in result["<OD>"]:
+                processed_image = plot_bbox(pil_image, result['<OD>'])
         elif task_prompt == "<DENSE_REGION_CAPTION>":
-            processed_image = plot_bbox(pil_image, result['<DENSE_REGION_CAPTION>'])
-        else:
-            processed_image = pil_image
+            if "<DENSE_REGION_CAPTION>" in result and "polygons" in result["<DENSE_REGION_CAPTION>"] and "labels" in result["<DENSE_REGION_CAPTION>"]:
+                processed_image = draw_polygons(pil_image, result['<DENSE_REGION_CAPTION>'], fill_mask=True)
 
         processed_frame = cv2.cvtColor(np.array(processed_image), cv2.COLOR_RGB2BGR)
         out.write(processed_frame)
+        processed_frames += 1
 
     cap.release()
     out.release()
     cv2.destroyAllWindows()
+
+    if processed_frames == 0:
+        return None
+
     return "output_vid.mp4"
 
 css = """
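
Note: with the else branch folded into a processed_image = pil_image default and every failure path returning None, a caller can branch on the return value. Hypothetical usage (not in the commit):

output_path = process_video("input.mp4", "<OD>")
if output_path is None:
    print("Video could not be processed.")
else:
    print(f"Annotated video written to {output_path}")
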
@@ -162,11 +178,11 @@ with gr.Blocks(css=css) as demo:
     with gr.Row():
         with gr.Column():
             input_img = gr.Image(label="Input Picture", type="pil")
-            task_radio = gr.Radio(
-                ["Caption", "Detailed Caption", "More Detailed Caption", "Caption to Phrase Grounding",
-                 "Object Detection", "Dense Region Caption", "Region Proposal", "Referring Expression Segmentation",
-                 "Region to Segmentation", "Open Vocabulary Detection", "Region to Category", "Region to Description",
-                 "OCR", "OCR with Region"],
+            task_dropdown = gr.Dropdown(
+                choices=["Caption", "Detailed Caption", "More Detailed Caption", "Caption to Phrase Grounding",
+                         "Object Detection", "Dense Region Caption", "Region Proposal", "Referring Expression Segmentation",
+                         "Region to Segmentation", "Open Vocabulary Detection", "Region to Category", "Region to Description",
+                         "OCR", "OCR with Region"],
                 label="Task", value="Caption"
             )
             text_input = gr.Textbox(label="Text Input (is Optional)", visible=False)
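
Note: gr.Radio and gr.Dropdown share the choices/label/value constructor arguments in Gradio, so the swap changes only the widget. Abbreviated sketch (two choices instead of fourteen):

import gradio as gr

with gr.Blocks() as sketch_demo:
    task_dropdown = gr.Dropdown(
        choices=["Caption", "Object Detection"],  # shortened for the sketch
        label="Task",
        value="Caption",
    )
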
@@ -179,8 +195,8 @@ with gr.Blocks(css=css) as demo:
     with gr.Row():
         with gr.Column():
             input_video = gr.Video(label="Video")
-            video_task_radio = gr.Radio(
-                ["Object Detection", "Dense Region Caption"],
+            video_task_dropdown = gr.Dropdown(
+                choices=["Object Detection", "Dense Region Caption"],
                 label="Video Task", value="Object Detection"
             )
             video_submit_btn = gr.Button(value="Process Video")
@@ -192,7 +208,7 @@ with gr.Blocks(css=css) as demo:
                                                         "Region to Segmentation", "Open Vocabulary Detection", "Region to Category",
                                                         "Region to Description"])
 
-    task_radio.change(fn=update_text_input, inputs=task_radio, outputs=text_input)
+    task_dropdown.change(fn=update_text_input, inputs=task_dropdown, outputs=text_input)
 
     def process_image(image, task, text):
         task_mapping = {
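
Note: update_text_input is defined outside this hunk; from the visible context it toggles the optional textbox for region/grounding tasks. A plausible shape, purely illustrative (earlier list entries are elided in the diff context):

import gradio as gr

def update_text_input(task):
    needs_text = task in [
        "Region to Segmentation", "Open Vocabulary Detection",
        "Region to Category", "Region to Description",  # plus entries not shown here
    ]
    return gr.update(visible=needs_text)
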
@@ -219,7 +235,7 @@ with gr.Blocks(css=css) as demo:
         else:
             return "", image
 
-    submit_btn.click(fn=process_image, inputs=[input_img, task_radio, text_input], outputs=[output_text, output_image])
-    video_submit_btn.click(fn=process_video, inputs=[input_video, video_task_radio], outputs=output_video)
+    submit_btn.click(fn=process_image, inputs=[input_img, task_dropdown, text_input], outputs=[output_text, output_image])
+    video_submit_btn.click(fn=process_video, inputs=[input_video, video_task_dropdown], outputs=output_video)
 
 demo.launch()