Dropdown
app.py
CHANGED
@@ -79,11 +79,9 @@ def draw_polygons(image, prediction, fill_mask=False):
     for _polygon in polygons:
         _polygon = np.array(_polygon).reshape(-1, 2)
         if _polygon.shape[0] < 3:
-            print('Invalid polygon:', _polygon)
             continue
         _polygon = (_polygon * scale).reshape(-1).tolist()
         if len(_polygon) % 2 != 0:
-            print('Invalid polygon:', _polygon)
             continue
         polygon_points = np.array(_polygon).reshape(-1, 2)
         if fill_mask:
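The two dropped `print` calls fired once per malformed polygon on every frame, so removing them quiets stdout without changing behavior: degenerate polygons are still skipped. A minimal standalone sketch of the same guards (a hypothetical helper, not part of app.py; a scalar `scale` is assumed):

```python
import numpy as np

def clean_polygon(_polygon, scale=1.0):
    """Return an (N, 2) vertex array, or None for degenerate input.

    Mirrors the guards kept in draw_polygons: coordinates must come in
    x/y pairs (even count) and form at least 3 vertices.
    """
    flat = np.asarray(_polygon, dtype=float).reshape(-1)
    if len(flat) % 2 != 0:   # odd coordinate count: not x/y pairs, skip
        return None
    pts = flat.reshape(-1, 2) * scale
    if pts.shape[0] < 3:     # fewer than 3 vertices: nothing to draw
        return None
    return pts

assert clean_polygon([0, 0, 10, 0, 5, 8]) is not None   # valid triangle
assert clean_polygon([0, 0, 10, 0]) is None             # only 2 vertices
assert clean_polygon([0, 0, 10]) is None                # odd length
```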
@@ -102,10 +100,12 @@ def draw_ocr_bboxes(image, prediction):
     bboxes, labels = prediction['quad_boxes'], prediction['labels']
     for box, label in zip(bboxes, labels):
         color = random.choice(colormap)
-        new_box =
+        new_box = np.array(box) * scale
+        if new_box.ndim == 1:
+            new_box = new_box.reshape(-1, 2)
         polygon = patches.Polygon(new_box, edgecolor=color, fill=False, linewidth=3)
         ax.add_patch(polygon)
-        plt.text(new_box[0], new_box[1], label, color='white', fontsize=8, bbox=dict(facecolor=color, alpha=0.5))
+        plt.text(new_box[0, 0], new_box[0, 1], label, color='white', fontsize=8, bbox=dict(facecolor=color, alpha=0.5))
     ax.axis('off')
     return fig_to_pil(fig)
 
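The fix matters because `patches.Polygon` expects an (N, 2) vertex array, while Florence-2-style OCR quad boxes arrive as flat lists `[x1, y1, ..., x4, y4]`; after reshaping, the label anchor also has to become `new_box[0, 0], new_box[0, 1]` (first vertex) instead of the old scalar indices. A self-contained sketch with a synthetic box (not app.py's data):

```python
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# A synthetic OCR quad box: flat [x1, y1, ..., x4, y4], as Florence-2 emits.
box = [10, 10, 110, 12, 112, 40, 12, 38]
scale = 1.0

new_box = np.array(box) * scale
if new_box.ndim == 1:              # flat list of 8 -> (4, 2) vertex array
    new_box = new_box.reshape(-1, 2)

fig, ax = plt.subplots()
ax.imshow(np.zeros((60, 130, 3)))  # placeholder "image"
# patches.Polygon wants (N, 2) vertices; a flat 1-D array is rejected.
ax.add_patch(patches.Polygon(new_box, edgecolor='red', fill=False, linewidth=3))
# Anchor the label at the first vertex: row 0, columns x then y.
ax.text(new_box[0, 0], new_box[0, 1], 'hello', color='white', fontsize=8,
        bbox=dict(facecolor='red', alpha=0.5))
ax.axis('off')
plt.savefig('quad_demo.png')
```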
@@ -114,15 +114,25 @@ def draw_ocr_bboxes(image, prediction):
 def process_video(input_video_path, task_prompt):
     cap = cv2.VideoCapture(input_video_path)
     if not cap.isOpened():
-
-        return
+        return None
 
     frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
     frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
     fps = cap.get(cv2.CAP_PROP_FPS)
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+    if frame_width <= 0 or frame_height <= 0 or fps <= 0 or total_frames <= 0:
+        cap.release()
+        return None
+
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
     out = cv2.VideoWriter("output_vid.mp4", fourcc, fps, (frame_width, frame_height))
 
+    if not out.isOpened():
+        cap.release()
+        return None
+
+    processed_frames = 0
     while cap.isOpened():
         ret, frame = cap.read()
         if not ret:
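These guards matter because OpenCV reports missing metadata as zeros rather than raising: `cap.isOpened()` can succeed while `CAP_PROP_FPS` or `CAP_PROP_FRAME_COUNT` come back 0 for a corrupt or unsupported file, which previously fed garbage dimensions to `VideoWriter`. The same probing logic in isolation (a hypothetical helper, not part of app.py):

```python
import cv2

def probe_video(path):
    """Return (width, height, fps, frames) or None, mirroring the new guards.

    VideoCapture.get() returns 0.0 (not an error) for properties it cannot
    read, so a broken file shows up here as zeros.
    """
    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        return None
    props = (
        int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
        int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
        cap.get(cv2.CAP_PROP_FPS),
        int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
    )
    cap.release()
    return None if any(v <= 0 for v in props) else props
```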
@@ -133,19 +143,25 @@ def process_video(input_video_path, task_prompt):
 
         result = run_example(task_prompt, pil_image)
 
+        processed_image = pil_image
         if task_prompt == "<OD>":
-
+            if "<OD>" in result and "bboxes" in result["<OD>"] and "labels" in result["<OD>"]:
+                processed_image = plot_bbox(pil_image, result['<OD>'])
         elif task_prompt == "<DENSE_REGION_CAPTION>":
-
-
-            processed_image = pil_image
+            if "<DENSE_REGION_CAPTION>" in result and "polygons" in result["<DENSE_REGION_CAPTION>"] and "labels" in result["<DENSE_REGION_CAPTION>"]:
+                processed_image = draw_polygons(pil_image, result['<DENSE_REGION_CAPTION>'], fill_mask=True)
 
         processed_frame = cv2.cvtColor(np.array(processed_image), cv2.COLOR_RGB2BGR)
         out.write(processed_frame)
+        processed_frames += 1
 
     cap.release()
     out.release()
     cv2.destroyAllWindows()
+
+    if processed_frames == 0:
+        return None
+
     return "output_vid.mp4"
 
 css = """
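With these changes `process_video` returns `None` on every failure path (unopenable file, bad metadata, writer failure, zero processed frames) instead of emitting a bogus output file, and it falls back to the raw frame whenever the model result lacks the expected keys. Gradio typically clears a `gr.Video` output when the function returns `None`. A hypothetical smoke test (the file name is an assumption, not from app.py):

```python
result = process_video("sample.mp4", "<OD>")
if result is None:
    print("video rejected: unreadable, bad metadata, or no frames processed")
else:
    print(f"annotated video written to {result}")
```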
@@ -162,11 +178,11 @@ with gr.Blocks(css=css) as demo:
     with gr.Row():
         with gr.Column():
             input_img = gr.Image(label="Input Picture", type="pil")
-
-            ["Caption", "Detailed Caption", "More Detailed Caption", "Caption to Phrase Grounding",
-
-
-
+            task_dropdown = gr.Dropdown(
+                choices=["Caption", "Detailed Caption", "More Detailed Caption", "Caption to Phrase Grounding",
+                         "Object Detection", "Dense Region Caption", "Region Proposal", "Referring Expression Segmentation",
+                         "Region to Segmentation", "Open Vocabulary Detection", "Region to Category", "Region to Description",
+                         "OCR", "OCR with Region"],
                 label="Task", value="Caption"
            )
             text_input = gr.Textbox(label="Text Input (is Optional)", visible=False)
@@ -179,8 +195,8 @@ with gr.Blocks(css=css) as demo:
     with gr.Row():
         with gr.Column():
             input_video = gr.Video(label="Video")
-
-            ["Object Detection", "Dense Region Caption"],
+            video_task_dropdown = gr.Dropdown(
+                choices=["Object Detection", "Dense Region Caption"],
                 label="Video Task", value="Object Detection"
             )
             video_submit_btn = gr.Button(value="Process Video")
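The video dropdown exposes human-readable names, while `process_video` branches on Florence-2 task tokens (`<OD>`, `<DENSE_REGION_CAPTION>`), so a name-to-token translation has to happen somewhere in between. A hedged sketch of that mapping (the actual conversion in app.py is not shown in this diff; only these two tokens are confirmed by the `process_video` branches above):

```python
# Assumed mapping from dropdown display names to Florence-2 task tokens.
VIDEO_TASK_TOKENS = {
    "Object Detection": "<OD>",
    "Dense Region Caption": "<DENSE_REGION_CAPTION>",
}

def video_task_to_token(task_name: str) -> str:
    return VIDEO_TASK_TOKENS.get(task_name, "<OD>")  # default is an assumption
```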
@@ -192,7 +208,7 @@ with gr.Blocks(css=css) as demo:
                     "Region to Segmentation", "Open Vocabulary Detection", "Region to Category",
                     "Region to Description"])
 
-
+    task_dropdown.change(fn=update_text_input, inputs=task_dropdown, outputs=text_input)
 
     def process_image(image, task, text):
         task_mapping = {
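`update_text_input` is wired here but its body sits outside the hunk; from the surrounding context (a `text_input` created with `visible=False` and a list of region/grounding tasks) it presumably toggles the textbox's visibility. A plausible sketch under that assumption (the task list below merges the hunk's visible entries with dropdown choices and is partly guessed):

```python
import gradio as gr

# Assumed: show the optional textbox only for tasks that take a text or
# region prompt; the last four entries mirror the context lines above.
TEXT_TASKS = ["Caption to Phrase Grounding", "Referring Expression Segmentation",
              "Region to Segmentation", "Open Vocabulary Detection",
              "Region to Category", "Region to Description"]

def update_text_input(task):
    return gr.update(visible=task in TEXT_TASKS)
```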
@@ -219,7 +235,7 @@ with gr.Blocks(css=css) as demo:
         else:
             return "", image
 
-    submit_btn.click(fn=process_image, inputs=[input_img,
-    video_submit_btn.click(fn=process_video, inputs=[input_video,
+    submit_btn.click(fn=process_image, inputs=[input_img, task_dropdown, text_input], outputs=[output_text, output_image])
+    video_submit_btn.click(fn=process_video, inputs=[input_video, video_task_dropdown], outputs=output_video)
 
 demo.launch()
|