feat: apply audio transcript evaluation
src/obs_eval_gradio.py  CHANGED  (+95 -28)
(Removed lines that the diff view truncates are marked with "…".)
@@ -10,6 +10,7 @@ from langchain.chat_models import ChatOpenAI
 from langchain.schema import StrOutputParser
 from PIL import Image
 
+
 global_dict = {}
 
 ######
@@ -55,9 +56,9 @@ def validate_api_key(api_key):
         raise gr.Error(f"OpenAI returned an API Error: {error}")
 
 
-def _process_video(image_file):
+def _process_video(video_file):
     # Read and process the video file
-    video = cv2.VideoCapture(image_file.name)
+    video = cv2.VideoCapture(video_file.name)
 
     base64Frames = []
     while video.isOpened():
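Note: the body of the frame-reading loop sits outside this hunk. For context, here is a minimal sketch of the usual cv2-to-base64 pattern (as popularized by OpenAI's GPT-4V cookbook); names and details are illustrative, not the file's verbatim code.

```python
# Illustrative sketch only -- the actual loop body is not shown in this diff.
import base64
import cv2

def extract_frames_b64(path: str) -> list[str]:
    video = cv2.VideoCapture(path)
    frames = []
    while video.isOpened():
        success, frame = video.read()
        if not success:
            break  # stream exhausted
        _, buffer = cv2.imencode(".jpg", frame)  # encode frame as JPEG bytes
        frames.append(base64.b64encode(buffer).decode("utf-8"))
    video.release()
    return frames
```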
@@ -75,10 +76,21 @@ def _process_video(image_file):
         raise gr.Error(f"Cannot open the video.")
     return base64Frames
 
+def _process_audio(video_file, api_key):
+    audio_file = open(video_file.name, "rb")
+
+    client = openai.OpenAI(api_key=api_key)
+    transcript = client.audio.transcriptions.create(
+        model="whisper-1",
+        file=audio_file,
+        response_format="text"
+    )
+    return transcript
+
 
-def _make_video_batch(image_file, batch_size, total_batch_percent):
+def _make_video_batch(video_file, batch_size, total_batch_percent):
 
-    frames = _process_video(image_file)
+    frames = _process_video(video_file)
 
     TOTAL_FRAME_COUNT = len(frames)
     BATCH_SIZE = int(batch_size)
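Note: `_process_audio` hands the whole video file to Whisper; `whisper-1` accepts common containers (mp4, webm, m4a, wav, ...) directly, and `response_format="text"` makes the call return a plain string rather than a JSON object. A standalone sketch of the same call (openai-python v1; the path and key are placeholders):

```python
import openai

client = openai.OpenAI(api_key="sk-...")  # placeholder key
# Using a context manager (unlike the diff) so the handle is closed promptly.
with open("sample.mp4", "rb") as media:
    transcript = client.audio.transcriptions.create(
        model="whisper-1",
        file=media,
        response_format="text",  # plain string instead of a JSON object
    )
print(transcript)
```

One nit on the change itself: the `open(video_file.name, "rb")` handle is never closed; a `with` block as above would fix that.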
@@ -110,9 +122,9 @@ def _make_video_batch(image_file, batch_size, total_batch_percent):
     return base64FramesBatch
 
 
-def show_batches(image_file, batch_size, total_batch_percent):
+def show_batches(video_file, batch_size, total_batch_percent):
 
-    batched_frames = _make_video_batch(image_file, batch_size, total_batch_percent)
+    batched_frames = _make_video_batch(video_file, batch_size, total_batch_percent)
 
     images = []
     for i, l in enumerate(batched_frames):
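Note: the sampling logic between these hunks is unchanged and not shown. A hypothetical reading of how `batch_size` and `total_batch_percent` could interact (purely illustrative; the real implementation may differ):

```python
def make_batches(frames: list[str], batch_size: int, total_percent: int) -> list[list[str]]:
    """Hypothetical: sample ~total_percent% of frames evenly, then chunk by batch_size."""
    total = len(frames)
    sample_count = max(batch_size, total * total_percent // 100)
    stride = max(1, total // sample_count)
    sampled = frames[::stride][:sample_count]
    return [sampled[i:i + batch_size] for i in range(0, len(sampled), batch_size)]
```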
@@ -130,6 +142,31 @@ def show_batches(image_file, batch_size, total_batch_percent):
     return images
 
 
+def change_audio_rubric(choice):
+    if choice == "Video only":
+        return gr.Textbox(visible=False)
+    else:
+        return gr.Textbox(
+            label="3. Audio Evaluation Rubric (if needed)",
+            info="Enter your evaluation rubric here...",
+            placeholder="<RUBRIC>\nHere's what the performer should *SAY* as follows:\n1. From standing, you need to shout 'Start' signal.\n2. Rock forward, you shouldn't make any noise while rolling.\n3. Standing still again, you need to shout 'Finish' signal.",
+            lines=7,
+            interactive=True,
+            visible=True)
+
+
+def change_audio_eval(choice):
+    if choice == "Video only":
+        return gr.Textbox(visible=False)
+    else:
+        return gr.Textbox(
+            label="Audio Script Eval...",
+            lines=10,
+            interactive=False,
+            visible=True
+        )
+
+
 def call_gpt_vision(api_key, instruction, progress=gr.Progress()):
     frames = global_dict.get('batched_frames')
     openai.api_key = api_key
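Note: `change_audio_rubric` and `change_audio_eval` rely on the Gradio 4.x convention that an event handler may return a component instance to update the bound output's properties. A minimal self-contained sketch of that show/hide pattern:

```python
import gradio as gr

def toggle(choice):
    # Returning a component updates the bound output's props (Gradio 4.x).
    return gr.Textbox(visible=(choice != "Video only"))

with gr.Blocks() as demo:
    radio = gr.Radio(choices=["Video + Audio", "Video only"], value="Video + Audio")
    box = gr.Textbox(label="Audio rubric", visible=True)
    radio.change(fn=toggle, inputs=radio, outputs=box)

if __name__ == "__main__":
    demo.launch()
```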
@@ -147,7 +184,7 @@ def call_gpt_vision(api_key, instruction, progress=gr.Progress()):
             {
                 "role": "user",
                 "content": [
-                    "Evaluate the behavior's actions based on the <…
+                    "Evaluate the behavior's actions based on the <RUBRIC> provided.\n\n" + instruction,
                     *map(lambda x: {"image": x, "resize": 300}, batch),
                 ],
             },
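Note: the `{"image": ..., "resize": ...}` entries follow the informal GPT-4V payload shorthand from OpenAI's vision cookbook (pre-`image_url` style). A sketch of the message list being built, with `batch` assumed to be a list of base64-encoded JPEG frames:

```python
def build_messages(instruction: str, batch: list[str]) -> list[dict]:
    # One user turn: the rubric-bearing instruction followed by the frames.
    return [
        {
            "role": "user",
            "content": [
                "Evaluate the behavior's actions based on the <RUBRIC> provided.\n\n" + instruction,
                *({"image": frame, "resize": 300} for frame in batch),
            ],
        }
    ]
```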
@@ -234,7 +271,7 @@ def get_final_anser(api_key, result_text):
 def main():
     with gr.Blocks() as demo:
         gr.Markdown("# GPT-4 Vision for Evaluation")
-        gr.Markdown("## 1st STEP. Make Batched Snapshots")
+        gr.Markdown("## 1st STEP. Make Batched Snapshots & Audio Script")
         with gr.Row():
             with gr.Column(scale=1):
                 api_key_input = gr.Textbox(
@@ -247,47 +284,77 @@ def main():
                     label="Upload your video (under 1 minute video is the best..!)",
                     file_types=["video"],
                 )
-                batch_size = gr.…
+                batch_size = gr.Slider(
                     label="Number of images in one batch",
-                    info="…
+                    info="Choose between 2 and 5",
                     value=5,
                     minimum=2,
-                    maximum=5
+                    maximum=5,
+                    step=1
                 )
-                total_batch_percent = gr.…
+                total_batch_percent = gr.Slider(
                     label="Percentage(%) of batched image frames to total frames",
-                    info="(…
+                    info="Choose between 5(%) and 20(%)",
                     value=5,
                     minimum=5,
                     maximum=20,
                     step=5
                 )
-                process_button = gr.Button("Process")
-
+                process_button = gr.Button("Process")
             with gr.Column(scale=1):
                 gallery = gr.Gallery(
                     label="Batched Snapshots of Video",
                     columns=[5],
-                    rows=[1],
                     object_fit="contain",
                     height="auto"
                 )
-
+                transcript_box = gr.Textbox(
+                    label="Audio Transcript",
+                    lines=8,
+                    interactive=False
+                )
+
+        gr.Markdown("## 2nd STEP. Set Evaluation Rubric")
         with gr.Row():
             with gr.Column(scale=1):
-                …
-                    label="…
-                    info="…
-                    …
-                    …
-                    …
+                multimodal_radio = gr.Radio(
+                    label="1. Multimodal Selection",
+                    info="Choose evaluation channel",
+                    value="Video + Audio",
+                    choices=["Video + Audio", "Video only"]
+                )
+                rubric_video_input = gr.Textbox(
+                    label="2. Video Evaluation Rubric",
+                    info="Enter your evaluation rubric here...",
+                    placeholder="<RUBRIC>\nHere's what the performer should *SHOW* as follows:\n1. From standing, bend your knees and straighten your arms in front of you.\n2. Place your hands on the floor, shoulder width apart with fingers pointing forward and your chin on your chest.\n3. Rock forward, straighten legs and transfer body weight onto shoulders.\n4. Rock forward on a rounded back placing both feet on the floor.\n5. Stand using arms for balance, without hands touching the floor.",
+                    lines=7
+                )
+                rubric_audio_input = gr.Textbox(
+                    label="3. Audio Evaluation Rubric (if needed)",
+                    info="Enter your evaluation rubric here...",
+                    placeholder="<RUBRIC>\nHere's what the performer should *SAY* as follows:\n1. From standing, you need to shout 'Start' signal.\n2. Rock forward, you shouldn't make any noise while rolling.\n3. Standing still again, you need to shout 'Finish' signal.",
+                    interactive=True,
+                    visible=True,
+                    lines=7
+                )
+                multimodal_radio.change(fn=change_audio_rubric, inputs=multimodal_radio, outputs=rubric_audio_input)
 
+                submit_button = gr.Button("Evaluate")
             with gr.Column(scale=1):
-                …
-                    label="Batched …
+                video_output_box = gr.Textbox(
+                    label="Video Batched Snapshots Eval...",
                     lines=10,
                     interactive=False
                 )
+                audio_output_box = gr.Textbox(
+                    label="Audio Script Eval...",
+                    lines=10,
+                    interactive=False,
+                    visible=True
+                )
+                multimodal_radio.change(fn=change_audio_eval, inputs=multimodal_radio, outputs=audio_output_box)
+
+
         gr.Markdown("## 3rd STEP. Summarize and Get Result")
         with gr.Row():
             with gr.Column(scale=1):
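Note: the radio is wired twice, once per dependent component. An alternative (hypothetical, not what the commit does) is a single handler that updates both the audio rubric input and the audio eval box:

```python
import gradio as gr

def toggle_audio_ui(choice):
    visible = choice != "Video only"
    return gr.Textbox(visible=visible), gr.Textbox(visible=visible)

# Wiring sketch, reusing the component names added above:
# multimodal_radio.change(fn=toggle_audio_ui, inputs=multimodal_radio,
#                         outputs=[rubric_audio_input, audio_output_box])
```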
@@ -299,11 +366,11 @@ def main():
                 submit_button_2 = gr.Button("Summarize")
 
             with gr.Column(scale=1):
-                output_box_fin_fin = gr.Textbox(label="…
+                output_box_fin_fin = gr.Textbox(label="Final Evaluation", lines=10, interactive=True)
 
 
-        process_button.click(fn=validate_api_key, inputs=api_key_input, outputs=None).success(fn=show_batches, inputs=[video_upload, batch_size, total_batch_percent], outputs=gallery)
-        submit_button.click(fn=call_gpt_vision, inputs=[api_key_input, …
+        process_button.click(fn=validate_api_key, inputs=api_key_input, outputs=None).success(fn=_process_audio, inputs=[video_upload, api_key_input], outputs=transcript_box).success(fn=show_batches, inputs=[video_upload, batch_size, total_batch_percent], outputs=gallery)
+        submit_button.click(fn=call_gpt_vision, inputs=[api_key_input, rubric_video_input], outputs=video_output_box).then().then(get_full_result, None, output_box_fin)
         submit_button_2.click(fn=get_final_anser, inputs=[api_key_input, output_box_fin], outputs=output_box_fin_fin)
 
         demo.launch()
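Note: in the rewired `process_button` chain, `.success()` runs a step only if the previous one finished without raising, so a failed key validation now blocks both transcription and batching; `.then()` (as used on `submit_button`, including the bare no-op `.then()`) runs regardless of the prior step's outcome. A minimal sketch of the gating behavior, with hypothetical handlers:

```python
import gradio as gr

def validate(key):
    if not key.startswith("sk-"):
        raise gr.Error("Invalid API key")  # stops the .success() chain

def transcribe(key):
    return "transcript..."  # placeholder result

with gr.Blocks() as demo:
    key_box = gr.Textbox(label="API key")
    out = gr.Textbox(label="Transcript")
    btn = gr.Button("Process")
    btn.click(fn=validate, inputs=key_box, outputs=None) \
       .success(fn=transcribe, inputs=key_box, outputs=out)

if __name__ == "__main__":
    demo.launch()
```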