SOAP_temp /
adiv07's picture
6b3ba4b verified
history blame
20.6 kB
import gradio as gr
import plotly.graph_objs as go
import numpy as np
import time
from openai import OpenAI
import os
from hardCodedData import *
from Helper import *
import cv2
from moviepy.editor import VideoFileClip
import time
import base64
import whisperx
import gc
from moviepy.editor import VideoFileClip
from dotenv import load_dotenv
# Model Information
import openai
# Read the key from the environment so it never lives in source control.
api_key = os.getenv("OPENAI_API_KEY")
# NOTE(review): the original constructor call was left unterminated; passing the
# key read above is the presumed intent — confirm no other arguments are needed.
client = openai.OpenAI(api_key=api_key)
# Whisperx config
# `device` and `compute_type` are handed to whisperx.load_model below.
device = "cpu"
batch_size = 16 # reduce if low on GPU mem
compute_type = "int8" # change to "int8" if low on GPU mem (may reduce accuracy)
from faster_whisper.transcribe import TranscriptionOptions
# Initialize TranscriptionOptions with the required arguments
# NOTE(review): the call below has no visible arguments or closing parenthesis —
# restore them before running.
default_asr_options = TranscriptionOptions(
# Load the model using whisperx.load_model
model = whisperx.load_model("large-v2", device, compute_type=compute_type)
# Module-level state that handle_video rebinds through `global`.
video_file = None
base64Frames = []
# Pre-computed transcript of the example video. Each entry is a "Dialogue:" line
# followed by its start and end time in whole seconds.
# NOTE(review): the closing delimiter was missing; it is restored on its own line
# after the last entry — confirm no further entries were intended.
transcript='''Dialogue: A take-off is something that happens in the story that gets everything going.
start: 4
end: 8
Dialogue: It can be something scary, something funny, or even a problem.
start: 9
end: 15
Dialogue: All stories have a take-off or a problem that starts the story off and makes the characters do something or get into action.
start: 15
end: 23
Dialogue: Can you guys think of some scary things that might happen to get a story going?
start: 24
end: 29
Dialogue: What would make you run?
start: 30
end: 31
Dialogue: A bear?
start: 32
end: 33
Dialogue: A bear.
start: 33
end: 33
Dialogue: But let's say you guys are going to go home and tell your mom a story about what happened today in school.
start: 34
end: 39
Dialogue: And let's say you say, well, first, this big guy came and got us out of class to learn how to tell stories.
start: 40
end: 46
Dialogue: And we were sitting in the classroom.
start: 46
end: 48
Dialogue: I was sitting in the classroom with Jared, Jared, and Jacob, when all of a sudden, it's about those words, all of a sudden, a grizzly bear walked through the door.
start: 49
end: 60
Dialogue: Would that be a problem?
start: 62
end: 63
Dialogue: Yeah.
start: 65
end: 65
Dialogue: Okay.
start: 65
end: 66
Dialogue: Would that be our takeoff?
start: 66
end: 69
Dialogue: Yeah.
start: 69
end: 69
Dialogue: So what's our takeoff in that story?
start: 69
end: 70
Dialogue: A grizzly bear walked through the door.
start: 72
end: 73
Dialogue: Or we could say, all of a sudden, a spaceship landed outside the school.
start: 76
end: 81
Dialogue: Or a flying saucer.
start: 81
end: 82
Dialogue: A flying saucer landed outside the school.
start: 82
end: 85
Dialogue: Would that be a good takeoff to a story?
start: 85
end: 87
Dialogue: How about
start: 88
end: 88
Dialogue: Can you think of some funny things that might happen to get a story going?
start: 91
end: 94
Dialogue: A clown came in through the door.
start: 95
end: 98
Dialogue: Or my chair broke and I fell on the floor.
start: 100
end: 103
Dialogue: What's this icon called?
start: 105
end: 107
Dialogue: Takeoff!
start: 107
end: 109
Dialogue: Tell me what kinds of things can get a story started.
start: 110
end: 112
Dialogue: scary, funny, or a problem.
start: 116
end: 120
'''
# transcript=""
def process_video(video_path, seconds_per_frame=2, target_width=320, target_height=180):
    """Sample frames from a video and extract its audio track.

    One frame is taken every ``seconds_per_frame`` seconds, resized to
    ``target_width`` x ``target_height``, JPEG-encoded and base64-encoded.
    The audio track is written to a fixed MP3 path.

    Args:
        video_path: Path of the video file to read.
        seconds_per_frame: Sampling interval between extracted frames.
        target_width: Width in pixels of each extracted frame.
        target_height: Height in pixels of each extracted frame.

    Returns:
        A ``(base64Frames, audio_path)`` tuple: the list of base64 JPEG
        strings and the path of the extracted audio file.
    """
    global audio_path

    base64Frames = []
    video = cv2.VideoCapture(video_path)
    try:
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = video.get(cv2.CAP_PROP_FPS)
        # A step of 0 (fps not reported, or a very small interval) would never
        # advance curr_frame, so always move forward by at least one frame.
        frames_to_skip = max(1, int(fps * seconds_per_frame))

        # Retrieve and print the original width and height
        original_width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        original_height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        print(f"Original width: {original_width}, Original height: {original_height}")

        # Loop through the video and extract frames at specified sampling rate
        curr_frame = 0
        while curr_frame < total_frames - 1:
            video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
            success, frame = video.read()
            if not success:
                break
            # Resize the frame
            resized_frame = cv2.resize(frame, (target_width, target_height))
            _, buffer = cv2.imencode(".jpg", resized_frame)
            base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
            curr_frame += frames_to_skip
    finally:
        # Release the capture handle even if decoding fails part-way.
        video.release()

    # Extract audio from video
    # NOTE(review): the output path is fixed, so every processed video
    # overwrites the same MP3 — confirm this is intended.
    audio_path = "./TakeOff.mp3"
    clip = VideoFileClip(video_path)
    try:
        clip.audio.write_audiofile(audio_path, bitrate="32k")
    finally:
        clip.close()

    print(f"Extracted {len(base64Frames)} frames")
    print(f"Extracted audio to {audio_path}")
    return base64Frames, audio_path
# Conversation sent to the chat model, seeded with the system instructions.
# NOTE(review): the delimiters of this literal were missing, so the line breaks
# inside the prompt are reconstructed one per source line — confirm against the
# intended prompt. "-React:" may have been meant as "-Reach:" given its
# description; left as written.
chat_history = []
chat_history.append({
    "role": "system",
    "content": (
        "You are an assistant chatbot for a Speech Language Pathologist (SLP).\n"
        "Your task is to help analyze a provided video of a therapy session and answer questions accurately.\n"
        "Provide timestamps in MM:SS format as frames are given at 1 fps for specific events or behaviors mentioned.\n"
        # The acronym is defined as IBR in this same sentence; "IRB" was a typo.
        "Analyse the video for IBR based on information below: Initiating Behavioral Request (IBR): the child's skill in using behavior(s) to elicit aid in obtaining an object, or object related event\n"
        "Instances of IBR:\n"
        "-Language: Listen for intelligible single words or greater verbal expressions the child uses to request an object or assistance.\n"
        "-React:Observe if the child extends their arm with an open palm towards the object or the adult. Do not consider grabbing as a --requesting gesture.\n"
        "-Point: Look for the child pointing at the object or direction where the object is located.\n"
        "-Give: Watch if the child hands a toy or object to the adult to request help.\n"
    ),
})
def transcribe_video(audio_path):
    """Transcribe, align and diarize an audio file into the global transcript.

    Each resulting segment is appended to the module-level ``transcript`` as a
    "Dialogue:" line followed by its integer start and end seconds and a blank
    line. Speaker labels are assigned to the segments but not written out.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The updated global transcript string.

    Raises:
        ValueError: If ``audio_path`` is empty or None.
    """
    global transcript

    if not audio_path:
        raise ValueError("Audio path is None")

    waveform = whisperx.load_audio(audio_path)
    raw_result = model.transcribe(waveform, batch_size=batch_size)

    # Word-level alignment uses a model matched to the detected language.
    align_model, align_metadata = whisperx.load_align_model(
        language_code=raw_result["language"], device=device
    )
    aligned = whisperx.align(
        raw_result["segments"],
        align_model,
        align_metadata,
        waveform,
        device,
        return_char_alignments=False,
    )

    diarizer = whisperx.DiarizationPipeline(
        use_auth_token=os.getenv("HF_AUTH_TOKEN"), device=device
    )
    speaker_turns = diarizer(waveform)
    labelled = whisperx.assign_word_speakers(speaker_turns, aligned)

    entries = [
        "Dialogue: " + str(segment["text"].lstrip()) + "\n"
        + "start: " + str(int(segment["start"])) + "\n"
        + "end: " + str(int(segment["end"])) + "\n"
        + "\n"
        for segment in labelled["segments"]
    ]
    transcript += "".join(entries)
    return transcript
def handle_video(video=None):
    """Process a video and add its frames (and transcript) to the chat history.

    NOTE(review): the statements that wrapped the message literals were
    missing; they are reconstructed here as one user message appended to
    ``chat_history`` — confirm against the intended behavior.

    Args:
        video: Path of the video to analyze. When None, the bundled example
            video is used.

    Returns:
        The path of the video that was processed.
    """
    global video_file, base64Frames, audio_path, chat_history, transcript

    if video is None:
        # Load example video
        video = "./TakeOff.mp4"

    base64Frames, audio_path = process_video(video, seconds_per_frame=1, target_width=320, target_height=180)

    content = [
        {"type": "text", "text": "These are the frames from the video."},
        *map(lambda x: {"type": "image_url", "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames),
    ]

    if transcript:
        # NOTE(review): the template mentions a "Speaker:" field, but the
        # transcript entries visible in this file carry none — verify.
        content.append({
            "type": "text",
            "text": "Also, below is the template of transcript from the video:\n"
                    "Speaker: <the speaker of the dialogue>\n"
                    "Dialogue: <the text of the dialogue>\n"
                    "start: <the starting timestamp of the dialogue in the video in second>\n"
                    "end: <the ending timestamp of the dialogue in the video in second>\n"
                    f"Transcription: {transcript}",
        })

    chat_history.append({"role": "user", "content": content})

    video_file = video
    return video_file
def new_prompt(prompt, model="gpt-4o"):
    """Send a user prompt to the chat model and record the reply.

    The prompt is appended to the global ``chat_history``, the whole history is
    sent to the chat-completions endpoint, and the reply (or an error notice)
    is appended as the last entry so callers can read ``[-1]['content']``.

    NOTE(review): the original request call was truncated and its model name is
    not recoverable; ``"gpt-4o"`` is an assumed default — confirm. The original
    had separate "video processed" / "no video" branches whose visible
    arguments were identical, so they are merged here.

    Args:
        prompt: The user's message text.
        model: Chat model identifier passed to the API.

    Returns:
        The updated global chat history list.
    """
    global chat_history

    chat_history.append({"role": "user", "content": prompt})
    try:
        response = client.chat.completions.create(
            model=model,
            messages=chat_history,
            temperature=0,
        )
        # Extract the text content from the response
        assistant_message = response.choices[0].message.content
    except openai.RateLimitError as e:
        print("Error: ", e)
        assistant_message = "API rate limit has been reached. Please wait a moment and try again."
    except Exception as e:
        # Boundary handler for the UI: report the failure instead of crashing,
        # without mislabelling every error as a rate limit.
        print("Error: ", e)
        assistant_message = f"An error occurred: {str(e)}"

    # Model replies belong under the "assistant" role; storing them as "system"
    # would feed them back to the model as instructions on the next request.
    chat_history.append({"role": "assistant", "content": assistant_message})
    return chat_history
def user_input(user_message, history):
    """Queue a user message in the chat display and clear the textbox.

    Args:
        user_message: Text the user just submitted.
        history: Current list of ``[user, assistant]`` message pairs.

    Returns:
        A tuple of an empty string (the new textbox value) and a new history
        list with ``[user_message, None]`` appended; ``history`` itself is not
        modified.
    """
    pending_turn = [user_message, None]
    extended_history = history + [pending_turn]
    return "", extended_history
def bot_response(history):
    """Fill in the assistant reply for the most recent chat turn.

    Reads the user message from the last ``[user, assistant]`` pair, passes it
    to ``new_prompt``, and writes the content of the last returned history
    entry into that pair.

    Args:
        history: List of ``[user, assistant]`` message pairs; updated in place.

    Yields:
        The updated ``history`` list, once.
    """
    latest_turn = history[-1]
    conversation = new_prompt(latest_turn[0])
    latest_turn[1] = conversation[-1]['content']
    yield history
# Behavior box
# Built-in behaviors, each stored as (name, (definition, [MM:SS timestamps])).
initial_behaviors = [
    ("Initiating Behavioral Request (IBR)",
     ("The child's skill in using behavior(s) to elicit aid in obtaining an object, or object related event",
      ["00:10", "00:45", "01:30"])),
    ("Initiating Joint Attention (IJA)",
     ("The child's skill in using behavior(s) to initiate shared attention to objects or events.",
      ["00:15", "00:50", "01:40"])),
    ("Responding to Joint Attention (RJA)",
     ("The child's skill in following the examiner’s line of regard and pointing gestures.",
      ["00:20", "01:00", "02:00"])),
    ("Initiating Social Interaction (ISI)",
     ("The child's skill at initiating turn-taking sequences and the tendency to tease the tester",
      ["00:20", "00:50", "02:00"])),
    ("Responding to Social Interaction (RSI)",
     ("The child’s skill in responding to turn-taking interactions initiated by the examiner.",
      ["00:20", "01:00", "02:00"])),
]
# Working copy: add_or_update_behavior edits this list in place, and a plain
# alias would silently rewrite initial_behaviors as well.
behaviors = list(initial_behaviors)
# Names of the behaviors currently selected for analysis.
behavior_bank = []
def add_or_update_behavior(name, definition, timestamps, selected_behavior):
    """Create a behavior, or overwrite the one currently selected.

    When ``selected_behavior`` is set, the matching entry in the global
    ``behaviors`` list is replaced and any occurrence of the old name in the
    global ``behavior_bank`` is renamed. Otherwise a new entry is appended to
    ``behaviors``.

    Args:
        name: Behavior name to store.
        definition: Behavior definition text.
        timestamps: Timestamps value to store with the behavior.
        selected_behavior: Name of the behavior being edited, or a falsy value
            to add a new one.

    Returns:
        A 5-tuple: the refreshed dropdown, the refreshed checkbox group, and
        three empty strings that clear the name, definition and timestamps
        inputs.
    """
    global behaviors, behavior_bank

    entry = (name, (definition, timestamps))
    if selected_behavior:  # Update existing behavior
        for i, (old_name, _) in enumerate(behaviors):
            if old_name == selected_behavior:
                behaviors[i] = entry
                # Update behavior in the bank if it exists
                behavior_bank = [name if b == selected_behavior else b for b in behavior_bank]
                break
    else:  # Add new behavior
        # The original built the tuple but never stored it, so new behaviors
        # did not appear in the dropdown.
        behaviors.append(entry)

    choices = [b[0] for b in behaviors]
    return (
        gr.Dropdown(choices=choices, value=None, interactive=True),
        gr.CheckboxGroup(choices=behavior_bank, value=behavior_bank, interactive=True),
        "",
        "",
        "",
    )
def add_to_behaivor_bank(selected_behavior, checkbox_group_values):
    """Add the selected behavior to the behavior bank if it is not there yet.

    Args:
        selected_behavior: Name chosen in the behavior dropdown; may be falsy
            when nothing is selected.
        checkbox_group_values: Names currently checked in the bank.

    Returns:
        A tuple of the refreshed checkbox group and a dropdown update that
        clears the current selection.
    """
    global behavior_bank

    # Work on a copy so the list passed in is not mutated; also tolerate a
    # missing value.
    bank_values = list(checkbox_group_values or [])
    if selected_behavior and selected_behavior not in bank_values:
        # The original condition had no body, so nothing was ever added.
        bank_values.append(selected_behavior)

    behavior_bank = bank_values
    return (
        gr.CheckboxGroup(choices=bank_values, value=bank_values, interactive=True),
        gr.Dropdown(value=None, interactive=True),
    )
def delete_behavior(selected_behavior, checkbox_group_values):
    """Remove a behavior from the collection, the bank and the checked values.

    Args:
        selected_behavior: Name of the behavior to remove.
        checkbox_group_values: Names currently checked in the bank.

    Returns:
        A tuple of the refreshed dropdown (selection cleared) and the refreshed
        checkbox group without the removed behavior.
    """
    global behaviors, behavior_bank

    def _is_kept(label):
        return label != selected_behavior

    behaviors = [entry for entry in behaviors if _is_kept(entry[0])]
    behavior_bank = list(filter(_is_kept, behavior_bank))

    remaining_names = [entry[0] for entry in behaviors]
    remaining_checked = list(filter(_is_kept, checkbox_group_values))

    dropdown_update = gr.Dropdown(choices=remaining_names, value=None, interactive=True)
    bank_update = gr.CheckboxGroup(choices=remaining_checked, value=remaining_checked, interactive=True)
    return dropdown_update, bank_update
def edit_behavior(selected_behavior):
    """Look up a behavior so its fields can be loaded into the edit form.

    Args:
        selected_behavior: Name of the behavior to look up in ``behaviors``.

    Returns:
        ``(name, definition, timestamps)`` for the first behavior with that
        name, or three empty strings when there is no match.
    """
    found = next((entry for entry in behaviors if entry[0] == selected_behavior), None)
    if found is None:
        return "", "", ""

    # Return values to populate textboxes
    label, (meaning, marks) = found
    return label, meaning, marks
# Greeting shown as the first chatbot message.
# NOTE(review): the closing delimiter was missing. It is restored before the
# "#If you want..." line, which is treated as a disabled sentence rather than
# user-visible text — confirm.
welcome_message = """
Hello! I'm your AI assistant.
I can help you analyze your video sessions following your instructions.
To get started, please upload a video or add your behaviors to the Behavior Bank using the Behavior Manager.
"""
#If you want to tell me about the people in the video, please name them starting from left to right.
# Custom stylesheet passed to gr.Blocks(css=css).
# NOTE(review): the assignment line, the rule-closing braces and the closing
# quotes were missing and are restored here — confirm no rules were lost.
css = """
body {
    background-color: #edf1fa; /* offwhite */
}
.gradio-container {
    background-color: #edf1fa; /* offwhite */
}
.column-form .wrap {
    flex-direction: column;
}
.sidebar {
    background: #ffffff;
    padding: 10px;
    border-right: 1px solid #dee2e6;
}
.content {
    padding: 10px;
}
"""
# Gradio Demo
# Builds the Gradio UI: a video panel, a chat panel with a behavior bank, and a
# behavior-manager sidebar column that starts hidden.
# NOTE(review): indentation is missing throughout this block, so the nesting of
# rows and columns cannot be read from this text — restore it before running.
with gr.Blocks(theme='base', css=css, title="Soap.AI") as demo:
gr.Markdown("# 🤖 AI-Supported SOAP Generation")
with gr.Row():
with gr.Column():
video = gr.Video(label="Video", visible=True, height=360, container=True)
with gr.Row():
with gr.Column(min_width=1, scale=1):
video_upload_button = gr.Button("Analyze Video", variant="primary")
with gr.Column(min_width=1, scale=1):
# NOTE(review): the line below looks like a button creation fused with the
# argument tails of two event bindings whose callables are not visible — restore.
example_video_button = gr.Button("Load Example Video"), inputs=video, outputs=video), None, outputs=video)
with gr.Column():
chat_section = gr.Group(visible=True)
with chat_section:
# The chat opens with welcome_message as the first assistant entry.
chatbot = gr.Chatbot(elem_id="chatbot",
value=[[None, welcome_message]],
avatar_images=(None, "./avatar.webp"))
with gr.Row():
txt = gr.Textbox(show_label=False, placeholder="Type here!")
with gr.Row():
send_btn = gr.Button("Send Message", elem_id="send-btn", variant="primary")
clear_btn = gr.Button("Clear Chat", elem_id="clear-btn")
with gr.Row():
behaivor_bank = gr.CheckboxGroup(label="Behavior Bank",
info="A space to store all the behaviors you want to analyze.")
open_sidebar_btn = gr.Button("Show Behavior Manager", scale=0)
close_sidebar_btn = gr.Button("Hide Behavior Manager", visible=False, scale=0)
# Submitting the textbox first runs user_input, then bot_response.
# NOTE(review): the second and third lines below carry the tails of further
# bindings (presumably for send_btn and clear_btn) whose heads are missing — restore.
txt.submit(user_input, [txt, chatbot], [txt, chatbot], queue=False).then(
bot_response, chatbot, chatbot), [txt, chatbot], [txt, chatbot], queue=False).then(
bot_response, chatbot, chatbot) None, None, chatbot, queue=False)
# Define a sidebar column that is initially hidden
with gr.Column(visible=False, min_width=200, scale=0.5, elem_classes="sidebar") as sidebar:
# NOTE(review): no `choices` argument is visible for this dropdown — verify.
behavior_dropdown = gr.Dropdown(label="Behavior Collection",
info="Choose a behavior to add to the bank, edit or remove.")
with gr.Row():
add_toBank_button = gr.Button("Add Behavior to Bank", variant="primary")
edit_button = gr.Button("Edit Behavior")
delete_button = gr.Button("Remove Behavior")
with gr.Row():
name_input = gr.Textbox(label="Behavior Name",
placeholder="(e.g., IBR)",
info="The name you give to the specific behavior you're tracking or analyzing.")
timestamps_input = gr.Textbox(label="Timestamps MM:SS",
placeholder="(e.g., (01:15,01:35) )",
info="The exact times during a session when you saw the behavior. The first two digits represent minutes and the last two digits represent seconds.")
definition_input = gr.Textbox(lines=3,
label="Behavior Definition",
placeholder="(e.g., the child's skill in using behavior(s) to elicit aid in obtaining an object, or object related event)",
info="Provide a clear definition of the behavior.")
with gr.Row():
# NOTE(review): the lines below are the argument tails of four event bindings
# fused onto the button creation; their component names and callables are not
# visible. The input/output lists line up with add_or_update_behavior,
# add_to_behaivor_bank, delete_behavior and edit_behavior — confirm and restore.
submit_button = gr.Button("Save Behavior", variant="primary"),
inputs=[name_input, definition_input, timestamps_input, behavior_dropdown],
outputs=[behavior_dropdown, behaivor_bank, name_input, definition_input, timestamps_input]),
inputs=[behavior_dropdown, behaivor_bank],
outputs=[behaivor_bank, behavior_dropdown]),
inputs=[behavior_dropdown, behaivor_bank],
outputs=[behavior_dropdown, behaivor_bank]),
outputs=[name_input, definition_input, timestamps_input])
# NOTE(review): in the two sections below, the start of each event binding has
# been fused into the preceding comment line (note the trailing "{") — restore.
# Function to open the sidebar {
open_sidebar_btn: gr.Button(visible=False),
close_sidebar_btn: gr.Button(visible=True),
sidebar: gr.Column(visible=True)
}, outputs=[open_sidebar_btn, close_sidebar_btn, sidebar])
# Function to close the sidebar {
open_sidebar_btn: gr.Button(visible=True),
close_sidebar_btn: gr.Button(visible=False),
sidebar: gr.Column(visible=False)
}, outputs=[open_sidebar_btn, close_sidebar_btn, sidebar])
# Launch the demo
# NOTE(review): no launch call is visible after this comment — verify.