import gradio as gr import plotly.graph_objs as go import numpy as np import time from openai import OpenAI import os from hardCodedData import * from Helper import * import cv2 from moviepy.editor import VideoFileClip import time import base64 import whisperx import gc from moviepy.editor import VideoFileClip from dotenv import load_dotenv load_dotenv() ''' Model Information GPT4o ''' import openai api_key = os.getenv("OPENAI_API_KEY") client = openai.OpenAI( api_key=api_key, base_url="https://openai.gateway.salt-lab.org/v1", ) MODEL="gpt-4o" # Whisperx config device = "cpu" batch_size = 16 # reduce if low on GPU mem compute_type = "int8" # change to "int8" if low on GPU mem (may reduce accuracy) from faster_whisper.transcribe import TranscriptionOptions # Initialize TranscriptionOptions with the required arguments default_asr_options = TranscriptionOptions( beam_size=5, best_of=5, patience=0.0, length_penalty=1.0, repetition_penalty=1.0, no_repeat_ngram_size=0, log_prob_threshold=-1.0, no_speech_threshold=0.6, compression_ratio_threshold=2.4, condition_on_previous_text=True, prompt_reset_on_temperature=True, temperatures=[0.0], initial_prompt=None, prefix=None, suppress_blank=True, suppress_tokens=[], without_timestamps=False, max_initial_timestamp=1.0, word_timestamps=False, prepend_punctuations="\"'“¿([{-", append_punctuations="\"'.。,,!!??::”)]}、", max_new_tokens=512, clip_timestamps=True, hallucination_silence_threshold=0.5 ) # Load the model using whisperx.load_model model = whisperx.load_model("large-v2", device, compute_type=compute_type) ''' Video ''' video_file = None audio_path=None base64Frames = [] transcript='''Dialogue: A take-off is something that happens in the story that gets everything going. start: 4 end: 8 Dialogue: It can be something scary, something funny, or even a problem. start: 9 end: 15 Dialogue: All stories have a take-off or a problem that starts the story off and makes the characters do something or get into action. start: 15 end: 23 Dialogue: Can you guys think of some scary things that might happen to get a story going? start: 24 end: 29 Dialogue: What would make you run? start: 30 end: 31 Dialogue: A bear? start: 32 end: 33 Dialogue: A bear. start: 33 end: 33 Dialogue: But let's say you guys are going to go home and tell your mom a story about what happened today in school. start: 34 end: 39 Dialogue: And let's say you say, well, first, this big guy came and got us out of class to learn how to tell stories. start: 40 end: 46 Dialogue: And we were sitting in the classroom. start: 46 end: 48 Dialogue: I was sitting in the classroom with Jared, Jared, and Jacob, when all of a sudden, it's about those words, all of a sudden, a grizzly bear walked through the door. start: 49 end: 60 Dialogue: Would that be a problem? start: 62 end: 63 Dialogue: Yeah. start: 65 end: 65 Dialogue: Okay. start: 65 end: 66 Dialogue: Would that be our takeoff? start: 66 end: 69 Dialogue: Yeah. start: 69 end: 69 Dialogue: So what's our takeoff in that story? start: 69 end: 70 Dialogue: A grizzly bear walked through the door. start: 72 end: 73 Dialogue: Or we could say, all of a sudden, a spaceship landed outside the school. start: 76 end: 81 Dialogue: Or a flying saucer. start: 81 end: 82 Dialogue: A flying saucer landed outside the school. start: 82 end: 85 Dialogue: Would that be a good takeoff to a story? start: 85 end: 87 Dialogue: How about start: 88 end: 88 Dialogue: Can you think of some funny things that might happen to get a story going? start: 91 end: 94 Dialogue: A clown came in through the door. start: 95 end: 98 Dialogue: Or my chair broke and I fell on the floor. start: 100 end: 103 Dialogue: What's this icon called? start: 105 end: 107 Dialogue: Takeoff! start: 107 end: 109 Dialogue: Tell me what kinds of things can get a story started. start: 110 end: 112 Dialogue: scary, funny, or a problem. start: 116 end: 120 ''' # transcript="" def process_video(video_path, seconds_per_frame=2, target_width=320, target_height=180): global audio_path base64Frames = [] base_video_path, _ = os.path.splitext(video_path) video = cv2.VideoCapture(video_path) total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) fps = video.get(cv2.CAP_PROP_FPS) frames_to_skip = int(fps * seconds_per_frame) curr_frame = 0 # Retrieve and print the original width and height original_width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) original_height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) print(f"Original width: {original_width}, Original height: {original_height}") # Loop through the video and extract frames at specified sampling rate while curr_frame < total_frames - 1: video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame) success, frame = video.read() if not success: break # Resize the frame resized_frame = cv2.resize(frame, (target_width, target_height)) _, buffer = cv2.imencode(".jpg", resized_frame) base64Frames.append(base64.b64encode(buffer).decode("utf-8")) curr_frame += frames_to_skip video.release() # Extract audio from video audio_path = f"./TakeOff.mp3" clip = VideoFileClip(video_path) clip.audio.write_audiofile(audio_path, bitrate="32k") clip.audio.close() clip.close() # transcribe_video(audio_path) print(f"Extracted {len(base64Frames)} frames") print(f"Extracted audio to {audio_path}") return base64Frames, audio_path chat_history = [] chat_history.append({ "role": "system", "content": ( """ You are an assistant chatbot for a Speech Language Pathologist (SLP). Your task is to help analyze a provided video of a therapy session and answer questions accurately. Provide timestamps in MM:SS format as frames are given at 1 fps for specific events or behaviors mentioned. Analyse the video for IRB based on information below: Initiating Behavioral Request (IBR): the child's skill in using behavior(s) to elicit aid in obtaining an object, or object related event Instances of IBR: -Language: Listen for intelligible single words or greater verbal expressions the child uses to request an object or assistance. -React:Observe if the child extends their arm with an open palm towards the object or the adult. Do not consider grabbing as a --requesting gesture. -Point: Look for the child pointing at the object or direction where the object is located. -Give: Watch if the child hands a toy or object to the adult to request help. """ ) }) def transcribe_video(audio_path): global transcript if not audio_path: raise ValueError("Audio path is None") print(audio_path) audio = whisperx.load_audio(audio_path) result = model.transcribe(audio, batch_size=batch_size) model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device) result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False) hf_auth_token = os.getenv("HF_AUTH_TOKEN") diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_auth_token, device=device) diarize_segments = diarize_model(audio) dia_result = whisperx.assign_word_speakers(diarize_segments, result) for res in dia_result["segments"]: # transcript += "Speaker: " + str(res.get("speaker", None)) + "\n" transcript += "Dialogue: " + str(res["text"].lstrip()) + "\n" transcript += "start: " + str(int(res["start"])) + "\n" transcript += "end: " + str(int(res["end"])) + "\n" transcript += "\n" print(transcript) return transcript def handle_video(video=None): global video_file, base64Frames, audio_path, chat_history, transcript if video is None: # Load example video video = "./TakeOff.mp4" base64Frames, audio_path = process_video(video, seconds_per_frame=1, target_width=320, target_height=180) chat_history.append({ "role": "user", "content": [ {"type": "text", "text": "These are the frames from the video."}, *map(lambda x: {"type": "image_url", "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames) ] }) if transcript: chat_history[-1]['content'].append({ "type": "text", "text": "Also, below is the template of transcript from the video:\n" "Speaker: \n" "Dialogue: \n" "start: \n" "end: \n" f"Transcription: {transcript}" }) video_file = video return video_file ''' Chatbot ''' def new_prompt(prompt): global chat_history, video_file chat_history.append({"role": "user","content": prompt,}) MODEL="gpt-4o" print(chat_history) # print(transcript) try: if video_file: # Video exists and is processed response = client.chat.completions.create(model=MODEL,messages=chat_history,temperature=0,) else: # No video uploaded yet response = client.chat.completions.create(model=MODEL,messages=chat_history,temperature=0,) # Extract the text content from the response and append it to the chat history assistant_message = response.choices[0].message.content chat_history.append({'role': 'system', 'content': assistant_message}) print(assistant_message) except Exception as e: print("Error: ",e) assistant_message = "API rate limit has been reached. Please wait a moment and try again." chat_history.append({'role': 'system', 'content': assistant_message}) # except google.api_core.exceptions.ResourceExhausted: # assistant_message = "API rate limit has been reached. Please wait a moment and try again." # chat_history.append({'role': 'model', 'parts': [assistant_message]}) # except Exception as e: # assistant_message = f"An error occurred: {str(e)}" # chat_history.append({'role': 'model', 'parts': [assistant_message]}) return chat_history def user_input(user_message, history): return "", history + [[user_message, None]] def bot_response(history): user_message = history[-1][0] updated_history = new_prompt(user_message) assistant_message = updated_history[-1]['content'] history[-1][1] = assistant_message yield history ''' Behaivor box ''' initial_behaviors = [ ("Initiating Behavioral Request (IBR)", ("The child's skill in using behavior(s) to elicit aid in obtaining an object, or object related event", ["00:10", "00:45", "01:30"])), ("Initiating Joint Attention (IJA)", ("The child's skill in using behavior(s) to initiate shared attention to objects or events.", ["00:15", "00:50", "01:40"])), ("Responding to Joint Attention (RJA)", ("The child's skill in following the examiner’s line of regard and pointing gestures.", ["00:20", "01:00", "02:00"])), ("Initiating Social Interaction (ISI)", ("The child's skill at initiating turn-taking sequences and the tendency to tease the tester", ["00:20", "00:50", "02:00"])), ("Responding to Social Interaction (RSI)", ("The child’s skill in responding to turn-taking interactions initiated by the examiner.", ["00:20", "01:00", "02:00"])) ] behaviors = initial_behaviors behavior_bank = [] def add_or_update_behavior(name, definition, timestamps, selected_behavior): global behaviors, behavior_bank if selected_behavior: # Update existing behavior for i, (old_name, _) in enumerate(behaviors): if old_name == selected_behavior: behaviors[i] = (name, (definition, timestamps)) break # Update behavior in the bank if it exists behavior_bank = [name if b == selected_behavior else b for b in behavior_bank] else: # Add new behavior new_behavior = (name, (definition, timestamps)) behaviors.append(new_behavior) choices = [b[0] for b in behaviors] return gr.Dropdown(choices=choices, value=None, interactive=True), gr.CheckboxGroup(choices=behavior_bank, value=behavior_bank, interactive=True), "", "", "" def add_to_behaivor_bank(selected_behavior, checkbox_group_values): global behavior_bank if selected_behavior and selected_behavior not in checkbox_group_values: checkbox_group_values.append(selected_behavior) behavior_bank = checkbox_group_values return gr.CheckboxGroup(choices=checkbox_group_values, value=checkbox_group_values, interactive=True), gr.Dropdown(value=None,interactive=True) def delete_behavior(selected_behavior, checkbox_group_values): global behaviors, behavior_bank behaviors = [b for b in behaviors if b[0] != selected_behavior] behavior_bank = [b for b in behavior_bank if b != selected_behavior] updated_choices = [b[0] for b in behaviors] updated_checkbox_group = [cb for cb in checkbox_group_values if cb != selected_behavior] return gr.Dropdown(choices=updated_choices, value=None, interactive=True), gr.CheckboxGroup(choices=updated_checkbox_group, value=updated_checkbox_group, interactive=True) def edit_behavior(selected_behavior): for name, (definition, timestamps) in behaviors: if name == selected_behavior: # Return values to populate textboxes return name, definition, timestamps return "", "", "" welcome_message = """ Hello! I'm your AI assistant. I can help you analyze your video sessions following your instructions. To get started, please upload a video or add your behaviors to the Behavior Bank using the Behavior Manager. """ #If you want to tell me about the people in the video, please name them starting from left to right. css=""" body { background-color: #edf1fa; /* offwhite */ } .gradio-container { background-color: #edf1fa; /* offwhite */ } .column-form .wrap { flex-direction: column; } .sidebar { background: #ffffff; padding: 10px; border-right: 1px solid #dee2e6; } .content { padding: 10px; } """ ''' Gradio Demo ''' with gr.Blocks(theme='base', css=css, title="Soap.AI") as demo: gr.Markdown("# 🤖 AI-Supported SOAP Generation") with gr.Row(): with gr.Column(): video = gr.Video(label="Video", visible=True, height=360, container=True) with gr.Row(): with gr.Column(min_width=1, scale=1): video_upload_button = gr.Button("Analyze Video", variant="primary") with gr.Column(min_width=1, scale=1): example_video_button = gr.Button("Load Example Video") video_upload_button.click(handle_video, inputs=video, outputs=video) example_video_button.click(handle_video, None, outputs=video) with gr.Column(): chat_section = gr.Group(visible=True) with chat_section: chatbot = gr.Chatbot(elem_id="chatbot", container=True, likeable=True, value=[[None, welcome_message]], avatar_images=(None, "./avatar.webp")) with gr.Row(): txt = gr.Textbox(show_label=False, placeholder="Type here!") with gr.Row(): send_btn = gr.Button("Send Message", elem_id="send-btn", variant="primary") clear_btn = gr.Button("Clear Chat", elem_id="clear-btn") with gr.Row(): behaivor_bank = gr.CheckboxGroup(label="Behavior Bank", choices=[], interactive=True, info="A space to store all the behaviors you want to analyze.") open_sidebar_btn = gr.Button("Show Behavior Manager", scale=0) close_sidebar_btn = gr.Button("Hide Behavior Manager", visible=False, scale=0) txt.submit(user_input, [txt, chatbot], [txt, chatbot], queue=False).then( bot_response, chatbot, chatbot) send_btn.click(user_input, [txt, chatbot], [txt, chatbot], queue=False).then( bot_response, chatbot, chatbot) clear_btn.click(lambda: None, None, chatbot, queue=False) # Define a sidebar column that is initially hidden with gr.Column(visible=False, min_width=200, scale=0.5, elem_classes="sidebar") as sidebar: behavior_dropdown = gr.Dropdown(label="Behavior Collection", choices=behaviors, interactive=True, container=True, elem_classes="column-form", info="Choose a behavior to add to the bank, edit or remove.") with gr.Row(): add_toBank_button = gr.Button("Add Behavior to Bank", variant="primary") edit_button = gr.Button("Edit Behavior") delete_button = gr.Button("Remove Behavior") with gr.Row(): name_input = gr.Textbox(label="Behavior Name", placeholder="(e.g., IBR)", info="The name you give to the specific behavior you're tracking or analyzing.") timestamps_input = gr.Textbox(label="Timestamps MM:SS", placeholder="(e.g., (01:15,01:35) )", info="The exact times during a session when you saw the behavior. The first two digits represent minutes and the last two digits represent seconds.") definition_input = gr.Textbox(lines=3, label="Behavior Definition", placeholder="(e.g., the child's skill in using behavior(s) to elicit aid in obtaining an object, or object related event)", info="Provide a clear definition of the behavior.") with gr.Row(): submit_button = gr.Button("Save Behavior", variant="primary") submit_button.click(fn=add_or_update_behavior, inputs=[name_input, definition_input, timestamps_input, behavior_dropdown], outputs=[behavior_dropdown, behaivor_bank, name_input, definition_input, timestamps_input]) add_toBank_button.click(fn=add_to_behaivor_bank, inputs=[behavior_dropdown, behaivor_bank], outputs=[behaivor_bank, behavior_dropdown]) delete_button.click(fn=delete_behavior, inputs=[behavior_dropdown, behaivor_bank], outputs=[behavior_dropdown, behaivor_bank]) edit_button.click(fn=edit_behavior, inputs=[behavior_dropdown], outputs=[name_input, definition_input, timestamps_input]) # Function to open the sidebar open_sidebar_btn.click(lambda: { open_sidebar_btn: gr.Button(visible=False), close_sidebar_btn: gr.Button(visible=True), sidebar: gr.Column(visible=True) }, outputs=[open_sidebar_btn, close_sidebar_btn, sidebar]) # Function to close the sidebar close_sidebar_btn.click(lambda: { open_sidebar_btn: gr.Button(visible=True), close_sidebar_btn: gr.Button(visible=False), sidebar: gr.Column(visible=False) }, outputs=[open_sidebar_btn, close_sidebar_btn, sidebar]) # Launch the demo demo.launch(share=True)