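"""Gradio app that screens uploaded audio for user-selected misophonia
trigger sounds (chewing, sniffling, etc.) via zero-shot classification
with OpenAI Whisper, flagging any selected sound whose score exceeds a
user-chosen sensitivity threshold."""
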
import os

# Install OpenAI Whisper at startup (a common pattern in hosted Spaces).
os.system("pip install git+https://github.com/openai/whisper.git")

import whisper
import gradio as gr
import classify  # local module (assumed) providing the zero-shot scoring helpers
from whisper.tokenizer import get_tokenizer
from transformers import pipeline


# pull in emotion detection
# --- Add element for specification
# pull in text classification
# --- Add custom labels
# --- Associate labels with radio elements
# add logic to initiate mock notification when detected
# pull in misophonia-specific model

# Intended cache for loaded Whisper models; not used yet (the model is
# loaded fresh on every call in classify_toxicity).
model_cache = {}


# Static classes for now, but it would be best to have the user select from
# multiple sets, and to enter their own labels (see the sketch below).
class_options = {
    "misophonia": ["chewing", "breathing", "mouthsounds", "popping", "sneezing", "yawning", "smacking", "sniffling", "panting"]
}
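
# Hypothetical helper for the TODO above: let users type their own trigger
# labels (e.g. from a gr.Textbox) in addition to the static set. Not wired
# into the interface yet; shown only as a sketch.
def parse_custom_classes(raw_text):
    """Split user input like "chewing, tapping" into clean class names."""
    return [name.strip() for name in raw_text.split(",") if name.strip()]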

# Whisper ASR pipeline for transcription; the call is currently disabled
# inside classify_toxicity.
pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large")



def slider_logic(slider):
    # Map the 1-5 sensitivity setting to a score threshold: a more sensitive
    # setting (5) gives a lower threshold, so the tool intervenes more often.
    thresholds = {1: 0.88, 2: 0.78, 3: 0.67, 4: 0.56, 5: 0.45}
    # Fall back to the middle threshold for unexpected values; a non-numeric
    # fallback would break the later `score > threshold` comparison.
    return thresholds.get(slider, 0.67)
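# Example: slider_logic(3) -> 0.67, so at the middle setting only classes
# scoring above 0.67 trigger a notification.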

# Classify an uploaded audio clip against the user-selected trigger sounds.
def classify_toxicity(audio_file, selected_sounds, slider):
    # Transcribing the audio with the ASR pipeline is currently disabled:
    # transcribed_text = pipe(audio_file)["text"]

    threshold = slider_logic(slider)
    model = whisper.load_model("large")
    # model = model_cache[model_name]

    # Static class set for now (see the TODO comments above).
    classify_anxiety = "misophonia"
    class_names = class_options.get(classify_anxiety, [])
    print("class names", class_names, "classify_anxiety", classify_anxiety)

    tokenizer = get_tokenizer("large")

    # Average log-probability of each class name under Whisper's decoder
    # alone (no audio): an internal language-model prior to subtract from
    # the audio-conditioned scores below.
    internal_lm_average_logprobs = classify.calculate_internal_lm_average_logprobs(
        model=model,
        class_names=class_names,
        tokenizer=tokenizer,
    )
    audio_features = classify.calculate_audio_features(audio_file, model)
    # Average log-probability of each class name's tokens given the audio.
    average_logprobs = classify.calculate_average_logprobs(
        model=model,
        audio_features=audio_features,
        class_names=class_names,
        tokenizer=tokenizer,
    )
    # Remove the text-only prior and normalize into one probability per class.
    average_logprobs -= internal_lm_average_logprobs
    scores = average_logprobs.softmax(-1).tolist()
    
    class_score_dict = {class_name: score for class_name, score in zip(class_names, scores)}

    # Keep only the sounds the user selected, then flag any whose score
    # clears the sensitivity threshold.
    matching_label_score = {
        name: class_score_dict[name] for name in selected_sounds if name in class_score_dict
    }
    exceeding_threshold = [
        (label, score) for label, score in matching_label_score.items() if score > threshold
    ]
    affirm = f"Threshold exceeded with {exceeding_threshold}" if exceeding_threshold else ""

    return class_score_dict, affirm
    
# UI: trigger-sound checkboxes and a sensitivity slider; audio upload and a
# run button; outputs are per-class scores plus a notification message.
with gr.Blocks() as iface:
    with gr.Column():
        miso_sounds = gr.CheckboxGroup(["chewing", "breathing", "mouthsounds", "popping", "sneezing", "yawning", "smacking", "sniffling", "panting"])
        sense_slider = gr.Slider(minimum=1, maximum=5, step=1.0, label="How readily do you want the tool to intervene? 1 = in extreme cases and 5 = at every opportunity")
    with gr.Column():
        aud_input = gr.Audio(source="upload", type="filepath", label="Upload Audio File")
        submit_btn = gr.Button("Run")
    with gr.Column():
        # out_val = gr.Textbox()
        out_class = gr.Label()
        out_text = gr.Textbox()
    submit_btn.click(fn=classify_toxicity, inputs=[aud_input, miso_sounds, sense_slider], outputs=[out_class, out_text])


iface.launch()
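
# A quick manual check without the UI might look like this (hypothetical
# file path; requires the model download and the local classify module):
# scores, note = classify_toxicity("sample.wav", ["chewing", "sniffling"], 3)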