# Hugging Face Space: dj-dawgs-ipd/IPD-Audio-Pipeline
# Space status shown on the page: Runtime error
# File size: 3,211 Bytes
# Revision shown on the page: 0d2e8f8

from gradio_client import Client, handle_file
import pandas as pd
import gradio as gr
from vosk import Model, KaldiRecognizer
import json
import wave

# Remote Gradio clients, one per hosted classifier Space.
clientEngText = Client("dj-dawgs-ipd/IPD-Text-English-Finetune")
clientHingText = Client("dj-dawgs-ipd/IPD-Text-Hinglish")
clientAud = Client("dj-dawgs-ipd/IPD_Audio_HuBERT")

# Profanity word list: the "profanity_hn" column of the CSV shipped
# alongside this script.
profanity_df = pd.read_csv("Hinglish_Profanity_List.csv", encoding="utf-8")
profanity_hn = profanity_df["profanity_hn"]

# Speech-to-text model shared by every stt_vosk() call.
vosk_model = Model(lang="en-us")


# import whisper
# def stt_whisper(file_path):
#     model = whisper.load_model("base")
#     try:
#         result = model.transcribe(file_path)
#         return result["text"]
#     except Exception as e:
#         print(e)
#         return ""


def stt_vosk(file_path):
    """Transcribe a WAV file with the module-level Vosk model.

    Args:
        file_path: Path of a file that ``wave.open`` can read.

    Returns:
        The ``"text"`` field of the recognizer's final result, or ``""`` when
        the file cannot be opened or recognition fails. This is best-effort:
        failures are logged and never raised to the caller.
    """
    # Local import: the file's import block does not bring in logging.
    import logging

    try:
        # The context manager closes the file even if recognition raises.
        with wave.open(file_path, "rb") as wf:
            rec = KaldiRecognizer(vosk_model, wf.getframerate())
            rec.SetWords(True)
            rec.SetPartialWords(True)
            # Feed the audio to the recognizer 4000 frames at a time.
            while True:
                chunk = wf.readframes(4000)
                if not chunk:
                    break
                rec.AcceptWaveform(chunk)
        result = json.loads(rec.FinalResult())
        return result.get("text", "")
    except Exception:
        # Keep the original "empty transcript on failure" contract, but leave
        # a trace and let KeyboardInterrupt/SystemExit propagate.
        logging.getLogger(__name__).exception(
            "Vosk transcription failed for %r", file_path
        )
        return ""


def extract_text(audio_path):
    """Return the lowercased Vosk transcript of the file at *audio_path*."""
    transcript = stt_vosk(audio_path)
    return transcript.lower()


def predict_hate_speech(audio_path):
    """Classify an audio clip as hate / not hate.

    Combines three remote classifiers (audio, English text, Hinglish text)
    with a word-list profanity check on the clip's transcript.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        A list of strings:

        * ``["hate", "Result: Profanity Found", "Text: <transcript>"]`` when
          a word from the profanity list occurs in the transcript;
        * ``["hate", "Confidence: <score>", "Text: <transcript>"]`` when any
          classifier flags the clip with a score above the threshold;
        * ``["not_hate", "No hate found, yay!"]`` otherwise.
    """
    threshold = 0.6

    audResult = clientAud.predict(
        audio_path=handle_file(audio_path),
        api_name="/predict"
    )
    # The audio reply is a string with single-quoted keys/values; swap the
    # quotes so it parses as JSON.
    audResult = json.loads(audResult.replace("'", '"'))

    stt_text = extract_text(audio_path)
    # Only the first 200 characters of the transcript go to the text models.
    snippet = stt_text[:200]

    engResult = clientEngText.predict(
        text=snippet,
        api_name="/predict"
    )

    hingResult = clientHingText.predict(
        text=snippet,
        api_name="/predict"
    )

    # Split the transcript once instead of once per profanity entry.
    spoken_words = set(stt_text.split())
    profanityFound = any(word in spoken_words for word in profanity_hn)

    # Each text result is indexed as (label, score). "NEITHER" and "NAG" are
    # the labels treated as the non-hate outcome of the English and Hinglish
    # models respectively.
    eng_flagged = engResult[0] != "NEITHER"
    hing_flagged = hingResult[0] != "NAG"
    # NOTE(review): the trailing newline is matched exactly as before;
    # confirm it against the audio Space's actual output.
    aud_flagged = audResult['Classification'] == 'Hate Speech\n'

    isHate = (
        (eng_flagged and engResult[1] > threshold)
        or (hing_flagged and hingResult[1] > threshold)
        or (aud_flagged and audResult['Confidence'] > threshold)
    )

    # Express every score as "confidence that the clip is hate": a score for a
    # non-hate label is inverted. The Hinglish score previously compared its
    # label against "NEITHER", so a "NAG" score was never inverted.
    engConf = engResult[1] if eng_flagged else (1 - engResult[1])
    hingConf = hingResult[1] if hing_flagged else (1 - hingResult[1])
    audConf = audResult['Confidence'] if aud_flagged else (1 - audResult['Confidence'])

    confidence = (engConf + hingConf + audConf) / 3

    if profanityFound:
        return ["hate", "Result: Profanity Found", f"Text: {stt_text}"]
    if isHate:
        return ["hate", f"Confidence: {confidence}", f"Text: {stt_text}"]

    return ["not_hate", "No hate found, yay!"]


# Gradio front end: one audio input, handed to predict_hate_speech as a
# file path, and one text box showing whatever the function returns.
iface = gr.Interface(
    title="Hate Speech Audio Pipeline",
    description="Upload an audio file to detect potential hate speech content.",
    fn=predict_hate_speech,
    inputs=gr.Audio(label="Upload Audio", type="filepath"),
    outputs=gr.Textbox(label="Hate Speech Analysis"),
    examples=[["hate_video_3_3_snippet2.wav"]],
    allow_flagging="manual",
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()