import gradio as gr
import torch
import whisper
from transformers import pipeline
### ────────────────────────────────────────
title = "Whisper to Emotion"
### ────────────────────────────────────────
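# Whisper's small multilingual checkpoint handles transcription, translation, and language identification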
whisper_model = whisper.load_model("small")
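# Pick a CUDA device when available; the emotion classifier is placed on it below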
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
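# DistilBERT fine-tuned for six basic emotions (sadness, joy, love, anger, fear, surprise)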
emotion_classifier = pipeline("text-classification", model="bhadresh-savani/distilbert-base-uncased-emotion", device=device)
def translate_and_classify(audio):
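    """Transcribe the recording with Whisper, translate it to English, and classify its emotion."""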
print("""
β
Sending audio to Whisper ...
β
""")
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
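    # Compute the log-Mel spectrogram on the same device the model lives on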
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
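    # Identify the spoken language (probs maps language codes to probabilities)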
    _, probs = whisper_model.detect_language(mel)
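    # Decode twice: transcribe in the source language and translate to English
    # (fp16=False so decoding also works on CPU)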
    transcript_options = whisper.DecodingOptions(task="transcribe", fp16=False)
    translate_options = whisper.DecodingOptions(task="translate", fp16=False)
    transcription = whisper.decode(whisper_model, mel, transcript_options)
    translation = whisper.decode(whisper_model, mel, translate_options)
print("Language Spoken: " + transcription.language)
print("Transcript: " + transcription.text)
print("Translated: " + translation.text)
    emotion = emotion_classifier(translation.text)
    detected_emotion = emotion[0]["label"]
    return transcription.text, detected_emotion
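
# Build the Gradio interface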
with gr.Blocks() as demo:
gr.Markdown("""
## Emotion Detection From Speech with Whisper
""")
    gr.HTML(
        """
        <p style="margin-bottom: 10px; font-size: 94%">
        Whisper is a general-purpose speech recognition model released by OpenAI that can perform multilingual
        speech recognition as well as speech translation and language identification. This makes it possible to
        detect emotion directly from speech in multiple languages.
        </p>
        """
    )
    with gr.Row():
        with gr.Column():
            # gr.Markdown(""" ### Record audio """)
            audio_input = gr.Audio(label="Record Audio Input", source="microphone", type="filepath")
            with gr.Row():
                transcribe_audio = gr.Button("Transcribe")
        with gr.Column():
            with gr.Row():
                transcript_output = gr.Textbox(label="Transcription in the language you spoke", lines=3)
            emotion_output = gr.Textbox(label="Detected Emotion")
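    # Run the full pipeline on click and fill both output boxes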
    transcribe_audio.click(translate_and_classify, inputs=audio_input, outputs=[transcript_output, emotion_output])
    gr.HTML('''
    <div class="footer">
        <p>Whisper Model by <a href="https://github.com/openai/whisper" style="text-decoration: underline;" target="_blank">OpenAI</a> - Emotion Detection Model by
        <a href="https://huggingface.co/bhadresh-savani/distilbert-base-uncased-emotion" style="text-decoration: underline;" target="_blank">bhadresh-savani</a>
        </p>
    </div>
    ''')
demo.launch()