Commit 1f93035
1 Parent(s): e0b4905
Switch to HF based whisper-large-v2 model
app.py
CHANGED
@@ -9,40 +9,30 @@ title="Whisper to Emotion"
 
 ### ────────────────────────────────────────
 
-whisper_model = whisper.load_model("large")
-
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
+whisper_model = pipeline(
+    task="automatic-speech-recognition",
+    model="openai/whisper-large-v2",
+    chunk_length_s=30,
+    device=device,
+)
+
+all_special_ids = whisper_model.tokenizer.all_special_ids
+transcribe_token_id = all_special_ids[-5]
+translate_token_id = all_special_ids[-6]
+
 emotion_classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion')
 
 def translate_and_classify(audio):
+    task = "Transcribe in Spoken Language"
+    whisper_model.model.config.forced_decoder_ids = [[2, transcribe_token_id if task=="Transcribe in Spoken Language" else translate_token_id]]
+    text = whisper_model(audio)["text"]
 
-    print("""
-    ─
-    Sending audio to Whisper ...
-    ─
-    """)
-    audio = whisper.load_audio(audio)
-    audio = whisper.pad_or_trim(audio)
-
-    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
-
-    _, probs = whisper_model.detect_language(mel)
-
-    transcript_options = whisper.DecodingOptions(task="transcribe", fp16 = False)
-    translate_options = whisper.DecodingOptions(task="translate", fp16 = False)
-
-    transcription = whisper.decode(whisper_model, mel, transcript_options)
-    translation = whisper.decode(whisper_model, mel, translate_options)
-
-    print("Language Spoken: " + transcription.language)
-    print("Transcript: " + transcription.text)
-    print("Translated: " + translation.text)
-
-    emotion = emotion_classifier(translation.text)
+    emotion = emotion_classifier(text)
     detected_emotion = emotion[0]["label"]
    print("Detected Emotion: ", detected_emotion)
-    return
+    return text, detected_emotion
 
 css = """
 .gradio-container {
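For context on the change: the new code selects Whisper's task (transcribe vs. translate) by forcing a task token into the decoder, instead of passing whisper.DecodingOptions as the old code did. Below is a minimal, self-contained sketch of that pattern. It is an illustration, not the Space's exact code: "sample.wav" and the variable names asr, transcribe_token_id, and translate_token_id are placeholders, and it assumes transformers and torch are installed.

# Minimal sketch of the HF-pipeline pattern this commit adopts.
# Assumption: "sample.wav" is a placeholder audio file, not part of the Space.
import torch
from transformers import pipeline

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

asr = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v2",
    chunk_length_s=30,  # long-form audio is decoded in 30-second chunks
    device=device,
)

# Whisper's special-token list ends with ... <|translate|>, <|transcribe|>,
# <|startoflm|>, <|startofprev|>, <|nocaptions|>, <|notimestamps|>,
# which is why index -5 picks <|transcribe|> and -6 picks <|translate|>.
all_special_ids = asr.tokenizer.all_special_ids
transcribe_token_id = all_special_ids[-5]
translate_token_id = all_special_ids[-6]

# forced_decoder_ids pins a token at a decoder position: position 0 holds the
# <|startoftranscript|> start token, position 1 the (auto-detected) language
# token, and position 2 the task slot being forced here.
asr.model.config.forced_decoder_ids = [[2, transcribe_token_id]]
print(asr("sample.wav")["text"])  # transcript in the spoken language

asr.model.config.forced_decoder_ids = [[2, translate_token_id]]
print(asr("sample.wav")["text"])  # English translation of the same audio

Note the trade-off visible in the diff: the old whisper.decode path produced a detected language, a transcript, and a translation in one call, while the pipeline version runs a single task per call. On newer transformers releases the same forcing is usually expressed through model.generation_config or generate(language=..., task=...) rather than model.config.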