Dpngtm committed
Commit fc0b2dd · verified · 1 Parent(s): e61150e

Update app.py

Files changed (1)
  1. app.py +42 -20
app.py CHANGED
@@ -3,10 +3,13 @@ import torch
 from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
 import torchaudio

+# Define emotion labels (use the same order as during training)
+emotion_labels = ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]
+
 # Load model and processor
 model_name = "Dpngtm/wave2vec2-emotion-recognition" # Replace with your model's Hugging Face Hub path
 model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
-processor = Wav2Vec2Processor.from_pretrained(model_name)
+processor = Wav2Vec2Processor.from_pretrained(model_name, num_labels=len(emotion_labels))

 # Define device (use GPU if available)
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -14,24 +17,44 @@ model.to(device)

 # Preprocessing and inference function
 def recognize_emotion(audio):
-    # Load and resample audio to 16kHz
-    speech_array, sampling_rate = torchaudio.load(audio)
-    if sampling_rate != 16000:
-        resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
-        speech_array = resampler(speech_array)
-    speech_array = speech_array.mean(dim=0).numpy() # Convert to mono if multi-channel
-
-    # Process input and make predictions
-    inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt", padding=True)
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-    with torch.no_grad():
-        logits = model(**inputs).logits
-        predicted_id = torch.argmax(logits, dim=-1).item()
-
-    # Define emotion labels (use the same order as during training)
-    # Emotion labels mapped to indices
-    emotion_labels = ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]
-    return emotion_labels[predicted_id]
+    """
+    Predicts the emotion from an audio file using the fine-tuned Wav2Vec2 model.
+
+    Args:
+        audio (str or file-like object): Path or file-like object for the audio file to predict emotion for.
+
+    Returns:
+        str: Predicted emotion label for the given audio file.
+    """
+    try:
+        # Determine if input is a file path or file-like object
+        audio_path = audio if isinstance(audio, str) else audio.name
+        print(f'Received audio file:', audio_path)
+
+        # Load and resample audio to 16kHz if necessary
+        speech_array, sampling_rate = torchaudio.load(audio_path)
+        print(f'Loaded audio with sampling rate:', sampling_rate)
+
+        if sampling_rate != 16000:
+            resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
+            speech_array = resampler(speech_array).squeeze().numpy()
+        else:
+            speech_array = speech_array.squeeze().numpy()
+
+        # Process input for the model
+        inputs = processor(speech_array, sampling_rate=16000, return_tensors='pt', padding=True)
+        input_values = inputs.input_values.to(device)
+
+        # Make predictions
+        with torch.no_grad():
+            logits = model(input_values).logits
+            predicted_label = torch.argmax(logits, dim=1).item()
+
+        # Map prediction to emotion label
+        emotion = emotion_labels[predicted_label]
+        return emotion
+    except Exception as e:
+        return f'Error during prediction: {str(e)}'

 # Gradio interface with both microphone and file upload options
 interface = gr.Interface(
@@ -42,6 +65,5 @@ interface = gr.Interface(
     description="Upload an audio file or record audio, and the model will predict the emotion."
 )

-
 # Launch the app
 interface.launch()
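
Note: the hunks above stop short of the full gr.Interface(...) call, so the fn, inputs, outputs, and any title arguments are not visible in this commit. Below is a minimal sketch of how the updated recognize_emotion is typically wired into such an interface and smoke-tested locally; the component arguments, the Gradio 4.x gr.Audio signature, the title, and the sample file path are assumptions rather than part of the commit, and the sketch presumes the model, processor, and recognize_emotion definitions from app.py above are already in scope.

import gradio as gr

# Hypothetical wiring for the arguments not shown in the diff (fn/inputs/outputs/title
# are assumptions). type="filepath" hands recognize_emotion a path string, which matches
# the isinstance(audio, str) branch added in this commit.
interface = gr.Interface(
    fn=recognize_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),  # Gradio 4.x signature
    outputs="text",
    title="Speech Emotion Recognition",  # assumed title, not from the diff
    description="Upload an audio file or record audio, and the model will predict the emotion.",
)

if __name__ == "__main__":
    # Optional local check of the load/resample/inference path on a hypothetical WAV file
    # before starting the UI; any sample rate works since the function resamples to 16 kHz.
    print(recognize_emotion("samples/example.wav"))
    interface.launch()

With type="filepath", Gradio writes uploaded or recorded audio to a temporary file and passes its path to the function, so both the file-upload and microphone sources exercise the same torchaudio.load branch shown in the diff.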