import os
import torch
import librosa
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

# Fallback emotion labels. Note: the superb/wav2vec2-base-superb-er
# checkpoint is trained on IEMOCAP and predicts only four classes, so a
# hardcoded seven-label list may not match the loaded checkpoint; the
# label mapping in the model's config (model.config.id2label) is the
# authoritative source.
EMOTION_LABELS = [
    "Neutral", "Happy", "Sad", "Angry", "Fearful", "Disgusted", "Surprised"
]

# Model paths
MODEL_NAME = "superb/wav2vec2-base-superb-er"
# Look for model files directly in the current directory
LOCAL_MODEL_DIR = "."
LOCAL_FEATURE_EXTRACTOR_DIR = "."

from functools import lru_cache

@lru_cache(maxsize=1)
def load_model():
    """Load the emotion recognition model and feature extractor (cached after the first call)."""
    try:
        # Check if model weight files exist in the current directory
        # (newer checkpoints ship model.safetensors instead of pytorch_model.bin)
        model_files_exist = any(
            f.startswith(("pytorch_model", "model.safetensors"))
            for f in os.listdir(LOCAL_MODEL_DIR)
        )
        config_file_exists = os.path.exists(os.path.join(LOCAL_MODEL_DIR, "config.json"))
        feature_extractor_exists = os.path.exists(os.path.join(LOCAL_FEATURE_EXTRACTOR_DIR, "preprocessor_config.json"))
        
        if model_files_exist and config_file_exists and feature_extractor_exists:
            print("Loading model and feature extractor from current directory...")
            feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(LOCAL_FEATURE_EXTRACTOR_DIR)
            model = Wav2Vec2ForSequenceClassification.from_pretrained(LOCAL_MODEL_DIR)
        else:
            print("Local model files not found. Loading from Hugging Face...")
            feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
            model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME)
        
        return model, feature_extractor
    except Exception as e:
        print(f"Error loading model: {e}")
        # Fallback to using Auto classes if specific classes fail
        from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
        feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
        model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME)
        return model, feature_extractor

def predict_emotion(audio_path):
    """Predict emotion from audio file"""
    try:
        # Load model and feature extractor
        model, feature_extractor = load_model()
        
        # Load the audio, resampling to 16 kHz (the rate wav2vec2 expects)
        speech_array, sampling_rate = librosa.load(audio_path, sr=16000)
        
        # Process the audio
        inputs = feature_extractor(speech_array, sampling_rate=sampling_rate, return_tensors="pt")
        
        # Predict emotion
        with torch.no_grad():
            logits = model(**inputs).logits
        
        # Map the predicted class id to a human-readable label; prefer the
        # checkpoint's own id2label mapping so the names always match the
        # model that was actually loaded
        predicted_class_id = torch.argmax(logits, dim=-1).item()
        id2label = getattr(model.config, "id2label", None)
        if id2label:
            return id2label[predicted_class_id]
        return EMOTION_LABELS[predicted_class_id]
    
    except Exception as e:
        print(f"Error predicting emotion: {e}")
        return "Error: Could not predict emotion"

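# A hedged sketch (not part of the original script): if callers need
# confidence scores rather than a single top label, the logits returned by
# the model can be turned into a probability distribution with softmax.
# The tensor in the example comment is illustrative dummy data, not real
# model output.
def logits_to_probs(logits):
    """Convert a (batch, num_labels) logits tensor to per-class probabilities."""
    return torch.softmax(logits, dim=-1)

# Example with dummy logits for a four-class model:
#   probs = logits_to_probs(torch.tensor([[2.0, 0.5, -1.0, 0.1]]))
#   probs sums to 1.0 along the last dimension, and probs.argmax() is 0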
# For testing
if __name__ == "__main__":
    # Test with a file from current directory
    current_dir = "."
    wav_files = [f for f in os.listdir(current_dir) if f.lower().endswith(".wav")]
    
    if wav_files:
        test_file = wav_files[0]
        print(f"Testing with file: {test_file}")
        emotion = predict_emotion(test_file)
        print(f"Predicted emotion: {emotion}")
    else:
        print("No .wav files found in current directory for testing.")