# app.py
import gradio as gr
import librosa
import numpy as np
import os
import tempfile
import soundfile as sf
from collections import Counter
from speechbrain.inference.interfaces import foreign_class
import io
import matplotlib.pyplot as plt
import librosa.display
from PIL import Image  # For image conversion

# Try to import noisereduce (if not available, noise reduction will be skipped)
try:
    import noisereduce as nr
    NOISEREDUCE_AVAILABLE = True
except ImportError:
    NOISEREDUCE_AVAILABLE = False
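
# If noisereduce is not installed, the "Apply Noise Reduction" option below is
# silently skipped; install noisereduce to enable it.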

# Mapping from emotion labels to emojis
emotion_to_emoji = {
    "angry": "😠",
    "happy": "😊",
    "sad": "😒",
    "neutral": "😐",
    "excited": "πŸ˜„",
    "fear": "😨",
    "disgust": "🀒",
    "surprise": "😲"
}

def add_emoji_to_label(label):
    """Append an emoji corresponding to the emotion label."""
    emoji = emotion_to_emoji.get(label.lower(), "")
    return f"{label.capitalize()} {emoji}"

# Load the pre-trained SpeechBrain classifier
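# (the model is downloaded from the Hugging Face Hub on first run and cached locally)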
classifier = foreign_class(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
    run_opts={"device": "cpu"}  # Change to {"device": "cuda"} if GPU is available
)

def preprocess_audio(audio_file, apply_noise_reduction=False):
    """
    Load and preprocess the audio file:
      - Convert to 16kHz mono.
      - Optionally apply noise reduction.
      - Normalize the audio.
    Saves the processed audio to a temporary file and returns its path.
    """
    y, sr = librosa.load(audio_file, sr=16000, mono=True)
    if apply_noise_reduction and NOISEREDUCE_AVAILABLE:
        y = nr.reduce_noise(y=y, sr=sr)
    if np.max(np.abs(y)) > 0:
        y = y / np.max(np.abs(y))
    temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    temp_file.close()  # close the handle so the path can be reopened for writing on all platforms
    sf.write(temp_file.name, y, sr)
    return temp_file.name

def ensemble_prediction(audio_file, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
    """
    For longer audio files, split into overlapping segments, predict each segment,
    and return the majority-voted emotion label.
    """
    y, sr = librosa.load(audio_file, sr=16000, mono=True)
    total_duration = librosa.get_duration(y=y, sr=sr)
    
    if total_duration <= segment_duration:
        temp_file = preprocess_audio(audio_file, apply_noise_reduction)
        _, _, _, label = classifier.classify_file(temp_file)
        os.remove(temp_file)
        return label[0]

    # Slide a window of segment_duration seconds across the audio, advancing by
    # (segment_duration - overlap) seconds each step. Guard against a non-positive
    # step when overlap >= segment_duration, which would break np.arange below.
    step = max(segment_duration - overlap, 0.1)
    segments = []
    for start in np.arange(0, total_duration - segment_duration + 0.001, step):
        start_sample = int(start * sr)
        end_sample = int((start + segment_duration) * sr)
        segment_audio = y[start_sample:end_sample]
        temp_seg = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        temp_seg.close()
        sf.write(temp_seg.name, segment_audio, sr)
        segments.append(temp_seg.name)
    
    predictions = []
    for seg in segments:
        temp_file = preprocess_audio(seg, apply_noise_reduction)
        _, _, _, label = classifier.classify_file(temp_file)
        predictions.append(label[0])
        os.remove(temp_file)
        os.remove(seg)
    
    vote = Counter(predictions)
    most_common = vote.most_common(1)[0][0]
    return most_common

def predict_emotion(audio_file, use_ensemble=False, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
    """
    Predict emotion from an audio file and return the emotion with an emoji.
    """
    try:
        if use_ensemble:
            label = ensemble_prediction(audio_file, apply_noise_reduction, segment_duration, overlap)
        else:
            temp_file = preprocess_audio(audio_file, apply_noise_reduction)
            result = classifier.classify_file(temp_file)
            os.remove(temp_file)
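            # classify_file normally returns (out_prob, score, index, text_lab);
            # the predicted label string is the first element of text_lab.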
            if isinstance(result, tuple) and len(result) > 3:
                label = result[3][0]  # Extract predicted emotion label from the tuple
            else:
                label = str(result)
        return add_emoji_to_label(label.lower())
    except Exception as e:
        return f"Error processing file: {str(e)}"

def plot_waveform(audio_file):
    """
    Generate and return a waveform plot image (as a PIL Image) for the given audio file.
    """
    y, sr = librosa.load(audio_file, sr=16000, mono=True)
    plt.figure(figsize=(10, 3))
    librosa.display.waveshow(y, sr=sr)
    plt.title("Waveform")
    buf = io.BytesIO()
    plt.savefig(buf, format="png")
    plt.close()
    buf.seek(0)
    return Image.open(buf)

def predict_and_plot(audio_file, use_ensemble, apply_noise_reduction, segment_duration, overlap):
    """
    Run emotion prediction and generate a waveform plot.
    Returns a tuple: (emotion label with emoji, waveform image as a PIL Image).
    """
    emotion = predict_emotion(audio_file, use_ensemble, apply_noise_reduction, segment_duration, overlap)
    waveform = plot_waveform(audio_file)
    return emotion, waveform
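
# Build the Gradio interface: a tabbed layout with the recognizer on one tab
# and an About page on the other.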

with gr.Blocks(css=".gradio-container {background-color: #f7f7f7; font-family: Arial;}") as demo:
    gr.Markdown("<h1 style='text-align: center;'>Enhanced Emotion Recognition</h1>")
    gr.Markdown(
        "Upload an audio file, and the model will predict the emotion using a wav2vec2 model fine-tuned on IEMOCAP data. "
        "The prediction is accompanied by an emoji in the output, and you can also view the audio's waveform. "
        "Use the options below to adjust ensemble prediction and noise reduction settings."
    )
    
    with gr.Tabs():
        with gr.TabItem("Emotion Recognition"):
            with gr.Row():
                audio_input = gr.Audio(type="filepath", label="Upload Audio")
            use_ensemble = gr.Checkbox(label="Use Ensemble Prediction (for long audio)", value=False)
            apply_noise_reduction = gr.Checkbox(label="Apply Noise Reduction", value=False)
            with gr.Row():
                segment_duration = gr.Slider(minimum=1.0, maximum=10.0, step=0.5, value=3.0, label="Segment Duration (s)")
                overlap = gr.Slider(minimum=0.0, maximum=5.0, step=0.5, value=1.0, label="Segment Overlap (s)")
            predict_button = gr.Button("Predict Emotion")
            result_text = gr.Textbox(label="Predicted Emotion")
            waveform_image = gr.Image(label="Audio Waveform", type="pil")
            
            predict_button.click(
                predict_and_plot,
                inputs=[audio_input, use_ensemble, apply_noise_reduction, segment_duration, overlap],
                outputs=[result_text, waveform_image]
            )
        
        with gr.TabItem("About"):
            gr.Markdown("""
**Enhanced Emotion Recognition App**

- **Model:** SpeechBrain's wav2vec2 model fine-tuned on IEMOCAP for emotion recognition.
- **Features:**
  - Ensemble Prediction for long audio files.
  - Optional Noise Reduction.
  - Visualization of the audio waveform.
  - Emoji representation of the predicted emotion in the output.

**Credits:**
- [SpeechBrain](https://speechbrain.github.io)
- [Gradio](https://gradio.app)
            """)

if __name__ == "__main__":
    demo.launch()