Spaces:

Boltz79
/

Sentiment-Analysis

Sleeping

App Files Community

Boltz79 committed on Feb 8

Commit

53d1efd

verified ·

1 Parent(s): 3f27f30

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -42

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import gradio as gr
 import librosa
 import numpy as np
@@ -5,6 +6,32 @@ import os
 import tempfile
 from collections import Counter
 from speechbrain.inference.interfaces import foreign_class
 # Load the pre-trained SpeechBrain classifier (Emotion Recognition with wav2vec2 on IEMOCAP)
 classifier = foreign_class(
@@ -14,13 +41,6 @@ classifier = foreign_class(
     run_opts={"device": "cpu"}  # Change to {"device": "cuda"} if GPU is available
 )
-# Try to import noisereduce (if not available, noise reduction will be skipped)
-try:
-    import noisereduce as nr
-    NOISEREDUCE_AVAILABLE = True
-except ImportError:
-    NOISEREDUCE_AVAILABLE = False
 def preprocess_audio(audio_file, apply_noise_reduction=False):
     """
     Load and preprocess the audio file:
@@ -29,18 +49,14 @@ def preprocess_audio(audio_file, apply_noise_reduction=False):
       - Normalize the audio.
     The processed audio is saved to a temporary file and its path is returned.
     """
-    # Load audio (resampled to 16kHz and in mono)
     y, sr = librosa.load(audio_file, sr=16000, mono=True)
-    # Apply noise reduction if requested and available
     if apply_noise_reduction and NOISEREDUCE_AVAILABLE:
         y = nr.reduce_noise(y=y, sr=sr)
-    # Normalize the audio (scale to -1 to 1)
     if np.max(np.abs(y)) > 0:
         y = y / np.max(np.abs(y))
-    # Write the preprocessed audio to a temporary WAV file
     temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
     import soundfile as sf
     sf.write(temp_file.name, y, sr)
@@ -48,34 +64,29 @@ def preprocess_audio(audio_file, apply_noise_reduction=False):
 def ensemble_prediction(audio_file, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
     """
-    For audio files longer than a given segment duration, split the file into overlapping segments,
-    predict the emotion for each segment, and then return the majority-voted label.
     """
-    # Load audio
     y, sr = librosa.load(audio_file, sr=16000, mono=True)
     total_duration = librosa.get_duration(y=y, sr=sr)
-    # If the audio is short, just process it directly
     if total_duration <= segment_duration:
         temp_file = preprocess_audio(audio_file, apply_noise_reduction)
         _, _, _, label = classifier.classify_file(temp_file)
         os.remove(temp_file)
         return label
-    # Split the audio into overlapping segments
     step = segment_duration - overlap
     segments = []
     for start in np.arange(0, total_duration - segment_duration + 0.001, step):
         start_sample = int(start * sr)
         end_sample = int((start + segment_duration) * sr)
         segment_audio = y[start_sample:end_sample]
-        # Save the segment as a temporary file
         temp_seg = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
         import soundfile as sf
         sf.write(temp_seg.name, segment_audio, sr)
         segments.append(temp_seg.name)
-    # Process each segment and collect predictions
     predictions = []
     for seg in segments:
         temp_file = preprocess_audio(seg, apply_noise_reduction)
@@ -84,46 +95,94 @@ def ensemble_prediction(audio_file, apply_noise_reduction=False, segment_duratio
         os.remove(temp_file)
         os.remove(seg)
-    # Determine the final label via majority vote
     vote = Counter(predictions)
     most_common = vote.most_common(1)[0][0]
     return most_common
-def predict_emotion(audio_file, use_ensemble=False, apply_noise_reduction=False):
     """
     Main prediction function.
-      - If use_ensemble is True, the audio is split into segments and ensemble prediction is used.
-      - Otherwise, the audio is processed as a whole.
     """
     try:
         if use_ensemble:
-            label = ensemble_prediction(audio_file, apply_noise_reduction)
         else:
             temp_file = preprocess_audio(audio_file, apply_noise_reduction)
             _, _, _, label = classifier.classify_file(temp_file)
             os.remove(temp_file)
-        return label
     except Exception as e:
         return f"Error processing file: {str(e)}"
-# Define the Gradio interface with additional options for ensemble prediction and noise reduction
-iface = gr.Interface(
-    fn=predict_emotion,
-    inputs=[
-        gr.Audio(type="filepath", label="Upload Audio"),
-        gr.Checkbox(label="Use Ensemble Prediction (for long audio)", value=False),
-        gr.Checkbox(label="Apply Noise Reduction", value=False)
-    ],
-    outputs="text",
-    title="Enhanced Emotion Recognition",
-    description=(
-        "Upload an audio file (expected 16kHz, mono) and the model will predict the emotion "
-        "using a wav2vec2 model fine-tuned on IEMOCAP data.\n\n"
-        "Options:\n"
-        " - Use Ensemble Prediction: For long audio, the file is split into segments and predictions are aggregated.\n"
-        " - Apply Noise Reduction: Applies a noise reduction filter before classification (requires noisereduce library)."
     )
-)
 if __name__ == "__main__":
-    iface.launch()

+# app.py
 import gradio as gr
 import librosa
 import numpy as np
 import tempfile
 from collections import Counter
 from speechbrain.inference.interfaces import foreign_class
+import io
+import matplotlib.pyplot as plt
+import librosa.display
+# Try to import noisereduce (if not available, noise reduction will be skipped)
+try:
+    import noisereduce as nr
+    NOISEREDUCE_AVAILABLE = True
+except ImportError:
+    NOISEREDUCE_AVAILABLE = False
+# Mapping from emotion labels to emojis.
+# Keys are lowercase because add_emoji_to_label() lowercases the label before lookup.
+emotion_to_emoji = {
+    "angry": "😠",
+    "happy": "😊",
+    "sad": "😢",
+    "neutral": "😐",
+    "excited": "😄",
+    "fear": "😨",
+    "disgust": "🤢",
+    "surprise": "😲",
+}
+# Short label codes mapped to the full names used as keys above.
+# NOTE(review): the IEMOCAP checkpoint is believed to emit abbreviated labels
+# ("ang", "hap", "neu", "sad") rather than full words - confirm against the
+# model's label encoder and extend this table if it reports other codes.
+_LABEL_ALIASES = {
+    "ang": "angry",
+    "hap": "happy",
+    "neu": "neutral",
+}
+def add_emoji_to_label(label):
+    """
+    Return a display string for an emotion label, followed by its emoji if one is known.
+      - `label` may be a string or a list/tuple whose first element is the label
+        (an empty sequence is treated as an empty label).
+      - Known short codes are expanded to their full name before display.
+      - The text is capitalized; when no emoji is mapped, the text is returned alone
+        with no trailing space.
+    """
+    # Some classifiers hand back the decoded label wrapped in a sequence.
+    if isinstance(label, (list, tuple)):
+        label = label[0] if label else ""
+    text = str(label).strip()
+    name = _LABEL_ALIASES.get(text.lower(), text)
+    emoji = emotion_to_emoji.get(name.lower(), "")
+    display = name.capitalize()
+    return f"{display} {emoji}" if emoji else display
 # Load the pre-trained SpeechBrain classifier (Emotion Recognition with wav2vec2 on IEMOCAP)
 classifier = foreign_class(
     run_opts={"device": "cpu"}  # Change to {"device": "cuda"} if GPU is available
 )
 def preprocess_audio(audio_file, apply_noise_reduction=False):
     """
     Load and preprocess the audio file:
       - Normalize the audio.
     The processed audio is saved to a temporary file and its path is returned.
     """
     y, sr = librosa.load(audio_file, sr=16000, mono=True)
     if apply_noise_reduction and NOISEREDUCE_AVAILABLE:
         y = nr.reduce_noise(y=y, sr=sr)
     if np.max(np.abs(y)) > 0:
         y = y / np.max(np.abs(y))
     temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
     import soundfile as sf
     sf.write(temp_file.name, y, sr)
 def ensemble_prediction(audio_file, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
     """
+    For long audio files, split the file into overlapping segments, predict the emotion for each segment,
+    and return the majority-voted label.
     """
     y, sr = librosa.load(audio_file, sr=16000, mono=True)
     total_duration = librosa.get_duration(y=y, sr=sr)
     if total_duration <= segment_duration:
         temp_file = preprocess_audio(audio_file, apply_noise_reduction)
         _, _, _, label = classifier.classify_file(temp_file)
         os.remove(temp_file)
         return label
     step = segment_duration - overlap
     segments = []
     for start in np.arange(0, total_duration - segment_duration + 0.001, step):
         start_sample = int(start * sr)
         end_sample = int((start + segment_duration) * sr)
         segment_audio = y[start_sample:end_sample]
         temp_seg = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
         import soundfile as sf
         sf.write(temp_seg.name, segment_audio, sr)
         segments.append(temp_seg.name)
     predictions = []
     for seg in segments:
         temp_file = preprocess_audio(seg, apply_noise_reduction)
         os.remove(temp_file)
         os.remove(seg)
     vote = Counter(predictions)
     most_common = vote.most_common(1)[0][0]
     return most_common
+def predict_emotion(audio_file, use_ensemble=False, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
     """
     Main prediction function.
+      - Uses ensemble prediction if enabled (segment_duration and overlap are forwarded to it).
+      - Otherwise, preprocesses the entire audio into a temporary file and classifies it at once.
+      - Returns the predicted emotion with an emoji.
+      - Never raises: any failure is returned as an "Error processing file: ..." string.
     """
     try:
         if use_ensemble:
+            label = ensemble_prediction(audio_file, apply_noise_reduction, segment_duration, overlap)
         else:
             temp_file = preprocess_audio(audio_file, apply_noise_reduction)
+            try:
+                _, _, _, label = classifier.classify_file(temp_file)
+            finally:
+                # The temporary WAV is created with delete=False, so it must be
+                # removed here even when classification raises.
+                os.remove(temp_file)
+        return add_emoji_to_label(label)
     except Exception as e:
         return f"Error processing file: {str(e)}"
+def plot_waveform(audio_file):
+    """
+    Generate a waveform plot for the given audio file and return the image bytes.
+      - The audio is loaded as 16 kHz mono, drawn on a 10x3 inch figure titled "Waveform".
+      - Returns the PNG-encoded plot as a bytes object.
+      - Errors from loading or plotting propagate to the caller.
+    """
+    y, sr = librosa.load(audio_file, sr=16000, mono=True)
+    fig = plt.figure(figsize=(10, 3))
+    try:
+        # waveshow/title draw on the current axes, which belong to `fig`.
+        librosa.display.waveshow(y, sr=sr)
+        plt.title("Waveform")
+        buf = io.BytesIO()
+        fig.savefig(buf, format="png")
+    finally:
+        # Close this specific figure even if plotting fails; pyplot otherwise
+        # keeps every open figure alive for the lifetime of the process.
+        plt.close(fig)
+    return buf.getvalue()
+def predict_and_plot(audio_file, use_ensemble, apply_noise_reduction, segment_duration, overlap):
+    """
+    Predict the emotion and also generate the waveform plot.
+    Returns a tuple: (emotion label with emoji, waveform image)
+      - The waveform image is an RGBA uint8 array decoded from the PNG bytes
+        produced by plot_waveform(), or None when the plot could not be generated.
+      - A plotting failure is logged and does not discard the prediction text.
+    """
+    import logging
+    emotion = predict_emotion(audio_file, use_ensemble, apply_noise_reduction, segment_duration, overlap)
+    try:
+        png_bytes = plot_waveform(audio_file)
+        # NOTE(review): gr.Image outputs are documented to take an array, a PIL
+        # image or a file path - not raw bytes - so the PNG is decoded first.
+        pixels = plt.imread(io.BytesIO(png_bytes), format="png")
+        # imread yields floats in [0, 1] for PNG input; convert to plain 8-bit.
+        waveform = (pixels * 255).round().astype(np.uint8)
+    except Exception:
+        # predict_emotion() already reports its own failures as text; keep that
+        # result and leave the image slot empty rather than failing the request.
+        logging.getLogger(__name__).exception("Could not render waveform")
+        waveform = None
+    return emotion, waveform
+# Build the enhanced UI using Gradio Blocks.
+# Layout: an intro header, then two tabs - the prediction form and an "About" page.
+with gr.Blocks(css=".gradio-container {background-color: #f7f7f7; font-family: Arial;}") as demo:
+    gr.Markdown("<h1 style='text-align: center;'>Enhanced Emotion Recognition 😊</h1>")
+    gr.Markdown(
+        "Upload an audio file and the model will predict the emotion using a wav2vec2 model fine-tuned on IEMOCAP data. "
+        "The prediction is accompanied by an emoji, and you can also view the audio's waveform. "
+        "Use the options below to adjust ensemble prediction and noise reduction settings."
     )
+    with gr.Tabs():
+        with gr.TabItem("Emotion Recognition"):
+            with gr.Row():
+                # type="filepath" hands the callback a path on disk.
+                # The input-source keyword is left at its default because its
+                # name differs between Gradio major versions.
+                audio_input = gr.Audio(type="filepath", label="Upload Audio")
+            use_ensemble = gr.Checkbox(label="Use Ensemble Prediction (for long audio)", value=False)
+            apply_noise_reduction = gr.Checkbox(label="Apply Noise Reduction", value=False)
+            with gr.Row():
+                # Both sliders are forwarded to the callback; they only matter
+                # when ensemble prediction is enabled.
+                segment_duration = gr.Slider(minimum=1.0, maximum=10.0, step=0.5, value=3.0, label="Segment Duration (s)")
+                overlap = gr.Slider(minimum=0.0, maximum=5.0, step=0.5, value=1.0, label="Segment Overlap (s)")
+            predict_button = gr.Button("Predict Emotion")
+            result_text = gr.Textbox(label="Predicted Emotion")
+            # Output-only component: "auto" is not an accepted `type`, so the
+            # default is used.
+            waveform_image = gr.Image(label="Audio Waveform")
+            # Input order must match the parameter order of predict_and_plot().
+            predict_button.click(
+                predict_and_plot,
+                inputs=[audio_input, use_ensemble, apply_noise_reduction, segment_duration, overlap],
+                outputs=[result_text, waveform_image]
+            )
+        with gr.TabItem("About"):
+            gr.Markdown("""
+            **Enhanced Emotion Recognition App**
+            - **Model:** SpeechBrain's wav2vec2 model fine-tuned on IEMOCAP for emotion recognition.
+            - **Features:**
+              - Ensemble Prediction for long audio files.
+              - Optional Noise Reduction.
+              - Visualization of the audio waveform.
+              - Emoji representation of the predicted emotion.
+            **Credits:**
+            - [SpeechBrain](https://speechbrain.github.io)
+            - [Gradio](https://gradio.app)
+            """)
 if __name__ == "__main__":
+    demo.launch()