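"""Gradio Space for speech emotion recognition.

Loads the ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition model via
the transformers audio-classification pipeline, preprocesses recorded or
uploaded audio (mono, 16 kHz, at most 10 s), and displays the predicted
emotion scores as text and as a bar plot.
"""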
import gradio as gr
import numpy as np
import pandas as pd
import torch
from transformers import pipeline
import librosa
import soundfile as sf

class EmotionRecognizer:
    def __init__(self):
        self.classifier = pipeline(
            "audio-classification",
            model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
            device=0 if torch.cuda.is_available() else -1
        )
        self.target_sr = 16000  # Target sample rate for the model
        self.max_duration = 10  # Max audio duration in seconds

    def process_audio(self, audio_path):
        try:
            # Load audio file using soundfile (works better in Hugging Face Spaces)
            audio, orig_sr = sf.read(audio_path)

            # Convert stereo to mono if needed
            if len(audio.shape) > 1:
                audio = np.mean(audio, axis=1)

            # Resample if necessary
            if orig_sr != self.target_sr:
                audio = librosa.resample(
                    y=audio.astype(np.float32),
                    orig_sr=orig_sr,
                    target_sr=self.target_sr
                )
            else:
                audio = audio.astype(np.float32)

            # Normalize audio
            audio = librosa.util.normalize(audio)

            # Trim/pad audio to max duration
            max_samples = self.max_duration * self.target_sr
            if len(audio) > max_samples:
                audio = audio[:max_samples]
            else:
                audio = np.pad(audio, (0, max(0, max_samples - len(audio))))

            # Run classification
            results = self.classifier(
                {"array": audio, "sampling_rate": self.target_sr}
            )
            # Format output
            labels = [res["label"] for res in results]
            scores = [res["score"] * 100 for res in results]
            text_output = "\n".join([
                f"{label}: {score:.2f}%"
                for label, score in zip(labels, scores)
            ])

            # gr.BarPlot expects tabular data, so build a DataFrame
            # from the label/score pairs
            plot_data = pd.DataFrame({
                "labels": labels,
                "values": scores
            })

            return text_output, plot_data
        except Exception as e:
            error_msg = f"Error processing audio: {str(e)}"
            print(error_msg)
            return error_msg, None


def create_interface():
    recognizer = EmotionRecognizer()

    with gr.Blocks(title="Audio Emotion Recognition") as interface:
        gr.Markdown("# 🎙️ Audio Emotion Recognition")
        gr.Markdown("Record or upload audio (English speech, 3-10 seconds)")

        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Input Audio",
                    waveform_options={"waveform_progress_color": "#FF0066"}
                )
                submit_btn = gr.Button("Analyze", variant="primary")

            with gr.Column():
                text_output = gr.Textbox(
                    label="Emotion Analysis Results",
                    interactive=False
                )
                plot_output = gr.BarPlot(
                    label="Confidence Scores",
                    x="labels",
                    y="values",
                    color="labels",
                    height=300
                )

        submit_btn.click(
            fn=recognizer.process_audio,
            inputs=audio_input,
            outputs=[text_output, plot_output]
        )

        gr.Examples(
            examples=[
                "https://huggingface.co/spaces/echalabres/emotion-recognition/raw/main/example_angry.wav",
                "https://huggingface.co/spaces/echalabres/emotion-recognition/raw/main/example_happy.wav"
            ],
            inputs=audio_input,
            outputs=[text_output, plot_output],
            fn=recognizer.process_audio,
            cache_examples=True
        )

    return interface


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()