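"""Gradio Space: voice emotion analysis.

Classifies the emotion in short (3-6 second) recorded or uploaded audio clips
with the superb/wav2vec2-base-superb-er audio-classification pipeline and
displays the scores as text and as a bar plot.
"""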
import gradio as gr
import numpy as np
import pandas as pd  # gr.BarPlot expects a pandas DataFrame
import torch
from transformers import pipeline
import librosa
class EmotionRecognizer:
    def __init__(self):
        self.device = 0 if torch.cuda.is_available() else -1
        self.model = pipeline(
            "audio-classification",
            model="superb/wav2vec2-base-superb-er",
            device=self.device
        )
        self.target_sr = 16000  # Model's required sample rate
        self.max_duration = 6   # Optimal clip duration for this model (seconds)

    def process_audio(self, audio):
        if audio is None:
            return "Please record or upload an audio clip.", None
        try:
            # Handle Gradio audio input (sample_rate, audio_array)
            sample_rate, audio_array = audio
            # Convert stereo to mono if needed
            if len(audio_array.shape) > 1:
                audio_array = np.mean(audio_array, axis=1)
            # Convert to float32 and peak-normalize (guard against silent clips)
            audio_array = audio_array.astype(np.float32)
            peak = np.max(np.abs(audio_array))
            if peak > 0:
                audio_array /= peak
            # Resample if necessary
            if sample_rate != self.target_sr:
                audio_array = librosa.resample(
                    audio_array,
                    orig_sr=sample_rate,
                    target_sr=self.target_sr
                )
            # Trim to max duration
            max_samples = self.max_duration * self.target_sr
            if len(audio_array) > max_samples:
                audio_array = audio_array[:max_samples]
            # Run inference
            results = self.model({
                "array": audio_array,
                "sampling_rate": self.target_sr
            })
            # Format text output
            output_text = "\n".join(
                f"{res['label']}: {res['score'] * 100:.1f}%"
                for res in results
            )
            # Build a DataFrame for the bar plot (columns match the BarPlot x/y names)
            plot_data = pd.DataFrame({
                "labels": [res["label"] for res in results],
                "scores": [res["score"] * 100 for res in results]
            })
            return output_text, plot_data
        except Exception as e:
            return f"Error: {str(e)}", None
def create_interface():
    recognizer = EmotionRecognizer()
    with gr.Blocks(title="Voice Emotion Analysis") as app:
        gr.Markdown("# 🎤 Real-time Voice Emotion Analysis")
        gr.Markdown("Record or upload short audio clips (3-6 seconds)")
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="numpy",
                    label="Input Audio"
                )
                analyze_btn = gr.Button("Analyze Emotion", variant="primary")
            with gr.Column():
                output_text = gr.Textbox(label="Emotion Results", lines=4)
                output_plot = gr.BarPlot(
                    x="labels",
                    y="scores",
                    title="Emotion Distribution",
                    color="labels",
                    height=300
                )
        analyze_btn.click(
            fn=recognizer.process_audio,
            inputs=audio_input,
            outputs=[output_text, output_plot]
        )
    return app
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()