Spaces:

Dpngtm
/

Audio-Emotion-Recognition

Running

App Files Community

Dpngtm committed on Oct 29, 2024

Commit

0a3c034

verified ·

1 Parent(s): 87f6c9c

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -40

app.py CHANGED Viewed

@@ -16,70 +16,46 @@ processor = Wav2Vec2Processor.from_pretrained(model_name, num_labels=len(emotion
 # Define device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
-model.eval()  # Set model to evaluation mode
 def recognize_emotion(audio):
-    """
-    Predicts the emotion and confidence scores from an audio file.
-    Max duration: 60 seconds
-    """
     try:
         if audio is None:
             return {emotion: 0.0 for emotion in emotion_labels}
-        # Handle audio input
         audio_path = audio if isinstance(audio, str) else audio.name
-        # Load and resample audio
         speech_array, sampling_rate = torchaudio.load(audio_path)
-        # Check audio duration
         duration = speech_array.shape[1] / sampling_rate
-        if duration > 60:  # 60 seconds (1 minute) limit
             return {
                 "Error": "Audio too long (max 1 minute)",
                 **{emotion: 0.0 for emotion in emotion_labels}
             }
-        # Resample if needed
         if sampling_rate != 16000:
             resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
             speech_array = resampler(speech_array)
-        # Convert to mono if stereo
         if speech_array.shape[0] > 1:
             speech_array = torch.mean(speech_array, dim=0, keepdim=True)
-        # Normalize audio
         speech_array = speech_array / torch.max(torch.abs(speech_array))
-        # Convert to numpy and squeeze
         speech_array = speech_array.squeeze().numpy()
-        # Process input
-        inputs = processor(
-            speech_array,
-            sampling_rate=16000,
-            return_tensors='pt',
-            padding=True
-        )
         input_values = inputs.input_values.to(device)
-        # Get predictions
         with torch.no_grad():
             outputs = model(input_values)
             logits = outputs.logits
-            # Get probabilities using softmax
             probs = F.softmax(logits, dim=-1)[0].cpu().numpy()
-            # Get confidence scores for all emotions
             confidence_scores = {
-                emotion: round(float(prob) * 100, 2) # Convert to percentage with 2 decimal places
                 for emotion, prob in zip(emotion_labels, probs)
             }
-            # Sort confidence scores by value
             sorted_scores = dict(sorted(
                 confidence_scores.items(),
                 key=lambda x: x[1],
@@ -94,14 +70,13 @@ def recognize_emotion(audio):
             **{emotion: 0.0 for emotion in emotion_labels}
         }
-# Create Gradio interface
 interface = gr.Interface(
     fn=recognize_emotion,
     inputs=gr.Audio(
-        sources=["microphone", "upload"],
         type="filepath",
         label="Upload audio or record from microphone",
-        max_length=60  # Set max length to 60 seconds in Gradio interface
     ),
     outputs=gr.Label(
         num_top_classes=len(emotion_labels),
@@ -130,13 +105,13 @@ interface = gr.Interface(
     - Maximum audio length: 1 minute
     - Best results with clear speech and minimal background noise
     - Confidence scores are shown as percentages
-    """,
-# Launch the app
-interface.launch(
-    share=True,
-    debug=True,
-    server_name="0.0.0.0",
-    server_port=7860
-)

 # Define device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
+model.eval()
 def recognize_emotion(audio):
     try:
         if audio is None:
             return {emotion: 0.0 for emotion in emotion_labels}
         audio_path = audio if isinstance(audio, str) else audio.name
         speech_array, sampling_rate = torchaudio.load(audio_path)
         duration = speech_array.shape[1] / sampling_rate
+        if duration > 60:
             return {
                 "Error": "Audio too long (max 1 minute)",
                 **{emotion: 0.0 for emotion in emotion_labels}
             }
         if sampling_rate != 16000:
             resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
             speech_array = resampler(speech_array)
         if speech_array.shape[0] > 1:
             speech_array = torch.mean(speech_array, dim=0, keepdim=True)
         speech_array = speech_array / torch.max(torch.abs(speech_array))
         speech_array = speech_array.squeeze().numpy()
+        inputs = processor(speech_array, sampling_rate=16000, return_tensors='pt', padding=True)
         input_values = inputs.input_values.to(device)
         with torch.no_grad():
             outputs = model(input_values)
             logits = outputs.logits
             probs = F.softmax(logits, dim=-1)[0].cpu().numpy()
             confidence_scores = {
+                emotion: round(float(prob) * 100, 2)
                 for emotion, prob in zip(emotion_labels, probs)
             }
             sorted_scores = dict(sorted(
                 confidence_scores.items(),
                 key=lambda x: x[1],
             **{emotion: 0.0 for emotion in emotion_labels}
         }
 interface = gr.Interface(
     fn=recognize_emotion,
     inputs=gr.Audio(
+        sources=["microphone", "upload"],
         type="filepath",
         label="Upload audio or record from microphone",
+        max_length=60
     ),
     outputs=gr.Label(
         num_top_classes=len(emotion_labels),
     - Maximum audio length: 1 minute
     - Best results with clear speech and minimal background noise
     - Confidence scores are shown as percentages
+    """
+)
+if __name__ == "__main__":
+    interface.launch(
+        share=True,
+        debug=True,
+        server_name="0.0.0.0",
+        server_port=7860
+    )