kedar-bhumkar committed · verified
Commit ae55e39 · 1 Parent(s): 6264ff0

Upload 12 files

.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ OAF_youth_happy.wav filter=lfs diff=lfs merge=lfs -text
+ YAF_youth_disgust.wav filter=lfs diff=lfs merge=lfs -text
+ YAF_youth_sad.wav filter=lfs diff=lfs merge=lfs -text
OAF_youth_angry.wav ADDED
Binary file (77.3 kB).
 
OAF_youth_happy.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f2abac0dc8317e1ce831cad5897d69533dd380e55ee2b6cfeceadad953acabb4
+ size 100888
README.md CHANGED
@@ -1,14 +1,116 @@
- ---
- title: Audio Emotion Detector
- emoji: 🌍
- colorFrom: green
- colorTo: pink
- sdk: streamlit
- sdk_version: 1.43.0
- app_file: app.py
- pinned: false
- license: apache-2.0
- short_description: Detect emotion within a .wav sound file
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: Audio Emotion Analyzer
+ emoji: 🎵
+ colorFrom: blue
+ colorTo: purple
+ sdk: streamlit
+ sdk_version: 1.31.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+
+ # Audio Emotion Analyzer
+
+ A Streamlit application that analyzes the emotional tone in speech audio files using a pre-trained Wav2Vec2 model.
+
+ ## Model
+
+ This application uses the [superb/wav2vec2-base-superb-er](https://huggingface.co/superb/wav2vec2-base-superb-er) model from Hugging Face, a Wav2Vec2 model fine-tuned for speech emotion recognition.
+
+ ## Demo App
+
+ [![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://share.streamlit.io/)
+
+ ## Features
+
+ - Upload your own .wav audio files for emotion analysis
+ - Select from existing .wav files in your current directory
+ - Real-time emotion prediction
+ - Visual feedback with emojis
+
+ ## Quick Use
+
+ You can use this application in two ways:
+
+ ### Option 1: Run on Hugging Face Spaces
+ Click the "Spaces" tab on the model page to access the hosted version of this app.
+
+ ### Option 2: Run Locally
+
+ 1. Clone this repository
+ 2. Install the required dependencies:
+ ```bash
+ pip install -r requirements.txt
+ ```
+ 3. Download the pre-trained model:
+ ```bash
+ python download_model.py
+ ```
+ 4. Run the Streamlit app:
+ ```bash
+ streamlit run app.py
+ ```
+
+ ## Using Audio Files
+
+ The application automatically scans for .wav files in:
+ - The current directory where the app is running
+ - Immediate subdirectories (one level deep)
+
+ You can:
+ 1. Place .wav files in the same directory as the app
+ 2. Place .wav files in subdirectories
+ 3. Upload new .wav files directly through the interface
+
+ ## Supported Emotions
+
+ The model distinguishes four emotions, matching the `id2label` mapping in config.json (see the check below):
+ - Neutral 😐
+ - Happy 😊
+ - Angry 😠
+ - Sad 😢
+
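+ A minimal way to confirm this mapping yourself (a sketch, assuming the model files were already saved to the current directory by `download_model.py`):
+
+ ```python
+ from transformers import Wav2Vec2ForSequenceClassification
+
+ # Inspect the class-index-to-label mapping shipped in config.json
+ model = Wav2Vec2ForSequenceClassification.from_pretrained(".")
+ print(model.config.id2label)  # {0: 'neu', 1: 'hap', 2: 'ang', 3: 'sad'}
+ ```
+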
+ ## Technical Details
+
+ This application uses the following pieces, combined in the sketch below:
+ - [superb/wav2vec2-base-superb-er](https://huggingface.co/superb/wav2vec2-base-superb-er) pre-trained model
+ - Wav2Vec2ForSequenceClassification for emotion classification
+ - Wav2Vec2FeatureExtractor for audio feature extraction
+ - Streamlit for the web interface
+
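+ A condensed sketch of the inference path that backend.py implements (the filename is just one of the sample clips in this repository; any speech .wav works):
+
+ ```python
+ import librosa
+ import torch
+ from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
+
+ MODEL_NAME = "superb/wav2vec2-base-superb-er"
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
+ model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME)
+
+ # librosa resamples to the 16 kHz rate the feature extractor expects
+ speech, sr = librosa.load("OAF_youth_angry.wav", sr=16000)
+ inputs = feature_extractor(speech, sampling_rate=sr, return_tensors="pt")
+
+ with torch.no_grad():
+     logits = model(**inputs).logits
+
+ predicted_id = logits.argmax(dim=-1).item()
+ print(model.config.id2label[predicted_id])  # e.g. 'ang'
+ ```
+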
+ ## Limitations
+
+ - The model works best with clear speech audio in English
+ - Background noise may affect the accuracy of emotion detection
+ - Short audio clips (1-5 seconds) tend to work better than longer recordings
+
+ ## Troubleshooting
+
+ If you encounter issues with model loading, try:
+ 1. Running `python download_model.py` again to download the model files
+ 2. Ensuring you have a stable internet connection for the initial model download
+ 3. Checking that your audio files are in .wav format, ideally at a 16kHz sample rate (see the snippet after this list)
+ 4. Verifying that the model files (model.safetensors, config.json, preprocessor_config.json) are in your current directory
+
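+ To check a file's sample rate, a quick sketch using the soundfile package already listed in requirements.txt:
+
+ ```python
+ import soundfile as sf
+
+ # Print the sample rate, channel count, and container format of a local file
+ info = sf.info("OAF_youth_happy.wav")
+ print(info.samplerate, info.channels, info.format)  # e.g. 16000 1 WAV
+ ```
+
+ Other sample rates are not fatal: backend.py resamples with `librosa.load(audio_path, sr=16000)` before feature extraction.
+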
+ ## Citation
+
+ If you use this application or the underlying model in your work, please cite:
+
+ ```bibtex
+ @misc{superb2021,
+   author = {SUPERB Team},
+   title = {SUPERB: Speech processing Universal PERformance Benchmark},
+   year = {2021},
+   publisher = {GitHub},
+   journal = {GitHub repository},
+   howpublished = {\url{https://github.com/s3prl/s3prl}},
+ }
+ ```
+
+ ## License
+
+ This project is licensed under the MIT License - see the LICENSE file for details.
YAF_youth_disgust.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0b240172f5784d6f62dd9210e1f50b08f705960659b3260307ce95d3ed776eb5
+ size 130620
YAF_youth_sad.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eddb450d5e5acb32f968ae6da9224bcb10c65326e060c4d87d9cd3c4c29399be
+ size 116028
app.py ADDED
@@ -0,0 +1,110 @@
+ import streamlit as st
+ import os
+ import tempfile
+ from backend import predict_emotion
+
+ # Set page configuration
+ st.set_page_config(
+     page_title="Audio Emotion Analyzer",
+     page_icon="🎵",
+     layout="centered"
+ )
+
+ # Title and description
+ st.title("🎵 Audio Emotion Analyzer")
+ st.markdown("Upload a .wav file or select an existing audio file to analyze the emotion in the speech.")
+
+ # Function to load audio files from the current directory and subdirectories
+ def get_audio_files():
+     audio_files = []
+     # Scan the current directory and immediate subdirectories
+     for root, dirs, files in os.walk('.', topdown=True):
+         # Limit depth to the current directory and immediate subdirectories
+         if root.count(os.sep) <= 1:
+             for file in files:
+                 if file.lower().endswith('.wav'):
+                     rel_path = os.path.join(root, file)
+                     # Remove leading ./ or .\ from the path
+                     if rel_path.startswith('./') or rel_path.startswith('.\\'):
+                         rel_path = rel_path[2:]
+                     audio_files.append(rel_path)
+     return sorted(audio_files)
+
+ # Get audio files
+ audio_files = get_audio_files()
+
+ # Create two columns for upload and file selection
+ col1, col2 = st.columns(2)
+
+ with col1:
+     st.subheader("Upload your audio")
+     uploaded_file = st.file_uploader("Choose a .wav file", type=["wav"])
+
+ with col2:
+     st.subheader("Or select an existing file")
+     selected_file = None
+     if audio_files:
+         selected_file = st.selectbox("Choose an audio file", ["None"] + audio_files)
+     else:
+         st.info("No .wav files found in the current directory or immediate subdirectories.")
+
+ # Determine which file to use
+ audio_file = None
+ file_path = None
+
+ if uploaded_file is not None:
+     # Save the uploaded file to a temporary file
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+         tmp_file.write(uploaded_file.getvalue())
+         file_path = tmp_file.name
+     audio_file = uploaded_file.name
+     st.audio(uploaded_file, format="audio/wav")
+
+ elif selected_file is not None and selected_file != "None":
+     file_path = selected_file
+     audio_file = selected_file
+     st.audio(file_path, format="audio/wav")
+
+ # Submit button
+ if st.button("Analyze Emotion", disabled=(file_path is None)):
+     if file_path:
+         with st.spinner("Analyzing audio..."):
+             # Call the backend function to predict emotion
+             emotion = predict_emotion(file_path)
+
+         # Display the result
+         st.success("Analysis complete!")
+         st.markdown(f"## Predicted Emotion: **{emotion}**")
+
+         # Display an emoji based on the emotion; the 4-class model only
+         # produces Neutral, Happy, Angry, and Sad, so the extra entries
+         # here are unused fallbacks
+         emoji_map = {
+             "Neutral": "😐",
+             "Happy": "😊",
+             "Sad": "😢",
+             "Angry": "😠",
+             "Fearful": "😨",
+             "Disgusted": "🤢",
+             "Surprised": "😲"
+         }
+
+         emoji = emoji_map.get(emotion, "🤔")
+         st.markdown(f"# {emoji}")
+
+         # Clean up the temporary file if one was created
+         if uploaded_file is not None:
+             os.unlink(file_path)
+     else:
+         st.warning("Please upload a file or select an existing file first.")
+
+ # Add some information about the app
+ st.markdown("---")
+ st.markdown("""
+ ### About this app
+ This application uses a pre-trained Wav2Vec2 model to analyze the emotional tone in speech audio.
+ The model can detect four emotions: Neutral, Happy, Angry, and Sad.
+
+ ### How to use
+ 1. Upload a .wav file or select an existing audio file
+ 2. Click the "Analyze Emotion" button
+ 3. View the predicted emotion result
+ """)
backend.py ADDED
@@ -0,0 +1,81 @@
+ import os
+ import torch
+ import librosa
+ from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
+
+ # Display names for the model's emotion classes, in the order of the
+ # id2label mapping in config.json: 0 -> neu, 1 -> hap, 2 -> ang, 3 -> sad
+ EMOTION_LABELS = [
+     "Neutral", "Happy", "Angry", "Sad"
+ ]
+
+ # Model paths
+ MODEL_NAME = "superb/wav2vec2-base-superb-er"
+ # Look for model files directly in the current directory
+ LOCAL_MODEL_DIR = "."
+ LOCAL_FEATURE_EXTRACTOR_DIR = "."
+
+ def load_model():
+     """Load the emotion recognition model and feature extractor"""
+     try:
+         # Check whether the weights (model.safetensors or pytorch_model.bin),
+         # model config, and feature extractor config exist in the current directory
+         model_files_exist = any(
+             f.startswith("pytorch_model") or f == "model.safetensors"
+             for f in os.listdir(LOCAL_MODEL_DIR)
+         )
+         config_file_exists = os.path.exists(os.path.join(LOCAL_MODEL_DIR, "config.json"))
+         feature_extractor_exists = os.path.exists(os.path.join(LOCAL_FEATURE_EXTRACTOR_DIR, "preprocessor_config.json"))
+
+         if model_files_exist and config_file_exists and feature_extractor_exists:
+             print("Loading model and feature extractor from current directory...")
+             feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(LOCAL_FEATURE_EXTRACTOR_DIR)
+             model = Wav2Vec2ForSequenceClassification.from_pretrained(LOCAL_MODEL_DIR)
+         else:
+             print("Local model files not found. Loading from Hugging Face...")
+             feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
+             model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME)
+
+         return model, feature_extractor
+     except Exception as e:
+         print(f"Error loading model: {e}")
+         # Fall back to the Auto classes if the specific classes fail
+         from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
+         feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
+         model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME)
+         return model, feature_extractor
+
+ def predict_emotion(audio_path):
+     """Predict emotion from an audio file"""
+     try:
+         # Load the model and feature extractor
+         model, feature_extractor = load_model()
+
+         # Load the audio, resampling to the 16 kHz rate the model expects
+         speech_array, sampling_rate = librosa.load(audio_path, sr=16000)
+
+         # Extract input features
+         inputs = feature_extractor(speech_array, sampling_rate=sampling_rate, return_tensors="pt")
+
+         # Predict emotion
+         with torch.no_grad():
+             logits = model(**inputs).logits
+
+         # Get the emotion label
+         predicted_class_id = torch.argmax(logits, dim=-1).item()
+
+         # Return the predicted emotion
+         return EMOTION_LABELS[predicted_class_id]
+
+     except Exception as e:
+         print(f"Error predicting emotion: {e}")
+         return "Error: Could not predict emotion"
+
+ # For testing
+ if __name__ == "__main__":
+     # Test with a file from the current directory
+     current_dir = "."
+     wav_files = [f for f in os.listdir(current_dir) if f.endswith(".wav")]
+
+     if wav_files:
+         test_file = wav_files[0]
+         print(f"Testing with file: {test_file}")
+         emotion = predict_emotion(test_file)
+         print(f"Predicted emotion: {emotion}")
+     else:
+         print("No .wav files found in current directory for testing.")
config.json ADDED
@@ -0,0 +1,130 @@
+ {
+   "_name_or_path": "superb/wav2vec2-base-superb-er",
+   "activation_dropout": 0.0,
+   "adapter_attn_dim": null,
+   "adapter_kernel_size": 3,
+   "adapter_stride": 2,
+   "add_adapter": false,
+   "apply_spec_augment": true,
+   "architectures": [
+     "Wav2Vec2ForSequenceClassification"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "classifier_proj_size": 256,
+   "codevector_dim": 256,
+   "contrastive_logits_temperature": 0.1,
+   "conv_bias": false,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "sum",
+   "ctc_zero_infinity": false,
+   "diversity_loss_weight": 0.1,
+   "do_stable_layer_norm": false,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_norm": "group",
+   "feat_proj_dropout": 0.1,
+   "feat_quantizer_dropout": 0.0,
+   "final_dropout": 0.0,
+   "freeze_feat_extract_train": true,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "neu",
+     "1": "hap",
+     "2": "ang",
+     "3": "sad"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "ang": 2,
+     "hap": 1,
+     "neu": 0,
+     "sad": 3
+   },
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.05,
+   "mask_channel_length": 10,
+   "mask_channel_min_space": 1,
+   "mask_channel_other": 0.0,
+   "mask_channel_prob": 0.0,
+   "mask_channel_selection": "static",
+   "mask_feature_length": 10,
+   "mask_feature_min_masks": 0,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_masks": 2,
+   "mask_time_min_space": 1,
+   "mask_time_other": 0.0,
+   "mask_time_prob": 0.05,
+   "mask_time_selection": "static",
+   "model_type": "wav2vec2",
+   "no_mask_channel_overlap": false,
+   "no_mask_time_overlap": false,
+   "num_adapter_layers": 3,
+   "num_attention_heads": 12,
+   "num_codevector_groups": 2,
+   "num_codevectors_per_group": 320,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 12,
+   "num_negatives": 100,
+   "output_hidden_size": 768,
+   "pad_token_id": 0,
+   "proj_codevector_dim": 256,
+   "tdnn_dilation": [
+     1,
+     2,
+     3,
+     1,
+     1
+   ],
+   "tdnn_dim": [
+     512,
+     512,
+     512,
+     512,
+     1500
+   ],
+   "tdnn_kernel": [
+     5,
+     3,
+     3,
+     1,
+     1
+   ],
+   "torch_dtype": "float32",
+   "transformers_version": "4.49.0",
+   "use_weighted_layer_sum": true,
+   "vocab_size": 32,
+   "xvector_output_dim": 512
+ }
download_model.py ADDED
@@ -0,0 +1,44 @@
+ from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
+
+ # Define the model name
+ MODEL_NAME = "superb/wav2vec2-base-superb-er"
+ OUTPUT_DIR = "."  # Save directly to the current directory
+
+ print(f"Downloading model: {MODEL_NAME}")
+ print("This may take a few minutes depending on your internet connection...")
+
+ try:
+     # Download and save the feature extractor
+     print("Downloading feature extractor...")
+     feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
+     feature_extractor.save_pretrained(OUTPUT_DIR)
+     print("Feature extractor saved to current directory")
+
+     # Download and save the model
+     print("Downloading model...")
+     model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME)
+     model.save_pretrained(OUTPUT_DIR)
+     print("Model saved to current directory")
+
+     print("\nModel and feature extractor downloaded successfully!")
+     print("You can now use them in your application by loading from the current directory.")
+
+ except Exception as e:
+     print(f"Error downloading model: {e}")
+     print("\nTrying alternative approach...")
+
+     # If the direct download fails, download with the Auto classes instead
+     from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
+
+     # Download with Auto classes
+     feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
+     model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME)
+
+     # Save to the same location as the primary path
+     feature_extractor.save_pretrained(OUTPUT_DIR)
+     model.save_pretrained(OUTPUT_DIR)
+
+     print("\nModel and feature extractor downloaded successfully using alternative approach!")
+     print("You can now use them in your application by loading from the current directory.")
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ee79804e2abf203994cf7626ff439d44a67daf3878c9ed3ee7b139ce1f36ba1b
+ size 378304548
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "do_normalize": false,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ streamlit==1.31.0
+ torch==2.1.0
+ librosa==0.10.1
+ transformers==4.35.0
+ numpy==1.24.3
+ soundfile==0.12.1