Garvitj committed
Commit fee0512 · verified · Parent: c5cc96b

Update app.py

Files changed (1): app.py (+7 −5)
app.py CHANGED
@@ -44,15 +44,17 @@ def predict_text_emotion(text):
 # Extract audio features and predict emotion
 def extract_audio_features(audio_data, sample_rate):
     if not isinstance(audio_data, np.ndarray):
-        audio_data = np.array(audio_data)
-
-    mfcc = np.mean(librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=704).T, axis=0)
+        audio_data = np.array(audio_data, dtype=np.float32)  # Ensure it is a NumPy array with float type
+    else:
+        audio_data = audio_data.astype(np.float32)  # Convert to float32
+
+    mfcc = np.mean(librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=40).T, axis=0)
     features = np.expand_dims(mfcc, axis=0)
-    features = np.reshape(features, (1, 704))
     return features
 
 def predict_audio_emotion(audio_data, sample_rate):
     features = extract_audio_features(audio_data, sample_rate)
+    features = np.reshape(features, (1, 40))  # Match model expected input
     prediction = audio_model.predict(features)
     emotion_index = np.argmax(prediction)
     return emotion_mapping[emotion_index]
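The substantive change in this hunk is the feature dimensionality: `n_mfcc` drops from 704 to 40, and the `(1, 40)` reshape moves into `predict_audio_emotion` so the flattened MFCC vector matches what `audio_model` expects. A minimal standalone sketch of the patched extraction path, assuming only numpy and librosa, with a synthetic sine wave standing in for real audio:

```python
# Minimal sketch of the patched extraction path, runnable on its own.
# The synthetic sine wave is a stand-in for real audio; only numpy and
# librosa are assumed.
import numpy as np
import librosa

def extract_audio_features(audio_data, sample_rate):
    if not isinstance(audio_data, np.ndarray):
        audio_data = np.array(audio_data, dtype=np.float32)  # ensure a float NumPy array
    else:
        audio_data = audio_data.astype(np.float32)

    # 40 MFCCs, averaged over time: one fixed-length vector per clip
    mfcc = np.mean(librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=40).T, axis=0)
    features = np.expand_dims(mfcc, axis=0)
    return features

sr = 22050
t = np.arange(sr) / sr                                     # 1 second of samples
audio = np.sin(2 * np.pi * 440.0 * t).astype(np.float32)   # 440 Hz tone
print(extract_audio_features(audio, sr).shape)             # -> (1, 40)
```

Averaging the MFCC matrix over time collapses a variable-length clip into one fixed 40-dimensional vector, which is why the reshape to `(1, 40)` in `predict_audio_emotion` is now guaranteed to succeed.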
 
@@ -115,7 +117,7 @@ def transcribe_and_predict_video(video_path):
     image_emotion = process_video(video_path)
 
     # Predict emotion from audio (sound-based)
-    sample_rate, audio_data = librosa.load(audio_file, sr=None)
+    audio_data, sample_rate = librosa.load(audio_file, sr=None)
     audio_emotion = predict_audio_emotion(audio_data, sample_rate)
 
     # Combine detected emotions for final output (majority voting can be implemented)
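The second hunk corrects a swapped tuple unpacking: `librosa.load` returns the waveform first and the sample rate second, so the old `sample_rate, audio_data = ...` bound each name to the wrong value. A quick check of the fixed order (the `clip.wav` path is a placeholder, not a file from this repo):

```python
# Quick check of the unpacking order fixed above; "clip.wav" is a
# placeholder path, not a file from this repo.
import librosa

audio_data, sample_rate = librosa.load("clip.wav", sr=None)  # returns (samples, rate)
print(audio_data.shape)   # 1-D float waveform, e.g. (110250,)
print(sample_rate)        # native rate preserved because sr=None, e.g. 44100
```

With `sr=None` the clip keeps its native sampling rate, which `extract_audio_features` then passes straight through to `librosa.feature.mfcc`.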