reab5555 committed on
Commit
cd6c09b
·
verified ·
1 Parent(s): 0c437f5

Update voice_analysis.py

Browse files
Files changed (1) hide show
  1. voice_analysis.py +15 -18
voice_analysis.py CHANGED
@@ -1,35 +1,33 @@
1
  import numpy as np
2
  import librosa
3
  from sklearn.cluster import DBSCAN
4
- from pydub import AudioSegment
5
 
6
- def extract_voice_features(audio_path, segment_duration=1000):
7
  # Load the audio file
8
  y, sr = librosa.load(audio_path)
9
 
 
 
 
 
 
 
10
  # Extract MFCC features
11
  mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
12
 
13
- # Segment the MFCCs
14
- segment_length = int(segment_duration * sr / 1000)
15
- num_segments = len(y) // segment_length
16
-
17
  segments = []
18
- for i in range(num_segments):
19
- start = i * segment_length
20
- end = start + segment_length
 
 
21
  segment = mfccs[:, start:end]
22
  segments.append(np.mean(segment, axis=1))
23
 
24
  return np.array(segments)
25
 
26
- def remove_nan_features(features):
27
- return features[~np.isnan(features).any(axis=1)]
28
-
29
  def cluster_voices(features):
30
- # Remove NaN values
31
- features = remove_nan_features(features)
32
-
33
  if len(features) < 2:
34
  print("Not enough voice segments for clustering. Assigning all to one cluster.")
35
  return np.zeros(len(features), dtype=int)
@@ -47,9 +45,8 @@ def get_most_frequent_voice(features, clusters):
47
  largest_cluster = max(set(clusters), key=list(clusters).count)
48
  return features[clusters == largest_cluster]
49
 
50
- def process_audio(audio_path, segment_duration=1000):
51
- features = extract_voice_features(audio_path, segment_duration)
52
- features = remove_nan_features(features)
53
  clusters = cluster_voices(features)
54
  most_frequent_voice = get_most_frequent_voice(features, clusters)
55
  return most_frequent_voice, features, clusters
 
1
  import numpy as np
2
  import librosa
3
  from sklearn.cluster import DBSCAN
 
4
 
5
def extract_voice_features(audio_path, fps, video_duration):
    """Extract one MFCC feature vector per video frame from an audio file.

    Parameters
    ----------
    audio_path : str
        Path to the audio file (anything librosa.load accepts).
    fps : float
        Video frame rate the audio should be aligned to.
    video_duration : float
        Video length in seconds; caps how many frames are extracted.

    Returns
    -------
    np.ndarray
        Array of shape (n_frames, 13): the mean of each MFCC coefficient
        over the audio spanned by each video frame. May contain fewer than
        fps * video_duration rows if the audio is shorter than the video.
    """
    # Load the audio file
    y, sr = librosa.load(audio_path)

    # Fix hop_length explicitly (librosa's default) so the frame
    # arithmetic below is guaranteed to match the MFCC matrix layout.
    hop_length = 512

    # Extract MFCC features; mfccs has shape (13, n_mfcc_frames) where
    # consecutive columns are hop_length audio samples apart.
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)

    # BUG FIX: the previous code stepped through MFCC columns in units of
    # audio *samples* per video frame (sr / fps), but MFCC columns are
    # spaced hop_length samples apart. That misalignment made the
    # out-of-range break trigger ~hop_length times too early. Convert to
    # MFCC frames per video frame instead so segments truly align.
    mfcc_frames_per_video_frame = sr / (fps * hop_length)

    # Total number of video frames to cover
    total_frames = int(fps * video_duration)

    # Segment the MFCCs to align with video frames
    segments = []
    for i in range(total_frames):
        start = int(i * mfcc_frames_per_video_frame)
        end = int((i + 1) * mfcc_frames_per_video_frame)
        # Guarantee at least one column per segment; an empty slice would
        # yield a NaN mean (and remove_nan_features no longer exists).
        if end <= start:
            end = start + 1
        # Stop once the audio runs out
        if end > mfccs.shape[1]:
            break
        segment = mfccs[:, start:end]
        segments.append(np.mean(segment, axis=1))

    return np.array(segments)
29
 
 
 
 
30
  def cluster_voices(features):
 
 
 
31
  if len(features) < 2:
32
  print("Not enough voice segments for clustering. Assigning all to one cluster.")
33
  return np.zeros(len(features), dtype=int)
 
45
  largest_cluster = max(set(clusters), key=list(clusters).count)
46
  return features[clusters == largest_cluster]
47
 
48
def process_audio(audio_path, fps, video_duration):
    """Run the full voice pipeline on one audio file.

    Extracts frame-aligned MFCC features, clusters them, and picks the
    features belonging to the most frequent (dominant) voice cluster.

    Returns a tuple of (dominant_voice_features, all_features, cluster_labels).
    """
    feats = extract_voice_features(audio_path, fps, video_duration)
    labels = cluster_voices(feats)
    dominant = get_most_frequent_voice(feats, labels)
    return dominant, feats, labels