Update voice_analysis.py
Browse files- voice_analysis.py +15 -18
voice_analysis.py
CHANGED
@@ -1,35 +1,33 @@
|
|
1 |
import numpy as np
|
2 |
import librosa
|
3 |
from sklearn.cluster import DBSCAN
|
4 |
-
from pydub import AudioSegment
|
5 |
|
6 |
-
def extract_voice_features(audio_path,
|
7 |
# Load the audio file
|
8 |
y, sr = librosa.load(audio_path)
|
9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
# Extract MFCC features
|
11 |
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
|
12 |
|
13 |
-
# Segment the MFCCs
|
14 |
-
segment_length = int(segment_duration * sr / 1000)
|
15 |
-
num_segments = len(y) // segment_length
|
16 |
-
|
17 |
segments = []
|
18 |
-
for i in range(
|
19 |
-
start = i *
|
20 |
-
end = start +
|
|
|
|
|
21 |
segment = mfccs[:, start:end]
|
22 |
segments.append(np.mean(segment, axis=1))
|
23 |
|
24 |
return np.array(segments)
|
25 |
|
26 |
-
def remove_nan_features(features):
    """Drop every row of a 2-D feature array that contains any NaN.

    Parameters
    ----------
    features : np.ndarray
        2-D array of feature vectors, one row per segment.

    Returns
    -------
    np.ndarray
        The subset of rows that are entirely NaN-free.
    """
    nan_rows = np.isnan(features).any(axis=1)
    clean_mask = np.logical_not(nan_rows)
    return features[clean_mask]
|
28 |
-
|
29 |
def cluster_voices(features):
|
30 |
-
# Remove NaN values
|
31 |
-
features = remove_nan_features(features)
|
32 |
-
|
33 |
if len(features) < 2:
|
34 |
print("Not enough voice segments for clustering. Assigning all to one cluster.")
|
35 |
return np.zeros(len(features), dtype=int)
|
@@ -47,9 +45,8 @@ def get_most_frequent_voice(features, clusters):
|
|
47 |
largest_cluster = max(set(clusters), key=list(clusters).count)
|
48 |
return features[clusters == largest_cluster]
|
49 |
|
50 |
-
def process_audio(audio_path,
|
51 |
-
features = extract_voice_features(audio_path,
|
52 |
-
features = remove_nan_features(features)
|
53 |
clusters = cluster_voices(features)
|
54 |
most_frequent_voice = get_most_frequent_voice(features, clusters)
|
55 |
return most_frequent_voice, features, clusters
|
|
|
1 |
import numpy as np
|
2 |
import librosa
|
3 |
from sklearn.cluster import DBSCAN
|
|
|
4 |
|
5 |
+
def extract_voice_features(audio_path, fps, video_duration, hop_length=512):
    """Extract one averaged MFCC feature vector per video frame.

    Parameters
    ----------
    audio_path : str or path-like
        Path to an audio file readable by ``librosa.load``.
    fps : float
        Frame rate of the accompanying video.
    video_duration : float
        Duration of the video in seconds.
    hop_length : int, optional
        Hop length in audio samples used for the MFCC computation
        (librosa's default of 512 is kept for backward compatibility).

    Returns
    -------
    np.ndarray
        Array of shape (num_frames, 13) — one mean MFCC vector per video
        frame; may contain fewer rows than ``fps * video_duration`` when
        the audio is shorter than the video.
    """
    # Load the audio file (librosa resamples to its default sr).
    y, sr = librosa.load(audio_path)

    # Extract MFCC features; hop_length is passed explicitly so the
    # segmentation below can convert video-frame spans to MFCC columns.
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)

    # BUG FIX: the original computed samples_per_frame = int(sr / fps)
    # (raw audio samples) and used it to slice mfccs[:, start:end].
    # MFCC columns advance one per hop_length samples, not one per
    # sample, so each slice consumed ~hop_length times too many columns
    # and the loop broke after only a few video frames. Convert the
    # per-frame span into MFCC columns instead.
    cols_per_frame = max(1, int(round(sr / (fps * hop_length))))

    # Total number of video frames the audio should cover.
    total_frames = int(fps * video_duration)

    # Average the MFCC columns falling inside each video frame.
    segments = []
    for i in range(total_frames):
        start = i * cols_per_frame
        end = start + cols_per_frame
        if end > mfccs.shape[1]:
            # Audio ended before the video: stop at the last full frame.
            break
        segments.append(np.mean(mfccs[:, start:end], axis=1))

    return np.array(segments)
|
29 |
|
|
|
|
|
|
|
30 |
def cluster_voices(features):
|
|
|
|
|
|
|
31 |
if len(features) < 2:
|
32 |
print("Not enough voice segments for clustering. Assigning all to one cluster.")
|
33 |
return np.zeros(len(features), dtype=int)
|
|
|
45 |
largest_cluster = max(set(clusters), key=list(clusters).count)
|
46 |
return features[clusters == largest_cluster]
|
47 |
|
48 |
+
def process_audio(audio_path, fps, video_duration):
    """Run the full voice-analysis pipeline on one audio file.

    Extracts per-frame voice features, clusters them, and isolates the
    features belonging to the most frequently occurring voice cluster.

    Parameters
    ----------
    audio_path : str or path-like
        Path to the audio file to analyse.
    fps : float
        Frame rate of the accompanying video.
    video_duration : float
        Duration of the video in seconds.

    Returns
    -------
    tuple
        ``(most_frequent_voice, features, clusters)``.
    """
    feats = extract_voice_features(audio_path, fps, video_duration)
    labels = cluster_voices(feats)
    dominant = get_most_frequent_voice(feats, labels)
    return dominant, feats, labels
|