import numpy as np
import librosa
from sklearn.cluster import DBSCAN
def extract_voice_features(audio_path, fps, video_duration):
    # Load the audio file (librosa resamples to 22,050 Hz by default)
    y, sr = librosa.load(audio_path)
    # MFCCs are produced once per hop, not once per audio sample, so align
    # video frames to MFCC frames via the hop length rather than sr / fps
    hop_length = 512
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)
    # Number of MFCC frames that fall inside one video frame
    mfcc_frames_per_video_frame = (sr / hop_length) / fps
    # Total number of video frames to cover
    total_frames = int(fps * video_duration)
    # Average the MFCCs inside each video frame to get one 13-dim vector per frame
    segments = []
    for i in range(total_frames):
        start = int(i * mfcc_frames_per_video_frame)
        end = max(int((i + 1) * mfcc_frames_per_video_frame), start + 1)
        if end > mfccs.shape[1]:
            break
        segments.append(np.mean(mfccs[:, start:end], axis=1))
    return np.array(segments)
def cluster_voices(features):
    # Clustering needs at least two segments to work with
    if len(features) < 2:
        print("Not enough voice segments for clustering. Assigning all to one cluster.")
        return np.zeros(len(features), dtype=int)
    dbscan = DBSCAN(eps=0.5, min_samples=5, metric='euclidean')
    clusters = dbscan.fit_predict(features)
    # DBSCAN labels outliers as -1; if every segment is noise, treat it as one voice
    if np.all(clusters == -1):
        print("DBSCAN assigned all to noise. Considering as one cluster.")
        return np.zeros(len(features), dtype=int)
    return clusters
def get_most_frequent_voice(features, clusters):
    # Ignore DBSCAN's noise label (-1) when picking the dominant voice
    labels = [c for c in clusters if c != -1] or list(clusters)
    largest_cluster = max(set(labels), key=labels.count)
    return features[clusters == largest_cluster]
def process_audio(audio_path, fps, video_duration):
    features = extract_voice_features(audio_path, fps, video_duration)
    clusters = cluster_voices(features)
    most_frequent_voice = get_most_frequent_voice(features, clusters)
    return most_frequent_voice, features, clusters
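
# Minimal usage sketch: the file name, fps, and duration below are assumed
# placeholder values, not part of the original script.
if __name__ == "__main__":
    voice, features, clusters = process_audio("interview.wav", fps=30, video_duration=10.0)
    print(f"Extracted {len(features)} per-frame feature vectors")
    print(f"Cluster labels found: {np.unique(clusters)}")
    print(f"Frames assigned to the dominant voice: {voice.shape[0]}")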