|
import numpy as np |
|
import librosa |
|
from sklearn.cluster import DBSCAN |
|
|
|
def extract_voice_features(audio_path, fps, video_duration, hop_length=512):
    """Extract one averaged MFCC vector per video frame from an audio file.

    Parameters
    ----------
    audio_path : str
        Path to the audio file; loaded with librosa's default sample rate.
    fps : float
        Frame rate of the accompanying video.
    video_duration : float
        Video length in seconds; bounds the number of segments produced.
    hop_length : int, optional
        Hop length (in samples) used for the MFCC computation. Defaults to
        512, librosa's own default.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_segments, 13), one mean-MFCC row per video frame.
        Shape (0, 13) when the audio is too short for even one segment.
    """
    y, sr = librosa.load(audio_path)

    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)

    # MFCC columns are spaced hop_length samples apart, so the number of
    # MFCC frames covering one video frame is sr / (fps * hop_length) --
    # NOT sr / fps (raw samples per frame), which would index far past the
    # end of the MFCC matrix and truncate the output after a handful of
    # segments. Clamp to at least one frame per segment for high fps.
    frames_per_segment = max(1, int(sr / (fps * hop_length)))

    total_frames = int(fps * video_duration)

    segments = []
    for i in range(total_frames):
        start = i * frames_per_segment
        end = start + frames_per_segment
        if end > mfccs.shape[1]:
            # Ran out of audio; stop rather than emit a short segment.
            break
        segments.append(np.mean(mfccs[:, start:end], axis=1))

    if not segments:
        # Keep a well-formed 2-D result so downstream code can index rows.
        return np.empty((0, mfccs.shape[0]))

    return np.array(segments)
|
|
|
def cluster_voices(features):
    """Group voice-feature vectors into speaker clusters via DBSCAN.

    Parameters
    ----------
    features : numpy.ndarray
        2-D array of per-segment feature vectors (rows are segments).

    Returns
    -------
    numpy.ndarray
        Integer cluster label per row. Falls back to a single all-zero
        cluster when there are fewer than two segments, or when DBSCAN
        labels every point as noise (-1).
    """
    n_segments = len(features)

    # Too few points for density-based clustering to be meaningful.
    if n_segments < 2:
        print("Not enough voice segments for clustering. Assigning all to one cluster.")
        return np.zeros(n_segments, dtype=int)

    labels = DBSCAN(eps=0.5, min_samples=5, metric='euclidean').fit_predict(features)

    # DBSCAN may reject everything as noise; treat that as one cluster.
    if not np.any(labels != -1):
        print("DBSCAN assigned all to noise. Considering as one cluster.")
        return np.zeros(n_segments, dtype=int)

    return labels
|
|
|
def get_most_frequent_voice(features, clusters):
    """Return the feature rows belonging to the largest real cluster.

    DBSCAN marks noise points with the label -1; those must not be
    treated as a speaker. The original implementation let -1 win the
    size contest, so when noise outnumbered every cluster the function
    returned the noise segments as the "dominant voice". Here -1 is
    excluded whenever at least one real cluster exists; if every point
    is noise, all rows are returned as a graceful fallback.

    Parameters
    ----------
    features : numpy.ndarray
        2-D array of per-segment feature vectors.
    clusters : array-like of int
        Cluster label per row (as produced by ``cluster_voices``).

    Returns
    -------
    numpy.ndarray
        The subset of ``features`` in the most populous non-noise cluster.
    """
    labels = np.asarray(clusters)

    # Count members per label in one O(n log n) pass instead of the
    # quadratic max(set(...), key=list(...).count) idiom.
    unique_labels, counts = np.unique(labels, return_counts=True)

    # Drop the noise label unless it is the only label present.
    real = unique_labels != -1
    if np.any(real):
        unique_labels, counts = unique_labels[real], counts[real]

    largest_cluster = unique_labels[np.argmax(counts)]
    return features[labels == largest_cluster]
|
|
|
def process_audio(audio_path, fps, video_duration):
    """Run the full pipeline: feature extraction, clustering, selection.

    Parameters
    ----------
    audio_path : str
        Path of the audio file to analyse.
    fps : float
        Frame rate of the accompanying video.
    video_duration : float
        Video length in seconds.

    Returns
    -------
    tuple
        ``(most_frequent_voice, features, clusters)`` where
        ``most_frequent_voice`` is the feature subset of the dominant
        cluster, ``features`` are all per-frame MFCC vectors, and
        ``clusters`` is the label assigned to each vector.
    """
    all_features = extract_voice_features(audio_path, fps, video_duration)
    labels = cluster_voices(all_features)
    dominant = get_most_frequent_voice(all_features, labels)
    return dominant, all_features, labels