import numpy as np
import librosa
from sklearn.cluster import DBSCAN


def extract_voice_features(audio_path, fps, video_duration):
    """Return one averaged MFCC vector per video frame."""
    # Load the audio file
    y, sr = librosa.load(audio_path)
    # Extract MFCC features; hop_length sets the spacing of MFCC frames
    hop_length = 512
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)
    # Number of MFCC frames (not raw audio samples) spanning one video frame.
    # Indexing mfccs with raw-sample counts would overshoot the frame axis,
    # since mfccs.shape[1] counts hops, not samples.
    mfcc_frames_per_video_frame = sr / (fps * hop_length)
    # Total number of video frames to cover
    total_frames = int(fps * video_duration)
    # Average the MFCC frames that fall inside each video frame
    segments = []
    for i in range(total_frames):
        start = int(i * mfcc_frames_per_video_frame)
        end = int((i + 1) * mfcc_frames_per_video_frame)
        if end > mfccs.shape[1]:
            break
        if end == start:  # guard against very high fps relative to hop_length
            end = start + 1
        segments.append(np.mean(mfccs[:, start:end], axis=1))
    return np.array(segments)


def cluster_voices(features):
    """Cluster per-frame voice features with DBSCAN; -1 marks noise."""
    if len(features) < 2:
        print("Not enough voice segments for clustering. Assigning all to one cluster.")
        return np.zeros(len(features), dtype=int)
    # eps and min_samples are data-dependent; MFCC features are typically
    # standardized before distance-based clustering, so tune these values.
    dbscan = DBSCAN(eps=0.5, min_samples=5, metric='euclidean')
    clusters = dbscan.fit_predict(features)
    if np.all(clusters == -1):
        print("DBSCAN assigned all to noise. Considering as one cluster.")
        return np.zeros(len(features), dtype=int)
    return clusters


def get_most_frequent_voice(features, clusters):
    """Return the feature vectors belonging to the largest non-noise cluster."""
    # Exclude DBSCAN's noise label (-1) so noise can never be picked as the
    # dominant "voice"; cluster_voices guarantees at least one real label here.
    labels, counts = np.unique(clusters[clusters != -1], return_counts=True)
    largest_cluster = labels[np.argmax(counts)]
    return features[clusters == largest_cluster]


def process_audio(audio_path, fps, video_duration):
    features = extract_voice_features(audio_path, fps, video_duration)
    clusters = cluster_voices(features)
    most_frequent_voice = get_most_frequent_voice(features, clusters)
    return most_frequent_voice, features, clusters
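
# --- Example usage (sketch) ---
# A minimal driver showing how process_audio might be wired up end to end.
# The file name "interview.wav", the 30 fps rate, and the 12-second duration
# are hypothetical placeholders, not values from the pipeline above; in
# practice fps and video_duration would come from the source video's metadata.
if __name__ == "__main__":
    dominant, feats, labels = process_audio("interview.wav", fps=30, video_duration=12.0)
    print(f"Extracted {feats.shape[0]} per-frame MFCC vectors")
    print(f"Cluster labels found: {np.unique(labels)}")
    print(f"Frames assigned to the dominant voice: {dominant.shape[0]}")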