import numpy as np
import librosa
from sklearn.cluster import DBSCAN
from pydub import AudioSegment


def extract_voice_features(audio_path, segment_duration=1000, hop_length=512):
    """Compute one averaged MFCC vector per fixed-duration segment of the audio."""
    # Load the audio file
    y, sr = librosa.load(audio_path)

    # Extract MFCC features (shape: n_mfcc x n_frames)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)

    # Convert the segment duration (milliseconds) into a number of MFCC frames,
    # since the MFCC matrix is indexed by frames, not by raw audio samples
    frames_per_segment = int(segment_duration / 1000 * sr / hop_length)
    num_segments = mfccs.shape[1] // frames_per_segment

    segments = []
    for i in range(num_segments):
        start = i * frames_per_segment
        end = start + frames_per_segment
        segment = mfccs[:, start:end]
        # Average the MFCCs over the segment to get one feature vector per segment
        segments.append(np.mean(segment, axis=1))
    return np.array(segments)


def remove_nan_features(features):
    """Drop any feature vectors that contain NaN values."""
    return features[~np.isnan(features).any(axis=1)]


def cluster_voices(features):
    """Cluster segment feature vectors with DBSCAN, falling back to a single cluster."""
    # Remove NaN values before clustering
    features = remove_nan_features(features)

    if len(features) < 2:
        print("Not enough voice segments for clustering. Assigning all to one cluster.")
        return np.zeros(len(features), dtype=int)

    dbscan = DBSCAN(eps=0.5, min_samples=5, metric='euclidean')
    clusters = dbscan.fit_predict(features)

    if np.all(clusters == -1):
        print("DBSCAN assigned all segments to noise. Treating them as one cluster.")
        return np.zeros(len(features), dtype=int)
    return clusters


def get_most_frequent_voice(features, clusters):
    """Return the feature vectors belonging to the largest (non-noise) cluster."""
    labels, counts = np.unique(clusters, return_counts=True)
    # Ignore DBSCAN's noise label (-1) when at least one real cluster exists
    valid = labels != -1
    if valid.any():
        labels, counts = labels[valid], counts[valid]
    largest_cluster = labels[np.argmax(counts)]
    return features[clusters == largest_cluster]


def process_audio(audio_path, segment_duration=1000):
    """Full pipeline: extract features, cluster them, and pick the dominant voice."""
    features = extract_voice_features(audio_path, segment_duration)
    features = remove_nan_features(features)
    clusters = cluster_voices(features)
    most_frequent_voice = get_most_frequent_voice(features, clusters)
    return most_frequent_voice, features, clusters
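

# A minimal usage sketch. The path "meeting.wav" is a hypothetical example file,
# not one referenced elsewhere in this code; any audio file readable by librosa
# should work the same way.
if __name__ == "__main__":
    dominant_voice, all_features, cluster_labels = process_audio(
        "meeting.wav", segment_duration=1000
    )
    print(f"Total segments: {len(all_features)}")
    print(f"Cluster labels: {cluster_labels}")
    print(f"Segments in the dominant voice cluster: {len(dominant_voice)}")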