import numpy as np
import librosa
from sklearn.cluster import DBSCAN
def extract_voice_features(audio_path, fps, video_duration):
    # Load the audio file (librosa resamples to 22,050 Hz by default)
    y, sr = librosa.load(audio_path)
    # MFCCs are produced once per hop, not once per audio sample, so align
    # video frames to MFCC frames via the hop length rather than sr / fps
    hop_length = 512
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)
    # Number of MFCC frames that fall inside one video frame
    mfcc_frames_per_video_frame = (sr / hop_length) / fps
    # Total number of video frames to cover
    total_frames = int(fps * video_duration)
    # Average the MFCCs inside each video frame to get one 13-dim vector per frame
    segments = []
    for i in range(total_frames):
        start = int(i * mfcc_frames_per_video_frame)
        end = max(int((i + 1) * mfcc_frames_per_video_frame), start + 1)
        if end > mfccs.shape[1]:
            break
        segments.append(np.mean(mfccs[:, start:end], axis=1))
    return np.array(segments)
def cluster_voices(features):
    # Clustering needs at least two segments to work with
    if len(features) < 2:
        print("Not enough voice segments for clustering. Assigning all to one cluster.")
        return np.zeros(len(features), dtype=int)
    dbscan = DBSCAN(eps=0.5, min_samples=5, metric='euclidean')
    clusters = dbscan.fit_predict(features)
    # DBSCAN labels outliers as -1; if every segment is noise, treat it as one voice
    if np.all(clusters == -1):
        print("DBSCAN assigned all to noise. Considering as one cluster.")
        return np.zeros(len(features), dtype=int)
    return clusters
def get_most_frequent_voice(features, clusters):
    # Ignore DBSCAN's noise label (-1) when picking the dominant voice
    labels = [c for c in clusters if c != -1] or list(clusters)
    largest_cluster = max(set(labels), key=labels.count)
    return features[clusters == largest_cluster]
def process_audio(audio_path, fps, video_duration):
    features = extract_voice_features(audio_path, fps, video_duration)
    clusters = cluster_voices(features)
    most_frequent_voice = get_most_frequent_voice(features, clusters)
    return most_frequent_voice, features, clusters
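
# Minimal usage sketch: the file name, fps, and duration below are assumed
# placeholder values, not part of the original script.
if __name__ == "__main__":
    voice, features, clusters = process_audio("interview.wav", fps=30, video_duration=10.0)
    print(f"Extracted {len(features)} per-frame feature vectors")
    print(f"Cluster labels found: {np.unique(clusters)}")
    print(f"Frames assigned to the dominant voice: {voice.shape[0]}")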