File size: 1,760 Bytes
b568300
 
d811c94
b568300
cd6c09b
d811c94
 
 
cd6c09b
 
 
 
 
 
d811c94
 
 
cd6c09b
d811c94
cd6c09b
 
 
 
 
d811c94
 
 
 
 
 
 
b568300
d811c94
b568300
d811c94
 
b568300
 
 
d811c94
b568300
 
 
d811c94
b568300
d811c94
 
cd6c09b
 
d811c94
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import numpy as np
import librosa
from sklearn.cluster import DBSCAN

def extract_voice_features(audio_path, fps, video_duration):
    """Compute one mean-MFCC vector per video frame of an audio track.

    The audio is loaded with ``librosa.load`` and turned into 13 MFCC
    coefficients per analysis column. Each video frame is then mapped to the
    MFCC columns that fall inside its time span, and those columns are
    averaged into a single feature vector.

    Args:
        audio_path: Path of the audio file, passed straight to ``librosa.load``.
        fps: Video frame rate in frames per second. A value of 0 raises
            ``ZeroDivisionError``.
        video_duration: Video length in seconds; together with ``fps`` it
            bounds the number of feature vectors produced.

    Returns:
        A 2-D array with one row per video frame that is covered by the audio
        and one column per MFCC coefficient. When no frame is covered the
        result has zero rows.
    """
    # Spelled out (it is librosa's default) so the column arithmetic below
    # can never drift from the hop actually used to compute the MFCCs.
    hop_length = 512
    n_mfcc = 13

    y, sr = librosa.load(audio_path)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length)
    n_columns = mfccs.shape[1]

    # MFCC columns are hop_length samples apart, not one sample apart, so a
    # video frame spans sr / (fps * hop_length) columns. Indexing the MFCC
    # matrix with raw sample offsets would overshoot it almost immediately.
    columns_per_frame = sr / (fps * hop_length)
    total_frames = int(fps * video_duration)

    segments = []
    for frame_index in range(total_frames):
        start = int(frame_index * columns_per_frame)
        if start >= n_columns:
            # The audio ends before the video does.
            break
        # Guarantee at least one column per frame (high fps can make a frame
        # shorter than one hop) and clip the last frame to the available data.
        end = max(int((frame_index + 1) * columns_per_frame), start + 1)
        end = min(end, n_columns)
        segments.append(np.mean(mfccs[:, start:end], axis=1))

    if not segments:
        # Keep the result two-dimensional even when it is empty.
        return np.empty((0, n_mfcc), dtype=mfccs.dtype)
    return np.array(segments)

def cluster_voices(features, *, eps=0.5, min_samples=5, metric='euclidean'):
    """Group voice feature vectors into clusters with DBSCAN.

    Args:
        features: Sequence of feature vectors, one per segment.
        eps: DBSCAN neighbourhood radius. Defaults to the previously
            hard-coded value of 0.5.
        min_samples: DBSCAN minimum neighbourhood size. Defaults to the
            previously hard-coded value of 5.
        metric: Distance metric handed to DBSCAN. Defaults to ``'euclidean'``.

    Returns:
        An integer label array with one entry per feature vector. When fewer
        than two vectors are given, or when DBSCAN marks every vector as
        noise (label -1), all entries are 0 so that the input is treated as a
        single cluster.
    """
    n_segments = len(features)

    # Clustering is pointless with zero or one point; skip DBSCAN entirely.
    if n_segments < 2:
        print("Not enough voice segments for clustering. Assigning all to one cluster.")
        return np.zeros(n_segments, dtype=int)

    clusters = DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit_predict(features)

    # An all-noise result carries no grouping information, so collapse it
    # into one cluster instead of handing back only -1 labels.
    if np.all(clusters == -1):
        print("DBSCAN assigned all to noise. Considering as one cluster.")
        return np.zeros(n_segments, dtype=int)

    return clusters

def get_most_frequent_voice(features, clusters):
    """Return the feature vectors that belong to the most populated cluster.

    Args:
        features: Array-like of feature vectors, one row per segment.
        clusters: Array-like of integer cluster labels aligned with
            ``features``. The label -1 is treated as noise.

    Returns:
        The rows of ``features`` whose label is the most frequent one. Noise
        (-1) is only chosen when no other label is present. Ties go to the
        smallest label. Empty input yields an empty selection instead of
        raising.
    """
    feature_array = np.asarray(features)
    labels = np.asarray(clusters)

    # Nothing to count: return an empty selection rather than letting an
    # empty-sequence max() raise.
    if labels.size == 0:
        return feature_array[:0]

    # Noise points are not a voice; ignore them unless they are all we have.
    candidates = labels[labels != -1]
    if candidates.size == 0:
        candidates = labels

    # One pass to count every label (np.unique returns labels sorted, so
    # argmax resolves ties to the smallest label).
    unique_labels, counts = np.unique(candidates, return_counts=True)
    largest_cluster = unique_labels[np.argmax(counts)]
    return feature_array[labels == largest_cluster]

def process_audio(audio_path, fps, video_duration):
    """Run the full voice pipeline: feature extraction, clustering, selection.

    Args:
        audio_path: Path of the audio file to analyse.
        fps: Video frame rate forwarded to ``extract_voice_features``.
        video_duration: Video length forwarded to ``extract_voice_features``.

    Returns:
        A tuple ``(most_frequent_voice, features, clusters)``: the feature
        rows selected by ``get_most_frequent_voice``, all extracted features,
        and the cluster label of each feature row.
    """
    frame_features = extract_voice_features(audio_path, fps, video_duration)
    labels = cluster_voices(frame_features)
    dominant_voice = get_most_frequent_voice(frame_features, labels)
    return dominant_voice, frame_features, labels