reab5555 committed on
Commit a4db1a6 · verified · 1 Parent(s): 9998b9d

Update voice_analysis.py

Files changed (1):
  1. voice_analysis.py +43 -45
voice_analysis.py CHANGED
@@ -1,52 +1,50 @@
-import numpy as np
-import librosa
-from sklearn.cluster import DBSCAN
-
-def extract_voice_features(audio_path, fps, video_duration):
-    # Load the audio file
-    y, sr = librosa.load(audio_path)
-
-    # Calculate the number of samples per frame
-    samples_per_frame = int(sr / fps)
-
-    # Calculate the total number of frames
-    total_frames = int(fps * video_duration)
-
-    # Extract MFCC features
-    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
-
-    # Segment the MFCCs to align with video frames
-    segments = []
-    for i in range(total_frames):
-        start = i * samples_per_frame
-        end = start + samples_per_frame
-        if end > mfccs.shape[1]:
-            break
-        segment = mfccs[:, start:end]
-        segments.append(np.mean(segment, axis=1))
-
-    return np.array(segments)
-
-def cluster_voices(features):
-    if len(features) < 2:
-        print("Not enough voice segments for clustering. Assigning all to one cluster.")
-        return np.zeros(len(features), dtype=int)
-
-    dbscan = DBSCAN(eps=0.5, min_samples=5, metric='euclidean')
-    clusters = dbscan.fit_predict(features)
-
-    if np.all(clusters == -1):
-        print("DBSCAN assigned all to noise. Considering as one cluster.")
-        return np.zeros(len(features), dtype=int)
-
-    return clusters
-
-def get_most_frequent_voice(features, clusters):
-    largest_cluster = max(set(clusters), key=list(clusters).count)
-    return features[clusters == largest_cluster]
-
-def process_audio(audio_path, fps, video_duration):
-    features = extract_voice_features(audio_path, fps, video_duration)
-    clusters = cluster_voices(features)
-    most_frequent_voice = get_most_frequent_voice(features, clusters)
-    return most_frequent_voice, features, clusters
+import numpy as np  # needed for np.array in align_voice_embeddings
+import moviepy.editor as mp
+from pyannote.audio import Pipeline
+import torch
+import torchaudio
+from pyannote.core import Segment
+
+def extract_audio_from_video(video_path):
+    video = mp.VideoFileClip(video_path)
+    audio_path = video_path.rsplit('.', 1)[0] + '.wav'
+    video.audio.write_audiofile(audio_path)
+    return audio_path
+
+def diarize_speakers(audio_path):
+    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1", use_auth_token="YOUR_HF_TOKEN")
+    diarization = pipeline(audio_path)
+    return diarization
+
+def get_speaker_embeddings(audio_path, diarization, model):
+    waveform, sample_rate = torchaudio.load(audio_path)
+    embeddings = []
+
+    for turn, _, speaker in diarization.itertracks(yield_label=True):
+        # Convert the turn boundaries from seconds to sample indices
+        start = int(turn.start * sample_rate)
+        end = int(turn.end * sample_rate)
+
+        segment = waveform[:, start:end]
+        if segment.shape[1] == 0:
+            continue
+
+        # Compute a speaker embedding for this diarized turn
+        with torch.no_grad():
+            embedding = model({"waveform": segment, "sample_rate": sample_rate})
+
+        embeddings.append({"time": turn.start, "embedding": embedding.squeeze().cpu().numpy(), "speaker": speaker})
+
+    return embeddings
+
+def align_voice_embeddings(voice_embeddings, frame_count, fps):
+    aligned_embeddings = []
+    current_embedding_index = 0
+
+    for frame in range(frame_count):
+        frame_time = frame / fps
+
+        # Advance to the most recent turn that starts at or before this frame's timestamp
+        while (current_embedding_index < len(voice_embeddings) - 1 and
+               voice_embeddings[current_embedding_index + 1]["time"] <= frame_time):
+            current_embedding_index += 1
+
+        aligned_embeddings.append(voice_embeddings[current_embedding_index]["embedding"])
+
+    return np.array(aligned_embeddings)
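
The updated module no longer exposes a process_audio entry point, so a caller has to chain the new helpers itself. The following is a minimal, hypothetical sketch of that chaining and is not part of this commit: the input path, the "YOUR_HF_TOKEN" placeholder, and the choice of "pyannote/embedding" wrapped in pyannote's Inference are assumptions; get_speaker_embeddings only requires that `model` accept a {"waveform", "sample_rate"} dict and return a torch tensor.

# Hypothetical usage sketch for the new voice_analysis.py (not part of the commit).
import moviepy.editor as mp
import torch
from pyannote.audio import Inference, Model

from voice_analysis import (
    extract_audio_from_video,
    diarize_speakers,
    get_speaker_embeddings,
    align_voice_embeddings,
)

# Assumed embedding backend: pyannote's Inference accepts a {"waveform", "sample_rate"}
# dict and, with window="whole", returns a numpy vector, so a thin wrapper converts it
# to the torch tensor that get_speaker_embeddings expects to call .cpu() on.
_inference = Inference(
    Model.from_pretrained("pyannote/embedding", use_auth_token="YOUR_HF_TOKEN"),
    window="whole",
)

def embedding_model(file):
    return torch.tensor(_inference(file))

video_path = "example.mp4"                          # hypothetical input video
audio_path = extract_audio_from_video(video_path)   # writes example.wav next to it

diarization = diarize_speakers(audio_path)
voice_embeddings = get_speaker_embeddings(audio_path, diarization, embedding_model)

# Take frame count and fps from the same clip so one embedding lines up with each frame
clip = mp.VideoFileClip(video_path)
fps = clip.fps
frame_count = int(clip.fps * clip.duration)

aligned = align_voice_embeddings(voice_embeddings, frame_count, fps)
print(aligned.shape)  # (frame_count, embedding_dimension)

Because align_voice_embeddings carries the most recent turn's embedding forward, frames before the first diarized turn reuse the first embedding and frames after the last turn reuse the last one.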