Update voice_analysis.py
voice_analysis.py CHANGED  +43 -45
@@ -1,52 +1,50 @@
-import librosa
-import numpy as np
-
-def ...
-    ...
-    samples_per_frame = int(sr / fps)
-
-    # Calculate the total number of frames
-    total_frames = int(fps * video_duration)
-
-    # Extract MFCC features
-    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
-
-    # Segment the MFCCs to align with video frames
-    segments = []
-    for i in range(total_frames):
-        start = i * samples_per_frame
-        end = start + samples_per_frame
-        if end > mfccs.shape[1]:
-            break
-        segment = mfccs[:, start:end]
-        segments.append(np.mean(segment, axis=1))
-
-    return np.array(segments)

-def ...
-    ...
-    largest_cluster = max(set(clusters), key=list(clusters).count)
-    return features[clusters == largest_cluster]

-def ...
-    ...
+import moviepy.editor as mp
+from pyannote.audio import Pipeline
+import torch
+import torchaudio
+import numpy as np  # np.array is used in align_voice_embeddings below
+from pyannote.core import Segment

+def extract_audio_from_video(video_path):
+    # Strip the audio track from the video and save it next to the source file.
+    video = mp.VideoFileClip(video_path)
+    audio_path = video_path.rsplit('.', 1)[0] + '.wav'
+    video.audio.write_audiofile(audio_path)
+    return audio_path

+def diarize_speakers(audio_path):
+    # Pretrained pyannote speaker-diarization pipeline (requires a Hugging Face token).
+    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token="YOUR_HF_TOKEN")
+    diarization = pipeline(audio_path)
+    return diarization

+def get_speaker_embeddings(audio_path, diarization, model):
+    # `model` is expected to be a callable that takes {"waveform", "sample_rate"}
+    # and returns a torch tensor; see the sketch after the diff for one way to build it.
+    waveform, sample_rate = torchaudio.load(audio_path)
+    embeddings = []
+
+    for turn, _, speaker in diarization.itertracks(yield_label=True):
+        start = int(turn.start * sample_rate)
+        end = int(turn.end * sample_rate)
+
+        segment = waveform[:, start:end]
+        if segment.shape[1] == 0:
+            continue

+        with torch.no_grad():
+            embedding = model({"waveform": segment, "sample_rate": sample_rate})
+
+        embeddings.append({"time": turn.start, "embedding": embedding.squeeze().cpu().numpy(), "speaker": speaker})

+    return embeddings

+def align_voice_embeddings(voice_embeddings, frame_count, fps):
+    # Map each video frame to the embedding of the speaker turn active at that time.
+    aligned_embeddings = []
+    current_embedding_index = 0
+
+    for frame in range(frame_count):
+        frame_time = frame / fps
+
+        # Advance to the most recent turn that starts at or before this frame.
+        while (current_embedding_index < len(voice_embeddings) - 1 and
+               voice_embeddings[current_embedding_index + 1]["time"] <= frame_time):
+            current_embedding_index += 1
+
+        aligned_embeddings.append(voice_embeddings[current_embedding_index]["embedding"])
+
+    return np.array(aligned_embeddings)
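
The diff never shows how the `model` argument of get_speaker_embeddings is built, nor how the four functions are chained together. The following is a minimal usage sketch, not part of the commit: it assumes a pyannote.audio 2.x-style Model/Inference API, the public "pyannote/embedding" checkpoint, a real Hugging Face token in place of YOUR_HF_TOKEN, a hypothetical input file interview.mp4, and a small `embed` adapter so the callable matches the {"waveform", "sample_rate"} -> tensor contract the code above expects.

import torch
import moviepy.editor as mp
from pyannote.audio import Inference, Model

from voice_analysis import (
    extract_audio_from_video,
    diarize_speakers,
    get_speaker_embeddings,
    align_voice_embeddings,
)

# Assumed embedding backbone: the public "pyannote/embedding" checkpoint,
# wrapped so that a single embedding is produced per speaker turn.
embedding_model = Model.from_pretrained("pyannote/embedding", use_auth_token="YOUR_HF_TOKEN")
inference = Inference(embedding_model, window="whole")

def embed(chunk):
    # Adapter (hypothetical): Inference with window="whole" returns a NumPy vector,
    # while get_speaker_embeddings calls .squeeze().cpu().numpy() on the result,
    # so hand back a torch tensor instead.
    return torch.as_tensor(inference(chunk))

video_path = "interview.mp4"  # hypothetical input
clip = mp.VideoFileClip(video_path)
frame_count = int(clip.fps * clip.duration)

audio_path = extract_audio_from_video(video_path)
diarization = diarize_speakers(audio_path)
voice_embeddings = get_speaker_embeddings(audio_path, diarization, embed)
aligned = align_voice_embeddings(voice_embeddings, frame_count, clip.fps)
print(aligned.shape)  # one embedding row per video frame

With this wiring, `aligned` contains one speaker-embedding row per video frame, which is exactly what align_voice_embeddings is written to produce.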