reab5555 committed
Commit d811c94 · verified · Parent: 485faa9

Update voice_analysis.py

Files changed (1): voice_analysis.py (+36 −28)
voice_analysis.py CHANGED

@@ -1,40 +1,48 @@
-import torch
 import numpy as np
-from speechbrain.pretrained import EncoderClassifier
-from pydub import AudioSegment
-from sklearn.cluster import DBSCAN
 import librosa
+from sklearn.cluster import DBSCAN
+from pydub import AudioSegment
 
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="pretrained_models/spkrec-ecapa-voxceleb", run_opts={"device": device})
-
-def extract_voice_embedding(audio_segment):
-    signal = np.array(audio_segment.get_array_of_samples())
-    signal = signal.astype(np.float32) / 32768.0  # Normalize to [-1, 1]
-    embedding = classifier.encode_batch(torch.tensor(signal).unsqueeze(0))
-    return embedding.squeeze().cpu().numpy()
-
-def process_audio(audio_path, segment_duration=1000):
-    audio = AudioSegment.from_file(audio_path)
-    segments = [audio[i:i+segment_duration] for i in range(0, len(audio), segment_duration)]
-    embeddings = [extract_voice_embedding(segment) for segment in segments]
-    return embeddings
-
-def cluster_voices(embeddings):
-    if len(embeddings) < 2:
+def extract_voice_features(audio_path, segment_duration=1000):
+    # Load the audio file
+    y, sr = librosa.load(audio_path)
+
+    # Extract MFCC features
+    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
+
+    # Segment the MFCCs
+    segment_length = int(segment_duration * sr / 1000)
+    num_segments = len(y) // segment_length
+
+    segments = []
+    for i in range(num_segments):
+        start = i * segment_length
+        end = start + segment_length
+        segment = mfccs[:, start:end]
+        segments.append(np.mean(segment, axis=1))
+
+    return np.array(segments)
+
+def cluster_voices(features):
+    if len(features) < 2:
         print("Not enough voice segments for clustering. Assigning all to one cluster.")
-        return np.zeros(len(embeddings), dtype=int)
+        return np.zeros(len(features), dtype=int)
 
-    X = np.stack(embeddings)
-    dbscan = DBSCAN(eps=0.3, min_samples=5, metric='cosine')
-    clusters = dbscan.fit_predict(X)
+    dbscan = DBSCAN(eps=0.5, min_samples=5, metric='euclidean')
+    clusters = dbscan.fit_predict(features)
 
     if np.all(clusters == -1):
         print("DBSCAN assigned all to noise. Considering as one cluster.")
-        return np.zeros(len(embeddings), dtype=int)
+        return np.zeros(len(features), dtype=int)
 
     return clusters
 
-def get_most_frequent_voice(embeddings, clusters):
+def get_most_frequent_voice(features, clusters):
     largest_cluster = max(set(clusters), key=list(clusters).count)
-    return [emb for emb, cluster in zip(embeddings, clusters) if cluster == largest_cluster]
+    return features[clusters == largest_cluster]
+
+def process_audio(audio_path, segment_duration=1000):
+    features = extract_voice_features(audio_path, segment_duration)
+    clusters = cluster_voices(features)
+    most_frequent_voice = get_most_frequent_voice(features, clusters)
+    return most_frequent_voice, features, clusters
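
One caveat with the committed extract_voice_features: librosa.feature.mfcc returns one column per analysis frame (hop_length defaults to 512 samples), so slicing mfccs[:, start:end] with sample offsets overshoots the matrix after the first segment, and np.mean over the resulting empty slices produces NaN rows that would break DBSCAN. A minimal corrected sketch, assuming the default hop_length and the same 13 MFCCs (the frames_per_segment name is ours, not from the commit):

import numpy as np
import librosa

def extract_voice_features(audio_path, segment_duration=1000, hop_length=512):
    # Load audio at librosa's default 22,050 Hz sample rate
    y, sr = librosa.load(audio_path)

    # One MFCC column ("frame") per hop_length samples
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)

    # Convert the segment duration (ms) into a count of MFCC frames, not samples
    frames_per_segment = int(segment_duration / 1000 * sr / hop_length)
    num_segments = mfccs.shape[1] // frames_per_segment

    segments = []
    for i in range(num_segments):
        # Average the MFCCs over each segment's frames to get one vector per segment
        start = i * frames_per_segment
        segment = mfccs[:, start:start + frames_per_segment]
        segments.append(segment.mean(axis=1))

    return np.array(segments)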
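For reference, a hypothetical end-to-end use of the new API (the "sample.wav" path is a placeholder, and note that the pydub AudioSegment import is no longer used anywhere after this commit):

# Hypothetical driver for the committed pipeline; "sample.wav" is a placeholder path.
most_frequent_voice, features, clusters = process_audio("sample.wav", segment_duration=1000)
print(f"{features.shape[0]} segments, "
      f"{len(set(clusters) - {-1})} cluster(s), "
      f"{most_frequent_voice.shape[0]} segments in the dominant voice")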