|
import numpy as np |
|
import librosa |
|
from sklearn.cluster import DBSCAN |
|
|
|
def extract_voice_features(audio_path, fps, video_duration, hop_length=512):
    """Extract one averaged MFCC vector per video frame from an audio file.

    Parameters
    ----------
    audio_path : str
        Path to the audio file; loaded with librosa's default sample rate.
    fps : float
        Frame rate of the accompanying video.
    video_duration : float
        Video length in seconds; bounds the number of segments produced.
    hop_length : int, optional
        Hop length (in samples) used for the MFCC computation. Defaults to
        512, librosa's own default.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_segments, 13), one mean-MFCC row per video frame.
        Shape (0, 13) when the audio is too short for even one segment.
    """
    y, sr = librosa.load(audio_path)

    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)

    # MFCC columns are spaced hop_length samples apart, so the number of
    # MFCC frames covering one video frame is sr / (fps * hop_length) --
    # NOT sr / fps (raw samples per frame), which would index far past the
    # end of the MFCC matrix and truncate the output after a handful of
    # segments. Clamp to at least one frame per segment for high fps.
    frames_per_segment = max(1, int(sr / (fps * hop_length)))

    total_frames = int(fps * video_duration)

    segments = []
    for i in range(total_frames):
        start = i * frames_per_segment
        end = start + frames_per_segment
        if end > mfccs.shape[1]:
            # Ran out of audio; stop rather than emit a short segment.
            break
        segments.append(np.mean(mfccs[:, start:end], axis=1))

    if not segments:
        # Keep a well-formed 2-D result so downstream code can index rows.
        return np.empty((0, mfccs.shape[0]))

    return np.array(segments)
|
|
|
def cluster_voices(features):
    """Group voice-feature vectors into speaker clusters via DBSCAN.

    Parameters
    ----------
    features : numpy.ndarray
        2-D array of per-segment feature vectors (rows are segments).

    Returns
    -------
    numpy.ndarray
        Integer cluster label per row. Falls back to a single all-zero
        cluster when there are fewer than two segments, or when DBSCAN
        labels every point as noise (-1).
    """
    n_segments = len(features)

    # Too few points for density-based clustering to be meaningful.
    if n_segments < 2:
        print("Not enough voice segments for clustering. Assigning all to one cluster.")
        return np.zeros(n_segments, dtype=int)

    labels = DBSCAN(eps=0.5, min_samples=5, metric='euclidean').fit_predict(features)

    # DBSCAN may reject everything as noise; treat that as one cluster.
    if not np.any(labels != -1):
        print("DBSCAN assigned all to noise. Considering as one cluster.")
        return np.zeros(n_segments, dtype=int)

    return labels
|
|
|
def get_most_frequent_voice(features, clusters):
    """Return the feature rows belonging to the largest real cluster.

    DBSCAN marks noise points with the label -1; those must not be
    treated as a speaker. The original implementation let -1 win the
    size contest, so when noise outnumbered every cluster the function
    returned the noise segments as the "dominant voice". Here -1 is
    excluded whenever at least one real cluster exists; if every point
    is noise, all rows are returned as a graceful fallback.

    Parameters
    ----------
    features : numpy.ndarray
        2-D array of per-segment feature vectors.
    clusters : array-like of int
        Cluster label per row (as produced by ``cluster_voices``).

    Returns
    -------
    numpy.ndarray
        The subset of ``features`` in the most populous non-noise cluster.
    """
    labels = np.asarray(clusters)

    # Count members per label in one O(n log n) pass instead of the
    # quadratic max(set(...), key=list(...).count) idiom.
    unique_labels, counts = np.unique(labels, return_counts=True)

    # Drop the noise label unless it is the only label present.
    real = unique_labels != -1
    if np.any(real):
        unique_labels, counts = unique_labels[real], counts[real]

    largest_cluster = unique_labels[np.argmax(counts)]
    return features[labels == largest_cluster]
|
|
|
def process_audio(audio_path, fps, video_duration):
    """Run the full pipeline: feature extraction, clustering, selection.

    Parameters
    ----------
    audio_path : str
        Path of the audio file to analyse.
    fps : float
        Frame rate of the accompanying video.
    video_duration : float
        Video length in seconds.

    Returns
    -------
    tuple
        ``(most_frequent_voice, features, clusters)`` where
        ``most_frequent_voice`` is the feature subset of the dominant
        cluster, ``features`` are all per-frame MFCC vectors, and
        ``clusters`` is the label assigned to each vector.
    """
    all_features = extract_voice_features(audio_path, fps, video_duration)
    labels = cluster_voices(all_features)
    dominant = get_most_frequent_voice(all_features, labels)
    return dominant, all_features, labels