import os

import moviepy.editor as mp
import numpy as np
import torch
import torchaudio
from pyannote.audio import Model, Pipeline

def extract_audio_from_video(video_path):
    """Extract the audio track of a video file to a sibling .wav file."""
    video = mp.VideoFileClip(video_path)
    audio_path = video_path.rsplit('.', 1)[0] + '.wav'
    video.audio.write_audiofile(audio_path)
    return audio_path

def diarize_speakers(audio_path):
    """Run pyannote speaker diarization on an audio file."""
    # Load the Hugging Face token from the environment variable.
    hf_token = os.environ.get("py_annote_hf_token")
    if not hf_token:
        raise ValueError(
            "py_annote_hf_token environment variable is not set. "
            "Please add it in your Hugging Face Space's Variables and secrets section."
        )
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1", use_auth_token=hf_token
    )
    diarization = pipeline(audio_path)
    return diarization

def get_speaker_embeddings(audio_path, diarization, model_name="pyannote/embedding"):
    """Compute one speaker embedding per diarized speech turn."""
    hf_token = os.environ.get("py_annote_hf_token")
    if not hf_token:
        raise ValueError(
            "py_annote_hf_token environment variable is not set. "
            "Please check your Hugging Face Space's Variables and secrets section."
        )
    model = Model.from_pretrained(model_name, use_auth_token=hf_token)
    model.eval()
    waveform, sample_rate = torchaudio.load(audio_path)

    embeddings = []
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        start = int(turn.start * sample_rate)
        end = int(turn.end * sample_rate)
        segment = waveform[:, start:end]
        if segment.shape[1] == 0:
            continue
        with torch.no_grad():
            # pyannote embedding models expect a (batch, channel, sample) tensor.
            embedding = model(segment.unsqueeze(0))
        embeddings.append({
            "time": turn.start,
            "embedding": embedding.squeeze().cpu().numpy(),
            "speaker": speaker,
        })
    return embeddings

def align_voice_embeddings(voice_embeddings, frame_count, fps):
    """Map each video frame to the most recent speaker embedding."""
    if not voice_embeddings:
        return np.array([])
    aligned_embeddings = []
    current_embedding_index = 0
    for frame in range(frame_count):
        frame_time = frame / fps
        # Advance to the latest embedding whose turn starts at or before this frame.
        while (current_embedding_index < len(voice_embeddings) - 1 and
               voice_embeddings[current_embedding_index + 1]["time"] <= frame_time):
            current_embedding_index += 1
        aligned_embeddings.append(voice_embeddings[current_embedding_index]["embedding"])
    return np.array(aligned_embeddings)
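

# Minimal usage sketch chaining the steps above end to end. The file name and
# the fps/frame_count values here are hypothetical placeholders, not part of
# the module's API; in practice frame_count and fps would come from whatever
# decodes the video frames.
if __name__ == "__main__":
    video_path = "example.mp4"  # hypothetical input file
    audio_path = extract_audio_from_video(video_path)
    diarization = diarize_speakers(audio_path)
    voice_embeddings = get_speaker_embeddings(audio_path, diarization)
    # Assume a 30 fps clip with 300 frames (10 seconds) for illustration.
    aligned = align_voice_embeddings(voice_embeddings, frame_count=300, fps=30)
    print(aligned.shape)  # (frame_count, embedding_dim)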