---
tags:
- pyannote
- pyannote-audio
- pyannote-audio-pipeline
---
# load pretrained pipeline
from pyannote.audio import Pipeline
pipeline = Pipeline.from_pretrained('hbredin/utter-project-diarization')
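# note: some pretrained pipelines are gated; from_pretrained then needs a
# Hugging Face token (parameter name may vary across pyannote.audio versions):
# pipeline = Pipeline.from_pretrained(
#     'hbredin/utter-project-diarization',
#     use_auth_token='YOUR_HF_TOKEN')  # placeholder, not a real token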
# send it to MPS device (on Apple Silicon)
import torch
mps = torch.device('mps')
pipeline.to(mps)
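# a portable alternative (a sketch, not part of the original snippet):
# pick the best available device at runtime instead of hardcoding MPS
device = torch.device(
    'mps' if torch.backends.mps.is_available()
    else 'cuda' if torch.cuda.is_available()
    else 'cpu')
pipeline.to(device)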
# apply it to the bundled sample file
from pyannote.audio.sample import SAMPLE_FILE
diarization = pipeline(SAMPLE_FILE)
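# the pipeline also accepts a plain path to an audio file,
# e.g. (hypothetical filename, shown for illustration only):
# diarization = pipeline('audio.wav')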
# print output
print(diarization)
# [ 00:00:06.730 --> 00:00:06.747] A speaker90
# [ 00:00:06.747 --> 00:00:07.169] B speaker91
# [ 00:00:07.169 --> 00:00:07.185] C speaker90
# [ 00:00:07.590 --> 00:00:07.624] D speaker90
# [ 00:00:07.624 --> 00:00:08.029] E speaker91
# [ 00:00:08.029 --> 00:00:09.970] F speaker90
# [ 00:00:09.970 --> 00:00:10.982] G speaker91
# [ 00:00:10.459 --> 00:00:14.729] H speaker90
# [ 00:00:14.307 --> 00:00:17.884] I speaker91
# [ 00:00:18.019 --> 00:00:21.512] J 2
# [ 00:00:18.188 --> 00:00:18.407] K speaker91
# [ 00:00:21.765 --> 00:00:28.499] L speaker91
# [ 00:00:27.824 --> 00:00:29.967] M 2
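# the result can also be consumed programmatically; itertracks and
# write_rttm are standard pyannote.core.Annotation methods
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f'{speaker} speaks from {turn.start:.3f}s to {turn.end:.3f}s')
# save the result in RTTM format ('sample.rttm' is an arbitrary filename)
with open('sample.rttm', 'w') as rttm:
    diarization.write_rttm(rttm)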
# compute diarization error rate
from pyannote.metrics.diarization import DiarizationErrorRate
metric = DiarizationErrorRate()
details = metric(SAMPLE_FILE['annotation'], diarization, detailed=True)
print(details)
# {'confusion': 6.2540312500000015,
# 'missed detection': 0.5480625000000003,
# 'correct': 17.547906249999997,
# 'false alarm': 0.4811874999999999,
# 'total': 24.349999999999998,
# 'diarization error rate': 0.2991080595482547}
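# sanity check: the diarization error rate is the sum of the three error
# components divided by the total reference speech duration
der = (details['confusion'] + details['missed detection'] +
       details['false alarm']) / details['total']
print(f'{100 * der:.1f}% DER')  # 29.9% DER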