from pprint import pprint
from time import perf_counter

from datasets import load_dataset
from transformers import pipeline
# Benchmark script: transcribe every sample of the evaluation dataset with the
# ASR pipeline and report the wall-clock time spent on each sample.

# config
model_id = "kotoba-tech/kotoba-whisper-v1.0"
generate_kwargs = {"language": "japanese", "task": "transcribe"}

# load model
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    chunk_length_s=15,
    batch_size=64,
)

# load sample audio (each instance is transcribed and timed separately)
dataset = load_dataset("kotoba-tech/kotoba-whisper-eval", split="train")

# Maps each sample's 'path' value to the seconds spent in the pipeline call.
elapsed = {}
for audio in dataset["audio"]:
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments.
    start = perf_counter()
    # A copy is passed so the pipeline works on its own dict rather than the
    # dataset's; presumably the pipeline modifies its input in place, and
    # audio['path'] is still needed afterwards -- TODO confirm.
    transcription = pipe(audio.copy(), generate_kwargs=generate_kwargs)
    elapsed[audio["path"]] = perf_counter() - start

pprint(elapsed)