from pprint import pprint
from time import time

from transformers import pipeline
from datasets import load_dataset

# config
model_id = "kotoba-tech/kotoba-whisper-v1.0"
generate_kwargs = {"language": "japanese", "task": "transcribe"}

# load model (chunked long-form inference with batched decoding)
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    chunk_length_s=15,
    batch_size=64
)

# load evaluation audio and time the transcription of each sample
dataset = load_dataset("kotoba-tech/kotoba-whisper-eval", split="train")
elapsed = {}
for x in dataset['audio']:
    start = time()
    transcription = pipe(x.copy(), generate_kwargs=generate_kwargs)
    elapsed[x['path']] = time() - start
pprint(elapsed)
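
# --- Optional long-form check (a sketch, not part of the original snippet) ---
# Since chunk_length_s=15 enables chunked long-form inference, one way to exercise
# it is to concatenate several eval clips into a single long input and transcribe
# it in one call. This assumes all clips in kotoba-tech/kotoba-whisper-eval share
# the same sampling rate; the name `long_audio` and the choice of the first 10
# clips are illustrative, not taken from the source.
import numpy as np

clips = dataset[:10]["audio"]
long_audio = {
    "array": np.concatenate([clip["array"] for clip in clips]),
    "sampling_rate": clips[0]["sampling_rate"],
}
start = time()
result = pipe(long_audio, generate_kwargs=generate_kwargs)
print(f"long-form transcription took {time() - start:.1f}s")
pprint(result["text"])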