import torch
import nemo.collections.asr as nemo_asr
import time
import argparse
def load_model(model_name, device):
    """Download a pretrained NeMo ASR model and prepare it for inference.

    Args:
        model_name: Hugging Face / NGC identifier of the pretrained model.
        device: torch.device the model should run on.

    Returns:
        The frozen (inference-mode) model, moved to *device*.
    """
    asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name)
    asr_model.freeze()  # inference mode: disable gradient tracking
    return asr_model.to(device)
def transcribe_audio(model, audio_file, batch_size, language_id, decoder_type):
    """Transcribe one audio file with the given decoder and language.

    Args:
        model: A loaded NeMo ASR model.
        audio_file: Path to the audio file to transcribe.
        batch_size: Batch size forwarded to ``model.transcribe``.
        language_id: Language code forwarded to ``model.transcribe``.
        decoder_type: Decoder branch to select (assigned to ``model.cur_decoder``).

    Returns:
        The transcription for *audio_file* (first element of the result list).
    """
    model.cur_decoder = decoder_type
    results = model.transcribe(
        [audio_file], batch_size=batch_size, language_id=language_id
    )
    return results[0]
def measure_execution_time(model, audio_file, batch_size, language_id, decoder_type):
    """Transcribe *audio_file* and measure how long the call takes.

    Fix: uses ``time.perf_counter()`` instead of ``time.time()``.
    ``perf_counter`` is monotonic and high-resolution, so the measured
    duration cannot be distorted (or go negative) by system-clock
    adjustments during the run.

    Args:
        model: A loaded NeMo ASR model.
        audio_file: Path to the audio file to transcribe.
        batch_size: Batch size forwarded to the transcription call.
        language_id: Language code forwarded to the transcription call.
        decoder_type: Decoder branch to select on the model.

    Returns:
        tuple: ``(transcribed_text, execution_time_seconds)``.
    """
    start_time = time.perf_counter()
    transcribed_text = transcribe_audio(
        model, audio_file, batch_size, language_id, decoder_type
    )
    execution_time = time.perf_counter() - start_time
    return transcribed_text, execution_time
def main(device_type):
    """Load the Kannada ASR model, transcribe one file, and report timing.

    Fix: the original printed ``device_type`` (the *requested* device) in
    the timing line, so asking for ``cuda`` on a CUDA-less machine silently
    ran on CPU while reporting "cuda". This version reports the device
    actually used.

    Args:
        device_type: "cpu" or "cuda" (falls back to CPU when CUDA is
            requested but unavailable).
    """
    model_name = "ai4bharat/indicconformer_stt_kn_hybrid_rnnt_large"
    audio_file = 'kannada_query_infer.wav'
    batch_size = 1
    language_id = 'kn'
    decoder_type = "rnnt"

    # Fall back to CPU when CUDA was requested but is not available.
    if device_type == "cuda" and torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    model = load_model(model_name, device)
    transcribed_text, execution_time = measure_execution_time(
        model, audio_file, batch_size, language_id, decoder_type
    )

    # device.type reflects the device actually used, not merely requested.
    print(f"Execution time on {device.type}: {execution_time:.4f} seconds")
    print(f"Transcribed text: {transcribed_text}")
if __name__ == "__main__": | |
parser = argparse.ArgumentParser(description="Transcribe audio using ASR model.") | |
parser.add_argument("--device", type=str, default="cpu", choices=["cpu", "cuda"], help="Device type to use for inference (cpu or cuda).") | |
args = parser.parse_args() | |
main(args.device) |