Spaces:

nvidia
/

parakeet-tdt_ctc-1.1b

Running on T4

nithinraok committed on Jun 6, 2024

Commit

ea18850

verified ·

1 Parent(s): 39684bb

Update nemo_align.py

Files changed (1) hide show

nemo_align.py CHANGED Viewed

@@ -440,8 +440,15 @@ def align_tdt_to_ctc_timestamps(tdt_txt, model, audio_filepath):
         model.change_decoding_strategy(decoder_type="ctc")
     else:
         raise ValueError("Currently supporting hybrid models")
-    with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16):
         with torch.inference_mode():
             hypotheses = model.transcribe([audio_filepath], return_hypotheses=True, batch_size=1)
@@ -498,7 +505,7 @@ def align_tdt_to_ctc_timestamps(tdt_txt, model, audio_filepath):
             model.preprocessor.featurizer.hop_length * model_downsample_factor / model.cfg.preprocessor.sample_rate
         )
-    alignments_batch = viterbi_decoding(log_probs_batch, y_batch, T_batch, U_batch, torch.device('cuda'))
     utt_obj = add_t_start_end_to_utt_obj(utt_obj, alignments_batch[0], output_timestep_duration)

         model.change_decoding_strategy(decoder_type="ctc")
     else:
         raise ValueError("Currently supporting hybrid models")
+    if torch.cuda.is_available():
+        enable = True
+        viterbi_device = torch.device('cuda')
+    else:
+        enable = False
+        viterbi_device = torch.device('cpu')
+    with torch.cuda.amp.autocast(enabled=enable, dtype=torch.bfloat16):
         with torch.inference_mode():
             hypotheses = model.transcribe([audio_filepath], return_hypotheses=True, batch_size=1)
             model.preprocessor.featurizer.hop_length * model_downsample_factor / model.cfg.preprocessor.sample_rate
         )
+    alignments_batch = viterbi_decoding(log_probs_batch, y_batch, T_batch, U_batch, viterbi_device)
     utt_obj = add_t_start_end_to_utt_obj(utt_obj, alignments_batch[0], output_timestep_duration)