federicocosta1989 committed on
Commit
6fccdc4
·
verified ·
1 Parent(s): 99577ba

Update whisper_cs.py (#40)

Browse files

- Update whisper_cs.py (ee5993709babb4400395b6bccd3d2cf4d3152d9e)

Files changed (1) hide show
  1. whisper_cs.py +19 -2
whisper_cs.py CHANGED
@@ -11,6 +11,7 @@ from faster_whisper import WhisperModel
11
  device = 0 if torch.cuda.is_available() else "cpu"
12
  torch_dtype = torch.float32
13
 
 
14
  MODEL_PATH_V2 = "langtech-veu/whisper-timestamped-cs"
15
  MODEL_PATH_V2_FAST = "langtech-veu/faster-whisper-timestamped-cs"
16
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -101,10 +102,18 @@ def post_merge_consecutive_segments_from_text(transcription_text: str) -> str:
101
  return merged_transcription.strip()
102
 
103
  def cleanup_temp_files(*file_paths):
 
 
 
 
 
104
  for path in file_paths:
105
  if path and os.path.exists(path):
 
106
  os.remove(path)
107
 
 
 
108
  '''
109
  try:
110
  faster_model = WhisperModel(
@@ -173,6 +182,9 @@ def transcribe_audio(model, audio_path: str) -> Dict:
173
 
174
  def generate(audio_path, use_v2_fast):
175
 
 
 
 
176
  if use_v2_fast:
177
  split_stereo_channels(audio_path)
178
  left_channel_path = "temp_mono_speaker2.wav"
@@ -206,12 +218,13 @@ def generate(audio_path, use_v2_fast):
206
 
207
  clean_output = ""
208
  for start, end, speaker, text in merged_transcript:
209
- clean_output += f"[{speaker}]: {text}\n"
210
- print('clean_output',clean_output)
211
 
212
  # FIX Seems that post_merge_consecutive_segments_from_text returns an empty string
213
  #clean_output = post_merge_consecutive_segments_from_text(clean_output)
214
  #print('clean_output',clean_output)
 
 
215
 
216
  else:
217
  model = load_whisper_model(MODEL_PATH_V2)
@@ -248,9 +261,13 @@ def generate(audio_path, use_v2_fast):
248
 
249
  clean_output = output.strip()
250
 
 
 
251
  cleanup_temp_files(
252
  "temp_mono_speaker1.wav",
253
  "temp_mono_speaker2.wav"
254
  )
255
 
 
 
256
  return clean_output
 
11
  device = 0 if torch.cuda.is_available() else "cpu"
12
  torch_dtype = torch.float32
13
 
14
+ DEBUG_MODE = True
15
  MODEL_PATH_V2 = "langtech-veu/whisper-timestamped-cs"
16
  MODEL_PATH_V2_FAST = "langtech-veu/faster-whisper-timestamped-cs"
17
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
102
  return merged_transcription.strip()
103
 
104
  def cleanup_temp_files(*file_paths):
105
+
106
+ if DEBUG_MODE: print(f"Entered cleanup_temp_files function...")
107
+
108
+ if DEBUG_MODE: print(f"file_paths: {file_paths}")
109
+
110
  for path in file_paths:
111
  if path and os.path.exists(path):
112
+ if DEBUG_MODE: print(f"Removing path: {path}")
113
  os.remove(path)
114
 
115
+ if DEBUG_MODE: print(f"Exited cleanup_temp_files function.")
116
+
117
  '''
118
  try:
119
  faster_model = WhisperModel(
 
182
 
183
  def generate(audio_path, use_v2_fast):
184
 
185
+ if DEBUG_MODE: print(f"Entering generate function...")
186
+ if DEBUG_MODE: print(f"use_v2_fast: {use_v2_fast}")
187
+
188
  if use_v2_fast:
189
  split_stereo_channels(audio_path)
190
  left_channel_path = "temp_mono_speaker2.wav"
 
218
 
219
  clean_output = ""
220
  for start, end, speaker, text in merged_transcript:
221
+ clean_output += f"[{speaker}]: {text}\n"
 
222
 
223
  # FIX Seems that post_merge_consecutive_segments_from_text returns an empty string
224
  #clean_output = post_merge_consecutive_segments_from_text(clean_output)
225
  #print('clean_output',clean_output)
226
+
227
+ if DEBUG_MODE: print(f"clean_output: {clean_output}")
228
 
229
  else:
230
  model = load_whisper_model(MODEL_PATH_V2)
 
261
 
262
  clean_output = output.strip()
263
 
264
+ if DEBUG_MODE: print(f"Clean output generated.")
265
+
266
  cleanup_temp_files(
267
  "temp_mono_speaker1.wav",
268
  "temp_mono_speaker2.wav"
269
  )
270
 
271
+ if DEBUG_MODE: print(f"Exiting generate function...")
272
+
273
  return clean_output