Spaces: Running on Zero
Update whisper_cs.py (#26)
Commit: a413b59f2b8b32465ca45c89d8058e998c60b2af
Co-authored-by: Sarah Solito <[email protected]>

Changed files: whisper_cs.py (+11 -9)

This commit makes split_stereo_channels() format-aware (WAV and MP3), replaces the file-based post_merge_consecutive_segments() with a pure text-to-text helper, drops an unused audio_id variable in generate(), and applies the new merge step to generate()'s output.

whisper_cs.py CHANGED
@@ -25,9 +25,15 @@ def clean_text(input_text):
 
 
 def split_stereo_channels(audio_path):
+    ext = os.path.splitext(audio_path)[1].lower()
+
+    if ext == ".wav":
+        audio = AudioSegment.from_wav(audio_path)
+    elif ext == ".mp3":
+        audio = AudioSegment.from_file(audio_path, format="mp3")
+    else:
+        raise ValueError(f"Unsupported file format: {audio_path}")
 
-    audio = AudioSegment.from_wav(audio_path)
-
     channels = audio.split_to_mono()
     if len(channels) != 2:
         raise ValueError(f"Audio {audio_path} does not have 2 channels.")
@@ -127,10 +133,8 @@ def post_process_transcription(transcription, max_repeats=2):
 
     return cleaned_transcription
 
-def post_merge_consecutive_segments(input_file, output_file): #check
-    with open(input_file, "r") as f:
-        transcription_text = f.read()
 
+def post_merge_consecutive_segments_from_text(transcription_text: str) -> str:
     segments = re.split(r'(\[SPEAKER_\d{2}\])', transcription_text)
     merged_transcription = ''
     current_speaker = None
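A note on the re.split call that both versions share: the capturing group keeps each matched speaker tag in the result list, so tags and the text that follows them alternate. A quick illustration with invented sample text:

```python
import re

text = "[SPEAKER_00] hola [SPEAKER_00] bon dia [SPEAKER_01] adeu"
print(re.split(r'(\[SPEAKER_\d{2}\])', text))
# ['', '[SPEAKER_00]', ' hola ', '[SPEAKER_00]', ' bon dia ', '[SPEAKER_01]', ' adeu']
```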
@@ -153,8 +157,7 @@ def post_merge_consecutive_segments(input_file, output_file): #check
     if current_speaker is not None:
         merged_transcription += f'[SPEAKER_{current_speaker}] {" ".join(current_segment)}\n'
 
-
-        f.write(merged_transcription.strip())
+    return merged_transcription.strip()
 
 def cleanup_temp_files(*file_paths):
     for path in file_paths:
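The diff only shows the head and tail of the renamed function; the merging loop in between is unchanged and therefore not displayed. Below is a sketch of the complete helper, where the loop body is a plausible reconstruction from the visible fragments rather than the file's actual code:

```python
import re

def post_merge_consecutive_segments_from_text(transcription_text: str) -> str:
    # Sketch: the loop body is not shown in the diff and is reconstructed
    # from the visible fragments (split on speaker tags, accumulate text,
    # flush whenever the speaker changes).
    segments = re.split(r'(\[SPEAKER_\d{2}\])', transcription_text)
    merged_transcription = ''
    current_speaker = None
    current_segment = []

    for part in segments:
        m = re.match(r'\[SPEAKER_(\d{2})\]', part)
        if m:
            speaker = m.group(1)
            if speaker != current_speaker:
                # Speaker changed: flush the accumulated text.
                if current_speaker is not None:
                    merged_transcription += (
                        f'[SPEAKER_{current_speaker}] {" ".join(current_segment)}\n'
                    )
                current_speaker = speaker
                current_segment = []
        elif part.strip():
            current_segment.append(part.strip())

    # Flush the final speaker's text (this tail is visible in the diff).
    if current_speaker is not None:
        merged_transcription += f'[SPEAKER_{current_speaker}] {" ".join(current_segment)}\n'

    return merged_transcription.strip()
```

The change from (input_file, output_file) to str -> str means the old version wrote its result to disk via f.write(...) while the new one simply returns the merged string, so callers like generate() can keep everything in memory.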
@@ -262,8 +265,6 @@ def generate(audio_path, use_v2):
     model = load_whisper_model(MODEL_PATH_2)
     split_stereo_channels(audio_path)
 
-    audio_id = os.path.splitext(os.path.basename(audio_path))[0]
-
     left_channel_path = "temp_mono_speaker2.wav"
     right_channel_path = "temp_mono_speaker1.wav"
 
@@ -309,6 +310,7 @@ def generate(audio_path, use_v2):
     clean_output = ""
     for line in aligned_text:
         clean_output += f"{line}\n"
+    clean_output = post_merge_consecutive_segments_from_text(clean_output)
     cleanup_temp_files(mono_audio_path,tmp_full_path)
 
     cleanup_temp_files(
|