Update modules/whisper/whisper_base.py
Browse files
modules/whisper/whisper_base.py
CHANGED
@@ -9,6 +9,7 @@ import numpy as np
|
|
9 |
from datetime import datetime
|
10 |
from faster_whisper.vad import VadOptions
|
11 |
from dataclasses import astuple
|
|
|
12 |
|
13 |
from modules.uvr.music_separator import MusicSeparator
|
14 |
from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
|
@@ -99,13 +100,10 @@ class WhisperBase(ABC):
|
|
99 |
elapsed_time: float
|
100 |
elapsed time for running
|
101 |
"""
|
|
|
|
|
102 |
params = WhisperParameters.as_value(*whisper_params)
|
103 |
|
104 |
-
self.cache_parameters(
|
105 |
-
whisper_params=params,
|
106 |
-
add_timestamp=add_timestamp
|
107 |
-
)
|
108 |
-
|
109 |
if params.lang is None:
|
110 |
pass
|
111 |
elif params.lang == "Automatic Detection":
|
@@ -134,12 +132,16 @@ class WhisperBase(ABC):
|
|
134 |
|
135 |
if params.uvr_enable_offload:
|
136 |
self.music_separator.offload()
|
|
|
137 |
|
|
|
|
|
138 |
if params.vad_filter:
|
139 |
# Explicit value set for float('inf') from gr.Number()
|
140 |
if params.max_speech_duration_s is None or params.max_speech_duration_s >= 9999:
|
141 |
params.max_speech_duration_s = float('inf')
|
142 |
|
|
|
143 |
vad_options = VadOptions(
|
144 |
threshold=params.threshold,
|
145 |
min_speech_duration_ms=params.min_speech_duration_ms,
|
@@ -148,31 +150,57 @@ class WhisperBase(ABC):
|
|
148 |
speech_pad_ms=params.speech_pad_ms
|
149 |
)
|
150 |
|
151 |
-
|
152 |
audio=audio,
|
153 |
vad_parameters=vad_options,
|
154 |
progress=progress
|
155 |
)
|
156 |
|
|
|
|
|
|
|
|
|
|
|
157 |
result, elapsed_time = self.transcribe(
|
158 |
audio,
|
159 |
progress,
|
160 |
*astuple(params)
|
161 |
)
|
|
|
|
|
162 |
|
163 |
if params.vad_filter:
|
164 |
-
|
165 |
segments=result,
|
166 |
speech_chunks=speech_chunks,
|
167 |
)
|
|
|
|
|
|
|
|
|
168 |
|
169 |
if params.is_diarize:
|
|
|
170 |
result, elapsed_time_diarization = self.diarizer.run(
|
171 |
-
audio=
|
172 |
use_auth_token=params.hf_token,
|
173 |
transcribed_result=result,
|
|
|
174 |
)
|
175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
return result, elapsed_time
|
177 |
|
178 |
def transcribe_file(self,
|
|
|
9 |
from datetime import datetime
|
10 |
from faster_whisper.vad import VadOptions
|
11 |
from dataclasses import astuple
|
12 |
+
from copy import deepcopy
|
13 |
|
14 |
from modules.uvr.music_separator import MusicSeparator
|
15 |
from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
|
|
|
100 |
elapsed_time: float
|
101 |
elapsed time for running
|
102 |
"""
|
103 |
+
|
104 |
+
start_time = time.time()
|
105 |
params = WhisperParameters.as_value(*whisper_params)
|
106 |
|
|
|
|
|
|
|
|
|
|
|
107 |
if params.lang is None:
|
108 |
pass
|
109 |
elif params.lang == "Automatic Detection":
|
|
|
132 |
|
133 |
if params.uvr_enable_offload:
|
134 |
self.music_separator.offload()
|
135 |
+
elapsed_time_bgm_sep = time.time() - start_time
|
136 |
|
137 |
+
origin_audio = deepcopy(audio)
|
138 |
+
|
139 |
if params.vad_filter:
|
140 |
# Explicit value set for float('inf') from gr.Number()
|
141 |
if params.max_speech_duration_s is None or params.max_speech_duration_s >= 9999:
|
142 |
params.max_speech_duration_s = float('inf')
|
143 |
|
144 |
+
progress(0, desc="Filtering silent parts from audio...")
|
145 |
vad_options = VadOptions(
|
146 |
threshold=params.threshold,
|
147 |
min_speech_duration_ms=params.min_speech_duration_ms,
|
|
|
150 |
speech_pad_ms=params.speech_pad_ms
|
151 |
)
|
152 |
|
153 |
+
vad_processed, speech_chunks = self.vad.run(
|
154 |
audio=audio,
|
155 |
vad_parameters=vad_options,
|
156 |
progress=progress
|
157 |
)
|
158 |
|
159 |
+
if vad_processed.size > 0:
|
160 |
+
audio = vad_processed
|
161 |
+
else:
|
162 |
+
vad_params.vad_filter = False
|
163 |
+
|
164 |
result, elapsed_time = self.transcribe(
|
165 |
audio,
|
166 |
progress,
|
167 |
*astuple(params)
|
168 |
)
|
169 |
+
if params.whisper_enable_offload:
|
170 |
+
self.offload()
|
171 |
|
172 |
if params.vad_filter:
|
173 |
+
restored_result = self.vad.restore_speech_timestamps(
|
174 |
segments=result,
|
175 |
speech_chunks=speech_chunks,
|
176 |
)
|
177 |
+
if restored_result:
|
178 |
+
result = restored_result
|
179 |
+
else:
|
180 |
+
print("VAD detected no speech segments in the audio.")
|
181 |
|
182 |
if params.is_diarize:
|
183 |
+
progress(0.99, desc="Diarizing speakers...")
|
184 |
result, elapsed_time_diarization = self.diarizer.run(
|
185 |
+
audio=origin_audio,
|
186 |
use_auth_token=params.hf_token,
|
187 |
transcribed_result=result,
|
188 |
+
device=params.diarization_device
|
189 |
)
|
190 |
+
if params.diarization_enable_offload:
|
191 |
+
self.diarizer.offload()
|
192 |
+
|
193 |
+
self.cache_parameters(
|
194 |
+
whisper_params=params,
|
195 |
+
add_timestamp=add_timestamp
|
196 |
+
)
|
197 |
+
|
198 |
+
if not result:
|
199 |
+
print(f"Whisper did not detected any speech segments in the audio.")
|
200 |
+
result = list()
|
201 |
+
|
202 |
+
progress(1.0, desc="Processing done!")
|
203 |
+
total_elapsed_time = time.time() - start_time
|
204 |
return result, elapsed_time
|
205 |
|
206 |
def transcribe_file(self,
|