LAP-DEV committed on
Commit
3a41d39
·
verified ·
1 Parent(s): e7f9f55

Update modules/whisper/whisper_base.py

Browse files
Files changed (1) hide show
  1. modules/whisper/whisper_base.py +37 -9
modules/whisper/whisper_base.py CHANGED
@@ -9,6 +9,7 @@ import numpy as np
9
  from datetime import datetime
10
  from faster_whisper.vad import VadOptions
11
  from dataclasses import astuple
 
12
 
13
  from modules.uvr.music_separator import MusicSeparator
14
  from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
@@ -99,13 +100,10 @@ class WhisperBase(ABC):
99
  elapsed_time: float
100
  elapsed time for running
101
  """
 
 
102
  params = WhisperParameters.as_value(*whisper_params)
103
 
104
- self.cache_parameters(
105
- whisper_params=params,
106
- add_timestamp=add_timestamp
107
- )
108
-
109
  if params.lang is None:
110
  pass
111
  elif params.lang == "Automatic Detection":
@@ -134,12 +132,16 @@ class WhisperBase(ABC):
134
 
135
  if params.uvr_enable_offload:
136
  self.music_separator.offload()
 
137
 
 
 
138
  if params.vad_filter:
139
  # Explicit value set for float('inf') from gr.Number()
140
  if params.max_speech_duration_s is None or params.max_speech_duration_s >= 9999:
141
  params.max_speech_duration_s = float('inf')
142
 
 
143
  vad_options = VadOptions(
144
  threshold=params.threshold,
145
  min_speech_duration_ms=params.min_speech_duration_ms,
@@ -148,31 +150,57 @@ class WhisperBase(ABC):
148
  speech_pad_ms=params.speech_pad_ms
149
  )
150
 
151
- audio, speech_chunks = self.vad.run(
152
  audio=audio,
153
  vad_parameters=vad_options,
154
  progress=progress
155
  )
156
 
 
 
 
 
 
157
  result, elapsed_time = self.transcribe(
158
  audio,
159
  progress,
160
  *astuple(params)
161
  )
 
 
162
 
163
  if params.vad_filter:
164
- result = self.vad.restore_speech_timestamps(
165
  segments=result,
166
  speech_chunks=speech_chunks,
167
  )
 
 
 
 
168
 
169
  if params.is_diarize:
 
170
  result, elapsed_time_diarization = self.diarizer.run(
171
- audio=audio,
172
  use_auth_token=params.hf_token,
173
  transcribed_result=result,
 
174
  )
175
- elapsed_time += elapsed_time_diarization
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  return result, elapsed_time
177
 
178
  def transcribe_file(self,
 
9
  from datetime import datetime
10
  from faster_whisper.vad import VadOptions
11
  from dataclasses import astuple
12
+ from copy import deepcopy
13
 
14
  from modules.uvr.music_separator import MusicSeparator
15
  from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
 
100
  elapsed_time: float
101
  elapsed time for running
102
  """
103
+
104
+ start_time = time.time()
105
  params = WhisperParameters.as_value(*whisper_params)
106
 
 
 
 
 
 
107
  if params.lang is None:
108
  pass
109
  elif params.lang == "Automatic Detection":
 
132
 
133
  if params.uvr_enable_offload:
134
  self.music_separator.offload()
135
+ elapsed_time_bgm_sep = time.time() - start_time
136
 
137
+ origin_audio = deepcopy(audio)
138
+
139
  if params.vad_filter:
140
  # Explicit value set for float('inf') from gr.Number()
141
  if params.max_speech_duration_s is None or params.max_speech_duration_s >= 9999:
142
  params.max_speech_duration_s = float('inf')
143
 
144
+ progress(0, desc="Filtering silent parts from audio...")
145
  vad_options = VadOptions(
146
  threshold=params.threshold,
147
  min_speech_duration_ms=params.min_speech_duration_ms,
 
150
  speech_pad_ms=params.speech_pad_ms
151
  )
152
 
153
+ vad_processed, speech_chunks = self.vad.run(
154
  audio=audio,
155
  vad_parameters=vad_options,
156
  progress=progress
157
  )
158
 
159
+ if vad_processed.size > 0:
160
+ audio = vad_processed
161
+ else:
162
+ vad_params.vad_filter = False
163
+
164
  result, elapsed_time = self.transcribe(
165
  audio,
166
  progress,
167
  *astuple(params)
168
  )
169
+ if params.whisper_enable_offload:
170
+ self.offload()
171
 
172
  if params.vad_filter:
173
+ restored_result = self.vad.restore_speech_timestamps(
174
  segments=result,
175
  speech_chunks=speech_chunks,
176
  )
177
+ if restored_result:
178
+ result = restored_result
179
+ else:
180
+ print("VAD detected no speech segments in the audio.")
181
 
182
  if params.is_diarize:
183
+ progress(0.99, desc="Diarizing speakers...")
184
  result, elapsed_time_diarization = self.diarizer.run(
185
+ audio=origin_audio,
186
  use_auth_token=params.hf_token,
187
  transcribed_result=result,
188
+ device=params.diarization_device
189
  )
190
+ if params.diarization_enable_offload:
191
+ self.diarizer.offload()
192
+
193
+ self.cache_parameters(
194
+ whisper_params=params,
195
+ add_timestamp=add_timestamp
196
+ )
197
+
198
+ if not result:
199
+ print(f"Whisper did not detected any speech segments in the audio.")
200
+ result = list()
201
+
202
+ progress(1.0, desc="Processing done!")
203
+ total_elapsed_time = time.time() - start_time
204
  return result, elapsed_time
205
 
206
  def transcribe_file(self,