Spaces: Build error
Split streaming audio into chunks
app.py
CHANGED
@@ -1,5 +1,8 @@
 import datetime
 import os
+
+from whisper.audio import N_SAMPLES, CHUNK_LENGTH
+
 os.system('pip install git+https://github.com/openai/whisper.git')
 import gradio as gr
 import wave
@@ -11,7 +14,7 @@ import torchaudio.functional as F
 LOGGING_FORMAT = '%(asctime)s %(message)s'
 logging.basicConfig(format=LOGGING_FORMAT,level=logging.INFO)
 
-REC_INTERVAL_IN_SECONDS =
+REC_INTERVAL_IN_SECONDS = 4
 
 # tmp dir to store audio files.
 if not os.path.isdir('./tmp/'):
@@ -31,7 +34,7 @@ class WhisperStreaming():
     def transcribe_audio_file(self, wave_file_path):
         waveform, sample_rate = torchaudio.load(wave_file_path)
         resampled_waveform = F.resample(waveform, sample_rate, self.whisper_sample_rate, lowpass_filter_width=6)
-        audio_tmp = whisper.pad_or_trim(resampled_waveform[0])
+        audio_tmp = whisper.pad_or_trim(resampled_waveform[0], length=N_SAMPLES)
         mel = whisper.log_mel_spectrogram(audio_tmp)
         results = self.whisper_model.decode(mel, self.decode_option)
         return results
@@ -64,27 +67,53 @@ whisper_model = WhisperStreaming(model_name='base', language='en', fp16=False)
 
 def transcribe(audio, state={}):
     logging.info(f'Transcribe audio file {audio}')
+    print('=====================')
     logging.info(state)
+    # Whisper only take maximum 30s of audio as input.
+    # And the gradio streaming does not guarantee each callback is 1s, so -2 as buffer
+    # After count reach 28 * n, a new audio file is created.
+    # However the text should not change.
 
     if not state:
-        state['concated_audio'] = audio
-        state['result_text'] = 'Waitting...'
+        state['all_chunk_texts'] = 'Waitting...'
         state['count'] = 0
+        state['chunks'] = {}
+        return state['all_chunk_texts'], state
+
+    chunk = state['count'] // (CHUNK_LENGTH - 2)
+    chunk_offset = state['count'] % (CHUNK_LENGTH - 2)
+
+    if chunk_offset == 0:
+        state['chunks'][chunk] = {}
+        state['chunks'][chunk]['concated_audio'] = audio
+        state['chunks'][chunk]['result_text'] = ''
     else:
-        state['concated_audio'] = concat_multiple_wav_files([state['concated_audio'], audio])
-        state['count'] += 1
+        state['chunks'][chunk]['concated_audio'] = concat_multiple_wav_files([state['chunks'][chunk]['concated_audio'], audio])
 
-
-
-
+    state['count'] += 1
+
+    # Determin if recognizes current chunk.
+    if (chunk_offset + 1) % REC_INTERVAL_IN_SECONDS == 0 and chunk_offset > 0:
+        logging.info(f'start to transcribe chunk: {chunk}, offset: {chunk_offset}')
+        result = whisper_model.transcribe_audio_file(state['chunks'][chunk]['concated_audio'])
         logging.info('complete transcribe.......')
-        state['result_text'] = result.text
-        logging.info('The text is:' + state['result_text'])
+        state['chunks'][chunk]['result_text'] = result.text
+        logging.info('The text is:' + state['chunks'][chunk]['result_text'])
     else:
-        logging.info(f'The
+        logging.info(f'The offset of streaming chunk is {chunk_offset}, and skip speech recognition')
+
+    # Concat result_texts of all chunks
+    result_texts = ''
+
+    for tmp_chunk_idx, tmp_chunk_values in state['chunks'].items():
+        result_texts += tmp_chunk_values['result_text']
+
+    state['all_chunk_texts'] = result_texts
 
-    return state['result_text'], state
+    return state['all_chunk_texts'], state
 
+# Make sure not missing any audio clip.
+assert (CHUNK_LENGTH - 2) % REC_INTERVAL_IN_SECONDS == 0
 
 gr.Interface(fn=transcribe,
              inputs=[gr.Audio(source="microphone", type='filepath', streaming=True), 'state'],