Spaces:
Runtime error
Runtime error
"""
Speech-to-text module based on Vosk and Whisper for SillyTavern Extras
    - Vosk website: https://alphacephei.com/vosk/
    - Vosk api: https://github.com/alphacep/vosk-api
    - Whisper github: https://github.com/openai/whisper

Authors:
    - Tony Ribeiro (https://github.com/Tony-sama)

Models are saved into the user cache folder, example: C:/Users/toto/.cache/whisper and C:/Users/toto/.cache/vosk

References:
    - Code adapted from:
        - whisper github: https://github.com/openai/whisper
        - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
        - vosk github: https://github.com/alphacep/vosk-api/blob/master/python/example/test_microphone.py
"""
import io
import json
import queue
import sys

import numpy as np
import sounddevice as sd
import soundfile as sf
import vosk
import whisper
from flask import jsonify, abort
from scipy.io.wavfile import write
# Tag placed in front of every console message printed by this module.
DEBUG_PREFIX = "<stt streaming module>"

# Path of the WAV file the microphone recording is dumped to before Whisper reads it back.
RECORDING_FILE_PATH = "stt_test.wav"

# Input device handed to sounddevice when querying/opening the stream.
# NOTE(review): None presumably selects the system default input device - confirm.
device = None

# Model handles; they start unset and are expected to be assigned from outside
# (e.g. with the values returned by load_model) before recording is requested.
vosk_model = None
whisper_model = None
def load_model(file_path=None):
    """
    Build the (whisper, vosk) model pair used by this module.

    file_path: model name or path handed to whisper.load_model; when None,
        the "base.en" whisper model is used. The vosk model is always created
        with lang="en-us", whatever file_path is.
    Returns: tuple (whisper model, vosk model).

    Models are downloaded to the user cache folder, example: C:/Users/toto/.cache/vosk
    """
    whisper_source = "base.en" if file_path is None else file_path
    return (whisper.load_model(whisper_source), vosk.Model(lang="en-us"))
def convert_bytearray_to_wav_ndarray(input_bytearray: bytes, sampling_rate=16000):
    """
    Encode raw samples as WAV in memory and decode them back with soundfile.

    The result is meant to be written to a file for quality-check debugging.

    input_bytearray: buffer whose content is interpreted as int16 samples.
    sampling_rate: sample rate stored in the intermediate WAV data.
    Returns: the data array produced by soundfile.read for that WAV data.
    """
    pcm_samples = np.frombuffer(input_bytearray, dtype=np.int16)

    wav_stream = io.BytesIO()
    write(wav_stream, sampling_rate, pcm_samples)

    # NOTE(review): read() only yields the full WAV if the stream position is
    # back at 0 after write() - presumably scipy rewinds file-like targets; confirm.
    wav_bytes = wav_stream.read()

    decoded, _ = sf.read(io.BytesIO(wav_bytes))
    return decoded
def record_and_transcript():
    """
    Continuously record from the microphone and transcribe the voice.

    Audio blocks are fed to a vosk recognizer, which is only used to detect
    the end of the utterance; the whole recording is then saved to
    RECORDING_FILE_PATH and transcribed again with Whisper, whose text is
    the one returned.

    Returns:
        - "" when the Whisper model has not been initialized;
        - otherwise a Flask JSON response {"transcript": <whisper text>},
          produced once vosk's AcceptWaveform reports a complete utterance.

    On any exception during recording/transcription the error is printed and
    flask.abort(500, ...) is called.
    """
    if whisper_model is None:
        print(DEBUG_PREFIX, "Whisper model not initialized yet.")
        return ""

    q = queue.Queue()
    stream_errors = []

    def callback(indata, frames, time, status):
        """This is called (from a separate thread) for each audio block."""
        if status:
            print(status, file=sys.stderr)
            stream_errors.append(status)
        q.put(bytes(indata))

    try:
        device_info = sd.query_devices(device, "input")
        # soundfile expects an int, sounddevice provides a float:
        samplerate = int(device_info["default_samplerate"])

        print(DEBUG_PREFIX, "Start recording from:", device_info["name"], "with samplerate", samplerate)

        with sd.RawInputStream(samplerate=samplerate, blocksize=8000, device=device, dtype="int16", channels=1, callback=callback):
            rec = vosk.KaldiRecognizer(vosk_model, samplerate)
            full_recording = bytearray()

            while True:
                data = q.get()

                # Errors are collected by the audio callback; surface them here.
                if stream_errors:
                    raise Exception(DEBUG_PREFIX+" Stream errors: "+str(stream_errors))

                full_recording.extend(data)

                if rec.AcceptWaveform(data):
                    # Result() is a JSON document; parse it instead of slicing at
                    # fixed offsets, which breaks if its layout or keys change.
                    transcript = json.loads(rec.Result()).get("text", "")
                    print(DEBUG_PREFIX, "Transcripted from microphone stream (vosk):", transcript)

                    # ----------------------------------
                    # DEBUG: save recording to wav file
                    # ----------------------------------
                    output_file = convert_bytearray_to_wav_ndarray(input_bytearray=full_recording, sampling_rate=samplerate)
                    sf.write(file=RECORDING_FILE_PATH, data=output_file, samplerate=samplerate)
                    print(DEBUG_PREFIX, "Recorded message saved to", RECORDING_FILE_PATH)

                    # Whisper HACK: transcribe the saved file and use that text
                    # in place of the vosk transcript.
                    result = whisper_model.transcribe(RECORDING_FILE_PATH)
                    transcript = result["text"]
                    print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)
                    # ----------------------------------

                    return jsonify({"transcript": transcript})

    except Exception as e:  # No exception observed during test but we never know
        print(e)
        abort(500, DEBUG_PREFIX+" Exception occurs while recording")