import { getPreviewString, saveTtsProviderSettings } from './index.js';

export { TtsWebuiProvider };

class TtsWebuiProvider {
    settings;
    voices = [];
    separator = ' . ';
    audioElement = document.createElement('audio');
    audioContext = null;
    audioWorkletNode = null;
    currentVolume = 1.0; // Track current volume

    defaultSettings = {
        voiceMap: {},
        model: 'chatterbox',
        speed: 1,
        volume: 1.0,
        available_voices: [''],
        provider_endpoint: 'http://127.0.0.1:7778/v1/audio/speech',
        streaming: true,
        stream_chunk_size: 100,
        desired_length: 80,
        max_length: 200,
        halve_first_chunk: true,
        exaggeration: 0.5,
        cfg_weight: 0.5,
        temperature: 0.8,
        device: 'auto',
        dtype: 'float32',
        cpu_offload: false,
        chunked: true,
        cache_voice: false,
        tokens_per_slice: 1000,
        remove_milliseconds: 45,
        remove_milliseconds_start: 25,
        chunk_overlap_method: 'zero',
        seed: -1,
    };

    get settingsHtml() {
        let html = `

        TTS WebUI Settings

        Generation Settings

        Chunking

        Model

        Streaming (Advanced Settings)
        `;
        return html;
    }

    async loadSettings(settings) {
        // Populate Provider UI given input settings
        if (Object.keys(settings).length == 0) {
            console.info('Using default TTS Provider settings');
        }

        // Only accept keys defined in defaultSettings
        this.settings = this.defaultSettings;

        for (const key in settings) {
            if (key in this.settings) {
                this.settings[key] = settings[key];
            } else {
                throw `Invalid setting passed to TTS Provider: ${key}`;
            }
        }

        $('#tts_webui_endpoint').val(this.settings.provider_endpoint);
        $('#tts_webui_endpoint').on('input', () => { this.onSettingsChange(); });

        $('#tts_webui_model').val(this.settings.model);
        $('#tts_webui_model').on('input', () => { this.onSettingsChange(); });

        $('#tts_webui_voices').val(this.settings.available_voices.join());
        $('#tts_webui_voices').on('input', () => { this.onSettingsChange(); });

        $('#tts_webui_streaming').prop('checked', this.settings.streaming);
        $('#tts_webui_streaming').on('change', () => { this.onSettingsChange(); });

        $('#tts_webui_volume').val(this.settings.volume);
        $('#tts_webui_volume').on('input', () => { this.onSettingsChange(); });

        $('#tts_webui_stream_chunk_size').val(this.settings.stream_chunk_size);
        $('#tts_webui_stream_chunk_size').on('input', () => { this.onSettingsChange(); });

        $('#tts_webui_desired_length').val(this.settings.desired_length);
        $('#tts_webui_desired_length').on('input', () => { this.onSettingsChange(); });

        $('#tts_webui_max_length').val(this.settings.max_length);
        $('#tts_webui_max_length').on('input', () => { this.onSettingsChange(); });

        $('#tts_webui_halve_first_chunk').prop('checked', this.settings.halve_first_chunk);
        $('#tts_webui_halve_first_chunk').on('change', () => { this.onSettingsChange(); });

        $('#tts_webui_exaggeration').val(this.settings.exaggeration);
        $('#tts_webui_exaggeration').on('input', () => { this.onSettingsChange(); });

        $('#tts_webui_cfg_weight').val(this.settings.cfg_weight);
        $('#tts_webui_cfg_weight').on('input', () => { this.onSettingsChange(); });

        $('#tts_webui_temperature').val(this.settings.temperature);
        $('#tts_webui_temperature').on('input', () => { this.onSettingsChange(); });

        $('#tts_webui_device').val(this.settings.device);
        $('#tts_webui_device').on('change', () => { this.onSettingsChange(); });

        $('#tts_webui_dtype').val(this.settings.dtype);
        $('#tts_webui_dtype').on('change', () => { this.onSettingsChange(); });

        $('#tts_webui_cpu_offload').prop('checked', this.settings.cpu_offload);
        $('#tts_webui_cpu_offload').on('change', () => { this.onSettingsChange(); });

        $('#tts_webui_chunked').prop('checked', this.settings.chunked);
        $('#tts_webui_chunked').on('change', () => { this.onSettingsChange(); });

        $('#tts_webui_tokens_per_slice').val(this.settings.tokens_per_slice);
        $('#tts_webui_tokens_per_slice').on('input', () => { this.onSettingsChange(); });

        $('#tts_webui_remove_milliseconds').val(this.settings.remove_milliseconds);
        $('#tts_webui_remove_milliseconds').on('input', () => { this.onSettingsChange(); });

        $('#tts_webui_remove_milliseconds_start').val(this.settings.remove_milliseconds_start);
        $('#tts_webui_remove_milliseconds_start').on('input', () => { this.onSettingsChange(); });

        $('#tts_webui_chunk_overlap_method').val(this.settings.chunk_overlap_method);
        $('#tts_webui_chunk_overlap_method').on('change', () => { this.onSettingsChange(); });

        $('#tts_webui_seed').val(this.settings.seed);
        $('#tts_webui_seed').on('input', () => { this.onSettingsChange(); });

        // Update output labels
        $('#tts_webui_volume_output').text(this.settings.volume);
        $('#tts_webui_desired_length_output').text(this.settings.desired_length);
        $('#tts_webui_max_length_output').text(this.settings.max_length);
        $('#tts_webui_exaggeration_output').text(this.settings.exaggeration);
        $('#tts_webui_cfg_weight_output').text(this.settings.cfg_weight);
        $('#tts_webui_temperature_output').text(this.settings.temperature);
        $('#tts_webui_tokens_per_slice_output').text(this.settings.tokens_per_slice);
        $('#tts_webui_remove_milliseconds_output').text(this.settings.remove_milliseconds);
        $('#tts_webui_remove_milliseconds_start_output').text(this.settings.remove_milliseconds_start);

        await this.checkReady();

        console.debug('TTS WebUI: Settings loaded');
    }

    onSettingsChange() {
        // Update dynamically
        this.settings.provider_endpoint = String($('#tts_webui_endpoint').val());
        this.settings.model = String($('#tts_webui_model').val());
        this.settings.available_voices = String($('#tts_webui_voices').val()).split(',');
        this.settings.volume = Number($('#tts_webui_volume').val());
        this.settings.streaming = $('#tts_webui_streaming').is(':checked');
        this.settings.stream_chunk_size = Number($('#tts_webui_stream_chunk_size').val());
        this.settings.desired_length = Number($('#tts_webui_desired_length').val());
        this.settings.max_length = Number($('#tts_webui_max_length').val());
        this.settings.halve_first_chunk = $('#tts_webui_halve_first_chunk').is(':checked');
        this.settings.exaggeration = Number($('#tts_webui_exaggeration').val());
        this.settings.cfg_weight = Number($('#tts_webui_cfg_weight').val());
        this.settings.temperature = Number($('#tts_webui_temperature').val());
        this.settings.device = String($('#tts_webui_device').val());
        this.settings.dtype = String($('#tts_webui_dtype').val());
        this.settings.cpu_offload = $('#tts_webui_cpu_offload').is(':checked');
        this.settings.chunked = $('#tts_webui_chunked').is(':checked');
        this.settings.tokens_per_slice = Number($('#tts_webui_tokens_per_slice').val());
        this.settings.remove_milliseconds = Number($('#tts_webui_remove_milliseconds').val());
        this.settings.remove_milliseconds_start = Number($('#tts_webui_remove_milliseconds_start').val());
        this.settings.chunk_overlap_method = String($('#tts_webui_chunk_overlap_method').val());
        this.settings.seed = parseInt(String($('#tts_webui_seed').val()), 10) || -1;

        // Apply volume change immediately
        this.setVolume(this.settings.volume);

        // Update output labels
        $('#tts_webui_volume_output').text(this.settings.volume);
        $('#tts_webui_desired_length_output').text(this.settings.desired_length);
        $('#tts_webui_max_length_output').text(this.settings.max_length);
        $('#tts_webui_exaggeration_output').text(this.settings.exaggeration);
        $('#tts_webui_cfg_weight_output').text(this.settings.cfg_weight);
        $('#tts_webui_temperature_output').text(this.settings.temperature);
        $('#tts_webui_tokens_per_slice_output').text(this.settings.tokens_per_slice);
        $('#tts_webui_remove_milliseconds_output').text(this.settings.remove_milliseconds);
        $('#tts_webui_remove_milliseconds_start_output').text(this.settings.remove_milliseconds_start);

        saveTtsProviderSettings();
    }

    async checkReady() {
        await this.fetchTtsVoiceObjects();
    }

    async onRefreshClick() {
        await this.fetchTtsVoiceObjects();
        console.info('TTS voices refreshed');
    }

    async getVoice(voiceName) {
        if (this.voices.length == 0) {
            this.voices = await this.fetchTtsVoiceObjects();
        }
        const match = this.voices.find(voice => voice.name == voiceName);
        if (!match) {
            throw `TTS Voice name ${voiceName} not found`;
        }
        return match;
    }
    async generateTts(text, voiceId) {
        const response = await this.fetchTtsGeneration(text, voiceId);
        if (this.settings.streaming) {
            // Stream audio in real-time
            await this.processStreamingAudio(response);
            // Return empty string since audio is already played via AudioWorklet
            return '';
        }
        return response;
    }

    async fetchTtsVoiceObjects() {
        // Try to fetch voices from the provider endpoint
        try {
            const voicesEndpoint = this.settings.provider_endpoint.replace('/speech', '/voices/' + this.settings.model);
            const response = await fetch(voicesEndpoint);
            if (!response.ok) {
                throw new Error(`HTTP ${response.status}`);
            }
            const responseJson = await response.json();
            console.info('Discovered voices from provider:', responseJson);
            this.voices = responseJson.voices.map(({ value, label }) => ({
                name: label,
                voice_id: value,
                lang: 'en-US',
            }));
            return this.voices;
        } catch (error) {
            console.warn('Voice discovery failed, using configured voices:', error);
        }

        // Fallback to configured voices
        this.voices = this.settings.available_voices.map(name => ({
            name,
            voice_id: name,
            lang: 'en-US',
        }));
        return this.voices;
    }

    async initAudioWorklet(wavSampleRate) {
        this.audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: wavSampleRate });

        // Load the PCM processor from separate file
        const processorUrl = './scripts/extensions/tts/lib/pcm-processor.js';
        await this.audioContext.audioWorklet.addModule(processorUrl);

        this.audioWorkletNode = new AudioWorkletNode(this.audioContext, 'pcm-processor');
        this.audioWorkletNode.connect(this.audioContext.destination);
    }

    parseWavHeader(buffer) {
        const view = new DataView(buffer);
        // Sample rate is at bytes 24-27 (little endian)
        const sampleRate = view.getUint32(24, true);
        // Number of channels is at bytes 22-23 (little endian)
        const channels = view.getUint16(22, true);
        // Bits per sample is at bytes 34-35 (little endian)
        const bitsPerSample = view.getUint16(34, true);
        return { sampleRate, channels, bitsPerSample };
    }

    async processStreamingAudio(response) {
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }

        const reader = response.body.getReader();
        let headerParsed = false;
        let wavInfo = null;

        const processStream = async ({ done, value }) => {
            if (done) {
                return;
            }

            if (!headerParsed) {
                // Parse WAV header to get sample rate
                wavInfo = this.parseWavHeader(value.buffer);
                console.log('WAV Info:', wavInfo);

                // Initialize AudioWorklet with correct sample rate
                await this.initAudioWorklet(wavInfo.sampleRate);

                // Skip WAV header (first 44 bytes typically)
                const pcmData = value.slice(44);
                this.audioWorkletNode.port.postMessage({ pcmData });
                headerParsed = true;
                const next = await reader.read();
                return processStream(next);
            }

            // Send PCM data to AudioWorklet for immediate playback
            this.audioWorkletNode.port.postMessage({ pcmData: value });
            const next = await reader.read();
            return processStream(next);
        };

        const firstChunk = await reader.read();
        await processStream(firstChunk);
    }

    async previewTtsVoice(voiceId) {
        this.audioElement.pause();
        this.audioElement.currentTime = 0;

        const text = getPreviewString('en-US');
        const response = await this.fetchTtsGeneration(text, voiceId);

        if (this.settings.streaming) {
            // Use shared streaming method
            await this.processStreamingAudio(response);
        } else {
            // For non-streaming, response is a fetch Response object
            if (!response.ok) {
                throw new Error(`HTTP ${response.status}`);
            }
            const audio = await response.blob();
            const url = URL.createObjectURL(audio);
            this.audioElement.src = url;
            this.audioElement.play();
            this.audioElement.onended = () => URL.revokeObjectURL(url);
        }
    }
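
    // Illustrative request body built by fetchTtsGeneration() below, shown with the
    // default settings from defaultSettings. The <voice_id> and <text> placeholders are
    // stand-ins, not real values:
    //
    // {
    //     "model": "chatterbox",
    //     "voice": "<voice_id>",
    //     "input": "<text>",
    //     "response_format": "wav",
    //     "speed": 1,
    //     "stream": true,
    //     "params": {
    //         "desired_length": 80, "max_length": 200, "halve_first_chunk": true,
    //         "exaggeration": 0.5, "cfg_weight": 0.5, "temperature": 0.8,
    //         "device": "auto", "dtype": "float32", "cpu_offload": false,
    //         "chunked": true, "cache_voice": false, "tokens_per_slice": 1000,
    //         "remove_milliseconds": 45, "remove_milliseconds_start": 25,
    //         "chunk_overlap_method": "zero", "seed": -1
    //     }
    // }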
    async fetchTtsGeneration(inputText, voiceId) {
        console.info(`Generating new TTS for voice_id ${voiceId}`);
        const settings = this.settings;
        const streaming = settings.streaming;

        const chatterboxParams = [
            'desired_length',
            'max_length',
            'halve_first_chunk',
            'exaggeration',
            'cfg_weight',
            'temperature',
            'device',
            'dtype',
            'cpu_offload',
            'chunked',
            'cache_voice',
            'tokens_per_slice',
            'remove_milliseconds',
            'remove_milliseconds_start',
            'chunk_overlap_method',
            'seed',
        ];

        const getParams = settings => Object.fromEntries(
            Object.entries(settings).filter(([key]) => chatterboxParams.includes(key)),
        );

        const requestBody = {
            model: settings.model,
            voice: voiceId,
            input: inputText,
            response_format: 'wav',
            speed: settings.speed,
            stream: streaming,
            params: getParams(settings),
        };

        const headers = {
            'Content-Type': 'application/json',
        };
        if (streaming) {
            headers['Cache-Control'] = 'no-cache';
        }

        const response = await fetch(settings.provider_endpoint, {
            method: 'POST',
            headers,
            body: JSON.stringify(requestBody),
        });

        if (!response.ok) {
            toastr.error(response.statusText, 'TTS Generation Failed');
            throw new Error(`HTTP ${response.status}: ${await response.text()}`);
        }

        return response;
    }

    setVolume(volume) {
        // Clamp volume between 0.0 and 2.0 (0% to 200%)
        this.currentVolume = Math.max(0, Math.min(2.0, volume));

        // Set volume for regular audio element (non-streaming)
        this.audioElement.volume = Math.min(this.currentVolume, 1.0); // HTML audio element max is 1.0

        // Set volume for AudioWorklet (streaming)
        if (this.audioWorkletNode) {
            this.audioWorkletNode.port.postMessage({ volume: this.currentVolume });
        }
    }
}
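
/*
 * For reference, a minimal sketch of the worklet that initAudioWorklet() loads from
 * lib/pcm-processor.js. This is an assumption based only on the messages this provider
 * posts to the node ({ pcmData } chunks of 16-bit little-endian PCM and { volume });
 * the real pcm-processor.js is a separate file and may differ.
 *
 * class PcmProcessor extends AudioWorkletProcessor {
 *     constructor() {
 *         super();
 *         this.queue = [];   // decoded Float32 samples awaiting playback
 *         this.volume = 1.0;
 *         this.port.onmessage = ({ data }) => {
 *             if (data.volume !== undefined) this.volume = data.volume;
 *             if (data.pcmData) {
 *                 // Interpret incoming bytes as 16-bit LE PCM and scale to [-1, 1]
 *                 const bytes = new Uint8Array(data.pcmData);
 *                 const samples = new Int16Array(bytes.buffer, 0, Math.floor(bytes.byteLength / 2));
 *                 for (const s of samples) this.queue.push(s / 32768);
 *             }
 *         };
 *     }
 *
 *     process(inputs, outputs) {
 *         const channel = outputs[0][0];
 *         for (let i = 0; i < channel.length; i++) {
 *             channel[i] = (this.queue.length ? this.queue.shift() : 0) * this.volume;
 *         }
 *         return true; // keep the processor alive between render quanta
 *     }
 * }
 *
 * registerProcessor('pcm-processor', PcmProcessor);
 */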