<script lang="ts">
	/**
	 * Admin audio settings form.
	 *
	 * Loads the speech-to-text (STT) and text-to-speech (TTS) configuration via
	 * `getAudioConfig` on mount, lets the user edit it, and writes it back with
	 * `updateAudioConfig`. After a successful update it calls `saveHandler` and
	 * refreshes the `config` store; a form submit additionally dispatches `save`.
	 */
	import { toast } from 'svelte-sonner';
	import { createEventDispatcher, onMount, onDestroy, getContext } from 'svelte';

	import { getBackendConfig } from '$lib/apis';
	import {
		getAudioConfig,
		updateAudioConfig,
		getModels as _getModels,
		getVoices as _getVoices
	} from '$lib/apis/audio';
	import { config } from '$lib/stores';

	import SensitiveInput from '$lib/components/common/SensitiveInput.svelte';

	import { TTS_RESPONSE_SPLIT } from '$lib/types';

	import type { Writable } from 'svelte/store';
	import type { i18n as i18nType } from 'i18next';

	const dispatch = createEventDispatcher();
	const i18n = getContext<Writable<i18nType>>('i18n');

	/** Callback invoked after the configuration was updated successfully. */
	export let saveHandler: () => void;

	// Audio
	let TTS_OPENAI_API_BASE_URL = '';
	let TTS_OPENAI_API_KEY = '';
	let TTS_API_KEY = '';
	let TTS_ENGINE = '';
	let TTS_MODEL = '';
	let TTS_VOICE = '';
	let TTS_SPLIT_ON: TTS_RESPONSE_SPLIT = TTS_RESPONSE_SPLIT.PUNCTUATION;
	let TTS_AZURE_SPEECH_REGION = '';
	let TTS_AZURE_SPEECH_OUTPUT_FORMAT = '';

	let STT_OPENAI_API_BASE_URL = '';
	let STT_OPENAI_API_KEY = '';
	let STT_ENGINE = '';
	let STT_MODEL = '';
	let STT_WHISPER_MODEL = '';

	let STT_WHISPER_MODEL_LOADING = false;

	/**
	 * A selectable voice. The template reads `voiceURI` for browser voices
	 * (engine `''`) and `id` for voices returned by `_getVoices`.
	 * NOTE(review): the backend voice shape is inferred from the template's use
	 * of `voice.id` / `voice.name` — confirm against `$lib/apis/audio`.
	 */
	type VoiceOption = { name: string; id?: string; voiceURI?: string };

	let voices: VoiceOption[] = [];
	let models: Awaited<ReturnType<typeof _getModels>>['models'] = [];

	// Handle of the interval that polls the browser for voices, if one is running.
	let voicePollTimer: ReturnType<typeof setInterval> | undefined;

	/** Returns a copy of `list` sorted by name using the active UI language. */
	const sortVoicesByName = (list: VoiceOption[]): VoiceOption[] =>
		[...list].sort((a, b) => a.name.localeCompare(b.name, $i18n.resolvedLanguage));

	/** Stops the browser-voice polling interval if it is running. */
	const stopVoicePolling = () => {
		if (voicePollTimer !== undefined) {
			clearInterval(voicePollTimer);
			voicePollTimer = undefined;
		}
	};

	/** Copies the browser's voices into `voices`; returns false while none are available. */
	const loadBrowserVoices = (): boolean => {
		const available = speechSynthesis.getVoices();
		if (available.length === 0) {
			return false;
		}
		voices = sortVoicesByName(available);
		return true;
	};

	/** Refreshes `models`: empty for the Web API engine, otherwise fetched via `_getModels`. */
	const getModels = async () => {
		if (TTS_ENGINE === '') {
			models = [];
			return;
		}

		const res = await _getModels(localStorage.token).catch((e) => {
			toast.error(e);
		});

		if (res) {
			models = res.models;
		}
	};

	/**
	 * Refreshes `voices` for the current TTS engine.
	 *
	 * For the Web API engine (`''`) the voices come from `speechSynthesis`; the
	 * list can be empty at first, so it is polled every 100 ms until populated.
	 * For every other engine the voices are fetched via `_getVoices`.
	 */
	const getVoices = async () => {
		// A poll left over from a previous call must not overwrite the new result.
		stopVoicePolling();

		if (TTS_ENGINE === '') {
			voices = [];

			if (typeof speechSynthesis === 'undefined') {
				// No Web Speech API in this environment: nothing to list.
				return;
			}

			if (!loadBrowserVoices()) {
				voicePollTimer = setInterval(() => {
					if (loadBrowserVoices()) {
						stopVoicePolling();
					}
				}, 100);
			}
			return;
		}

		const res = await _getVoices(localStorage.token).catch((e) => {
			toast.error(e);
		});

		if (res) {
			voices = sortVoicesByName(res.voices ?? []);
		}
	};

	/**
	 * Sends the current form state to `updateAudioConfig`. When the call returns
	 * a truthy result, notifies the parent through `saveHandler` and refreshes
	 * the `config` store from `getBackendConfig`.
	 */
	const updateConfigHandler = async () => {
		const res = await updateAudioConfig(localStorage.token, {
			tts: {
				OPENAI_API_BASE_URL: TTS_OPENAI_API_BASE_URL,
				OPENAI_API_KEY: TTS_OPENAI_API_KEY,
				API_KEY: TTS_API_KEY,
				ENGINE: TTS_ENGINE,
				MODEL: TTS_MODEL,
				VOICE: TTS_VOICE,
				SPLIT_ON: TTS_SPLIT_ON,
				AZURE_SPEECH_REGION: TTS_AZURE_SPEECH_REGION,
				AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT
			},
			stt: {
				OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL,
				OPENAI_API_KEY: STT_OPENAI_API_KEY,
				ENGINE: STT_ENGINE,
				MODEL: STT_MODEL,
				WHISPER_MODEL: STT_WHISPER_MODEL
			}
		});

		if (res) {
			saveHandler();
			config.set(await getBackendConfig());
		}
	};

	/** Saves the configuration while showing the spinner on the whisper-model button. */
	const sttModelUpdateHandler = async () => {
		STT_WHISPER_MODEL_LOADING = true;
		try {
			await updateConfigHandler();
		} finally {
			// Always re-enable the button, even when the update rejects.
			STT_WHISPER_MODEL_LOADING = false;
		}
	};

	onMount(async () => {
		// The response carries API keys, so it is deliberately not logged.
		const res = await getAudioConfig(localStorage.token).catch((e) => {
			toast.error(e);
		});

		if (res) {
			TTS_OPENAI_API_BASE_URL = res.tts.OPENAI_API_BASE_URL;
			TTS_OPENAI_API_KEY = res.tts.OPENAI_API_KEY;
			TTS_API_KEY = res.tts.API_KEY;

			TTS_ENGINE = res.tts.ENGINE;
			TTS_MODEL = res.tts.MODEL;
			TTS_VOICE = res.tts.VOICE;

			// `||` on purpose: an empty string also falls back to the default.
			TTS_SPLIT_ON = res.tts.SPLIT_ON || TTS_RESPONSE_SPLIT.PUNCTUATION;

			TTS_AZURE_SPEECH_OUTPUT_FORMAT = res.tts.AZURE_SPEECH_OUTPUT_FORMAT;
			TTS_AZURE_SPEECH_REGION = res.tts.AZURE_SPEECH_REGION;

			STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL;
			STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY;

			STT_ENGINE = res.stt.ENGINE;
			STT_MODEL = res.stt.MODEL;
			STT_WHISPER_MODEL = res.stt.WHISPER_MODEL;
		}

		// The two lists are independent of each other, so load them together.
		await Promise.all([getVoices(), getModels()]);
	});

	// Do not keep polling the browser for voices after the component is gone.
	onDestroy(stopVoicePolling);
</script>

<form
	class="flex flex-col h-full justify-between space-y-3 text-sm"
	on:submit|preventDefault={async () => {
		await updateConfigHandler();
		dispatch('save');
	}}
>
	<div class=" space-y-3 overflow-y-scroll scrollbar-hidden h-full">
		<div class="flex flex-col gap-3">
			<!-- Speech-to-text settings -->
			<div>
				<div class=" mb-1 text-sm font-medium">{$i18n.t('STT Settings')}</div>

				<div class=" py-0.5 flex w-full justify-between">
					<div class=" self-center text-xs font-medium">{$i18n.t('Speech-to-Text Engine')}</div>
					<div class="flex items-center relative">
						<select
							class="dark:bg-gray-900 cursor-pointer w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
							bind:value={STT_ENGINE}
							placeholder="Select an engine"
						>
							<option value="">{$i18n.t('Whisper (Local)')}</option>
							<option value="openai">OpenAI</option>
							<option value="web">{$i18n.t('Web API')}</option>
						</select>
					</div>
				</div>

				{#if STT_ENGINE === 'openai'}
					<div>
						<div class="mt-1 flex gap-2 mb-1">
							<input
								class="flex-1 w-full bg-transparent outline-none"
								placeholder={$i18n.t('API Base URL')}
								bind:value={STT_OPENAI_API_BASE_URL}
								required
							/>

							<SensitiveInput placeholder={$i18n.t('API Key')} bind:value={STT_OPENAI_API_KEY} />
						</div>
					</div>

					<hr class=" dark:border-gray-850 my-2" />

					<div>
						<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>
						<div class="flex w-full">
							<div class="flex-1">
								<!-- The id is STT-specific: the TTS section can render a "model-list" at the same time. -->
								<input
									list="stt-model-list"
									class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
									bind:value={STT_MODEL}
									placeholder="Select a model"
								/>

								<datalist id="stt-model-list">
									<option value="whisper-1" />
								</datalist>
							</div>
						</div>
					</div>
				{:else if STT_ENGINE === ''}
					<div>
						<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>

						<div class="flex w-full">
							<div class="flex-1 mr-2">
								<input
									class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
									placeholder={$i18n.t('Set whisper model')}
									bind:value={STT_WHISPER_MODEL}
								/>
							</div>

							<!-- type="button": without it this button would also submit the form and save twice. -->
							<button
								type="button"
								class="px-2.5 bg-gray-50 hover:bg-gray-200 text-gray-800 dark:bg-gray-850 dark:hover:bg-gray-800 dark:text-gray-100 rounded-lg transition"
								on:click={() => {
									sttModelUpdateHandler();
								}}
								disabled={STT_WHISPER_MODEL_LOADING}
							>
								{#if STT_WHISPER_MODEL_LOADING}
									<div class="self-center">
										<svg
											class=" w-4 h-4"
											viewBox="0 0 24 24"
											fill="currentColor"
											xmlns="http://www.w3.org/2000/svg"
										>
											<style>
												.spinner_ajPY {
													transform-origin: center;
													animation: spinner_AtaB 0.75s infinite linear;
												}

												@keyframes spinner_AtaB {
													100% {
														transform: rotate(360deg);
													}
												}
											</style>
											<path
												d="M12,1A11,11,0,1,0,23,12,11,11,0,0,0,12,1Zm0,19a8,8,0,1,1,8-8A8,8,0,0,1,12,20Z"
												opacity=".25"
											/>
											<path
												d="M10.14,1.16a11,11,0,0,0-9,8.92A1.59,1.59,0,0,0,2.46,12,1.52,1.52,0,0,0,4.11,10.7a8,8,0,0,1,6.66-6.61A1.42,1.42,0,0,0,12,2.69h0A1.57,1.57,0,0,0,10.14,1.16Z"
												class="spinner_ajPY"
											/>
										</svg>
									</div>
								{:else}
									<svg
										xmlns="http://www.w3.org/2000/svg"
										viewBox="0 0 16 16"
										fill="currentColor"
										class="w-4 h-4"
									>
										<path
											d="M8.75 2.75a.75.75 0 0 0-1.5 0v5.69L5.03 6.22a.75.75 0 0 0-1.06 1.06l3.5 3.5a.75.75 0 0 0 1.06 0l3.5-3.5a.75.75 0 0 0-1.06-1.06L8.75 8.44V2.75Z"
										/>
										<path
											d="M3.5 9.75a.75.75 0 0 0-1.5 0v1.5A2.75 2.75 0 0 0 4.75 14h6.5A2.75 2.75 0 0 0 14 11.25v-1.5a.75.75 0 0 0-1.5 0v1.5c0 .69-.56 1.25-1.25 1.25h-6.5c-.69 0-1.25-.56-1.25-1.25v-1.5Z"
										/>
									</svg>
								{/if}
							</button>
						</div>

						<div class="mt-2 mb-1 text-xs text-gray-400 dark:text-gray-500">
							{$i18n.t(`Open WebUI uses faster-whisper internally.`)}

							<a
								class=" hover:underline dark:text-gray-200 text-gray-800"
								href="https://github.com/SYSTRAN/faster-whisper"
								target="_blank"
								rel="noopener noreferrer"
							>
								{$i18n.t(
									`Click here to learn more about faster-whisper and see the available models.`
								)}
							</a>
						</div>
					</div>
				{/if}
			</div>

			<hr class=" dark:border-gray-800" />

			<!-- Text-to-speech settings -->
			<div>
				<div class=" mb-1 text-sm font-medium">{$i18n.t('TTS Settings')}</div>

				<div class=" py-0.5 flex w-full justify-between">
					<div class=" self-center text-xs font-medium">{$i18n.t('Text-to-Speech Engine')}</div>
					<div class="flex items-center relative">
						<!--
							Changing the engine saves immediately, reloads the voice and model
							lists, then resets voice/model to that engine's defaults. The handler
							reads the bound TTS_ENGINE, which bind:value has already updated.
						-->
						<select
							class=" dark:bg-gray-900 w-fit pr-8 cursor-pointer rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
							bind:value={TTS_ENGINE}
							placeholder="Select a mode"
							on:change={async () => {
								await updateConfigHandler();
								await getVoices();
								await getModels();

								if (TTS_ENGINE === 'openai') {
									TTS_VOICE = 'alloy';
									TTS_MODEL = 'tts-1';
								} else {
									TTS_VOICE = '';
									TTS_MODEL = '';
								}
							}}
						>
							<option value="">{$i18n.t('Web API')}</option>
							<option value="transformers">{$i18n.t('Transformers')} ({$i18n.t('Local')})</option>
							<option value="openai">{$i18n.t('OpenAI')}</option>
							<option value="elevenlabs">{$i18n.t('ElevenLabs')}</option>
							<option value="azure">{$i18n.t('Azure AI Speech')}</option>
						</select>
					</div>
				</div>

				<!-- Engine-specific credentials -->
				{#if TTS_ENGINE === 'openai'}
					<div>
						<div class="mt-1 flex gap-2 mb-1">
							<input
								class="flex-1 w-full bg-transparent outline-none"
								placeholder={$i18n.t('API Base URL')}
								bind:value={TTS_OPENAI_API_BASE_URL}
								required
							/>

							<SensitiveInput placeholder={$i18n.t('API Key')} bind:value={TTS_OPENAI_API_KEY} />
						</div>
					</div>
				{:else if TTS_ENGINE === 'elevenlabs'}
					<div>
						<div class="mt-1 flex gap-2 mb-1">
							<input
								class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
								placeholder={$i18n.t('API Key')}
								bind:value={TTS_API_KEY}
								required
							/>
						</div>
					</div>
				{:else if TTS_ENGINE === 'azure'}
					<div>
						<div class="mt-1 flex gap-2 mb-1">
							<input
								class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
								placeholder={$i18n.t('API Key')}
								bind:value={TTS_API_KEY}
								required
							/>
							<input
								class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
								placeholder={$i18n.t('Azure Region')}
								bind:value={TTS_AZURE_SPEECH_REGION}
								required
							/>
						</div>
					</div>
				{/if}

				<hr class=" dark:border-gray-850 my-2" />

				<!-- Engine-specific voice / model selection -->
				{#if TTS_ENGINE === ''}
					<div>
						<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
						<div class="flex w-full">
							<div class="flex-1">
								<select
									class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
									bind:value={TTS_VOICE}
								>
									<option value="" selected={TTS_VOICE === ''}>{$i18n.t('Default')}</option>
									{#each voices as voice}
										<option
											value={voice.voiceURI}
											class="bg-gray-100 dark:bg-gray-700"
											selected={TTS_VOICE === voice.voiceURI}
											>{voice.name.replace('+', ', ')}</option
										>
									{/each}
								</select>
							</div>
						</div>
					</div>
				{:else if TTS_ENGINE === 'transformers'}
					<div>
						<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div>
						<div class="flex w-full">
							<div class="flex-1">
								<input
									list="model-list"
									class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
									bind:value={TTS_MODEL}
									placeholder="CMU ARCTIC speaker embedding name"
								/>

								<datalist id="model-list">
									<option value="tts-1" />
								</datalist>
							</div>
						</div>
						<div class="mt-2 mb-1 text-xs text-gray-400 dark:text-gray-500">
							{$i18n.t(`Open WebUI uses SpeechT5 and CMU Arctic speaker embeddings.`)}

							To learn more about SpeechT5,

							<a
								class=" hover:underline dark:text-gray-200 text-gray-800"
								href="https://github.com/microsoft/SpeechT5"
								target="_blank"
								rel="noopener noreferrer"
							>
								{$i18n.t(`click here`, { name: 'SpeechT5' })}.
							</a>
							To see the available CMU Arctic speaker embeddings,
							<a
								class=" hover:underline dark:text-gray-200 text-gray-800"
								href="https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors"
								target="_blank"
								rel="noopener noreferrer"
							>
								{$i18n.t(`click here`)}.
							</a>
						</div>
					</div>
				{:else if TTS_ENGINE === 'openai' || TTS_ENGINE === 'elevenlabs'}
					<!-- OpenAI and ElevenLabs share the same voice + model pickers. -->
					<div class=" flex gap-2">
						<div class="w-full">
							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
							<div class="flex w-full">
								<div class="flex-1">
									<input
										list="voice-list"
										class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
										bind:value={TTS_VOICE}
										placeholder="Select a voice"
									/>

									<datalist id="voice-list">
										{#each voices as voice}
											<option value={voice.id}>{voice.name}</option>
										{/each}
									</datalist>
								</div>
							</div>
						</div>
						<div class="w-full">
							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div>
							<div class="flex w-full">
								<div class="flex-1">
									<input
										list="tts-model-list"
										class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
										bind:value={TTS_MODEL}
										placeholder="Select a model"
									/>

									<datalist id="tts-model-list">
										{#each models as model}
											<option value={model.id} class="bg-gray-50 dark:bg-gray-700" />
										{/each}
									</datalist>
								</div>
							</div>
						</div>
					</div>
				{:else if TTS_ENGINE === 'azure'}
					<div class=" flex gap-2">
						<div class="w-full">
							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
							<div class="flex w-full">
								<div class="flex-1">
									<input
										list="voice-list"
										class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
										bind:value={TTS_VOICE}
										placeholder="Select a voice"
									/>

									<datalist id="voice-list">
										{#each voices as voice}
											<option value={voice.id}>{voice.name}</option>
										{/each}
									</datalist>
								</div>
							</div>
						</div>
						<div class="w-full">
							<div class=" mb-1.5 text-sm font-medium">
								{$i18n.t('Output format')}
								<a
									href="https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#audio-outputs"
									target="_blank"
									rel="noopener noreferrer"
								>
									<small>{$i18n.t('Available list')}</small>
								</a>
							</div>
							<div class="flex w-full">
								<div class="flex-1">
									<input
										list="tts-model-list"
										class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
										bind:value={TTS_AZURE_SPEECH_OUTPUT_FORMAT}
										placeholder="Select a output format"
									/>
								</div>
							</div>
						</div>
					</div>
				{/if}

				<hr class="dark:border-gray-850 my-2" />

				<!-- How message text is split into TTS requests -->
				<div class="pt-0.5 flex w-full justify-between">
					<div class="self-center text-xs font-medium">{$i18n.t('Response splitting')}</div>
					<div class="flex items-center relative">
						<select
							class="dark:bg-gray-900 w-fit pr-8 cursor-pointer rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
							aria-label="Select how to split message text for TTS requests"
							bind:value={TTS_SPLIT_ON}
						>
							{#each Object.values(TTS_RESPONSE_SPLIT) as split}
								<option value={split}
									>{$i18n.t(split.charAt(0).toUpperCase() + split.slice(1))}</option
								>
							{/each}
						</select>
					</div>
				</div>
				<div class="mt-2 mb-1 text-xs text-gray-400 dark:text-gray-500">
					{$i18n.t(
						"Control how message text is split for TTS requests. 'Punctuation' splits into sentences, 'paragraphs' splits into paragraphs, and 'none' keeps the message as a single string."
					)}
				</div>
			</div>
		</div>
	</div>
	<div class="flex justify-end text-sm font-medium">
		<button
			class="px-3.5 py-1.5 text-sm font-medium bg-black hover:bg-gray-900 text-white dark:bg-white dark:text-black dark:hover:bg-gray-100 transition rounded-full"
			type="submit"
		>
			{$i18n.t('Save')}
		</button>
	</div>
</form>