"""
app.py
This file defines the Gradio user interface for interacting with the Anthropic API, Hume TTS API, and ElevenLabs TTS API.
Users can input prompts, which are processed to generate text using the Claude model via the Anthropic API.
The generated text is then converted to audio using both Hume and ElevenLabs TTS APIs, allowing playback in the Gradio UI.
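
Example:
    Run this file directly to launch the app locally (assumes the required
    Anthropic, Hume, and ElevenLabs API keys are configured):

        $ python app.py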
"""

# Standard Library Imports
from concurrent.futures import ThreadPoolExecutor
import random

# Third-Party Library Imports
import gradio as gr

# Local Application Imports
from src.config import logger
from src.constants import PROMPT_MAX_LENGTH, PROMPT_MIN_LENGTH, SAMPLE_PROMPTS
from src.integrations import (
    generate_text_with_claude,
    text_to_speech_with_elevenlabs,
    text_to_speech_with_hume,
)
from src.utils import truncate_text, validate_prompt_length
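
# NOTE: assumed shapes for the imported helpers, inferred from their usage in
# this file (the actual definitions live in the src package):
#   generate_text_with_claude(prompt: str) -> str
#   text_to_speech_with_hume(prompt: str, text: str) -> str        # audio filepath
#   text_to_speech_with_elevenlabs(text: str) -> str               # audio filepath
# Filepath-style return values match the gr.Audio(type='filepath') players below.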


def process_prompt(prompt: str):
    """
    Processes the user input by generating text with the Claude API, then converting
    the generated text to speech with both the Hume and ElevenLabs TTS APIs.

    Args:
        prompt (str): The user's input prompt.

    Returns:
        tuple: The generated text, two audio filepaths (Hume and ElevenLabs, in
            randomized order), and a mapping of audio options to their TTS providers.
    """
    logger.info(f'Processing prompt: {truncate_text(prompt, max_length=100)}')
    try:
        # Validate prompt length before making any API calls
        validate_prompt_length(prompt, PROMPT_MAX_LENGTH, PROMPT_MIN_LENGTH)

        # Generate text with Claude
        generated_text = generate_text_with_claude(prompt)
        logger.info(f'Generated text successfully (length={len(generated_text)} characters).')

        # Run both TTS requests in parallel; Hume also receives the original
        # prompt (assumed to provide extra voice context), while ElevenLabs
        # only needs the generated text.
        with ThreadPoolExecutor(max_workers=2) as executor:
            hume_future = executor.submit(text_to_speech_with_hume, prompt, generated_text)
            elevenlabs_future = executor.submit(text_to_speech_with_elevenlabs, generated_text)

            # Block until both results are available
            hume_audio = hume_future.result()
            elevenlabs_audio = elevenlabs_future.result()

        logger.info(f'TTS audio generated: Hume={hume_audio}, ElevenLabs={elevenlabs_audio}')

        # Shuffle the audio options so the UI does not reveal which provider
        # produced which output (anonymized A/B comparison)
        audio_options = [
            (hume_audio, 'Hume TTS'),
            (elevenlabs_audio, 'ElevenLabs TTS'),
        ]
        random.shuffle(audio_options)
        option1_audio, option1_provider = audio_options[0]
        option2_audio, option2_provider = audio_options[1]

        return generated_text, option1_audio, option2_audio, {
            'Option 1': option1_provider,
            'Option 2': option2_provider,
        }
    except ValueError as ve:
        logger.warning(f'Validation error: {ve}')
        return str(ve), None, None, {}
    except Exception as e:
        logger.error(f'Unexpected error during processing: {e}')
        return 'An unexpected error occurred. Please try again.', None, None, {}
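

# Gradio treats generator functions as streaming event handlers: each `yield`
# sends an intermediate update to the bound output components. That is what
# lets the handler below disable the Generate button for the duration of the
# (potentially slow) text and TTS generation, then re-enable it when done.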
def run_process_prompt(prompt: str):
    """
    Handles the UI state transitions while processing a prompt.

    Args:
        prompt (str): The user's input prompt.

    Yields:
        tuple: Updates to the UI elements in two stages:
            1. Disable the Generate button and clear the previous outputs.
            2. Display the generated content and re-enable the Generate button.
    """
    # Stage 1: Disable UI and clear previous outputs
    yield (
        gr.update(interactive=False),  # Disable Generate button
        gr.update(value=None),  # Clear generated text
        gr.update(value=None),  # Clear Option 1 audio
        gr.update(value=None),  # Clear Option 2 audio
        gr.update(value=None),  # Clear option mapping
        None,  # Reset Option 2 audio state
    )

    # Process the prompt (text generation plus both TTS calls)
    generated_text, option1_audio, option2_audio, option_mapping = process_prompt(prompt)

    # Stage 2: Display the generated text, autoplay the first audio option,
    # and re-enable the Generate button
    yield (
        gr.update(interactive=True),  # Enable Generate button
        gr.update(value=generated_text),  # Show generated text
        gr.update(value=option1_audio, autoplay=True),  # Set Option 1 audio
        gr.update(value=option2_audio),  # Set Option 2 audio
        gr.update(value=option_mapping),  # Store option mapping
        option2_audio,  # Store Option 2 audio in state
    )


def build_gradio_interface() -> gr.Blocks:
    """
    Constructs the Gradio user interface.

    Returns:
        gr.Blocks: The Gradio Blocks-based UI.
    """
    with gr.Blocks() as demo:
        # UI title and instructions
        gr.Markdown('# TTS Arena')
        gr.Markdown(
            'Generate text from a prompt using **Claude by Anthropic**, '
            'and compare text-to-speech outputs from the **Hume TTS API** and **ElevenLabs TTS API**.'
        )

        # Sample prompt selection
        with gr.Row():
            sample_prompt_dropdown = gr.Dropdown(
                choices=list(SAMPLE_PROMPTS.keys()),
                label='Choose a sample prompt (or enter your own below)',
                value=None,
                interactive=True,
            )

        # Prompt input
        with gr.Row():
            prompt_input = gr.Textbox(
                label='Enter your prompt',
                placeholder='Or type your own prompt here...',
                lines=2,
                max_lines=2,
            )

        # Generate button
        with gr.Row():
            generate_button = gr.Button('Generate')

        # Output section
        with gr.Column():
            output_text = gr.Textbox(
                label='Generated Text',
                interactive=False,
                lines=8,
                max_lines=12,
            )
            with gr.Row():
                option1_audio_player = gr.Audio(label='Option 1', type='filepath', interactive=False)
                option2_audio_player = gr.Audio(label='Option 2', type='filepath', interactive=False)

        # UI state components (per-session values that are not rendered)
        option_mapping_state = gr.State()
        option2_audio_state = gr.State()

        # Event handlers
        # Selecting a sample prompt fills the prompt textbox
        sample_prompt_dropdown.change(
            fn=lambda choice: SAMPLE_PROMPTS.get(choice, ''),
            inputs=[sample_prompt_dropdown],
            outputs=[prompt_input],
        )
        generate_button.click(
            fn=run_process_prompt,
            inputs=[prompt_input],
            outputs=[
                generate_button,
                output_text,
                option1_audio_player,
                option2_audio_player,
                option_mapping_state,
                option2_audio_state,
            ],
        )

        # Auto-play the second audio option after the first finishes: clear the
        # second player first, so re-assigning its value re-triggers autoplay
        option1_audio_player.stop(
            fn=lambda _: gr.update(value=None),
            inputs=[option1_audio_player],
            outputs=[option2_audio_player],
        ).then(
            fn=lambda option2_audio: gr.update(value=option2_audio, autoplay=True),
            inputs=[option2_audio_state],
            outputs=[option2_audio_player],
        )

    logger.debug('Gradio interface built successfully')
    return demo


if __name__ == '__main__':
    logger.info('Launching TTS Arena Gradio app...')
    demo = build_gradio_interface()
    demo.launch()
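    # Note: launch() uses Gradio's defaults here; passing share=True (a standard
    # gr.Blocks.launch() flag) would expose a temporary public URL if needed.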