"""
app.py
Gradio UI for interacting with the Anthropic API, Hume TTS API, and ElevenLabs TTS API.
Users enter a prompt, which is processed using Claude by Anthropic to generate text.
The text is then synthesized into speech using both Hume and ElevenLabs text-to-speech (TTS) APIs.
Users can compare the outputs and vote for their favorite in an interactive UI.
"""
# Standard Library Imports
from concurrent.futures import ThreadPoolExecutor
import random
from typing import Union, Tuple
# Third-Party Library Imports
import gradio as gr
# Local Application Imports
from src.config import logger
from src.constants import (
ELEVENLABS,
HUME_AI,
OPTION_A,
OPTION_B,
PROMPT_MAX_LENGTH,
PROMPT_MIN_LENGTH,
SAMPLE_PROMPTS,
TROPHY_EMOJI,
UNKNOWN_PROVIDER,
VOTE_FOR_OPTION_A,
VOTE_FOR_OPTION_B,
)
from src.integrations import (
AnthropicError,
ElevenLabsError,
generate_text_with_claude,
get_random_elevenlabs_voice_id,
get_random_hume_voice_names,
HumeError,
text_to_speech_with_elevenlabs,
text_to_speech_with_hume,
)
from src.theme import CustomTheme
from src.utils import truncate_text, validate_prompt_length


def generate_text(prompt: str) -> gr.update:
    """
    Validates the prompt and generates text using the Anthropic API.

    Args:
        prompt (str): The user-provided text prompt.

    Returns:
        gr.update: An update for the generated text component.

    Raises:
        gr.Error: On validation or API errors.
    """
try:
validate_prompt_length(prompt, PROMPT_MAX_LENGTH, PROMPT_MIN_LENGTH)
except ValueError as ve:
logger.warning(f'Validation error: {ve}')
raise gr.Error(str(ve))
try:
generated_text = generate_text_with_claude(prompt)
logger.info(f'Generated text ({len(generated_text)} characters).')
return gr.update(value=generated_text)
except AnthropicError as ae:
logger.error(f'AnthropicError while generating text: {str(ae)}')
raise gr.Error('There was an issue communicating with the Anthropic API. Please try again later.')
except Exception as e:
logger.error(f'Unexpected error while generating text: {e}')
raise gr.Error('Failed to generate text. Please try again.')


def text_to_speech(prompt: str, generated_text: str) -> Tuple[gr.update, gr.update, dict, Union[str, None]]:
    """
    Synthesizes two text-to-speech outputs, loads them into the two audio players,
    and updates the related UI state components.

    Both outputs are currently synthesized with Hume; the 50/50 chance of comparing
    a Hume output with an ElevenLabs output is temporarily disabled
    (see `compare_hume_with_elevenlabs` below).

    Args:
        prompt (str): The original user prompt.
        generated_text (str): The generated text to synthesize.

    Returns:
        A tuple of:
            - Update for the first audio player (with autoplay)
            - Update for the second audio player
            - A dictionary mapping options to providers
            - The raw audio value for Option B (used to autoplay it after Option A finishes)

    Raises:
        gr.Error: On API or unexpected errors.
    """
if not generated_text:
logger.warning('Skipping text-to-speech due to empty text.')
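        # gr.skip() leaves each corresponding output component unchanged.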
return gr.skip(), gr.skip(), gr.skip(), gr.skip()
    # NOTE: The Hume vs. ElevenLabs comparison is temporarily disabled; both outputs use Hume.
    # compare_hume_with_elevenlabs = random.random() < 0.5
    compare_hume_with_elevenlabs = False

    elevenlabs_voice = get_random_elevenlabs_voice_id()
    # Fetch two Hume voices up front in case we compare Hume with Hume.
    hume_voice_a, hume_voice_b = get_random_hume_voice_names()
try:
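        # Run both TTS requests concurrently to reduce overall latency.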
with ThreadPoolExecutor(max_workers=2) as executor:
provider_a = HUME_AI
future_audio_a = executor.submit(text_to_speech_with_hume, prompt, generated_text, hume_voice_a)
if compare_hume_with_elevenlabs:
provider_b = ELEVENLABS
future_audio_b = executor.submit(text_to_speech_with_elevenlabs, generated_text, elevenlabs_voice)
else:
provider_b = HUME_AI
future_audio_b = executor.submit(text_to_speech_with_hume, prompt, generated_text, hume_voice_b)
audio_a, audio_b = future_audio_a.result(), future_audio_b.result()
logger.info(f'TTS generated: {provider_a}={len(audio_a)} bytes, {provider_b}={len(audio_b)} bytes')
options = [(audio_a, provider_a), (audio_b, provider_b)]
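            # Shuffle so a provider cannot be inferred from its position (avoids positional voting bias).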
random.shuffle(options)
option_a_audio, option_b_audio = options[0][0], options[1][0]
options_map = { OPTION_A: options[0][1], OPTION_B: options[1][1] }
return (
gr.update(value=option_a_audio, autoplay=True),
gr.update(value=option_b_audio),
options_map,
option_b_audio,
)
except ElevenLabsError as ee:
logger.error(f'ElevenLabsError while synthesizing speech from text: {str(ee)}')
        raise gr.Error('There was an issue communicating with the ElevenLabs API. Please try again later.')
except HumeError as he:
logger.error(f'HumeError while synthesizing speech from text: {str(he)}')
raise gr.Error('There was an issue communicating with the Hume API. Please try again later.')
except Exception as e:
logger.error(f'Unexpected error during TTS generation: {e}')
        raise gr.Error('An unexpected error occurred. Please try again later.')


def vote(vote_submitted: bool, option_mapping: dict, selected_button: str) -> Tuple[bool, gr.update, gr.update]:
    """
    Handles user voting.

    Args:
        vote_submitted (bool): True if a vote was already submitted.
        option_mapping (dict): Maps option labels to provider names.
        selected_button (str): The vote button that was clicked.

    Returns:
        A tuple of:
            - True if the vote was accepted
            - Update for vote button A
            - Update for vote button B
        The selected button shows its provider with a trophy; the other shows its provider only.
    """
if not option_mapping or vote_submitted:
return gr.skip(), gr.skip(), gr.skip()
is_option_a = selected_button == VOTE_FOR_OPTION_A
selected_option, other_option = (OPTION_A, OPTION_B) if is_option_a else (OPTION_B, OPTION_A)
selected_provider = option_mapping.get(selected_option, UNKNOWN_PROVIDER)
other_provider = option_mapping.get(other_option, UNKNOWN_PROVIDER)
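    # Reveal each option's provider on its button; the chosen provider gets a trophy and primary styling.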
return (
True,
gr.update(value=f'{selected_provider} {TROPHY_EMOJI}', variant='primary') if is_option_a
else gr.update(value=other_provider, variant='secondary'),
gr.update(value=other_provider, variant='secondary') if is_option_a
else gr.update(value=f'{selected_provider} {TROPHY_EMOJI}', variant='primary'),
)


def reset_ui() -> Tuple[gr.update, gr.update, gr.update, gr.update, None, None, bool]:
    """
    Resets UI state before generating new text.

    Returns:
        A tuple of updates for:
            - option_a_audio_player (clear audio)
            - option_b_audio_player (clear audio)
            - vote_button_a (disable and reset button text)
            - vote_button_b (disable and reset button text)
            - option_mapping_state (reset option map state)
            - option2_audio_state (reset Option B audio state)
            - vote_submitted_state (reset submitted vote state)
    """
return (
gr.update(value=None),
gr.update(value=None),
gr.update(interactive=False, value=VOTE_FOR_OPTION_A, variant='secondary'),
gr.update(interactive=False, value=VOTE_FOR_OPTION_B, variant='secondary'),
None,
None,
False,
)


def build_input_section() -> Tuple[gr.Markdown, gr.Dropdown, gr.Textbox, gr.Button]:
    """Builds the input section: instructions, sample prompt dropdown, prompt input, and generate button."""
with gr.Column(variant='compact'):
instructions = gr.Markdown(
'Generate text with **Claude by Anthropic**, listen to text-to-speech outputs '
'from **Hume AI** and **ElevenLabs**, and vote for your favorite!'
)
sample_prompt_dropdown = gr.Dropdown(
choices=list(SAMPLE_PROMPTS.keys()),
label='Choose a sample prompt (or enter your own)',
value=None,
interactive=True,
)
prompt_input = gr.Textbox(
label='Prompt',
placeholder='Enter your prompt...',
lines=2,
max_lines=2,
max_length=PROMPT_MAX_LENGTH,
show_copy_button=True,
)
generate_button = gr.Button('Generate text', variant='primary')
return instructions, sample_prompt_dropdown, prompt_input, generate_button


def build_output_section() -> Tuple[gr.Textbox, gr.Audio, gr.Audio, gr.Button, gr.Button]:
"""Builds the output section including generated text, audio players, and vote buttons."""
with gr.Column(variant='compact'):
generated_text = gr.Textbox(
label='Text',
interactive=False,
autoscroll=False,
lines=5,
max_lines=5,
max_length=PROMPT_MAX_LENGTH,
show_copy_button=True,
)
with gr.Row(equal_height=True):
option_a_audio_player = gr.Audio(label=OPTION_A, type='filepath', interactive=False)
option_b_audio_player = gr.Audio(label=OPTION_B, type='filepath', interactive=False)
with gr.Row():
vote_button_a = gr.Button(VOTE_FOR_OPTION_A, interactive=False)
vote_button_b = gr.Button(VOTE_FOR_OPTION_B, interactive=False)
return generated_text, option_a_audio_player, option_b_audio_player, vote_button_a, vote_button_b


def build_gradio_interface() -> gr.Blocks:
"""
Builds and configures the Gradio user interface.
Returns:
gr.Blocks: The fully constructed Gradio UI layout.
"""
custom_theme = CustomTheme()
with gr.Blocks(
title='Expressive TTS Arena',
theme=custom_theme,
fill_width=True,
css_paths='src/assets/styles.css'
) as demo:
# Title
gr.Markdown('# Expressive TTS Arena')
# Build input section
instructions, sample_prompt_dropdown, prompt_input, generate_button = build_input_section()
# Build output section
generated_text, option_a_audio_player, option_b_audio_player, vote_button_a, vote_button_b = build_output_section()
# UI state components
        option_mapping_state = gr.State()  # Track the option-to-provider mapping (Option A and Option B are randomized)
        option2_audio_state = gr.State()  # Track Option B's audio so it can autoplay after Option A finishes
        vote_submitted_state = gr.State(False)  # Track whether the user has voted on an option
# --- Register event handlers ---
# When a sample prompt is chosen, update the prompt textbox
sample_prompt_dropdown.change(
fn=lambda choice: SAMPLE_PROMPTS.get(choice, ''),
inputs=[sample_prompt_dropdown],
outputs=[prompt_input],
)
# Generate Button Click Handler Chain:
# 1. Disable the Generate button
# 2. Reset UI state
# 3. Generate text
# 4. Synthesize TTS
# 5. Re-enable the Generate button
generate_button.click(
fn=lambda: gr.update(interactive=False), # Disable the button immediately
inputs=[],
outputs=[generate_button]
).then(
fn=reset_ui,
inputs=[],
outputs=[
option_a_audio_player,
option_b_audio_player,
vote_button_a,
vote_button_b,
option_mapping_state,
option2_audio_state,
vote_submitted_state,
],
).then(
fn=generate_text,
inputs=[prompt_input],
outputs=[generated_text],
).then(
fn=text_to_speech,
inputs=[prompt_input, generated_text],
outputs=[
option_a_audio_player,
option_b_audio_player,
option_mapping_state,
option2_audio_state,
],
).then(
fn=lambda: gr.update(interactive=True), # Re-enable the button
inputs=[],
outputs=[generate_button]
)
# Vote button click handlers
vote_button_a.click(
fn=vote,
inputs=[vote_submitted_state, option_mapping_state, vote_button_a],
outputs=[vote_submitted_state, vote_button_a, vote_button_b],
)
vote_button_b.click(
fn=vote,
inputs=[vote_submitted_state, option_mapping_state, vote_button_b],
outputs=[vote_submitted_state, vote_button_a, vote_button_b],
)
# Auto-play second audio after first finishes (workaround for playing audio back-to-back)
option_a_audio_player.stop(
            fn=lambda: gr.update(value=None),
inputs=[],
outputs=[option_b_audio_player],
).then(
fn=lambda audio: gr.update(value=audio, autoplay=True),
inputs=[option2_audio_state],
outputs=[option_b_audio_player],
)
# Enable voting after second audio option playback finishes
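        # (Autoplay is also switched off on Option B so later value updates do not replay it automatically.)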
option_b_audio_player.stop(
            fn=lambda: (gr.update(interactive=True), gr.update(interactive=True), gr.update(autoplay=False)),
inputs=[],
outputs=[vote_button_a, vote_button_b, option_b_audio_player],
)
logger.debug('Gradio interface built successfully')
return demo


if __name__ == '__main__':
logger.info('Launching TTS Arena Gradio app...')
demo = build_gradio_interface()
demo.launch()
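
    # Note: to expose the app on a network (e.g. when running in a container), launch() also
    # accepts host/port options, e.g. demo.launch(server_name='0.0.0.0', server_port=7860).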