ginipick committed on
Commit
ab87e84
·
verified ·
1 Parent(s): 355d056

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +504 -137
app.py CHANGED
@@ -1,5 +1,6 @@
1
  """
2
  Gradio UI for Text-to-Speech using HiggsAudioServeEngine
 
3
  """
4
 
5
  import argparse
@@ -23,6 +24,7 @@ from higgs_audio.data_types import ChatMLSample, AudioContent, Message
23
 
24
  # Global engine instance
25
  engine = None
 
26
 
27
  # Default model configuration
28
  DEFAULT_MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base"
@@ -43,12 +45,16 @@ PREDEFINED_EXAMPLES = {
43
  "voice-clone": {
44
  "system_prompt": "",
45
  "input_text": "Hey there! I'm your friendly voice twin in the making. Pick a voice preset below or upload your own audio - let's clone some vocals and bring your voice to life! ",
46
- "description": "Voice clone to clone the reference audio. Leave the system prompt empty.",
 
 
47
  },
48
  "smart-voice": {
49
  "system_prompt": DEFAULT_SYSTEM_PROMPT,
50
  "input_text": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
51
- "description": "Smart voice to generate speech based on the context",
 
 
52
  },
53
  "multispeaker-voice-description": {
54
  "system_prompt": "You are an AI assistant designed to convert text into speech.\n"
@@ -62,7 +68,9 @@ PREDEFINED_EXAMPLES = {
62
  "[SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this.\n"
63
  "[SPEAKER0] Overreact? You made a decision that affects both of us without even considering my opinion!\n"
64
  "[SPEAKER1] Because I didn't have time to sit around waiting for you to make up your mind! Someone had to act.",
65
- "description": "Multispeaker with different voice descriptions in the system prompt",
 
 
66
  },
67
  "single-speaker-voice-description": {
68
  "system_prompt": "Generate audio following instruction.\n\n"
@@ -74,7 +82,9 @@ PREDEFINED_EXAMPLES = {
74
  "And let's be honest, if you've been even remotely connected to tech, AI, or machine learning lately, you know that deep learning is everywhere.\n"
75
  "\n"
76
  "So here's the big question: Do you want to understand how deep learning works?\n",
77
- "description": "Single speaker with voice description in the system prompt",
 
 
78
  },
79
  "single-speaker-zh": {
80
  "system_prompt": "Generate audio following instruction.\n\n"
@@ -85,12 +95,16 @@ PREDEFINED_EXAMPLES = {
85
  "今天我们要聊的是一个你绝对不能忽视的话题: 多模态学习.\n"
86
  "那么, 问题来了, 你真的了解多模态吗? 你知道如何自己动手构建多模态大模型吗.\n"
87
  "或者说, 你能察觉到我其实是个机器人吗?",
88
- "description": "Single speaker speaking Chinese",
 
 
89
  },
90
  "single-speaker-bgm": {
91
  "system_prompt": DEFAULT_SYSTEM_PROMPT,
92
  "input_text": "[music start] I will remember this, thought Ender, when I am defeated. To keep dignity, and give honor where it's due, so that defeat is not disgrace. And I hope I don't have to do it often. [music end]",
93
- "description": "Single speaker with BGM using music tag. This is an experimental feature and you may need to try multiple times to get the best result.",
 
 
94
  },
95
  }
96
 
@@ -110,10 +124,14 @@ def get_current_device():
110
  def load_voice_presets():
111
  """Load the voice presets from the voice_examples directory."""
112
  try:
113
- with open(
114
- os.path.join(os.path.dirname(__file__), "voice_examples", "config.json"),
115
- "r",
116
- ) as f:
 
 
 
 
117
  voice_dict = json.load(f)
118
  voice_presets = {k: v["transcript"] for k, v in voice_dict.items()}
119
  voice_presets["EMPTY"] = "No reference voice"
@@ -156,10 +174,10 @@ def normalize_chinese_punctuation(text):
156
  "】": "]", # right square bracket
157
  "《": "<", # left angle quote
158
  "》": ">", # right angle quote
159
- "“": '"', # left double quotation
160
- "”": '"', # right double quotation
161
- "‘": "'", # left single quotation
162
- "’": "'", # right single quotation
163
  "、": ",", # enumeration comma
164
  "—": "-", # em dash
165
  "…": "...", # ellipsis
@@ -210,11 +228,14 @@ def normalize_text(transcript: str):
210
  return transcript
211
 
212
 
213
- @spaces.GPU
214
  def initialize_engine(model_path, audio_tokenizer_path) -> bool:
215
  """Initialize the HiggsAudioServeEngine."""
216
  global engine
217
  try:
 
 
 
 
218
  logger.info(f"Initializing engine with model: {model_path} and audio tokenizer: {audio_tokenizer_path}")
219
  engine = HiggsAudioServeEngine(
220
  model_name_or_path=model_path,
@@ -305,7 +326,8 @@ def text_to_speech(
305
  global engine
306
 
307
  if engine is None:
308
- initialize_engine(DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH)
 
309
 
310
  try:
311
  # Prepare ChatML sample
@@ -360,11 +382,23 @@ def text_to_speech(
360
  return f"❌ {error_msg}", None
361
 
362
 
 
 
 
 
 
 
363
  def create_ui():
364
- my_theme = gr.Theme.load("theme.json")
 
 
 
 
 
365
 
366
- # Add custom CSS to disable focus highlighting on textboxes
367
  custom_css = """
 
368
  .gradio-container input:focus,
369
  .gradio-container textarea:focus,
370
  .gradio-container select:focus,
@@ -380,113 +414,340 @@ def create_ui():
380
  background-color: var(--input-background-fill) !important;
381
  }
382
 
383
- /* Override any hover effects as well */
384
- .gradio-container input:hover,
385
- .gradio-container textarea:hover,
386
- .gradio-container select:hover,
387
- .gradio-container .gr-input:hover,
388
- .gradio-container .gr-textarea:hover,
389
- .gradio-container .gr-textbox:hover {
390
- border-color: var(--border-color-primary) !important;
391
- background-color: var(--input-background-fill) !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
  }
393
 
394
- /* Style for checked checkbox */
395
- .gradio-container input[type="checkbox"]:checked {
396
- background-color: var(--primary-500) !important;
397
- border-color: var(--primary-500) !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
  }
399
  """
400
 
401
  default_template = "smart-voice"
402
 
403
- """Create the Gradio UI."""
404
- with gr.Blocks(theme=my_theme, css=custom_css) as demo:
405
- gr.Markdown("# Higgs Audio Text-to-Speech Playground")
 
 
 
 
 
 
406
 
407
  # Main UI section
408
  with gr.Row():
409
  with gr.Column(scale=2):
410
- # Template selection dropdown
 
 
411
  template_dropdown = gr.Dropdown(
412
  label="TTS Template",
413
  choices=list(PREDEFINED_EXAMPLES.keys()),
414
  value=default_template,
415
- info="Select a predefined example for system and input messages.",
 
416
  )
417
 
418
- # Template description display
419
  template_description = gr.HTML(
420
- value=f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {PREDEFINED_EXAMPLES[default_template]["description"]}</p>',
421
  visible=True,
422
  )
423
 
424
- system_prompt = gr.TextArea(
425
- label="System Prompt",
426
- placeholder="Enter system prompt to guide the model...",
427
- value=PREDEFINED_EXAMPLES[default_template]["system_prompt"],
428
- lines=2,
429
- )
430
-
431
- input_text = gr.TextArea(
432
- label="Input Text",
433
- placeholder="Type the text you want to convert to speech...",
434
- value=PREDEFINED_EXAMPLES[default_template]["input_text"],
435
- lines=5,
436
- )
437
-
438
- voice_preset = gr.Dropdown(
439
- label="Voice Preset",
440
- choices=list(VOICE_PRESETS.keys()),
441
- value="EMPTY",
442
- interactive=False, # Disabled by default since default template is not voice-clone
443
- visible=False,
444
- )
445
-
446
- with gr.Accordion(
447
- "Custom Reference (Optional)", open=False, visible=False
448
- ) as custom_reference_accordion:
449
- reference_audio = gr.Audio(label="Reference Audio", type="filepath")
450
- reference_text = gr.TextArea(
451
- label="Reference Text (transcript of the reference audio)",
452
- placeholder="Enter the transcript of your reference audio...",
453
  lines=3,
 
454
  )
455
 
456
- with gr.Accordion("Advanced Parameters", open=False):
457
- max_completion_tokens = gr.Slider(
458
- minimum=128,
459
- maximum=4096,
460
- value=1024,
461
- step=10,
462
- label="Max Completion Tokens",
463
- )
464
- temperature = gr.Slider(
465
- minimum=0.0,
466
- maximum=1.5,
467
- value=1.0,
468
- step=0.1,
469
- label="Temperature",
470
- )
471
- top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top P")
472
- top_k = gr.Slider(minimum=-1, maximum=100, value=50, step=1, label="Top K")
473
- ras_win_len = gr.Slider(
474
- minimum=0,
475
- maximum=10,
476
- value=7,
477
- step=1,
478
- label="RAS Window Length",
479
- info="Window length for repetition avoidance sampling",
480
  )
481
- ras_win_max_num_repeat = gr.Slider(
482
- minimum=1,
483
- maximum=10,
484
- value=2,
485
- step=1,
486
- label="RAS Max Num Repeat",
487
- info="Maximum number of repetitions allowed in the window",
 
 
 
 
 
488
  )
489
- # Add stop strings component
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490
  stop_strings = gr.Dataframe(
491
  label="Stop Strings",
492
  headers=["stops"],
@@ -494,32 +755,93 @@ def create_ui():
494
  value=[[s] for s in DEFAULT_STOP_STRINGS],
495
  interactive=True,
496
  col_count=(1, "fixed"),
 
497
  )
498
 
499
- submit_btn = gr.Button("Generate Speech", variant="primary", scale=1)
 
 
 
 
 
 
 
500
 
 
501
  with gr.Column(scale=2):
502
- output_text = gr.TextArea(label="Model Response", lines=2)
503
-
504
- # Audio output
505
- output_audio = gr.Audio(label="Generated Audio", interactive=False, autoplay=True)
506
-
507
- stop_btn = gr.Button("Stop Playback", variant="primary")
 
 
 
 
 
 
 
 
508
 
509
- # Example voice
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
510
  with gr.Row(visible=False) as voice_samples_section:
 
511
  voice_samples_table = gr.Dataframe(
512
  headers=["Voice Preset", "Sample Text"],
513
  datatype=["str", "str"],
514
  value=[[preset, text] for preset, text in VOICE_PRESETS.items() if preset != "EMPTY"],
515
  interactive=False,
 
 
 
 
 
516
  )
517
- sample_audio = gr.Audio(label="Voice Sample")
 
 
 
518
 
519
  # Function to play voice sample when clicking on a row
520
  def play_voice_sample(evt: gr.SelectData):
521
  try:
522
- # Get the preset name from the clicked row
523
  preset_names = [preset for preset in VOICE_PRESETS.keys() if preset != "EMPTY"]
524
  if evt.index[0] < len(preset_names):
525
  preset = preset_names[evt.index[0]]
@@ -537,43 +859,67 @@ def create_ui():
537
  gr.Error(f"Error playing voice sample: {e}")
538
  return None
539
 
540
- voice_samples_table.select(fn=play_voice_sample, outputs=[sample_audio])
541
-
542
  # Function to handle template selection
543
  def apply_template(template_name):
544
  if template_name in PREDEFINED_EXAMPLES:
545
  template = PREDEFINED_EXAMPLES[template_name]
546
- # Enable voice preset and custom reference only for voice-clone template
547
  is_voice_clone = template_name == "voice-clone"
548
  voice_preset_value = "belinda" if is_voice_clone else "EMPTY"
549
- # Set ras_win_len to 0 for single-speaker-bgm, 7 for others
550
  ras_win_len_value = 0 if template_name == "single-speaker-bgm" else 7
551
- description_text = f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {template["description"]}</p>'
 
552
  return (
553
  template["system_prompt"], # system_prompt
554
  template["input_text"], # input_text
555
- description_text, # template_description
556
  gr.update(
557
- value=voice_preset_value, interactive=is_voice_clone, visible=is_voice_clone
558
- ), # voice_preset (value and interactivity)
559
- gr.update(visible=is_voice_clone), # custom reference accordion visibility
560
- gr.update(visible=is_voice_clone), # voice samples section visibility
 
 
561
  ras_win_len_value, # ras_win_len
 
 
562
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
  else:
564
- return (
565
- gr.update(),
566
- gr.update(),
567
- gr.update(),
568
- gr.update(),
569
- gr.update(),
570
- gr.update(),
571
- gr.update(),
572
- ) # No change if template not found
573
 
574
  # Set up event handlers
 
 
 
 
 
 
 
575
 
576
- # Connect template dropdown to handler
577
  template_dropdown.change(
578
  fn=apply_template,
579
  inputs=[template_dropdown],
@@ -585,12 +931,20 @@ def create_ui():
585
  custom_reference_accordion,
586
  voice_samples_section,
587
  ras_win_len,
 
 
588
  ],
589
  )
590
 
591
- # Connect submit button to the TTS function
 
 
 
 
 
 
592
  submit_btn.click(
593
- fn=text_to_speech,
594
  inputs=[
595
  input_text,
596
  voice_preset,
@@ -605,7 +959,7 @@ def create_ui():
605
  ras_win_len,
606
  ras_win_max_num_repeat,
607
  ],
608
- outputs=[output_text, output_audio],
609
  api_name="generate_speech",
610
  )
611
 
@@ -617,12 +971,20 @@ def create_ui():
617
  js="() => {const audio = document.querySelector('audio'); if(audio) audio.pause(); return null;}",
618
  )
619
 
 
 
 
 
 
 
 
 
620
  return demo
621
 
622
 
623
  def main():
624
  """Main function to parse arguments and launch the UI."""
625
- global DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH, VOICE_PRESETS
626
 
627
  parser = argparse.ArgumentParser(description="Gradio UI for Text-to-Speech using HiggsAudioServeEngine")
628
  parser.add_argument(
@@ -637,13 +999,18 @@ def main():
637
 
638
  args = parser.parse_args()
639
 
640
- # Update default values if provided via command line
641
- VOICE_PRESETS = load_voice_presets()
642
 
643
  # Create and launch the UI
644
  demo = create_ui()
645
- demo.launch(server_name=args.host, server_port=args.port)
 
 
 
 
 
646
 
647
 
648
  if __name__ == "__main__":
649
- main()
 
1
  """
2
  Gradio UI for Text-to-Speech using HiggsAudioServeEngine
3
+ Enhanced with visual improvements and better user experience
4
  """
5
 
6
  import argparse
 
24
 
25
  # Global engine instance
26
  engine = None
27
+ VOICE_PRESETS = {}
28
 
29
  # Default model configuration
30
  DEFAULT_MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base"
 
45
  "voice-clone": {
46
  "system_prompt": "",
47
  "input_text": "Hey there! I'm your friendly voice twin in the making. Pick a voice preset below or upload your own audio - let's clone some vocals and bring your voice to life! ",
48
+ "description": "🎭 <b>Voice Clone</b> - Clone any voice with reference audio. Leave the system prompt empty for best results.",
49
+ "icon": "🎭",
50
+ "color": "#FF6B6B"
51
  },
52
  "smart-voice": {
53
  "system_prompt": DEFAULT_SYSTEM_PROMPT,
54
  "input_text": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
55
+ "description": "🧠 <b>Smart Voice</b> - Generate natural speech based on context",
56
+ "icon": "🧠",
57
+ "color": "#4ECDC4"
58
  },
59
  "multispeaker-voice-description": {
60
  "system_prompt": "You are an AI assistant designed to convert text into speech.\n"
 
68
  "[SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this.\n"
69
  "[SPEAKER0] Overreact? You made a decision that affects both of us without even considering my opinion!\n"
70
  "[SPEAKER1] Because I didn't have time to sit around waiting for you to make up your mind! Someone had to act.",
71
+ "description": "👥 <b>Multi-Speaker</b> - Different voices for dialogue and conversations",
72
+ "icon": "👥",
73
+ "color": "#95E1D3"
74
  },
75
  "single-speaker-voice-description": {
76
  "system_prompt": "Generate audio following instruction.\n\n"
 
82
  "And let's be honest, if you've been even remotely connected to tech, AI, or machine learning lately, you know that deep learning is everywhere.\n"
83
  "\n"
84
  "So here's the big question: Do you want to understand how deep learning works?\n",
85
+ "description": "🎙️ <b>Voice Description</b> - Generate speech with specific voice characteristics",
86
+ "icon": "🎙️",
87
+ "color": "#F38181"
88
  },
89
  "single-speaker-zh": {
90
  "system_prompt": "Generate audio following instruction.\n\n"
 
95
  "今天我们要聊的是一个你绝对不能忽视的话题: 多模态学习.\n"
96
  "那么, 问题来了, 你真的了解多模态吗? 你知道如何自己动手构建多模态大模型吗.\n"
97
  "或者说, 你能察觉到我其实是个机器人吗?",
98
+ "description": "🇨🇳 <b>Chinese Speech</b> - Generate natural Chinese speech",
99
+ "icon": "🇨🇳",
100
+ "color": "#AA96DA"
101
  },
102
  "single-speaker-bgm": {
103
  "system_prompt": DEFAULT_SYSTEM_PROMPT,
104
  "input_text": "[music start] I will remember this, thought Ender, when I am defeated. To keep dignity, and give honor where it's due, so that defeat is not disgrace. And I hope I don't have to do it often. [music end]",
105
+ "description": "🎵 <b>Speech with BGM</b> - Add background music to your speech (experimental)",
106
+ "icon": "🎵",
107
+ "color": "#FCBAD3"
108
  },
109
  }
110
 
 
124
  def load_voice_presets():
125
  """Load the voice presets from the voice_examples directory."""
126
  try:
127
+ config_path = os.path.join(os.path.dirname(__file__), "voice_examples", "config.json")
128
+
129
+ # Check if directory exists
130
+ if not os.path.exists(os.path.dirname(config_path)):
131
+ logger.warning("Voice examples directory not found")
132
+ return {"EMPTY": "No reference voice"}
133
+
134
+ with open(config_path, "r") as f:
135
  voice_dict = json.load(f)
136
  voice_presets = {k: v["transcript"] for k, v in voice_dict.items()}
137
  voice_presets["EMPTY"] = "No reference voice"
 
174
  "】": "]", # right square bracket
175
  "《": "<", # left angle quote
176
  "》": ">", # right angle quote
177
+ "“": '"', # left double quotation
178
+ "”": '"', # right double quotation
179
+ "‘": "'", # left single quotation
180
+ "’": "'", # right single quotation
181
  "、": ",", # enumeration comma
182
  "—": "-", # em dash
183
  "…": "...", # ellipsis
 
228
  return transcript
229
 
230
 
 
231
  def initialize_engine(model_path, audio_tokenizer_path) -> bool:
232
  """Initialize the HiggsAudioServeEngine."""
233
  global engine
234
  try:
235
+ if engine is not None:
236
+ logger.info("Engine already initialized")
237
+ return True
238
+
239
  logger.info(f"Initializing engine with model: {model_path} and audio tokenizer: {audio_tokenizer_path}")
240
  engine = HiggsAudioServeEngine(
241
  model_name_or_path=model_path,
 
326
  global engine
327
 
328
  if engine is None:
329
+ if not initialize_engine(DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH):
330
+ return "❌ Failed to initialize engine", None
331
 
332
  try:
333
  # Prepare ChatML sample
 
382
  return f"❌ {error_msg}", None
383
 
384
 
385
+ def initialize_globals():
386
+ """Initialize global variables"""
387
+ global VOICE_PRESETS
388
+ VOICE_PRESETS = load_voice_presets()
389
+
390
+
391
  def create_ui():
392
+ # Try to load theme
393
+ try:
394
+ my_theme = gr.Theme.load("theme.json")
395
+ except Exception as e:
396
+ logger.warning(f"Failed to load theme.json: {e}, using default theme")
397
+ my_theme = gr.themes.Default()
398
 
399
+ # Enhanced CSS with animations and visual improvements
400
  custom_css = """
401
+ /* Remove focus highlighting */
402
  .gradio-container input:focus,
403
  .gradio-container textarea:focus,
404
  .gradio-container select:focus,
 
414
  background-color: var(--input-background-fill) !important;
415
  }
416
 
417
+ /* Gradient background */
418
+ .gradio-container {
419
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
420
+ min-height: 100vh;
421
+ }
422
+
423
+ /* Main container styling */
424
+ .container {
425
+ backdrop-filter: blur(10px);
426
+ background: rgba(255, 255, 255, 0.95);
427
+ border-radius: 20px;
428
+ box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
429
+ }
430
+
431
+ /* Header styling */
432
+ .header-container {
433
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
434
+ padding: 2rem;
435
+ border-radius: 15px;
436
+ margin-bottom: 2rem;
437
+ box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1);
438
+ }
439
+
440
+ .header-title {
441
+ color: white;
442
+ font-size: 2.5rem;
443
+ font-weight: bold;
444
+ text-align: center;
445
+ margin: 0;
446
+ text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.2);
447
+ }
448
+
449
+ .header-subtitle {
450
+ color: rgba(255, 255, 255, 0.9);
451
+ text-align: center;
452
+ margin-top: 0.5rem;
453
+ font-size: 1.1rem;
454
+ }
455
+
456
+ /* Template cards */
457
+ .template-card {
458
+ background: white;
459
+ border-radius: 12px;
460
+ padding: 1.5rem;
461
+ margin: 0.5rem;
462
+ border: 2px solid transparent;
463
+ transition: all 0.3s ease;
464
+ cursor: pointer;
465
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
466
+ }
467
+
468
+ .template-card:hover {
469
+ transform: translateY(-3px);
470
+ box-shadow: 0 4px 20px rgba(0, 0, 0, 0.15);
471
+ border-color: var(--primary-500);
472
+ }
473
+
474
+ .template-card.selected {
475
+ border-color: var(--primary-500);
476
+ background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
477
+ }
478
+
479
+ .template-icon {
480
+ font-size: 2rem;
481
+ margin-bottom: 0.5rem;
482
+ }
483
+
484
+ /* Voice preset cards */
485
+ .voice-card {
486
+ background: white;
487
+ border-radius: 10px;
488
+ padding: 1rem;
489
+ margin: 0.5rem;
490
+ border: 2px solid #e0e0e0;
491
+ transition: all 0.3s ease;
492
+ cursor: pointer;
493
+ text-align: center;
494
+ }
495
+
496
+ .voice-card:hover {
497
+ border-color: var(--primary-500);
498
+ transform: scale(1.05);
499
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
500
+ }
501
+
502
+ .voice-card.selected {
503
+ border-color: var(--primary-500);
504
+ background: #f0f8ff;
505
+ }
506
+
507
+ /* Generate button animation */
508
+ .generate-btn {
509
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
510
+ color: white;
511
+ font-size: 1.2rem;
512
+ font-weight: bold;
513
+ padding: 0.8rem 2rem;
514
+ border-radius: 30px;
515
+ border: none;
516
+ cursor: pointer;
517
+ transition: all 0.3s ease;
518
+ box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4);
519
+ }
520
+
521
+ .generate-btn:hover {
522
+ transform: translateY(-2px);
523
+ box-shadow: 0 6px 20px rgba(102, 126, 234, 0.6);
524
+ }
525
+
526
+ .generate-btn:active {
527
+ transform: translateY(0);
528
+ }
529
+
530
+ /* Audio player styling */
531
+ .audio-container {
532
+ background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
533
+ padding: 2rem;
534
+ border-radius: 15px;
535
+ box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1);
536
+ }
537
+
538
+ /* Progress indicator */
539
+ .progress-bar {
540
+ height: 4px;
541
+ background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
542
+ border-radius: 2px;
543
+ animation: progress 2s ease-in-out infinite;
544
  }
545
 
546
+ @keyframes progress {
547
+ 0% { transform: translateX(-100%); }
548
+ 100% { transform: translateX(100%); }
549
+ }
550
+
551
+ /* Accordion styling */
552
+ .gr-accordion {
553
+ background: white;
554
+ border-radius: 10px;
555
+ border: 1px solid #e0e0e0;
556
+ margin-top: 1rem;
557
+ }
558
+
559
+ /* Info cards */
560
+ .info-card {
561
+ background: #f8f9fa;
562
+ border-left: 4px solid var(--primary-500);
563
+ padding: 1rem;
564
+ margin: 1rem 0;
565
+ border-radius: 5px;
566
+ }
567
+
568
+ /* Tooltips */
569
+ .tooltip {
570
+ position: relative;
571
+ display: inline-block;
572
+ border-bottom: 1px dotted black;
573
+ }
574
+
575
+ .tooltip .tooltiptext {
576
+ visibility: hidden;
577
+ width: 200px;
578
+ background-color: #555;
579
+ color: #fff;
580
+ text-align: center;
581
+ border-radius: 6px;
582
+ padding: 5px;
583
+ position: absolute;
584
+ z-index: 1;
585
+ bottom: 125%;
586
+ left: 50%;
587
+ margin-left: -100px;
588
+ opacity: 0;
589
+ transition: opacity 0.3s;
590
+ }
591
+
592
+ .tooltip:hover .tooltiptext {
593
+ visibility: visible;
594
+ opacity: 1;
595
+ }
596
+
597
+ /* Responsive design */
598
+ @media (max-width: 768px) {
599
+ .header-title {
600
+ font-size: 2rem;
601
+ }
602
+ .template-card {
603
+ margin: 0.25rem;
604
+ padding: 1rem;
605
+ }
606
  }
607
  """
608
 
609
  default_template = "smart-voice"
610
 
611
+ """Create the enhanced Gradio UI."""
612
+ with gr.Blocks(theme=my_theme, css=custom_css, title="Higgs Audio TTS") as demo:
613
+ # Header with gradient background
614
+ gr.HTML("""
615
+ <div class="header-container">
616
+ <h1 class="header-title">🎙️ Higgs Audio Text-to-Speech</h1>
617
+ <p class="header-subtitle">Transform your text into natural, expressive speech with AI</p>
618
+ </div>
619
+ """)
620
 
621
  # Main UI section
622
  with gr.Row():
623
  with gr.Column(scale=2):
624
+ # Template selection with visual cards
625
+ gr.Markdown("### 🎯 Choose Your Template")
626
+
627
  template_dropdown = gr.Dropdown(
628
  label="TTS Template",
629
  choices=list(PREDEFINED_EXAMPLES.keys()),
630
  value=default_template,
631
+ info="Select a predefined template to get started quickly",
632
+ elem_classes=["template-selector"]
633
  )
634
 
635
+ # Template description with enhanced styling
636
  template_description = gr.HTML(
637
+ value=f'<div class="info-card">{PREDEFINED_EXAMPLES[default_template]["description"]}</div>',
638
  visible=True,
639
  )
640
 
641
+ # System prompt with better styling
642
+ with gr.Group():
643
+ gr.Markdown("### 🔧 System Configuration")
644
+ system_prompt = gr.TextArea(
645
+ label="System Prompt",
646
+ placeholder="Enter system prompt to guide the model...",
647
+ value=PREDEFINED_EXAMPLES[default_template]["system_prompt"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
648
  lines=3,
649
+ elem_classes=["system-prompt"]
650
  )
651
 
652
+ # Input text with character counter
653
+ with gr.Group():
654
+ gr.Markdown("### ✍️ Your Text")
655
+ input_text = gr.TextArea(
656
+ label="Input Text",
657
+ placeholder="Type the text you want to convert to speech...",
658
+ value=PREDEFINED_EXAMPLES[default_template]["input_text"],
659
+ lines=6,
660
+ elem_classes=["input-text"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
661
  )
662
+ char_count = gr.Markdown(f"Character count: {len(PREDEFINED_EXAMPLES[default_template]['input_text'])}")
663
+
664
+ # Voice selection section
665
+ with gr.Group(visible=False) as voice_section:
666
+ gr.Markdown("### 🎭 Voice Selection")
667
+ voice_preset = gr.Dropdown(
668
+ label="Voice Preset",
669
+ choices=list(VOICE_PRESETS.keys()),
670
+ value="EMPTY",
671
+ interactive=False,
672
+ visible=False,
673
+ elem_classes=["voice-preset"]
674
  )
675
+
676
+ with gr.Accordion(
677
+ "🎤 Custom Reference Audio", open=False, visible=False
678
+ ) as custom_reference_accordion:
679
+ reference_audio = gr.Audio(
680
+ label="Upload Reference Audio",
681
+ type="filepath",
682
+ elem_classes=["reference-audio"]
683
+ )
684
+ reference_text = gr.TextArea(
685
+ label="Reference Text (transcript of the reference audio)",
686
+ placeholder="Enter the transcript of your reference audio for better voice cloning...",
687
+ lines=3,
688
+ elem_classes=["reference-text"]
689
+ )
690
+
691
+ # Advanced parameters with better organization
692
+ with gr.Accordion("⚙️ Advanced Parameters", open=False):
693
+ with gr.Row():
694
+ with gr.Column():
695
+ max_completion_tokens = gr.Slider(
696
+ minimum=128,
697
+ maximum=4096,
698
+ value=1024,
699
+ step=10,
700
+ label="Max Completion Tokens",
701
+ info="Maximum number of tokens to generate"
702
+ )
703
+ temperature = gr.Slider(
704
+ minimum=0.0,
705
+ maximum=1.5,
706
+ value=1.0,
707
+ step=0.1,
708
+ label="Temperature",
709
+ info="Controls randomness in generation"
710
+ )
711
+ with gr.Column():
712
+ top_p = gr.Slider(
713
+ minimum=0.1,
714
+ maximum=1.0,
715
+ value=0.95,
716
+ step=0.05,
717
+ label="Top P",
718
+ info="Nucleus sampling parameter"
719
+ )
720
+ top_k = gr.Slider(
721
+ minimum=-1,
722
+ maximum=100,
723
+ value=50,
724
+ step=1,
725
+ label="Top K",
726
+ info="Top-k sampling parameter (-1 to disable)"
727
+ )
728
+
729
+ with gr.Row():
730
+ with gr.Column():
731
+ ras_win_len = gr.Slider(
732
+ minimum=0,
733
+ maximum=10,
734
+ value=7,
735
+ step=1,
736
+ label="RAS Window Length",
737
+ info="Window length for repetition avoidance sampling"
738
+ )
739
+ with gr.Column():
740
+ ras_win_max_num_repeat = gr.Slider(
741
+ minimum=1,
742
+ maximum=10,
743
+ value=2,
744
+ step=1,
745
+ label="RAS Max Num Repeat",
746
+ info="Maximum repetitions allowed in the window"
747
+ )
748
+
749
+ # Stop strings with better UI
750
+ gr.Markdown("#### Stop Strings")
751
  stop_strings = gr.Dataframe(
752
  label="Stop Strings",
753
  headers=["stops"],
 
755
  value=[[s] for s in DEFAULT_STOP_STRINGS],
756
  interactive=True,
757
  col_count=(1, "fixed"),
758
+ elem_classes=["stop-strings"]
759
  )
760
 
761
+ # Generate button with enhanced styling
762
+ with gr.Row():
763
+ submit_btn = gr.Button(
764
+ "🚀 Generate Speech",
765
+ variant="primary",
766
+ scale=1,
767
+ elem_classes=["generate-btn"]
768
+ )
769
 
770
+ # Output column with better organization
771
  with gr.Column(scale=2):
772
+ # Status and progress section
773
+ with gr.Group():
774
+ gr.Markdown("### 📊 Generation Status")
775
+ status_text = gr.Markdown("Ready to generate speech...", elem_classes=["status-text"])
776
+
777
+ # Model response section
778
+ with gr.Group():
779
+ gr.Markdown("### 💬 Model Response")
780
+ output_text = gr.TextArea(
781
+ label="Generated Text Output",
782
+ lines=3,
783
+ interactive=False,
784
+ elem_classes=["output-text"]
785
+ )
786
 
787
+ # Audio output with enhanced player
788
+ with gr.Group():
789
+ gr.Markdown("### 🎵 Generated Audio")
790
+ output_audio = gr.Audio(
791
+ label="Audio Player",
792
+ interactive=False,
793
+ autoplay=True,
794
+ elem_classes=["audio-container"]
795
+ )
796
+
797
+ with gr.Row():
798
+ stop_btn = gr.Button(
799
+ "⏹️ Stop Playback",
800
+ variant="secondary",
801
+ elem_classes=["stop-btn"]
802
+ )
803
+ download_btn = gr.Button(
804
+ "💾 Download Audio",
805
+ variant="secondary",
806
+ elem_classes=["download-btn"],
807
+ visible=False
808
+ )
809
+
810
+ # Quick tips section
811
+ gr.Markdown("""
812
+ <div class="info-card">
813
+ <h4>💡 Quick Tips:</h4>
814
+ <ul>
815
+ <li>For voice cloning, upload a clear 10-30 second audio sample</li>
816
+ <li>Use [music start] and [music end] tags for background music</li>
817
+ <li>Add [SPEAKER0] and [SPEAKER1] tags for multi-speaker dialogue</li>
818
+ <li>Experiment with temperature (0.8-1.2) for varied speech styles</li>
819
+ </ul>
820
+ </div>
821
+ """)
822
+
823
+ # Voice samples section with visual cards
824
  with gr.Row(visible=False) as voice_samples_section:
825
+ gr.Markdown("### 🎧 Voice Samples Library")
826
  voice_samples_table = gr.Dataframe(
827
  headers=["Voice Preset", "Sample Text"],
828
  datatype=["str", "str"],
829
  value=[[preset, text] for preset, text in VOICE_PRESETS.items() if preset != "EMPTY"],
830
  interactive=False,
831
+ elem_classes=["voice-samples-table"]
832
+ )
833
+ sample_audio = gr.Audio(
834
+ label="🔊 Preview Voice Sample",
835
+ elem_classes=["sample-audio"]
836
  )
837
+
838
+ # Function to update character count
839
+ def update_char_count(text):
840
+ return f"Character count: {len(text)}"
841
 
842
  # Function to play voice sample when clicking on a row
843
  def play_voice_sample(evt: gr.SelectData):
844
  try:
 
845
  preset_names = [preset for preset in VOICE_PRESETS.keys() if preset != "EMPTY"]
846
  if evt.index[0] < len(preset_names):
847
  preset = preset_names[evt.index[0]]
 
859
  gr.Error(f"Error playing voice sample: {e}")
860
  return None
861
 
 
 
862
  # Function to handle template selection
863
  def apply_template(template_name):
864
  if template_name in PREDEFINED_EXAMPLES:
865
  template = PREDEFINED_EXAMPLES[template_name]
 
866
  is_voice_clone = template_name == "voice-clone"
867
  voice_preset_value = "belinda" if is_voice_clone else "EMPTY"
 
868
  ras_win_len_value = 0 if template_name == "single-speaker-bgm" else 7
869
+ description_html = f'<div class="info-card">{template["description"]}</div>'
870
+
871
  return (
872
  template["system_prompt"], # system_prompt
873
  template["input_text"], # input_text
874
+ description_html, # template_description
875
  gr.update(
876
+ value=voice_preset_value,
877
+ interactive=is_voice_clone,
878
+ visible=is_voice_clone
879
+ ), # voice_preset
880
+ gr.update(visible=is_voice_clone), # custom reference accordion
881
+ gr.update(visible=is_voice_clone), # voice samples section
882
  ras_win_len_value, # ras_win_len
883
+ gr.update(visible=is_voice_clone), # voice_section
884
+ update_char_count(template["input_text"]), # char_count
885
  )
886
+ return (gr.update(),) * 9
887
+
888
+ # Enhanced text_to_speech wrapper with status updates
889
+ def text_to_speech_with_status(
890
+ text, voice_preset, reference_audio, reference_text,
891
+ max_completion_tokens, temperature, top_p, top_k,
892
+ system_prompt, stop_strings, ras_win_len, ras_win_max_num_repeat
893
+ ):
894
+ # Update status
895
+ yield "🔄 Initializing model...", None, None, gr.update(visible=False)
896
+
897
+ # Call the actual TTS function
898
+ result_text, audio_result = text_to_speech(
899
+ text, voice_preset, reference_audio, reference_text,
900
+ max_completion_tokens, temperature, top_p, top_k,
901
+ system_prompt, stop_strings, ras_win_len, ras_win_max_num_repeat
902
+ )
903
+
904
+ if audio_result:
905
+ status = "✅ Speech generated successfully!"
906
+ download_visible = True
907
  else:
908
+ status = "❌ Failed to generate speech"
909
+ download_visible = False
910
+
911
+ yield status, result_text, audio_result, gr.update(visible=download_visible)
 
 
 
 
 
912
 
913
  # Set up event handlers
914
+
915
+ # Character count update
916
+ input_text.change(
917
+ fn=update_char_count,
918
+ inputs=[input_text],
919
+ outputs=[char_count]
920
+ )
921
 
922
+ # Template selection
923
  template_dropdown.change(
924
  fn=apply_template,
925
  inputs=[template_dropdown],
 
931
  custom_reference_accordion,
932
  voice_samples_section,
933
  ras_win_len,
934
+ voice_section,
935
+ char_count,
936
  ],
937
  )
938
 
939
+ # Voice sample preview
940
+ voice_samples_table.select(
941
+ fn=play_voice_sample,
942
+ outputs=[sample_audio]
943
+ )
944
+
945
+ # Generate button with status updates
946
  submit_btn.click(
947
+ fn=text_to_speech_with_status,
948
  inputs=[
949
  input_text,
950
  voice_preset,
 
959
  ras_win_len,
960
  ras_win_max_num_repeat,
961
  ],
962
+ outputs=[status_text, output_text, output_audio, download_btn],
963
  api_name="generate_speech",
964
  )
965
 
 
971
  js="() => {const audio = document.querySelector('audio'); if(audio) audio.pause(); return null;}",
972
  )
973
 
974
+ # Download button functionality
975
+ download_btn.click(
976
+ fn=lambda x: x,
977
+ inputs=[output_audio],
978
+ outputs=[],
979
+ js="(audio) => {if(audio) {const a = document.createElement('a'); a.href = audio.url; a.download = 'generated_speech.wav'; a.click();}}",
980
+ )
981
+
982
  return demo
983
 
984
 
985
  def main():
986
  """Main function to parse arguments and launch the UI."""
987
+ global DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH
988
 
989
  parser = argparse.ArgumentParser(description="Gradio UI for Text-to-Speech using HiggsAudioServeEngine")
990
  parser.add_argument(
 
999
 
1000
  args = parser.parse_args()
1001
 
1002
+ # Initialize global variables
1003
+ initialize_globals()
1004
 
1005
  # Create and launch the UI
1006
  demo = create_ui()
1007
+ demo.launch(
1008
+ server_name=args.host,
1009
+ server_port=args.port,
1010
+ share=False,
1011
+ show_error=True
1012
+ )
1013
 
1014
 
1015
  if __name__ == "__main__":
1016
+ main()