eaysu
voice preset mechanism again changed
5ae480d
raw
history blame
3.43 kB
import gradio as gr
from transformers import AutoProcessor, BarkModel
import torch
import numpy as np
# Limit PyTorch to one intra-op CPU thread.
torch.set_num_threads(1)

# Load every selectable Bark checkpoint once at import time, keyed by its
# Hub id, so request handlers can look a model up without reloading it.
models = {
    checkpoint: BarkModel.from_pretrained(checkpoint)
    for checkpoint in ("suno/bark", "suno/bark-small")
}
# Voice presets per language: ten ids of the form "v2/<code>_speaker_<n>"
# (n = 0..9) for each language name.
voice_presets = {
    language: [f"v2/{code}_speaker_{index}" for index in range(10)]
    for language, code in (
        ("English", "en"),
        ("French", "fr"),
        ("German", "de"),
        ("Turkish", "tr"),
    )
}
# Look up the voice presets that belong to a language
def get_voice_presets(language):
    """Return the voice preset list stored under ``language``.

    ``language`` must be a key of ``voice_presets``; any other value
    raises ``KeyError``.
    """
    presets_for_language = voice_presets[language]
    return presets_for_language
# Processors already loaded, keyed by checkpoint name, so that each one is
# fetched and built only once instead of on every request.
_processors = {}


def _get_processor(model_name):
    """Return the processor for ``model_name``, loading it on first use."""
    if model_name not in _processors:
        _processors[model_name] = AutoProcessor.from_pretrained(model_name)
    return _processors[model_name]


# Function to generate speech
def generate_speech(text, model_name, voice_preset):
    """Synthesize ``text`` with the chosen model and voice preset.

    Args:
        text: The text to turn into speech.
        model_name: Key into ``models`` selecting the checkpoint to use.
        voice_preset: Voice preset forwarded unchanged to the processor.

    Returns:
        A ``(sample_rate, audio)`` tuple, where ``sample_rate`` is read from
        the model's generation config and ``audio`` is a squeezed float32
        NumPy array.

    Raises:
        gr.Error: If ``text`` is empty or only whitespace.
        KeyError: If ``model_name`` is not a key of ``models``.
    """
    # Nothing to synthesize; report it in the UI instead of running the model.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    model = models[model_name]
    processor = _get_processor(model_name)
    inputs = processor(text, voice_preset=voice_preset, return_tensors="pt")

    # Keep the attention mask produced by the processor when there is one:
    # an all-ones mask would mark any padding the processor added as real
    # text. Fall back to all ones only when no mask was returned.
    if "attention_mask" not in inputs:
        inputs["attention_mask"] = torch.ones_like(inputs["input_ids"])

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        audio_array = model.generate(**inputs)

    audio_array = audio_array.cpu().numpy().astype("float32").squeeze()
    return (model.generation_config.sample_rate, audio_array)
# Gradio app setup
with gr.Blocks() as app:
    gr.Markdown("# Multilingual Text-to-Speech with Bark")

    # Language selection; the choices are the keys of the preset table so the
    # dropdown and the table cannot drift apart.
    language_input = gr.Dropdown(
        list(voice_presets),
        label="Select Language",
        value="English",
    )

    # Textbox for user input
    text_input = gr.Textbox(
        label="Enter Text",
        placeholder="Type something to synthesize...",
    )

    # Model selection; the choices are the keys of the preloaded models.
    model_preset_input = gr.Dropdown(
        list(models),
        label="Select Model",
        value="suno/bark-small",
    )

    # Voice preset dropdown (starts with the English presets, matching the
    # default language above)
    voice_preset_input = gr.Dropdown(
        choices=voice_presets["English"],
        label="Select Voice Preset",
    )

    # Button to generate voice
    generate_button = gr.Button("Generate Voice")

    # Output audio
    audio_output = gr.Audio(label="Generated Voice", type="numpy")

    def update_preset_choices(language):
        """Switch the voice preset dropdown to the presets of ``language``.

        The selected value is reset to the first preset of the new language
        so that a preset belonging to the previously selected language does
        not stay selected.
        """
        presets = get_voice_presets(language)
        # gr.update is used instead of gr.Dropdown.update, which no longer
        # exists in Gradio 4.
        return gr.update(choices=presets, value=presets[0])

    # Set dynamic update on language selection
    language_input.change(
        update_preset_choices,
        inputs=language_input,
        outputs=voice_preset_input,
    )

    # Generate voice on button click
    generate_button.click(
        generate_speech,
        inputs=[text_input, model_preset_input, voice_preset_input],
        outputs=audio_output,
    )

app.launch()