# Spaces: Running
# File size: 3,432 Bytes
# (commit-hash gutter and line-number ruler from the page scrape, commented
#  out so this file parses as Python:
#  a6bbc99 907a50a 7cbdddc 2fc3324 5ae480d ... | 1 2 3 ... 90 |)
import gradio as gr
from transformers import AutoProcessor, BarkModel
import torch
import numpy as np
# Limit PyTorch to a single CPU thread — presumably to keep CPU usage
# predictable on a shared Spaces host (TODO confirm intent).
torch.set_num_threads(1)

# Preload every selectable Bark checkpoint up front so the first synthesis
# request does not pay the model-load cost.
_MODEL_NAMES = ("suno/bark", "suno/bark-small")
models = {name: BarkModel.from_pretrained(name) for name in _MODEL_NAMES}
# Voice presets for each language
# Bark voice presets: ten numbered speakers per supported language.
# Preset ids follow the "v2/<lang>_speaker_<n>" naming used by suno/bark.
_PRESET_LANG_CODES = {
    "English": "en",
    "French": "fr",
    "German": "de",
    "Turkish": "tr",
}
voice_presets = {
    language: [f"v2/{code}_speaker_{n}" for n in range(10)]
    for language, code in _PRESET_LANG_CODES.items()
}
# Look up the preset list for the currently selected language.
def get_voice_presets(language):
    """Return the voice-preset ids for *language* (KeyError if unknown)."""
    presets = voice_presets[language]
    return presets
# Cache of AutoProcessor instances keyed by model name, so each processor is
# loaded from disk at most once instead of on every synthesis request (the
# original reloaded it per call via AutoProcessor.from_pretrained).
_processors = {}

# Function to generate speech
def generate_speech(text, model_name, voice_preset):
    """Synthesize *text* with the selected Bark model and voice preset.

    Args:
        text: Text to turn into speech.
        model_name: Key into the preloaded ``models`` dict
            ("suno/bark" or "suno/bark-small").
        voice_preset: Bark preset id, e.g. "v2/en_speaker_0".

    Returns:
        ``(sample_rate, waveform)`` — the tuple format ``gr.Audio`` expects
        for ``type="numpy"``; the waveform is squeezed to 1-D float32.
    """
    model = models[model_name]
    processor = _processors.get(model_name)
    if processor is None:
        # First request for this model: load and memoize the processor.
        processor = AutoProcessor.from_pretrained(model_name)
        _processors[model_name] = processor
    inputs = processor(text, voice_preset=voice_preset, return_tensors="pt")
    # Supply an explicit all-ones attention mask; presumably the Bark
    # processor omits one and generate() warns without it — TODO confirm.
    inputs['attention_mask'] = torch.ones_like(inputs['input_ids'])
    with torch.no_grad():  # inference only — no gradient bookkeeping
        audio_array = model.generate(**inputs)
    audio_array = audio_array.cpu().numpy().astype('float32').squeeze()
    return (model.generation_config.sample_rate, audio_array)
# Gradio app setup: language + model + preset selectors feeding one
# generate-button -> audio-output pipeline.
with gr.Blocks() as app:
    gr.Markdown("# Multilingual Text-to-Speech with Bark")
    # Language selection
    language_input = gr.Dropdown(
        ["English", "French", "German", "Turkish"],
        label="Select Language",
        value="English"
    )
    # Textbox for user input
    text_input = gr.Textbox(label="Enter Text", placeholder="Type something to synthesize...")
    # Model selection
    model_preset_input = gr.Dropdown(["suno/bark", "suno/bark-small"], label="Select Model", value="suno/bark-small")
    # Voice preset dropdown (default to English presets)
    voice_preset_input = gr.Dropdown(
        choices=voice_presets["English"],
        label="Select Voice Preset"
    )
    # Button to generate voice
    generate_button = gr.Button("Generate Voice")
    # Output audio
    audio_output = gr.Audio(label="Generated Voice", type="numpy")

    # Swap the preset choices when the language changes. gr.update() works on
    # both Gradio 3.x and 4.x, whereas gr.Dropdown.update() was removed in
    # Gradio 4. Also reset the selected value to the first preset so a stale
    # preset from the previous language can never be submitted.
    def update_preset_choices(language):
        choices = get_voice_presets(language)
        return gr.update(choices=choices, value=choices[0])

    # Set dynamic update on language selection
    language_input.change(update_preset_choices, inputs=language_input, outputs=voice_preset_input)
    # Generate voice on button click
    generate_button.click(
        generate_speech,
        inputs=[text_input, model_preset_input, voice_preset_input],
        outputs=audio_output
    )
app.launch()