File size: 2,108 Bytes
a6bbc99
 
907a50a
7cbdddc
a6bbc99
907a50a
 
d4af80c
7cbdddc
 
 
 
 
d4af80c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a6bbc99
 
7cbdddc
 
 
 
 
 
 
907a50a
7cbdddc
a6bbc99
 
 
 
7cbdddc
a6bbc99
7cbdddc
 
 
 
2fc3324
7cbdddc
d4af80c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import gradio as gr
from transformers import AutoProcessor, BarkModel
import torch
import numpy as np

# Constrain PyTorch to a single CPU thread (e.g. for shared/limited hosts).
torch.set_num_threads(1)

# Eagerly load every Bark checkpoint the UI offers, keyed by model id,
# so generation requests never pay the load cost.
MODEL_IDS = ("suno/bark", "suno/bark-small")
models = {model_id: BarkModel.from_pretrained(model_id) for model_id in MODEL_IDS}

# Voice presets offered in the UI: speakers 0-9 for each supported language.
# Generated programmatically instead of hand-written, so adding a language
# (or more speakers) is a one-token change rather than ten new string literals.
_PRESET_LANGUAGES = ("en", "fr", "de", "tr")
_SPEAKERS_PER_LANGUAGE = 10

all_voice_presets = [
    f"v2/{lang}_speaker_{idx}"
    for lang in _PRESET_LANGUAGES
    for idx in range(_SPEAKERS_PER_LANGUAGE)
]

# Cache processors per model id so repeated generations don't re-instantiate
# them (AutoProcessor.from_pretrained hits disk/hub on every call otherwise).
_processors = {}


def generate_speech(text, model_name, voice_preset):
    """Synthesize speech from text with the selected Bark model.

    Args:
        text: Input text to synthesize.
        model_name: Key into the module-level ``models`` dict
            (e.g. ``"suno/bark"`` or ``"suno/bark-small"``).
        voice_preset: Bark voice preset id (e.g. ``"v2/en_speaker_0"``).

    Returns:
        A ``(sample_rate, audio_array)`` tuple in the format Gradio's
        ``gr.Audio`` component accepts; ``audio_array`` is a 1-D float32
        numpy array.
    """
    model = models[model_name]

    # Load the processor once per model and reuse it on subsequent calls.
    processor = _processors.get(model_name)
    if processor is None:
        processor = AutoProcessor.from_pretrained(model_name)
        _processors[model_name] = processor

    inputs = processor(text, voice_preset=voice_preset, return_tensors="pt")
    # Bark's processor may omit the attention mask; supply an all-ones mask
    # to silence generate() warnings and make attention explicit.
    inputs['attention_mask'] = torch.ones_like(inputs['input_ids'])

    # Inference only — no gradients needed.
    with torch.no_grad():
        audio_array = model.generate(**inputs)
    audio_array = audio_array.cpu().numpy().astype('float32').squeeze()
    return (model.generation_config.sample_rate, audio_array)

# Gradio app setup
with gr.Blocks() as app:
    gr.Markdown("# Multilingual Text-to-Speech with Bark")

    # Textbox for user input
    text_input = gr.Textbox(label="Enter Text", placeholder="Type something to synthesize...")

    # Model selection
    model_preset_input = gr.Dropdown(["suno/bark", "suno/bark-small"], label="Select Model", value="suno/bark-small")

    # Combined voice pr