eaysu committed on
Commit
7cbdddc
·
1 Parent(s): 204c3ad

model and language versions enhanced

Browse files
Files changed (1) hide show
  1. app.py +62 -33
app.py CHANGED
@@ -1,52 +1,81 @@
1
  import gradio as gr
2
  from transformers import AutoProcessor, BarkModel
3
  import torch
4
- import scipy
5
 
6
- # Limit CPU usage
7
  torch.set_num_threads(1)
8
 
9
- # Load the Bark model and processor
10
- processor = AutoProcessor.from_pretrained("suno/bark-small")
11
- model = BarkModel.from_pretrained("suno/bark-small")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # Function to generate speech
14
- def generate_speech(text, voice_preset):
15
- # Process the input text with the selected voice preset
16
- inputs = processor(text, voice_preset=voice_preset)
17
-
18
- # Generate audio and convert to float32 early to optimize memory usage
19
- with torch.no_grad(): # Disable gradient calculations for faster inference
 
20
  audio_array = model.generate(**inputs)
21
- audio_array = audio_array.cpu().numpy().astype('float32').squeeze() # Converting early
22
-
23
- # Return the audio with sample rate for Gradio's audio component
24
  return (model.generation_config.sample_rate, audio_array)
25
 
26
  # Gradio app setup
27
  with gr.Blocks() as app:
28
- gr.Markdown("# Turkish Text-to-Speech with Bark")
29
- gr.Markdown("Enter text, select a Turkish voice preset, and click 'Generate Voice' to play the generated audio.")
30
 
31
- # Input text box for user to type text
32
- text_input = gr.Textbox(label="Enter Text in Turkish", placeholder="Merhaba, bugün bir yerlere gidelim mi?")
33
-
34
- # Dropdown for selecting voice preset
35
- voice_preset_input = gr.Dropdown(
36
- ["v2/tr_speaker_0", "v2/tr_speaker_1", "v2/tr_speaker_2", "v2/tr_speaker_3",
37
- "v2/tr_speaker_4", "v2/tr_speaker_5", "v2/tr_speaker_6",
38
- "v2/tr_speaker_7", "v2/tr_speaker_8", "v2/tr_speaker_9"],
39
- label="Select Turkish Voice Preset"
40
  )
41
 
42
- # Audio output component for playing generated audio
43
- audio_output = gr.Audio(label="Generated Voice", type="numpy")
44
-
45
- # Button to trigger the generation
 
 
 
 
 
 
46
  generate_button = gr.Button("Generate Voice")
47
 
48
- # When the button is clicked, call the generate_speech function
49
- generate_button.click(generate_speech, inputs=[text_input, voice_preset_input], outputs=audio_output)
 
 
 
 
 
 
 
 
 
 
50
 
51
- # Launch the Gradio app
52
- app.launch(share=True)
 
1
  import gradio as gr
2
  from transformers import AutoProcessor, BarkModel
3
  import torch
4
+ import numpy as np
5
 
 
6
  torch.set_num_threads(1)
7
 
8
# Every selectable Bark checkpoint is loaded once, at import time, and kept in
# this table so that picking a different model in the UI is a dict lookup
# rather than a reload.
models = {
    checkpoint: BarkModel.from_pretrained(checkpoint)
    for checkpoint in ("suno/bark", "suno/bark-small")
}
13
+
14
# Voice presets for each language, keyed by the language's display name.
# Each language exposes ten speakers named "v2/<code>_speaker_0" through
# "v2/<code>_speaker_9".
voice_presets = {
    language: [f"v2/{code}_speaker_{index}" for index in range(10)]
    for language, code in (
        ("English", "en"),
        ("French", "fr"),
        ("German", "de"),
        ("Turkish", "tr"),
    )
}
29
+
30
def update_voice_presets(language):
    """Return a component update listing the voice presets for ``language``.

    Args:
        language: Display name of the selected language (a key of
            ``voice_presets``). May be ``None`` when the language dropdown
            has been cleared.

    Returns:
        A Gradio update that replaces the dropdown's choices and selects the
        first preset of the new language, or clears the selection when the
        language is unknown.
    """
    # An unknown or cleared language yields an empty list instead of a KeyError.
    choices = voice_presets.get(language, [])
    # gr.update() is used instead of gr.Dropdown.update(), which newer Gradio
    # releases no longer provide. The value is reset as well so that a preset
    # belonging to the previously selected language does not stay selected.
    return gr.update(choices=choices, value=choices[0] if choices else None)
33
 
34
  # Function to generate speech
35
# Processors are cached per checkpoint so each one is loaded only once rather
# than on every request.
_processors = {}


def _get_processor(model_name):
    """Return the processor for ``model_name``, loading it on first use."""
    if model_name not in _processors:
        _processors[model_name] = AutoProcessor.from_pretrained(model_name)
    return _processors[model_name]


def generate_speech(text, model_name, voice_preset):
    """Synthesize ``text`` with the selected Bark model and voice preset.

    Args:
        text: The text to speak.
        model_name: Key of the model to use in ``models``.
        voice_preset: Voice preset identifier passed to the processor; an
            empty selection is forwarded as ``None``.

    Returns:
        A ``(sample_rate, audio)`` tuple, where ``audio`` is a float32 NumPy
        array, in the form expected by a ``gr.Audio(type="numpy")`` output.

    Raises:
        gr.Error: If ``text`` is blank or ``model_name`` is not a loaded model.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if model_name not in models:
        raise gr.Error("Please select a model.")

    model = models[model_name]
    processor = _get_processor(model_name)
    inputs = processor(text, voice_preset=voice_preset or None, return_tensors="pt")

    # Keep the attention mask produced by the processor; overwriting it with
    # ones would mark any padding positions as real tokens. Fall back to an
    # all-ones mask only when the processor did not return one.
    # NOTE(review): assumes the processor pads its output and returns a mask
    # by default - confirm against the installed transformers version.
    if inputs.get("attention_mask") is None:
        inputs["attention_mask"] = torch.ones_like(inputs["input_ids"])

    with torch.no_grad():  # inference only, no gradients needed
        audio_array = model.generate(**inputs)
    audio_array = audio_array.cpu().numpy().astype("float32").squeeze()
    return (model.generation_config.sample_rate, audio_array)
45
 
46
  # Gradio app setup
47
with gr.Blocks() as app:
    gr.Markdown("# Multilingual Text-to-Speech with Bark")

    # Choices are derived from the lookup tables so the UI cannot drift out of
    # sync with them, and every control starts with a usable default so the
    # user can generate audio without touching each dropdown first.
    languages = list(voice_presets)
    default_language = languages[0]
    default_presets = voice_presets[default_language]
    model_names = list(models)

    # Language selection
    language_input = gr.Dropdown(
        languages,
        value=default_language,
        label="Select Language",
    )

    # Textbox for user input
    text_input = gr.Textbox(
        label="Enter Text",
        placeholder="Type something to synthesize...",
    )

    # Model selection
    model_preset_input = gr.Dropdown(
        model_names,
        value=model_names[0],
        label="Select Model",
    )

    # Voice preset dropdown (choices are replaced when the language changes)
    voice_preset_input = gr.Dropdown(
        choices=default_presets,
        value=default_presets[0],
        label="Select Voice Preset",
    )

    # Button to generate voice
    generate_button = gr.Button("Generate Voice")

    # Output audio
    audio_output = gr.Audio(label="Generated Voice", type="numpy")

    # Refresh the voice presets whenever a different language is selected
    language_input.change(
        update_voice_presets,
        inputs=language_input,
        outputs=voice_preset_input,
    )

    # Generate voice on button click
    generate_button.click(
        generate_speech,
        inputs=[text_input, model_preset_input, voice_preset_input],
        outputs=audio_output,
    )

app.launch()