Zeroxdesignart committed on
Commit
c81140c
·
verified ·
1 Parent(s): 6c33c6b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -193
app.py CHANGED
@@ -1,197 +1,44 @@
1
  import numpy as np
2
  import gradio as gr
3
  from bark import SAMPLE_RATE, generate_audio, preload_models
4
- from bark.generation import SUPPORTED_LANGS
5
- from share_btn import community_icon_html, loading_icon_html, share_js
6
 
7
- DEBUG_MODE = False
8
-
9
- if not DEBUG_MODE:
10
- _ = preload_models()
11
-
12
- AVAILABLE_PROMPTS = ["Unconditional", "Announcer"]
13
- PROMPT_LOOKUP = {}
14
- for _, lang in SUPPORTED_LANGS:
15
- for n in range(10):
16
- label = f"Speaker {n} ({lang})"
17
- AVAILABLE_PROMPTS.append(label)
18
- PROMPT_LOOKUP[label] = f"{lang}_speaker_{n}"
19
- PROMPT_LOOKUP["Unconditional"] = None
20
- PROMPT_LOOKUP["Announcer"] = "announcer"
21
-
22
- default_text = "Hello, my name is Suno. And, uh — and I like pizza. [laughs]\nBut I also have other interests such as playing tic tac toe."
23
-
24
- title = "# 🐶 Bark</div>"
25
-
26
- description = """
27
- <div>
28
- <a style="display:inline-block" href='https://github.com/suno-ai/bark'><img src='https://img.shields.io/github/stars/suno-ai/bark?style=social' /></a>
29
- <a style='display:inline-block' href='https://discord.gg/J2B2vsjKuE'><img src='https://dcbadge.vercel.app/api/server/J2B2vsjKuE?compact=true&style=flat' /></a>
30
- <a style="display:inline-block; margin-left: 1em" href="https://huggingface.co/spaces/suno/bark?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space%20to%20skip%20the%20queue-blue?labelColor=white&style=flat&logo=&logoWidth=14" alt="Duplicate Space"></a>
31
- </div>
32
- Bark is a universal text-to-audio model created by [Suno](www.suno.ai), with code publicly available [here](https://github.com/suno-ai/bark). \
33
- Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. \
34
- This demo should be used for research purposes only. Commercial use is strictly prohibited. \
35
- The model output is not censored and the authors do not endorse the opinions in the generated content. \
36
- Use at your own risk.
37
- """
38
-
39
- article = """
40
-
41
- ## 🌎 Foreign Language
42
-
43
- Bark supports various languages out-of-the-box and automatically determines language from input text. \
44
- When prompted with code-switched text, Bark will even attempt to employ the native accent for the respective languages in the same voice.
45
-
46
- Try the prompt:
47
-
48
- ```
49
- Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible.
50
- ```
51
-
52
- ## 🤭 Non-Speech Sounds
53
-
54
- Below is a list of some known non-speech sounds, but we are finding more every day. \
55
- Please let us know if you find patterns that work particularly well on Discord!
56
-
57
- * [laughter]
58
- * [laughs]
59
- * [sighs]
60
- * [music]
61
- * [gasps]
62
- * [clears throat]
63
- * — or ... for hesitations
64
- * ♪ for song lyrics
65
- * capitalization for emphasis of a word
66
- * MAN/WOMAN: for bias towards speaker
67
-
68
- Try the prompt:
69
-
70
- ```
71
- " [clears throat] Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as... ♪ singing ♪."
72
- ```
73
-
74
- ## 🎶 Music
75
- Bark can generate all types of audio, and, in principle, doesn't see a difference between speech and music. \
76
- Sometimes Bark chooses to generate text as music, but you can help it out by adding music notes around your lyrics.
77
-
78
- Try the prompt:
79
-
80
- ```
81
- ♪ In the jungle, the mighty jungle, the lion barks tonight ♪
82
- ```
83
-
84
- ## 🧬 Voice Cloning
85
-
86
- Bark has the capability to fully clone voices - including tone, pitch, emotion and prosody. \
87
- The model also attempts to preserve music, ambient noise, etc. from input audio. \
88
- However, to mitigate misuse of this technology, we limit the audio history prompts to a limited set of Suno-provided, fully synthetic options to choose from.
89
-
90
- ## 👥 Speaker Prompts
91
-
92
- You can provide certain speaker prompts such as NARRATOR, MAN, WOMAN, etc. \
93
- Please note that these are not always respected, especially if a conflicting audio history prompt is given.
94
-
95
- Try the prompt:
96
-
97
- ```
98
- WOMAN: I would like an oatmilk latte please.
99
- MAN: Wow, that's expensive!
100
- ```
101
-
102
- ## Details
103
-
104
- Bark model by [Suno](https://suno.ai/), including official [code](https://github.com/suno-ai/bark) and model weights. \
105
- Gradio demo supported by 🤗 Hugging Face. Bark is licensed under a non-commercial license: CC-BY 4.0 NC, see details on [GitHub](https://github.com/suno-ai/bark).
106
-
107
-
108
- """
109
-
110
- examples = [
111
- ["Please surprise me and speak in whatever voice you enjoy. Vielen Dank und Gesundheit!",
112
- "Unconditional"], # , 0.7, 0.7],
113
- ["Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as playing tic tac toe.",
114
- "Speaker 1 (en)"], # , 0.7, 0.7],
115
- ["Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible.",
116
- "Speaker 0 (es)"], # , 0.7, 0.7],
117
- ]
118
-
119
-
120
- def gen_tts(text, history_prompt): # , temp_semantic, temp_waveform):
121
- history_prompt = PROMPT_LOOKUP[history_prompt]
122
- if DEBUG_MODE:
123
- audio_arr = np.zeros(SAMPLE_RATE)
124
- else:
125
- # , text_temp=temp_semantic, waveform_temp=temp_waveform)
126
- audio_arr = generate_audio(text, history_prompt=history_prompt)
127
- audio_arr = (audio_arr * 32767).astype(np.int16)
128
- return (SAMPLE_RATE, audio_arr)
129
-
130
-
131
- css = """
132
- #share-btn-container {
133
- display: flex;
134
- padding-left: 0.5rem !important;
135
- padding-right: 0.5rem !important;
136
- background-color: #000000;
137
- justify-content: center;
138
- align-items: center;
139
- border-radius: 9999px !important;
140
- width: 13rem;
141
- margin-top: 10px;
142
- margin-left: auto;
143
- flex: unset !important;
144
- }
145
- #share-btn {
146
- all: initial;
147
- color: #ffffff;
148
- font-weight: 600;
149
- cursor: pointer;
150
- font-family: 'IBM Plex Sans', sans-serif;
151
- margin-left: 0.5rem !important;
152
- padding-top: 0.25rem !important;
153
- padding-bottom: 0.25rem !important;
154
- right:0;
155
- }
156
- #share-btn * {
157
- all: unset !important;
158
- }
159
- #share-btn-container div:nth-child(-n+2){
160
- width: auto !important;
161
- min-height: 0px !important;
162
- }
163
- #share-btn-container .wrap {
164
- display: none !important;
165
- }
166
- """
167
- with gr.Blocks(css=css) as block:
168
- gr.Markdown(title)
169
- gr.Markdown(description)
170
- with gr.Row():
171
- with gr.Column():
172
- input_text = gr.Textbox(
173
- label="Input Text", lines=2, value=default_text, elem_id="input_text")
174
- options = gr.Dropdown(
175
- AVAILABLE_PROMPTS, value="Speaker 1 (en)", label="Acoustic Prompt", elem_id="speaker_option")
176
- run_button = gr.Button(text="Generate Audio", type="button")
177
- with gr.Column():
178
- audio_out = gr.Audio(label="Generated Audio",
179
- type="numpy", elem_id="audio_out")
180
- with gr.Row(visible=False) as share_row:
181
- with gr.Group(elem_id="share-btn-container"):
182
- community_icon = gr.HTML(community_icon_html)
183
- loading_icon = gr.HTML(loading_icon_html)
184
- share_button = gr.Button(
185
- "Share to community", elem_id="share-btn")
186
- share_button.click(None, [], [], _js=share_js)
187
- inputs = [input_text, options]
188
- outputs = [audio_out]
189
- gr.Examples(examples=examples, fn=gen_tts, inputs=inputs,
190
- outputs=outputs, cache_examples=True)
191
- gr.Markdown(article)
192
- run_button.click(fn=lambda: gr.update(visible=False), inputs=None, outputs=share_row, queue=False).then(
193
- fn=gen_tts, inputs=inputs, outputs=outputs, queue=True).then(
194
- fn=lambda: gr.update(visible=True), inputs=None, outputs=share_row, queue=False)
195
-
196
- block.queue()
197
- block.launch()
 
1
  import numpy as np
2
  import gradio as gr
3
  from bark import SAMPLE_RATE, generate_audio, preload_models
 
 
4
 
5
# Voices that have already been through load_model(). Every entry holds the
# same value: whatever bark's preload_models() returned on the first load.
model_cache = {}


def load_model(model_name):
    """Ensure the Bark models are loaded and register ``model_name`` as seen.

    Bark's ``preload_models`` loads one global set of models; its positional
    parameters are device/size flags, not a voice name. The previous code
    passed ``model_name`` into the first flag and re-ran the whole load for
    every new voice. The load now happens once, with default flags, and
    later voices reuse the cached result.

    Args:
        model_name: Voice/prompt label used as the cache key.

    Returns:
        The value produced by ``preload_models()`` on the first load
        (shared by every cached voice).
    """
    if model_name not in model_cache:
        if model_cache:
            # Models are already resident; share the existing handle rather
            # than triggering another full preload.
            model_cache[model_name] = next(iter(model_cache.values()))
        else:
            model_cache[model_name] = preload_models()
    return model_cache[model_name]
11
+
12
def validate_input(text, max_length=500):
    """Check that ``text`` is usable as a generation prompt.

    Args:
        text: The user-supplied prompt. ``None`` and whitespace-only strings
            are treated the same as an empty string.
        max_length: Maximum number of characters accepted (default 500).

    Raises:
        ValueError: If ``text`` is missing/blank, or longer than
            ``max_length`` characters.
    """
    # A cleared textbox may deliver None; report it as "empty" rather than
    # letting len(None) raise a TypeError.
    if text is None or not text.strip():
        raise ValueError("Input text cannot be empty.")
    if len(text) > max_length:
        raise ValueError(
            f"Input text is too long ({max_length} characters max)."
        )
17
+
18
# UI voice labels -> Bark history-prompt preset names. Labels not listed here
# are passed to Bark unchanged, so callers may also supply a preset directly.
_VOICE_PRESETS = {
    "Speaker 1": "en_speaker_1",
    "Speaker 2": "en_speaker_2",
}


def generate_custom_audio(text, prompt, pitch, tempo):
    """Synthesize ``text`` with Bark and apply the pitch/tempo adjustment.

    Args:
        text: Prompt text; checked with ``validate_input``.
        prompt: Voice label from the UI (or a Bark preset name). ``None``
            lets Bark generate without a history prompt.
        pitch: Pitch factor forwarded to ``adjust_audio``.
        tempo: Tempo factor forwarded to ``adjust_audio``.

    Returns:
        A ``(SAMPLE_RATE, samples)`` tuple, the shape a numpy-typed
        ``gr.Audio`` output expects.

    Raises:
        gr.Error: If validation or generation fails. Gradio shows the
            message to the user; the previous code returned a mismatched
            ``(array, str)`` tuple that a single audio output cannot render.
    """
    try:
        validate_input(text)
        load_model(prompt)  # called for its side effect: models are loaded
        history_prompt = _VOICE_PRESETS.get(prompt, prompt)
        audio = generate_audio(text, history_prompt=history_prompt)
        audio = adjust_audio(audio, pitch, tempo)
    except Exception as exc:
        # UI boundary: turn any failure into a visible Gradio error instead
        # of silently handing back one second of silence.
        raise gr.Error(str(exc)) from exc
    return SAMPLE_RATE, audio
27
+
28
def adjust_audio(audio, pitch, tempo):
    """Scale ``audio`` sample-wise by the combined factor ``pitch * tempo``.

    NOTE(review): despite the parameter names, this is only a gain change.
    Multiplying samples alters loudness, not pitch or playback speed. Real
    pitch/tempo control needs resampling or time-stretching and should
    replace this body once a DSP approach is chosen.

    Args:
        audio: Audio samples; any array-like is accepted and converted to a
            NumPy array (a plain list used to fail on ``list * float``).
        pitch: Multiplicative factor from the "Pitch" control.
        tempo: Multiplicative factor from the "Tempo" control.

    Returns:
        A NumPy array with the same shape as ``audio``, scaled by
        ``pitch * tempo``.
    """
    samples = np.asarray(audio)
    gain = pitch * tempo
    return samples * gain
31
+
32
+ # Interface design
33
# Build the single-page UI: text + voice + two factor sliders in, audio out.
with gr.Blocks() as interface:
    gr.Markdown("# 🎤 Advanced Voice Generator")
    text_input = gr.Textbox(label="Enter Text")
    # Default to the first voice so the handler never receives None by accident.
    prompt_option = gr.Dropdown(
        choices=["Speaker 1", "Speaker 2"],
        value="Speaker 1",
        label="Voice",
    )
    # gr.Slider's range keywords are `minimum`/`maximum` (not `min`/`max`);
    # start both factors at the neutral value 1.0.
    pitch_slider = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, label="Pitch")
    tempo_slider = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, label="Tempo")
    generate_button = gr.Button("Generate Audio")
    audio_output = gr.Audio(label="Generated Audio")

    # Event listeners must be registered inside the Blocks context.
    generate_button.click(
        generate_custom_audio,
        inputs=[text_input, prompt_option, pitch_slider, tempo_slider],
        outputs=audio_output,
    )

interface.launch()