Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -5,83 +5,67 @@ from transformers import AutoTokenizer, pipeline, WhisperForConditionalGeneratio
|
|
5 |
import numpy as np
|
6 |
import evaluate
|
7 |
|
8 |
-
# Example prompts from the paper
|
9 |
EXAMPLES = [
|
10 |
-
# Each list is [description, text, guidance_scale, num_retries, wer_threshold]
|
11 |
[
|
12 |
"A man speaks with a booming, medium-pitched voice in a clear environment, delivering his words at a measured speed.",
|
13 |
-
"That's my brother. I do agree, though, it wasn't very well-groomed.",
|
14 |
-
1.5, 3, 20.0
|
15 |
],
|
16 |
[
|
17 |
"A male speaker's speech is distinguished by a slurred articulation, delivered at a measured pace in a clear environment.",
|
18 |
-
"reveal my true intentions in different ways. That's why the Street King Project and SMS",
|
19 |
-
1.5, 3, 20.0
|
20 |
],
|
21 |
[
|
22 |
"In a clear environment, a male speaker delivers his words hesitantly with a measured pace.",
|
23 |
-
"the Grand Slam tennis game has sort of taken over our set that's sort of all the way",
|
24 |
-
1.5, 3, 20.0
|
25 |
],
|
26 |
[
|
27 |
"A low-pitched, guttural male voice speaks slowly in a clear environment.",
|
28 |
-
"you know you want to see how far you can push everything and as an artist",
|
29 |
-
1.5, 3, 20.0
|
30 |
],
|
31 |
[
|
32 |
"A man speaks with a measured pace in a clear environment, displaying a distinct British accent.",
|
33 |
-
"most important but the reaction is very similar throughout the world it's really very very similar",
|
34 |
-
1.5, 3, 20.0
|
35 |
],
|
36 |
[
|
37 |
"A male speaker's voice is clear and delivered at a measured pace in a quiet environment. His speech carries a distinct Jamaican accent.",
|
38 |
-
"about God and the people him come from is more Christian, you know. We always",
|
39 |
-
1.5, 3, 20.0
|
40 |
],
|
41 |
[
|
42 |
"In a clear environment, a male voice speaks with a sad tone.",
|
43 |
-
"Was that your landlord?",
|
44 |
-
1.5, 3, 20.0
|
45 |
],
|
46 |
[
|
47 |
"A man speaks with a measured pace in a clear environment, his voice carrying a sleepy tone.",
|
48 |
-
"I mean, to be fair, I did see a UFO, so, you know.",
|
49 |
-
1.5, 3, 20.0
|
50 |
],
|
51 |
[
|
52 |
"A frightened woman speaks with a clear and distinct voice.",
|
53 |
-
"Yes, that's what they said. I don't know what you're getting done. What are you getting done? Oh, okay. Yeah.",
|
54 |
-
1.5, 3, 20.0
|
55 |
],
|
56 |
[
|
57 |
"A woman speaks slowly in a clear environment, her voice filled with awe.",
|
58 |
-
"Oh wow, this music is fantastic. You play so well. I could just sit here.",
|
59 |
-
1.5, 3, 20.0
|
60 |
],
|
61 |
[
|
62 |
"A woman speaks with a high-pitched voice in a clear environment, conveying a sense of anxiety.",
|
63 |
-
"this is just way too overwhelming. I literally don't know how I'm going to get any of this done on time. I feel so overwhelmed right now. No one is helping me. Everyone's ignoring my calls and my emails. I don't know what I'm supposed to do right now.",
|
64 |
-
1.5, 3, 20.0
|
65 |
],
|
66 |
[
|
67 |
"A female speaker's high-pitched voice is clear and carries over a laughing, unobstructed environment.",
|
68 |
-
"What is wrong with him, Chad?",
|
69 |
-
1.5, 3, 20.0
|
70 |
],
|
71 |
[
|
72 |
"In a clear environment, a man speaks in a whispered tone.",
|
73 |
-
"The fruit piece, the still lifes, you mean.",
|
74 |
-
1.5, 3, 20.0
|
75 |
],
|
76 |
[
|
77 |
"A male speaker with a husky, low-pitched voice delivers clear speech in a quiet environment.",
|
78 |
-
"Ari had to somehow be subservient to Lloyd that would be unbelievable like if Lloyd was the guy who was like running Time Warner you know what I mean like",
|
79 |
-
1.5, 3, 20.0
|
80 |
],
|
81 |
[
|
82 |
"A female speaker's voice is clear and expressed at a measured pace, but carries a high-pitched, nasal tone, recorded in a quiet environment.",
|
83 |
-
"You know, Joe Bow, hockey mom from Wasilla, if I have an idea that would perhaps make",
|
84 |
-
1.5, 3, 20.0
|
85 |
]
|
86 |
]
|
87 |
|
@@ -148,9 +132,9 @@ class ParlerTTSInference:
|
|
148 |
self.description_tokenizer = AutoTokenizer.from_pretrained(model_name)
|
149 |
self.transcription_tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
|
150 |
self.asr_pipeline = pipeline(model=asr_model, device=self.device, chunk_length_s=25.0)
|
151 |
-
return True, "Models loaded successfully! You can now generate audio."
|
152 |
except Exception as e:
|
153 |
-
return False, f"Error loading models: {str(e)}"
|
154 |
|
155 |
def generate_audio(self, description, text, guidance_scale, num_retries, wer_threshold):
|
156 |
"""Generate audio from text with style description"""
|
@@ -194,11 +178,13 @@ def create_demo():
|
|
194 |
# Initialize the inference class
|
195 |
inference = ParlerTTSInference()
|
196 |
|
197 |
-
# Create the interface
|
198 |
-
|
|
|
|
|
199 |
gr.Markdown(
|
200 |
"""
|
201 |
-
# 🎙️ ParaSpeechCaps
|
202 |
|
203 |
Generate expressive speech with rich style control using our Parler-TTS model finetuned on ParaSpeechCaps. Control various aspects of speech including:
|
204 |
- Speaker characteristics (pitch, clarity, etc.)
|
@@ -268,12 +254,12 @@ def create_demo():
|
|
268 |
choices=["distil-whisper/distil-large-v2"],
|
269 |
value="distil-whisper/distil-large-v2",
|
270 |
label="ASR Model",
|
271 |
-
info="ASR model used for
|
272 |
)
|
273 |
|
274 |
with gr.Row():
|
275 |
load_button = gr.Button("📥 Load Models", variant="primary")
|
276 |
-
generate_button = gr.Button("🎵 Generate", variant="
|
277 |
|
278 |
with gr.Column(scale=1):
|
279 |
output_audio = gr.Audio(label="Generated Speech", type="numpy")
|
@@ -283,9 +269,17 @@ def create_demo():
|
|
283 |
load_button.click(
|
284 |
fn=inference.load_models,
|
285 |
inputs=[model_name, asr_model],
|
286 |
-
outputs=[
|
287 |
)
|
288 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
289 |
generate_button.click(
|
290 |
fn=inference.generate_audio,
|
291 |
inputs=[
|
@@ -298,18 +292,15 @@ def create_demo():
|
|
298 |
outputs=[output_audio, status_text]
|
299 |
)
|
300 |
|
301 |
-
# Add examples
|
302 |
gr.Examples(
|
303 |
examples=EXAMPLES,
|
304 |
inputs=[
|
305 |
description,
|
306 |
-
text,
|
307 |
-
guidance_scale,
|
308 |
-
num_retries,
|
309 |
-
wer_threshold
|
310 |
],
|
311 |
outputs=[output_audio, status_text],
|
312 |
-
fn=
|
313 |
cache_examples=False
|
314 |
)
|
315 |
|
|
|
5 |
import numpy as np
|
6 |
import evaluate
|
7 |
|
8 |
+
# Example prompts from the paper (only style and text)
|
9 |
EXAMPLES = [
|
|
|
10 |
[
|
11 |
"A man speaks with a booming, medium-pitched voice in a clear environment, delivering his words at a measured speed.",
|
12 |
+
"That's my brother. I do agree, though, it wasn't very well-groomed."
|
|
|
13 |
],
|
14 |
[
|
15 |
"A male speaker's speech is distinguished by a slurred articulation, delivered at a measured pace in a clear environment.",
|
16 |
+
"reveal my true intentions in different ways. That's why the Street King Project and SMS"
|
|
|
17 |
],
|
18 |
[
|
19 |
"In a clear environment, a male speaker delivers his words hesitantly with a measured pace.",
|
20 |
+
"the Grand Slam tennis game has sort of taken over our set that's sort of all the way"
|
|
|
21 |
],
|
22 |
[
|
23 |
"A low-pitched, guttural male voice speaks slowly in a clear environment.",
|
24 |
+
"you know you want to see how far you can push everything and as an artist"
|
|
|
25 |
],
|
26 |
[
|
27 |
"A man speaks with a measured pace in a clear environment, displaying a distinct British accent.",
|
28 |
+
"most important but the reaction is very similar throughout the world it's really very very similar"
|
|
|
29 |
],
|
30 |
[
|
31 |
"A male speaker's voice is clear and delivered at a measured pace in a quiet environment. His speech carries a distinct Jamaican accent.",
|
32 |
+
"about God and the people him come from is more Christian, you know. We always"
|
|
|
33 |
],
|
34 |
[
|
35 |
"In a clear environment, a male voice speaks with a sad tone.",
|
36 |
+
"Was that your landlord?"
|
|
|
37 |
],
|
38 |
[
|
39 |
"A man speaks with a measured pace in a clear environment, his voice carrying a sleepy tone.",
|
40 |
+
"I mean, to be fair, I did see a UFO, so, you know."
|
|
|
41 |
],
|
42 |
[
|
43 |
"A frightened woman speaks with a clear and distinct voice.",
|
44 |
+
"Yes, that's what they said. I don't know what you're getting done. What are you getting done? Oh, okay. Yeah."
|
|
|
45 |
],
|
46 |
[
|
47 |
"A woman speaks slowly in a clear environment, her voice filled with awe.",
|
48 |
+
"Oh wow, this music is fantastic. You play so well. I could just sit here."
|
|
|
49 |
],
|
50 |
[
|
51 |
"A woman speaks with a high-pitched voice in a clear environment, conveying a sense of anxiety.",
|
52 |
+
"this is just way too overwhelming. I literally don't know how I'm going to get any of this done on time. I feel so overwhelmed right now. No one is helping me. Everyone's ignoring my calls and my emails. I don't know what I'm supposed to do right now."
|
|
|
53 |
],
|
54 |
[
|
55 |
"A female speaker's high-pitched voice is clear and carries over a laughing, unobstructed environment.",
|
56 |
+
"What is wrong with him, Chad?"
|
|
|
57 |
],
|
58 |
[
|
59 |
"In a clear environment, a man speaks in a whispered tone.",
|
60 |
+
"The fruit piece, the still lifes, you mean."
|
|
|
61 |
],
|
62 |
[
|
63 |
"A male speaker with a husky, low-pitched voice delivers clear speech in a quiet environment.",
|
64 |
+
"Ari had to somehow be subservient to Lloyd that would be unbelievable like if Lloyd was the guy who was like running Time Warner you know what I mean like"
|
|
|
65 |
],
|
66 |
[
|
67 |
"A female speaker's voice is clear and expressed at a measured pace, but carries a high-pitched, nasal tone, recorded in a quiet environment.",
|
68 |
+
"You know, Joe Bow, hockey mom from Wasilla, if I have an idea that would perhaps make"
|
|
|
69 |
]
|
70 |
]
|
71 |
|
|
|
132 |
self.description_tokenizer = AutoTokenizer.from_pretrained(model_name)
|
133 |
self.transcription_tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
|
134 |
self.asr_pipeline = pipeline(model=asr_model, device=self.device, chunk_length_s=25.0)
|
135 |
+
return gr.Button(value="🎵 Generate", variant="primary", interactive=True), "Models loaded successfully! You can now generate audio."
|
136 |
except Exception as e:
|
137 |
+
return gr.Button(value="🎵 Generate", variant="primary", interactive=False), f"Error loading models: {str(e)}"
|
138 |
|
139 |
def generate_audio(self, description, text, guidance_scale, num_retries, wer_threshold):
|
140 |
"""Generate audio from text with style description"""
|
|
|
178 |
# Initialize the inference class
|
179 |
inference = ParlerTTSInference()
|
180 |
|
181 |
+
# Create the interface with a simple theme
|
182 |
+
theme = gr.themes.Default()
|
183 |
+
|
184 |
+
with gr.Blocks(title="ParaSpeechCaps Demo", theme=theme) as demo:
|
185 |
gr.Markdown(
|
186 |
"""
|
187 |
+
# 🎙️ Parler-TTS Mini with ParaSpeechCaps
|
188 |
|
189 |
Generate expressive speech with rich style control using our Parler-TTS model finetuned on ParaSpeechCaps. Control various aspects of speech including:
|
190 |
- Speaker characteristics (pitch, clarity, etc.)
|
|
|
254 |
choices=["distil-whisper/distil-large-v2"],
|
255 |
value="distil-whisper/distil-large-v2",
|
256 |
label="ASR Model",
|
257 |
+
info="ASR model used for quality assessment"
|
258 |
)
|
259 |
|
260 |
with gr.Row():
|
261 |
load_button = gr.Button("📥 Load Models", variant="primary")
|
262 |
+
generate_button = gr.Button("🎵 Generate", variant="primary", interactive=False)
|
263 |
|
264 |
with gr.Column(scale=1):
|
265 |
output_audio = gr.Audio(label="Generated Speech", type="numpy")
|
|
|
269 |
load_button.click(
|
270 |
fn=inference.load_models,
|
271 |
inputs=[model_name, asr_model],
|
272 |
+
outputs=[generate_button, status_text]
|
273 |
)
|
274 |
|
275 |
+
def generate_with_default_params(description, text):
|
276 |
+
return inference.generate_audio(
|
277 |
+
description, text,
|
278 |
+
guidance_scale=1.5,
|
279 |
+
num_retries=3,
|
280 |
+
wer_threshold=20.0
|
281 |
+
)
|
282 |
+
|
283 |
generate_button.click(
|
284 |
fn=inference.generate_audio,
|
285 |
inputs=[
|
|
|
292 |
outputs=[output_audio, status_text]
|
293 |
)
|
294 |
|
295 |
+
# Add examples (only style and text)
|
296 |
gr.Examples(
|
297 |
examples=EXAMPLES,
|
298 |
inputs=[
|
299 |
description,
|
300 |
+
text
|
|
|
|
|
|
|
301 |
],
|
302 |
outputs=[output_audio, status_text],
|
303 |
+
fn=generate_with_default_params,
|
304 |
cache_examples=False
|
305 |
)
|
306 |
|