ajd12342 committed on
Commit
b3cf9d6
·
verified ·
1 Parent(s): bd84ccf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -46
app.py CHANGED
@@ -5,83 +5,67 @@ from transformers import AutoTokenizer, pipeline, WhisperForConditionalGeneratio
5
  import numpy as np
6
  import evaluate
7
 
8
- # Example prompts from the paper
9
  EXAMPLES = [
10
- # Each list is [description, text, guidance_scale, num_retries, wer_threshold]
11
  [
12
  "A man speaks with a booming, medium-pitched voice in a clear environment, delivering his words at a measured speed.",
13
- "That's my brother. I do agree, though, it wasn't very well-groomed.",
14
- 1.5, 3, 20.0
15
  ],
16
  [
17
  "A male speaker's speech is distinguished by a slurred articulation, delivered at a measured pace in a clear environment.",
18
- "reveal my true intentions in different ways. That's why the Street King Project and SMS",
19
- 1.5, 3, 20.0
20
  ],
21
  [
22
  "In a clear environment, a male speaker delivers his words hesitantly with a measured pace.",
23
- "the Grand Slam tennis game has sort of taken over our set that's sort of all the way",
24
- 1.5, 3, 20.0
25
  ],
26
  [
27
  "A low-pitched, guttural male voice speaks slowly in a clear environment.",
28
- "you know you want to see how far you can push everything and as an artist",
29
- 1.5, 3, 20.0
30
  ],
31
  [
32
  "A man speaks with a measured pace in a clear environment, displaying a distinct British accent.",
33
- "most important but the reaction is very similar throughout the world it's really very very similar",
34
- 1.5, 3, 20.0
35
  ],
36
  [
37
  "A male speaker's voice is clear and delivered at a measured pace in a quiet environment. His speech carries a distinct Jamaican accent.",
38
- "about God and the people him come from is more Christian, you know. We always",
39
- 1.5, 3, 20.0
40
  ],
41
  [
42
  "In a clear environment, a male voice speaks with a sad tone.",
43
- "Was that your landlord?",
44
- 1.5, 3, 20.0
45
  ],
46
  [
47
  "A man speaks with a measured pace in a clear environment, his voice carrying a sleepy tone.",
48
- "I mean, to be fair, I did see a UFO, so, you know.",
49
- 1.5, 3, 20.0
50
  ],
51
  [
52
  "A frightened woman speaks with a clear and distinct voice.",
53
- "Yes, that's what they said. I don't know what you're getting done. What are you getting done? Oh, okay. Yeah.",
54
- 1.5, 3, 20.0
55
  ],
56
  [
57
  "A woman speaks slowly in a clear environment, her voice filled with awe.",
58
- "Oh wow, this music is fantastic. You play so well. I could just sit here.",
59
- 1.5, 3, 20.0
60
  ],
61
  [
62
  "A woman speaks with a high-pitched voice in a clear environment, conveying a sense of anxiety.",
63
- "this is just way too overwhelming. I literally don't know how I'm going to get any of this done on time. I feel so overwhelmed right now. No one is helping me. Everyone's ignoring my calls and my emails. I don't know what I'm supposed to do right now.",
64
- 1.5, 3, 20.0
65
  ],
66
  [
67
  "A female speaker's high-pitched voice is clear and carries over a laughing, unobstructed environment.",
68
- "What is wrong with him, Chad?",
69
- 1.5, 3, 20.0
70
  ],
71
  [
72
  "In a clear environment, a man speaks in a whispered tone.",
73
- "The fruit piece, the still lifes, you mean.",
74
- 1.5, 3, 20.0
75
  ],
76
  [
77
  "A male speaker with a husky, low-pitched voice delivers clear speech in a quiet environment.",
78
- "Ari had to somehow be subservient to Lloyd that would be unbelievable like if Lloyd was the guy who was like running Time Warner you know what I mean like",
79
- 1.5, 3, 20.0
80
  ],
81
  [
82
  "A female speaker's voice is clear and expressed at a measured pace, but carries a high-pitched, nasal tone, recorded in a quiet environment.",
83
- "You know, Joe Bow, hockey mom from Wasilla, if I have an idea that would perhaps make",
84
- 1.5, 3, 20.0
85
  ]
86
  ]
87
 
@@ -148,9 +132,9 @@ class ParlerTTSInference:
148
  self.description_tokenizer = AutoTokenizer.from_pretrained(model_name)
149
  self.transcription_tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
150
  self.asr_pipeline = pipeline(model=asr_model, device=self.device, chunk_length_s=25.0)
151
- return True, "Models loaded successfully! You can now generate audio."
152
  except Exception as e:
153
- return False, f"Error loading models: {str(e)}"
154
 
155
  def generate_audio(self, description, text, guidance_scale, num_retries, wer_threshold):
156
  """Generate audio from text with style description"""
@@ -194,11 +178,13 @@ def create_demo():
194
  # Initialize the inference class
195
  inference = ParlerTTSInference()
196
 
197
- # Create the interface
198
- with gr.Blocks(title="ParaSpeechCaps Demo", theme=gr.themes.Soft()) as demo:
 
 
199
  gr.Markdown(
200
  """
201
- # 🎙️ ParaSpeechCaps Demo
202
 
203
  Generate expressive speech with rich style control using our Parler-TTS model finetuned on ParaSpeechCaps. Control various aspects of speech including:
204
  - Speaker characteristics (pitch, clarity, etc.)
@@ -268,12 +254,12 @@ def create_demo():
268
  choices=["distil-whisper/distil-large-v2"],
269
  value="distil-whisper/distil-large-v2",
270
  label="ASR Model",
271
- info="ASR model used for resampling"
272
  )
273
 
274
  with gr.Row():
275
  load_button = gr.Button("📥 Load Models", variant="primary")
276
- generate_button = gr.Button("🎵 Generate", variant="secondary", interactive=False)
277
 
278
  with gr.Column(scale=1):
279
  output_audio = gr.Audio(label="Generated Speech", type="numpy")
@@ -283,9 +269,17 @@ def create_demo():
283
  load_button.click(
284
  fn=inference.load_models,
285
  inputs=[model_name, asr_model],
286
- outputs=[status_text, generate_button]
287
  )
288
 
 
 
 
 
 
 
 
 
289
  generate_button.click(
290
  fn=inference.generate_audio,
291
  inputs=[
@@ -298,18 +292,15 @@ def create_demo():
298
  outputs=[output_audio, status_text]
299
  )
300
 
301
- # Add examples
302
  gr.Examples(
303
  examples=EXAMPLES,
304
  inputs=[
305
  description,
306
- text,
307
- guidance_scale,
308
- num_retries,
309
- wer_threshold
310
  ],
311
  outputs=[output_audio, status_text],
312
- fn=inference.generate_audio,
313
  cache_examples=False
314
  )
315
 
 
5
  import numpy as np
6
  import evaluate
7
 
8
+ # Example prompts from the paper (only style and text)
9
  EXAMPLES = [
 
10
  [
11
  "A man speaks with a booming, medium-pitched voice in a clear environment, delivering his words at a measured speed.",
12
+ "That's my brother. I do agree, though, it wasn't very well-groomed."
 
13
  ],
14
  [
15
  "A male speaker's speech is distinguished by a slurred articulation, delivered at a measured pace in a clear environment.",
16
+ "reveal my true intentions in different ways. That's why the Street King Project and SMS"
 
17
  ],
18
  [
19
  "In a clear environment, a male speaker delivers his words hesitantly with a measured pace.",
20
+ "the Grand Slam tennis game has sort of taken over our set that's sort of all the way"
 
21
  ],
22
  [
23
  "A low-pitched, guttural male voice speaks slowly in a clear environment.",
24
+ "you know you want to see how far you can push everything and as an artist"
 
25
  ],
26
  [
27
  "A man speaks with a measured pace in a clear environment, displaying a distinct British accent.",
28
+ "most important but the reaction is very similar throughout the world it's really very very similar"
 
29
  ],
30
  [
31
  "A male speaker's voice is clear and delivered at a measured pace in a quiet environment. His speech carries a distinct Jamaican accent.",
32
+ "about God and the people him come from is more Christian, you know. We always"
 
33
  ],
34
  [
35
  "In a clear environment, a male voice speaks with a sad tone.",
36
+ "Was that your landlord?"
 
37
  ],
38
  [
39
  "A man speaks with a measured pace in a clear environment, his voice carrying a sleepy tone.",
40
+ "I mean, to be fair, I did see a UFO, so, you know."
 
41
  ],
42
  [
43
  "A frightened woman speaks with a clear and distinct voice.",
44
+ "Yes, that's what they said. I don't know what you're getting done. What are you getting done? Oh, okay. Yeah."
 
45
  ],
46
  [
47
  "A woman speaks slowly in a clear environment, her voice filled with awe.",
48
+ "Oh wow, this music is fantastic. You play so well. I could just sit here."
 
49
  ],
50
  [
51
  "A woman speaks with a high-pitched voice in a clear environment, conveying a sense of anxiety.",
52
+ "this is just way too overwhelming. I literally don't know how I'm going to get any of this done on time. I feel so overwhelmed right now. No one is helping me. Everyone's ignoring my calls and my emails. I don't know what I'm supposed to do right now."
 
53
  ],
54
  [
55
  "A female speaker's high-pitched voice is clear and carries over a laughing, unobstructed environment.",
56
+ "What is wrong with him, Chad?"
 
57
  ],
58
  [
59
  "In a clear environment, a man speaks in a whispered tone.",
60
+ "The fruit piece, the still lifes, you mean."
 
61
  ],
62
  [
63
  "A male speaker with a husky, low-pitched voice delivers clear speech in a quiet environment.",
64
+ "Ari had to somehow be subservient to Lloyd that would be unbelievable like if Lloyd was the guy who was like running Time Warner you know what I mean like"
 
65
  ],
66
  [
67
  "A female speaker's voice is clear and expressed at a measured pace, but carries a high-pitched, nasal tone, recorded in a quiet environment.",
68
+ "You know, Joe Bow, hockey mom from Wasilla, if I have an idea that would perhaps make"
 
69
  ]
70
  ]
71
 
 
132
  self.description_tokenizer = AutoTokenizer.from_pretrained(model_name)
133
  self.transcription_tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
134
  self.asr_pipeline = pipeline(model=asr_model, device=self.device, chunk_length_s=25.0)
135
+ return gr.Button(value="🎵 Generate", variant="primary", interactive=True), "Models loaded successfully! You can now generate audio."
136
  except Exception as e:
137
+ return gr.Button(value="🎵 Generate", variant="primary", interactive=False), f"Error loading models: {str(e)}"
138
 
139
  def generate_audio(self, description, text, guidance_scale, num_retries, wer_threshold):
140
  """Generate audio from text with style description"""
 
178
  # Initialize the inference class
179
  inference = ParlerTTSInference()
180
 
181
+ # Create the interface with a simple theme
182
+ theme = gr.themes.Default()
183
+
184
+ with gr.Blocks(title="ParaSpeechCaps Demo", theme=theme) as demo:
185
  gr.Markdown(
186
  """
187
+ # 🎙️ Parler-TTS Mini with ParaSpeechCaps
188
 
189
  Generate expressive speech with rich style control using our Parler-TTS model finetuned on ParaSpeechCaps. Control various aspects of speech including:
190
  - Speaker characteristics (pitch, clarity, etc.)
 
254
  choices=["distil-whisper/distil-large-v2"],
255
  value="distil-whisper/distil-large-v2",
256
  label="ASR Model",
257
+ info="ASR model used for quality assessment"
258
  )
259
 
260
  with gr.Row():
261
  load_button = gr.Button("📥 Load Models", variant="primary")
262
+ generate_button = gr.Button("🎵 Generate", variant="primary", interactive=False)
263
 
264
  with gr.Column(scale=1):
265
  output_audio = gr.Audio(label="Generated Speech", type="numpy")
 
269
  load_button.click(
270
  fn=inference.load_models,
271
  inputs=[model_name, asr_model],
272
+ outputs=[generate_button, status_text]
273
  )
274
 
275
+ def generate_with_default_params(description, text):
276
+ return inference.generate_audio(
277
+ description, text,
278
+ guidance_scale=1.5,
279
+ num_retries=3,
280
+ wer_threshold=20.0
281
+ )
282
+
283
  generate_button.click(
284
  fn=inference.generate_audio,
285
  inputs=[
 
292
  outputs=[output_audio, status_text]
293
  )
294
 
295
+ # Add examples (only style and text)
296
  gr.Examples(
297
  examples=EXAMPLES,
298
  inputs=[
299
  description,
300
+ text
 
 
 
301
  ],
302
  outputs=[output_audio, status_text],
303
+ fn=generate_with_default_params,
304
  cache_examples=False
305
  )
306