Spaces:

ajd12342
/

paraspeechcaps

Running

App Files Community

ajd12342 committed on Mar 4

Commit

b3cf9d6

verified ·

1 Parent(s): bd84ccf

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -46

app.py CHANGED Viewed

@@ -5,83 +5,67 @@ from transformers import AutoTokenizer, pipeline, WhisperForConditionalGeneratio
 import numpy as np
 import evaluate
-# Example prompts from the paper
 EXAMPLES = [
-    # Each list is [description, text, guidance_scale, num_retries, wer_threshold]
     [
         "A man speaks with a booming, medium-pitched voice in a clear environment, delivering his words at a measured speed.",
-        "That's my brother. I do agree, though, it wasn't very well-groomed.",
-        1.5, 3, 20.0
     ],
     [
         "A male speaker's speech is distinguished by a slurred articulation, delivered at a measured pace in a clear environment.",
-        "reveal my true intentions in different ways. That's why the Street King Project and SMS",
-        1.5, 3, 20.0
     ],
     [
         "In a clear environment, a male speaker delivers his words hesitantly with a measured pace.",
-        "the Grand Slam tennis game has sort of taken over our set that's sort of all the way",
-        1.5, 3, 20.0
     ],
     [
         "A low-pitched, guttural male voice speaks slowly in a clear environment.",
-        "you know you want to see how far you can push everything and as an artist",
-        1.5, 3, 20.0
     ],
     [
         "A man speaks with a measured pace in a clear environment, displaying a distinct British accent.",
-        "most important but the reaction is very similar throughout the world it's really very very similar",
-        1.5, 3, 20.0
     ],
     [
         "A male speaker's voice is clear and delivered at a measured pace in a quiet environment. His speech carries a distinct Jamaican accent.",
-        "about God and the people him come from is more Christian, you know. We always",
-        1.5, 3, 20.0
     ],
     [
         "In a clear environment, a male voice speaks with a sad tone.",
-        "Was that your landlord?",
-        1.5, 3, 20.0
     ],
     [
         "A man speaks with a measured pace in a clear environment, his voice carrying a sleepy tone.",
-        "I mean, to be fair, I did see a UFO, so, you know.",
-        1.5, 3, 20.0
     ],
     [
         "A frightened woman speaks with a clear and distinct voice.",
-        "Yes, that's what they said. I don't know what you're getting done. What are you getting done? Oh, okay. Yeah.",
-        1.5, 3, 20.0
     ],
     [
         "A woman speaks slowly in a clear environment, her voice filled with awe.",
-        "Oh wow, this music is fantastic. You play so well. I could just sit here.",
-        1.5, 3, 20.0
     ],
     [
         "A woman speaks with a high-pitched voice in a clear environment, conveying a sense of anxiety.",
-        "this is just way too overwhelming. I literally don't know how I'm going to get any of this done on time. I feel so overwhelmed right now. No one is helping me. Everyone's ignoring my calls and my emails. I don't know what I'm supposed to do right now.",
-        1.5, 3, 20.0
     ],
     [
         "A female speaker's high-pitched voice is clear and carries over a laughing, unobstructed environment.",
-        "What is wrong with him, Chad?",
-        1.5, 3, 20.0
     ],
     [
         "In a clear environment, a man speaks in a whispered tone.",
-        "The fruit piece, the still lifes, you mean.",
-        1.5, 3, 20.0
     ],
     [
         "A male speaker with a husky, low-pitched voice delivers clear speech in a quiet environment.",
-        "Ari had to somehow be subservient to Lloyd that would be unbelievable like if Lloyd was the guy who was like running Time Warner you know what I mean like",
-        1.5, 3, 20.0
     ],
     [
         "A female speaker's voice is clear and expressed at a measured pace, but carries a high-pitched, nasal tone, recorded in a quiet environment.",
-        "You know, Joe Bow, hockey mom from Wasilla, if I have an idea that would perhaps make",
-        1.5, 3, 20.0
     ]
 ]
@@ -148,9 +132,9 @@ class ParlerTTSInference:
             self.description_tokenizer = AutoTokenizer.from_pretrained(model_name)
             self.transcription_tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
             self.asr_pipeline = pipeline(model=asr_model, device=self.device, chunk_length_s=25.0)
-            return True, "Models loaded successfully! You can now generate audio."
         except Exception as e:
-            return False, f"Error loading models: {str(e)}"
     def generate_audio(self, description, text, guidance_scale, num_retries, wer_threshold):
         """Generate audio from text with style description"""
@@ -194,11 +178,13 @@ def create_demo():
     # Initialize the inference class
     inference = ParlerTTSInference()
-    # Create the interface
-    with gr.Blocks(title="ParaSpeechCaps Demo", theme=gr.themes.Soft()) as demo:
         gr.Markdown(
             """
-            # 🎙️ ParaSpeechCaps Demo
             Generate expressive speech with rich style control using our Parler-TTS model finetuned on ParaSpeechCaps. Control various aspects of speech including:
             - Speaker characteristics (pitch, clarity, etc.)
@@ -268,12 +254,12 @@ def create_demo():
                         choices=["distil-whisper/distil-large-v2"],
                         value="distil-whisper/distil-large-v2",
                         label="ASR Model",
-                        info="ASR model used for resampling"
                     )
                 with gr.Row():
                     load_button = gr.Button("📥 Load Models", variant="primary")
-                    generate_button = gr.Button("🎵 Generate", variant="secondary", interactive=False)
             with gr.Column(scale=1):
                 output_audio = gr.Audio(label="Generated Speech", type="numpy")
@@ -283,9 +269,17 @@ def create_demo():
         load_button.click(
             fn=inference.load_models,
             inputs=[model_name, asr_model],
-            outputs=[status_text, generate_button]
         )
         generate_button.click(
             fn=inference.generate_audio,
             inputs=[
@@ -298,18 +292,15 @@ def create_demo():
             outputs=[output_audio, status_text]
         )
-        # Add examples
         gr.Examples(
             examples=EXAMPLES,
             inputs=[
                 description,
-                text,
-                guidance_scale,
-                num_retries,
-                wer_threshold
             ],
             outputs=[output_audio, status_text],
-            fn=inference.generate_audio,
             cache_examples=False
         )

 import numpy as np
 import evaluate
+# Example prompts from the paper (only style and text)
 EXAMPLES = [
     [
         "A man speaks with a booming, medium-pitched voice in a clear environment, delivering his words at a measured speed.",
+        "That's my brother. I do agree, though, it wasn't very well-groomed."
     ],
     [
         "A male speaker's speech is distinguished by a slurred articulation, delivered at a measured pace in a clear environment.",
+        "reveal my true intentions in different ways. That's why the Street King Project and SMS"
     ],
     [
         "In a clear environment, a male speaker delivers his words hesitantly with a measured pace.",
+        "the Grand Slam tennis game has sort of taken over our set that's sort of all the way"
     ],
     [
         "A low-pitched, guttural male voice speaks slowly in a clear environment.",
+        "you know you want to see how far you can push everything and as an artist"
     ],
     [
         "A man speaks with a measured pace in a clear environment, displaying a distinct British accent.",
+        "most important but the reaction is very similar throughout the world it's really very very similar"
     ],
     [
         "A male speaker's voice is clear and delivered at a measured pace in a quiet environment. His speech carries a distinct Jamaican accent.",
+        "about God and the people him come from is more Christian, you know. We always"
     ],
     [
         "In a clear environment, a male voice speaks with a sad tone.",
+        "Was that your landlord?"
     ],
     [
         "A man speaks with a measured pace in a clear environment, his voice carrying a sleepy tone.",
+        "I mean, to be fair, I did see a UFO, so, you know."
     ],
     [
         "A frightened woman speaks with a clear and distinct voice.",
+        "Yes, that's what they said. I don't know what you're getting done. What are you getting done? Oh, okay. Yeah."
     ],
     [
         "A woman speaks slowly in a clear environment, her voice filled with awe.",
+        "Oh wow, this music is fantastic. You play so well. I could just sit here."
     ],
     [
         "A woman speaks with a high-pitched voice in a clear environment, conveying a sense of anxiety.",
+        "this is just way too overwhelming. I literally don't know how I'm going to get any of this done on time. I feel so overwhelmed right now. No one is helping me. Everyone's ignoring my calls and my emails. I don't know what I'm supposed to do right now."
     ],
     [
         "A female speaker's high-pitched voice is clear and carries over a laughing, unobstructed environment.",
+        "What is wrong with him, Chad?"
     ],
     [
         "In a clear environment, a man speaks in a whispered tone.",
+        "The fruit piece, the still lifes, you mean."
     ],
     [
         "A male speaker with a husky, low-pitched voice delivers clear speech in a quiet environment.",
+        "Ari had to somehow be subservient to Lloyd that would be unbelievable like if Lloyd was the guy who was like running Time Warner you know what I mean like"
     ],
     [
         "A female speaker's voice is clear and expressed at a measured pace, but carries a high-pitched, nasal tone, recorded in a quiet environment.",
+        "You know, Joe Bow, hockey mom from Wasilla, if I have an idea that would perhaps make"
     ]
 ]
             self.description_tokenizer = AutoTokenizer.from_pretrained(model_name)
             self.transcription_tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
             self.asr_pipeline = pipeline(model=asr_model, device=self.device, chunk_length_s=25.0)
+            return gr.Button(value="🎵 Generate", variant="primary", interactive=True), "Models loaded successfully! You can now generate audio."
         except Exception as e:
+            return gr.Button(value="🎵 Generate", variant="primary", interactive=False), f"Error loading models: {str(e)}"
     def generate_audio(self, description, text, guidance_scale, num_retries, wer_threshold):
         """Generate audio from text with style description"""
     # Initialize the inference class
     inference = ParlerTTSInference()
+    # Create the interface with a simple theme
+    theme = gr.themes.Default()
+    with gr.Blocks(title="ParaSpeechCaps Demo", theme=theme) as demo:
         gr.Markdown(
             """
+            # 🎙️ Parler-TTS Mini with ParaSpeechCaps
             Generate expressive speech with rich style control using our Parler-TTS model finetuned on ParaSpeechCaps. Control various aspects of speech including:
             - Speaker characteristics (pitch, clarity, etc.)
                         choices=["distil-whisper/distil-large-v2"],
                         value="distil-whisper/distil-large-v2",
                         label="ASR Model",
+                        info="ASR model used for quality assessment"
                     )
                 with gr.Row():
                     load_button = gr.Button("📥 Load Models", variant="primary")
+                    generate_button = gr.Button("🎵 Generate", variant="primary", interactive=False)
             with gr.Column(scale=1):
                 output_audio = gr.Audio(label="Generated Speech", type="numpy")
         load_button.click(
             fn=inference.load_models,
             inputs=[model_name, asr_model],
+            outputs=[generate_button, status_text]
         )
+        def generate_with_default_params(description, text):
+            return inference.generate_audio(
+                description, text,
+                guidance_scale=1.5,
+                num_retries=3,
+                wer_threshold=20.0
+            )
         generate_button.click(
             fn=inference.generate_audio,
             inputs=[
             outputs=[output_audio, status_text]
         )
+        # Add examples (only style and text)
         gr.Examples(
             examples=EXAMPLES,
             inputs=[
                 description,
+                text
             ],
             outputs=[output_audio, status_text],
+            fn=generate_with_default_params,
             cache_examples=False
         )