Spaces:

shukdevdatta123
/

Kokoro-TTS

Paused

App Files Files Community

shukdevdatta123 committed on Feb 1

Commit

bf626b9

verified ·

1 Parent(s): cde1167

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -61

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ from kokoro import KPipeline
 import soundfile as sf
 import io
 import os
 # Install espeak-ng if not installed
 if not os.system("which espeak-ng"):
@@ -12,8 +13,9 @@ else:
     st.text("Installing espeak-ng...")
 # Streamlit App UI Setup
-st.title("Text-to-Speech with Kokoro")
-# Expander section to display information in multiple languages
 with st.expander("Sample Prompt!"):
     st.markdown("""
     - My name is Shukdev. (In English)
@@ -25,92 +27,84 @@ with st.expander("Sample Prompt!"):
     - 我叫苏赫德夫。(In Chinese)
     - 私の名前はスクデフです。(In Japanese)
     """)
 st.sidebar.header("Configuration & Instructions")
-# Sidebar Instructions
 st.sidebar.markdown("""
 ### How to Use the Text-to-Speech App:
-1. **Enter Text**: In the main text area, input any text that you want the model to convert to speech.
-2. **Select Language**:
-   - Choose the language of the text you are entering. Available options include:
-     - 🇺🇸 American English (`a`)
-     - 🇬🇧 British English (`b`)
-     - 🇪🇸 Spanish (`e`)
-     - 🇫🇷 French (`f`)
-     - 🇮🇳 Hindi (`h`)
-     - 🇮🇹 Italian (`i`)
-     - 🇧🇷 Brazilian Portuguese (`p`)
-     - 🇨🇳 Mandarin Chinese (`z`)
-     - 🇯🇵 Japanese (`j`)
-3. **Select Voice**:
-   - Choose the voice style for the speech. You can pick different voices based on tone and gender, such as `af_heart`, `af_joy`, etc.
-4. **Adjust Speed**:
-   - Use the speed slider to change how fast the speech is generated. You can set it between `0.5x` to `2.0x`, where `1.0x` is the normal speed.
-5. **Generate Speech**:
-   - After configuring the settings, click on the **"Generate Audio"** button. The app will process your text and produce speech audio accordingly.
-6. **Download**:
-   - Once the audio is generated, you can play it directly in the app or download it as a `.wav` file by clicking on the **"Download Audio"** button.
-Enjoy experimenting with the text-to-speech conversion, and feel free to try different voices, speeds, and languages!
 """)
-st.sidebar.markdown("""
-        ### Courtesy: [Kokoro](https://huggingface.co/hexgrad/Kokoro-82M?fbclid=IwY2xjawIKqzxleHRuA2FlbQIxMAABHaf9GldgYOzXktNuoRtNKqd-aL7r-S7zPGyC8ttYOiG2zYfQqLyV4Qm75A_aem_0wKLC2C87ZZ2F04WjPJbtA)
-    """)
 # User input for text, language, and voice settings
 # Each widget's return value is kept in a module-level name that is read
 # further down: `lang_code` is passed to KPipeline(...), and `input_text`,
 # `voice` and `speed` are passed to generate_audio(...).
 input_text = st.text_area("Enter your text here", placeholder="The sky above the port was the color of television...")
 # The options are the one-letter language codes listed in the sidebar text.
 lang_code = st.selectbox("Select Language", ['a', 'b', 'e', 'f', 'h', 'i', 'p', 'z', 'j'])
 # The full voice list is offered regardless of the language selected above.
 voice = st.selectbox("Select Voice", ['af_alloy', 'af_aoede', 'af_bella', 'af_heart', 'af_jessica', 'af_kore', 'af_nicole', 'af_nova', 'af_river', 'af_sarah', 'af_sky',
- 'am_adam', 'am_echo', 'am_eric', 'am_fenrir', 'am_liam', 'am_michael', 'am_onyx', 'am_puck', 'am_santa',
- 'bf_alice', 'bf_emma', 'bf_isabella', 'bf_lily',
- 'bm_daniel', 'bm_fable', 'bm_george', 'bm_lewis',
- 'ef_dora',
- 'em_alex', 'em_santa',
- 'ff_siwis',
- 'hf_alpha', 'hf_beta',
- 'hm_omega', 'hm_psi',
- 'if_sara',
- 'im_nicola',
- 'jf_alpha', 'jf_gongitsune', 'jf_nezumi', 'jf_tebukuro',
- 'jm_kumo',
- 'pf_dora',
- 'pm_alex', 'pm_santa',
- 'zf_xiaobei', 'zf_xiaoni', 'zf_xiaoxiao', 'zf_xiaoyi',
- 'zm_yunjian', 'zm_yunxi', 'zm_yunxia', 'zm_yunyang']
-)  # Change voice options as per model
 # Slider from 0.5 to 2.0 in steps of 0.1, starting at 1.0.
-speed = st.slider("Speed", min_value=0.5, max_value=2.0, value=1.0, step=0.1)
 # Initialize the TTS pipeline with user-selected language
 # `pipeline` is a module-level name; generate_audio below reads it directly
 # instead of building its own pipeline.
 pipeline = KPipeline(lang_code=lang_code)
-# Generate Audio function
 # Runs `text` through the module-level `pipeline` and returns WAV data in an
 # in-memory buffer.
 #   text      -- text handed to the pipeline (with split_pattern=r'\n+')
 #   lang_code -- accepted but not read inside this function
 #   voice     -- forwarded to the pipeline as `voice`
 #   speed     -- forwarded to the pipeline as `speed`
 # Returns an io.BytesIO positioned at offset 0, or None (implicitly) when the
 # generator yields nothing.
-def generate_audio(text, lang_code, voice, speed):
     generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
     # NOTE(review): the `return` at the bottom of the loop body exits on the
     # first yielded item, so only the first segment is ever written and any
     # later segments are never consumed.
     for i, (gs, ps, audio) in enumerate(generator):
         audio_data = audio
         # Save audio to in-memory buffer
         buffer = io.BytesIO()
-        # Explicitly specify format as WAV
         # (a BytesIO has no file name from which the format could be inferred);
         # 24000 is passed as the sample rate.
-        sf.write(buffer, audio_data, 24000, format='WAV')  # Add 'format="WAV"'
         # Rewind so that a reader starts at the beginning of the written data.
         buffer.seek(0)
         return buffer
-# Generate and display the audio file
 # The body runs only on the script run in which the button was clicked.
 if st.button('Generate Audio'):
-    st.write("Generating speech...")
     # generate_audio returns an in-memory buffer, or None when nothing was
     # produced; the value is passed on to the widgets below without a check.
-    audio_buffer = generate_audio(input_text, lang_code, voice, speed)
-    # Display Audio player in the app
     st.audio(audio_buffer, format='audio/wav')
-    # Optional: Save the generated audio file for download
     # The same buffer object is handed to both the player and the download button.
     st.download_button(
         label="Download Audio",
         data=audio_buffer,
-        file_name="generated_speech.wav",
         mime="audio/wav"
-    )

 import soundfile as sf
 import io
 import os
+import time
 # Install espeak-ng if not installed
 if not os.system("which espeak-ng"):
     st.text("Installing espeak-ng...")
 # Streamlit App UI Setup
+st.title("Interactive Text-to-Speech with Kokoro")
+# Expander section for language samples
 with st.expander("Sample Prompt!"):
     st.markdown("""
     - My name is Shukdev. (In English)
     - 我叫苏赫德夫。(In Chinese)
     - 私の名前はスクデフです。(In Japanese)
     """)
+# Sidebar Instructions and Configuration
 st.sidebar.header("Configuration & Instructions")
 st.sidebar.markdown("""
 ### How to Use the Text-to-Speech App:
+1. **Enter Text**: Input any text that you want to convert to speech.
+2. **Select Language**: Choose the language of the text.
+3. **Select Voice**: Choose the voice style.
+4. **Select Speed**: Use the slider to change the speech speed.
+5. **Add Background Music**: Optional background music for the speech (choose genre).
+6. **Generate Speech**: Click on **Generate Audio** to create speech.
+7. **Download**: Download the generated audio file.
 """)
 # User input for text, language, and voice settings
 input_text = st.text_area("Enter your text here", placeholder="The sky above the port was the color of television...")
 lang_code = st.selectbox("Select Language", ['a', 'b', 'e', 'f', 'h', 'i', 'p', 'z', 'j'])
 voice = st.selectbox("Select Voice", ['af_alloy', 'af_aoede', 'af_bella', 'af_heart', 'af_jessica', 'af_kore', 'af_nicole', 'af_nova', 'af_river', 'af_sarah', 'af_sky',
+                                     'am_adam', 'am_echo', 'am_eric', 'am_fenrir', 'am_liam', 'am_michael', 'am_onyx', 'am_puck', 'am_santa',
+                                     'bf_alice', 'bf_emma', 'bf_isabella', 'bf_lily',
+                                     'bm_daniel', 'bm_fable', 'bm_george', 'bm_lewis',
+                                     'ef_dora',
+                                     'em_alex', 'em_santa',
+                                     'ff_siwis',
+                                     'hf_alpha', 'hf_beta',
+                                     'hm_omega', 'hm_psi',
+                                     'if_sara',
+                                     'im_nicola',
+                                     'jf_alpha', 'jf_gongitsune', 'jf_nezumi', 'jf_tebukuro',
+                                     'jm_kumo',
+                                     'pf_dora',
+                                     'pm_alex', 'pm_santa',
+                                     'zf_xiaobei', 'zf_xiaoni', 'zf_xiaoxiao', 'zf_xiaoyi',
+                                     'zm_yunjian', 'zm_yunxi', 'zm_yunxia', 'zm_yunyang'])
+speed = st.slider("Speech Speed", min_value=0.5, max_value=2.0, value=1.0, step=0.1)
+background_music = st.selectbox("Add Background Music", ['None', 'Calm', 'Energetic', 'Focus', 'Nature'])
 # Initialize the TTS pipeline with user-selected language
 pipeline = KPipeline(lang_code=lang_code)
+# Function to generate audio with background music
+def generate_audio_with_music(text, lang_code, voice, speed, music_type):
     generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
     for i, (gs, ps, audio) in enumerate(generator):
         audio_data = audio
+        # Simulate adding background music (using silence as placeholder)
+        if music_type != 'None':
+            st.write(f"Adding {music_type} background music...")
+            # Add some background music (could be a placeholder for real music loading)
+            music = io.BytesIO()
+            music.write(b'\0' * 500000)  # Placeholder: Replace with actual music blending logic
         # Save audio to in-memory buffer
         buffer = io.BytesIO()
+        sf.write(buffer, audio_data, 24000, format='WAV')
         buffer.seek(0)
         return buffer
+# Generate Audio Button
 if st.button('Generate Audio'):
+    st.write("Generating speech with background music...")
+    audio_buffer = generate_audio_with_music(input_text, lang_code, voice, speed, background_music)
+    # Display audio player in the app
     st.audio(audio_buffer, format='audio/wav')
+    # Option to download the audio file
     st.download_button(
         label="Download Audio",
         data=audio_buffer,
+        file_name="generated_speech_with_music.wav",
         mime="audio/wav"
+    )
+    # Speech Progress Feedback: Add voice feedback or something fun
+    st.write("Would you like to hear the audio again?")
+    if st.button('Replay Audio'):
+        st.audio(audio_buffer, format='audio/wav')