Spaces:

shukdevdatta123
/

Kokoro-TTS

Running

File size: 5,711 Bytes

620ebff
 
 
 
666cd48
63aca15
666cd48
 
 
 
 
 
 
620ebff
 
63aca15
bf626b9
63aca15
6d9e4e4
2eb0f72
 
 
 
 
 
 
 
 
6d9e4e4
63aca15
620ebff
 
63aca15
620ebff
 
63aca15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620ebff
 
63aca15
 
 
 
 
 
 
 
 
 
 
 
 
620ebff
2e0b1fa
63aca15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
de8d4a5
63aca15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620ebff
 
 
 
63aca15
 
620ebff
 
 
 
 
63aca15
 
620ebff
 
 
63aca15
620ebff
63aca15
 
620ebff
63aca15
620ebff
 
63aca15
620ebff
 
 
63aca15
620ebff
bf626b9
 
63aca15
 
 
 
bf626b9

import io
import os

import numpy as np
import soundfile as sf
import streamlit as st
from kokoro import KPipeline
from langdetect import detect  # Language detection library

# Install espeak-ng if not installed.
# os.system returns the command's exit status; 0 from `which` means the binary
# was found on PATH.
if os.system("which espeak-ng") == 0:
    st.text("espeak-ng already installed.")
else:
    # Announce the install before the blocking apt-get call so the message is
    # visible while the install is running, not only after it finished.
    st.text("Installing espeak-ng...")
    # Surface a failed install instead of silently ignoring the exit status.
    if os.system("apt-get -qq -y install espeak-ng") != 0:
        st.error("Failed to install espeak-ng.")

# Streamlit App UI Setup
st.title("Text-to-Speech with Kokoro")

# Expander listing one sample sentence per language, each labelled with the
# language it is written in.
with st.expander("Sample Prompt!"):
    st.markdown("""
    - My name is Shukdev. (In English)
    - Mi nombre es Shukdev. (In Spanish)
    - Je m'appelle Choukdev. (In French)
    - मेरा नाम शुकदेव है. (In Hindi)
    - Il mio nome è Shukdev. (In Italian)
    - Meu nome é Sukhdev. (In Portuguese, Brazil)
    - 我叫苏赫德夫。(In Chinese)
    - 私の名前はスクデフです。(In Japanese)
    """)

st.sidebar.header("Configuration & Instructions")

# Sidebar Instructions.
# Each numbered step and the closing sentence are separated by a blank line;
# without it Markdown treats the closing sentence as a continuation of the
# last bullet of step 6.
st.sidebar.markdown("""
### How to Use the Text-to-Speech App:
1. **Enter Text**: In the main text area, input any text that you want the model to convert to speech.

2. **Select Language**:
   - Choose the language of the text you are entering. Available options include:
     - 🇺🇸 American English (`a`)
     - 🇬🇧 British English (`b`)
     - 🇪🇸 Spanish (`e`)
     - 🇫🇷 French (`f`)
     - 🇮🇳 Hindi (`h`)
     - 🇮🇹 Italian (`i`)
     - 🇧🇷 Brazilian Portuguese (`p`)
     - 🇨🇳 Mandarin Chinese (`z`)
     - 🇯🇵 Japanese (`j`)

3. **Select Voice**:
   - Choose the voice style for the speech. You can pick different voices based on tone and gender, such as `af_heart`, `af_joy`, etc.

4. **Adjust Speed**:
   - Use the speed slider to change how fast the speech is generated. You can set it between `0.5x` to `2.0x`, where `1.0x` is the normal speed.

5. **Generate Speech**:
   - After configuring the settings, click on the **"Generate Audio"** button. The app will process your text and produce speech audio accordingly.

6. **Download**:
   - Once the audio is generated, you can play it directly in the app or download it as a `.wav` file by clicking on the **"Download Audio"** button.

Enjoy experimenting with the text-to-speech conversion, and feel free to try different voices, speeds, and languages!
""")

# Credit link to the upstream Kokoro model card.
st.sidebar.markdown("""
        ### Courtesy: [Kokoro](https://huggingface.co/hexgrad/Kokoro-82M?fbclid=IwY2xjawIKqzxleHRuA2FlbQIxMAABHaf9GldgYOzXktNuoRtNKqd-aL7r-S7zPGyC8ttYOiG2zYfQqLyV4Qm75A_aem_0wKLC2C87ZZ2F04WjPJbtA)
    """)

# Language Detection Function
def detect_language(text):
    """Return the language code langdetect reports for ``text``, or None.

    Args:
        text: The user-supplied text to classify. May be empty or None.

    Returns:
        The code returned by ``detect`` (e.g. ``'en'``), or None when the text
        is empty/whitespace-only or detection fails.
    """
    # The text area is empty on first load; langdetect raises on text with no
    # usable features, which would show an error before the user typed
    # anything. Treat blank input as "nothing to detect" instead.
    if not text or not text.strip():
        return None
    try:
        return detect(text)
    except Exception as e:
        # UI boundary: report the problem in the page and let the caller fall
        # back to its default language.
        st.error("Error detecting language: " + str(e))
        return None

# User input for text, language, and voice settings
input_text = st.text_area(
    "Enter your text here",
    placeholder="The sky above the port was the color of television...",
)
auto_detect_lang = detect_language(input_text)

# Kokoro language codes offered in the selector, in display order.
LANG_CODES = ['a', 'b', 'e', 'f', 'h', 'i', 'p', 'z', 'j']

# Maps langdetect codes to Kokoro language codes. langdetect reports Chinese
# as 'zh-cn' / 'zh-tw', so those are mapped explicitly; plain 'zh' is kept for
# backward compatibility.
lang_map = {
    'en': 'a',     # American English
    'es': 'e',     # Spanish
    'fr': 'f',     # French
    'hi': 'h',     # Hindi
    'it': 'i',     # Italian
    'pt': 'p',     # Portuguese
    'zh': 'z',     # Chinese
    'zh-cn': 'z',  # Chinese (Simplified)
    'zh-tw': 'z',  # Chinese (Traditional)
    'ja': 'j',     # Japanese
}
# Default to American English when detection failed or the language is not in the map.
detected_code = lang_map.get(auto_detect_lang, 'a')

# Always show the selector, preselected with the detected language, so the
# user can override a wrong guess (and can reach British English, which
# detection never produces).
lang_code = st.selectbox("Select Language", LANG_CODES, index=LANG_CODES.index(detected_code))

VOICES = [
    'af_alloy', 'af_aoede', 'af_bella', 'af_heart', 'af_jessica', 'af_kore',
    'af_nicole', 'af_nova', 'af_river', 'af_sarah', 'af_sky',
    'am_adam', 'am_echo', 'am_eric', 'am_fenrir', 'am_liam', 'am_michael',
    'am_onyx', 'am_puck', 'am_santa',
    'bf_alice', 'bf_emma', 'bf_isabella', 'bf_lily',
    'bm_daniel', 'bm_fable', 'bm_george', 'bm_lewis',
    'ef_dora',
    'em_alex', 'em_santa',
    'ff_siwis',
    'hf_alpha', 'hf_beta',
    'hm_omega', 'hm_psi',
    'if_sara',
    'im_nicola',
    'jf_alpha', 'jf_gongitsune', 'jf_nezumi', 'jf_tebukuro',
    'jm_kumo',
    'pf_dora',
    'pm_alex', 'pm_santa',
    'zf_xiaobei', 'zf_xiaoni', 'zf_xiaoxiao', 'zf_xiaoyi',
    'zm_yunjian', 'zm_yunxi', 'zm_yunxia', 'zm_yunyang',
]
# Every voice name starts with its language code, so preselect the first voice
# that matches the chosen language. All voices remain selectable.
default_voice_index = next(
    (pos for pos, name in enumerate(VOICES) if name.startswith(lang_code)),
    0,
)
voice = st.selectbox("Select Voice", VOICES, index=default_voice_index)

speed = st.slider("Speed", min_value=0.5, max_value=2.0, value=1.0, step=0.1)


@st.cache_resource
def _load_pipeline(code):
    """Build the Kokoro pipeline for language ``code`` once and reuse it on later reruns."""
    return KPipeline(lang_code=code)


# Initialize the TTS pipeline with the user-selected language. Streamlit
# re-executes this script on every interaction, so the pipeline is cached per
# language code instead of being rebuilt each time (needs st.cache_resource,
# available in Streamlit >= 1.18).
pipeline = _load_pipeline(lang_code)

# Generate Audio function
def generate_audio(text, lang_code, voice, speed):
    """Synthesize ``text`` with the module-level ``pipeline`` and return WAV data.

    Args:
        text: Text to speak. The pipeline splits it on runs of newlines.
        lang_code: Kokoro language code. Not used here: the language is fixed
            when the module-level ``pipeline`` is built. Kept so existing
            callers keep working.
        voice: Voice name passed through to the pipeline.
        speed: Speed multiplier passed through to the pipeline.

    Returns:
        An ``io.BytesIO`` positioned at the start and holding one 24000 Hz WAV
        file with every generated segment joined in order, or None when the
        pipeline produced no audio (e.g. empty text).
    """
    generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
    # The pipeline yields one (gs, ps, audio) result per text segment. Collect
    # all of them: returning inside the loop would keep only the first segment
    # and drop the rest of the text.
    segments = [np.asarray(audio) for _, _, audio in generator if audio is not None]
    if not segments:
        return None

    # Save audio to an in-memory buffer; the format must be given explicitly
    # because a BytesIO has no file extension to infer it from.
    buffer = io.BytesIO()
    sf.write(buffer, np.concatenate(segments), 24000, format='WAV')
    buffer.seek(0)
    return buffer

# Generate the audio on button press.
# The result is stored in st.session_state because st.button is True only for
# the single rerun triggered by the click; anything rendered solely inside the
# button branch disappears on the next interaction (e.g. picking "Yes" below).
if st.button('Generate Audio'):
    if not input_text.strip():
        st.warning("Please enter some text before generating audio.")
    else:
        with st.spinner("Generating speech..."):
            audio_buffer = generate_audio(input_text, lang_code, voice, speed)
        if audio_buffer is None:
            # Nothing was synthesized; drop any stale audio from a previous run.
            st.session_state.pop("generated_audio", None)
            st.error("No audio was generated for the given text.")
        else:
            # Keep plain bytes so the player and the download button both get
            # the full data regardless of the buffer's read position.
            st.session_state["generated_audio"] = audio_buffer.getvalue()

# Display the most recently generated audio, if any, on every rerun.
generated_audio = st.session_state.get("generated_audio")
if generated_audio is not None:
    # Display Audio player in the app
    st.audio(generated_audio, format='audio/wav')

    # Offer the generated audio file for download
    st.download_button(
        label="Download Audio",
        data=generated_audio,
        file_name="generated_speech.wav",
        mime="audio/wav"
    )

    # Interactive Voice Feedback
    feedback = st.radio("Do you want to hear it again?", ('No', 'Yes'))
    if feedback == 'Yes':
        st.write("Replaying the generated speech...")
        st.audio(generated_audio, format='audio/wav')