Spaces:

shukdevdatta123
/

Kokoro-TTS

Running

File size: 5,250 Bytes

import io
import os
import shutil
import subprocess

import numpy as np
import romkan  # For Japanese Romanization
import soundfile as sf
import speech_recognition as sr
import streamlit as st
from kokoro import KPipeline

# Install espeak-ng if not installed.
# NOTE(review): presumably needed by the Kokoro pipeline created further down;
# confirm against the kokoro package requirements.
if shutil.which("espeak-ng"):
    st.text("espeak-ng already installed.")
else:
    # Tell the user *before* starting the (blocking) install, not afterwards.
    st.text("Installing espeak-ng...")
    try:
        # Argument list instead of a shell string: no shell parsing involved.
        install_result = subprocess.run(
            ["apt-get", "-qq", "-y", "install", "espeak-ng"],
            check=False,
        )
        install_ok = install_result.returncode == 0
    except OSError:
        # apt-get itself is missing or not executable on this system.
        install_ok = False
    if not install_ok:
        st.error(
            "Failed to install espeak-ng; please install it manually."
        )

# Streamlit App UI Setup

# Markdown shown in the sidebar. The blank line before the closing
# "Enjoy experimenting..." paragraph is required: without it Markdown treats
# that sentence as a continuation of the last bullet of the list.
SIDEBAR_INSTRUCTIONS = """
### How to Use the Text-to-Speech App:
1. **Enter Text**: In the main text area, input any text that you want the model to convert to speech.
   
2. **Select Language**: 
   - Choose the language of the text you are entering. Available options include:
     - 🇺🇸 American English (`a`)
     - 🇬🇧 British English (`b`)
     - 🇪🇸 Spanish (`e`)
     - 🇫🇷 French (`f`)
     - 🇮🇳 Hindi (`h`)
     - 🇮🇹 Italian (`i`)
     - 🇧🇷 Brazilian Portuguese (`p`)
     - 🇨🇳 Mandarin Chinese (`z`)
     - 🇯🇵 Japanese (`j`)
   
3. **Select Voice**:
   - Choose the voice style for the speech. You can pick different voices based on tone and gender, such as `af_heart`, `af_joy`, etc.
   
4. **Adjust Speed**:
   - Use the speed slider to change how fast the speech is generated. You can set it between `0.5x` to `2.0x`, where `1.0x` is the normal speed.
5. **Generate Speech**:
   - After configuring the settings, click on the **"Generate Audio"** button. The app will process your text and produce speech audio accordingly.
   
6. **Download**:
   - Once the audio is generated, you can play it directly in the app or download it as a `.wav` file by clicking on the **"Download Audio"** button.

Enjoy experimenting with the text-to-speech conversion, and feel free to try different voices, speeds, and languages!
"""

st.title("Text-to-Speech with Kokoro")
st.sidebar.header("Configuration & Instructions")

# Sidebar Instructions
st.sidebar.markdown(SIDEBAR_INSTRUCTIONS)

# User input for text, language, and voice settings

# Language codes offered in the language selector (same order as the sidebar
# instructions list them).
LANGUAGE_CODES = ['a', 'b', 'e', 'f', 'h', 'i', 'p', 'z', 'j']

# Voice identifiers offered in the voice selector; adjust to match the model.
# NOTE(review): the first letter of each name appears to mirror a language
# code above, but nothing here enforces that pairing -- confirm with the
# model documentation.
VOICE_NAMES = [
    'af_alloy', 'af_aoede', 'af_bella', 'af_heart', 'af_jessica', 'af_kore',
    'af_nicole', 'af_nova', 'af_river', 'af_sarah', 'af_sky',
    'am_adam', 'am_echo', 'am_eric', 'am_fenrir', 'am_liam', 'am_michael',
    'am_onyx', 'am_puck', 'am_santa',
    'bf_alice', 'bf_emma', 'bf_isabella', 'bf_lily',
    'bm_daniel', 'bm_fable', 'bm_george', 'bm_lewis',
    'ef_dora',
    'em_alex', 'em_santa',
    'ff_siwis',
    'hf_alpha', 'hf_beta',
    'hm_omega', 'hm_psi',
    'if_sara',
    'im_nicola',
    'jf_alpha', 'jf_gongitsune', 'jf_nezumi', 'jf_tebukuro',
    'jm_kumo',
    'pf_dora',
    'pm_alex', 'pm_santa',
    'zf_xiaobei', 'zf_xiaoni', 'zf_xiaoxiao', 'zf_xiaoyi',
    'zm_yunjian', 'zm_yunxi', 'zm_yunxia', 'zm_yunyang',
]

input_text = st.text_area(
    "Enter your text here",
    "The sky above the port was the color of television...",
)
lang_code = st.selectbox("Select Language", LANGUAGE_CODES)
voice = st.selectbox("Select Voice", VOICE_NAMES)
# Playback speed multiplier; 1.0 is the default.
speed = st.slider(
    "Speed",
    min_value=0.5,
    max_value=2.0,
    value=1.0,
    step=0.1,
)

# Initialize the TTS pipeline with user-selected language.
# Streamlit re-runs this script on every widget interaction, so the pipeline
# is cached per language code instead of being rebuilt on each rerun.
@st.cache_resource
def _load_pipeline(lang_code):
    """Return the KPipeline for *lang_code*, creating it on first use."""
    # NOTE(review): cached resources are shared between sessions; confirm
    # that KPipeline is safe to reuse across concurrent users.
    return KPipeline(lang_code=lang_code)


pipeline = _load_pipeline(lang_code)

# Generate Audio function
def generate_audio(text, lang_code, voice, speed):
    """Synthesize *text* and return the speech as one in-memory WAV file.

    The module-level ``pipeline`` splits the text on runs of newlines and
    yields one audio segment per piece. All segments are joined so the
    returned file covers the whole text, not just its first segment.

    Args:
        text: The text to convert to speech.
        lang_code: Unused here; the language is fixed when ``pipeline`` is
            created. Kept so existing callers keep working.
        voice: Voice identifier passed through to the pipeline.
        speed: Speed value passed through to the pipeline.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 that holds WAV data at
        24000 Hz, or ``None`` when the pipeline produced no audio.
    """
    sample_rate = 24000
    segments = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')

    # Each yielded item is a 3-tuple whose last element is the audio; the
    # first two elements are not needed here.
    # NOTE(review): skipping ``None`` audio is defensive -- confirm whether
    # the pipeline can actually yield it.
    chunks = [
        np.asarray(audio) for _, _, audio in segments if audio is not None
    ]
    if not chunks:
        return None

    buffer = io.BytesIO()
    # A file-like target has no extension, so the format must be explicit.
    sf.write(buffer, np.concatenate(chunks), sample_rate, format='WAV')
    buffer.seek(0)
    return buffer

# Transcribe the generated audio using speech recognition
def transcribe_audio(audio_buffer, language="en-US"):
    """Transcribe recorded speech with the Google Web Speech API.

    Args:
        audio_buffer: Audio source accepted by ``sr.AudioFile`` (the caller
            in this file passes an in-memory WAV stream).
        language: Language tag forwarded to ``recognize_google``. Defaults
            to ``"en-US"``, matching the behavior when no language is given.

    Returns:
        The recognized text, or a human-readable message when the speech
        could not be understood or the recognition request failed.
    """
    is_stream = hasattr(audio_buffer, "seek")

    # Read from the start regardless of where a previous consumer left the
    # stream position.
    if is_stream:
        audio_buffer.seek(0)

    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_buffer) as source:
        audio = recognizer.record(source)

    # Leave the stream rewound so the caller can reuse it afterwards.
    if is_stream:
        audio_buffer.seek(0)

    try:
        # Transcribe using Google Web Speech API (requires internet)
        return recognizer.recognize_google(audio, language=language)
    except sr.UnknownValueError:
        return "Sorry, could not understand the audio"
    except sr.RequestError as e:
        return f"Request error from Google Speech Recognition service; {e}"

# Romanize (convert to Romaji) if the language is Japanese
def romanize_text(text, lang_code):
    """Return *text* converted to romaji when *lang_code* is Japanese.

    Args:
        text: The text to romanize.
        lang_code: Language code; only ``'j'`` triggers a conversion.

    Returns:
        The output of ``romkan.to_romaji(text)`` for ``'j'``; otherwise
        *text* unchanged.
    """
    # Every language other than Japanese is passed through untouched.
    if lang_code != 'j':
        return text
    return romkan.to_romaji(text)

# Generate and display the audio file
if st.button('Generate Audio'):
    if not input_text.strip():
        # Nothing to synthesize; skip the pipeline entirely.
        st.warning("Please enter some text to convert to speech.")
    else:
        st.write("Generating speech...")
        audio_buffer = generate_audio(input_text, lang_code, voice, speed)

        if audio_buffer is None:
            # generate_audio returns None when no audio was produced.
            st.error("No audio could be generated for the given text.")
        else:
            # Take an immutable copy of the WAV data so the player and the
            # download button do not depend on the stream position.
            wav_bytes = audio_buffer.getvalue()

            # Display Audio player in the app
            st.audio(wav_bytes, format='audio/wav')

            # Transcribe the generated speech to text (read from the start)
            audio_buffer.seek(0)
            transcription = transcribe_audio(audio_buffer)

            # Romanize the transcription if it's Japanese
            romanized_text = romanize_text(transcription, lang_code)

            # Display the transcribed and Romanized text
            st.write("Transcribed Text: ", transcription)
            st.write("Romanized Pronunciation: ", romanized_text)

            # Offer the generated audio file for download
            st.download_button(
                label="Download Audio",
                data=wav_bytes,
                file_name="generated_speech.wav",
                mime="audio/wav",
            )