File size: 5,250 Bytes
620ebff
 
 
 
666cd48
ee1b822
 
666cd48
 
 
 
 
 
 
620ebff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e6c87a
620ebff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e6c87a
de8d4a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620ebff
 
 
 
 
 
 
 
 
 
 
 
a861aa1
 
620ebff
 
 
ee1b822
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620ebff
 
 
 
 
 
 
 
ee1b822
 
 
 
 
 
 
 
 
 
620ebff
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import io
import os
import shutil
import subprocess

import numpy as np
import romkan  # For Japanese Romanization
import soundfile as sf
import speech_recognition as sr
import streamlit as st
from kokoro import KPipeline

# Install espeak-ng if not installed.
# shutil.which looks the executable up on PATH without spawning a shell, so
# nothing is echoed to the server's stdout as `which` did under os.system.
if shutil.which("espeak-ng"):
    st.text("espeak-ng already installed.")
else:
    # Tell the user first: the install call blocks until apt-get returns.
    st.text("Installing espeak-ng...")
    install_result = subprocess.run(
        ["apt-get", "-qq", "-y", "install", "espeak-ng"],
        check=False,
    )
    if install_result.returncode != 0:
        # Keep the app running (best effort, as before) but surface the
        # failure instead of silently ignoring the exit status.
        st.warning(
            "Could not install espeak-ng "
            f"(apt-get exited with status {install_result.returncode})."
        )

# Streamlit App UI Setup
st.title("Text-to-Speech with Kokoro")
st.sidebar.header("Configuration & Instructions")

# Sidebar Instructions (rendered as Markdown).
# The flag emoji are written as real Unicode characters; they had been
# corrupted into mis-decoded byte sequences. A blank line precedes the closing
# sentence so Markdown renders it as its own paragraph instead of folding it
# into list item 6.
st.sidebar.markdown("""
### How to Use the Text-to-Speech App:
1. **Enter Text**: In the main text area, input any text that you want the model to convert to speech.

2. **Select Language**:
   - Choose the language of the text you are entering. Available options include:
     - 🇺🇸 American English (`a`)
     - 🇬🇧 British English (`b`)
     - 🇪🇸 Spanish (`e`)
     - 🇫🇷 French (`f`)
     - 🇮🇳 Hindi (`h`)
     - 🇮🇹 Italian (`i`)
     - 🇧🇷 Brazilian Portuguese (`p`)
     - 🇨🇳 Mandarin Chinese (`z`)
     - 🇯🇵 Japanese (`j`)

3. **Select Voice**:
   - Choose the voice style for the speech. You can pick different voices based on tone and gender, such as `af_heart`, `af_joy`, etc.

4. **Adjust Speed**:
   - Use the speed slider to change how fast the speech is generated. You can set it between `0.5x` to `2.0x`, where `1.0x` is the normal speed.

5. **Generate Speech**:
   - After configuring the settings, click on the **"Generate Audio"** button. The app will process your text and produce speech audio accordingly.

6. **Download**:
   - Once the audio is generated, you can play it directly in the app or download it as a `.wav` file by clicking on the **"Download Audio"** button.

Enjoy experimenting with the text-to-speech conversion, and feel free to try different voices, speeds, and languages!
""")

# Widgets that collect the text, language code, voice and speaking speed.
# The option lists are named up front so the widget calls stay short.
_LANGUAGE_CODES = ['a', 'b', 'e', 'f', 'h', 'i', 'p', 'z', 'j']
_VOICE_NAMES = [
    # American English
    'af_alloy', 'af_aoede', 'af_bella', 'af_heart', 'af_jessica', 'af_kore',
    'af_nicole', 'af_nova', 'af_river', 'af_sarah', 'af_sky',
    'am_adam', 'am_echo', 'am_eric', 'am_fenrir', 'am_liam', 'am_michael',
    'am_onyx', 'am_puck', 'am_santa',
    # British English
    'bf_alice', 'bf_emma', 'bf_isabella', 'bf_lily',
    'bm_daniel', 'bm_fable', 'bm_george', 'bm_lewis',
    # Spanish
    'ef_dora',
    'em_alex', 'em_santa',
    # French
    'ff_siwis',
    # Hindi
    'hf_alpha', 'hf_beta',
    'hm_omega', 'hm_psi',
    # Italian
    'if_sara',
    'im_nicola',
    # Japanese
    'jf_alpha', 'jf_gongitsune', 'jf_nezumi', 'jf_tebukuro',
    'jm_kumo',
    # Brazilian Portuguese
    'pf_dora',
    'pm_alex', 'pm_santa',
    # Mandarin Chinese
    'zf_xiaobei', 'zf_xiaoni', 'zf_xiaoxiao', 'zf_xiaoyi',
    'zm_yunjian', 'zm_yunxi', 'zm_yunxia', 'zm_yunyang',
]  # Change voice options as per model

input_text = st.text_area(
    "Enter your text here",
    "The sky above the port was the color of television...",
)
lang_code = st.selectbox("Select Language", _LANGUAGE_CODES)
voice = st.selectbox("Select Voice", _VOICE_NAMES)
speed = st.slider("Speed", min_value=0.5, max_value=2.0, value=1.0, step=0.1)

# Initialize the TTS pipeline with user-selected language.
# Streamlit re-executes this script on every widget interaction, so the
# pipeline is built through st.cache_resource: one KPipeline per language
# code is created on first use and reused on later reruns instead of being
# constructed again each time.
@st.cache_resource
def _load_pipeline(code):
    """Return the KPipeline for language ``code``, creating it on first use."""
    return KPipeline(lang_code=code)


pipeline = _load_pipeline(lang_code)

# Generate Audio function
def generate_audio(text, lang_code, voice, speed):
    """Synthesize ``text`` and return the speech as an in-memory WAV file.

    The module-level ``pipeline`` is called with ``split_pattern=r'\\n+'``, so
    text containing newlines can produce several audio segments. Every
    segment is collected and joined in order; previously the function
    returned from inside the loop and only the first segment was kept.

    Args:
        text: The text to convert to speech.
        lang_code: Not used here (the language is fixed when ``pipeline`` is
            created); kept so existing callers keep working.
        voice: Voice name forwarded to the pipeline.
        speed: Speed multiplier forwarded to the pipeline.

    Returns:
        An ``io.BytesIO`` rewound to position 0 that holds WAV data written
        at 24000 Hz, or ``None`` when the pipeline produced no audio.
    """
    generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')

    # Assumes each yielded audio converts with numpy.asarray, which
    # soundfile.write already required of it; segments without audio are
    # skipped.
    segments = [
        np.asarray(audio)
        for _graphemes, _phonemes, audio in generator
        if audio is not None
    ]
    if not segments:
        return None

    # Save the joined audio to an in-memory buffer; the format must be given
    # explicitly because a BytesIO has no file extension to infer it from.
    buffer = io.BytesIO()
    sf.write(buffer, np.concatenate(segments), 24000, format='WAV')
    buffer.seek(0)
    return buffer

# Transcribe the generated audio using speech recognition
def transcribe_audio(audio_buffer, language="en-US"):
    """Transcribe WAV audio with the Google Web Speech API.

    Args:
        audio_buffer: Seekable file-like object containing the audio. It is
            rewound before reading and again afterwards, so the result does
            not depend on the caller's stream position and the same buffer
            can be reused (e.g. for playback or download).
        language: Language tag forwarded to ``recognize_google``. Defaults to
            ``"en-US"``, the library's own default, so existing callers
            behave as before.

    Returns:
        The recognized text, or a human-readable error message when the
        speech could not be understood or the request to the service failed.
    """
    recognizer = sr.Recognizer()

    audio_buffer.seek(0)
    try:
        with sr.AudioFile(audio_buffer) as source:
            audio = recognizer.record(source)
    finally:
        # Leave the stream at the start even if reading failed.
        audio_buffer.seek(0)

    try:
        # Transcribe using Google Web Speech API (requires internet)
        return recognizer.recognize_google(audio, language=language)
    except sr.UnknownValueError:
        return "Sorry, could not understand the audio"
    except sr.RequestError as e:
        return f"Request error from Google Speech Recognition service; {e}"

# Romanize (convert to Romaji) if the language is Japanese
def romanize_text(text, lang_code):
    """Return ``text`` converted to romaji when ``lang_code`` is ``'j'``.

    For every other language code the text is returned unchanged.
    """
    is_japanese = lang_code == 'j'
    return romkan.to_romaji(text) if is_japanese else text

# Generate and display the audio file
if st.button('Generate Audio'):
    st.write("Generating speech...")
    audio_buffer = generate_audio(input_text, lang_code, voice, speed)

    if audio_buffer is None:
        # generate_audio returns None when the pipeline yields no audio;
        # report that instead of failing in the player or the recognizer.
        st.error("No audio was generated. Please enter some text and try again.")
    else:
        # Snapshot the WAV bytes once so the player and the download button
        # do not depend on the buffer's current stream position.
        audio_bytes = audio_buffer.getvalue()

        # Display Audio player in the app
        st.audio(audio_bytes, format='audio/wav')

        # Transcribe the generated speech to text.
        # NOTE: no language is passed, so recognition uses Google's default
        # (US English); speech in other languages may be transcribed poorly.
        audio_buffer.seek(0)
        transcription = transcribe_audio(audio_buffer)

        # Romanize the transcription if it's Japanese
        romanized_text = romanize_text(transcription, lang_code)

        # Display the transcribed and Romanized text
        st.write("Transcribed Text: ", transcription)
        st.write("Romanized Pronunciation: ", romanized_text)

        # Offer the generated audio file for download
        st.download_button(
            label="Download Audio",
            data=audio_bytes,
            file_name="generated_speech.wav",
            mime="audio/wav"
        )