File size: 8,642 Bytes
620ebff
9934dbc
620ebff
 
 
7f7879a
620ebff
 
f15a385
9934dbc
63aca15
6d9e4e4
dded7af
 
 
 
 
 
 
 
 
6d9e4e4
474d5c4
6d9115a
 
 
 
620ebff
 
 
 
f15a385
 
63aca15
f15a385
 
1497d56
 
 
 
 
 
 
 
 
63aca15
642f415
63aca15
f15a385
642f415
63aca15
f15a385
 
642f415
f15a385
 
79351fa
 
6d9115a
620ebff
 
9e2ae1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620ebff
2e0b1fa
6ad599b
de8d4a5
63aca15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ad599b
 
63aca15
620ebff
 
 
 
dded7af
 
62bfbfb
9934dbc
 
 
 
 
 
02b5b4a
9934dbc
a8004f6
dded7af
 
9934dbc
dded7af
16b0159
9934dbc
16b0159
 
 
 
 
9934dbc
 
 
 
62bfbfb
63aca15
 
620ebff
 
 
9e2ae1b
 
 
 
 
 
620ebff
63aca15
dded7af
7f7879a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dded7af
7f7879a
 
 
dded7af
bc2b0a4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import streamlit as st
import openai
from kokoro import KPipeline
import soundfile as sf
import io
import time  # To simulate some delay for spinner visibility

# Streamlit App UI Setup
st.title("Text-to-Speech Translator with Kokoro")

# Expander with one sample sentence per supported input language, so users
# have ready-made text to paste into the text area below.
with st.expander("Sample Prompt!"):
    st.markdown(""" 
    - My name is Shukdev. (In English) 
    - Mi nombre es Shukdev. (In Spanish) 
    - Je m'appelle Choukdev. (In French) 
    - मेरा नाम शुकदेव है. (In Hindi) 
    - Il mio nome è Shukdev. (In Italian) 
    - Meu nome é Sukhdev. (In Portuguese, Brazil) 
    - 我叫苏赫德夫。(In Chinese) 
    - 私の名前はスクデフです。(In Japanese) 
    """)

# Sidebar: credit link to the Kokoro model card. The link points at the plain
# model page; the social-media click-tracking query string was dropped because
# it carries no meaning for the destination.
st.sidebar.markdown("""
        ### Courtesy: [Kokoro](https://huggingface.co/hexgrad/Kokoro-82M)
    """)

st.sidebar.header("Configuration & Instructions")

# Sidebar: step-by-step usage instructions. The language codes listed here are
# the same single-letter codes offered by the language selectbox.
st.sidebar.markdown("""
### How to Use the Text-to-Speech App:
1. **Enter Text**:
   - Type or paste the text you want to convert to speech in the main text area.
   
2. **Select Language**:
   - Choose the language of the input text. The available language options include:
     - 🇺🇸 American English (`a`)
     - 🇬🇧 British English (`b`)
     - 🇪🇸 Spanish (`e`)
     - 🇫🇷 French (`f`)
     - 🇮🇳 Hindi (`h`)
     - 🇮🇹 Italian (`i`)
     - 🇧🇷 Brazilian Portuguese (`p`)
     - 🇨🇳 Mandarin Chinese (`z`)
     - 🇯🇵 Japanese (`j`)
3. **Select Voice**:
   - Choose the voice you want for the speech. There are multiple voice styles based on tone and gender (e.g., af_heart, af_bella, etc.).
   
4. **Adjust Speech Speed**:
   - Use the slider to adjust how fast the speech will be generated. The speed can be set from 0.5x to 2.0x, with 1.0x being the default normal speed.
5. **Generate Speech**:
   - Once you've selected the text, language, voice, and speed, click the **"Generate Audio"** button. The app will process the text and generate the speech.
6. **Download Audio**:
   - After the audio is generated, you can play it directly within the app or download it as a .wav file by clicking the **"Download Audio"** button.
### Additional Features:
- **Text Translation**:
   - If you enter text in another language and want to hear it in English, provide your OpenAI API key (optional).
   - The app will automatically translate the text to English and generate the speech in English with the voice you selected. 
- Enjoy exploring different languages, voices, and speeds with the text-to-speech conversion!
""")

# Stylesheet for the custom loading indicator.
# The ".lds-ellipsis" container holds four absolutely positioned dots: the
# first scales in, the middle two slide 24px to the right, and the last one
# scales out, each on a 0.6s infinite loop. The markup that uses these classes
# is rendered when the "Generate Audio" button is pressed.
_SPINNER_STYLESHEET = """
<style>
.lds-ellipsis,
.lds-ellipsis div {
  box-sizing: border-box;
}
.lds-ellipsis {
  display: inline-block;
  position: relative;
  width: 80px;
  height: 80px;
}
.lds-ellipsis div {
  position: absolute;
  top: 33.33333px;
  width: 13.33333px;
  height: 13.33333px;
  border-radius: 50%;
  background: currentColor;
  animation-timing-function: cubic-bezier(0, 1, 1, 0);
}
.lds-ellipsis div:nth-child(1) {
  left: 8px;
  animation: lds-ellipsis1 0.6s infinite;
}
.lds-ellipsis div:nth-child(2) {
  left: 8px;
  animation: lds-ellipsis2 0.6s infinite;
}
.lds-ellipsis div:nth-child(3) {
  left: 32px;
  animation: lds-ellipsis2 0.6s infinite;
}
.lds-ellipsis div:nth-child(4) {
  left: 56px;
  animation: lds-ellipsis3 0.6s infinite;
}
@keyframes lds-ellipsis1 {
  0% {
    transform: scale(0);
  }
  100% {
    transform: scale(1);
  }
}
@keyframes lds-ellipsis3 {
  0% {
    transform: scale(1);
  }
  100% {
    transform: scale(0);
  }
}
@keyframes lds-ellipsis2 {
  0% {
    transform: translate(0, 0);
  }
  100% {
    transform: translate(24px, 0);
  }
}
</style>
"""

# unsafe_allow_html is required so the <style> tag is emitted as raw HTML
# instead of being escaped as text.
st.markdown(_SPINNER_STYLESHEET, unsafe_allow_html=True)

# User input for text, language, and voice settings
input_text = st.text_area("Enter your text here", placeholder="The sky above the port was the color of television...")

# Human-readable names for the single-letter Kokoro language codes. The
# selectbox still returns the code itself; only the displayed label changes.
LANGUAGE_NAMES = {
    'a': "American English",
    'b': "British English",
    'e': "Spanish",
    'f': "French",
    'h': "Hindi",
    'i': "Italian",
    'p': "Brazilian Portuguese",
    'z': "Mandarin Chinese",
    'j': "Japanese",
}
lang_code = st.selectbox(
    "Select Language",
    list(LANGUAGE_NAMES),
    format_func=lambda code: f"{LANGUAGE_NAMES[code]} ({code})",
)

# Voice identifiers offered to the user. Change voice options as per model.
VOICE_OPTIONS = [
    'af_alloy', 'af_aoede', 'af_bella', 'af_heart', 'af_jessica', 'af_kore', 'af_nicole', 'af_nova', 'af_river', 'af_sarah', 'af_sky',
    'am_adam', 'am_echo', 'am_eric', 'am_fenrir', 'am_liam', 'am_michael', 'am_onyx', 'am_puck', 'am_santa',
    'bf_alice', 'bf_emma', 'bf_isabella', 'bf_lily',
    'bm_daniel', 'bm_fable', 'bm_george', 'bm_lewis',
    'ef_dora',
    'em_alex', 'em_santa',
    'ff_siwis',
    'hf_alpha', 'hf_beta',
    'hm_omega', 'hm_psi',
    'if_sara',
    'im_nicola',
    'jf_alpha', 'jf_gongitsune', 'jf_nezumi', 'jf_tebukuro',
    'jm_kumo',
    'pf_dora',
    'pm_alex', 'pm_santa',
    'zf_xiaobei', 'zf_xiaoni', 'zf_xiaoxiao', 'zf_xiaoyi',
    'zm_yunjian', 'zm_yunxi', 'zm_yunxia', 'zm_yunyang',
]
voice = st.selectbox("Select Voice", VOICE_OPTIONS)
speed = st.slider("Speed", min_value=0.5, max_value=2.0, value=1.0, step=0.1)


def _load_pipeline(code):
    """Build the Kokoro TTS pipeline for the given language code."""
    return KPipeline(lang_code=code)


# Streamlit re-executes the whole script on every widget interaction, so an
# uncached KPipeline(...) call would rebuild the pipeline each time a slider
# or selectbox changes. Cache one pipeline per language code when the running
# Streamlit version provides st.cache_resource; otherwise fall back to the
# original uncached behaviour.
# NOTE(review): a cached pipeline is shared between sessions -- confirm that
# KPipeline is safe for concurrent use if the app serves several users.
if hasattr(st, "cache_resource"):
    _load_pipeline = st.cache_resource(_load_pipeline)

# Initialize the TTS pipeline with user-selected language
pipeline = _load_pipeline(lang_code)

# Password-style text input for the OpenAI API key (optional; only needed for
# the translate-to-English feature).
openai_api_key = st.text_input("Enter your OpenAI API Key (Optional for Translation)", type="password")

# Function to translate text to English using OpenAI's Chat API
def translate_to_english(api_key, text, lang_code):
    """Translate ``text`` into English using OpenAI's chat completion API.

    Args:
        api_key: OpenAI API key used for the request.
        text: Source text to translate.
        lang_code: Single-letter language code of ``text`` (one of the codes
            offered by the language selectbox). Unknown codes are passed to
            the model unchanged.

    Returns:
        The translated text. Blank input is returned as-is without calling
        the API. If the request fails for any reason the error is shown with
        ``st.error`` and the original ``text`` is returned as a fallback.
    """
    # Nothing to translate: avoid a billable API round-trip for blank input.
    if not text or not text.strip():
        return text

    # The model needs a language *name*; the bare code ("e", "z", ...) is
    # ambiguous in a natural-language prompt.
    language_names = {
        'a': "American English",
        'b': "British English",
        'e': "Spanish",
        'f': "French",
        'h': "Hindi",
        'i': "Italian",
        'p': "Brazilian Portuguese",
        'z': "Mandarin Chinese",
        'j': "Japanese",
    }
    source_language = language_names.get(lang_code, lang_code)

    try:
        # Construct the prompt for translation
        prompt = f"Translate the following text from {source_language} to English: \n\n{text}"
        messages = [
            {"role": "system", "content": "You are a helpful assistant that translates text."},
            {"role": "user", "content": prompt},
        ]

        if hasattr(openai, "OpenAI"):
            # openai>=1.0 removed the module-level ChatCompletion resource;
            # requests go through an explicit client object instead.
            client = openai.OpenAI(api_key=api_key)
            response = client.chat.completions.create(model="gpt-4", messages=messages)
            content = response.choices[0].message.content
        else:
            # Legacy (<1.0) interface.
            openai.api_key = api_key
            response = openai.ChatCompletion.create(model="gpt-4", messages=messages)
            content = response['choices'][0]['message']['content']

        translated_text = (content or "").strip()

        # Drop a leading "The translated text ...:" preamble if the model
        # added one. partition() keeps the full reply when there is no colon,
        # instead of raising and discarding a usable translation.
        if translated_text.lower().startswith("the translated text"):
            _, separator, remainder = translated_text.partition(":")
            if separator:
                translated_text = remainder.strip()

        return translated_text
    except Exception as e:
        # Deliberate best-effort boundary: surface the problem in the UI and
        # let the caller continue with the untranslated text.
        st.error(f"Error occurred during translation: {e}")
        return text  # Fallback to original text in case of an error

# Generate Audio function
def generate_audio(text, lang_code, voice, speed):
    """Synthesize ``text`` to speech and return it as an in-memory WAV file.

    Args:
        text: Text to speak. It is split into segments on runs of newlines.
        lang_code: Single-letter Kokoro language code to synthesize with.
        voice: Voice identifier passed to the pipeline.
        speed: Speech speed multiplier passed to the pipeline.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 containing WAV data written
        at a 24000 Hz sample rate, covering every segment of ``text`` in
        order; ``None`` when the text is blank or no audio was produced.
    """
    # Blank input cannot produce audio; report that explicitly.
    if not text or not text.strip():
        return None

    # numpy is needed to join the per-segment audio; imported locally so this
    # function does not depend on a file-level import being present.
    import numpy as np

    # Reuse the module-level pipeline only when it was built for the requested
    # language; otherwise build one for ``lang_code`` so that, for example, an
    # English translation is not synthesized by a pipeline set up for the
    # source language.
    # NOTE(review): assumes KPipeline exposes its language as ``lang_code``;
    # if it does not, a fresh pipeline is simply created.
    tts = pipeline
    if getattr(tts, "lang_code", None) != lang_code:
        tts = KPipeline(lang_code=lang_code)

    # The pipeline yields one (graphemes, phonemes, audio) result per segment.
    # Collect them all; returning on the first one would drop every segment
    # after the first newline.
    segments = [
        np.asarray(audio)
        for _, _, audio in tts(text, voice=voice, speed=speed, split_pattern=r'\n+')
        if audio is not None
    ]
    if not segments:
        return None

    # Save audio to in-memory buffer; the format must be given explicitly
    # because a BytesIO has no file extension to infer it from.
    buffer = io.BytesIO()
    sf.write(buffer, np.concatenate(segments), 24000, format='WAV')
    buffer.seek(0)
    return buffer

# Generate and display the audio file
if st.button('Generate Audio'):
    if not input_text.strip():
        # Without this guard, blank input yields no audio buffer and the
        # download button below would be handed None as its data.
        st.warning("Please enter some text to convert to speech.")
    else:
        # Display custom spinner while generating audio
        spinner_placeholder = st.empty()  # Create a placeholder for the spinner
        spinner_placeholder.markdown('<div class="lds-ellipsis"><div></div><div></div><div></div><div></div></div>', unsafe_allow_html=True)

        try:
            # Generate audio for the original text
            audio_buffer = generate_audio(input_text, lang_code, voice, speed)
        finally:
            # Remove the spinner even if synthesis raised, so it does not keep
            # animating next to the error output.
            spinner_placeholder.empty()

        if audio_buffer is None:
            st.error("No audio could be generated for the given text.")
        else:
            # Display Audio player for the original language
            st.audio(audio_buffer, format='audio/wav')

            # Optional: Save the generated audio file for download (Original Text)
            st.download_button(
                label="Download Audio (Original Text)",
                data=audio_buffer,
                file_name="generated_speech_original.wav",
                mime="audio/wav"
            )

        # Check if OpenAI API Key is provided for translation and English audio generation
        if openai_api_key:
            # Translate the input text to English using OpenAI
            translated_text = translate_to_english(openai_api_key, input_text, lang_code)

            # Generate audio for the translated English text ('a' = American English)
            translated_audio_buffer = generate_audio(translated_text, 'a', voice, speed)

            # Display the translated text, then its audio if any was produced
            st.write(f"Translated Text: {translated_text}")
            if translated_audio_buffer is None:
                st.error("No audio could be generated for the translated text.")
            else:
                st.audio(translated_audio_buffer, format='audio/wav')

                # Optional: Save the generated audio file for download (Translated Text)
                st.download_button(
                    label="Download Audio (Translated to English)",
                    data=translated_audio_buffer,
                    file_name="generated_speech_translated.wav",
                    mime="audio/wav"
                )