Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -3,7 +3,7 @@ from kokoro import KPipeline
|
|
3 |
import soundfile as sf
|
4 |
import io
|
5 |
import os
|
6 |
-
import
|
7 |
|
8 |
# Install espeak-ng if not installed
|
9 |
if not os.system("which espeak-ng"):
|
@@ -13,9 +13,9 @@ else:
|
|
13 |
st.text("Installing espeak-ng...")
|
14 |
|
15 |
# Streamlit App UI Setup
|
16 |
-
st.title("
|
17 |
|
18 |
-
# Expander section
|
19 |
with st.expander("Sample Prompt!"):
|
20 |
st.markdown("""
|
21 |
- My name is Shukdev. (In English)
|
@@ -27,84 +27,125 @@ with st.expander("Sample Prompt!"):
|
|
27 |
- 我叫苏赫德夫。(In Chinese)
|
28 |
- 私の名前はスクデフです。(In Japanese)
|
29 |
""")
|
30 |
-
|
31 |
-
# Sidebar Instructions and Configuration
|
32 |
st.sidebar.header("Configuration & Instructions")
|
33 |
|
|
|
34 |
st.sidebar.markdown("""
|
35 |
### How to Use the Text-to-Speech App:
|
36 |
-
1. **Enter Text**:
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
""")
|
44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
# User input for text, language, and voice settings
|
46 |
input_text = st.text_area("Enter your text here", placeholder="The sky above the port was the color of television...")
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
voice = st.selectbox("Select Voice", ['af_alloy', 'af_aoede', 'af_bella', 'af_heart', 'af_jessica', 'af_kore', 'af_nicole', 'af_nova', 'af_river', 'af_sarah', 'af_sky',
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
background_music = st.selectbox("Add Background Music", ['None', 'Calm', 'Energetic', 'Focus', 'Nature'])
|
68 |
|
69 |
# Initialize the TTS pipeline with user-selected language
|
70 |
pipeline = KPipeline(lang_code=lang_code)
|
71 |
|
72 |
-
#
|
73 |
-
def
|
74 |
generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
|
75 |
for i, (gs, ps, audio) in enumerate(generator):
|
76 |
audio_data = audio
|
77 |
-
|
78 |
-
# Simulate adding background music (using silence as placeholder)
|
79 |
-
if music_type != 'None':
|
80 |
-
st.write(f"Adding {music_type} background music...")
|
81 |
-
# Add some background music (could be a placeholder for real music loading)
|
82 |
-
music = io.BytesIO()
|
83 |
-
music.write(b'\0' * 500000) # Placeholder: Replace with actual music blending logic
|
84 |
-
|
85 |
# Save audio to in-memory buffer
|
86 |
buffer = io.BytesIO()
|
87 |
-
|
|
|
88 |
buffer.seek(0)
|
89 |
return buffer
|
90 |
|
91 |
-
# Generate
|
92 |
if st.button('Generate Audio'):
|
93 |
-
st.write("Generating speech
|
94 |
-
audio_buffer =
|
95 |
|
96 |
-
# Display
|
97 |
st.audio(audio_buffer, format='audio/wav')
|
98 |
|
99 |
-
#
|
100 |
st.download_button(
|
101 |
label="Download Audio",
|
102 |
data=audio_buffer,
|
103 |
-
file_name="
|
104 |
mime="audio/wav"
|
105 |
)
|
106 |
|
107 |
-
#
|
108 |
-
st.
|
109 |
-
if
|
|
|
110 |
st.audio(audio_buffer, format='audio/wav')
|
|
|
3 |
import soundfile as sf
|
4 |
import io
|
5 |
import os
|
6 |
+
from langdetect import detect # Language detection library
|
7 |
|
8 |
# Install espeak-ng if not installed
|
9 |
if not os.system("which espeak-ng"):
|
|
|
13 |
st.text("Installing espeak-ng...")
|
14 |
|
15 |
# Streamlit App UI Setup
|
16 |
+
st.title("Text-to-Speech with Kokoro")
|
17 |
|
18 |
+
# Expander section to display information in multiple languages
|
19 |
with st.expander("Sample Prompt!"):
|
20 |
st.markdown("""
|
21 |
- My name is Shukdev. (In English)
|
|
|
27 |
- 我叫苏赫德夫。(In Chinese)
|
28 |
- 私の名前はスクデフです。(In Japanese)
|
29 |
""")
|
30 |
+
|
|
|
31 |
st.sidebar.header("Configuration & Instructions")
|
32 |
|
33 |
+
# Sidebar Instructions
|
34 |
st.sidebar.markdown("""
|
35 |
### How to Use the Text-to-Speech App:
|
36 |
+
1. **Enter Text**: In the main text area, input any text that you want the model to convert to speech.
|
37 |
+
|
38 |
+
2. **Select Language**:
|
39 |
+
- Choose the language of the text you are entering. Available options include:
|
40 |
+
- 🇺🇸 American English (`a`)
|
41 |
+
- 🇬🇧 British English (`b`)
|
42 |
+
- 🇪🇸 Spanish (`e`)
|
43 |
+
- 🇫🇷 French (`f`)
|
44 |
+
- 🇮🇳 Hindi (`h`)
|
45 |
+
- 🇮🇹 Italian (`i`)
|
46 |
+
- 🇧🇷 Brazilian Portuguese (`p`)
|
47 |
+
- 🇨🇳 Mandarin Chinese (`z`)
|
48 |
+
- 🇯🇵 Japanese (`j`)
|
49 |
+
|
50 |
+
3. **Select Voice**:
|
51 |
+
- Choose the voice style for the speech. You can pick different voices based on tone and gender, such as `af_heart`, `af_joy`, etc.
|
52 |
+
|
53 |
+
4. **Adjust Speed**:
|
54 |
+
- Use the speed slider to change how fast the speech is generated. You can set it between `0.5x` to `2.0x`, where `1.0x` is the normal speed.
|
55 |
+
5. **Generate Speech**:
|
56 |
+
- After configuring the settings, click on the **"Generate Audio"** button. The app will process your text and produce speech audio accordingly.
|
57 |
+
|
58 |
+
6. **Download**:
|
59 |
+
- Once the audio is generated, you can play it directly in the app or download it as a `.wav` file by clicking on the **"Download Audio"** button.
|
60 |
+
Enjoy experimenting with the text-to-speech conversion, and feel free to try different voices, speeds, and languages!
|
61 |
""")
|
62 |
|
63 |
+
st.sidebar.markdown("""
|
64 |
+
### Courtesy: [Kokoro](https://huggingface.co/hexgrad/Kokoro-82M?fbclid=IwY2xjawIKqzxleHRuA2FlbQIxMAABHaf9GldgYOzXktNuoRtNKqd-aL7r-S7zPGyC8ttYOiG2zYfQqLyV4Qm75A_aem_0wKLC2C87ZZ2F04WjPJbtA)
|
65 |
+
""")
|
66 |
+
|
67 |
+
# Language Detection Function
|
68 |
+
def detect_language(text):
    """Detect the language of *text* using langdetect's ``detect``.

    Args:
        text: The raw string entered by the user. May be empty or ``None``.

    Returns:
        The language code reported by ``detect`` (for example ``'en'``), or
        ``None`` when *text* is blank or detection fails.
    """
    # A blank text area is the normal state before the user has typed
    # anything. langdetect raises on input with no detectable features, so
    # bail out quietly instead of showing an error banner on every page load.
    if not text or not text.strip():
        return None
    try:
        return detect(text)
    except Exception as e:
        # Best-effort: report the problem in the UI and let the caller fall
        # back to manual language selection.
        st.error("Error detecting language: " + str(e))
        return None
|
75 |
+
|
76 |
# User input for text, language, and voice settings
|
77 |
input_text = st.text_area("Enter your text here", placeholder="The sky above the port was the color of television...")
|
78 |
+
auto_detect_lang = detect_language(input_text)
|
79 |
+
|
80 |
+
# Set detected language to the selectbox (if detected)
|
81 |
+
if auto_detect_lang:
    # Translate langdetect's language codes into the one-letter codes that
    # KPipeline is constructed with further down.
    lang_map = {
        'en': 'a',  # American English
        'es': 'e',  # Spanish
        'fr': 'f',  # French
        'hi': 'h',  # Hindi
        'it': 'i',  # Italian
        'pt': 'p',  # Portuguese
        'zh': 'z',  # Chinese
        'ja': 'j',  # Japanese
    }
    # langdetect reports Chinese as 'zh-cn' / 'zh-tw', never bare 'zh'.
    # Drop any region suffix so those results hit the 'zh' entry instead of
    # silently falling through to the English default.
    base_lang = auto_detect_lang.split('-')[0]
    lang_code = lang_map.get(base_lang, 'a')  # Default to English if not in map
else:
    # Detection was skipped or failed: let the user pick the language.
    lang_code = st.selectbox("Select Language", ['a', 'b', 'e', 'f', 'h', 'i', 'p', 'z', 'j'])
|
95 |
+
|
96 |
voice = st.selectbox("Select Voice", ['af_alloy', 'af_aoede', 'af_bella', 'af_heart', 'af_jessica', 'af_kore', 'af_nicole', 'af_nova', 'af_river', 'af_sarah', 'af_sky',
|
97 |
+
'am_adam', 'am_echo', 'am_eric', 'am_fenrir', 'am_liam', 'am_michael', 'am_onyx', 'am_puck', 'am_santa',
|
98 |
+
'bf_alice', 'bf_emma', 'bf_isabella', 'bf_lily',
|
99 |
+
'bm_daniel', 'bm_fable', 'bm_george', 'bm_lewis',
|
100 |
+
'ef_dora',
|
101 |
+
'em_alex', 'em_santa',
|
102 |
+
'ff_siwis',
|
103 |
+
'hf_alpha', 'hf_beta',
|
104 |
+
'hm_omega', 'hm_psi',
|
105 |
+
'if_sara',
|
106 |
+
'im_nicola',
|
107 |
+
'jf_alpha', 'jf_gongitsune', 'jf_nezumi', 'jf_tebukuro',
|
108 |
+
'jm_kumo',
|
109 |
+
'pf_dora',
|
110 |
+
'pm_alex', 'pm_santa',
|
111 |
+
'zf_xiaobei', 'zf_xiaoni', 'zf_xiaoxiao', 'zf_xiaoyi',
|
112 |
+
'zm_yunjian', 'zm_yunxi', 'zm_yunxia', 'zm_yunyang'])
|
113 |
+
|
114 |
+
speed = st.slider("Speed", min_value=0.5, max_value=2.0, value=1.0, step=0.1)
|
|
|
115 |
|
116 |
# Initialize the TTS pipeline with user-selected language
|
117 |
pipeline = KPipeline(lang_code=lang_code)
|
118 |
|
119 |
+
# Generate Audio function
|
120 |
+
def generate_audio(text, lang_code, voice, speed):
    """Synthesize *text* and return the result as an in-memory WAV file.

    Args:
        text: Text to speak. The pipeline is asked to split it on runs of
            newlines (``split_pattern=r'\n+'``).
        lang_code: Not used inside this function; the module-level
            ``pipeline`` is what actually runs. Kept so existing callers
            keep working.
        voice: Voice name forwarded to the pipeline.
        speed: Speed factor forwarded to the pipeline.

    Returns:
        An ``io.BytesIO`` rewound to offset 0, containing WAV data written
        at a 24000 Hz sample rate.

    Raises:
        ValueError: If the pipeline yields no audio (e.g. blank *text*).
    """
    import numpy as np  # local import keeps the file's import block untouched

    generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')

    # The pipeline yields one 3-tuple per segment, with the audio last.
    # Keep every segment: the previous loop overwrote its variable on each
    # iteration, so only the final segment ever reached the output file.
    # NOTE(review): assumes each audio chunk is a 1-D mono array/tensor that
    # np.asarray can convert - confirm against the Kokoro version in use.
    segments = [np.asarray(audio) for _, _, audio in generator if audio is not None]
    if not segments:
        # Previously this surfaced as an UnboundLocalError on audio_data.
        raise ValueError("No audio was generated; please enter some text.")
    audio_data = np.concatenate(segments)

    # Save audio to in-memory buffer; format must be explicit because a
    # BytesIO has no file extension to infer it from.
    buffer = io.BytesIO()
    sf.write(buffer, audio_data, 24000, format='WAV')
    buffer.seek(0)
    return buffer
|
130 |
|
131 |
+
# Generate and display the audio file
|
132 |
if st.button('Generate Audio'):
    if input_text and input_text.strip():
        st.write("Generating speech...")
        audio_buffer = generate_audio(input_text, lang_code, voice, speed)
        # st.button is only True on the rerun triggered by the click. Keep
        # the rendered bytes in session state so the player, download button
        # and replay option survive later reruns caused by other widgets.
        st.session_state['audio_bytes'] = audio_buffer.getvalue()
    else:
        # Nothing to synthesize; tell the user instead of crashing.
        st.warning("Please enter some text before generating audio.")

audio_bytes = st.session_state.get('audio_bytes')
if audio_bytes:
    # Display Audio player in the app
    st.audio(audio_bytes, format='audio/wav')

    # Offer the generated audio file for download
    st.download_button(
        label="Download Audio",
        data=audio_bytes,
        file_name="generated_speech.wav",
        mime="audio/wav"
    )

    # Interactive Voice Feedback. This lives outside the button branch
    # because choosing an option reruns the script with the button False.
    feedback = st.radio("Do you want to hear it again?", ('No', 'Yes'))
    if feedback == 'Yes':
        st.write("Replaying the generated speech...")
        st.audio(audio_bytes, format='audio/wav')
|