import streamlit as st
from kokoro import KPipeline
import soundfile as sf
import numpy as np
import io
import os
import speech_recognition as sr
import romkan  # For Japanese romanization
# Install espeak-ng if it is not already present
if os.system("which espeak-ng") == 0:
    st.text("espeak-ng already installed.")
else:
    st.text("Installing espeak-ng...")
    os.system("apt-get -qq -y install espeak-ng")
# Streamlit App UI Setup
st.title("Text-to-Speech with Kokoro")
st.sidebar.header("Configuration & Instructions")
# Sidebar Instructions
st.sidebar.markdown("""
### How to Use the Text-to-Speech App:
1. **Enter Text**: In the main text area, input any text that you want the model to convert to speech.
2. **Select Language**:
   - Choose the language of the text you are entering. Available options include:
     - 🇺🇸 American English (`a`)
     - 🇬🇧 British English (`b`)
     - 🇪🇸 Spanish (`e`)
     - 🇫🇷 French (`f`)
     - 🇮🇳 Hindi (`h`)
     - 🇮🇹 Italian (`i`)
     - 🇧🇷 Brazilian Portuguese (`p`)
     - 🇨🇳 Mandarin Chinese (`z`)
     - 🇯🇵 Japanese (`j`)
3. **Select Voice**:
   - Choose the voice style for the speech. You can pick different voices based on tone and gender, such as `af_heart` or `af_bella`.
4. **Adjust Speed**:
   - Use the speed slider to change how fast the speech is generated. You can set it between `0.5x` and `2.0x`, where `1.0x` is the normal speed.
5. **Generate Speech**:
   - After configuring the settings, click the **"Generate Audio"** button. The app will process your text and produce speech audio accordingly.
6. **Download**:
   - Once the audio is generated, you can play it directly in the app or download it as a `.wav` file by clicking the **"Download Audio"** button.

Enjoy experimenting with the text-to-speech conversion, and feel free to try different voices, speeds, and languages!
""")
# User input for text, language, and voice settings
input_text = st.text_area("Enter your text here", "The sky above the port was the color of television...")
lang_code = st.selectbox("Select Language", ['a', 'b', 'e', 'f', 'h', 'i', 'p', 'z', 'j'])
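# Kokoro voice IDs encode language and gender in their prefix: the first letter matches
# the lang_code above ('a' American English, 'b' British English, ...), the second is
# 'f' (female) or 'm' (male), e.g. 'af_heart' is an American English female voice.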
voice = st.selectbox(
    "Select Voice",
    ['af_alloy', 'af_aoede', 'af_bella', 'af_heart', 'af_jessica', 'af_kore', 'af_nicole', 'af_nova', 'af_river', 'af_sarah', 'af_sky',
     'am_adam', 'am_echo', 'am_eric', 'am_fenrir', 'am_liam', 'am_michael', 'am_onyx', 'am_puck', 'am_santa',
     'bf_alice', 'bf_emma', 'bf_isabella', 'bf_lily',
     'bm_daniel', 'bm_fable', 'bm_george', 'bm_lewis',
     'ef_dora',
     'em_alex', 'em_santa',
     'ff_siwis',
     'hf_alpha', 'hf_beta',
     'hm_omega', 'hm_psi',
     'if_sara',
     'im_nicola',
     'jf_alpha', 'jf_gongitsune', 'jf_nezumi', 'jf_tebukuro',
     'jm_kumo',
     'pf_dora',
     'pm_alex', 'pm_santa',
     'zf_xiaobei', 'zf_xiaoni', 'zf_xiaoxiao', 'zf_xiaoyi',
     'zm_yunjian', 'zm_yunxi', 'zm_yunxia', 'zm_yunyang']
)  # Change voice options as per model
speed = st.slider("Speed", min_value=0.5, max_value=2.0, value=1.0, step=0.1)
# Initialize the TTS pipeline with user-selected language
pipeline = KPipeline(lang_code=lang_code)
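# Streamlit reruns this script on every interaction, so the model is reloaded each time.
# A cached variant (a sketch using Streamlit's st.cache_resource) could look like:
#
#     @st.cache_resource
#     def load_pipeline(code: str) -> KPipeline:
#         return KPipeline(lang_code=code)
#
#     pipeline = load_pipeline(lang_code)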
# Generate speech for the input text and return it as an in-memory WAV buffer
def generate_audio(text, lang_code, voice, speed):
    generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
    # The pipeline yields one (graphemes, phonemes, audio) tuple per text segment;
    # concatenate the segments so multi-line input is not cut off after the first one
    segments = [np.asarray(audio) for gs, ps, audio in generator]
    audio_data = np.concatenate(segments) if len(segments) > 1 else segments[0]
    # Save audio to an in-memory buffer (Kokoro outputs 24 kHz samples)
    buffer = io.BytesIO()
    sf.write(buffer, audio_data, 24000, format='WAV')  # Explicitly specify format as WAV
    buffer.seek(0)
    return buffer
# Transcribe the generated audio using speech recognition
def transcribe_audio(audio_buffer):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_buffer) as source:
        audio = recognizer.record(source)
    try:
        # Transcribe using the Google Web Speech API (requires internet)
        text = recognizer.recognize_google(audio)
        return text
    except sr.UnknownValueError:
        return "Sorry, could not understand the audio"
    except sr.RequestError as e:
        return f"Request error from Google Speech Recognition service; {e}"
# Romanize (convert to romaji) if the language is Japanese
def romanize_text(text, lang_code):
    if lang_code == 'j':  # Japanese language code
        # romkan converts kana to romaji; kanji in the transcription pass through unchanged
        return romkan.to_romaji(text)
    return text  # No romanization needed for other languages
# Generate and display the audio file
if st.button('Generate Audio'):
    st.write("Generating speech...")
    audio_buffer = generate_audio(input_text, lang_code, voice, speed)
    # Display audio player in the app
    st.audio(audio_buffer, format='audio/wav')
    # Transcribe the generated speech to text (rewind the buffer before re-reading it)
    audio_buffer.seek(0)
    transcription = transcribe_audio(audio_buffer)
    # Romanize the transcription if it's Japanese
    romanized_text = romanize_text(transcription, lang_code)
    # Display the transcribed and romanized text
    st.write("Transcribed Text: ", transcription)
    st.write("Romanized Pronunciation: ", romanized_text)
    # Optional: save the generated audio file for download (rewind the buffer again)
    audio_buffer.seek(0)
    st.download_button(
        label="Download Audio",
        data=audio_buffer,
        file_name="generated_speech.wav",
        mime="audio/wav"
    )