Spaces:
Running
Running
File size: 11,855 Bytes
41cb666 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 |
# Initialize a pipeline
from kokoro import KPipeline
# from IPython.display import display, Audio
# import soundfile as sf
import os
from huggingface_hub import list_repo_files
import uuid
import re
import gradio as gr
# Maps the human-readable language names shown in the UI dropdown to the
# one-letter lang_code values expected by kokoro.KPipeline.
language_map = {
"American English": "a",
"British English": "b",
"Hindi": "h",
"Spanish": "e",
"French": "f",
"Italian": "i",
"Brazilian Portuguese": "p",
"Japanese": "j",
"Mandarin Chinese": "z"
}
# Languages whose G2P support needs an extra pip package; update_pipeline()
# surfaces these commands as a warning instead of crashing.
install_messages = {
"Japanese": "pip install misaki[ja]",
"Mandarin Chinese": "pip install misaki[zh]"
}
def update_pipeline(Language):
    """Re-initialize the global KPipeline when the requested language changes.

    Args:
        Language: Human-readable language name (a key of ``language_map``).
            Unknown names fall back to American English ("a").

    Side effects:
        Rebinds the module-level ``pipeline`` and ``last_used_language``.
    """
    global pipeline, last_used_language
    # Languages that need an optional misaki extra: warn with the install
    # command and fall back to the default English pipeline.
    if Language in install_messages:
        gr.Warning(f"To Use {Language} Install: {install_messages[Language]}",duration=10)
        # Only rebuild the pipeline if we are not already on the English
        # fallback (the original rebuilt it unconditionally).
        if last_used_language != "a":
            pipeline = KPipeline(lang_code="a")
            last_used_language = "a"
        return
    # Map the display name to its one-letter code; default to English.
    new_lang = language_map.get(Language, "a")
    # Only rebuild when the language actually changed (KPipeline init is costly).
    if new_lang != last_used_language:
        try:
            pipeline = KPipeline(lang_code=new_lang)
            last_used_language = new_lang
            print(f"Pipeline updated to {Language} ({new_lang})")
        except Exception as e:
            # Any init failure reverts to the known-good English pipeline.
            print(f"Error initializing KPipeline: {e}\nRetrying with default language...")
            pipeline = KPipeline(lang_code="a")
            last_used_language = "a"
def get_voice_names(repo_id):
    """Return the voice names (file stems under ``voices/``) available in
    the given Hugging Face repository."""
    names = []
    for path in list_repo_files(repo_id):
        if not path.startswith("voices/"):
            continue
        stem, _ext = os.path.splitext(path.replace("voices/", ""))
        names.append(stem)
    return names
def create_audio_dir():
    """Ensure a 'kokoro_audio' folder exists under the current working
    directory and return its absolute path."""
    audio_dir = os.path.join(os.getcwd(), "kokoro_audio")
    if os.path.exists(audio_dir):
        print(f"Directory already exists: {audio_dir}")
    else:
        os.makedirs(audio_dir)
        print(f"Created directory: {audio_dir}")
    return audio_dir
import re
def clean_text(text):
    """Normalize raw input for TTS: strip markdown markers and dashes,
    remove emojis, and collapse all whitespace runs to single spaces.

    Args:
        text: Raw input string.

    Returns:
        A cleaned, single-line string with no leading/trailing whitespace.
    """
    # Markdown/dash characters become spaces. Order matters: "**" must be
    # replaced before "*". The em-dash is included for consistency with the
    # en-dash handling (it previously slipped through).
    replacements = {
        "–": " ",   # en-dash
        "—": " ",   # em-dash
        "-": " ",   # hyphen
        "**": " ",  # bold marker
        "*": " ",   # emphasis marker
        "#": " ",   # heading marker
    }
    for old, new in replacements.items():
        text = text.replace(old, new)
    # Remove emojis/pictographs across the common Unicode blocks.
    emoji_pattern = re.compile(
        r'[\U0001F600-\U0001F64F]|'  # emoticons
        r'[\U0001F300-\U0001F5FF]|'  # misc symbols and pictographs
        r'[\U0001F680-\U0001F6FF]|'  # transport and map symbols
        r'[\U0001F700-\U0001F77F]|'  # alchemical symbols
        r'[\U0001F780-\U0001F7FF]|'  # geometric shapes extended
        r'[\U0001F800-\U0001F8FF]|'  # supplemental arrows-C
        r'[\U0001F900-\U0001F9FF]|'  # supplemental symbols and pictographs
        r'[\U0001FA00-\U0001FA6F]|'  # chess symbols
        r'[\U0001FA70-\U0001FAFF]|'  # symbols and pictographs extended-A
        r'[\U00002702-\U000027B0]|'  # dingbats
        r'[\U0001F1E0-\U0001F1FF]',  # flags (iOS)
        flags=re.UNICODE)
    text = emoji_pattern.sub('', text)
    # Collapse whitespace (including newlines) and trim the ends.
    return re.sub(r'\s+', ' ', text).strip()
def tts_file_name(text, folder=None):
    """Build a unique .wav output path derived from the input text.

    Args:
        text: Text being synthesized; used to derive a readable slug.
        folder: Target directory. Defaults to the module-level
            ``temp_folder`` for backward compatibility with existing callers.

    Returns:
        Path string of the form "<folder>/<slug>_<RANDOM8>.wav".
    """
    if folder is None:
        folder = temp_folder  # module-level audio directory created at startup
    # Keep only letters and spaces, lowercase, join words with underscores.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower().strip().replace(" ", "_")
    # Cap the slug at 20 chars; an empty slug becomes "empty".
    truncated_text = text[:20] if text else "empty"
    # Random hex suffix guarantees uniqueness across calls with the same text.
    random_string = uuid.uuid4().hex[:8].upper()
    return f"{folder}/{truncated_text}_{random_string}.wav"
# import soundfile as sf
import numpy as np
import wave
from pydub import AudioSegment
from pydub.silence import split_on_silence
def remove_silence_function(file_path, minimum_silence=50):
    """Strip long silences from a wav file.

    Writes the result next to the input as '<name>_no_silence.wav' and
    returns that path. ``minimum_silence`` is the silence pad (ms) kept
    around each voiced chunk.
    """
    audio_format = "wav"
    output_path = file_path.replace(".wav", "_no_silence.wav")
    source = AudioSegment.from_file(file_path, format=audio_format)
    # Split wherever silence lasts >= 100 ms below -45 dBFS, keeping a
    # short pad of silence on each side of every chunk.
    chunks = split_on_silence(
        source,
        min_silence_len=100,
        silence_thresh=-45,
        keep_silence=minimum_silence,
    )
    # Stitch the voiced chunks back together and export.
    combined = sum(chunks, AudioSegment.empty())
    combined.export(output_path, format=audio_format)
    return output_path
def generate_and_save_audio(text, Language="American English",voice="af_bella", speed=1,remove_silence=False,keep_silence_up_to=0.05):
    """Synthesize ``text`` into a 16-bit mono 24 kHz wav file.

    Args:
        text: Input text; cleaned with clean_text() before synthesis.
        Language: Display name used to select/refresh the global pipeline.
        voice: Kokoro voice pack name (e.g. "af_bella").
        speed: Speaking-speed multiplier passed to the pipeline.
        remove_silence: If True, post-process the wav to drop long silences.
        keep_silence_up_to: Seconds of silence padding to keep when trimming.

    Returns:
        Tuple (path, path) — the same path twice so Gradio can feed both the
        audio player and the file-download component.
    """
    text = clean_text(text)
    update_pipeline(Language)
    # Kokoro yields (graphemes, phonemes, audio-tensor) chunks per segment.
    generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
    save_path = tts_file_name(text)
    with wave.open(save_path, 'wb') as wav_file:
        wav_file.setnchannels(1)      # mono
        wav_file.setsampwidth(2)      # 16-bit samples
        wav_file.setframerate(24000)  # Kokoro output sample rate
        for i, (gs, ps, audio) in enumerate(generator):
            # Clip to [-1, 1] before scaling so out-of-range samples
            # saturate instead of wrapping around when cast to int16.
            audio_np = np.clip(audio.numpy(), -1.0, 1.0)
            audio_int16 = (audio_np * 32767).astype(np.int16)
            wav_file.writeframes(audio_int16.tobytes())
    if remove_silence:
        keep_silence = int(keep_silence_up_to * 1000)  # seconds -> milliseconds
        save_path = remove_silence_function(save_path, minimum_silence=keep_silence)
    return save_path, save_path
def ui():
    """Build and return the main Gradio Blocks app for multilingual TTS."""
    def toggle_autoplay(autoplay):
        # Re-create the output Audio component so a changed autoplay flag takes effect.
        return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)
    # Example rows in the order of the gr.Examples inputs: [text, language, voice].
    dummy_examples = [
        ["Hey, y'all, let’s grab some coffee and catch up!", "American English", "af_bella"],
        ["I'd like a large coffee, please.", "British English", "bf_isabella"],
        ["नमस्ते, कैसे हो?", "Hindi", "hf_alpha"],
        ["Hola, ¿cómo estás?", "Spanish", "ef_dora"],
        ["Bonjour, comment ça va?", "French", "ff_siwis"],
        ["Ciao, come stai?", "Italian", "if_sara"],
        ["Olá, como você está?", "Brazilian Portuguese", "pf_dora"],
        ["こんにちは、お元気ですか?", "Japanese", "jf_nezumi"],
        ["你好,你怎么样?", "Mandarin Chinese", "zf_xiaoni"]
    ]
    with gr.Blocks() as demo:
        # gr.Markdown("<center><h1 style='font-size: 40px;'>KOKORO TTS</h1></center>") # Larger title with CSS
        lang_list = ['American English', 'British English', 'Hindi', 'Spanish', 'French', 'Italian', 'Brazilian Portuguese', 'Japanese', 'Mandarin Chinese']
        # Voice packs are discovered live from the Hugging Face model repo.
        voice_names = get_voice_names("hexgrad/Kokoro-82M")
        with gr.Row():
            with gr.Column():
                text = gr.Textbox(label='Enter Text', lines=3)
                with gr.Row():
                    language_name = gr.Dropdown(lang_list, label="Select Language", value=lang_list[0])
                with gr.Row():
                    voice_name = gr.Dropdown(voice_names, label="Choose VoicePack", value=voice_names[0])
                with gr.Row():
                    generate_btn = gr.Button('Generate', variant='primary')
                with gr.Accordion('Audio Settings', open=False):
                    speed = gr.Slider(minimum=0.25, maximum=2, value=1, step=0.1, label='⚡️Speed', info='Adjust the speaking speed')
                    remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
            with gr.Column():
                audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
                audio_file = gr.File(label='Download Audio')
                with gr.Accordion('Enable Autoplay', open=False):
                    autoplay = gr.Checkbox(value=True, label='Autoplay')
                    autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
        # Pressing Enter in the textbox and clicking Generate both synthesize audio.
        text.submit(generate_and_save_audio, inputs=[text, language_name, voice_name, speed, remove_silence], outputs=[audio, audio_file])
        generate_btn.click(generate_and_save_audio, inputs=[text, language_name, voice_name, speed, remove_silence], outputs=[audio, audio_file])
        # Add examples to the interface
        gr.Examples(examples=dummy_examples, inputs=[text, language_name, voice_name])
    return demo
def tutorial():
    """Build and return a Gradio tab explaining the voice-pack naming scheme."""
    # Markdown explanation for language code
    explanation = """
    ## Language Code Explanation:
    Example: `'af_bella'`
    - **'a'** stands for **American English**.
    - **'f_'** stands for **Female** (If it were 'm_', it would mean Male).
    - **'bella'** refers to the specific voice.
    The first character in the voice code stands for the language:
    - **"a"**: American English
    - **"b"**: British English
    - **"h"**: Hindi
    - **"e"**: Spanish
    - **"f"**: French
    - **"i"**: Italian
    - **"p"**: Brazilian Portuguese
    - **"j"**: Japanese
    - **"z"**: Mandarin Chinese
    The second character stands for gender:
    - **"f_"**: Female
    - **"m_"**: Male
    """
    with gr.Blocks() as demo2:
        gr.Markdown(explanation)  # Display the explanation
    return demo2
import click
@click.command()
@click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
@click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
def main(debug, share):
    """CLI entry point: assemble both tabs and launch the Gradio app."""
    tts_tab = ui()
    help_tab = tutorial()
    app = gr.TabbedInterface(
        [tts_tab, help_tab],
        ["Multilingual TTS", "VoicePack Explanation"],
        title="Kokoro TTS",
        theme='JohnSmith9982/small_and_pretty',
    )
    # queue() enables request queuing so long generations don't block the UI.
    app.queue().launch(debug=debug, share=share)
# Initialize the default (American English) pipeline and the output
# directory at import time so the first request pays no setup cost.
last_used_language = "a"
pipeline = KPipeline(lang_code=last_used_language)
temp_folder = create_audio_dir()

if __name__ == "__main__":
    main()