Spaces:
Runtime error
Runtime error
import gradio as gr | |
import openai | |
from kokoro import KPipeline, KModel | |
import random | |
import os | |
import torch | |
import time | |
# No OpenAI key is configured at import time; the user supplies one through
# the UI when requesting a translation.
openai.api_key = None

# True when torch can see a CUDA device.
CUDA_AVAILABLE = torch.cuda.is_available()

# One KModel per device, keyed by "is this the GPU copy?". The CPU model
# (key False) is always built; the CUDA model (key True) only when available.
models = {
    on_gpu: KModel().to(device).eval()
    for on_gpu, device in ((False, 'cpu'), (True, 'cuda'))
    if CUDA_AVAILABLE or not on_gpu
}

# One KPipeline per language code, created with model=False; the models in
# `models` are invoked separately by the synthesis functions.
pipelines = {
    code: KPipeline(lang_code=code, model=False)
    for code in 'abefhijpz'
}

# Override the lexicon entry for the word "kokoro" in the 'a' and 'b' pipelines.
pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
# Load the sample sentences offered by the "Random Text" button: one list of
# lines per language, read from '<lang>.txt' in the working directory.
random_texts = {}
for lang in ['en']:
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding, and drop blank lines so an empty string is never offered
    # as a sample.
    with open(f'{lang}.txt', 'r', encoding='utf-8') as r:
        random_texts[lang] = [text for text in map(str.strip, r) if text]
def get_random_text(voice):
    """Return a random sample sentence for the given voice.

    The first character of ``voice`` is used to pick the sample-text language.
    Only 'a' and 'b' voices have an explicit mapping (both to 'en'); every
    other voice falls back to 'en' as well, because English is the only
    sample set loaded into ``random_texts``. Without the fallback, any
    non-'a'/'b' voice raised ``KeyError``.

    Args:
        voice: Voice identifier such as ``'af_heart'``. ``None`` or an empty
            string is tolerated and treated like an unmapped voice.

    Returns:
        One entry chosen at random from ``random_texts[lang]``.
    """
    prefix = (voice or '')[:1]
    lang = {'a': 'en', 'b': 'en'}.get(prefix, 'en')
    return random.choice(random_texts[lang])
# Generate function to create speech from text
def generate_first(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
    """Synthesize audio for the first segment the pipeline yields for ``text``.

    The pipeline is selected by the first character of ``voice``. Only the
    first item produced by the pipeline is synthesized; the function returns
    from inside the loop.

    Args:
        text: Text to speak.
        voice: Voice identifier; ``voice[0]`` must be a key of ``pipelines``.
        speed: Speed value forwarded to the pipeline and the model.
        use_gpu: Request the CUDA model. Ignored when CUDA is unavailable.

    Returns:
        A pair ``((24000, samples), ps)``: ``samples`` is the model output as
        a NumPy array (24000 is presumably the sample rate expected by the
        audio component) and ``ps`` is the phoneme string produced by the
        pipeline. Returns ``(None, '')`` when the pipeline yields nothing.
    """
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    # `models` only has a True key when CUDA is available, so never index it
    # with True otherwise.
    use_gpu = bool(use_gpu) and CUDA_AVAILABLE
    for _, ps, _ in pipeline(text, voice, speed):
        # The voice pack entry is selected by the phoneme-string length.
        ref_s = pack[len(ps) - 1]
        if use_gpu:
            try:
                # Call the CUDA model directly: the `forward_gpu` helper that
                # was referenced here is not defined anywhere in this file.
                audio = models[True](ps, ref_s, speed)
            except (gr.exceptions.Error, RuntimeError) as e:
                # GPU failures (e.g. CUDA errors) degrade to the CPU model
                # instead of failing the request.
                gr.Warning(str(e))
                gr.Info('Retrying with CPU. To avoid this error, change Hardware to CPU.')
                audio = models[False](ps, ref_s, speed)
        else:
            audio = models[False](ps, ref_s, speed)
        # Move to host memory first; .numpy() fails on a CUDA tensor.
        return (24000, audio.detach().cpu().numpy()), ps
    return None, ''
# Translator function using OpenAI API
def translate_to_english(api_key, text, lang_code):
    """Translate ``text`` into English with the OpenAI chat completions API.

    Works with both generations of the ``openai`` package: the client-based
    interface (openai>=1.0, where ``openai.ChatCompletion`` no longer exists)
    and the legacy module-level interface. With the client interface the key
    is passed to a per-call client rather than stored in the process-wide
    ``openai.api_key``, so one user's key is not left behind for other
    sessions.

    Args:
        api_key: OpenAI API key supplied by the user.
        text: Text to translate.
        lang_code: Description of the source language, inserted verbatim into
            the prompt.

    Returns:
        The translated text with surrounding whitespace stripped. An empty
        string when ``text`` is empty or blank. On any failure, a string of
        the form ``"Error: <message>"`` (this function does not raise).
    """
    if not text or not text.strip():
        # Nothing to translate; avoid a pointless API call.
        return ""
    if not api_key:
        return "Error: an OpenAI API key is required for translation."

    prompt = f"Translate the following text from {lang_code} to English: \n\n{text}"
    messages = [
        {"role": "system", "content": "You are a helpful assistant that translates text."},
        {"role": "user", "content": prompt},
    ]
    try:
        if hasattr(openai, "OpenAI"):
            # openai>=1.0: client object, attribute-style response.
            client = openai.OpenAI(api_key=api_key)
            response = client.chat.completions.create(model="gpt-4", messages=messages)
            content = response.choices[0].message.content
        else:
            # Legacy openai<1.0: module-level key, dict-style response.
            openai.api_key = api_key
            response = openai.ChatCompletion.create(model="gpt-4", messages=messages)
            content = response['choices'][0]['message']['content']
        return (content or "").strip()
    except Exception as e:
        # Deliberate catch-all at the UI boundary: callers expect a string.
        return f"Error: {str(e)}"
def generate_audio_from_text(text, lang_code, voice, speed, use_gpu=True):
    """Synthesize audio for the first segment of ``text`` with a chosen pipeline.

    Unlike ``generate_first``, the pipeline is selected explicitly through
    ``lang_code`` instead of being derived from the voice name, and only the
    audio is returned.

    Args:
        text: Text to speak.
        lang_code: Key into ``pipelines`` selecting the language pipeline.
        voice: Voice identifier loaded through the selected pipeline.
        speed: Speed value forwarded to the pipeline and the model.
        use_gpu: Request the CUDA model. Ignored when CUDA is unavailable.

    Returns:
        ``(24000, samples)`` where ``samples`` is the model output as a NumPy
        array (24000 is presumably the sample rate), or ``None`` when the
        pipeline yields nothing for ``text``.
    """
    pipeline = pipelines[lang_code]
    pack = pipeline.load_voice(voice)
    # `models` only has a True key when CUDA is available.
    use_gpu = bool(use_gpu) and CUDA_AVAILABLE
    for _, ps, _ in pipeline(text, voice, speed):
        # The voice pack entry is selected by the phoneme-string length.
        ref_s = pack[len(ps) - 1]
        if use_gpu:
            try:
                # Call the CUDA model directly: the `forward_gpu` helper that
                # was referenced here is not defined anywhere in this file.
                audio = models[True](ps, ref_s, speed)
            except (gr.exceptions.Error, RuntimeError) as e:
                # GPU failures degrade to the CPU model instead of failing.
                gr.Warning(str(e))
                gr.Info('Switching to CPU')
                audio = models[False](ps, ref_s, speed)
        else:
            audio = models[False](ps, ref_s, speed)
        # Move to host memory first; .numpy() fails on a CUDA tensor.
        return (24000, audio.detach().cpu().numpy())
    return None
# Available voices: voice id -> display label.
# Voice ids follow '<language><gender>_<name>': the first letter is the
# pipeline language code and the second letter is 'f' or 'm'. The label's
# icon must agree with that letter (🚺 for 'f', 🚹 for 'm').
CHOICES = {
    'af_heart': '🇺🇸 🚺 Heart ❤️',
    'af_bella': '🇺🇸 🚺 Bella 🔥',
    'af_nicole': '🇺🇸 🚺 Nicole 🎧',
    'af_aoede': '🇺🇸 🚺 Aoede',
    'af_kore': '🇺🇸 🚺 Kore',
    'af_sarah': '🇺🇸 🚺 Sarah',
    'af_nova': '🇺🇸 🚺 Nova',
    'af_sky': '🇺🇸 🚺 Sky',
    'af_alloy': '🇺🇸 🚺 Alloy',
    'af_jessica': '🇺🇸 🚺 Jessica',
    'af_river': '🇺🇸 🚺 River',
    'am_michael': '🇺🇸 🚹 Michael',
    'am_fenrir': '🇺🇸 🚹 Fenrir',
    'am_puck': '🇺🇸 🚹 Puck',
    'am_echo': '🇺🇸 🚹 Echo',
    'am_eric': '🇺🇸 🚹 Eric',
    'am_liam': '🇺🇸 🚹 Liam',
    'am_onyx': '🇺🇸 🚹 Onyx',
    'am_santa': '🇺🇸 🚹 Santa',
    'am_adam': '🇺🇸 🚹 Adam',
    'bf_emma': '🇬🇧 🚺 Emma',
    'bf_isabella': '🇬🇧 🚺 Isabella',
    'bf_alice': '🇬🇧 🚺 Alice',
    'bf_lily': '🇬🇧 🚺 Lily',
    'bm_george': '🇬🇧 🚹 George',
    'bm_fable': '🇬🇧 🚹 Fable',
    'bm_lewis': '🇬🇧 🚹 Lewis',
    'bm_daniel': '🇬🇧 🚹 Daniel',
    'ef_dora': '🇪🇸 🚺 Dora',
    'em_alex': '🇪🇸 🚹 Alex',
    'em_santa': '🇪🇸 🚹 Santa',
    'ff_siwis': '🇫🇷 🚺 Siwis',
    'hf_alpha': '🇮🇳 🚺 Alpha',
    'hf_beta': '🇮🇳 🚺 Beta',
    'hm_omega': '🇮🇳 🚹 Omega',
    'hm_psi': '🇮🇳 🚹 Psi',
    'if_sara': '🇮🇹 🚺 Sara',
    'im_nicola': '🇮🇹 🚹 Nicola',
    'jf_alpha': '🇯🇵 🚺 Alpha',
    'jf_gongitsune': '🇯🇵 🚺 Gongitsune',
    'jf_nezumi': '🇯🇵 🚺 Nezumi',
    'jf_tebukuro': '🇯🇵 🚺 Tebukuro',
    'jm_kumo': '🇯🇵 🚹 Kumo',
    'pf_dora': '🇧🇷 🚺 Dora',
    'pm_alex': '🇧🇷 🚹 Alex',
    'pm_santa': '🇧🇷 🚹 Santa',
    'zf_xiaobei': '🇨🇳 🚺 Xiaobei',
    'zf_xiaoni': '🇨🇳 🚺 Xiaoni',
    'zf_xiaoxiao': '🇨🇳 🚺 Xiaoxiao',
    'zf_xiaoyi': '🇨🇳 🚺 Xiaoyi',
    'zm_yunjian': '🇨🇳 🚹 Yunjian',
    'zm_yunxi': '🇨🇳 🚹 Yunxi',
    'zm_yunxia': '🇨🇳 🚹 Yunxia',
    'zm_yunyang': '🇨🇳 🚹 Yunyang',
}
# Gradio interface setup: inputs on the left, outputs and action buttons on
# the right, then the event wiring for the three buttons.
with gr.Blocks() as app:
    gr.Markdown("### Kokoro Text-to-Speech with Translation")
    with gr.Row():
        with gr.Column():
            # Input for text and language settings
            input_text = gr.Textbox(label="Enter Text", placeholder="Type your text here...")
            # Gradio reads choice tuples as (display name, value), whereas
            # CHOICES maps voice id -> display label. Flip each pair so the
            # label is shown and the voice id is what handlers receive.
            voice = gr.Dropdown(
                [(label, voice_id) for voice_id, label in CHOICES.items()],
                value='af_heart',
                label='Voice',
            )
            use_gpu = gr.Checkbox(label="Use GPU", value=CUDA_AVAILABLE)
            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
            openai_api_key = gr.Textbox(label="Enter OpenAI API Key (for translation)", type="password")
            # Feeds handle_translation's lang_code parameter, which previously
            # had no matching input component.
            source_lang = gr.Textbox(
                label="Source Language (for translation)",
                placeholder="e.g. Spanish (leave blank to auto-detect)",
            )
            random_btn = gr.Button("Random Text")
        with gr.Column():
            out_audio = gr.Audio(label="Generated Audio", interactive=False, autoplay=True)
            out_text = gr.Textbox(label="Generated Audio Tokens", interactive=False)
            generate_btn = gr.Button("Generate Audio")
            translate_btn = gr.Button("Translate and Generate Audio")

    random_btn.click(fn=get_random_text, inputs=[voice], outputs=[input_text])

    def handle_translation(text, api_key, lang_code, voice, speed, use_gpu):
        """Translate ``text`` to English, then speak the translation.

        Returns the synthesized audio and the translated text, which is shown
        in the token textbox. A blank ``lang_code`` asks the translator to
        work from the text's original language.
        """
        source = (lang_code or '').strip() or 'its original language'
        translated_text = translate_to_english(api_key, text, source)
        # The translation is English, so the 'a' pipeline is used regardless
        # of the selected voice's language.
        translated_audio = generate_audio_from_text(translated_text, 'a', voice, speed, use_gpu)
        return translated_audio, translated_text

    # The inputs list must supply one component per handler parameter, in order.
    translate_btn.click(
        fn=handle_translation,
        inputs=[input_text, openai_api_key, source_lang, voice, speed, use_gpu],
        outputs=[out_audio, out_text],
    )

    def generate_and_play(text, voice, speed, use_gpu):
        """Synthesize ``text`` directly and return (audio, phoneme tokens)."""
        audio, tokens = generate_first(text, voice, speed, use_gpu)
        return audio, tokens

    generate_btn.click(
        fn=generate_and_play,
        inputs=[input_text, voice, speed, use_gpu],
        outputs=[out_audio, out_text],
    )

app.launch()