# Hugging Face Space (status when captured: Running on Zero)
import os
import random
import time

import gradio as gr
import openai
import torch
from kokoro import KModel, KPipeline
# The OpenAI API key is supplied per request through the UI, so nothing is
# configured at import time.
openai.api_key = None

# Whether a CUDA device is visible to this process.
CUDA_AVAILABLE = torch.cuda.is_available()

# One TTS model per device, keyed by "is this the GPU model?". The CPU model
# (key False) is always built; the CUDA model (key True) only when a GPU exists.
models = {
    gpu: KModel().to('cuda' if gpu else 'cpu').eval()
    for gpu in [False] + ([True] if CUDA_AVAILABLE else [])
}

# One pipeline per single-letter language code. They are created with
# model=False, so inference always goes through the shared `models` above.
pipelines = {
    lang_code: KPipeline(lang_code=lang_code, model=False)
    for lang_code in 'abefhijpz'
}

# Pronunciation overrides for the word "kokoro" in the two English pipelines
# ('a' and 'b' are the prefixes get_random_text maps to 'en').
pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'


def forward_gpu(ps, ref_s, speed):
    """Run one synthesis step on the CUDA model.

    The generation functions call this helper on the GPU path, but the file
    never defined it. It is only reached when CUDA_AVAILABLE is true, which
    is exactly when ``models[True]`` exists.

    NOTE(review): on a ZeroGPU Space this function presumably needs the
    ``@spaces.GPU`` decorator; that package is not imported here - confirm.
    """
    return models[True](ps, ref_s, speed)


# Voice dropdown entries as {label: voice id}. The UI reads this name, but the
# file never defined it. The first letter of each id selects the pipeline
# ('a' or 'b'), matching what get_random_text supports.
# NOTE(review): ids assumed from the public Kokoro voice list - confirm they
# match the voices shipped with the installed model.
CHOICES = {
    'US Female - Heart': 'af_heart',
    'US Female - Bella': 'af_bella',
    'US Female - Nicole': 'af_nicole',
    'US Female - Sarah': 'af_sarah',
    'US Female - Sky': 'af_sky',
    'US Male - Adam': 'am_adam',
    'US Male - Michael': 'am_michael',
    'UK Female - Emma': 'bf_emma',
    'UK Female - Isabella': 'bf_isabella',
    'UK Male - George': 'bm_george',
    'UK Male - Lewis': 'bm_lewis',
}

# Sample sentences for the "Random Text" button, one per line of '<lang>.txt'.
random_texts = {}
for lang in ['en']:
    # Explicit encoding so the result does not depend on the host locale.
    with open(f'{lang}.txt', 'r', encoding='utf-8') as r:
        random_texts[lang] = [line.strip() for line in r]
def get_random_text(voice):
    """Return a random sample sentence for the language of *voice*.

    The first character of the voice id selects the language; only the
    prefixes 'a' and 'b' are known, and both map to the 'en' samples.
    Any other prefix raises KeyError.
    """
    language_by_prefix = {'a': 'en', 'b': 'en'}
    prefix = voice[0]
    samples = random_texts[language_by_prefix[prefix]]
    return random.choice(samples)
# Synthesize speech for the first segment the pipeline produces
def generate_first(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
    """Generate audio for the first segment of *text*.

    The pipeline is chosen by the first character of *voice*. Only the first
    item yielded by the pipeline is synthesized. The GPU path is used when
    both *use_gpu* and CUDA_AVAILABLE are truthy; if it raises a Gradio error,
    a warning is shown and synthesis is retried on the CPU model.

    Returns:
        ``((24000, samples), ps)`` for the first segment, where ``ps`` is the
        second element yielded by the pipeline, or ``(None, '')`` when the
        pipeline yields nothing.

    Raises:
        gr.Error: when synthesis on the CPU path raises a Gradio error.
    """
    tts = pipelines[voice[0]]
    voice_pack = tts.load_voice(voice)
    on_gpu = use_gpu and CUDA_AVAILABLE

    first_segment = next(iter(tts(text, voice, speed)), None)
    if first_segment is None:
        return None, ''

    _, ps, _ = first_segment
    # The voice pack is indexed by segment length (zero-based).
    ref_s = voice_pack[len(ps) - 1]
    try:
        audio = forward_gpu(ps, ref_s, speed) if on_gpu else models[False](ps, ref_s, speed)
    except gr.exceptions.Error as e:
        if not on_gpu:
            raise gr.Error(e)
        gr.Warning(str(e))
        gr.Info('Retrying with CPU. To avoid this error, change Hardware to CPU.')
        audio = models[False](ps, ref_s, speed)
    return (24000, audio.numpy()), ps
# Translator function using OpenAI API
def translate_to_english(api_key, text, lang_code):
    """Translate *text* into English using the OpenAI chat completions API.

    Works with both the openai>=1.0 client interface and the legacy
    (<1.0) module-level ``openai.ChatCompletion`` interface.

    Args:
        api_key: OpenAI API key supplied by the user.
        text: Text to translate.
        lang_code: Description of the source language, inserted verbatim
            into the prompt.

    Returns:
        The translated text; an empty string when *text* is blank; or a
        string starting with ``"Error: "`` when the key is missing or the
        API call fails. Failures are reported through the return value
        rather than raised.
    """
    # Nothing to translate: skip the network round trip entirely.
    if not text or not text.strip():
        return ""
    api_key = (api_key or "").strip()
    if not api_key:
        return "Error: an OpenAI API key is required for translation."

    prompt = f"Translate the following text from {lang_code} to English: \n\n{text}"
    messages = [
        {"role": "system", "content": "You are a helpful assistant that translates text."},
        {"role": "user", "content": prompt},
    ]
    try:
        if hasattr(openai, "OpenAI"):
            # openai>=1.0: per-call client, no global key mutation needed.
            client = openai.OpenAI(api_key=api_key)
            response = client.chat.completions.create(model="gpt-4o", messages=messages)
            content = response.choices[0].message.content
        else:
            # Legacy openai<1.0 module-level API.
            openai.api_key = api_key
            response = openai.ChatCompletion.create(model="gpt-4o", messages=messages)
            content = response['choices'][0]['message']['content']
        # The API may return no content; treat that as an empty translation.
        return (content or "").strip()
    except Exception as e:
        # Boundary with an external service: callers expect an error string.
        return f"Error: {str(e)}"
def generate_audio_from_text(text, lang_code, voice, speed, use_gpu=True):
    """Synthesize the first segment of *text* with the *lang_code* pipeline.

    Unlike generate_first, the pipeline is chosen by *lang_code* rather than
    by the voice prefix, and only the audio is returned. The GPU path is used
    when both *use_gpu* and CUDA_AVAILABLE are truthy; if it raises a Gradio
    error, synthesis falls back to the CPU model.

    Returns:
        ``(24000, samples)`` for the first segment the pipeline yields, or
        ``None`` when it yields nothing.

    Raises:
        gr.Error: when synthesis on the CPU path raises a Gradio error.
    """
    tts = pipelines[lang_code]
    voice_pack = tts.load_voice(voice)
    gpu_enabled = use_gpu and CUDA_AVAILABLE

    for _, tokens, _ in tts(text, voice, speed):
        # The voice pack is indexed by segment length (zero-based).
        style_ref = voice_pack[len(tokens) - 1]
        try:
            if gpu_enabled:
                waveform = forward_gpu(tokens, style_ref, speed)
            else:
                waveform = models[False](tokens, style_ref, speed)
        except gr.exceptions.Error as err:
            if not gpu_enabled:
                raise gr.Error(err)
            gr.Warning(str(err))
            gr.Info('Switching to CPU')
            waveform = models[False](tokens, style_ref, speed)
        # Only the first segment is synthesized.
        return (24000, waveform.numpy())
    return None
# Gradio interface setup
def handle_translation(text, api_key, lang_code, voice, speed, use_gpu):
    """Translate *text* to English, then synthesize the translation.

    Args:
        text: Text to translate.
        api_key: OpenAI API key entered in the UI.
        lang_code: Source language as typed by the user; when blank, the
            prompt asks the model to translate from the original language.
        voice: Voice id used for synthesis.
        speed: Speech speed multiplier.
        use_gpu: Whether to try the GPU model.

    Returns:
        ``(audio, translated_text)`` for the audio and text outputs.

    Raises:
        gr.Error: when translation fails, so the failure message is shown in
            the UI and not synthesized as speech.
    """
    source_language = (lang_code or '').strip() or 'its original language'
    translated_text = translate_to_english(api_key, text, source_language)
    # translate_to_english reports failures as an "Error: ..." string.
    if translated_text.startswith('Error: '):
        raise gr.Error(translated_text)
    # The translation is English, so it is always spoken with the 'a' pipeline.
    translated_audio = generate_audio_from_text(translated_text, 'a', voice, speed, use_gpu)
    return translated_audio, translated_text


def generate_and_play(text, voice, speed, use_gpu):
    """Synthesize *text* directly and return ``(audio, tokens)`` for the UI."""
    audio, tokens = generate_first(text, voice, speed, use_gpu)
    return audio, tokens


with gr.Blocks() as app:
    gr.Markdown("### Kokoro Text-to-Speech with Translation")
    with gr.Row():
        with gr.Column():
            # Input for text and language settings
            input_text = gr.Textbox(label="Enter Text", placeholder="Type your text here...")
            voice = gr.Dropdown(list(CHOICES.items()), value='af_heart', label='Voice')
            use_gpu = gr.Checkbox(label="Use GPU", value=CUDA_AVAILABLE)
            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
            openai_api_key = gr.Textbox(label="Enter OpenAI API Key (for translation)", type="password")
            # Feeds handle_translation's lang_code parameter, which previously
            # had no matching input component.
            source_lang = gr.Textbox(
                label="Source Language (for translation)",
                placeholder="e.g. French (leave blank to auto-detect)",
            )
            random_btn = gr.Button("Random Text")
        with gr.Column():
            out_audio = gr.Audio(label="Generated Audio", interactive=False, autoplay=True)
            out_text = gr.Textbox(label="Generated Audio Tokens", interactive=False)
            generate_btn = gr.Button("Generate Audio")
            translate_btn = gr.Button("Translate and Generate Audio")

    random_btn.click(fn=get_random_text, inputs=[voice], outputs=[input_text])
    # The inputs list must match handle_translation's six parameters, in order.
    translate_btn.click(
        fn=handle_translation,
        inputs=[input_text, openai_api_key, source_lang, voice, speed, use_gpu],
        outputs=[out_audio, out_text],
    )
    generate_btn.click(
        fn=generate_and_play,
        inputs=[input_text, voice, speed, use_gpu],
        outputs=[out_audio, out_text],
    )

app.launch()