Edge-TTS-Text-to-Speech

Running

File size: 4,396 Bytes

63f1d6d
 
 
 
 
299c2df
 
 
63f1d6d
299c2df
63f1d6d
 
 
 
299c2df
63f1d6d
 
8529fe9
63f1d6d
8529fe9
63f1d6d
 
 
 
 
 
 
 
 
 
299c2df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8529fe9
299c2df
 
 
5f7fd2a
 
 
 
 
 
 
299c2df
5f7fd2a
 
d81bde6
63f1d6d
 
 
d81bde6
ff4d2d2
d81bde6
 
 
63f1d6d
 
 
 
 
 
299c2df
5f7fd2a
63f1d6d
 
 
299c2df
63f1d6d
 
 
d81bde6
 
63f1d6d
4f5115c
 
63f1d6d
 
 
8529fe9
 
 
 
 
63f1d6d
299c2df

import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
from pydub import AudioSegment
from pydub.playback import play
import math

# Fetch the voices that Edge TTS currently offers
async def get_voices():
    """Return a dict mapping a display label to the voice's short name.

    Each label has the form ``"<ShortName> - <Locale> (<Gender>)"`` and is
    built from the entries returned by ``edge_tts.list_voices()``.
    """
    label_to_short_name = {}
    for entry in await edge_tts.list_voices():
        label = "{} - {} ({})".format(entry['ShortName'], entry['Locale'], entry['Gender'])
        label_to_short_name[label] = entry['ShortName']
    return label_to_short_name

# Main text-to-speech conversion function
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize ``text`` with Edge TTS and save it to a temporary MP3 file.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only text is
            rejected with a warning message.
        voice: Either a dropdown label of the form
            ``"<ShortName> - <Locale> (<Gender>)"`` or a bare short name;
            everything before the first ``" - "`` is used as the voice name.
        rate: Speech-rate adjustment in percent (rounded to an integer).
        pitch: Pitch adjustment in Hz (rounded to an integer).

    Returns:
        ``(path, None)`` on success, where ``path`` is the generated MP3 file
        (the caller is responsible for deleting it), or ``(None, message)``
        when the input is invalid.

    Raises:
        Whatever ``edge_tts.Communicate.save`` raises; the temporary file is
        removed before the exception propagates.
    """
    if not text or not text.strip():
        return None, "Please enter text to convert."
    if not voice:
        return None, "Please select a voice."

    voice_short_name = voice.split(" - ")[0]

    # UI sliders may deliver floats (e.g. 10.0); the ``+d`` format spec only
    # accepts integers, so normalize first.
    rate_str = f"{int(round(rate)):+d}%"
    pitch_str = f"{int(round(pitch)):+d}Hz"
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

    # Only reserve a unique path here and close the handle right away:
    # writing to the path while the handle is still open fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)
    except BaseException:
        # Includes task cancellation: never leave an orphaned temp file behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    return tmp_path, None

# Mix a background music track underneath the generated speech
def add_background_music(speech_file, background_music_file, output_file):
    """Overlay attenuated background music on a speech track and export MP3.

    Args:
        speech_file: Path to the speech audio (MP3).
        background_music_file: Path to the background music; the container
            format is auto-detected, so it does not have to be MP3.
        output_file: Path the mixed MP3 is written to.

    The music is lowered by 16 dB (roughly 15% amplitude), repeated if it is
    shorter than the speech, and trimmed to the speech length. If the music
    track is empty, the speech is exported unchanged.
    """
    speech = AudioSegment.from_mp3(speech_file)
    # The music comes from a user upload and may be WAV/OGG/etc.; forcing the
    # MP3 decoder would fail on those, so let the decoder detect the format.
    background_music = AudioSegment.from_file(background_music_file)

    if len(background_music) == 0:
        # Nothing to mix, and looping a zero-length track would divide by zero.
        final_audio = speech
    else:
        # Approximate reduction to 15% volume.
        background_music = background_music - 16

        # Repeat the music when it is shorter than the speech.
        if len(background_music) < len(speech):
            repetitions = math.ceil(len(speech) / len(background_music))
            background_music = background_music * repetitions

        # Trim the music so it matches the speech duration.
        background_music = background_music[:len(speech)]

        final_audio = speech.overlay(background_music)

    # export() hands back an open file object; close it so the handle is not
    # left dangling until garbage collection.
    final_audio.export(output_file, format="mp3").close()
    print(f"Archivo generado exitosamente: {output_file}")

# Gradio callback
async def tts_interface(text, voice, rate, pitch, background_music):
    """Generate speech and optionally mix it with background music.

    Args:
        text, voice, rate, pitch: Forwarded unchanged to ``text_to_speech``.
        background_music: Path to a background music file, or a falsy value
            (``None`` / empty string) when no music was supplied.

    Returns:
        A 3-tuple matching the three interface outputs. The first item is the
        path of the resulting audio file (or ``None`` when ``text_to_speech``
        reported a warning); the second is always ``None``; the third is the
        result of ``gr.Warning(...)`` when a warning was reported, else ``None``.

    Raises:
        Whatever ``add_background_music`` raises; both temporary files are
        cleaned up before the exception propagates.
    """
    speech_file, warning = await text_to_speech(text, voice, rate, pitch)
    if warning:
        return None, None, gr.Warning(warning)

    # Without background music the plain speech file is the result.
    if not background_music:
        return speech_file, None, None

    # Reserve the output path and close the handle before the mixer writes to
    # it; writing to a path that is still held open fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        output_file = tmp_file.name

    try:
        add_background_music(speech_file, background_music, output_file)
    except Exception:
        # Mixing failed: the reserved output file is useless, remove it.
        if os.path.exists(output_file):
            os.remove(output_file)
        raise
    finally:
        # The intermediate speech file is no longer needed either way.
        if os.path.exists(speech_file):
            os.remove(speech_file)
    return output_file, None, None

async def create_demo():
    """Build and return the ``gr.Interface`` for the text-to-speech app.

    The voice dropdown is populated from ``get_voices()``, with an empty
    first choice that is also the initial value.
    """
    voice_map = await get_voices()

    description = """
    Convert text to speech with audio background to 15% volumen, perfect for audiobooks or youtube videos ! using Microsoft Edge TTS. Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.
    
    """

    # Leading empty entry matches the dropdown's initial value.
    voice_choices = [""]
    voice_choices.extend(voice_map.keys())

    input_components = [
        gr.Textbox(label="Input Text", lines=5),
        gr.Dropdown(choices=voice_choices, label="Select Voice", value=""),
        gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
        gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
        # Created without the 'optional' argument.
        gr.Audio(label="Background Music", type="filepath"),
    ]
    output_components = [
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.Image(label="Visualization", visible=False),
        gr.Markdown(label="Warning", visible=False),
    ]

    return gr.Interface(
        fn=tts_interface,
        inputs=input_components,
        outputs=output_components,
        title="Edge TTS Text-to-Speech",
        description=description,
        article="Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!",
        analytics_enabled=False,
        allow_flagging="manual",
        api_name=None,
    )

async def main():
    """Create the demo, enable its queue, and launch it.

    The queue is configured with ``default_concurrency_limit=5`` and the app
    is launched with ``show_api=False``.
    """
    app = await create_demo()
    app.queue(default_concurrency_limit=5)
    app.launch(show_api=False)

# Run the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    asyncio.run(main())