import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import math
from pydub import AudioSegment
import subprocess


# Fetch the voices available from edge-tts
async def get_voices():
    voices = await edge_tts.list_voices()
    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}


# Convert text to speech
async def text_to_speech(text, voice, rate, pitch):
    if not text.strip():
        return None, "Please enter text"
    if not voice:
        return None, "Please select a voice"
    try:
        communicate = edge_tts.Communicate(
            text,
            voice.split(" - ")[0],
            rate=f"{int(rate):+d}%",
            pitch=f"{int(pitch):+d}Hz"
        )
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            await communicate.save(tmp_file.name)
            return tmp_file.name, None
    except Exception as e:
        return None, f"Speech generation failed: {str(e)}"


# Add background music (now deletes the original speech file)
def add_background_music(speech_path, bg_music_path):
    speech = AudioSegment.from_file(speech_path)
    background = AudioSegment.from_file(bg_music_path) - 16  # reduce by 16 dB (about 15% of the original level)

    # Loop the background track until it covers the speech plus a 3-second tail
    if len(background) < len(speech) + 3000:
        background = background * math.ceil((len(speech) + 3000) / len(background))

    combined = speech.overlay(background[:len(speech)])
    fade_out = background[len(speech):len(speech) + 3000].fade_out(3000)
    final_audio = combined + fade_out

    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        final_audio.export(tmp_file.name, format="mp3")

    # Remove the original speech-only file
    if os.path.exists(speech_path):
        os.remove(speech_path)

    return tmp_file.name


# Process multiple videos (now cleans up its temporary files)
def process_videos(audio_path, video_files):
    temp_files = []
    try:
        audio_duration = AudioSegment.from_file(audio_path).duration_seconds

        # Concatenate the uploaded videos
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as concat_video:
            temp_files.append(concat_video.name)
            with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as list_file:
                temp_files.append(list_file.name)
                list_file.write("\n".join(f"file '{v.name}'" for v in video_files))
            subprocess.run([
                "ffmpeg", "-y", "-f", "concat", "-safe", "0",
                "-i", list_file.name, "-c", "copy", concat_video.name
            ], check=True)

        # Build the final video: loop the clips over the audio, then fade out over 3 seconds
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as final_video:
            subprocess.run([
                "ffmpeg", "-y", "-stream_loop", "-1", "-i", concat_video.name,
                "-i", audio_path, "-t", str(audio_duration + 3),
                "-c:v", "libx264", "-c:a", "aac",
                "-vf", f"fade=t=out:st={audio_duration}:d=3",
                "-af", f"afade=t=out:st={audio_duration}:d=3",
                "-shortest", final_video.name
            ], check=True)
            return final_video.name
    finally:
        # Remove intermediate files (the concat list and the concatenated video)
        for f in temp_files:
            if os.path.exists(f):
                os.remove(f)


# Main pipeline (now deletes the uploaded source videos)
async def tts_interface(text, voice, rate, pitch, bg_music, video_files):
    temp_audio = None
    video_path = None
    try:
        # Generate the main speech audio
        temp_audio, warning = await text_to_speech(text, voice, rate, pitch)
        if warning:
            return None, None, gr.Warning(warning)

        # Mix in background music if provided
        if bg_music:
            temp_audio = add_background_music(temp_audio, bg_music)

        # Process the uploaded videos
        if video_files:
            video_path = process_videos(temp_audio, video_files)
            # Remove the uploaded source videos
            for video in video_files:
                if hasattr(video, 'name') and os.path.exists(video.name):
                    os.remove(video.name)

        return temp_audio, video_path, None
    except Exception as e:
        return None, None, gr.Warning(f"Processing error: {str(e)}")
    finally:
        # Remove the temporary speech audio if it exists and is not the final output
        if temp_audio and os.path.exists(temp_audio):
            try:
                if video_path and temp_audio != video_path:
                    os.remove(temp_audio)
            except OSError:
                # Ignore the error if the file was already removed
                pass


# Create the interface (unchanged)
async def create_demo():
    voices = await get_voices()

    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5, placeholder="Enter your text here..."),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice"),
            gr.Slider(-50, 50, 0, label="Speech Rate (%)"),
            gr.Slider(-20, 20, 0, label="Pitch (Hz)"),
            gr.Audio(label="Background Music", type="filepath"),
            gr.File(label="Upload Videos", file_types=[".mp4", ".mov"], file_count="multiple")
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Video(label="Final Video"),
            gr.Markdown(visible=False)
        ],
        title="Multi-Video TTS with Looping",
        description="""
        This app creates custom videos by combining text, audio and multiple video clips.
        It converts text to speech (TTS), optionally adds background music to enrich the
        generated audio, and plays the uploaded videos in sequence, looping them for as long
        as the audio lasts. The result is a video that synchronizes the audio with the
        concatenated clips, with smooth transitions between them and a fade-out at the end.
        The app also cleans up its temporary files and the uploaded source videos
        automatically, so they do not accumulate on the server. It is well suited to dynamic
        content such as motivational videos, automated presentations and promotional material.
        """,
        css="#component-0 {max-width: 800px}"
    )
    return demo


async def main():
    demo = await create_demo()
    demo.queue()
    demo.launch()


if __name__ == "__main__":
    asyncio.run(main())
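
# ---------------------------------------------------------------------------
# Dependency note (inferred from the imports and subprocess calls above, so
# treat it as an assumption rather than an official requirements list): the
# app needs gradio, edge-tts and pydub installed, plus an ffmpeg binary on
# PATH, since both pydub and process_videos() shell out to it. A typical
# setup might look like:
#
#   pip install gradio edge-tts pydub
#   # and a system ffmpeg, e.g. `sudo apt-get install ffmpeg` on Debian/Ubuntu
# ---------------------------------------------------------------------------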