File size: 5,444 Bytes
800530a 63f1d6d 800530a 299c2df 2e5ba54 4aea5de 800530a 2e5ba54 800530a 2e5ba54 63f1d6d 011525c 55cc4ac 8676795 011525c 8676795 011525c 8676795 011525c 299c2df 011525c 2e5ba54 011525c 2e5ba54 011525c 2e5ba54 011525c 2e5ba54 011525c 2e5ba54 011525c 2e5ba54 011525c 2e5ba54 011525c 2e5ba54 011525c 8529fe9 2e5ba54 011525c 2e5ba54 011525c 2e5ba54 011525c 2e5ba54 011525c d81bde6 2e5ba54 63f1d6d 55cc4ac 63f1d6d 011525c 800530a 209bf9f 63f1d6d 800530a 011525c 63f1d6d 2e5ba54 011525c d068ede 011525c d068ede |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import math
from pydub import AudioSegment
import subprocess
# Fetch the catalogue of available Edge TTS voices
async def get_voices():
    """Return a mapping of human-readable labels to voice short names.

    Keys look like "en-US-JennyNeural - en-US (Female)"; values are the
    ShortName identifiers that edge_tts.Communicate expects.
    """
    catalogue = await edge_tts.list_voices()
    labels = {}
    for entry in catalogue:
        label = f"{entry['ShortName']} - {entry['Locale']} ({entry['Gender']})"
        labels[label] = entry['ShortName']
    return labels
# Convert text to speech with edge-tts
async def text_to_speech(text, voice, rate, pitch):
    """Generate speech audio from *text* using the selected Edge TTS voice.

    Args:
        text: Text to synthesize.
        voice: Dropdown label ("ShortName - Locale (Gender)"); the ShortName
            prefix is extracted before the first " - ".
        rate: Speech rate adjustment in percent (may arrive as float from a slider).
        pitch: Pitch adjustment in Hz (may arrive as float from a slider).

    Returns:
        (mp3_path, None) on success, or (None, warning_message) on failure.
    """
    # Validate inputs first; report the most specific problem.
    # (Fixed: whitespace-only text previously produced the "select a voice"
    # message even when a voice was chosen.)
    if not text or not text.strip():
        return None, "Please enter text and select a voice"
    if not voice:
        return None, "Please select a voice"
    try:
        communicate = edge_tts.Communicate(
            text,
            voice.split(" - ")[0],
            # Coerce to int: Gradio sliders can deliver floats, and the
            # "+d" format code raises ValueError on a float.
            rate=f"{int(rate):+d}%",
            pitch=f"{int(pitch):+d}Hz"
        )
        # delete=False so the file survives for Gradio to serve;
        # the caller is responsible for cleanup.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            await communicate.save(tmp_file.name)
        return tmp_file.name, None
    except Exception as e:
        return None, f"Speech generation failed: {str(e)}"
# Mix background music under the generated speech
def add_background_music(speech_path, bg_music_path):
    """Overlay attenuated background music beneath the speech track.

    The music is lowered by 16 dB, looped until it covers the speech plus a
    3-second tail, overlaid under the speech, and the tail is faded out.
    Returns the path of a new temporary MP3 file (caller cleans it up).
    """
    voice_track = AudioSegment.from_file(speech_path)
    music_track = AudioSegment.from_file(bg_music_path) - 16  # 15% volume

    speech_ms = len(voice_track)
    tail_ms = 3000  # 3-second musical fade-out after the speech ends

    # Loop the music until it spans speech + tail.
    required = speech_ms + tail_ms
    if len(music_track) < required:
        repeats = math.ceil(required / len(music_track))
        music_track = music_track * repeats

    # Speech over music, followed by the faded tail.
    mixed = voice_track.overlay(music_track[:speech_ms])
    tail = music_track[speech_ms:speech_ms + tail_ms].fade_out(tail_ms)
    result = mixed + tail

    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        result.export(tmp_file.name, format="mp3")
    return tmp_file.name
# Build the final looping video synced to the audio track
def process_videos(audio_path, video_files):
    """Concatenate the uploaded videos, loop them, and mux in the audio.

    The uploaded clips are concatenated (stream copy) via ffmpeg's concat
    demuxer, then looped indefinitely and trimmed to the audio duration plus
    a 3-second fade-out on both video and audio.
    Returns the path of the final temporary MP4 (caller cleans it up).
    """
    audio_duration = AudioSegment.from_file(audio_path).duration_seconds

    # Step 1: concatenate all uploaded clips into one video.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as concat_video:
        # The concat demuxer reads file paths from a text manifest.
        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as list_file:
            manifest = "\n".join(f"file '{v.name}'" for v in video_files)
            list_file.write(manifest)
            list_file.close()
        subprocess.run([
            "ffmpeg", "-y",
            "-f", "concat",
            "-safe", "0",
            "-i", list_file.name,
            "-c", "copy",
            concat_video.name
        ], check=True)

    # Step 2: loop the concatenated video for the audio's duration + fade tail.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as final_video:
        subprocess.run([
            "ffmpeg", "-y",
            "-stream_loop", "-1",
            "-i", concat_video.name,
            "-i", audio_path,
            "-t", str(audio_duration + 3),
            "-c:v", "libx264",
            "-c:a", "aac",
            "-vf", "fade=t=out:st={}:d=3".format(audio_duration),
            "-af", "afade=t=out:st={}:d=3".format(audio_duration),
            "-shortest",
            final_video.name
        ], check=True)
    return final_video.name
# Main pipeline: TTS -> optional background music -> optional video
async def tts_interface(text, voice, rate, pitch, bg_music, video_files):
    """Run the full pipeline and return (audio_path, video_path, warning).

    Any stage failure is surfaced as a gr.Warning in the third slot with the
    first two slots set to None.
    """
    # Generate the main speech audio
    audio_path, warning = await text_to_speech(text, voice, rate, pitch)
    if warning:
        return None, None, gr.Warning(warning)
    try:
        # Mix in background music, replacing the speech-only file.
        if bg_music:
            speech_only = audio_path
            audio_path = add_background_music(audio_path, bg_music)
            # The speech-only temp file is now superseded; remove it so it
            # doesn't leak. (Fixed: the old `finally` block deleted the
            # *returned* audio file on every call, breaking the success path.)
            if os.path.exists(speech_only):
                os.remove(speech_only)
        # Build the video only when clips were uploaded.
        if video_files:
            video_path = process_videos(audio_path, video_files)
        else:
            video_path = None
        return audio_path, video_path, None
    except Exception as e:
        # Clean up the audio temp file only on failure — on success Gradio
        # still needs it to serve the result.
        if audio_path and os.path.exists(audio_path):
            os.remove(audio_path)
        return None, None, gr.Warning(f"Processing error: {str(e)}")
# Build the Gradio interface
async def create_demo():
    """Assemble and return the Gradio Interface for the TTS/video app."""
    voices = await get_voices()

    input_widgets = [
        gr.Textbox(label="Input Text", lines=5, placeholder="Enter your text here..."),
        gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice"),
        gr.Slider(-50, 50, 0, label="Speech Rate (%)"),
        gr.Slider(-20, 20, 0, label="Pitch (Hz)"),
        gr.Audio(label="Background Music", type="filepath"),
        gr.File(label="Upload Videos", file_types=[".mp4", ".mov"], file_count="multiple"),
    ]
    output_widgets = [
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.Video(label="Final Video"),
        gr.Markdown(visible=False),
    ]
    sample_inputs = [
        ["Hello world! This is a test with multiple videos.",
         "en-US-JennyNeural - en-US (Female)",
         0, 0, None, None]
    ]

    return gr.Interface(
        fn=tts_interface,
        inputs=input_widgets,
        outputs=output_widgets,
        title="Multi-Video TTS with Loop",
        description="Create videos with: 1. Text-to-speech 2. Background music 3. Multiple video loop",
        examples=sample_inputs,
        css="#component-0 {max-width: 800px}"
    )
async def main():
    """Build the demo, enable request queueing, and launch the server."""
    demo = await create_demo()
    demo.queue()
    demo.launch()


if __name__ == "__main__":
    asyncio.run(main())