import io
import os
import tempfile
from typing import List, Optional

import TTS.api
import TTS.utils.manage as manage
import torch
from pydub import AudioSegment
import gradio as gr  # Gradio library

import config

device = "cuda" if torch.cuda.is_available() else "cpu"


# Define a function that automatically accepts the terms of service.
def ask_tos_patch(self, output_path):
    """Stand-in for ``ModelManager.ask_tos`` that always answers yes.

    Prints a notice and returns ``True`` without prompting; ``output_path``
    is accepted only to match the signature of the method it replaces.
    """
    print("Automatically accepting the terms of service.")
    return True


# Replace the original ask_tos method with the function defined above.
manage.ModelManager.ask_tos = ask_tos_patch

tts = TTS.api.TTS()
models = {}
# Download every model listed in config.models and keep a loaded instance
# (moved to the selected device) under the same key.
for model_id, model_name in config.models.items():
    tts.download_model_by_name(model_name)
    models[model_id] = TTS.api.TTS(model_name).to(device)


def synthesize_tts(
    text: str = 'Hello, World!',
    speaker_wavs: Optional[List[gr.File]] = None,
    speaker_idx: str = 'Ana Florence',
    language: str = 'ja',
    temperature: float = 0.65,
    length_penalty: float = 1.0,
    repetition_penalty: float = 2.0,
    top_k: int = 50,
    top_p: float = 0.8,
    speed: float = 1.0,
    enable_text_splitting: bool = True,
):
    """Synthesize ``text`` with ``models['multi']`` and return the audio bytes.

    When reference recordings are uploaded they are converted to WAV, written
    to temporary files and passed as ``speaker_wav``; otherwise the built-in
    speaker named by ``speaker_idx`` is used.

    Args:
        text: Text to synthesize.
        speaker_wavs: Optional uploads from the Gradio file input. Each item
            may be an object exposing the path as ``.name`` or a plain path
            string.
        speaker_idx: Speaker name used when no reference audio is supplied.
        language: Language code forwarded to ``tts_to_file``.
        temperature, length_penalty, repetition_penalty, top_k, top_p, speed,
        enable_text_splitting: Forwarded unchanged to ``tts_to_file``.

    Returns:
        The bytes that ``tts_to_file`` wrote into an in-memory buffer.

    Raises:
        gr.Error: If an uploaded file cannot be decoded or converted to WAV.
    """
    # NOTE(review): assumes config.models defines a 'multi' entry - confirm.
    temp_files = []
    try:
        for speaker_wav in speaker_wavs or []:
            # Tolerate both upload objects (path in `.name`) and bare paths.
            source_path = getattr(speaker_wav, "name", speaker_wav)
            with open(source_path, "rb") as f:
                speaker_wav_bytes = f.read()

            # Convert the uploaded audio to WAV using pydub.
            try:
                audio = AudioSegment.from_file(io.BytesIO(speaker_wav_bytes))
                wav_buffer = io.BytesIO()
                audio.export(wav_buffer, format="wav")
            except Exception as e:
                # Returning a string here would be handed to the gr.Audio
                # output as if it were a file path; raise so Gradio shows
                # the message to the user instead.
                raise gr.Error(f"Error processing audio file: {e}") from e

            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav_file:
                # Register the path before writing so the `finally` below
                # removes the file even if the write fails.
                temp_files.append(temp_wav_file.name)
                temp_wav_file.write(wav_buffer.getvalue())

        # Voice cloning when reference audio exists, built-in speaker otherwise.
        if temp_files:
            voice_kwargs = {"speaker_wav": temp_files}
        else:
            voice_kwargs = {"speaker": speaker_idx}

        output_buffer = io.BytesIO()
        models['multi'].tts_to_file(
            text=text,
            language=language,
            file_path=output_buffer,
            temperature=temperature,
            length_penalty=length_penalty,
            repetition_penalty=repetition_penalty,
            top_k=top_k,
            top_p=top_p,
            speed=speed,
            enable_text_splitting=enable_text_splitting,
            **voice_kwargs,
        )
        return output_buffer.getvalue()
    finally:
        # Always remove the temporary reference WAV files.
        for temp_file in temp_files:
            if isinstance(temp_file, str) and os.path.exists(temp_file):
                os.remove(temp_file)


# Build the Gradio interface. The order of `inputs` must match the positional
# parameter order of synthesize_tts.
inputs = [
    gr.Textbox(value="Hello, World!", label="Text to Synthesize"),
    gr.File(file_types=["audio"], label="Speaker WAV files (optional)", file_count="multiple"),
    gr.Dropdown(
        choices=[
            "Claribel Dervla", "Daisy Studious", "Gracie Wise", "Tammie Ema",
            "Alison Dietlinde", "Ana Florence", "Annmarie Nele", "Asya Anara",
            "Brenda Stern", "Gitta Nikolina", "Henriette Usha", "Sofia Hellen",
            "Tammy Grit", "Tanja Adelina", "Vjollca Johnnie", "Andrew Chipper",
            "Badr Odhiambo", "Dionisio Schuyler", "Royston Min", "Viktor Eka",
            "Abrahan Mack", "Adde Michal", "Baldur Sanjin", "Craig Gutsy",
            "Damien Black", "Gilberto Mathias", "Ilkin Urbano", "Kazuhiko Atallah",
            "Ludvig Milivoj", "Suad Qasim", "Torcull Diarmuid", "Viktor Menelaos",
            "Zacharie Aimilios", "Nova Hogarth", "Maja Ruoho", "Uta Obando",
            "Lidiya Szekeres", "Chandra MacFarland", "Szofi Granger", "Camilla Holmström",
            "Lilya Stainthorpe", "Zofija Kendrick", "Narelle Moon", "Barbora MacLean",
            "Alexandra Hisakawa", "Alma María", "Rosemary Okafor", "Ige Behringer",
            "Filip Traverse", "Damjan Chapman", "Wulf Carlevaro", "Aaron Dreschner",
            "Kumar Dahl", "Eugenio Mataracı", "Ferran Simen", "Xavier Hayasaka",
            "Luis Moray", "Marcos Rudaski",
        ],
        value="Ana Florence",
        label="Speaker Index",
    ),
    gr.Textbox(value="en", label="Language"),
    gr.Slider(0, 1, value=0.65, step=0.01, label="Temperature"),
    gr.Slider(0.5, 2, value=1.0, step=0.1, label="Length Penalty"),
    gr.Slider(1.0, 10.0, value=2.0, step=0.1, label="Repetition Penalty"),
    gr.Slider(1, 100, value=50, step=1, label="Top-K"),
    gr.Slider(0, 1, value=0.8, step=0.01, label="Top-P"),
    gr.Slider(0.5, 2, value=1.0, step=0.01, label="Speed"),
    gr.Checkbox(value=True, label="Enable Text Splitting"),
]

outputs = gr.Audio(label="Generated Speech")

gr.Interface(
    fn=synthesize_tts,
    inputs=inputs,
    outputs=outputs,
    title="Text-to-Speech Synthesis with Gradio",
).launch()