Spaces:

daswer123
/

googletts

Runtime error

App Files Files Community

daswer123 committed on May 21

Commit

56064c3

verified ·

1 Parent(s): cb4661b

Upload 3 files

Browse files

Files changed (3) hide show

app.py +149 -0
requirements.txt +4 -0
wrapper.py +199 -0

app.py ADDED Viewed

	@@ -0,0 +1,149 @@

+import os
+import gradio as gr
+import json
+from wrapper import GeminiTTSWrapper
+from dotenv import load_dotenv
+load_dotenv()
+# Constants
+# NOTE: file-based configuration is currently disabled; the related lines are kept commented out.
+# CONFIG_DIR = "config"
+# CONFIG_FILE = f"{CONFIG_DIR}/config.json"
+DEFAULT_OUTPUT_FILE = None  # None lets the wrapper generate a timestamped file name
+# os.makedirs(CONFIG_DIR, exist_ok=True)
+# Загрузка сохраненного API ключа, если существует
+def load_config():
+    api_key = os.getenv("GOOGLE_API_KEY", "")
+    return {"api_key": api_key}
+# Saving the API key (disabled: persistence to a config file is commented out)
+# def save_config(api_key):
+#     with open(CONFIG_FILE, 'w') as f:
+#         json.dump({"api_key": api_key}, f)
+# Initialise the shared wrapper from the loaded configuration
+config = load_config()
+# The wrapper is created with whatever key the configuration provides (possibly an empty string).
+tts_wrapper = GeminiTTSWrapper(config.get("api_key", ""))
+def update_api_key(api_key):
+    """Обновление API ключа и сохранение его в файл конфигурации."""
+    # save_config(api_key)
+    tts_wrapper.set_api_key(api_key)
+    return "API ключ обновлен и сохранен"
+def generate_speech(api_key, model, voice, instructions, text, mp3_format):
+    """Генерация речи с помощью обертки."""
+    if not api_key:
+        return None, "Требуется API ключ"
+    if not text:
+        return None, "Требуется текст"
+    try:
+        # Обновление API ключа, если он изменился
+        if api_key != tts_wrapper.api_key:
+            tts_wrapper.set_api_key(api_key)
+            # save_config(api_key)
+        # Генерация речи
+        output_path = tts_wrapper.generate_speech(
+            text=text,
+            model=model,
+            voice=voice,
+            instructions=instructions,
+            output_file=DEFAULT_OUTPUT_FILE,
+            convert_to_mp3=mp3_format
+        )
+        return output_path, f"Сгенерированная речь сохранена в {output_path}"
+    except Exception as e:
+        return None, f"Ошибка: {str(e)}"
+# Получение списка доступных голосов
+available_voices = tts_wrapper.list_available_voices()
+# Создание интерфейса Gradio
+with gr.Blocks(title="Google Gemini TTS") as app:
+    # Ввод API ключа
+    print(os.getenv("GOOGLE_API_KEY", ""))
+    api_key_input = gr.Textbox(
+                label="API ключ Gemini",
+                value=os.getenv("GOOGLE_API_KEY", ""),
+                type="password"
+            )
+    gr.Markdown("# Преобразование текста в речь Google Gemini")
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Ввод инструкций
+            instructions_input = gr.TextArea(
+                label="Инструкции для голоса (необязательно)",
+                placeholder="например, 'Говорить спокойно и медленно', 'Говорить взволнованным тоном'"
+            )
+            # Флажок MP3 формата
+            mp3_checkbox = gr.Checkbox(
+                label="Конвертировать в MP3 формат",
+                value=True,
+                visible=False
+            )
+            text_input = gr.Textbox(
+                label="Текст для преобразования в речь",
+                placeholder="Введите ваш текст здесь...",
+                lines=15
+            )
+            # Выбор голоса
+            voice_dropdown = gr.Dropdown(
+                label="Голос",
+                choices=available_voices,
+                value="Laomedeia"
+            )
+            # Выбор модели
+            model_dropdown = gr.Dropdown(
+                label="Модель",
+                choices=[
+                    "gemini-2.5-pro-preview-tts",
+                    "gemini-2.5-flash-preview-tts"
+                ],
+                value="gemini-2.5-flash-preview-tts"
+            )
+        with gr.Column(scale=1):
+            # Вывод сгенерированного аудио
+            audio_output = gr.Audio(label="Сгенерированная речь")
+            generation_status = gr.Textbox(label="Статус генерации", interactive=False)
+            # Кнопка генерации
+            generate_btn = gr.Button("Сгенерировать речь", variant="primary")
+    # Настройка обработчиков событий
+    api_key_input.change(
+        update_api_key,
+        inputs=[api_key_input],
+        outputs=[generation_status]
+    )
+    generate_btn.click(
+        generate_speech,
+        inputs=[
+            api_key_input,
+            model_dropdown,
+            voice_dropdown,
+            instructions_input,
+            text_input,
+            mp3_checkbox
+        ],
+        outputs=[
+            audio_output,
+            generation_status
+        ]
+    )
+if __name__ == "__main__":
+    app.queue()
+    app.launch(server_name="0.0.0.0", server_port=7567)

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+google-genai
+gradio
+ffmpy
+python-dotenv

wrapper.py ADDED Viewed

	@@ -0,0 +1,199 @@

+import base64
+import mimetypes
+import os
+import struct
+from google import genai
+from google.genai import types
+import ffmpy
+import datetime
+class GeminiTTSWrapper:
+    def __init__(self, api_key=None):
+        """Initialize the Gemini TTS wrapper with an API key."""
+        self.api_key = api_key
+        self.client = None
+        # Create output directory if it doesn't exist
+        os.makedirs("output", exist_ok=True)
+        if api_key:
+            self.set_api_key(api_key)
+    def set_api_key(self, api_key):
+        """Set or update the API key and initialize the client."""
+        self.api_key = api_key
+        self.client = genai.Client(api_key=api_key)
+        return self
+    def generate_speech(self, text, model="gemini-2.5-pro-preview-tts", voice="Laomedeia",
+                         instructions="", temperature=1.0, output_file=None,
+                         convert_to_mp3=True):
+        """
+        Generate speech from text using Gemini TTS models.
+        Args:
+            text (str): The text to convert to speech
+            model (str): Model to use (gemini-2.5-pro-preview-tts or gemini-2.5-flash-preview-tts)
+            voice (str): Prebuilt voice name to use
+            instructions (str): Optional instructions for controlling style, tone, accent, etc.
+            temperature (float): Sampling temperature (0.0 to 1.0)
+            output_file (str): Output filename (without extension)
+            convert_to_mp3 (bool): Whether to convert the output to MP3 format
+        Returns:
+            str: Path to the saved audio file
+        """
+        if not self.client:
+            raise ValueError("API key not set. Call set_api_key() first.")
+        # Generate timestamp for filename
+        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        # If no output file specified, create one with timestamp
+        if output_file is None:
+            output_file = f"output/gemini_tts_{timestamp}"
+        elif not output_file.startswith("output/"):
+            output_file = f"output/{output_file}_{timestamp}"
+        # Prepare the content with instructions if provided
+        if instructions:
+            content_text = f"{instructions}:\n{text}"
+        else:
+            content_text = text
+        contents = [
+            types.Content(
+                role="user",
+                parts=[types.Part.from_text(text=content_text)],
+            ),
+        ]
+        generate_content_config = types.GenerateContentConfig(
+            temperature=temperature,
+            response_modalities=["audio"],
+            speech_config=types.SpeechConfig(
+                voice_config=types.VoiceConfig(
+                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
+                        voice_name=voice
+                    )
+                )
+            ),
+        )
+        file_path = None
+        for chunk in self.client.models.generate_content_stream(
+            model=model,
+            contents=contents,
+            config=generate_content_config,
+        ):
+            if (
+                chunk.candidates is None
+                or chunk.candidates[0].content is None
+                or chunk.candidates[0].content.parts is None
+            ):
+                continue
+            if chunk.candidates[0].content.parts[0].inline_data:
+                inline_data = chunk.candidates[0].content.parts[0].inline_data
+                data_buffer = inline_data.data
+                file_extension = mimetypes.guess_extension(inline_data.mime_type)
+                if file_extension is None:
+                    file_extension = ".wav"
+                    data_buffer = self._convert_to_wav(inline_data.data, inline_data.mime_type)
+                wav_file_path = f"{output_file}{file_extension}"
+                self._save_binary_file(wav_file_path, data_buffer)
+                file_path = wav_file_path
+                # Convert to MP3 if requested
+                if convert_to_mp3:
+                    mp3_file_path = f"{output_file}.mp3"
+                    self._convert_to_mp3(wav_file_path, mp3_file_path)
+                    file_path = mp3_file_path
+            else:
+                print(chunk.text)
+        return file_path
+    def _save_binary_file(self, file_name, data):
+        """Save binary data to a file."""
+        with open(file_name, "wb") as f:
+            f.write(data)
+        return file_name
+    def _convert_to_wav(self, audio_data, mime_type):
+        """Convert audio data to WAV format."""
+        parameters = self._parse_audio_mime_type(mime_type)
+        bits_per_sample = parameters["bits_per_sample"]
+        sample_rate = parameters["rate"]
+        num_channels = 1
+        data_size = len(audio_data)
+        bytes_per_sample = bits_per_sample // 8
+        block_align = num_channels * bytes_per_sample
+        byte_rate = sample_rate * block_align
+        chunk_size = 36 + data_size  # 36 bytes for header fields before data chunk size
+        # http://soundfile.sapp.org/doc/WaveFormat/
+        header = struct.pack(
+            "<4sI4s4sIHHIIHH4sI",
+            b"RIFF",          # ChunkID
+            chunk_size,       # ChunkSize (total file size - 8 bytes)
+            b"WAVE",          # Format
+            b"fmt ",          # Subchunk1ID
+            16,               # Subchunk1Size (16 for PCM)
+            1,                # AudioFormat (1 for PCM)
+            num_channels,     # NumChannels
+            sample_rate,      # SampleRate
+            byte_rate,        # ByteRate
+            block_align,      # BlockAlign
+            bits_per_sample,  # BitsPerSample
+            b"data",          # Subchunk2ID
+            data_size         # Subchunk2Size (size of audio data)
+        )
+        return header + audio_data
+    def _convert_to_mp3(self, input_file, output_file):
+        """Convert audio file to MP3 format using ffmpeg."""
+        try:
+            converter = ffmpy.FFmpeg(
+                inputs={input_file: None},
+                outputs={output_file: None}
+            )
+            converter.run()
+            return output_file
+        except Exception as e:
+            print(f"Error converting to MP3: {str(e)}")
+            return input_file
+    def _parse_audio_mime_type(self, mime_type):
+        """Parse audio parameters from MIME type."""
+        bits_per_sample = 16
+        rate = 24000
+        # Extract rate from parameters
+        parts = mime_type.split(";")
+        for param in parts:
+            param = param.strip()
+            if param.lower().startswith("rate="):
+                try:
+                    rate_str = param.split("=", 1)[1]
+                    rate = int(rate_str)
+                except (ValueError, IndexError):
+                    pass  # Keep rate as default
+            elif param.startswith("audio/L"):
+                try:
+                    bits_per_sample = int(param.split("L", 1)[1])
+                except (ValueError, IndexError):
+                    pass  # Keep bits_per_sample as default if conversion fails
+        return {"bits_per_sample": bits_per_sample, "rate": rate}
+    def list_available_voices(self):
+        """Return a list of available voice options."""
+        return [
+            "Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus", "Aoede",
+            "Callirhoe", "Autonoe", "Enceladus", "Iapetus", "Umbriel", "Algieba",
+            "Despina", "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar",
+            "Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird", "Zubenelgenubi",
+            "Vindemiatrix", "Sadachbia", "Sadalthager", "Sulafat"
+        ]