daswer123 commited on
Commit
56064c3
·
verified ·
1 Parent(s): cb4661b

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +149 -0
  2. requirements.txt +4 -0
  3. wrapper.py +199 -0
app.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import json
4
+ from wrapper import GeminiTTSWrapper
5
+ from dotenv import load_dotenv
6
+
7
# Load variables from a local .env file (if any) into the process environment
# so that load_config() can read GOOGLE_API_KEY from it.
load_dotenv()

# Constants
# File-based configuration is currently disabled:
# CONFIG_DIR = "config"
# CONFIG_FILE = f"{CONFIG_DIR}/config.json"
DEFAULT_OUTPUT_FILE = None  # Lets the wrapper generate a timestamped file name

# os.makedirs(CONFIG_DIR, exist_ok=True)
14
+
15
+ # Загрузка сохраненного API ключа, если существует
16
def load_config():
    """Build the application configuration from the process environment.

    Returns:
        dict: A single-entry mapping ``{"api_key": ...}`` holding the value
        of the ``GOOGLE_API_KEY`` environment variable, or an empty string
        when the variable is not set.
    """
    return {"api_key": os.environ.get("GOOGLE_API_KEY", "")}
19
+
20
+ # Сохранение API ключа
21
+ # def save_config(api_key):
22
+ # with open(CONFIG_FILE, 'w') as f:
23
+ # json.dump({"api_key": api_key}, f)
24
+
25
# Initialize the shared TTS wrapper from the loaded configuration.
config = load_config()
tts_wrapper = GeminiTTSWrapper(config.get("api_key", ""))
28
+
29
def update_api_key(api_key):
    """Apply a new API key to the shared TTS wrapper.

    The key is only kept in memory: file persistence (``save_config``) is
    disabled, so the status message no longer claims the key was saved.

    Args:
        api_key: Key typed into the UI. Surrounding whitespace is ignored.

    Returns:
        str: A status message for the UI (success, missing key, or the
        error raised while creating the client).
    """
    api_key = (api_key or "").strip()
    if not api_key:
        # This handler fires on every textbox change, including clearing the
        # field; never hand an empty key to the client constructor.
        return "Требуется API ключ"

    try:
        # save_config(api_key)
        tts_wrapper.set_api_key(api_key)
    except Exception as e:
        # Surface client construction problems in the status box instead of
        # letting the event handler crash.
        return f"Ошибка: {str(e)}"
    return "API ключ обновлен"
34
+
35
def generate_speech(api_key, model, voice, instructions, text, mp3_format):
    """Generate speech through the shared TTS wrapper for the Gradio UI.

    Args:
        api_key: Gemini API key from the UI; surrounding whitespace is ignored.
        model: Name of the TTS model to use.
        voice: Prebuilt voice name.
        instructions: Optional style instructions for the voice.
        text: Text to convert to speech.
        mp3_format: Whether the result should be converted to MP3.

    Returns:
        tuple: ``(audio_path, status_message)``. ``audio_path`` is ``None``
        whenever validation fails, the wrapper raises, or no audio was
        produced.
    """
    api_key = (api_key or "").strip()
    if not api_key:
        return None, "Требуется API ключ"

    # Whitespace-only input is treated like empty input.
    if not text or not text.strip():
        return None, "Требуется текст"

    try:
        # Re-create the client only when the key actually changed.
        if api_key != tts_wrapper.api_key:
            tts_wrapper.set_api_key(api_key)
            # save_config(api_key)

        output_path = tts_wrapper.generate_speech(
            text=text,
            model=model,
            voice=voice,
            instructions=instructions,
            output_file=DEFAULT_OUTPUT_FILE,
            convert_to_mp3=mp3_format,
        )
    except Exception as e:
        # UI boundary: report any failure in the status box.
        return None, f"Ошибка: {str(e)}"

    if not output_path:
        # The wrapper returns None when the response carried no audio; do not
        # report that as "saved to None".
        return None, "Ошибка: модель не вернула аудио"

    return output_path, f"Сгенерированная речь сохранена в {output_path}"
62
+
63
# Fetch the list of available voices.
available_voices = tts_wrapper.list_available_voices()
65
+
66
+ # Создание интерфейса Gradio
67
with gr.Blocks(title="Google Gemini TTS") as app:
    # API key input.
    # The key is deliberately NOT printed: writing it to stdout would leak
    # the secret into server logs.
    # NOTE(review): pre-filling the field from the environment presumably
    # sends the server's key to every visitor's browser (type="password"
    # only masks it on screen) - confirm this is intended.
    api_key_input = gr.Textbox(
        label="API ключ Gemini",
        value=os.getenv("GOOGLE_API_KEY", ""),
        type="password",
    )
    gr.Markdown("# Преобразование текста в речь Google Gemini")
    with gr.Row():
        with gr.Column(scale=1):
            # Optional voice/style instructions.
            instructions_input = gr.TextArea(
                label="Инструкции для голоса (необязательно)",
                placeholder="например, 'Говорить спокойно и медленно', 'Говорить взволнованным тоном'",
            )

            # MP3 conversion flag (hidden, enabled by default).
            mp3_checkbox = gr.Checkbox(
                label="Конвертировать в MP3 формат",
                value=True,
                visible=False,
            )

            text_input = gr.Textbox(
                label="Текст для преобразования в речь",
                placeholder="Введите ваш текст здесь...",
                lines=15,
            )

            # Voice selection.
            voice_dropdown = gr.Dropdown(
                label="Голос",
                choices=available_voices,
                value="Laomedeia",
            )

            # Model selection.
            model_dropdown = gr.Dropdown(
                label="Модель",
                choices=[
                    "gemini-2.5-pro-preview-tts",
                    "gemini-2.5-flash-preview-tts",
                ],
                value="gemini-2.5-flash-preview-tts",
            )

        with gr.Column(scale=1):
            # Generated audio and status output.
            audio_output = gr.Audio(label="Сгенерированная речь")
            generation_status = gr.Textbox(label="Статус генерации", interactive=False)

            # Generate button.
            generate_btn = gr.Button("Сгенерировать речь", variant="primary")

    # Event handlers.
    api_key_input.change(
        update_api_key,
        inputs=[api_key_input],
        outputs=[generation_status],
    )

    generate_btn.click(
        generate_speech,
        inputs=[
            api_key_input,
            model_dropdown,
            voice_dropdown,
            instructions_input,
            text_input,
            mp3_checkbox,
        ],
        outputs=[
            audio_output,
            generation_status,
        ],
    )
146
+
147
if __name__ == "__main__":
    # Script entry point: enable the request queue, then start the server
    # with the host and port the app is configured to listen on.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7567}
    app.queue()
    app.launch(**launch_options)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ google-genai
2
+ gradio
3
+ ffmpy
4
+ python-dotenv
wrapper.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import mimetypes
3
+ import os
4
+ import struct
5
+ from google import genai
6
+ from google.genai import types
7
+ import ffmpy
8
+ import datetime
9
+
10
+
11
class GeminiTTSWrapper:
    """Convenience wrapper around the Gemini text-to-speech models.

    Holds a ``genai.Client`` built from an API key and turns text into an
    audio file written below the local ``output/`` directory.
    """

    def __init__(self, api_key=None):
        """Initialize the Gemini TTS wrapper with an optional API key.

        Args:
            api_key (str | None): Gemini API key. When falsy, no client is
                created until ``set_api_key()`` is called.
        """
        self.api_key = api_key
        self.client = None
        # Create output directory if it doesn't exist
        os.makedirs("output", exist_ok=True)
        if api_key:
            self.set_api_key(api_key)

    def set_api_key(self, api_key):
        """Set or update the API key and (re)initialize the client.

        Returns:
            GeminiTTSWrapper: ``self``, so calls can be chained.
        """
        self.api_key = api_key
        self.client = genai.Client(api_key=api_key)
        return self

    def generate_speech(self, text, model="gemini-2.5-pro-preview-tts", voice="Laomedeia",
                        instructions="", temperature=1.0, output_file=None,
                        convert_to_mp3=True):
        """
        Generate speech from text using Gemini TTS models.

        All audio chunks of the streamed response are collected first and
        written out once, so multi-chunk responses are saved completely
        instead of each chunk overwriting the previous one.

        Args:
            text (str): The text to convert to speech
            model (str): Model to use (gemini-2.5-pro-preview-tts or gemini-2.5-flash-preview-tts)
            voice (str): Prebuilt voice name to use
            instructions (str): Optional instructions for controlling style, tone, accent, etc.
            temperature (float): Sampling temperature passed to the model
            output_file (str): Output filename (without extension)
            convert_to_mp3 (bool): Whether to convert the output to MP3 format

        Returns:
            str | None: Path to the saved audio file, or None when the
            response contained no audio. If MP3 conversion fails, the path of
            the unconverted file is returned.

        Raises:
            ValueError: If no API key / client has been set.
        """
        if not self.client:
            raise ValueError("API key not set. Call set_api_key() first.")

        output_base = self._resolve_output_base(output_file)

        # Prepare the content with instructions if provided
        content_text = f"{instructions}:\n{text}" if instructions else text

        contents = [
            types.Content(
                role="user",
                parts=[types.Part.from_text(text=content_text)],
            ),
        ]

        generate_content_config = types.GenerateContentConfig(
            temperature=temperature,
            response_modalities=["audio"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name=voice
                    )
                )
            ),
        )

        stream = self.client.models.generate_content_stream(
            model=model,
            contents=contents,
            config=generate_content_config,
        )
        audio_chunks, mime_type = self._collect_audio(stream)
        if not audio_chunks:
            return None

        audio_bytes = b"".join(audio_chunks)
        file_extension = mimetypes.guess_extension(mime_type)
        if file_extension is None:
            # Unknown container (e.g. raw "audio/L16;rate=24000" PCM): wrap the
            # concatenated samples in a single WAV header.
            file_extension = ".wav"
            audio_bytes = self._convert_to_wav(audio_bytes, mime_type)
        # NOTE: for recognised container types the chunks are concatenated
        # as-is (best effort).

        audio_file_path = f"{output_base}{file_extension}"
        self._save_binary_file(audio_file_path, audio_bytes)

        # Convert to MP3 if requested (pointless when the data already is MP3,
        # and ffmpeg cannot convert a file onto itself).
        if convert_to_mp3 and file_extension != ".mp3":
            # _convert_to_mp3 falls back to the input path on failure, so the
            # returned path always refers to a file that exists.
            return self._convert_to_mp3(audio_file_path, f"{output_base}.mp3")

        return audio_file_path

    def _resolve_output_base(self, output_file):
        """Return the extension-less output path below ``output/``."""
        # Generate timestamp for filename
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

        # If no output file specified, create one with timestamp
        if output_file is None:
            return f"output/gemini_tts_{timestamp}"
        if not output_file.startswith("output/"):
            return f"output/{output_file}_{timestamp}"
        return output_file

    def _collect_audio(self, stream):
        """Gather inline audio payloads from a streamed response.

        Returns:
            tuple: ``(list_of_bytes, mime_type)`` where ``mime_type`` is that
            of the first audio chunk ("" when none was seen or it was unset).
        """
        audio_chunks = []
        mime_type = ""
        for chunk in stream:
            candidates = chunk.candidates
            # Guard against missing *and* empty candidates / parts.
            if (
                not candidates
                or candidates[0].content is None
                or not candidates[0].content.parts
            ):
                continue

            inline_data = candidates[0].content.parts[0].inline_data
            if inline_data and inline_data.data:
                if not audio_chunks:
                    mime_type = inline_data.mime_type or ""
                audio_chunks.append(inline_data.data)
            else:
                print(chunk.text)
        return audio_chunks, mime_type

    def _save_binary_file(self, file_name, data):
        """Save binary data to a file and return the file name."""
        with open(file_name, "wb") as f:
            f.write(data)
        return file_name

    def _convert_to_wav(self, audio_data, mime_type):
        """Prepend a 44-byte PCM WAV header to raw mono audio samples.

        Sample rate and bit depth are taken from ``mime_type``.
        """
        parameters = self._parse_audio_mime_type(mime_type)
        bits_per_sample = parameters["bits_per_sample"]
        sample_rate = parameters["rate"]
        num_channels = 1
        data_size = len(audio_data)
        bytes_per_sample = bits_per_sample // 8
        block_align = num_channels * bytes_per_sample
        byte_rate = sample_rate * block_align
        chunk_size = 36 + data_size  # 36 bytes for header fields before data chunk size

        # http://soundfile.sapp.org/doc/WaveFormat/
        header = struct.pack(
            "<4sI4s4sIHHIIHH4sI",
            b"RIFF",          # ChunkID
            chunk_size,       # ChunkSize (total file size - 8 bytes)
            b"WAVE",          # Format
            b"fmt ",          # Subchunk1ID
            16,               # Subchunk1Size (16 for PCM)
            1,                # AudioFormat (1 for PCM)
            num_channels,     # NumChannels
            sample_rate,      # SampleRate
            byte_rate,        # ByteRate
            block_align,      # BlockAlign
            bits_per_sample,  # BitsPerSample
            b"data",          # Subchunk2ID
            data_size,        # Subchunk2Size (size of audio data)
        )
        return header + audio_data

    def _convert_to_mp3(self, input_file, output_file):
        """Convert audio file to MP3 format using ffmpeg (best effort).

        Returns:
            str: ``output_file`` on success, ``input_file`` if the
            conversion failed.
        """
        try:
            converter = ffmpy.FFmpeg(
                # -y: overwrite an existing target instead of failing/prompting.
                global_options="-y",
                inputs={input_file: None},
                outputs={output_file: None},
            )
            converter.run()
            return output_file
        except Exception as e:
            # Deliberate best effort: keep the unconverted file usable.
            print(f"Error converting to MP3: {str(e)}")
            return input_file

    def _parse_audio_mime_type(self, mime_type):
        """Parse audio parameters from a MIME type such as ``audio/L16;rate=24000``.

        Returns:
            dict: ``{"bits_per_sample": int, "rate": int}``; defaults are
            16 bits and 24000 Hz when a value is missing or malformed.
        """
        bits_per_sample = 16
        rate = 24000

        for param in (mime_type or "").split(";"):
            param = param.strip()
            lowered = param.lower()
            if lowered.startswith("rate="):
                try:
                    rate = int(param.split("=", 1)[1])
                except ValueError:
                    pass  # Keep rate as default
            elif lowered.startswith("audio/l"):
                # Matched case-insensitively, like the rate parameter.
                try:
                    bits_per_sample = int(param[len("audio/L"):])
                except ValueError:
                    pass  # Keep bits_per_sample as default if conversion fails

        return {"bits_per_sample": bits_per_sample, "rate": rate}

    def list_available_voices(self):
        """Return a list of available voice options."""
        return [
            "Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus", "Aoede",
            "Callirhoe", "Autonoe", "Enceladus", "Iapetus", "Umbriel", "Algieba",
            "Despina", "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar",
            "Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird", "Zubenelgenubi",
            "Vindemiatrix", "Sadachbia", "Sadalthager", "Sulafat",
        ]