Upload 3 files
Browse files- app.py +149 -0
- requirements.txt +4 -0
- wrapper.py +199 -0
app.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
import json
|
4 |
+
from wrapper import GeminiTTSWrapper
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
|
7 |
+
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Constants
# CONFIG_DIR = "config"
# CONFIG_FILE = f"{CONFIG_DIR}/config.json"

# None lets the wrapper generate a timestamped output filename on its own.
DEFAULT_OUTPUT_FILE = None
|
12 |
+
|
13 |
+
# os.makedirs(CONFIG_DIR, exist_ok=True)
|
14 |
+
|
15 |
+
# Load the API key from the environment, if one is set
def load_config():
    """Return the app configuration as a dict with a single ``api_key`` entry.

    The key is read from the ``GOOGLE_API_KEY`` environment variable and
    falls back to an empty string when the variable is not defined.
    """
    return {"api_key": os.getenv("GOOGLE_API_KEY", "")}
|
19 |
+
|
20 |
+
# Сохранение API ключа
|
21 |
+
# def save_config(api_key):
|
22 |
+
# with open(CONFIG_FILE, 'w') as f:
|
23 |
+
# json.dump({"api_key": api_key}, f)
|
24 |
+
|
25 |
+
# Create the shared TTS wrapper from the loaded configuration
config = load_config()
tts_wrapper = GeminiTTSWrapper(api_key=config.get("api_key", ""))
|
28 |
+
|
29 |
+
def update_api_key(api_key):
    """Apply a new API key to the shared TTS wrapper.

    The key is only kept in memory on the wrapper; persisting it to a
    config file is currently disabled (see the commented-out save_config).

    Args:
        api_key: Key typed into the UI textbox. May be empty while the user
            is still editing the field.

    Returns:
        str: Status message shown in the UI.
    """
    # An empty field must not reach the wrapper: building a client without a
    # key fails, and this handler fires on every change of the textbox.
    if not api_key:
        return "Требуется API ключ"

    # save_config(api_key)
    tts_wrapper.set_api_key(api_key)
    return "API ключ обновлен и сохранен"
|
34 |
+
|
35 |
+
def generate_speech(api_key, model, voice, instructions, text, mp3_format):
    """Generate speech through the shared wrapper for the Gradio UI.

    Args:
        api_key: API key from the UI; required.
        model: Name of the TTS model to use.
        voice: Name of the prebuilt voice.
        instructions: Optional style instructions placed before the text.
        text: Text to convert to speech; required.
        mp3_format: Whether the result should be converted to MP3.

    Returns:
        tuple: ``(audio_path, status_message)``. ``audio_path`` is ``None``
        whenever validation fails, generation raises, or no audio came back.
    """
    if not api_key:
        return None, "Требуется API ключ"

    if not text:
        return None, "Требуется текст"

    try:
        # Refresh the client only when the key actually changed
        if api_key != tts_wrapper.api_key:
            tts_wrapper.set_api_key(api_key)
            # save_config(api_key)

        # Generate the speech
        output_path = tts_wrapper.generate_speech(
            text=text,
            model=model,
            voice=voice,
            instructions=instructions,
            output_file=DEFAULT_OUTPUT_FILE,
            convert_to_mp3=mp3_format
        )
    except Exception as e:
        # UI boundary: surface any failure as a status message instead of
        # letting the handler crash.
        return None, f"Ошибка: {str(e)}"

    # The wrapper returns None when the response contained no audio; do not
    # report that as a saved file.
    if output_path is None:
        return None, "Ошибка: модель не вернула аудио"

    return output_path, f"Сгенерированная речь сохранена в {output_path}"
|
62 |
+
|
63 |
+
# Voices offered in the dropdown, as reported by the wrapper
available_voices = tts_wrapper.list_available_voices()

# Build the Gradio interface
# NOTE(review): the widget nesting below (which widgets sit in which column,
# and the button placed under the row) is a best-effort layout -- confirm
# against the intended design.
with gr.Blocks(title="Google Gemini TTS") as app:
    # API key input. The key must never be printed: stdout ends up in logs.
    # NOTE(review): the environment key is used as the textbox's initial
    # value and the server binds to 0.0.0.0, so the key is delivered to every
    # client that loads the page -- confirm this is intended.
    api_key_input = gr.Textbox(
        label="API ключ Gemini",
        value=os.getenv("GOOGLE_API_KEY", ""),
        type="password"
    )
    gr.Markdown("# Преобразование текста в речь Google Gemini")
    with gr.Row():
        with gr.Column(scale=1):

            # Voice style instructions
            instructions_input = gr.TextArea(
                label="Инструкции для голоса (необязательно)",
                placeholder="например, 'Говорить спокойно и медленно', 'Говорить взволнованным тоном'"
            )

            # MP3 conversion flag (hidden, always enabled by default)
            mp3_checkbox = gr.Checkbox(
                label="Конвертировать в MP3 формат",
                value=True,
                visible=False
            )

            text_input = gr.Textbox(
                label="Текст для преобразования в речь",
                placeholder="Введите ваш текст здесь...",
                lines=15
            )

            # Voice selection
            voice_dropdown = gr.Dropdown(
                label="Голос",
                choices=available_voices,
                value="Laomedeia"
            )

            # Model selection
            model_dropdown = gr.Dropdown(
                label="Модель",
                choices=[
                    "gemini-2.5-pro-preview-tts",
                    "gemini-2.5-flash-preview-tts"
                ],
                value="gemini-2.5-flash-preview-tts"
            )

        with gr.Column(scale=1):
            # Generated audio and status output
            audio_output = gr.Audio(label="Сгенерированная речь")
            generation_status = gr.Textbox(label="Статус генерации", interactive=False)

    # Generate button
    generate_btn = gr.Button("Сгенерировать речь", variant="primary")

    # Event handlers
    api_key_input.change(
        update_api_key,
        inputs=[api_key_input],
        outputs=[generation_status]
    )

    generate_btn.click(
        generate_speech,
        inputs=[
            api_key_input,
            model_dropdown,
            voice_dropdown,
            instructions_input,
            text_input,
            mp3_checkbox
        ],
        outputs=[
            audio_output,
            generation_status
        ]
    )

if __name__ == "__main__":
    app.queue()
    app.launch(server_name="0.0.0.0", server_port=7567)
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
google-genai
|
2 |
+
gradio
|
3 |
+
ffmpy
|
4 |
+
python-dotenv
|
wrapper.py
ADDED
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
import mimetypes
|
3 |
+
import os
|
4 |
+
import struct
|
5 |
+
from google import genai
|
6 |
+
from google.genai import types
|
7 |
+
import ffmpy
|
8 |
+
import datetime
|
9 |
+
|
10 |
+
|
11 |
+
class GeminiTTSWrapper:
    """Wrapper around the Gemini text-to-speech models.

    Audio is written under the local ``output/`` directory and can optionally
    be converted to MP3 with ffmpeg (through ``ffmpy``).
    """

    def __init__(self, api_key=None):
        """Initialize the Gemini TTS wrapper with an optional API key."""
        self.api_key = api_key
        self.client = None
        # Create output directory if it doesn't exist
        os.makedirs("output", exist_ok=True)
        if api_key:
            self.set_api_key(api_key)

    def set_api_key(self, api_key):
        """Set or update the API key and (re)build the client.

        An empty key clears the client instead of constructing one, matching
        the guard in ``__init__``; ``generate_speech`` then reports the
        missing key with its own ValueError.

        Returns:
            GeminiTTSWrapper: ``self``, to allow chaining.
        """
        self.api_key = api_key
        self.client = genai.Client(api_key=api_key) if api_key else None
        return self

    def generate_speech(self, text, model="gemini-2.5-pro-preview-tts", voice="Laomedeia",
                        instructions="", temperature=1.0, output_file=None,
                        convert_to_mp3=True):
        """
        Generate speech from text using Gemini TTS models.

        Args:
            text (str): The text to convert to speech
            model (str): Model to use (gemini-2.5-pro-preview-tts or gemini-2.5-flash-preview-tts)
            voice (str): Prebuilt voice name to use
            instructions (str): Optional instructions for controlling style, tone, accent, etc.
            temperature (float): Sampling temperature passed to the model
            output_file (str): Output filename (without extension)
            convert_to_mp3 (bool): Whether to convert the output to MP3 format

        Returns:
            str: Path to the saved audio file, or None if the response
            contained no audio. If MP3 conversion fails, the path of the
            unconverted file is returned.

        Raises:
            ValueError: If no API key/client has been configured.
        """
        if not self.client:
            raise ValueError("API key not set. Call set_api_key() first.")

        # Generate timestamp for filename
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

        # If no output file specified, create one with timestamp
        if output_file is None:
            output_file = f"output/gemini_tts_{timestamp}"
        elif not output_file.startswith("output/"):
            output_file = f"output/{output_file}_{timestamp}"

        # Prepend the instructions to the text when provided
        content_text = f"{instructions}:\n{text}" if instructions else text

        contents = [
            types.Content(
                role="user",
                parts=[types.Part.from_text(text=content_text)],
            ),
        ]

        generate_content_config = types.GenerateContentConfig(
            temperature=temperature,
            response_modalities=["audio"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name=voice
                    )
                )
            ),
        )

        stream = self.client.models.generate_content_stream(
            model=model,
            contents=contents,
            config=generate_content_config,
        )

        # Gather every audio chunk first. Writing per chunk would overwrite
        # the same file each time and keep only the last chunk.
        audio_data, mime_type = self._collect_audio(stream)
        if not audio_data:
            return None

        mime_type = mime_type or ""
        file_extension = mimetypes.guess_extension(mime_type)
        if file_extension is None:
            # Unknown MIME type: treat the payload as raw PCM and wrap the
            # concatenated samples in a single WAV header.
            file_extension = ".wav"
            audio_data = self._convert_to_wav(audio_data, mime_type)
        # NOTE(review): for a recognized container format the chunks are
        # simply concatenated -- confirm the API never streams such formats
        # in more than one chunk.

        audio_file_path = f"{output_file}{file_extension}"
        parent_dir = os.path.dirname(audio_file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        self._save_binary_file(audio_file_path, audio_data)

        # Convert to MP3 if requested (and not already an MP3 file)
        mp3_file_path = f"{output_file}.mp3"
        if convert_to_mp3 and audio_file_path != mp3_file_path:
            # _convert_to_mp3 returns the input path when conversion fails,
            # so the caller always receives a file that exists.
            return self._convert_to_mp3(audio_file_path, mp3_file_path)

        return audio_file_path

    def _collect_audio(self, stream):
        """Concatenate the inline audio payloads of a streamed response.

        Args:
            stream: Iterable of response chunks.

        Returns:
            tuple: ``(audio_bytes, mime_type)`` where ``mime_type`` is taken
            from the first chunk that carried audio (None if there was none).
        """
        payloads = []
        mime_type = None
        for chunk in stream:
            candidates = chunk.candidates
            if not candidates:
                continue
            content = candidates[0].content
            if content is None or not content.parts:
                continue

            inline_data = content.parts[0].inline_data
            if inline_data and inline_data.data:
                payloads.append(inline_data.data)
                if mime_type is None:
                    mime_type = inline_data.mime_type
            else:
                # Non-audio chunk: echo any text the model returned
                chunk_text = getattr(chunk, "text", None)
                if chunk_text:
                    print(chunk_text)
        return b"".join(payloads), mime_type

    def _save_binary_file(self, file_name, data):
        """Save binary data to a file and return the file name."""
        with open(file_name, "wb") as f:
            f.write(data)
        return file_name

    def _convert_to_wav(self, audio_data, mime_type):
        """Wrap raw mono PCM data in a WAV (RIFF) header.

        Sample rate and bit depth are parsed from ``mime_type``; the channel
        count is fixed to 1.
        """
        parameters = self._parse_audio_mime_type(mime_type)
        bits_per_sample = parameters["bits_per_sample"]
        sample_rate = parameters["rate"]
        num_channels = 1
        data_size = len(audio_data)
        bytes_per_sample = bits_per_sample // 8
        block_align = num_channels * bytes_per_sample
        byte_rate = sample_rate * block_align
        chunk_size = 36 + data_size  # 36 bytes for header fields before data chunk size

        # http://soundfile.sapp.org/doc/WaveFormat/
        header = struct.pack(
            "<4sI4s4sIHHIIHH4sI",
            b"RIFF",          # ChunkID
            chunk_size,       # ChunkSize (total file size - 8 bytes)
            b"WAVE",          # Format
            b"fmt ",          # Subchunk1ID
            16,               # Subchunk1Size (16 for PCM)
            1,                # AudioFormat (1 for PCM)
            num_channels,     # NumChannels
            sample_rate,      # SampleRate
            byte_rate,        # ByteRate
            block_align,      # BlockAlign
            bits_per_sample,  # BitsPerSample
            b"data",          # Subchunk2ID
            data_size         # Subchunk2Size (size of audio data)
        )
        return header + audio_data

    def _convert_to_mp3(self, input_file, output_file):
        """Convert an audio file to MP3 using ffmpeg.

        Best effort: on any failure the error is printed and the path of the
        unconverted input file is returned instead.
        """
        try:
            converter = ffmpy.FFmpeg(
                # -y: overwrite an existing output instead of stopping to ask
                global_options="-y",
                inputs={input_file: None},
                outputs={output_file: None}
            )
            converter.run()
            return output_file
        except Exception as e:
            print(f"Error converting to MP3: {str(e)}")
            return input_file

    def _parse_audio_mime_type(self, mime_type):
        """Parse audio parameters from a MIME type such as ``audio/L16;rate=24000``.

        Returns:
            dict: ``{"bits_per_sample": int, "rate": int}``; defaults are
            16 bits and 24000 Hz when a value is missing or malformed.
        """
        bits_per_sample = 16
        rate = 24000

        for param in mime_type.split(";"):
            param = param.strip()
            if param.lower().startswith("rate="):
                try:
                    rate = int(param.split("=", 1)[1])
                except (ValueError, IndexError):
                    pass  # Keep rate as default
            elif param.startswith("audio/L"):
                try:
                    bits_per_sample = int(param.split("L", 1)[1])
                except (ValueError, IndexError):
                    pass  # Keep bits_per_sample as default if conversion fails

        return {"bits_per_sample": bits_per_sample, "rate": rate}

    def list_available_voices(self):
        """Return a list of available voice options."""
        return [
            "Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus", "Aoede",
            "Callirhoe", "Autonoe", "Enceladus", "Iapetus", "Umbriel", "Algieba",
            "Despina", "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar",
            "Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird", "Zubenelgenubi",
            "Vindemiatrix", "Sadachbia", "Sadalthager", "Sulafat"
        ]
|