drewThomasson committed on
Commit
cd02b8a
·
verified ·
1 Parent(s): 100604e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +336 -0
app.py ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import asyncio
import base64
import hashlib
import shutil
import subprocess
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import List, Optional

import ebooklib
import gradio as gr
import torch
import torchaudio
from bs4 import BeautifulSoup
from ebooklib import epub

from auralis import TTS, TTSRequest, TTSOutput, AudioPreprocessingConfig, setup_logger
logger = setup_logger(__file__)

# Instantiate the engine first so `tts` stays bound even when loading the
# pretrained weights fails below.
tts = TTS()
model_path = "AstraMindAI/xttsv2"  # change this if you have a different model
gpt_model = "AstraMindAI/xtts2-gpt"
try:
    tts = tts.from_pretrained(model_path, gpt_model=gpt_model)
    logger.info(f"Successfully loaded model {model_path}")
except Exception as e:
    # Deliberately non-fatal: the failure is logged and the module keeps
    # loading with the un-initialised `tts` object created above.
    logger.error(f"Failed to load model: {e}. Ensure that the model exists at {model_path}")

# Scratch directory for converted text, short-named copies and recordings.
temp_dir = Path("/tmp/auralis")
temp_dir.mkdir(parents=True, exist_ok=True)
def convert_ebook_to_txt(input_path: str) -> str:
    """Convert an ebook to plain text using calibre's ``ebook-convert``.

    Args:
        input_path: Path of the ebook file to convert.

    Returns:
        Path of the generated ``.txt`` file, created inside ``temp_dir``
        under a random eight-character name.

    Raises:
        RuntimeError: If ``ebook-convert`` cannot be started or exits with a
            non-zero status.
    """
    output_path = str(temp_dir / f"{uuid.uuid4().hex[:8]}.txt")
    try:
        subprocess.run(
            ['ebook-convert', input_path, output_path],
            check=True, capture_output=True, text=True,
        )
    except FileNotFoundError as e:
        # subprocess raises this when the executable itself is missing;
        # surface it as the same error type as any other conversion failure.
        logger.error(f"Conversion failed: {e}")
        raise RuntimeError(
            "Failed to convert ebook: calibre's 'ebook-convert' executable was not found"
        ) from e
    except subprocess.CalledProcessError as e:
        logger.error(f"Conversion failed: {e.stderr}")
        raise RuntimeError(f"Failed to convert ebook: {e.stderr}") from e
    return output_path
def shorten_filename(original_path: str) -> str:
    """Copy a file into ``temp_dir`` under a short, random file name.

    Args:
        original_path: Path of the file to copy.

    Returns:
        Path of the copy as a string. The name has the form
        ``file_<8 hex chars><original extension>``.
    """
    ext = Path(original_path).suffix
    short_path = temp_dir / f"file_{uuid.uuid4().hex[:8]}{ext}"
    shutil.copyfile(original_path, short_path)
    return str(short_path)
def text_from_file(file_path: str) -> str:
    """Return the text content of a file, converting it to txt if necessary.

    Files whose extension is ``.txt`` (case-insensitive) are read directly as
    UTF-8. Anything else is first converted with ``convert_ebook_to_txt``.

    Args:
        file_path: Path of the text or ebook file.

    Returns:
        The decoded text.

    Raises:
        RuntimeError: Propagated from ``convert_ebook_to_txt`` when the
            conversion fails.
    """
    file_ext = Path(file_path).suffix.lower()

    if file_ext == '.txt':
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()

    # Convert other formats to txt first
    txt_path = convert_ebook_to_txt(file_path)
    try:
        with open(txt_path, 'r', encoding='utf-8') as f:
            return f.read()
    finally:
        # The converted file is only an intermediate artefact; remove it so
        # repeated conversions do not pile up in the temp directory.
        try:
            Path(txt_path).unlink()
        except OSError:
            pass  # best-effort cleanup, never mask the real result/error
def clone_voice(audio_path: str) -> str:
    """Return the contents of a reference audio file as base64 text.

    The file is read in place; no temporary copy is made, so nothing is left
    behind on disk.

    Args:
        audio_path: Path of the audio file to encode.

    Returns:
        The file's bytes, base64-encoded and decoded to a UTF-8 string.
    """
    with open(audio_path, "rb") as f:
        return base64.b64encode(f.read()).decode('utf-8')
def process_text_and_generate(input_text, ref_audio_files, speed, enhance_speech, temperature, top_p, top_k, repetition_penalty, language, *args):
    """Synthesize speech for ``input_text`` using the given reference audio.

    Args:
        input_text: Text to convert to speech.
        ref_audio_files: Sequence of reference audio files; only the first
            five entries are passed to the TTS request.
        speed: Playback speed factor; ``1`` leaves the audio untouched.
        enhance_speech: Forwarded to ``TTSRequest(enhance_speech=...)``.
        temperature: Forwarded to ``TTSRequest(temperature=...)``.
        top_p: Forwarded to ``TTSRequest(top_p=...)``.
        top_k: Forwarded to ``TTSRequest(top_k=...)``.
        repetition_penalty: Forwarded to ``TTSRequest(repetition_penalty=...)``.
        language: Forwarded to ``TTSRequest(language=...)``.
        *args: Extra positional arguments are accepted and ignored.

    Returns:
        A ``(audio, log)`` tuple. ``audio`` is ``(sample_rate, array)`` on
        success and ``None`` otherwise; ``log`` is a human-readable status
        message. Errors are reported through ``log`` rather than raised.
    """
    log_messages = ""
    if not ref_audio_files:
        log_messages += "Please provide at least one reference audio!\n"
        return None, log_messages
    if not input_text or not input_text.strip():
        # Nothing to synthesize; report it instead of calling the model.
        log_messages += "Please provide some text to convert to speech!\n"
        return None, log_messages

    # Use at most the first five reference files.
    speaker_files = ref_audio_files[:5]

    try:
        request = TTSRequest(
            text=input_text,
            speaker_files=speaker_files,
            stream=False,
            enhance_speech=enhance_speech,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            language=language,
        )
        with torch.no_grad():
            output = tts.generate_speech(request)
            if output:
                if speed != 1:
                    output.change_speed(speed)
                log_messages += "✅ Successfully Generated audio\n"
                return (output.sample_rate, output.array), log_messages
            else:
                log_messages += "❌ No output was generated. Check that the model was correctly loaded\n"
                return None, log_messages
    except Exception as e:
        # UI boundary: turn any failure into a status message for the user.
        logger.error(f"Error: {e}")
        log_messages += f"❌ An Error occured: {e}\n"
        return None, log_messages
# File extensions accepted by the book/text upload widgets.
_BOOK_FILE_TYPES = [
    ".txt", ".epub", ".mobi", ".azw3", ".fb2",
    ".htmlz", ".lit", ".pdb", ".pdf", ".rtf"
]

# Language codes offered in the "Target Language" dropdown.
_LANGUAGE_CHOICES = [
    "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru",
    "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi", "auto",
]

# Kept at column 0 so the Markdown never depends on indentation stripping.
_HEADER_MARKDOWN = """
# Auralis Text-to-Speech Demo 🌌
Convert text or ebooks to speech with advanced voice cloning and enhancement.
"""


def _build_advanced_settings(enhance_default):
    """Create the "Advanced settings" accordion in the current Gradio context.

    Returns the created components as a list in the order
    ``speed, enhance_speech, temperature, top_p, top_k, repetition_penalty,
    language`` so it can be spliced directly into a handler's ``inputs``.
    """
    with gr.Accordion("Advanced settings", open=False):
        speed = gr.Slider(
            label="Playback speed",
            minimum=0.5,
            maximum=2.0,
            value=1.0,
            step=0.1
        )
        enhance_speech = gr.Checkbox(
            label="Enhance Reference Speech",
            value=enhance_default
        )
        temperature = gr.Slider(
            label="Temperature",
            minimum=0.5,
            maximum=1.0,
            value=0.75,
            step=0.05
        )
        top_p = gr.Slider(
            label="Top P",
            minimum=0.5,
            maximum=1.0,
            value=0.85,
            step=0.05
        )
        top_k = gr.Slider(
            label="Top K",
            minimum=0,
            maximum=100,
            value=50,
            step=10
        )
        repetition_penalty = gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=10.0,
            value=5.0,
            step=0.5
        )
        language = gr.Dropdown(
            label="Target Language",
            choices=_LANGUAGE_CHOICES,
            value="auto"
        )
    return [speed, enhance_speech, temperature, top_p, top_k, repetition_penalty, language]


def _uploaded_path(upload):
    """Return the filesystem path of a Gradio upload (object with ``.name`` or plain path)."""
    return getattr(upload, "name", upload)


def _save_mic_recording(mic_ref_audio):
    """Write a ``(sample_rate, samples)`` recording to a WAV file in ``temp_dir``.

    Returns the ``Path`` of the written file.
    """
    sample_rate, samples = mic_ref_audio

    # File name derived from the current time so successive recordings differ.
    digest = hashlib.sha1(str(time.time()).encode("utf-8")).hexdigest()[:10]
    output_path = temp_dir / f"mic_{digest}.wav"

    waveform = torch.from_numpy(samples)
    if waveform.is_floating_point():
        waveform = waveform.to(torch.float32)
    else:
        # Integer PCM: rescale to float32 in [-1, 1], the range a float WAV
        # is expected to use, instead of keeping raw integer magnitudes.
        full_scale = torch.iinfo(waveform.dtype).max
        waveform = waveform.to(torch.float32) / full_scale

    # torchaudio.save wants a 2-D (channels, frames) tensor.
    # NOTE(review): assumes multi-channel recordings arrive as
    # (samples, channels), per the Gradio Audio docs — confirm.
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)
    else:
        waveform = waveform.t().contiguous()

    torchaudio.save(str(output_path), waveform, sample_rate)
    return output_path


def build_gradio_ui():
    """Build the Gradio UI for Auralis and return it without launching.

    The UI has two tabs, both reading the text from an uploaded book/text
    file: "File to Speech" takes uploaded reference audio files, and
    "Clone With Microphone" takes a reference recorded in the browser.

    Returns:
        The ``gr.Blocks`` instance; call ``.launch()`` on it to serve the UI.
    """
    with gr.Blocks(title="Auralis TTS Demo", theme="soft") as ui:
        gr.Markdown(_HEADER_MARKDOWN)

        with gr.Tab("File to Speech"):
            with gr.Row():
                with gr.Column():
                    file_input = gr.File(
                        label="Upload Book/Text File",
                        file_types=_BOOK_FILE_TYPES
                    )
                    ref_audio_files = gr.Files(
                        label="Reference Audio Files",
                        file_types=["audio"]
                    )
                    settings = _build_advanced_settings(enhance_default=False)
                    generate_button = gr.Button("Generate Speech")
                with gr.Column():
                    audio_output = gr.Audio(label="Generated Audio")
                    log_output = gr.Text(label="Log Output")

            def process_file_and_generate(
                file_input, ref_audio_files, speed, enhance_speech,
                temperature, top_p, top_k, repetition_penalty, language
            ):
                """Click handler: read the uploaded file and synthesize it."""
                if not file_input:
                    return None, "Please provide an input file!"

                try:
                    # Convert input file to text
                    input_text = text_from_file(_uploaded_path(file_input))

                    return process_text_and_generate(
                        input_text, ref_audio_files, speed, enhance_speech,
                        temperature, top_p, top_k, repetition_penalty, language
                    )
                except Exception as e:
                    logger.error(f"Error processing file: {e}")
                    return None, f"Error processing file: {str(e)}"

            generate_button.click(
                process_file_and_generate,
                inputs=[file_input, ref_audio_files, *settings],
                outputs=[audio_output, log_output],
            )

        with gr.Tab("Clone With Microphone"):
            with gr.Row():
                with gr.Column():
                    file_input_mic = gr.File(
                        label="Upload Book/Text File",
                        file_types=_BOOK_FILE_TYPES
                    )
                    mic_ref_audio = gr.Audio(
                        label="Record Reference Audio",
                        sources=["microphone"]
                    )
                    settings_mic = _build_advanced_settings(enhance_default=True)
                    generate_button_mic = gr.Button("Generate Speech")
                with gr.Column():
                    audio_output_mic = gr.Audio(label="Generated Audio")
                    log_output_mic = gr.Text(label="Log Output")

            def process_mic_and_generate(
                file_input, mic_ref_audio, speed_mic, enhance_speech_mic,
                temperature_mic, top_p_mic, top_k_mic, repetition_penalty_mic, language_mic
            ):
                """Click handler: save the recording, then synthesize the file."""
                if not mic_ref_audio:
                    return None, "Please record an audio!"
                if not file_input:
                    return None, "Please provide an input file!"

                try:
                    # Convert input file to text
                    input_text = text_from_file(_uploaded_path(file_input))

                    # Save microphone audio
                    output_path = _save_mic_recording(mic_ref_audio)

                    return process_text_and_generate(
                        input_text, [Path(output_path)], speed_mic,
                        enhance_speech_mic, temperature_mic, top_p_mic,
                        top_k_mic, repetition_penalty_mic, language_mic
                    )
                except Exception as e:
                    logger.error(f"Error processing input: {e}")
                    return None, f"Error processing input: {str(e)}"

            generate_button_mic.click(
                process_mic_and_generate,
                inputs=[file_input_mic, mic_ref_audio, *settings_mic],
                outputs=[audio_output_mic, log_output_mic],
            )

    return ui
if __name__ == "__main__":
    # Script entry point: build the interface, then serve it in debug mode.
    ui = build_gradio_ui()
    ui.launch(debug=True)