hoonsubin commited on
Commit
597e812
·
1 Parent(s): 96cdf65

add base proj

Browse files
app.py CHANGED
@@ -1,7 +1,13 @@
1
- import gradio as gr
 
2
 
3
def greet(name):
    """Return the greeting text for *name*."""
    pieces = ("Hello ", name, "!!")
    return "".join(pieces)
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
1
+ from tts_ui.tts.auralis_tts_engine import AuralisTTSEngine
2
+ from tts_ui.ui import build_gradio_ui
3
 
 
 
4
 
5
def main():
    """Create the TTS engine, build the Gradio UI around it, and serve it."""
    engine = AuralisTTSEngine()
    app = build_gradio_ui(engine)
    # debug=True is passed straight through to Gradio's launch().
    app.launch(debug=True)


if __name__ == "__main__":
    main()
pyproject.toml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "auralis-tts"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "auralis>=0.2.8.post2",
9
+ "bs4>=0.0.2",
10
+ "bunkai>=1.5.7",
11
+ "gradio>=5.17.1",
12
+ "jaconv>=0.4.0",
13
+ "langchain-text-splitters>=0.3.6",
14
+ "markdown>=3.7",
15
+ "nest-asyncio>=1.6.0",
16
+ "pdfplumber>=0.11.5",
17
+ "sudachidict-core>=20250129",
18
+ "sudachipy>=0.6.10",
19
+ "torch>=2.5.1",
20
+ "torchaudio>=2.5.1",
21
+ "torchvision>=0.20.1",
22
+ "unidic>=1.1.0",
23
+ "yakinori>=0.1.2",
24
+ ]
requirements.txt ADDED
@@ -0,0 +1,746 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file was autogenerated by uv via the following command:
2
+ # uv pip compile --output-file requirements.txt pyproject.toml
3
+ aiofiles==23.2.1
4
+ # via
5
+ # auralis
6
+ # gradio
7
+ aiohappyeyeballs==2.4.6
8
+ # via aiohttp
9
+ aiohttp==3.11.12
10
+ # via
11
+ # datasets
12
+ # fsspec
13
+ # vllm
14
+ aiosignal==1.3.2
15
+ # via
16
+ # aiohttp
17
+ # ray
18
+ annotated-types==0.7.0
19
+ # via pydantic
20
+ anyio==4.8.0
21
+ # via
22
+ # gradio
23
+ # httpx
24
+ # openai
25
+ # starlette
26
+ # watchfiles
27
+ asttokens==3.0.0
28
+ # via stack-data
29
+ async-timeout==5.0.1
30
+ # via aiohttp
31
+ attrs==25.1.0
32
+ # via
33
+ # aiohttp
34
+ # jsonschema
35
+ # referencing
36
+ audioread==3.0.1
37
+ # via librosa
38
+ auralis==0.2.8.post2
39
+ # via auralis-tts (pyproject.toml)
40
+ beautifulsoup4==4.13.3
41
+ # via
42
+ # auralis
43
+ # bs4
44
+ blis==0.7.11
45
+ # via thinc
46
+ bs4==0.0.2
47
+ # via auralis-tts (pyproject.toml)
48
+ bunkai==1.5.7
49
+ # via auralis-tts (pyproject.toml)
50
+ cachetools==5.5.2
51
+ # via auralis
52
+ catalogue==2.0.10
53
+ # via
54
+ # spacy
55
+ # srsly
56
+ # thinc
57
+ certifi==2025.1.31
58
+ # via
59
+ # httpcore
60
+ # httpx
61
+ # requests
62
+ cffi==1.17.1
63
+ # via
64
+ # cryptography
65
+ # sounddevice
66
+ # soundfile
67
+ charset-normalizer==3.4.1
68
+ # via
69
+ # pdfminer-six
70
+ # requests
71
+ click==8.1.8
72
+ # via
73
+ # ray
74
+ # typer
75
+ # uvicorn
76
+ cloudpathlib==0.20.0
77
+ # via weasel
78
+ cloudpickle==3.1.1
79
+ # via outlines
80
+ colorama==0.4.6
81
+ # via auralis
82
+ compressed-tensors==0.8.0
83
+ # via vllm
84
+ confection==0.1.5
85
+ # via
86
+ # thinc
87
+ # weasel
88
+ cryptography==44.0.1
89
+ # via pdfminer-six
90
+ cutlet==0.5.0
91
+ # via auralis
92
+ cymem==2.0.11
93
+ # via
94
+ # preshed
95
+ # spacy
96
+ # thinc
97
+ dataclasses-json==0.6.7
98
+ # via bunkai
99
+ datasets==2.14.4
100
+ # via outlines
101
+ decorator==5.2.1
102
+ # via
103
+ # ipython
104
+ # librosa
105
+ dill==0.3.7
106
+ # via
107
+ # datasets
108
+ # multiprocess
109
+ diskcache==5.6.3
110
+ # via outlines
111
+ distro==1.9.0
112
+ # via openai
113
+ docopt==0.6.2
114
+ # via num2words
115
+ ebooklib==0.18
116
+ # via auralis
117
+ einops==0.8.1
118
+ # via
119
+ # auralis
120
+ # vllm
121
+ emoji==2.14.1
122
+ # via bunkai
123
+ emojis==0.7.0
124
+ # via bunkai
125
+ exceptiongroup==1.2.2
126
+ # via
127
+ # anyio
128
+ # ipython
129
+ # pytest
130
+ executing==2.2.0
131
+ # via stack-data
132
+ fastapi==0.115.8
133
+ # via
134
+ # gradio
135
+ # vllm
136
+ ffmpeg==1.4
137
+ # via auralis
138
+ ffmpy==0.5.0
139
+ # via gradio
140
+ filelock==3.17.0
141
+ # via
142
+ # huggingface-hub
143
+ # ray
144
+ # torch
145
+ # transformers
146
+ # vllm
147
+ frozenlist==1.5.0
148
+ # via
149
+ # aiohttp
150
+ # aiosignal
151
+ # ray
152
+ fsspec==2025.2.0
153
+ # via
154
+ # auralis
155
+ # datasets
156
+ # gradio-client
157
+ # huggingface-hub
158
+ # torch
159
+ fugashi==1.4.0
160
+ # via cutlet
161
+ future==1.0.0
162
+ # via pyloudnorm
163
+ gguf==0.10.0
164
+ # via vllm
165
+ gradio==5.17.1
166
+ # via auralis-tts (pyproject.toml)
167
+ gradio-client==1.7.1
168
+ # via gradio
169
+ h11==0.14.0
170
+ # via
171
+ # httpcore
172
+ # uvicorn
173
+ hangul-romanize==0.1.0
174
+ # via auralis
175
+ httpcore==1.0.7
176
+ # via httpx
177
+ httptools==0.6.4
178
+ # via uvicorn
179
+ httpx==0.28.1
180
+ # via
181
+ # gradio
182
+ # gradio-client
183
+ # langsmith
184
+ # openai
185
+ # safehttpx
186
+ huggingface-hub==0.29.1
187
+ # via
188
+ # auralis
189
+ # datasets
190
+ # gradio
191
+ # gradio-client
192
+ # tokenizers
193
+ # transformers
194
+ idna==3.10
195
+ # via
196
+ # anyio
197
+ # httpx
198
+ # requests
199
+ # yarl
200
+ importlib-metadata==8.6.1
201
+ # via vllm
202
+ iniconfig==2.0.0
203
+ # via pytest
204
+ interegular==0.3.3
205
+ # via
206
+ # lm-format-enforcer
207
+ # outlines
208
+ ipython==8.32.0
209
+ # via auralis
210
+ jaconv==0.4.0
211
+ # via
212
+ # auralis-tts (pyproject.toml)
213
+ # cutlet
214
+ # yakinori
215
+ janome==0.5.0
216
+ # via bunkai
217
+ jedi==0.19.2
218
+ # via ipython
219
+ jinja2==3.1.5
220
+ # via
221
+ # gradio
222
+ # outlines
223
+ # spacy
224
+ # torch
225
+ jiter==0.8.2
226
+ # via openai
227
+ joblib==1.4.2
228
+ # via
229
+ # librosa
230
+ # scikit-learn
231
+ jsonpatch==1.33
232
+ # via langchain-core
233
+ jsonpointer==3.0.0
234
+ # via jsonpatch
235
+ jsonschema==4.23.0
236
+ # via
237
+ # mistral-common
238
+ # outlines
239
+ # ray
240
+ jsonschema-specifications==2024.10.1
241
+ # via jsonschema
242
+ langchain-core==0.3.37
243
+ # via langchain-text-splitters
244
+ langchain-text-splitters==0.3.6
245
+ # via auralis-tts (pyproject.toml)
246
+ langcodes==3.5.0
247
+ # via spacy
248
+ langid==1.1.6
249
+ # via auralis
250
+ langsmith==0.3.10
251
+ # via langchain-core
252
+ language-data==1.3.0
253
+ # via langcodes
254
+ lark==1.2.2
255
+ # via outlines
256
+ lazy-loader==0.4
257
+ # via librosa
258
+ librosa==0.10.2.post1
259
+ # via auralis
260
+ llvmlite==0.44.0
261
+ # via numba
262
+ lm-format-enforcer==0.10.10
263
+ # via vllm
264
+ lxml==5.3.1
265
+ # via ebooklib
266
+ marisa-trie==1.2.1
267
+ # via language-data
268
+ markdown==3.7
269
+ # via auralis-tts (pyproject.toml)
270
+ markdown-it-py==3.0.0
271
+ # via rich
272
+ markupsafe==2.1.5
273
+ # via
274
+ # gradio
275
+ # jinja2
276
+ marshmallow==3.26.1
277
+ # via dataclasses-json
278
+ matplotlib-inline==0.1.7
279
+ # via ipython
280
+ mdurl==0.1.2
281
+ # via markdown-it-py
282
+ mecab-python3==1.0.10
283
+ # via yakinori
284
+ mistral-common==1.5.3
285
+ # via vllm
286
+ mojimoji==0.0.13
287
+ # via cutlet
288
+ more-itertools==10.6.0
289
+ # via bunkai
290
+ mpmath==1.3.0
291
+ # via sympy
292
+ msgpack==1.1.0
293
+ # via
294
+ # librosa
295
+ # ray
296
+ msgspec==0.19.0
297
+ # via vllm
298
+ multidict==6.1.0
299
+ # via
300
+ # aiohttp
301
+ # yarl
302
+ multiprocess==0.70.15
303
+ # via datasets
304
+ murmurhash==1.0.12
305
+ # via
306
+ # preshed
307
+ # spacy
308
+ # thinc
309
+ mypy-extensions==1.0.0
310
+ # via typing-inspect
311
+ nest-asyncio==1.6.0
312
+ # via
313
+ # auralis-tts (pyproject.toml)
314
+ # outlines
315
+ networkx==3.4.2
316
+ # via
317
+ # auralis
318
+ # torch
319
+ num2words==0.5.14
320
+ # via auralis
321
+ numba==0.61.0
322
+ # via
323
+ # librosa
324
+ # outlines
325
+ numpy==1.26.4
326
+ # via
327
+ # auralis
328
+ # blis
329
+ # datasets
330
+ # gguf
331
+ # gradio
332
+ # langid
333
+ # librosa
334
+ # mistral-common
335
+ # numba
336
+ # opencv-python-headless
337
+ # outlines
338
+ # pandas
339
+ # pyloudnorm
340
+ # scikit-learn
341
+ # scipy
342
+ # soundfile
343
+ # soxr
344
+ # spacy
345
+ # thinc
346
+ # torchvision
347
+ # transformers
348
+ # vllm
349
+ nvidia-ml-py==12.570.86
350
+ # via
351
+ # auralis
352
+ # vllm
353
+ openai==1.64.0
354
+ # via vllm
355
+ opencc==1.1.9
356
+ # via auralis
357
+ opencv-python-headless==4.11.0.86
358
+ # via mistral-common
359
+ orjson==3.10.15
360
+ # via
361
+ # gradio
362
+ # langsmith
363
+ outlines==0.0.46
364
+ # via vllm
365
+ packaging==24.2
366
+ # via
367
+ # auralis
368
+ # datasets
369
+ # gradio
370
+ # gradio-client
371
+ # huggingface-hub
372
+ # langchain-core
373
+ # lazy-loader
374
+ # lm-format-enforcer
375
+ # marshmallow
376
+ # pooch
377
+ # pytest
378
+ # ray
379
+ # spacy
380
+ # thinc
381
+ # transformers
382
+ # weasel
383
+ pandas==2.2.3
384
+ # via
385
+ # datasets
386
+ # gradio
387
+ parso==0.8.4
388
+ # via jedi
389
+ partial-json-parser==0.2.1.1.post5
390
+ # via vllm
391
+ pdfminer-six==20231228
392
+ # via pdfplumber
393
+ pdfplumber==0.11.5
394
+ # via auralis-tts (pyproject.toml)
395
+ pexpect==4.9.0
396
+ # via ipython
397
+ pillow==11.1.0
398
+ # via
399
+ # gradio
400
+ # mistral-common
401
+ # pdfplumber
402
+ # torchvision
403
+ # vllm
404
+ plac==1.4.3
405
+ # via unidic
406
+ platformdirs==4.3.6
407
+ # via pooch
408
+ pluggy==1.5.0
409
+ # via pytest
410
+ pooch==1.8.2
411
+ # via librosa
412
+ preshed==3.0.9
413
+ # via
414
+ # spacy
415
+ # thinc
416
+ prometheus-client==0.21.1
417
+ # via
418
+ # prometheus-fastapi-instrumentator
419
+ # vllm
420
+ prometheus-fastapi-instrumentator==7.0.2
421
+ # via vllm
422
+ prompt-toolkit==3.0.50
423
+ # via ipython
424
+ propcache==0.3.0
425
+ # via
426
+ # aiohttp
427
+ # yarl
428
+ protobuf==5.29.3
429
+ # via
430
+ # ray
431
+ # vllm
432
+ psutil==7.0.0
433
+ # via vllm
434
+ ptyprocess==0.7.0
435
+ # via pexpect
436
+ pure-eval==0.2.3
437
+ # via stack-data
438
+ py-cpuinfo==9.0.0
439
+ # via vllm
440
+ pyairports==2.1.1
441
+ # via outlines
442
+ pyarrow==19.0.1
443
+ # via datasets
444
+ pycountry==24.6.1
445
+ # via outlines
446
+ pycparser==2.22
447
+ # via cffi
448
+ pydantic==2.10.6
449
+ # via
450
+ # compressed-tensors
451
+ # confection
452
+ # fastapi
453
+ # gradio
454
+ # langchain-core
455
+ # langsmith
456
+ # lm-format-enforcer
457
+ # mistral-common
458
+ # openai
459
+ # outlines
460
+ # spacy
461
+ # thinc
462
+ # vllm
463
+ # weasel
464
+ pydantic-core==2.27.2
465
+ # via pydantic
466
+ pydub==0.25.1
467
+ # via gradio
468
+ pygments==2.19.1
469
+ # via
470
+ # ipython
471
+ # rich
472
+ pyloudnorm==0.1.1
473
+ # via auralis
474
+ pypdfium2==4.30.1
475
+ # via pdfplumber
476
+ pypinyin==0.53.0
477
+ # via auralis
478
+ pytest==8.3.4
479
+ # via auralis
480
+ python-dateutil==2.9.0.post0
481
+ # via pandas
482
+ python-dotenv==1.0.1
483
+ # via uvicorn
484
+ python-multipart==0.0.20
485
+ # via gradio
486
+ pytz==2025.1
487
+ # via pandas
488
+ pyyaml==6.0.2
489
+ # via
490
+ # datasets
491
+ # gguf
492
+ # gradio
493
+ # huggingface-hub
494
+ # langchain-core
495
+ # lm-format-enforcer
496
+ # ray
497
+ # transformers
498
+ # uvicorn
499
+ # vllm
500
+ pyzmq==26.2.1
501
+ # via vllm
502
+ ray==2.42.1
503
+ # via vllm
504
+ referencing==0.36.2
505
+ # via
506
+ # jsonschema
507
+ # jsonschema-specifications
508
+ # outlines
509
+ regex==2024.11.6
510
+ # via
511
+ # bunkai
512
+ # tiktoken
513
+ # transformers
514
+ requests==2.32.3
515
+ # via
516
+ # datasets
517
+ # huggingface-hub
518
+ # langsmith
519
+ # mistral-common
520
+ # outlines
521
+ # pooch
522
+ # ray
523
+ # requests-toolbelt
524
+ # spacy
525
+ # tiktoken
526
+ # transformers
527
+ # unidic
528
+ # vllm
529
+ # weasel
530
+ requests-toolbelt==1.0.0
531
+ # via langsmith
532
+ rich==13.9.4
533
+ # via typer
534
+ rpds-py==0.23.1
535
+ # via
536
+ # jsonschema
537
+ # referencing
538
+ ruff==0.9.7
539
+ # via gradio
540
+ safehttpx==0.1.6
541
+ # via gradio
542
+ safetensors==0.5.2
543
+ # via
544
+ # auralis
545
+ # transformers
546
+ scikit-learn==1.6.1
547
+ # via librosa
548
+ scipy==1.15.2
549
+ # via
550
+ # librosa
551
+ # pyloudnorm
552
+ # scikit-learn
553
+ semantic-version==2.10.0
554
+ # via gradio
555
+ sentencepiece==0.2.0
556
+ # via
557
+ # mistral-common
558
+ # vllm
559
+ setuptools==75.8.0
560
+ # via
561
+ # auralis
562
+ # marisa-trie
563
+ # spacy
564
+ # thinc
565
+ shellingham==1.5.4
566
+ # via typer
567
+ six==1.17.0
568
+ # via
569
+ # ebooklib
570
+ # python-dateutil
571
+ smart-open==7.1.0
572
+ # via weasel
573
+ sniffio==1.3.1
574
+ # via
575
+ # anyio
576
+ # openai
577
+ sounddevice==0.5.1
578
+ # via auralis
579
+ soundfile==0.13.1
580
+ # via
581
+ # auralis
582
+ # librosa
583
+ soupsieve==2.6
584
+ # via beautifulsoup4
585
+ soxr==0.5.0.post1
586
+ # via librosa
587
+ spacy==3.7.5
588
+ # via auralis
589
+ spacy-legacy==3.0.12
590
+ # via spacy
591
+ spacy-loggers==1.0.5
592
+ # via spacy
593
+ spans==1.1.1
594
+ # via bunkai
595
+ srsly==2.5.1
596
+ # via
597
+ # confection
598
+ # spacy
599
+ # thinc
600
+ # weasel
601
+ stack-data==0.6.3
602
+ # via ipython
603
+ starlette==0.45.3
604
+ # via
605
+ # fastapi
606
+ # gradio
607
+ # prometheus-fastapi-instrumentator
608
+ sudachidict-core==20250129
609
+ # via auralis-tts (pyproject.toml)
610
+ sudachipy==0.6.10
611
+ # via
612
+ # auralis-tts (pyproject.toml)
613
+ # sudachidict-core
614
+ sympy==1.13.1
615
+ # via torch
616
+ tenacity==9.0.0
617
+ # via langchain-core
618
+ thinc==8.2.5
619
+ # via spacy
620
+ threadpoolctl==3.5.0
621
+ # via scikit-learn
622
+ tiktoken==0.9.0
623
+ # via
624
+ # mistral-common
625
+ # vllm
626
+ tokenizers==0.21.0
627
+ # via
628
+ # auralis
629
+ # transformers
630
+ # vllm
631
+ toml==0.10.2
632
+ # via bunkai
633
+ tomli==2.2.1
634
+ # via pytest
635
+ tomlkit==0.13.2
636
+ # via gradio
637
+ torch==2.5.1
638
+ # via
639
+ # auralis-tts (pyproject.toml)
640
+ # compressed-tensors
641
+ # torchaudio
642
+ # torchvision
643
+ # vllm
644
+ torchaudio==2.5.1
645
+ # via
646
+ # auralis-tts (pyproject.toml)
647
+ # auralis
648
+ torchvision==0.20.1
649
+ # via
650
+ # auralis-tts (pyproject.toml)
651
+ # vllm
652
+ tqdm==4.67.1
653
+ # via
654
+ # bunkai
655
+ # datasets
656
+ # gguf
657
+ # huggingface-hub
658
+ # openai
659
+ # outlines
660
+ # spacy
661
+ # transformers
662
+ # unidic
663
+ # vllm
664
+ traitlets==5.14.3
665
+ # via
666
+ # ipython
667
+ # matplotlib-inline
668
+ transformers==4.49.0
669
+ # via
670
+ # auralis
671
+ # compressed-tensors
672
+ # vllm
673
+ typer==0.15.1
674
+ # via
675
+ # gradio
676
+ # spacy
677
+ # weasel
678
+ typing-extensions==4.12.2
679
+ # via
680
+ # anyio
681
+ # beautifulsoup4
682
+ # cloudpathlib
683
+ # fastapi
684
+ # gradio
685
+ # gradio-client
686
+ # huggingface-hub
687
+ # ipython
688
+ # langchain-core
689
+ # librosa
690
+ # mistral-common
691
+ # multidict
692
+ # openai
693
+ # outlines
694
+ # pydantic
695
+ # pydantic-core
696
+ # referencing
697
+ # rich
698
+ # torch
699
+ # typer
700
+ # typing-inspect
701
+ # uvicorn
702
+ # vllm
703
+ typing-inspect==0.9.0
704
+ # via dataclasses-json
705
+ tzdata==2025.1
706
+ # via pandas
707
+ unidic==1.1.0
708
+ # via auralis-tts (pyproject.toml)
709
+ urllib3==2.3.0
710
+ # via requests
711
+ uvicorn==0.34.0
712
+ # via
713
+ # gradio
714
+ # vllm
715
+ uvloop==0.21.0
716
+ # via uvicorn
717
+ vllm==0.6.4.post1
718
+ # via auralis
719
+ wasabi==0.10.1
720
+ # via
721
+ # spacy
722
+ # thinc
723
+ # unidic
724
+ # weasel
725
+ watchfiles==1.0.4
726
+ # via uvicorn
727
+ wcwidth==0.2.13
728
+ # via prompt-toolkit
729
+ weasel==0.4.1
730
+ # via spacy
731
+ websockets==14.2
732
+ # via
733
+ # gradio-client
734
+ # uvicorn
735
+ wrapt==1.17.2
736
+ # via smart-open
737
+ xxhash==3.5.0
738
+ # via datasets
739
+ yakinori==0.1.2
740
+ # via auralis-tts (pyproject.toml)
741
+ yarl==1.18.3
742
+ # via aiohttp
743
+ zipp==3.21.0
744
+ # via importlib-metadata
745
+ zstandard==0.23.0
746
+ # via langsmith
tts_ui/__init__.py ADDED
File without changes
tts_ui/tts/__init__.py ADDED
File without changes
tts_ui/tts/auralis_tts_engine.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from auralis import TTS, TTSRequest, TTSOutput, setup_logger
2
+ from gradio import File, Files, Slider
3
+ import torch
4
+ from tts_ui.utils import (
5
+ calculate_byte_size,
6
+ split_text_into_chunks,
7
+ tmp_dir,
8
+ extract_text_from_epub,
9
+ text_from_file,
10
+ convert_audio,
11
+ )
12
+ from tts_ui.utils.doc_processor import DocumentProcessor
13
+ import hashlib
14
+ import torchaudio
15
+ import time
16
+ from pathlib import Path
17
+
18
# The TTS model is loaded once, at import time, and the resulting objects are
# handed to AuralisTTSEngine.__init__, which only stores references to them.
# Importing this module therefore triggers the model load.
logger = setup_logger(__file__)

tts = TTS()
model_path = "AstraMindAI/xttsv2"  # change this if you have a different model
gpt_model = "AstraMindAI/xtts2-gpt"

try:
    tts: TTS = tts.from_pretrained(
        model_name_or_path=model_path,
        gpt_model=gpt_model,
        enforce_eager=False,
        max_seq_len_to_capture=4096,  # Match WSL2 page size
        scheduler_max_concurrency=4,
    )
    logger.info(f"Successfully loaded model {model_path}")
except Exception as e:
    error_msg = f"Failed to load model: {e}."
    logger.error(error_msg)
    # Chain the original exception so the root-cause traceback stays attached
    # to the error that aborts the import.
    raise Exception(error_msg) from e
39
+
40
+
41
+ class AuralisTTSEngine:
42
+ def __init__(self):
43
+ self.logger = logger
44
+ self.tts: TTS = tts
45
+ self.model_path: str = model_path
46
+ self.gpt_model: str = gpt_model
47
+ self.tmp_dir: Path = tmp_dir
48
+ self.doc_processor = DocumentProcessor
49
+
50
+ def process_text_and_generate(
51
+ self,
52
+ input_text: str,
53
+ ref_audio_files: str | list[str] | bytes | list[bytes],
54
+ speed: float,
55
+ enhance_speech: bool,
56
+ temperature: float,
57
+ top_p: float,
58
+ top_k: float,
59
+ repetition_penalty: float,
60
+ language: str = "auto",
61
+ *args,
62
+ ):
63
+ """Process text and generate audio."""
64
+ log_messages: str = ""
65
+ if not ref_audio_files:
66
+ log_messages += "Please provide at least one reference audio!\n"
67
+ return None, log_messages
68
+
69
+ input_size = calculate_byte_size(input_text)
70
+
71
+ # use the chunking process if the text is too large
72
+ if input_size > 45000:
73
+ self.logger.info(
74
+ f"Found {input_size} bytes of text. Switching to chunk mode."
75
+ )
76
+ # todo: this function has a couple of overlapping functions as normal processing. I need to optimize the code
77
+ return self._process_large_text(
78
+ input_text,
79
+ ref_audio_files,
80
+ speed,
81
+ enhance_speech,
82
+ temperature,
83
+ top_p,
84
+ top_k,
85
+ repetition_penalty,
86
+ language,
87
+ )
88
+ else:
89
+ try:
90
+ with torch.no_grad():
91
+ # clone voices from all file paths (shorten them)
92
+ base64_voices: str | list[str] | bytes | list[bytes] = (
93
+ ref_audio_files[:5]
94
+ )
95
+
96
+ request = TTSRequest(
97
+ text=input_text,
98
+ speaker_files=base64_voices,
99
+ stream=False,
100
+ enhance_speech=enhance_speech,
101
+ temperature=temperature,
102
+ top_p=top_p,
103
+ top_k=top_k,
104
+ repetition_penalty=repetition_penalty,
105
+ language=language,
106
+ )
107
+
108
+ output: TTSOutput = self.tts.generate_speech(request)
109
+
110
+ if output:
111
+ if speed != 1:
112
+ output.change_speed(speed)
113
+ log_messages += f"✅ Successfully Generated audio\n"
114
+ self.logger.info(log_messages)
115
+ # return the sample rate and the audio file as a byte array
116
+ return (
117
+ output.sample_rate,
118
+ convert_audio(output.array),
119
+ ), log_messages
120
+
121
+ else:
122
+ log_messages += "❌ No output was generated. Check that the model was correctly loaded\n"
123
+ return None, log_messages
124
+ except Exception as e:
125
+ self.logger.error(f"Error: {e}")
126
+ log_messages += f"❌ An Error occured: {e}\n"
127
+ return None, log_messages
128
+
129
+ def _process_large_text(
130
+ self,
131
+ input_full_text: str,
132
+ ref_audio_files: str | list[str] | bytes | list[bytes],
133
+ speed: float,
134
+ enhance_speech: bool,
135
+ temperature: float,
136
+ top_p: float,
137
+ top_k: float,
138
+ repetition_penalty: float,
139
+ language: str = "auto",
140
+ ):
141
+ """Process text in chunks and combine results"""
142
+ log_messages: str = ""
143
+
144
+ if not ref_audio_files:
145
+ log_messages += "Please provide at least one reference audio!\n"
146
+ return None, log_messages
147
+
148
+ base64_voices: str | list[str] | bytes | list[bytes] = ref_audio_files[:5]
149
+
150
+ chunks: list[str] = split_text_into_chunks(input_full_text)
151
+ print(f"Created {len(chunks)} chunks")
152
+
153
+ audio_segments: list[TTSOutput] = []
154
+ for idx, chunk in enumerate(chunks):
155
+ request = TTSRequest(
156
+ text=chunk,
157
+ speaker_files=base64_voices,
158
+ stream=False,
159
+ enhance_speech=enhance_speech,
160
+ temperature=temperature,
161
+ top_p=top_p,
162
+ top_k=top_k,
163
+ repetition_penalty=repetition_penalty,
164
+ language=language,
165
+ )
166
+
167
+ try:
168
+ with torch.no_grad():
169
+ audio = self.tts.generate_speech(request)
170
+ audio_segments.append(audio)
171
+ self.logger.info(f"Processed {idx + 1} chunks out of {len(chunks)}")
172
+
173
+ except Exception as e:
174
+ log_messages += f"❌ Chunk processing failed: {e}\n"
175
+ return None, log_messages
176
+
177
+ if len(audio_segments) <= 0:
178
+ log_messages += f"❌ Chunk processing failed. Chunk size: {len(chunks)}\n"
179
+ return None, log_messages
180
+
181
+ combined_output: TTSOutput = TTSOutput.combine_outputs(audio_segments)
182
+
183
+ if speed != 1:
184
+ combined_output.change_speed(speed)
185
+
186
+ log_messages += f"✅ Successfully Generated audio\n"
187
+ # return combined_output
188
+ return (
189
+ combined_output.sample_rate,
190
+ convert_audio(combined_output.array),
191
+ ), log_messages
192
+
193
+ def process_file_and_generate(
194
+ self,
195
+ file_input: File,
196
+ ref_audio_files_file: Files,
197
+ speed_file: Slider,
198
+ enhance_speech_file,
199
+ temperature_file,
200
+ top_p_file,
201
+ top_k_file,
202
+ repetition_penalty_file,
203
+ language_file,
204
+ ):
205
+ # todo: refactor this to use the document processor object
206
+ if file_input:
207
+ file_extension: str = Path(file_input.name).suffix
208
+
209
+ match file_extension:
210
+ case ".epub":
211
+ input_text: str = extract_text_from_epub(file_input.name)
212
+ case ".txt" | ".md":
213
+ input_text = text_from_file(file_input.name)
214
+ case _:
215
+ return (
216
+ None,
217
+ "Unsupported file format, it needs to be either .epub or .txt",
218
+ )
219
+
220
+ return self._process_large_text(
221
+ input_text,
222
+ ref_audio_files_file,
223
+ speed_file,
224
+ enhance_speech_file,
225
+ temperature_file,
226
+ top_p_file,
227
+ top_k_file,
228
+ repetition_penalty_file,
229
+ language_file,
230
+ )
231
+ else:
232
+ return None, "Please provide an .epub or .txt file!"
233
+
234
+ def process_mic_and_generate(
235
+ self,
236
+ input_text_mic,
237
+ mic_ref_audio,
238
+ speed_mic,
239
+ enhance_speech_mic,
240
+ temperature_mic,
241
+ top_p_mic,
242
+ top_k_mic,
243
+ repetition_penalty_mic,
244
+ language_mic,
245
+ ):
246
+ if mic_ref_audio:
247
+ data: bytes = str(time.time()).encode("utf-8")
248
+ hash: str = hashlib.sha1(data).hexdigest()[:10]
249
+ output_path = self.tmp_dir / (f"mic_{hash}.wav")
250
+
251
+ torch_audio: torch.Tensor = torch.from_numpy(mic_ref_audio[1].astype(float))
252
+ try:
253
+ torchaudio.save(
254
+ str(output_path), torch_audio.unsqueeze(0), mic_ref_audio[0]
255
+ )
256
+ return self.process_text_and_generate(
257
+ input_text_mic,
258
+ [Path(output_path)],
259
+ speed_mic,
260
+ enhance_speech_mic,
261
+ temperature_mic,
262
+ top_p_mic,
263
+ top_k_mic,
264
+ repetition_penalty_mic,
265
+ language_mic,
266
+ )
267
+ except Exception as e:
268
+ self.logger.error(f"Error saving audio file: {e}")
269
+ return None, f"Error saving audio file: {e}"
270
+ else:
271
+ return None, "Please record an audio!"
tts_ui/ui/__init__.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from tts_ui.utils import *
3
+ from tts_ui.tts.auralis_tts_engine import AuralisTTSEngine
4
+
5
+
6
# Language codes offered in the "Target Language" dropdowns; "auto" is the
# default selection.
supported_langs: list[str] = [
    "en", "es", "fr", "de", "it", "pt",
    "pl", "tr", "ru", "nl", "cs", "ar",
    "zh-cn", "hu", "ko", "ja", "hi", "auto",
]
26
+
27
+
28
def _build_advanced_settings(enhance_default: bool) -> list:
    """Create the collapsed "Advanced settings" panel.

    Returns the controls in the order the engine handlers take them: speed,
    enhance_speech, temperature, top_p, top_k, repetition_penalty, language.
    """
    with gr.Accordion("Advanced settings", open=False):
        speed = gr.Slider(
            label="Playback speed",
            minimum=0.5,
            maximum=2.0,
            value=1.0,
            step=0.1,
        )
        enhance_speech = gr.Checkbox(
            label="Enhance Reference Speech", value=enhance_default
        )
        temperature = gr.Slider(
            label="Temperature",
            minimum=0.5,
            maximum=1.0,
            value=0.75,
            step=0.05,
        )
        top_p = gr.Slider(
            label="Top P",
            minimum=0.5,
            maximum=1.0,
            value=0.85,
            step=0.05,
        )
        top_k = gr.Slider(label="Top K", minimum=0, maximum=100, value=50, step=10)
        repetition_penalty = gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=10.0,
            value=5.0,
            step=0.5,
        )
        language = gr.Dropdown(
            label="Target Language",
            choices=supported_langs,
            value="auto",
        )
    return [
        speed,
        enhance_speech,
        temperature,
        top_p,
        top_k,
        repetition_penalty,
        language,
    ]


def _build_output_column() -> list:
    """Create the right-hand column and return ``[audio_output, log_output]``."""
    with gr.Column():
        audio_output = gr.Audio(label="Generated Audio")
        log_output = gr.Text(label="Log Output")
    return [audio_output, log_output]


def build_gradio_ui(tts_engine: AuralisTTSEngine) -> gr.Blocks:
    """Build the Gradio UI for Auralis and return it without launching it.

    The UI has three tabs (typed text, uploaded file, microphone reference),
    each wired to the matching ``tts_engine.process_*`` handler.

    Args:
        tts_engine: Engine whose handlers receive the tab inputs.

    Returns:
        The assembled ``gr.Blocks``; the caller is responsible for ``launch()``.
    """
    with gr.Blocks(title="Auralis TTS UI", theme="soft") as ui:

        gr.Markdown(
            """
            # Text-to-Speech Interface

            Convert text to speech with advanced voice cloning and enhancement.

            Powered by Auralis 🌌 made by Hoon
            """
        )

        with gr.Tab("Text to Speech"):
            with gr.Row():
                with gr.Column():
                    input_text = gr.Text(
                        label="Enter Text Here",
                        placeholder="Write the text you want to convert...",
                    )
                    ref_audio_files = gr.Files(
                        label="Reference Audio Files", file_types=["audio"]
                    )
                    settings = _build_advanced_settings(enhance_default=False)
                    generate_button = gr.Button("Generate Speech")
                outputs = _build_output_column()

            generate_button.click(
                fn=tts_engine.process_text_and_generate,
                inputs=[input_text, ref_audio_files, *settings],
                outputs=outputs,
            )

        with gr.Tab("File to Speech"):
            with gr.Row():
                with gr.Column():
                    file_input = gr.File(
                        label="Text / Ebook File", file_types=[".txt", ".md", ".epub"]
                    )
                    ref_audio_files_file = gr.Files(
                        label="Reference Audio Files", file_types=["audio"]
                    )
                    settings_file = _build_advanced_settings(enhance_default=False)
                    generate_button_file = gr.Button("Generate Speech from File")
                outputs_file = _build_output_column()

            generate_button_file.click(
                tts_engine.process_file_and_generate,
                inputs=[file_input, ref_audio_files_file, *settings_file],
                outputs=outputs_file,
            )

        with gr.Tab("Clone With Microphone"):
            with gr.Row():
                with gr.Column():
                    input_text_mic = gr.Text(
                        label="Enter Text Here",
                        placeholder="Write the text you want to convert...",
                    )
                    mic_ref_audio = gr.Audio(
                        label="Record Reference Audio", sources=["microphone"]
                    )
                    # Only this tab enables reference enhancement by default.
                    settings_mic = _build_advanced_settings(enhance_default=True)
                    generate_button_mic = gr.Button("Generate Speech")
                outputs_mic = _build_output_column()

            generate_button_mic.click(
                fn=tts_engine.process_mic_and_generate,
                inputs=[input_text_mic, mic_ref_audio, *settings_mic],
                outputs=outputs_mic,
            )

    return ui
tts_ui/utils/__init__.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import uuid
3
+ import shutil
4
+ from pathlib import Path
5
+ import ebooklib
6
+ from ebooklib import epub
7
+ from bs4 import BeautifulSoup
8
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
9
+ from yakinori import Yakinori
10
+ import regex as re
11
+ import numpy as np
12
+ import jaconv
13
+ import bunkai
14
+
15
# Scratch directory that holds the short-named copies made by shorten_filename().
tmp_dir = Path("/tmp/auralis")
# parents=True so the import does not fail on hosts where the parent
# directory is missing; exist_ok=True keeps repeated imports harmless.
tmp_dir.mkdir(parents=True, exist_ok=True)
18
+
19
+
20
def shorten_filename(original_path: str) -> str:
    """Copy a file into the temporary directory under a short random name.

    Args:
        original_path: Path of the file to copy.

    Returns:
        The path of the copy, as a string. The copy keeps the original
        file extension.
    """
    suffix = Path(original_path).suffix
    # Eight hex digits of a random UUID keep the name short.
    destination = tmp_dir / ("file_" + uuid.uuid4().hex[:8] + suffix)
    shutil.copyfile(original_path, destination)
    return str(destination)
27
+
28
+
29
def extract_text_from_epub(epub_path: str, output_path=None) -> str:
    """
    Extract the text of an EPUB file and optionally save it to a text file.

    Every document item of the book is parsed as HTML, ``<script>`` and
    ``<style>`` elements are removed, and blank fragments are dropped from the
    remaining text. Chapters are joined with a blank line and the guillemets
    ``«`` / ``»`` are replaced by plain double quotes. The text written to
    ``output_path`` is the same text that is returned.

    Args:
        epub_path (str): Path to the EPUB file
        output_path (str, optional): Path where to save the text file

    Returns:
        str: The extracted text
    """
    book: epub.EpubBook = epub.read_epub(epub_path)

    chapters: list[str] = []
    for item in book.get_items():
        # Only document items carry chapter markup; skip images, CSS, etc.
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue

        soup = BeautifulSoup(item.get_content().decode("utf-8"), "html.parser")

        # Scripts and styles contain no readable text.
        for tag in soup(["script", "style"]):
            tag.decompose()

        lines = (line.strip() for line in soup.get_text().splitlines())
        # Break only on double spaces (multiple headlines on one line).
        # Splitting on every single space would put each word on its own
        # line and destroy sentence boundaries for the later chunking step.
        phrases = (phrase.strip() for line in lines for phrase in line.split("  "))
        chapters.append("\n".join(phrase for phrase in phrases if phrase))

    # Normalise quotes before saving so the file and the return value agree.
    full_text: str = "\n\n".join(chapters).replace("»", '"').replace("«", '"')

    if output_path:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(full_text)

    return full_text
78
+
79
+
80
def text_from_file(txt_file_path: str) -> str:
    """Return the contents of a text file.

    The file is first copied to a short-named temporary file via
    ``shorten_filename`` and read from there.

    Args:
        txt_file_path: Path of the text file to read.

    Returns:
        The full file contents, decoded as UTF-8.
    """
    txt_short_path: str = shorten_filename(txt_file_path)
    # Explicit encoding: the platform default is not UTF-8 everywhere, and
    # the rest of this module reads and writes text as UTF-8.
    with open(txt_short_path, "r", encoding="utf-8") as f:
        return f.read()
86
+
87
+
88
def clone_voice(audio_path: str) -> str:
    """Return the base64-encoded contents of an audio file.

    The file is copied to a short-named temporary file via
    ``shorten_filename`` and the copy is read in binary mode.

    Args:
        audio_path: Path of the reference audio file.

    Returns:
        The file bytes encoded as a base64 string.
    """
    short_copy = shorten_filename(audio_path)
    with open(short_copy, "rb") as audio_file:
        raw_bytes = audio_file.read()
    encoded: str = base64.b64encode(raw_bytes).decode("utf-8")
    return encoded
95
+
96
+
97
def calculate_byte_size(text: str) -> int:
    """Return the number of bytes ``text`` occupies when encoded as UTF-8."""
    encoded = text.encode("utf-8")
    return len(encoded)
100
+
101
+
102
def is_japanese(text) -> bool:
    """Return True when ``text`` contains at least one Hiragana or Katakana character.

    Only the two kana scripts are checked; Kanji alone does not count,
    so text made up purely of CJK ideographs returns False.
    """
    kana_patterns = (r"[\p{Hiragana}]", r"[\p{Katakana}]")
    return any(re.search(pattern, text) is not None for pattern in kana_patterns)
109
+
110
+
111
def preprocess_japanese_text(text: str) -> str:
    """Convert Japanese text to a hiragana reading, sentence by sentence.

    The text is passed through ``jaconv.alphabet2kana`` and
    ``jaconv.normalize``, split into sentences with Bunkai, and every
    sentence is parsed by Yakinori and rendered with
    ``get_hiragana_sentence(..., is_hatsuon=True)``.

    Args:
        text: The text to convert.

    Returns:
        The converted sentences concatenated without a separator.
    """
    normalized_jp: str = jaconv.normalize(jaconv.alphabet2kana(text))

    # NOTE(review): both helpers are rebuilt on every call; presumably this is
    # not free - consider caching them if this shows up in profiles.
    yakinori = Yakinori()
    splitter = bunkai.Bunkai()

    # join() instead of repeated "+=" avoids quadratic string building.
    return "".join(
        yakinori.get_hiragana_sentence(
            yakinori.get_parsed_list(sentence), is_hatsuon=True
        )
        for sentence in splitter(normalized_jp)
    )
128
+
129
+
130
def convert_audio(data: np.ndarray) -> np.ndarray:
    """Convert floating-point audio samples to peak-normalised 16-bit PCM.

    Floating-point input is scaled so that its largest absolute sample maps
    to 32767 and is then cast to ``int16``. Silent or empty input is
    converted without normalisation (there is no peak to divide by).
    Non-float input is returned unchanged.

    Args:
        data: Audio samples of any dtype.

    Returns:
        An ``int16`` array for floating-point input, otherwise ``data`` itself.
    """
    if not np.issubdtype(data.dtype, np.floating):
        return data

    samples = data.astype(np.float32)
    # np.max raises on empty arrays, so treat "no samples" as a zero peak.
    peak = float(np.max(np.abs(samples))) if samples.size else 0.0
    # Dividing by a zero peak would fill the signal with NaN.
    if peak > 0.0:
        samples = samples / peak
    return (samples * 32767).astype(np.int16)
138
+
139
+
140
def split_text_into_chunks(
    text: str, chunk_size: int = 2000, chunk_overlap: int = 100
) -> list[str]:
    """
    Split text into chunks respecting a character limit and natural boundaries.

    When the text contains Hiragana or Katakana it is first converted with
    ``preprocess_japanese_text``, and the converted text is what gets split.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length, measured in characters (``len``).
        chunk_overlap: Number of characters shared between adjacent chunks.

    Returns:
        The list of chunks.
    """

    text_to_process = text

    # Tried in order, from the coarsest boundary to the finest.
    text_separators: list[str] = [
        "\n\n",
        "\n",
        "\u3002",  # ideographic full stop
        ".",
        "?",
        "!",
        "\uff1f",  # full-width question mark
        "\uff01",  # full-width exclamation mark
        ",",
        "\u3001",  # ideographic comma
        "\uff0c",  # full-width comma
        "\u300d",  # closing corner bracket
        "\u300f",  # closing white corner bracket
        "\uff0e",  # full-width full stop
        "",
    ]

    if is_japanese(text_to_process):
        text_to_process = preprocess_japanese_text(text_to_process)

    splitter = RecursiveCharacterTextSplitter(
        separators=text_separators,
        chunk_size=chunk_size,  # Optimized for TTS context windows
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

    # Split the preprocessed text; splitting the raw input here would
    # silently throw away the Japanese conversion done above.
    return splitter.split_text(text_to_process)
tts_ui/utils/doc_processor.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import markdown
2
+ import pdfplumber
3
+ from pathlib import Path
4
+ from tts_ui.utils import split_text_into_chunks, extract_text_from_epub, text_from_file
5
+
6
+
7
+ class DocumentProcessor:
8
+ def __init__(self, max_word_chunk_size=4000):
9
+ self.max_word_chunk_size: int = max_word_chunk_size # Characters per chunk
10
+
11
+ def process_doc(self, file_path: Path) -> list[str]:
12
+ # get the file extension from the path
13
+ ext: str = file_path.name.split(".")[-1].lower()
14
+
15
+ match ext:
16
+ case "pdf":
17
+ return self._process_pdf(file_path)
18
+ case "epub":
19
+ return self._process_epub(file_path)
20
+ case "md":
21
+ return self._process_markdown(file_path)
22
+ case "txt":
23
+ return self._process_text(file_path)
24
+ case _:
25
+ raise Exception(f"No file found in {file_path}")
26
+
27
+ def _process_pdf(self, file_path: str) -> list[str]:
28
+ text = ""
29
+ with pdfplumber.open(file_path) as pdf:
30
+ for page in pdf.pages:
31
+ text += page.extract_text() + "\n"
32
+ return self._chunk_text(text)
33
+
34
+ def _process_epub(self, file_path: str) -> list[str]:
35
+ text = extract_text_from_epub(file_path)
36
+ return self._chunk_text(text)
37
+
38
+ def _process_markdown(self, file_path: str) -> list[str]:
39
+ with open(file_path, "r") as f:
40
+ md_text: str = f.read()
41
+ return self._chunk_text(markdown.markdown(md_text))
42
+
43
+ def _process_text(self, file_path: str) -> list[str]:
44
+ text = text_from_file(file_path)
45
+ return self._chunk_text(text)
46
+
47
+ def _chunk_text(self, text: str) -> list[str]:
48
+ return split_text_into_chunks(text, self.max_word_chunk_size)
uv.lock ADDED
The diff for this file is too large to render. See raw diff