Spaces:

hoonsubin
/

gpt-tts-ui

Build error

App Files Files Community

hoonsubin committed on Feb 24

Commit

597e812

1 Parent(s): 96cdf65

add base proj

Browse files

Files changed (10) hide show

app.py +11 -5
pyproject.toml +24 -0
requirements.txt +746 -0
tts_ui/__init__.py +0 -0
tts_ui/tts/__init__.py +0 -0
tts_ui/tts/auralis_tts_engine.py +271 -0
tts_ui/ui/__init__.py +255 -0
tts_ui/utils/__init__.py +182 -0
tts_ui/utils/doc_processor.py +48 -0
uv.lock +0 -0

app.py CHANGED Viewed

@@ -1,7 +1,13 @@
-import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

+from tts_ui.tts.auralis_tts_engine import AuralisTTSEngine
+from tts_ui.ui import build_gradio_ui
+def main():
+    tts_engine = AuralisTTSEngine()
+    ui = build_gradio_ui(tts_engine)
+    ui.launch(debug=True)
+if __name__ == "__main__":  # start the UI only when run as a script, not on import
+    # asyncio.run(main())
+    main()

pyproject.toml ADDED Viewed

	@@ -0,0 +1,24 @@

+[project]
+name = "auralis-tts"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "auralis>=0.2.8.post2",
+    "bs4>=0.0.2",
+    "bunkai>=1.5.7",
+    "gradio>=5.17.1",
+    "jaconv>=0.4.0",
+    "langchain-text-splitters>=0.3.6",
+    "markdown>=3.7",
+    "nest-asyncio>=1.6.0",
+    "pdfplumber>=0.11.5",
+    "sudachidict-core>=20250129",
+    "sudachipy>=0.6.10",
+    "torch>=2.5.1",
+    "torchaudio>=2.5.1",
+    "torchvision>=0.20.1",
+    "unidic>=1.1.0",
+    "yakinori>=0.1.2",
+]

requirements.txt ADDED Viewed

	@@ -0,0 +1,746 @@

+# This file was autogenerated by uv via the following command:
+#    uv pip compile --output-file requirements.txt pyproject.toml
+aiofiles==23.2.1
+    # via
+    #   auralis
+    #   gradio
+aiohappyeyeballs==2.4.6
+    # via aiohttp
+aiohttp==3.11.12
+    # via
+    #   datasets
+    #   fsspec
+    #   vllm
+aiosignal==1.3.2
+    # via
+    #   aiohttp
+    #   ray
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.8.0
+    # via
+    #   gradio
+    #   httpx
+    #   openai
+    #   starlette
+    #   watchfiles
+asttokens==3.0.0
+    # via stack-data
+async-timeout==5.0.1
+    # via aiohttp
+attrs==25.1.0
+    # via
+    #   aiohttp
+    #   jsonschema
+    #   referencing
+audioread==3.0.1
+    # via librosa
+auralis==0.2.8.post2
+    # via auralis-tts (pyproject.toml)
+beautifulsoup4==4.13.3
+    # via
+    #   auralis
+    #   bs4
+blis==0.7.11
+    # via thinc
+bs4==0.0.2
+    # via auralis-tts (pyproject.toml)
+bunkai==1.5.7
+    # via auralis-tts (pyproject.toml)
+cachetools==5.5.2
+    # via auralis
+catalogue==2.0.10
+    # via
+    #   spacy
+    #   srsly
+    #   thinc
+certifi==2025.1.31
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+cffi==1.17.1
+    # via
+    #   cryptography
+    #   sounddevice
+    #   soundfile
+charset-normalizer==3.4.1
+    # via
+    #   pdfminer-six
+    #   requests
+click==8.1.8
+    # via
+    #   ray
+    #   typer
+    #   uvicorn
+cloudpathlib==0.20.0
+    # via weasel
+cloudpickle==3.1.1
+    # via outlines
+colorama==0.4.6
+    # via auralis
+compressed-tensors==0.8.0
+    # via vllm
+confection==0.1.5
+    # via
+    #   thinc
+    #   weasel
+cryptography==44.0.1
+    # via pdfminer-six
+cutlet==0.5.0
+    # via auralis
+cymem==2.0.11
+    # via
+    #   preshed
+    #   spacy
+    #   thinc
+dataclasses-json==0.6.7
+    # via bunkai
+datasets==2.14.4
+    # via outlines
+decorator==5.2.1
+    # via
+    #   ipython
+    #   librosa
+dill==0.3.7
+    # via
+    #   datasets
+    #   multiprocess
+diskcache==5.6.3
+    # via outlines
+distro==1.9.0
+    # via openai
+docopt==0.6.2
+    # via num2words
+ebooklib==0.18
+    # via auralis
+einops==0.8.1
+    # via
+    #   auralis
+    #   vllm
+emoji==2.14.1
+    # via bunkai
+emojis==0.7.0
+    # via bunkai
+exceptiongroup==1.2.2
+    # via
+    #   anyio
+    #   ipython
+    #   pytest
+executing==2.2.0
+    # via stack-data
+fastapi==0.115.8
+    # via
+    #   gradio
+    #   vllm
+ffmpeg==1.4
+    # via auralis
+ffmpy==0.5.0
+    # via gradio
+filelock==3.17.0
+    # via
+    #   huggingface-hub
+    #   ray
+    #   torch
+    #   transformers
+    #   vllm
+frozenlist==1.5.0
+    # via
+    #   aiohttp
+    #   aiosignal
+    #   ray
+fsspec==2025.2.0
+    # via
+    #   auralis
+    #   datasets
+    #   gradio-client
+    #   huggingface-hub
+    #   torch
+fugashi==1.4.0
+    # via cutlet
+future==1.0.0
+    # via pyloudnorm
+gguf==0.10.0
+    # via vllm
+gradio==5.17.1
+    # via auralis-tts (pyproject.toml)
+gradio-client==1.7.1
+    # via gradio
+h11==0.14.0
+    # via
+    #   httpcore
+    #   uvicorn
+hangul-romanize==0.1.0
+    # via auralis
+httpcore==1.0.7
+    # via httpx
+httptools==0.6.4
+    # via uvicorn
+httpx==0.28.1
+    # via
+    #   gradio
+    #   gradio-client
+    #   langsmith
+    #   openai
+    #   safehttpx
+huggingface-hub==0.29.1
+    # via
+    #   auralis
+    #   datasets
+    #   gradio
+    #   gradio-client
+    #   tokenizers
+    #   transformers
+idna==3.10
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+    #   yarl
+importlib-metadata==8.6.1
+    # via vllm
+iniconfig==2.0.0
+    # via pytest
+interegular==0.3.3
+    # via
+    #   lm-format-enforcer
+    #   outlines
+ipython==8.32.0
+    # via auralis
+jaconv==0.4.0
+    # via
+    #   auralis-tts (pyproject.toml)
+    #   cutlet
+    #   yakinori
+janome==0.5.0
+    # via bunkai
+jedi==0.19.2
+    # via ipython
+jinja2==3.1.5
+    # via
+    #   gradio
+    #   outlines
+    #   spacy
+    #   torch
+jiter==0.8.2
+    # via openai
+joblib==1.4.2
+    # via
+    #   librosa
+    #   scikit-learn
+jsonpatch==1.33
+    # via langchain-core
+jsonpointer==3.0.0
+    # via jsonpatch
+jsonschema==4.23.0
+    # via
+    #   mistral-common
+    #   outlines
+    #   ray
+jsonschema-specifications==2024.10.1
+    # via jsonschema
+langchain-core==0.3.37
+    # via langchain-text-splitters
+langchain-text-splitters==0.3.6
+    # via auralis-tts (pyproject.toml)
+langcodes==3.5.0
+    # via spacy
+langid==1.1.6
+    # via auralis
+langsmith==0.3.10
+    # via langchain-core
+language-data==1.3.0
+    # via langcodes
+lark==1.2.2
+    # via outlines
+lazy-loader==0.4
+    # via librosa
+librosa==0.10.2.post1
+    # via auralis
+llvmlite==0.44.0
+    # via numba
+lm-format-enforcer==0.10.10
+    # via vllm
+lxml==5.3.1
+    # via ebooklib
+marisa-trie==1.2.1
+    # via language-data
+markdown==3.7
+    # via auralis-tts (pyproject.toml)
+markdown-it-py==3.0.0
+    # via rich
+markupsafe==2.1.5
+    # via
+    #   gradio
+    #   jinja2
+marshmallow==3.26.1
+    # via dataclasses-json
+matplotlib-inline==0.1.7
+    # via ipython
+mdurl==0.1.2
+    # via markdown-it-py
+mecab-python3==1.0.10
+    # via yakinori
+mistral-common==1.5.3
+    # via vllm
+mojimoji==0.0.13
+    # via cutlet
+more-itertools==10.6.0
+    # via bunkai
+mpmath==1.3.0
+    # via sympy
+msgpack==1.1.0
+    # via
+    #   librosa
+    #   ray
+msgspec==0.19.0
+    # via vllm
+multidict==6.1.0
+    # via
+    #   aiohttp
+    #   yarl
+multiprocess==0.70.15
+    # via datasets
+murmurhash==1.0.12
+    # via
+    #   preshed
+    #   spacy
+    #   thinc
+mypy-extensions==1.0.0
+    # via typing-inspect
+nest-asyncio==1.6.0
+    # via
+    #   auralis-tts (pyproject.toml)
+    #   outlines
+networkx==3.4.2
+    # via
+    #   auralis
+    #   torch
+num2words==0.5.14
+    # via auralis
+numba==0.61.0
+    # via
+    #   librosa
+    #   outlines
+numpy==1.26.4
+    # via
+    #   auralis
+    #   blis
+    #   datasets
+    #   gguf
+    #   gradio
+    #   langid
+    #   librosa
+    #   mistral-common
+    #   numba
+    #   opencv-python-headless
+    #   outlines
+    #   pandas
+    #   pyloudnorm
+    #   scikit-learn
+    #   scipy
+    #   soundfile
+    #   soxr
+    #   spacy
+    #   thinc
+    #   torchvision
+    #   transformers
+    #   vllm
+nvidia-ml-py==12.570.86
+    # via
+    #   auralis
+    #   vllm
+openai==1.64.0
+    # via vllm
+opencc==1.1.9
+    # via auralis
+opencv-python-headless==4.11.0.86
+    # via mistral-common
+orjson==3.10.15
+    # via
+    #   gradio
+    #   langsmith
+outlines==0.0.46
+    # via vllm
+packaging==24.2
+    # via
+    #   auralis
+    #   datasets
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   langchain-core
+    #   lazy-loader
+    #   lm-format-enforcer
+    #   marshmallow
+    #   pooch
+    #   pytest
+    #   ray
+    #   spacy
+    #   thinc
+    #   transformers
+    #   weasel
+pandas==2.2.3
+    # via
+    #   datasets
+    #   gradio
+parso==0.8.4
+    # via jedi
+partial-json-parser==0.2.1.1.post5
+    # via vllm
+pdfminer-six==20231228
+    # via pdfplumber
+pdfplumber==0.11.5
+    # via auralis-tts (pyproject.toml)
+pexpect==4.9.0
+    # via ipython
+pillow==11.1.0
+    # via
+    #   gradio
+    #   mistral-common
+    #   pdfplumber
+    #   torchvision
+    #   vllm
+plac==1.4.3
+    # via unidic
+platformdirs==4.3.6
+    # via pooch
+pluggy==1.5.0
+    # via pytest
+pooch==1.8.2
+    # via librosa
+preshed==3.0.9
+    # via
+    #   spacy
+    #   thinc
+prometheus-client==0.21.1
+    # via
+    #   prometheus-fastapi-instrumentator
+    #   vllm
+prometheus-fastapi-instrumentator==7.0.2
+    # via vllm
+prompt-toolkit==3.0.50
+    # via ipython
+propcache==0.3.0
+    # via
+    #   aiohttp
+    #   yarl
+protobuf==5.29.3
+    # via
+    #   ray
+    #   vllm
+psutil==7.0.0
+    # via vllm
+ptyprocess==0.7.0
+    # via pexpect
+pure-eval==0.2.3
+    # via stack-data
+py-cpuinfo==9.0.0
+    # via vllm
+pyairports==2.1.1
+    # via outlines
+pyarrow==19.0.1
+    # via datasets
+pycountry==24.6.1
+    # via outlines
+pycparser==2.22
+    # via cffi
+pydantic==2.10.6
+    # via
+    #   compressed-tensors
+    #   confection
+    #   fastapi
+    #   gradio
+    #   langchain-core
+    #   langsmith
+    #   lm-format-enforcer
+    #   mistral-common
+    #   openai
+    #   outlines
+    #   spacy
+    #   thinc
+    #   vllm
+    #   weasel
+pydantic-core==2.27.2
+    # via pydantic
+pydub==0.25.1
+    # via gradio
+pygments==2.19.1
+    # via
+    #   ipython
+    #   rich
+pyloudnorm==0.1.1
+    # via auralis
+pypdfium2==4.30.1
+    # via pdfplumber
+pypinyin==0.53.0
+    # via auralis
+pytest==8.3.4
+    # via auralis
+python-dateutil==2.9.0.post0
+    # via pandas
+python-dotenv==1.0.1
+    # via uvicorn
+python-multipart==0.0.20
+    # via gradio
+pytz==2025.1
+    # via pandas
+pyyaml==6.0.2
+    # via
+    #   datasets
+    #   gguf
+    #   gradio
+    #   huggingface-hub
+    #   langchain-core
+    #   lm-format-enforcer
+    #   ray
+    #   transformers
+    #   uvicorn
+    #   vllm
+pyzmq==26.2.1
+    # via vllm
+ray==2.42.1
+    # via vllm
+referencing==0.36.2
+    # via
+    #   jsonschema
+    #   jsonschema-specifications
+    #   outlines
+regex==2024.11.6
+    # via
+    #   bunkai
+    #   tiktoken
+    #   transformers
+requests==2.32.3
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   langsmith
+    #   mistral-common
+    #   outlines
+    #   pooch
+    #   ray
+    #   requests-toolbelt
+    #   spacy
+    #   tiktoken
+    #   transformers
+    #   unidic
+    #   vllm
+    #   weasel
+requests-toolbelt==1.0.0
+    # via langsmith
+rich==13.9.4
+    # via typer
+rpds-py==0.23.1
+    # via
+    #   jsonschema
+    #   referencing
+ruff==0.9.7
+    # via gradio
+safehttpx==0.1.6
+    # via gradio
+safetensors==0.5.2
+    # via
+    #   auralis
+    #   transformers
+scikit-learn==1.6.1
+    # via librosa
+scipy==1.15.2
+    # via
+    #   librosa
+    #   pyloudnorm
+    #   scikit-learn
+semantic-version==2.10.0
+    # via gradio
+sentencepiece==0.2.0
+    # via
+    #   mistral-common
+    #   vllm
+setuptools==75.8.0
+    # via
+    #   auralis
+    #   marisa-trie
+    #   spacy
+    #   thinc
+shellingham==1.5.4
+    # via typer
+six==1.17.0
+    # via
+    #   ebooklib
+    #   python-dateutil
+smart-open==7.1.0
+    # via weasel
+sniffio==1.3.1
+    # via
+    #   anyio
+    #   openai
+sounddevice==0.5.1
+    # via auralis
+soundfile==0.13.1
+    # via
+    #   auralis
+    #   librosa
+soupsieve==2.6
+    # via beautifulsoup4
+soxr==0.5.0.post1
+    # via librosa
+spacy==3.7.5
+    # via auralis
+spacy-legacy==3.0.12
+    # via spacy
+spacy-loggers==1.0.5
+    # via spacy
+spans==1.1.1
+    # via bunkai
+srsly==2.5.1
+    # via
+    #   confection
+    #   spacy
+    #   thinc
+    #   weasel
+stack-data==0.6.3
+    # via ipython
+starlette==0.45.3
+    # via
+    #   fastapi
+    #   gradio
+    #   prometheus-fastapi-instrumentator
+sudachidict-core==20250129
+    # via auralis-tts (pyproject.toml)
+sudachipy==0.6.10
+    # via
+    #   auralis-tts (pyproject.toml)
+    #   sudachidict-core
+sympy==1.13.1
+    # via torch
+tenacity==9.0.0
+    # via langchain-core
+thinc==8.2.5
+    # via spacy
+threadpoolctl==3.5.0
+    # via scikit-learn
+tiktoken==0.9.0
+    # via
+    #   mistral-common
+    #   vllm
+tokenizers==0.21.0
+    # via
+    #   auralis
+    #   transformers
+    #   vllm
+toml==0.10.2
+    # via bunkai
+tomli==2.2.1
+    # via pytest
+tomlkit==0.13.2
+    # via gradio
+torch==2.5.1
+    # via
+    #   auralis-tts (pyproject.toml)
+    #   compressed-tensors
+    #   torchaudio
+    #   torchvision
+    #   vllm
+torchaudio==2.5.1
+    # via
+    #   auralis-tts (pyproject.toml)
+    #   auralis
+torchvision==0.20.1
+    # via
+    #   auralis-tts (pyproject.toml)
+    #   vllm
+tqdm==4.67.1
+    # via
+    #   bunkai
+    #   datasets
+    #   gguf
+    #   huggingface-hub
+    #   openai
+    #   outlines
+    #   spacy
+    #   transformers
+    #   unidic
+    #   vllm
+traitlets==5.14.3
+    # via
+    #   ipython
+    #   matplotlib-inline
+transformers==4.49.0
+    # via
+    #   auralis
+    #   compressed-tensors
+    #   vllm
+typer==0.15.1
+    # via
+    #   gradio
+    #   spacy
+    #   weasel
+typing-extensions==4.12.2
+    # via
+    #   anyio
+    #   beautifulsoup4
+    #   cloudpathlib
+    #   fastapi
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   ipython
+    #   langchain-core
+    #   librosa
+    #   mistral-common
+    #   multidict
+    #   openai
+    #   outlines
+    #   pydantic
+    #   pydantic-core
+    #   referencing
+    #   rich
+    #   torch
+    #   typer
+    #   typing-inspect
+    #   uvicorn
+    #   vllm
+typing-inspect==0.9.0
+    # via dataclasses-json
+tzdata==2025.1
+    # via pandas
+unidic==1.1.0
+    # via auralis-tts (pyproject.toml)
+urllib3==2.3.0
+    # via requests
+uvicorn==0.34.0
+    # via
+    #   gradio
+    #   vllm
+uvloop==0.21.0
+    # via uvicorn
+vllm==0.6.4.post1
+    # via auralis
+wasabi==0.10.1
+    # via
+    #   spacy
+    #   thinc
+    #   unidic
+    #   weasel
+watchfiles==1.0.4
+    # via uvicorn
+wcwidth==0.2.13
+    # via prompt-toolkit
+weasel==0.4.1
+    # via spacy
+websockets==14.2
+    # via
+    #   gradio-client
+    #   uvicorn
+wrapt==1.17.2
+    # via smart-open
+xxhash==3.5.0
+    # via datasets
+yakinori==0.1.2
+    # via auralis-tts (pyproject.toml)
+yarl==1.18.3
+    # via aiohttp
+zipp==3.21.0
+    # via importlib-metadata
+zstandard==0.23.0
+    # via langsmith

tts_ui/__init__.py ADDED Viewed

File without changes

tts_ui/tts/__init__.py ADDED Viewed

File without changes

tts_ui/tts/auralis_tts_engine.py ADDED Viewed

	@@ -0,0 +1,271 @@

+from auralis import TTS, TTSRequest, TTSOutput, setup_logger
+from gradio import File, Files, Slider
+import torch
+from tts_ui.utils import (
+    calculate_byte_size,
+    split_text_into_chunks,
+    tmp_dir,
+    extract_text_from_epub,
+    text_from_file,
+    convert_audio,
+)
+from tts_ui.utils.doc_processor import DocumentProcessor
+import hashlib
+import torchaudio
+import time
+from pathlib import Path
+# Loading the TTS engine first and assign it to the class.
+# This looks ugly, but it works
+logger = setup_logger(__file__)
+tts = TTS()
+model_path = "AstraMindAI/xttsv2"  # change this if you have a different model
+gpt_model = "AstraMindAI/xtts2-gpt"
+try:
+    tts: TTS = tts.from_pretrained(
+        model_name_or_path=model_path,
+        gpt_model=gpt_model,
+        enforce_eager=False,
+        max_seq_len_to_capture=4096,  # Match WSL2 page size
+        scheduler_max_concurrency=4,
+    )
+    logger.info(f"Successfully loaded model {model_path}")
+except Exception as e:
+    error_msg = f"Failed to load model: {e}."
+    logger.error(error_msg)
+    raise Exception(error_msg)
+class AuralisTTSEngine:
+    def __init__(self):
+        self.logger = logger
+        self.tts: TTS = tts
+        self.model_path: str = model_path
+        self.gpt_model: str = gpt_model
+        self.tmp_dir: Path = tmp_dir
+        self.doc_processor = DocumentProcessor
+    def process_text_and_generate(
+        self,
+        input_text: str,
+        ref_audio_files: str | list[str] | bytes | list[bytes],
+        speed: float,
+        enhance_speech: bool,
+        temperature: float,
+        top_p: float,
+        top_k: float,
+        repetition_penalty: float,
+        language: str = "auto",
+        *args,
+    ):
+        """Process text and generate audio."""
+        log_messages: str = ""
+        if not ref_audio_files:
+            log_messages += "Please provide at least one reference audio!\n"
+            return None, log_messages
+        input_size = calculate_byte_size(input_text)
+        # use the chunking process if the text is too large
+        if input_size > 45000:
+            self.logger.info(
+                f"Found {input_size} bytes of text. Switching to chunk mode."
+            )
+            # todo: this function has a couple of overlapping functions as normal processing. I need to optimize the code
+            return self._process_large_text(
+                input_text,
+                ref_audio_files,
+                speed,
+                enhance_speech,
+                temperature,
+                top_p,
+                top_k,
+                repetition_penalty,
+                language,
+            )
+        else:
+            try:
+                with torch.no_grad():
+                    # clone voices from all file paths (shorten them)
+                    base64_voices: str | list[str] | bytes | list[bytes] = (
+                        ref_audio_files[:5]
+                    )
+                    request = TTSRequest(
+                        text=input_text,
+                        speaker_files=base64_voices,
+                        stream=False,
+                        enhance_speech=enhance_speech,
+                        temperature=temperature,
+                        top_p=top_p,
+                        top_k=top_k,
+                        repetition_penalty=repetition_penalty,
+                        language=language,
+                    )
+                    output: TTSOutput = self.tts.generate_speech(request)
+                    if output:
+                        if speed != 1:
+                            output.change_speed(speed)
+                        log_messages += f"✅ Successfully Generated audio\n"
+                        self.logger.info(log_messages)
+                        # return the sample rate and the audio file as a byte array
+                        return (
+                            output.sample_rate,
+                            convert_audio(output.array),
+                        ), log_messages
+                    else:
+                        log_messages += "❌ No output was generated. Check that the model was correctly loaded\n"
+                        return None, log_messages
+            except Exception as e:
+                self.logger.error(f"Error: {e}")
+                log_messages += f"❌ An Error occured: {e}\n"
+                return None, log_messages
+    def _process_large_text(
+        self,
+        input_full_text: str,
+        ref_audio_files: str | list[str] | bytes | list[bytes],
+        speed: float,
+        enhance_speech: bool,
+        temperature: float,
+        top_p: float,
+        top_k: float,
+        repetition_penalty: float,
+        language: str = "auto",
+    ):
+        """Process text in chunks and combine results"""
+        log_messages: str = ""
+        if not ref_audio_files:
+            log_messages += "Please provide at least one reference audio!\n"
+            return None, log_messages
+        base64_voices: str | list[str] | bytes | list[bytes] = ref_audio_files[:5]
+        chunks: list[str] = split_text_into_chunks(input_full_text)
+        print(f"Created {len(chunks)} chunks")
+        audio_segments: list[TTSOutput] = []
+        for idx, chunk in enumerate(chunks):
+            request = TTSRequest(
+                text=chunk,
+                speaker_files=base64_voices,
+                stream=False,
+                enhance_speech=enhance_speech,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repetition_penalty=repetition_penalty,
+                language=language,
+            )
+            try:
+                with torch.no_grad():
+                    audio = self.tts.generate_speech(request)
+                    audio_segments.append(audio)
+                    self.logger.info(f"Processed {idx + 1} chunks out of {len(chunks)}")
+            except Exception as e:
+                log_messages += f"❌ Chunk processing failed: {e}\n"
+                return None, log_messages
+        if len(audio_segments) <= 0:
+            log_messages += f"❌ Chunk processing failed. Chunk size: {len(chunks)}\n"
+            return None, log_messages
+        combined_output: TTSOutput = TTSOutput.combine_outputs(audio_segments)
+        if speed != 1:
+            combined_output.change_speed(speed)
+        log_messages += f"✅ Successfully Generated audio\n"
+        # return combined_output
+        return (
+            combined_output.sample_rate,
+            convert_audio(combined_output.array),
+        ), log_messages
+    def process_file_and_generate(
+        self,
+        file_input: File,
+        ref_audio_files_file: Files,
+        speed_file: Slider,
+        enhance_speech_file,
+        temperature_file,
+        top_p_file,
+        top_k_file,
+        repetition_penalty_file,
+        language_file,
+    ):
+        # todo: refactor this to use the document processor object
+        if file_input:
+            file_extension: str = Path(file_input.name).suffix
+            match file_extension:
+                case ".epub":
+                    input_text: str = extract_text_from_epub(file_input.name)
+                case ".txt" | ".md":
+                    input_text = text_from_file(file_input.name)
+                case _:
+                    return (
+                        None,
+                        "Unsupported file format, it needs to be either .epub or .txt",
+                    )
+            return self._process_large_text(
+                input_text,
+                ref_audio_files_file,
+                speed_file,
+                enhance_speech_file,
+                temperature_file,
+                top_p_file,
+                top_k_file,
+                repetition_penalty_file,
+                language_file,
+            )
+        else:
+            return None, "Please provide an .epub or .txt file!"
+    def process_mic_and_generate(
+        self,
+        input_text_mic,
+        mic_ref_audio,
+        speed_mic,
+        enhance_speech_mic,
+        temperature_mic,
+        top_p_mic,
+        top_k_mic,
+        repetition_penalty_mic,
+        language_mic,
+    ):
+        if mic_ref_audio:
+            data: bytes = str(time.time()).encode("utf-8")
+            hash: str = hashlib.sha1(data).hexdigest()[:10]
+            output_path = self.tmp_dir / (f"mic_{hash}.wav")
+            torch_audio: torch.Tensor = torch.from_numpy(mic_ref_audio[1].astype(float))
+            try:
+                torchaudio.save(
+                    str(output_path), torch_audio.unsqueeze(0), mic_ref_audio[0]
+                )
+                return self.process_text_and_generate(
+                    input_text_mic,
+                    [Path(output_path)],
+                    speed_mic,
+                    enhance_speech_mic,
+                    temperature_mic,
+                    top_p_mic,
+                    top_k_mic,
+                    repetition_penalty_mic,
+                    language_mic,
+                )
+            except Exception as e:
+                self.logger.error(f"Error saving audio file: {e}")
+                return None, f"Error saving audio file: {e}"
+        else:
+            return None, "Please record an audio!"

tts_ui/ui/__init__.py ADDED Viewed

	@@ -0,0 +1,255 @@

+import gradio as gr
+from tts_ui.utils import *
+from tts_ui.tts.auralis_tts_engine import AuralisTTSEngine
+supported_langs: list[str] = [  # choices offered by the "Target Language" dropdowns
+    "en",
+    "es",
+    "fr",
+    "de",
+    "it",
+    "pt",
+    "pl",
+    "tr",
+    "ru",
+    "nl",
+    "cs",
+    "ar",
+    "zh-cn",
+    "hu",
+    "ko",
+    "ja",
+    "hi",
+    "auto",  # the dropdowns' default selection
+]
+def build_gradio_ui(tts_engine: AuralisTTSEngine) -> gr.Blocks:
+    """Builds and launches the Gradio UI for Auralis."""
+    with gr.Blocks(title="Auralis TTS UI", theme="soft") as ui:
+        gr.Markdown(
+            """
+          # Text-to-Speech Interface
+          Convert text to speech with advanced voice cloning and enhancement.
+          Powered by Auralis 🌌 made by Hoon
+          """
+        )
+        with gr.Tab("Text to Speech"):
+            with gr.Row():
+                with gr.Column():
+                    input_text = gr.Text(
+                        label="Enter Text Here",
+                        placeholder="Write the text you want to convert...",
+                    )
+                    ref_audio_files = gr.Files(
+                        label="Reference Audio Files", file_types=["audio"]
+                    )
+                    with gr.Accordion("Advanced settings", open=False):
+                        speed = gr.Slider(
+                            label="Playback speed",
+                            minimum=0.5,
+                            maximum=2.0,
+                            value=1.0,
+                            step=0.1,
+                        )
+                        enhance_speech = gr.Checkbox(
+                            label="Enhance Reference Speech", value=False
+                        )
+                        temperature = gr.Slider(
+                            label="Temperature",
+                            minimum=0.5,
+                            maximum=1.0,
+                            value=0.75,
+                            step=0.05,
+                        )
+                        top_p = gr.Slider(
+                            label="Top P",
+                            minimum=0.5,
+                            maximum=1.0,
+                            value=0.85,
+                            step=0.05,
+                        )
+                        top_k = gr.Slider(
+                            label="Top K", minimum=0, maximum=100, value=50, step=10
+                        )
+                        repetition_penalty = gr.Slider(
+                            label="Repetition penalty",
+                            minimum=1.0,
+                            maximum=10.0,
+                            value=5.0,
+                            step=0.5,
+                        )
+                        language = gr.Dropdown(
+                            label="Target Language",
+                            choices=supported_langs,
+                            value="auto",
+                        )
+                    generate_button = gr.Button("Generate Speech")
+                with gr.Column():
+                    audio_output = gr.Audio(label="Generated Audio")
+                    log_output = gr.Text(label="Log Output")
+            generate_button.click(
+                fn=tts_engine.process_text_and_generate,
+                inputs=[
+                    input_text,
+                    ref_audio_files,
+                    speed,
+                    enhance_speech,
+                    temperature,
+                    top_p,
+                    top_k,
+                    repetition_penalty,
+                    language,
+                ],
+                outputs=[audio_output, log_output],
+            )
+        with gr.Tab("File to Speech"):
+            with gr.Row():
+                with gr.Column():
+                    file_input = gr.File(
+                        label="Text / Ebook File", file_types=[".txt", ".md", ".epub"]
+                    )
+                    ref_audio_files_file = gr.Files(
+                        label="Reference Audio Files", file_types=["audio"]
+                    )
+                    with gr.Accordion("Advanced settings", open=False):
+                        speed_file = gr.Slider(
+                            label="Playback speed",
+                            minimum=0.5,
+                            maximum=2.0,
+                            value=1.0,
+                            step=0.1,
+                        )
+                        enhance_speech_file = gr.Checkbox(
+                            label="Enhance Reference Speech", value=False
+                        )
+                        temperature_file = gr.Slider(
+                            label="Temperature",
+                            minimum=0.5,
+                            maximum=1.0,
+                            value=0.75,
+                            step=0.05,
+                        )
+                        top_p_file = gr.Slider(
+                            label="Top P",
+                            minimum=0.5,
+                            maximum=1.0,
+                            value=0.85,
+                            step=0.05,
+                        )
+                        top_k_file = gr.Slider(
+                            label="Top K", minimum=0, maximum=100, value=50, step=10
+                        )
+                        repetition_penalty_file = gr.Slider(
+                            label="Repetition penalty",
+                            minimum=1.0,
+                            maximum=10.0,
+                            value=5.0,
+                            step=0.5,
+                        )
+                        language_file = gr.Dropdown(
+                            label="Target Language",
+                            choices=supported_langs,
+                            value="auto",
+                        )
+                    generate_button_file = gr.Button("Generate Speech from File")
+                with gr.Column():
+                    audio_output_file = gr.Audio(label="Generated Audio")
+                    log_output_file = gr.Text(label="Log Output")
+            generate_button_file.click(
+                tts_engine.process_file_and_generate,
+                inputs=[
+                    file_input,
+                    ref_audio_files_file,
+                    speed_file,
+                    enhance_speech_file,
+                    temperature_file,
+                    top_p_file,
+                    top_k_file,
+                    repetition_penalty_file,
+                    language_file,
+                ],
+                outputs=[audio_output_file, log_output_file],
+            )
+        with gr.Tab("Clone With Microphone"):
+            with gr.Row():
+                with gr.Column():
+                    input_text_mic = gr.Text(
+                        label="Enter Text Here",
+                        placeholder="Write the text you want to convert...",
+                    )
+                    mic_ref_audio = gr.Audio(
+                        label="Record Reference Audio", sources=["microphone"]
+                    )
+                    with gr.Accordion("Advanced settings", open=False):
+                        speed_mic = gr.Slider(
+                            label="Playback speed",
+                            minimum=0.5,
+                            maximum=2.0,
+                            value=1.0,
+                            step=0.1,
+                        )
+                        enhance_speech_mic = gr.Checkbox(
+                            label="Enhance Reference Speech", value=True
+                        )
+                        temperature_mic = gr.Slider(
+                            label="Temperature",
+                            minimum=0.5,
+                            maximum=1.0,
+                            value=0.75,
+                            step=0.05,
+                        )
+                        top_p_mic = gr.Slider(
+                            label="Top P",
+                            minimum=0.5,
+                            maximum=1.0,
+                            value=0.85,
+                            step=0.05,
+                        )
+                        top_k_mic = gr.Slider(
+                            label="Top K", minimum=0, maximum=100, value=50, step=10
+                        )
+                        repetition_penalty_mic = gr.Slider(
+                            label="Repetition penalty",
+                            minimum=1.0,
+                            maximum=10.0,
+                            value=5.0,
+                            step=0.5,
+                        )
+                        language_mic = gr.Dropdown(
+                            label="Target Language",
+                            choices=supported_langs,
+                            value="auto",
+                        )
+                    generate_button_mic = gr.Button("Generate Speech")
+                with gr.Column():
+                    audio_output_mic = gr.Audio(label="Generated Audio")
+                    log_output_mic = gr.Text(label="Log Output")
+            generate_button_mic.click(
+                fn=tts_engine.process_mic_and_generate,
+                inputs=[
+                    input_text_mic,
+                    mic_ref_audio,
+                    speed_mic,
+                    enhance_speech_mic,
+                    temperature_mic,
+                    top_p_mic,
+                    top_k_mic,
+                    repetition_penalty_mic,
+                    language_mic,
+                ],
+                outputs=[audio_output_mic, log_output_mic],
+            )
+    return ui

tts_ui/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,182 @@

+import base64
+import uuid
+import shutil
+from pathlib import Path
+import ebooklib
+from ebooklib import epub
+from bs4 import BeautifulSoup
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from yakinori import Yakinori
+import regex as re
+import numpy as np
+import jaconv
+import bunkai
# Scratch directory that receives the short-named file copies.
# parents=True keeps the import from failing when the parent directory is
# missing; exist_ok=True makes repeated imports a no-op.
tmp_dir = Path("/tmp/auralis")
tmp_dir.mkdir(parents=True, exist_ok=True)
def shorten_filename(original_path: str) -> str:
    """Copy a file into the scratch directory under a short random name.

    The copy keeps the source file's extension; its stem becomes ``file_``
    followed by the first eight hex characters of a fresh UUID4.

    Args:
        original_path: Path of the file to copy.

    Returns:
        The path of the newly created copy, as a string.
    """
    suffix = Path(original_path).suffix
    random_tag = uuid.uuid4().hex[:8]
    destination = tmp_dir / ("file_" + random_tag + suffix)
    shutil.copyfile(original_path, destination)
    return str(destination)
def extract_text_from_epub(epub_path: str, output_path: str | None = None) -> str:
    """Extract the plain text of an EPUB and optionally save it to a file.

    Every document item of the book is parsed as HTML, ``<script>`` and
    ``<style>`` elements are removed, and the remaining text is cleaned up:
    lines are stripped, split on double spaces, and empty fragments dropped.
    Chapters are joined with a blank line, and the guillemets ``»`` / ``«``
    are replaced by straight double quotes.

    Args:
        epub_path: Path to the EPUB file.
        output_path: If given (and non-empty), the extracted text is also
            written to this path as UTF-8.

    Returns:
        The extracted text. The same text is written to ``output_path``.
    """
    book = epub.read_epub(epub_path)

    chapters: list[str] = []
    for item in book.get_items():
        # Only document items carry readable content.
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue

        soup = BeautifulSoup(item.get_content().decode("utf-8"), "html.parser")

        # Script and style bodies are not meant to be read aloud.
        for tag in soup(["script", "style"]):
            tag.decompose()

        stripped_lines = (line.strip() for line in soup.get_text().splitlines())
        phrases = (
            phrase.strip() for line in stripped_lines for phrase in line.split("  ")
        )
        chapters.append("\n".join(phrase for phrase in phrases if phrase))

    # Normalise the quotes once, BEFORE saving, so the file on disk and the
    # returned string are identical (previously only the return value was
    # normalised).
    full_text = "\n\n".join(chapters).replace("»", '"').replace("«", '"')

    if output_path:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(full_text)

    return full_text
def text_from_file(txt_file_path: str) -> str:
    """Read a text file and return its contents.

    The file is first copied to a short-named temporary file via
    ``shorten_filename`` and the copy is what gets read.

    Args:
        txt_file_path: Path of the text file to read.

    Returns:
        The full contents of the file, decoded as UTF-8.
    """
    txt_short_path: str = shorten_filename(txt_file_path)
    # Decode explicitly as UTF-8: the platform default depends on the locale
    # and can fail or garble non-ASCII (e.g. Japanese) input.
    with open(txt_short_path, "r", encoding="utf-8") as f:
        return f.read()
def clone_voice(audio_path: str) -> str:
    """Return the contents of an audio file as a base64 string.

    The file is copied to a short-named temporary file via
    ``shorten_filename`` and the copy's bytes are encoded.

    Args:
        audio_path: Path of the reference audio file.

    Returns:
        The base64 encoding of the file's bytes, as text.
    """
    short_copy = Path(shorten_filename(audio_path))
    encoded = base64.b64encode(short_copy.read_bytes())
    return encoded.decode("utf-8")
def calculate_byte_size(text: str) -> int:
    """Return the number of bytes ``text`` occupies when encoded as UTF-8."""
    utf8_bytes = bytes(text, "utf-8")
    return len(utf8_bytes)
def is_japanese(text) -> bool:
    """Report whether ``text`` contains at least one kana character.

    Only Hiragana and Katakana are tested; Kanji are not checked, so text
    written purely in Kanji is not detected.
    """
    kana_classes = (r"[\p{Hiragana}]", r"[\p{Katakana}]")
    # any() stops at the first class that matches, like the `or` it replaces.
    return any(re.search(pattern, text) for pattern in kana_classes)
def preprocess_japanese_text(text: str) -> str:
    """Convert Japanese text to a hiragana reading, sentence by sentence.

    Steps, in order:
      1. ``jaconv.alphabet2kana`` is applied to the input.
      2. The result is passed through ``jaconv.normalize``.
      3. Bunkai splits the normalised text into sentences.
      4. Each sentence is parsed by Yakinori and rendered with
         ``get_hiragana_sentence(..., is_hatsuon=True)``.

    Args:
        text: The text to convert.

    Returns:
        The converted sentences concatenated without any separator.
    """
    kana_text: str = jaconv.alphabet2kana(text)
    normalized: str = jaconv.normalize(kana_text)

    yakinori = Yakinori()
    sentence_splitter = bunkai.Bunkai()

    # Join once instead of growing a string with `+=` inside the loop.
    return "".join(
        yakinori.get_hiragana_sentence(
            yakinori.get_parsed_list(sentence), is_hatsuon=True
        )
        for sentence in sentence_splitter(normalized)
    )
def convert_audio(data: np.ndarray) -> np.ndarray:
    """Convert floating-point audio samples to 16-bit PCM.

    Floating-point input is peak-normalised (divided by its largest absolute
    value) and scaled to the int16 range. Input of any other dtype is
    returned unchanged.

    Silent (all-zero) and empty input are handled explicitly: normalisation
    is skipped, so the result is zeros / an empty array instead of a
    division by zero or a ``ValueError`` from ``np.max``.

    Args:
        data: Audio samples.

    Returns:
        An ``int16`` array for floating-point input, otherwise ``data`` itself.
    """
    if not np.issubdtype(data.dtype, np.floating):
        return data

    samples = data.astype(np.float32)
    if data.size:
        peak = np.max(np.abs(data))
        # A zero peak means pure silence; dividing would yield NaNs.
        if peak > 0:
            samples = samples / peak

    # Scale to the 16-bit integer range.
    return (samples * 32767).astype(np.int16)
def split_text_into_chunks(
    text: str, chunk_size: int = 2000, chunk_overlap: int = 100
) -> list[str]:
    """Split text into overlapping chunks at natural boundaries.

    If the text contains kana (see ``is_japanese``), it is first converted
    with ``preprocess_japanese_text`` and the converted text is what gets
    split. Chunk length is measured with ``len``, i.e. in characters, not
    bytes.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length, in characters.
        chunk_overlap: Number of characters shared between adjacent chunks.

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    # Tried in order: paragraph and line breaks first, then sentence and
    # clause punctuation (Japanese and Latin), and finally the empty string.
    # NOTE(review): the four \u escapes near the end appear to duplicate
    # literal entries listed earlier; kept as-is.
    text_separators: list[str] = [
        "\n\n",
        "\n",
        "。",
        "．",
        "？",
        "！",
        "?",
        "!",
        ",",
        "、",
        "，",
        "」",
        "』",
        "\u3002",
        "\uff0c",
        "\u3001",
        "\uff0e",
        "",
    ]

    text_to_process = text
    if is_japanese(text_to_process):
        text_to_process = preprocess_japanese_text(text_to_process)

    splitter = RecursiveCharacterTextSplitter(
        separators=text_separators,
        chunk_size=chunk_size,  # Optimized for TTS context windows
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    # Split the (possibly preprocessed) text. Previously the original `text`
    # was passed here, silently discarding the Japanese preprocessing.
    return splitter.split_text(text_to_process)

tts_ui/utils/doc_processor.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import markdown
+import pdfplumber
+from pathlib import Path
+from tts_ui.utils import split_text_into_chunks, extract_text_from_epub, text_from_file
+class DocumentProcessor:
+    def __init__(self, max_word_chunk_size=4000):
+        self.max_word_chunk_size: int = max_word_chunk_size  # Characters per chunk
+    def process_doc(self, file_path: Path) -> list[str]:
+        # get the file extension from the path
+        ext: str = file_path.name.split(".")[-1].lower()
+        match ext:
+            case "pdf":
+                return self._process_pdf(file_path)
+            case "epub":
+                return self._process_epub(file_path)
+            case "md":
+                return self._process_markdown(file_path)
+            case "txt":
+                return self._process_text(file_path)
+            case _:
+                raise Exception(f"No file found in {file_path}")
+    def _process_pdf(self, file_path: str) -> list[str]:
+        text = ""
+        with pdfplumber.open(file_path) as pdf:
+            for page in pdf.pages:
+                text += page.extract_text() + "\n"
+        return self._chunk_text(text)
+    def _process_epub(self, file_path: str) -> list[str]:
+        text = extract_text_from_epub(file_path)
+        return self._chunk_text(text)
+    def _process_markdown(self, file_path: str) -> list[str]:
+        with open(file_path, "r") as f:
+            md_text: str = f.read()
+        return self._chunk_text(markdown.markdown(md_text))
+    def _process_text(self, file_path: str) -> list[str]:
+        text = text_from_file(file_path)
+        return self._chunk_text(text)
+    def _chunk_text(self, text: str) -> list[str]:
+        return split_text_into_chunks(text, self.max_word_chunk_size)

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff