# tts_ui/utils/__init__.py
import base64
import uuid
import shutil
from pathlib import Path
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
from langchain_text_splitters import RecursiveCharacterTextSplitter
from yakinori import Yakinori
import regex as re
import numpy as np
import jaconv
import bunkai
from typing import Iterator
# Create a temporary directory to store short-named files
tmp_dir = Path("/tmp/auralis")
tmp_dir.mkdir(parents=True, exist_ok=True)
def shorten_filename(original_path: str) -> str:
"""Copies the given file to a temporary directory with a shorter, random filename."""
ext: str = Path(original_path).suffix
short_name: str = "file_" + uuid.uuid4().hex[:8] + ext
short_path: Path = tmp_dir / short_name
shutil.copyfile(original_path, short_path)
return str(short_path)
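
# Illustrative usage (the path is hypothetical and the hex suffix is random
# per call):
#   shorten_filename("/uploads/My Very Long Audiobook Title.epub")
#   -> "/tmp/auralis/file_3fa92b1c.epub"
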
def extract_text_from_epub(epub_path: str, output_path=None) -> str:
"""
Extracts text from an EPUB file and optionally saves it to a text file.
Args:
epub_path (str): Path to the EPUB file
output_path (str, optional): Path where to save the text file
Returns:
str: The extracted text
"""
# Load the book
book: epub.EpubBook = epub.read_epub(epub_path)
# List to hold extracted text
chapters: list[str] = []
# Extract text from each chapter
for item in book.get_items():
if item.get_type() == ebooklib.ITEM_DOCUMENT:
# Get HTML content
html_content = item.get_content().decode("utf-8")
# Use BeautifulSoup to extract text
soup = BeautifulSoup(html_content, "html.parser")
# Remove scripts and styles
for script in soup(["script", "style"]):
script.decompose()
# Get text
text: str = soup.get_text()
# Clean text
            lines = (line.strip() for line in text.splitlines())
            # Split on double spaces so single spaces between words survive;
            # splitting on a single space would put every word on its own line
            chunks = (
                phrase.strip() for line in lines for phrase in line.split("  ")
            )
            text = "\n".join(chunk for chunk in chunks if chunk)
chapters.append(text)
# Join all chapters
full_text: str = "\n\n".join(chapters)
# Save text if output path is specified
if output_path:
with open(output_path, "w", encoding="utf-8") as f:
f.write(full_text)
return full_text.replace("»", '"').replace("«", '"')
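
# Illustrative usage (assumes a local "book.epub" exists):
#   text = extract_text_from_epub("book.epub", output_path="book.txt")
# Chapters are joined with blank lines; guillemets are normalized to '"'.
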
def text_from_file(txt_file_path: str) -> str:
# Shorten filename before reading
txt_short_path: str = shorten_filename(txt_file_path)
    with open(txt_short_path, "r", encoding="utf-8") as f:
text: str = f.read()
return text
def clone_voice(audio_path: str) -> str:
"""Clone a voice from an audio path."""
# Shorten filename before reading
audio_short_path: str = shorten_filename(audio_path)
with open(audio_short_path, "rb") as f:
audio_data: str = base64.b64encode(f.read()).decode("utf-8")
return audio_data
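
# The returned string is a base64-encoded copy of the raw audio bytes, suitable
# for embedding in a JSON request body (assumption: the TTS backend accepts
# base64-encoded reference audio).
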
def calculate_byte_size(text: str) -> int:
"""Calculate UTF-8 encoded byte size of text"""
return len(text.encode("utf-8"))
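
# Example: calculate_byte_size("あio") == 5, since "あ" is 3 bytes in UTF-8
# while "i" and "o" are 1 byte each.
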
def is_japanese(text: str) -> bool:
# Regex patterns for Hiragana, Katakana, and common Kanji/CJK unified blocks
hiragana = r"[\p{Hiragana}]"
katakana = r"[\p{Katakana}]"
# Check for Hiragana or Katakana (unique to Japanese)
return bool(re.search(hiragana, text) or re.search(katakana, text))
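
# Examples:
#   is_japanese("こんにちは world")  -> True   (contains Hiragana)
#   is_japanese("北京")              -> False  (Kanji alone is shared with Chinese)
#   is_japanese("hello")             -> False
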
def preprocess_japanese_text(text: str) -> str:
alpha2kana: str = jaconv.alphabet2kana(text)
normalized_jp: str = jaconv.normalize(alpha2kana)
yakinori = Yakinori()
splitter = bunkai.Bunkai()
    sentences: Iterator[str] = splitter(normalized_jp)
final: str = ""
for sentence in sentences:
parsed_list: list[str] = yakinori.get_parsed_list(sentence)
final += yakinori.get_hiragana_sentence(parsed_list, is_hatsuon=True)
return final
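
# Rough sketch of the pipeline (the exact kana output depends on Yakinori's
# dictionary, so the result below is illustrative, not guaranteed):
#   preprocess_japanese_text("漢字です。") -> "かんじです。"
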
def convert_audio(data: np.ndarray) -> np.ndarray:
"""Convert any float format to proper 16-bit PCM"""
    if data.dtype in [np.float16, np.float32, np.float64]:
        # Normalize to the [-1, 1] range; guard against silent (all-zero)
        # input, which would otherwise cause a division by zero
        peak = np.max(np.abs(data))
        if peak > 0:
            data = data.astype(np.float32) / peak
        # Scale to 16-bit int range
        data = (data * 32767).astype(np.int16)
return data
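
# Example: a float32 signal peaking at 0.5 is rescaled so its peak maps to the
# full 16-bit range; integer input passes through unchanged.
#   convert_audio(np.array([0.0, 0.5, -0.5], dtype=np.float32))
#   -> [0, 32767, -32767] as int16
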
def split_text_into_chunks(
text: str, chunk_size: int = 2000, chunk_overlap: int = 100
) -> list[str]:
"""
Split text into chunks respecting byte limits and natural boundaries.
This function also automatically converts Japanese Kanji into Kana for better readability.
"""
text_to_process = text
    text_separators: list[str] = [
        "\n\n",
        "\n",
        "。",  # Japanese full stop (U+3002)
        ".",
        "?",
        "!",
        "?",  # fullwidth question mark
        "!",  # fullwidth exclamation mark
        ",",  # fullwidth comma (U+FF0C)
        "、",  # Japanese comma (U+3001)
        ",",
        "」",  # closing corner bracket
        "』",  # closing double corner bracket
        ".",  # fullwidth full stop (U+FF0E)
        "",  # final fallback: split anywhere
    ]
if is_japanese(text_to_process):
text_to_process = preprocess_japanese_text(text_to_process)
splitter = RecursiveCharacterTextSplitter(
separators=text_separators,
chunk_size=chunk_size, # Optimized for TTS context windows
chunk_overlap=chunk_overlap,
length_function=len,
is_separator_regex=False,
)
return splitter.split_text(text)
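

# Minimal smoke test (a sketch, assuming the Yakinori/bunkai dependencies and
# their dictionaries are installed; not executed on import):
if __name__ == "__main__":
    demo_text = "これはテストです。This is a test sentence for chunking."
    for i, chunk in enumerate(
        split_text_into_chunks(demo_text, chunk_size=50, chunk_overlap=0)
    ):
        print(f"chunk {i}: {chunk!r}")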