File size: 5,231 Bytes
597e812
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import base64
import shutil
import uuid
from collections.abc import Iterator
from pathlib import Path

import bunkai
import ebooklib
import jaconv
import numpy as np
import regex as re
from bs4 import BeautifulSoup
from ebooklib import epub
from langchain_text_splitters import RecursiveCharacterTextSplitter
from yakinori import Yakinori

# Temporary directory used by shorten_filename() to hold copies of input
# files under short random names.
tmp_dir = Path("/tmp/auralis")
# parents=True makes creation robust even if an ancestor directory is
# missing; exist_ok=True keeps re-imports idempotent.
tmp_dir.mkdir(parents=True, exist_ok=True)


def shorten_filename(original_path: str) -> str:
    """Copy *original_path* into the temp dir under a short random name.

    The original extension is kept; the stem is replaced by
    ``file_<8 hex chars>``.  Returns the new path as a string.
    """
    suffix: str = Path(original_path).suffix
    destination: Path = tmp_dir / f"file_{uuid.uuid4().hex[:8]}{suffix}"
    shutil.copyfile(original_path, destination)
    return str(destination)


def extract_text_from_epub(epub_path: str, output_path=None) -> str:
    """
    Extracts text from an EPUB file and optionally saves it to a text file.

    Args:
        epub_path (str): Path to the EPUB file
        output_path (str, optional): Path where to save the text file

    Returns:
        str: The extracted text (with guillemets normalized to straight quotes)
    """
    book: epub.EpubBook = epub.read_epub(epub_path)

    # One cleaned-up text blob per document item (chapter).
    chapters: list[str] = []

    for item in book.get_items():
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue

        # Parse the chapter HTML and drop non-content tags.
        soup = BeautifulSoup(item.get_content().decode("utf-8"), "html.parser")
        for tag in soup(["script", "style"]):
            tag.decompose()

        raw: str = soup.get_text()

        # Strip each line, split on double spaces, and keep only
        # non-empty fragments, one per output line.
        fragments: list[str] = []
        for line in raw.splitlines():
            for phrase in line.strip().split("  "):
                cleaned = phrase.strip()
                if cleaned:
                    fragments.append(cleaned)

        chapters.append("\n".join(fragments))

    full_text: str = "\n\n".join(chapters)

    # Optionally persist the raw (un-normalized) text.
    if output_path:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(full_text)

    return full_text.replace("»", '"').replace("«", '"')


def text_from_file(txt_file_path: str) -> str:
    """Read a text file and return its contents as a string.

    The file is first copied to a short temp-dir path via
    shorten_filename().  Opened explicitly as UTF-8 — consistent with the
    encoding this module uses when writing text — rather than relying on
    the platform default encoding.
    """
    txt_short_path: str = shorten_filename(txt_file_path)
    with open(txt_short_path, "r", encoding="utf-8") as f:
        text: str = f.read()
    return text


def clone_voice(audio_path: str) -> str:
    """Clone a voice from an audio path.

    Reads the audio file (via a shortened temp copy) and returns its
    contents base64-encoded as an ASCII string.
    """
    short_path: str = shorten_filename(audio_path)
    with open(short_path, "rb") as fh:
        raw: bytes = fh.read()
    return base64.b64encode(raw).decode("utf-8")


def calculate_byte_size(text: str) -> int:
    """Return the number of bytes in the UTF-8 encoding of *text*."""
    encoded: bytes = text.encode("utf-8")
    return len(encoded)


def is_japanese(text) -> bool:
    """Return True if *text* contains any kana (Hiragana or Katakana).

    Kana are unique to Japanese (unlike Kanji, which are shared with
    Chinese), so their presence is a reliable language signal.  Uses
    explicit Unicode code-point ranges instead of script property
    classes, so the pattern works with the stdlib ``re`` module as well
    as the third-party ``regex`` module.
    """
    # Hiragana block U+3041-U+309F; Katakana U+30A0-U+30FF plus the
    # Phonetic Extensions (U+31F0-U+31FF) and halfwidth forms
    # (U+FF66-U+FF9D) that the Katakana script property also covers.
    kana = r"[\u3041-\u309F\u30A0-\u30FF\u31F0-\u31FF\uFF66-\uFF9D]"
    return bool(re.search(kana, text))


def preprocess_japanese_text(text: str) -> str:
    """Convert Japanese text to a hiragana reading for TTS.

    Pipeline: latin letters -> kana (jaconv), normalization (jaconv),
    sentence segmentation (bunkai), then per-sentence conversion to a
    hiragana reading with Yakinori (is_hatsuon=True).

    Args:
        text: Input Japanese text (may contain kanji and latin letters).

    Returns:
        str: Concatenated hiragana readings of all sentences.
    """
    alpha2kana: str = jaconv.alphabet2kana(text)
    normalized_jp: str = jaconv.normalize(alpha2kana)

    yakinori = Yakinori()
    splitter = bunkai.Bunkai()

    # NOTE: the original annotation `np.Iterator[str]` was bogus — numpy
    # exposes no `Iterator` attribute.
    sentences: Iterator[str] = splitter(normalized_jp)

    # Collect per-sentence readings and join once (avoids quadratic +=).
    parts: list[str] = []
    for sentence in sentences:
        parsed_list: list[str] = yakinori.get_parsed_list(sentence)
        parts.append(yakinori.get_hiragana_sentence(parsed_list, is_hatsuon=True))

    return "".join(parts)


def convert_audio(data: np.ndarray) -> np.ndarray:
    """Convert float audio samples to 16-bit PCM; pass other dtypes through.

    Float input is peak-normalized to [-1, 1] and scaled to the signed
    16-bit range.  Silent (all-zero) or empty input is returned as int16
    zeros instead of dividing by a zero peak, which previously produced
    NaN values that cast to garbage int16 samples.

    Args:
        data: Audio samples; float16/float32/float64 are converted,
            any other dtype is returned unchanged.

    Returns:
        np.ndarray: int16 samples for float input, otherwise the input.
    """
    if data.dtype in (np.float16, np.float32, np.float64):
        data = data.astype(np.float32)
        # np.max raises on empty arrays; treat empty as silence.
        peak = np.max(np.abs(data)) if data.size else 0.0
        if peak > 0.0:
            data = data / peak
        # Scale to the signed 16-bit integer range.
        data = (data * 32767).astype(np.int16)
    return data


def split_text_into_chunks(
    text: str, chunk_size: int = 2000, chunk_overlap: int = 100
) -> list[str]:
    """
    Split text into chunks along natural boundaries for TTS processing.

    Japanese input is first converted to a hiragana reading via
    preprocess_japanese_text() so the TTS engine is not fed raw kanji.

    Note: chunk_size and chunk_overlap are measured in CHARACTERS
    (length_function=len), not bytes — the previous docstring's claim of
    "byte limits" was inaccurate.

    Args:
        text: Input text in any language.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters of overlap between adjacent chunks.

    Returns:
        list[str]: The chunked text.
    """

    text_to_process = text

    # Separator priority: paragraph > line > sentence enders > clause
    # marks > closing quotes > any character ("").  Deduped while
    # preserving order — the original list repeated 。 and 、 as escaped
    # literals; the splitter only ever uses the first occurrence, so
    # removing duplicates does not change behavior.
    text_separators: list[str] = list(dict.fromkeys([
        "\n\n",
        "\n",
        "。",
        ".",
        "?",
        "!",
        "?",
        "!",
        ",",
        "、",
        ",",
        "」",
        "』",
        "\u3002",
        "\uff0c",
        "\u3001",
        "\uff0e",
        "",
    ]))

    if is_japanese(text_to_process):
        text_to_process = preprocess_japanese_text(text_to_process)

    splitter = RecursiveCharacterTextSplitter(
        separators=text_separators,
        chunk_size=chunk_size,  # Optimized for TTS context windows
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

    return splitter.split_text(text)