import base64
import shutil
import uuid
from pathlib import Path
from typing import Iterator

import bunkai
import ebooklib
import jaconv
import numpy as np
import regex as re
from bs4 import BeautifulSoup
from ebooklib import epub
from langchain_text_splitters import RecursiveCharacterTextSplitter
from yakinori import Yakinori

# Create a temporary directory to store short-named files
tmp_dir = Path("/tmp/auralis")
tmp_dir.mkdir(exist_ok=True)


def shorten_filename(original_path: str) -> str:
    """Copies the given file to a temporary directory with a shorter, random filename."""
    ext: str = Path(original_path).suffix
    short_name: str = "file_" + uuid.uuid4().hex[:8] + ext
    short_path: Path = tmp_dir / short_name
    shutil.copyfile(original_path, short_path)
    return str(short_path)
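
# Usage sketch (the input path is hypothetical):
#   short = shorten_filename("/books/A Very Long Book Title (2024).epub")
#   # -> e.g. "/tmp/auralis/file_3f9c2a1b.epub"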


def extract_text_from_epub(epub_path: str, output_path=None) -> str:
    """
    Extracts text from an EPUB file and optionally saves it to a text file.

    Args:
        epub_path (str): Path to the EPUB file
        output_path (str, optional): Path where to save the text file

    Returns:
        str: The extracted text
    """
    # Load the book
    book: epub.EpubBook = epub.read_epub(epub_path)
    # List to hold extracted text
    chapters: list[str] = []
    # Extract text from each chapter
    for item in book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            # Get HTML content
            html_content = item.get_content().decode("utf-8")
            # Use BeautifulSoup to extract text
            soup = BeautifulSoup(html_content, "html.parser")
            # Remove scripts and styles
            for script in soup(["script", "style"]):
                script.decompose()
            # Get text
            text: str = soup.get_text()
            # Clean text: strip each line, then break runs of double spaces
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = "\n".join(chunk for chunk in chunks if chunk)
            chapters.append(text)
    # Join all chapters
    full_text: str = "\n\n".join(chapters)
    # Save text if output path is specified
    if output_path:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(full_text)
    # Normalize guillemet quotation marks to plain double quotes
    return full_text.replace("»", '"').replace("«", '"')
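
# Usage sketch (paths are hypothetical):
#   text = extract_text_from_epub("book.epub", output_path="book.txt")
#   print(text[:200])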


def text_from_file(txt_file_path: str) -> str:
    """Read a plain-text file, copying it to a short path first."""
    # Shorten filename before reading
    txt_short_path: str = shorten_filename(txt_file_path)
    with open(txt_short_path, "r", encoding="utf-8") as f:
        text: str = f.read()
    return text
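
# Usage sketch (path is hypothetical):
#   raw = text_from_file("chapter1.txt")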


def clone_voice(audio_path: str) -> str:
    """Prepare a voice sample for cloning: read the audio file and return it base64-encoded."""
    # Shorten filename before reading
    audio_short_path: str = shorten_filename(audio_path)
    with open(audio_short_path, "rb") as f:
        audio_data: str = base64.b64encode(f.read()).decode("utf-8")
    return audio_data
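
# Usage sketch (path is hypothetical); the base64 string is typically sent
# as the voice-reference field of a TTS request payload:
#   speaker_b64 = clone_voice("reference_voice.wav")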


def calculate_byte_size(text: str) -> int:
    """Calculate the UTF-8 encoded byte size of text."""
    return len(text.encode("utf-8"))
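
# Example: multibyte characters count more than one byte each:
#   calculate_byte_size("abc")    # -> 3
#   calculate_byte_size("日本語")  # -> 9 (3 bytes per character in UTF-8)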


def is_japanese(text: str) -> bool:
    """Detect Japanese text by the presence of kana."""
    # Hiragana and Katakana are unambiguous markers of Japanese;
    # Kanji alone is not checked, since it could equally be Chinese
    hiragana = r"\p{Hiragana}"
    katakana = r"\p{Katakana}"
    return bool(re.search(hiragana, text) or re.search(katakana, text))
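
# Examples:
#   is_japanese("こんにちは")  # -> True (hiragana)
#   is_japanese("Hello")       # -> False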


def preprocess_japanese_text(text: str) -> str:
    """Convert Japanese text to hiragana so the TTS model can pronounce it."""
    # Convert romaji to kana, then normalize width and character variants
    alpha2kana: str = jaconv.alphabet2kana(text)
    normalized_jp: str = jaconv.normalize(alpha2kana)
    yakinori = Yakinori()
    splitter = bunkai.Bunkai()
    sentences: Iterator[str] = splitter(normalized_jp)
    final: str = ""
    for sentence in sentences:
        parsed_list: list[str] = yakinori.get_parsed_list(sentence)
        final += yakinori.get_hiragana_sentence(parsed_list, is_hatsuon=True)
    return final
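
# Usage sketch (the exact reading depends on the installed dictionaries):
#   preprocess_japanese_text("日本語の文章です。")
#   # -> hiragana-only text, e.g. "にほんごのぶんしょうです。"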


def convert_audio(data: np.ndarray) -> np.ndarray:
    """Convert any float format to proper 16-bit PCM."""
    if data.dtype in [np.float16, np.float32, np.float64]:
        # Normalize to the [-1, 1] range, guarding against all-zero input
        peak = np.max(np.abs(data))
        data = data.astype(np.float32)
        if peak > 0:
            data = data / peak
        # Scale to the 16-bit int range
        data = (data * 32767).astype(np.int16)
    return data
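
# Usage sketch (assumes `audio` is a float waveform from a TTS model; the
# 24 kHz sample rate is an assumption, not something this module fixes):
#   import wave
#   pcm = convert_audio(audio)
#   with wave.open("out.wav", "wb") as wf:
#       wf.setnchannels(1)
#       wf.setsampwidth(2)  # 16-bit PCM
#       wf.setframerate(24000)
#       wf.writeframes(pcm.tobytes())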


def split_text_into_chunks(
    text: str, chunk_size: int = 2000, chunk_overlap: int = 100
) -> list[str]:
    """
    Split text into chunks along natural boundaries (paragraphs, sentences, clauses).

    Note that `chunk_size` is measured in characters, not bytes. Japanese text
    is first converted from kanji to kana so the TTS model can pronounce it.
    """
    text_to_process = text
    # Separators ordered from coarsest (paragraph) to finest (no separator),
    # covering both ASCII and full-width CJK punctuation
    text_separators: list[str] = [
        "\n\n",
        "\n",
        "。",
        ".",
        "?",
        "!",
        "？",
        "！",
        ",",
        "、",
        "，",
        "」",
        "』",
        "．",
        "",
    ]
    if is_japanese(text_to_process):
        text_to_process = preprocess_japanese_text(text_to_process)
    splitter = RecursiveCharacterTextSplitter(
        separators=text_separators,
        chunk_size=chunk_size,  # Sized for TTS context windows
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    # Split the preprocessed text, not the original input
    return splitter.split_text(text_to_process)
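

# Minimal end-to-end sketch: split a sample text and report chunk stats.
# Illustrative only; real input would come from extract_text_from_epub()
# or text_from_file().
if __name__ == "__main__":
    sample = "This is a short test sentence. " * 100
    chunks = split_text_into_chunks(sample, chunk_size=500, chunk_overlap=50)
    print(f"{len(chunks)} chunks; first chunk is {calculate_byte_size(chunks[0])} bytes")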