import base64
import shutil
import uuid
from pathlib import Path
from typing import Iterator

import bunkai
import ebooklib
import jaconv
import numpy as np
import regex as re
from bs4 import BeautifulSoup
from ebooklib import epub
from langchain_text_splitters import RecursiveCharacterTextSplitter
from yakinori import Yakinori

# Create a temporary directory to store short-named files
tmp_dir = Path("/tmp/auralis")
tmp_dir.mkdir(exist_ok=True)


def shorten_filename(original_path: str) -> str:
    """Copies the given file to a temporary directory with a shorter, random filename."""
    ext: str = Path(original_path).suffix
    short_name: str = "file_" + uuid.uuid4().hex[:8] + ext
    short_path: Path = tmp_dir / short_name
    shutil.copyfile(original_path, short_path)
    return str(short_path)
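
# Usage sketch (the input path is hypothetical):
#   short = shorten_filename("/books/A Very Long Book Title (2024).epub")
#   # -> e.g. "/tmp/auralis/file_3f9c2a1b.epub"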


def extract_text_from_epub(epub_path: str, output_path=None) -> str:
    """
    Extracts text from an EPUB file and optionally saves it to a text file.

    Args:
        epub_path (str): Path to the EPUB file
        output_path (str, optional): Path where to save the text file

    Returns:
        str: The extracted text
    """
    # Load the book
    book: epub.EpubBook = epub.read_epub(epub_path)
    # List to hold extracted text
    chapters: list[str] = []
    # Extract text from each chapter
    for item in book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            # Get HTML content
            html_content = item.get_content().decode("utf-8")
            # Use BeautifulSoup to extract text
            soup = BeautifulSoup(html_content, "html.parser")
            # Remove scripts and styles
            for script in soup(["script", "style"]):
                script.decompose()
            # Get text
            text: str = soup.get_text()
            # Clean text: strip each line, then break runs of double spaces
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = "\n".join(chunk for chunk in chunks if chunk)
            chapters.append(text)
    # Join all chapters
    full_text: str = "\n\n".join(chapters)
    # Save text if output path is specified
    if output_path:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(full_text)
    # Normalize guillemet quotation marks to plain double quotes
    return full_text.replace("»", '"').replace("«", '"')
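
# Usage sketch (paths are hypothetical):
#   text = extract_text_from_epub("book.epub", output_path="book.txt")
#   print(text[:200])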


def text_from_file(txt_file_path: str) -> str:
    """Read a plain-text file, copying it to a short path first."""
    # Shorten filename before reading
    txt_short_path: str = shorten_filename(txt_file_path)
    with open(txt_short_path, "r", encoding="utf-8") as f:
        text: str = f.read()
    return text
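
# Usage sketch (path is hypothetical):
#   raw = text_from_file("chapter1.txt")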


def clone_voice(audio_path: str) -> str:
    """Prepare a voice sample for cloning: read the audio file and return it base64-encoded."""
    # Shorten filename before reading
    audio_short_path: str = shorten_filename(audio_path)
    with open(audio_short_path, "rb") as f:
        audio_data: str = base64.b64encode(f.read()).decode("utf-8")
    return audio_data
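
# Usage sketch (path is hypothetical); the base64 string is typically sent
# as the voice-reference field of a TTS request payload:
#   speaker_b64 = clone_voice("reference_voice.wav")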


def calculate_byte_size(text: str) -> int:
    """Calculate the UTF-8 encoded byte size of text."""
    return len(text.encode("utf-8"))
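
# Example: multibyte characters count more than one byte each:
#   calculate_byte_size("abc")    # -> 3
#   calculate_byte_size("日本語")  # -> 9 (3 bytes per character in UTF-8)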


def is_japanese(text: str) -> bool:
    """Detect Japanese text by the presence of kana."""
    # Hiragana and Katakana are unambiguous markers of Japanese;
    # Kanji alone is not checked, since it could equally be Chinese
    hiragana = r"\p{Hiragana}"
    katakana = r"\p{Katakana}"
    return bool(re.search(hiragana, text) or re.search(katakana, text))
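
# Examples:
#   is_japanese("こんにちは")  # -> True (hiragana)
#   is_japanese("Hello")       # -> False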


def preprocess_japanese_text(text: str) -> str:
    """Convert Japanese text to hiragana so the TTS model can pronounce it."""
    # Convert romaji to kana, then normalize width and character variants
    alpha2kana: str = jaconv.alphabet2kana(text)
    normalized_jp: str = jaconv.normalize(alpha2kana)
    yakinori = Yakinori()
    splitter = bunkai.Bunkai()
    sentences: Iterator[str] = splitter(normalized_jp)
    final: str = ""
    for sentence in sentences:
        parsed_list: list[str] = yakinori.get_parsed_list(sentence)
        final += yakinori.get_hiragana_sentence(parsed_list, is_hatsuon=True)
    return final
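
# Usage sketch (the exact reading depends on the installed dictionaries):
#   preprocess_japanese_text("日本語の文章です。")
#   # -> hiragana-only text, e.g. "にほんごのぶんしょうです。"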


def convert_audio(data: np.ndarray) -> np.ndarray:
    """Convert any float format to proper 16-bit PCM."""
    if data.dtype in [np.float16, np.float32, np.float64]:
        # Normalize to the [-1, 1] range, guarding against all-zero input
        peak = np.max(np.abs(data))
        data = data.astype(np.float32)
        if peak > 0:
            data = data / peak
        # Scale to the 16-bit int range
        data = (data * 32767).astype(np.int16)
    return data
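
# Usage sketch (assumes `audio` is a float waveform from a TTS model; the
# 24 kHz sample rate is an assumption, not something this module fixes):
#   import wave
#   pcm = convert_audio(audio)
#   with wave.open("out.wav", "wb") as wf:
#       wf.setnchannels(1)
#       wf.setsampwidth(2)  # 16-bit PCM
#       wf.setframerate(24000)
#       wf.writeframes(pcm.tobytes())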


def split_text_into_chunks(
    text: str, chunk_size: int = 2000, chunk_overlap: int = 100
) -> list[str]:
    """
    Split text into chunks along natural boundaries (paragraphs, sentences, clauses).

    Note that `chunk_size` is measured in characters, not bytes. Japanese text
    is first converted from kanji to kana so the TTS model can pronounce it.
    """
    text_to_process = text
    # Separators ordered from coarsest (paragraph) to finest (no separator),
    # covering both ASCII and full-width CJK punctuation
    text_separators: list[str] = [
        "\n\n",
        "\n",
        "。",
        ".",
        "?",
        "!",
        "？",
        "！",
        ",",
        "、",
        "，",
        "」",
        "』",
        "．",
        "",
    ]
    if is_japanese(text_to_process):
        text_to_process = preprocess_japanese_text(text_to_process)
    splitter = RecursiveCharacterTextSplitter(
        separators=text_separators,
        chunk_size=chunk_size,  # Sized for TTS context windows
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    # Split the preprocessed text, not the original input
    return splitter.split_text(text_to_process)
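

# Minimal end-to-end sketch: split a sample text and report chunk stats.
# Illustrative only; real input would come from extract_text_from_epub()
# or text_from_file().
if __name__ == "__main__":
    sample = "This is a short test sentence. " * 100
    chunks = split_text_into_chunks(sample, chunk_size=500, chunk_overlap=50)
    print(f"{len(chunks)} chunks; first chunk is {calculate_byte_size(chunks[0])} bytes")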