import time
import requests
import pathlib
from io import BytesIO
from playsound import playsound
from webscout import exceptions
from webscout.AIbase import TTSProvider
from webscout.litagent import LitAgent
from concurrent.futures import ThreadPoolExecutor, as_completed
"""
Text processing utilities for TTS providers.
"""
from typing import List, Dict, Tuple, Set, Optional, Pattern
import re
class SentenceTokenizer:
"""Advanced sentence tokenizer with support for complex cases and proper formatting."""
def __init__(self) -> None:
# Common abbreviations by category
        self.TITLES: Set[str] = {
            'mr', 'mrs', 'ms', 'dr', 'prof', 'rev', 'sr', 'jr', 'esq',
            'hon', 'pres', 'gov', 'atty', 'supt', 'det', 'col', 'maj',
            'gen', 'capt', 'cmdr', 'lt', 'sgt', 'cpl', 'pvt'
        }
self.ACADEMIC: Set[str] = {
'ph.d', 'phd', 'm.d', 'md', 'b.a', 'ba', 'm.a', 'ma', 'd.d.s', 'dds',
'm.b.a', 'mba', 'b.sc', 'bsc', 'm.sc', 'msc', 'llb', 'll.b', 'bl'
}
self.ORGANIZATIONS: Set[str] = {
'inc', 'ltd', 'co', 'corp', 'llc', 'llp', 'assn', 'bros', 'plc', 'cos',
'intl', 'dept', 'est', 'dist', 'mfg', 'div'
}
        # Note: 'may' is absent, presumably because it is also an ordinary word
        self.MONTHS: Set[str] = {
            'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'
        }
self.UNITS: Set[str] = {
'oz', 'pt', 'qt', 'gal', 'ml', 'cc', 'km', 'cm', 'mm', 'ft', 'in',
'kg', 'lb', 'lbs', 'hz', 'khz', 'mhz', 'ghz', 'kb', 'mb', 'gb', 'tb'
}
self.TECHNOLOGY: Set[str] = {
'v', 'ver', 'app', 'sys', 'dir', 'exe', 'lib', 'api', 'sdk', 'url',
'cpu', 'gpu', 'ram', 'rom', 'hdd', 'ssd', 'lan', 'wan', 'sql', 'html'
}
self.MISC: Set[str] = {
'vs', 'etc', 'ie', 'eg', 'no', 'al', 'ca', 'cf', 'pp', 'est', 'st',
'approx', 'appt', 'apt', 'dept', 'depts', 'min', 'max', 'avg'
}
# Combine all abbreviations
self.all_abbreviations: Set[str] = (
self.TITLES | self.ACADEMIC | self.ORGANIZATIONS |
self.MONTHS | self.UNITS | self.TECHNOLOGY | self.MISC
)
# Special patterns
        self.ELLIPSIS: str = r'\.{2,}|…'
        self.URL_PATTERN: str = (
            r'(?:https?:\/\/|www\.)[\w\-\.]+\.[a-zA-Z]{2,}(?:\/[^\s]*)?'
        )
        self.EMAIL_PATTERN: str = r'[\w\.-]+@[\w\.-]+\.\w+'
        self.NUMBER_PATTERN: str = (
            r'\d+(?:\.\d+)?(?:%|°|km|cm|mm|m|kg|g|lb|ft|in|mph|kmh|hz|mhz|ghz)?'
        )
        # Quote and bracket pairs
        self.QUOTE_PAIRS: Dict[str, str] = {
            '"': '"', "'": "'", '“': '”', '「': '」', '『': '』',
            '«': '»', '‹': '›', '‘': '’', '‚': '’'
        }
        self.BRACKETS: Dict[str, str] = {
            '(': ')', '[': ']', '{': '}', '⟨': '⟩', '〈': '〉',
            '《': '》', '【': '】', '〔': '〕', '｢': '｣'
        }
# Compile regex patterns
self._compile_patterns()
def _compile_patterns(self) -> None:
"""Compile regex patterns for better performance."""
# Pattern for finding potential sentence boundaries
        self.SENTENCE_END: Pattern = re.compile(
            r'''
            # Group for sentence endings
            (?:
                # Standard endings with optional closing quotes/brackets
                (?<=[.!?])["'\)\]\}»›」』\s]*
                # Ellipsis
                |(?:\.{2,}|…)
                # Asian-style endings
                |(?<=[。！？」』》\s])
            )
            # Must be followed by whitespace and a capital letter or digit
            (?=\s+(?:[A-Z0-9]|["'({\[「『【‹《][A-Z]))
            ''',
            re.VERBOSE
        )
# Pattern for abbreviations
abbrev_pattern = '|'.join(re.escape(abbr) for abbr in self.all_abbreviations)
self.ABBREV_PATTERN: Pattern = re.compile(
fr'\b(?:{abbrev_pattern})\.?',
re.IGNORECASE
)
def _protect_special_cases(self, text: str) -> Tuple[str, Dict[str, str]]:
"""Protect URLs, emails, and other special cases from being split."""
protected = text
placeholders: Dict[str, str] = {}
counter = 0
# Protect URLs and emails
for pattern in [self.URL_PATTERN, self.EMAIL_PATTERN]:
for match in re.finditer(pattern, protected):
placeholder = f'__PROTECTED_{counter}__'
placeholders[placeholder] = match.group()
protected = protected.replace(match.group(), placeholder)
counter += 1
# Protect quoted content
stack = []
protected_chars = list(protected)
        i = 0
        while i < len(protected_chars):
            char = protected_chars[i]
            # Check for a closing quote first so symmetric quotes (e.g. '"')
            # close an open span instead of opening a nested one
            if stack and char == self.QUOTE_PAIRS[stack[-1][0]]:
                start_quote, start_idx = stack.pop()
                content = ''.join(protected_chars[start_idx:i + 1])
                placeholder = f'__PROTECTED_{counter}__'
                placeholders[placeholder] = content
                protected_chars[start_idx:i + 1] = list(placeholder)
                counter += 1
                # The splice changed the list length; resume scanning just
                # past the placeholder rather than from a stale index
                i = start_idx + len(placeholder)
                continue
            if char in self.QUOTE_PAIRS:
                stack.append((char, i))
            i += 1
return ''.join(protected_chars), placeholders
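    # Illustrative sketch of the protection step (hypothetical input):
    #   _protect_special_cases('See https://example.com for "more info" today.')
    # yields text like
    #   'See __PROTECTED_0__ for __PROTECTED_1__ today.'
    # with `placeholders` mapping each token back to the original URL/quote span.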
def _restore_special_cases(self, text: str, placeholders: Dict[str, str]) -> str:
"""Restore protected content."""
restored = text
for placeholder, original in placeholders.items():
restored = restored.replace(placeholder, original)
return restored
def _handle_abbreviations(self, text: str) -> str:
"""Handle abbreviations to prevent incorrect sentence splitting."""
def replace_abbrev(match: re.Match) -> str:
abbr = match.group().lower().rstrip('.')
if abbr in self.all_abbreviations:
return match.group().replace('.', '__DOT__')
return match.group()
return self.ABBREV_PATTERN.sub(replace_abbrev, text)
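    # Illustrative sketch: with 'dr' in all_abbreviations,
    #   _handle_abbreviations("Dr. Smith arrived.")
    # returns "Dr__DOT__ Smith arrived.", so the period after the title no
    # longer looks like a sentence boundary; __DOT__ is undone later in
    # _restore_formatting.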
def _normalize_whitespace(self, text: str) -> str:
"""Normalize whitespace while preserving paragraph breaks."""
# Replace multiple newlines with special marker
text = re.sub(r'\n\s*\n', ' __PARA__ ', text)
# Normalize remaining whitespace
text = re.sub(r'\s+', ' ', text)
return text.strip()
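    # Illustrative sketch: "para one.\n\npara two" becomes
    # "para one. __PARA__ para two"; the marker is turned back into a blank
    # line by _restore_formatting.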
    def _restore_formatting(self, sentences: List[str]) -> List[str]:
        """Restore original formatting and clean up sentences."""
        restored = []
        for sentence in sentences:
            # Restore dots in abbreviations
            sentence = sentence.replace('__DOT__', '.')
            # Collapse whitespace runs first, so the paragraph marker below is
            # not flattened back into a single space after being restored
            sentence = re.sub(r'\s+', ' ', sentence).strip()
            # Restore paragraph breaks
            sentence = sentence.replace(' __PARA__ ', '\n\n').replace('__PARA__', '\n\n')
            # Capitalize the first letter if it is lowercase and the leading
            # word is not a known abbreviation
            words = sentence.split()
            if words and words[0].lower() not in self.all_abbreviations and sentence[0].islower():
                sentence = sentence[0].upper() + sentence[1:]
            if sentence:
                restored.append(sentence)
        return restored
def tokenize(self, text: str) -> List[str]:
"""
Split text into sentences while handling complex cases.
Args:
text (str): Input text to split into sentences.
Returns:
List[str]: List of properly formatted sentences.
"""
if not text or not text.strip():
return []
# Step 1: Protect special cases
protected_text, placeholders = self._protect_special_cases(text)
# Step 2: Normalize whitespace
protected_text = self._normalize_whitespace(protected_text)
# Step 3: Handle abbreviations
protected_text = self._handle_abbreviations(protected_text)
# Step 4: Split into potential sentences
potential_sentences = self.SENTENCE_END.split(protected_text)
# Step 5: Process and restore formatting
sentences = self._restore_formatting(potential_sentences)
# Step 6: Restore special cases
sentences = [self._restore_special_cases(s, placeholders) for s in sentences]
# Step 7: Post-process sentences
final_sentences = []
current_sentence = []
for sentence in sentences:
# Skip empty sentences
if not sentence.strip():
continue
# Check if sentence might be continuation of previous
if current_sentence and sentence[0].islower():
current_sentence.append(sentence)
else:
if current_sentence:
final_sentences.append(' '.join(current_sentence))
current_sentence = [sentence]
# Add last sentence if exists
if current_sentence:
final_sentences.append(' '.join(current_sentence))
return final_sentences
def split_sentences(text: str) -> List[str]:
"""
Convenience function to split text into sentences using SentenceTokenizer.
Args:
text (str): Input text to split into sentences.
Returns:
List[str]: List of properly formatted sentences.
"""
tokenizer = SentenceTokenizer()
return tokenizer.tokenize(text)
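# Quick usage sketch (illustrative): abbreviations, URLs, and quoted spans are
# expected to survive splitting intact.
#   for s in split_sentences("Dr. Smith works at Acme Inc. Visit https://example.com today."):
#       print(s)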
class ElevenlabsTTS(TTSProvider):
"""
Text-to-speech provider using the ElevenlabsTTS API.
"""
# Request headers
headers: dict[str, str] = {
"User-Agent": LitAgent().random()
}
cache_dir = pathlib.Path("./audio_cache")
    all_voices: dict[str, str] = {
        "Brian": "nPczCjzI2devNBz1zQrb", "Alice": "Xb7hH8MSUJpSbSDYk0k2",
        "Bill": "pqHfZKP75CvOlQylNhV4", "Callum": "N2lVS1w4EtoT3dr4eOWO",
        "Charlie": "IKne3meq5aSn9XLyUdCD", "Charlotte": "XB0fDUnXU5powFXDhCwa",
        "Chris": "iP95p4xoKVk53GoZ742B", "Daniel": "onwK4e9ZLuTAKqWW03F9",
        "Eric": "cjVigY5qzO86Huf0OWal", "George": "JBFqnCBsd6RMkjVDRZzb",
        "Jessica": "cgSgspJ2msm6clMCkdW9", "Laura": "FGY2WhTYpPnrIDTdsKH5",
        "Liam": "TX3LPaxmHKxFdv7VOQHJ", "Lily": "pFZP5JQG7iQjIQuC4Bku",
        "Matilda": "XrExE9yKIg1WjnnlVkGX", "Sarah": "EXAVITQu4vr4xnSDxMaL",
        "Will": "bIHbv24MWmeRgasZH58o", "Neal": "Zp1aWhL05Pi5BkhizFC3",
    }
    def __init__(self, timeout: int = 20, proxies: Optional[dict] = None):
"""Initializes the ElevenlabsTTS TTS client."""
self.session = requests.Session()
self.session.headers.update(self.headers)
if proxies:
self.session.proxies.update(proxies)
self.timeout = timeout
self.params = {'allow_unauthenticated': '1'}
    def tts(self, text: str, voice: str = "Brian", verbose: bool = True) -> str:
        """
        Converts text to speech using the ElevenlabsTTS API and saves it to a file.

        Args:
            text (str): Text to synthesize.
            voice (str): Voice name; must be a key of `all_voices`.
            verbose (bool): If True, print progress information.

        Returns:
            str: Path to the generated MP3 file.
        """
        assert (
            voice in self.all_voices
        ), f"Voice '{voice}' not one of [{', '.join(self.all_voices.keys())}]"
        filename = self.cache_dir / f"{int(time.time())}.mp3"
        # Create the audio_cache directory once, before any requests are made
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        # Split text into sentences so chunks can be synthesized concurrently
        sentences = split_sentences(text)
        # Request audio for one chunk, retrying a few times on transient errors
        def generate_audio_for_chunk(part_text: str, part_number: int, max_retries: int = 3):
            for attempt in range(1, max_retries + 1):
                try:
                    json_data = {'text': part_text, 'model_id': 'eleven_multilingual_v2'}
                    response = self.session.post(
                        f'https://api.elevenlabs.io/v1/text-to-speech/{self.all_voices[voice]}',
                        params=self.params,
                        headers=self.headers,
                        json=json_data,
                        timeout=self.timeout,
                    )
                    # Raises requests.HTTPError for any non-2xx status
                    response.raise_for_status()
                    return part_number, response.content
                except requests.RequestException as e:
                    # Give up after max_retries instead of looping forever
                    if attempt == max_retries:
                        raise exceptions.FailedToGenerateResponseError(
                            f"Failed to generate audio for chunk {part_number}: {e}"
                        )
                    # Brief backoff before retrying
                    time.sleep(1)
try:
# Using ThreadPoolExecutor to handle requests concurrently
with ThreadPoolExecutor() as executor:
futures = {executor.submit(generate_audio_for_chunk, sentence.strip(), chunk_num): chunk_num
for chunk_num, sentence in enumerate(sentences, start=1)}
# Dictionary to store results with order preserved
audio_chunks = {}
for future in as_completed(futures):
chunk_num = futures[future]
try:
part_number, audio_data = future.result()
audio_chunks[part_number] = audio_data
except Exception as e:
raise exceptions.FailedToGenerateResponseError(
f"Failed to generate audio for chunk {chunk_num}: {e}"
)
# Combine audio chunks in the correct sequence
combined_audio = BytesIO()
for part_number in sorted(audio_chunks.keys()):
combined_audio.write(audio_chunks[part_number])
            # Save the combined audio data to a single file
            with open(filename, 'wb') as f:
                f.write(combined_audio.getvalue())
            if verbose:
                print(f"Audio saved to {filename}")
            return filename.as_posix()
except requests.exceptions.RequestException as e:
raise exceptions.FailedToGenerateResponseError(
f"Failed to perform the operation: {e}"
)
# Example usage
if __name__ == "__main__":
elevenlabs = ElevenlabsTTS()
text = "This is a test of the ElevenlabsTTS text-to-speech API. It supports multiple sentences and advanced logging."
audio_file = elevenlabs.tts(text, voice="Brian")
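    # Playback sketch: `playsound` is imported at the top of this module and is
    # otherwise unused. Uncomment to play the generated MP3 locally (requires
    # an audio output device; behavior varies by platform).
    # playsound(audio_file)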