Niansuh committed
Commit 92d5846 · verified · 1 Parent(s): bd8ad86

Create tts_script.py

Files changed (1):
  1. tts_script.py (+362 −0)
tts_script.py ADDED
@@ -0,0 +1,362 @@
+ import re
+ import time
+ import pathlib
+ from io import BytesIO
+ from typing import List, Dict, Tuple, Set, Optional, Pattern
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+
+ import requests
+ from playsound import playsound
+
+ from webscout import exceptions
+ from webscout.AIbase import TTSProvider
+ from webscout.litagent import LitAgent
+
+
+ # Text processing utilities for TTS providers.
+ class SentenceTokenizer:
+     """Advanced sentence tokenizer with support for complex cases and proper formatting."""
+
+     def __init__(self) -> None:
+         # Common abbreviations by category
+         self.TITLES: Set[str] = {
+             'mr', 'mrs', 'ms', 'dr', 'prof', 'rev', 'sr', 'jr', 'esq',
+             'hon', 'pres', 'gov', 'atty', 'supt', 'det', 'col', 'maj',
+             'gen', 'capt', 'cmdr', 'lt', 'sgt', 'cpl', 'pvt'
+         }
+
+         self.ACADEMIC: Set[str] = {
+             'ph.d', 'phd', 'm.d', 'md', 'b.a', 'ba', 'm.a', 'ma', 'd.d.s', 'dds',
+             'm.b.a', 'mba', 'b.sc', 'bsc', 'm.sc', 'msc', 'llb', 'll.b', 'bl'
+         }
+
+         self.ORGANIZATIONS: Set[str] = {
+             'inc', 'ltd', 'co', 'corp', 'llc', 'llp', 'assn', 'bros', 'plc', 'cos',
+             'intl', 'dept', 'est', 'dist', 'mfg', 'div'
+         }
+
+         self.MONTHS: Set[str] = {
+             'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'
+         }
+
+         self.UNITS: Set[str] = {
+             'oz', 'pt', 'qt', 'gal', 'ml', 'cc', 'km', 'cm', 'mm', 'ft', 'in',
+             'kg', 'lb', 'lbs', 'hz', 'khz', 'mhz', 'ghz', 'kb', 'mb', 'gb', 'tb'
+         }
+
+         self.TECHNOLOGY: Set[str] = {
+             'v', 'ver', 'app', 'sys', 'dir', 'exe', 'lib', 'api', 'sdk', 'url',
+             'cpu', 'gpu', 'ram', 'rom', 'hdd', 'ssd', 'lan', 'wan', 'sql', 'html'
+         }
+
+         self.MISC: Set[str] = {
+             'vs', 'etc', 'ie', 'eg', 'no', 'al', 'ca', 'cf', 'pp', 'est', 'st',
+             'approx', 'appt', 'apt', 'dept', 'depts', 'min', 'max', 'avg'
+         }
+
+         # Combine all abbreviations
+         self.all_abbreviations: Set[str] = (
+             self.TITLES | self.ACADEMIC | self.ORGANIZATIONS |
+             self.MONTHS | self.UNITS | self.TECHNOLOGY | self.MISC
+         )
+
+         # Special patterns
+         self.ELLIPSIS: str = r'\.{2,}|…'
+         self.URL_PATTERN: str = (
+             r'(?:https?:\/\/|www\.)[\w\-\.]+\.[a-zA-Z]{2,}(?:\/[^\s]*)?'
+         )
+         self.EMAIL_PATTERN: str = r'[\w\.-]+@[\w\.-]+\.\w+'
+         self.NUMBER_PATTERN: str = (
+             r'\d+(?:\.\d+)?(?:%|°|km|cm|mm|m|kg|g|lb|ft|in|mph|kmh|hz|mhz|ghz)?'
+         )
+
+         # Quote and bracket pairs (curly quotes written as \u escapes so the
+         # opening and closing characters stay visually distinct)
+         self.QUOTE_PAIRS: Dict[str, str] = {
+             '"': '"',                # straight double quote (symmetric)
+             "'": "'",                # straight single quote / apostrophe (symmetric)
+             '\u201c': '\u201d',      # curly double quotes
+             '\u2018': '\u2019',      # curly single quotes
+             '\u201a': '\u2019',      # low-9 single quote
+             '「': '」', '『': '』',
+             '«': '»', '‹': '›',
+         }
+
+         self.BRACKETS: Dict[str, str] = {
+             '(': ')', '[': ']', '{': '}', '⟨': '⟩',
+             '「': '」', '『': '』', '【': '】', '〖': '〗',
+         }
+
+         # Compile regex patterns
+         self._compile_patterns()
+
+     def _compile_patterns(self) -> None:
+         """Compile regex patterns for better performance."""
+         # Pattern for finding potential sentence boundaries
+         self.SENTENCE_END: Pattern = re.compile(
+             r'''
+             # Group for sentence endings
+             (?:
+                 # Standard endings with optional quotes/brackets
+                 (?<=[.!?])[\"\'\)\]\}»›」』\s]*
+
+                 # Ellipsis
+                 |(?:\.{2,}|…)
+
+                 # Asian-style endings
+                 |(?<=[。!?」』】\s])
+             )
+
+             # Must be followed by whitespace and a capital letter or digit
+             (?=\s+(?:[A-Z0-9]|[\"\'({\[「『《‹〈][A-Z]))
+             ''',
+             re.VERBOSE
+         )
+
+         # Pattern for abbreviations
+         abbrev_pattern = '|'.join(re.escape(abbr) for abbr in self.all_abbreviations)
+         self.ABBREV_PATTERN: Pattern = re.compile(
+             fr'\b(?:{abbrev_pattern})\.?',
+             re.IGNORECASE
+         )
+
+     def _protect_special_cases(self, text: str) -> Tuple[str, Dict[str, str]]:
+         """Protect URLs, emails, and other special cases from being split."""
+         protected = text
+         placeholders: Dict[str, str] = {}
+         counter = 0
+
+         # Protect URLs and emails
+         for pattern in [self.URL_PATTERN, self.EMAIL_PATTERN]:
+             for match in re.finditer(pattern, protected):
+                 placeholder = f'__PROTECTED_{counter}__'
+                 placeholders[placeholder] = match.group()
+                 protected = protected.replace(match.group(), placeholder)
+                 counter += 1
+
+         # Protect quoted content. Check for a closing quote first so that
+         # symmetric pairs (straight quotes, where the opening and closing
+         # characters are identical) can actually be closed.
+         stack = []
+         protected_chars = list(protected)
+         i = 0
+         while i < len(protected_chars):
+             char = protected_chars[i]
+             if stack and char == self.QUOTE_PAIRS[stack[-1][0]]:
+                 start_quote, start_idx = stack.pop()
+                 content = ''.join(protected_chars[start_idx:i + 1])
+                 placeholder = f'__PROTECTED_{counter}__'
+                 placeholders[placeholder] = content
+                 protected_chars[start_idx:i + 1] = list(placeholder)
+                 counter += 1
+                 # Resume scanning just past the inserted placeholder
+                 i = start_idx + len(placeholder) - 1
+             elif char in self.QUOTE_PAIRS:
+                 stack.append((char, i))
+             i += 1
+
+         return ''.join(protected_chars), placeholders
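+
+     # Illustrative sketch (hypothetical input): 'See https://example.com now'
+     # becomes 'See __PROTECTED_0__ now', so the dots inside the URL can no
+     # longer be mistaken for sentence boundaries.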
+
+     def _restore_special_cases(self, text: str, placeholders: Dict[str, str]) -> str:
+         """Restore protected content."""
+         restored = text
+         for placeholder, original in placeholders.items():
+             restored = restored.replace(placeholder, original)
+         return restored
+
+     def _handle_abbreviations(self, text: str) -> str:
+         """Handle abbreviations to prevent incorrect sentence splitting."""
+         def replace_abbrev(match: re.Match) -> str:
+             abbr = match.group().lower().rstrip('.')
+             if abbr in self.all_abbreviations:
+                 return match.group().replace('.', '__DOT__')
+             return match.group()
+
+         return self.ABBREV_PATTERN.sub(replace_abbrev, text)
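+
+     # Illustrative sketch (hypothetical input): 'Dr. Smith arrived.' becomes
+     # 'Dr__DOT__ Smith arrived.' here, and the dot is restored after splitting.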
+
+     def _normalize_whitespace(self, text: str) -> str:
+         """Normalize whitespace while preserving paragraph breaks."""
+         # Replace multiple newlines with a special marker
+         text = re.sub(r'\n\s*\n', ' __PARA__ ', text)
+         # Normalize remaining whitespace
+         text = re.sub(r'\s+', ' ', text)
+         return text.strip()
+
+     def _restore_formatting(self, sentences: List[str]) -> List[str]:
+         """Restore original formatting and clean up sentences."""
+         restored = []
+         for sentence in sentences:
+             # Restore dots in abbreviations
+             sentence = sentence.replace('__DOT__', '.')
+
+             # Restore paragraph breaks
+             sentence = sentence.replace('__PARA__', '\n\n')
+
+             # Clean up whitespace
+             sentence = re.sub(r'\s+', ' ', sentence).strip()
+
+             # Capitalize the first letter unless the sentence starts with an abbreviation
+             words = sentence.split()
+             if words and words[0].lower() not in self.all_abbreviations:
+                 sentence = sentence[0].upper() + sentence[1:]
+
+             if sentence:
+                 restored.append(sentence)
+
+         return restored
+
+     def tokenize(self, text: str) -> List[str]:
+         """
+         Split text into sentences while handling complex cases.
+
+         Args:
+             text (str): Input text to split into sentences.
+
+         Returns:
+             List[str]: List of properly formatted sentences.
+         """
+         if not text or not text.strip():
+             return []
+
+         # Step 1: Protect special cases
+         protected_text, placeholders = self._protect_special_cases(text)
+
+         # Step 2: Normalize whitespace
+         protected_text = self._normalize_whitespace(protected_text)
+
+         # Step 3: Handle abbreviations
+         protected_text = self._handle_abbreviations(protected_text)
+
+         # Step 4: Split into potential sentences
+         potential_sentences = self.SENTENCE_END.split(protected_text)
+
+         # Step 5: Process and restore formatting
+         sentences = self._restore_formatting(potential_sentences)
+
+         # Step 6: Restore special cases
+         sentences = [self._restore_special_cases(s, placeholders) for s in sentences]
+
+         # Step 7: Post-process sentences
+         final_sentences = []
+         current_sentence = []
+
+         for sentence in sentences:
+             # Skip empty sentences
+             if not sentence.strip():
+                 continue
+
+             # Check if the sentence might be a continuation of the previous one
+             if current_sentence and sentence[0].islower():
+                 current_sentence.append(sentence)
+             else:
+                 if current_sentence:
+                     final_sentences.append(' '.join(current_sentence))
+                 current_sentence = [sentence]
+
+         # Add the last sentence if it exists
+         if current_sentence:
+             final_sentences.append(' '.join(current_sentence))
+
+         return final_sentences
+
+
+ def split_sentences(text: str) -> List[str]:
+     """
+     Convenience function to split text into sentences using SentenceTokenizer.
+
+     Args:
+         text (str): Input text to split into sentences.
+
+     Returns:
+         List[str]: List of properly formatted sentences.
+     """
+     tokenizer = SentenceTokenizer()
+     return tokenizer.tokenize(text)
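+
+
+ # Illustrative sketch of the full pipeline (hypothetical input; the output is
+ # the rough expectation, not a tested guarantee):
+ #     split_sentences("Dr. Smith visited the lab. The results were good.")
+ #     -> ["Dr. Smith visited the lab.", "The results were good."]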
+
+
+ class ElevenlabsTTS(TTSProvider):
+     """
+     Text-to-speech provider backed by the ElevenLabs API.
+     """
+     # Request headers
+     headers: dict[str, str] = {
+         "User-Agent": LitAgent().random()
+     }
+     cache_dir = pathlib.Path("./audio_cache")
+     all_voices: dict[str, str] = {
+         "Brian": "nPczCjzI2devNBz1zQrb",
+         "Alice": "Xb7hH8MSUJpSbSDYk0k2",
+         "Bill": "pqHfZKP75CvOlQylNhV4",
+         "Callum": "N2lVS1w4EtoT3dr4eOWO",
+         "Charlie": "IKne3meq5aSn9XLyUdCD",
+         "Charlotte": "XB0fDUnXU5powFXDhCwa",
+         "Chris": "iP95p4xoKVk53GoZ742B",
+         "Daniel": "onwK4e9ZLuTAKqWW03F9",
+         "Eric": "cjVigY5qzO86Huf0OWal",
+         "George": "JBFqnCBsd6RMkjVDRZzb",
+         "Jessica": "cgSgspJ2msm6clMCkdW9",
+         "Laura": "FGY2WhTYpPnrIDTdsKH5",
+         "Liam": "TX3LPaxmHKxFdv7VOQHJ",
+         "Lily": "pFZP5JQG7iQjIQuC4Bku",
+         "Matilda": "XrExE9yKIg1WjnnlVkGX",
+         "Sarah": "EXAVITQu4vr4xnSDxMaL",
+         "Will": "bIHbv24MWmeRgasZH58o",
+         "Neal": "Zp1aWhL05Pi5BkhizFC3",
+     }
+
+     def __init__(self, timeout: int = 20, proxies: Optional[dict] = None):
+         """Initializes the ElevenLabs TTS client."""
+         self.session = requests.Session()
+         self.session.headers.update(self.headers)
+         if proxies:
+             self.session.proxies.update(proxies)
+         self.timeout = timeout
+         self.params = {'allow_unauthenticated': '1'}
+
+     def tts(self, text: str, voice: str = "Brian", verbose: bool = True) -> str:
+         """
+         Converts text to speech using the ElevenLabs API and saves it to a file.
+         """
+         assert (
+             voice in self.all_voices
+         ), f"Voice '{voice}' not one of [{', '.join(self.all_voices.keys())}]"
+
+         # Create the audio_cache directory once, before any requests are made
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+         filename = self.cache_dir / f"{int(time.time())}.mp3"
+
+         # Split text into sentences
+         sentences = split_sentences(text)
+
+         # Function to request audio for each chunk
+         def generate_audio_for_chunk(part_text: str, part_number: int):
+             # Retry transient network failures a bounded number of times
+             # instead of looping forever.
+             max_retries = 3
+             for attempt in range(1, max_retries + 1):
+                 try:
+                     json_data = {'text': part_text, 'model_id': 'eleven_multilingual_v2'}
+                     response = self.session.post(
+                         f'https://api.elevenlabs.io/v1/text-to-speech/{self.all_voices[voice]}',
+                         params=self.params,
+                         headers=self.headers,
+                         json=json_data,
+                         timeout=self.timeout,
+                     )
+                     # Raises for any non-2xx status, so reaching the return
+                     # means the request succeeded.
+                     response.raise_for_status()
+                     return part_number, response.content
+                 except requests.RequestException as e:
+                     if attempt == max_retries:
+                         raise exceptions.FailedToGenerateResponseError(
+                             f"Failed to generate audio for chunk {part_number}: {e}"
+                         )
+                     time.sleep(1)
+
+         try:
+             # Use ThreadPoolExecutor to handle requests concurrently
+             with ThreadPoolExecutor() as executor:
+                 futures = {
+                     executor.submit(generate_audio_for_chunk, sentence.strip(), chunk_num): chunk_num
+                     for chunk_num, sentence in enumerate(sentences, start=1)
+                 }
+
+                 # Dictionary to store results with order preserved
+                 audio_chunks = {}
+
+                 for future in as_completed(futures):
+                     chunk_num = futures[future]
+                     try:
+                         part_number, audio_data = future.result()
+                         audio_chunks[part_number] = audio_data
+                     except Exception as e:
+                         raise exceptions.FailedToGenerateResponseError(
+                             f"Failed to generate audio for chunk {chunk_num}: {e}"
+                         )
+
+             # Combine audio chunks in the correct sequence
+             combined_audio = BytesIO()
+             for part_number in sorted(audio_chunks.keys()):
+                 combined_audio.write(audio_chunks[part_number])
+
+             # Save the combined audio data to a single file
+             with open(filename, 'wb') as f:
+                 f.write(combined_audio.getvalue())
+             if verbose:
+                 print(f"Audio saved to {filename.as_posix()}")
+             return filename.as_posix()
+
+         except requests.exceptions.RequestException as e:
+             raise exceptions.FailedToGenerateResponseError(
+                 f"Failed to perform the operation: {e}"
+             )
+
+
+ # Example usage
+ if __name__ == "__main__":
+     elevenlabs = ElevenlabsTTS()
+     text = "This is a test of the ElevenLabs text-to-speech API. It supports multiple sentences."
+
+     audio_file = elevenlabs.tts(text, voice="Brian")
+     # playsound is imported at the top of the file; play the generated audio.
+     playsound(audio_file)
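+
+     # A hedged variant (constructor parameters as defined above; the proxy URL
+     # is a placeholder, not a real endpoint):
+     #     client = ElevenlabsTTS(timeout=30, proxies={"https": "http://127.0.0.1:8080"})
+     #     client.tts("Hello from a different voice.", voice="Alice")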