suhaibrashid17 commited on
Commit
e8c66fa
·
verified ·
1 Parent(s): 151b142

Upload tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer.py +907 -0
tokenizer.py ADDED
@@ -0,0 +1,907 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
import os
import re
import textwrap
from functools import cached_property

import torch
from num2words import num2words
from spacy.lang.ar import Arabic
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.hi import Hindi
from spacy.lang.ja import Japanese
from spacy.lang.ur import Urdu
from spacy.lang.zh import Chinese
from tokenizers import Tokenizer

from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
from TTS.tts.utils.text.cleaners import collapse_whitespace, lowercase
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
def get_spacy_lang(lang):
    """Return the spaCy language pipeline used for sentence splitting.

    Languages without a dedicated rule set fall back to English, which is
    good enough for sentence-boundary detection.

    BUG FIX: the original referenced ``Urdu`` without importing it, so
    ``lang == "ur"`` raised NameError; ``from spacy.lang.ur import Urdu``
    is now imported at the top of the file.
    """
    if lang == "zh":
        return Chinese()
    elif lang == "ja":
        return Japanese()
    elif lang == "ar":
        return Arabic()
    elif lang == "ur":
        return Urdu()
    elif lang == "es":
        return Spanish()
    elif lang == "hi":
        return Hindi()
    else:
        # For most languages, English does the job
        return English()
41
+
42
+
43
def split_sentence(text, lang, text_split_length=250):
    """Split *text* into chunks no longer than *text_split_length* characters.

    Short inputs (or ``text_split_length is None``) are returned as a single
    left-stripped chunk.  Longer inputs are split on sentence boundaries via
    spaCy's sentencizer; adjacent sentences are packed greedily into chunks,
    and any single sentence longer than the limit is hard-wrapped.
    """
    if text_split_length is None or len(text) < text_split_length:
        return [text.lstrip()]

    pipeline = get_spacy_lang(lang)
    pipeline.add_pipe("sentencizer")

    # Start with an empty accumulator chunk; it is dropped at the end if it
    # was never filled.
    chunks = [""]
    for sent in pipeline(text).sents:
        sent_text = str(sent)
        if len(chunks[-1]) + len(sent_text) <= text_split_length:
            # The sentence still fits in the current chunk: append it.
            chunks[-1] = (chunks[-1] + " " + sent_text).lstrip()
        elif len(sent_text) > text_split_length:
            # A single oversized sentence: hard-wrap it into limit-sized lines.
            chunks.extend(
                str(piece)
                for piece in textwrap.wrap(
                    sent_text,
                    width=text_split_length,
                    drop_whitespace=True,
                    break_on_hyphens=False,
                    tabsize=1,
                )
            )
        else:
            chunks.append(sent_text)

    if len(chunks) > 1 and chunks[0] == "":
        del chunks[0]

    return chunks
77
+
78
+
79
# List of (regular expression, replacement) pairs for abbreviations:
# Keys are ISO 639-1 language codes; each value is a list of
# (compiled pattern, spoken expansion) tuples.  Patterns match the
# abbreviation plus a trailing dot, case-insensitively, at a word boundary
# (Russian is the exception: its patterns match without a trailing dot).
_abbreviations = {
    "en": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("mrs", "misess"),
            ("mr", "mister"),
            ("dr", "doctor"),
            ("st", "saint"),
            ("co", "company"),
            ("jr", "junior"),
            ("maj", "major"),
            ("gen", "general"),
            ("drs", "doctors"),
            ("rev", "reverend"),
            ("lt", "lieutenant"),
            ("hon", "honorable"),
            ("sgt", "sergeant"),
            ("capt", "captain"),
            ("esq", "esquire"),
            ("ltd", "limited"),
            ("col", "colonel"),
            ("ft", "fort"),
        ]
    ],
    "es": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("sra", "señora"),
            ("sr", "señor"),
            ("dr", "doctor"),
            ("dra", "doctora"),
            ("st", "santo"),
            ("co", "compañía"),
            ("jr", "junior"),
            ("ltd", "limitada"),
        ]
    ],
    "fr": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("mme", "madame"),
            ("mr", "monsieur"),
            ("dr", "docteur"),
            ("st", "saint"),
            ("co", "compagnie"),
            ("jr", "junior"),
            ("ltd", "limitée"),
        ]
    ],
    "de": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("fr", "frau"),
            ("dr", "doktor"),
            ("st", "sankt"),
            ("co", "firma"),
            ("jr", "junior"),
        ]
    ],
    "pt": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("sra", "senhora"),
            ("sr", "senhor"),
            ("dr", "doutor"),
            ("dra", "doutora"),
            ("st", "santo"),
            ("co", "companhia"),
            ("jr", "júnior"),
            ("ltd", "limitada"),
        ]
    ],
    "it": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            # ("sig.ra", "signora"),
            ("sig", "signore"),
            ("dr", "dottore"),
            ("st", "santo"),
            ("co", "compagnia"),
            ("jr", "junior"),
            ("ltd", "limitata"),
        ]
    ],
    "pl": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("p", "pani"),
            ("m", "pan"),
            ("dr", "doktor"),
            ("sw", "święty"),
            ("jr", "junior"),
        ]
    ],
    "ar": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            # There are not many common abbreviations in Arabic as in English.
        ]
    ],
    "ur": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            # There are not many common abbreviations in Urdu as in English.
        ]
    ],
    "zh": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.
        ]
    ],
    "cs": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("dr", "doktor"),  # doctor
            ("ing", "inženýr"),  # engineer
            ("p", "pan"),  # Could also map to pani for woman but no easy way to do it
            # Other abbreviations would be specialized and not as common.
        ]
    ],
    "ru": [
        # NOTE: Russian patterns match the bare abbreviation (no trailing dot).
        (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1])
        for x in [
            ("г-жа", "госпожа"),  # Mrs.
            ("г-н", "господин"),  # Mr.
            ("д-р", "доктор"),  # doctor
            # Other abbreviations are less common or specialized.
        ]
    ],
    "nl": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("dhr", "de heer"),  # Mr.
            ("mevr", "mevrouw"),  # Mrs.
            ("dr", "dokter"),  # doctor
            ("jhr", "jonkheer"),  # young lord or nobleman
            # Dutch uses more abbreviations, but these are the most common ones.
        ]
    ],
    "tr": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("b", "bay"),  # Mr.
            ("byk", "büyük"),  # büyük
            ("dr", "doktor"),  # doctor
            # Add other Turkish abbreviations here if needed.
        ]
    ],
    "hu": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("dr", "doktor"),  # doctor
            ("b", "bácsi"),  # Mr.
            ("nőv", "nővér"),  # nurse
            # Add other Hungarian abbreviations here if needed.
        ]
    ],
    "ko": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            # Korean doesn't typically use abbreviations in the same way as Latin-based scripts.
        ]
    ],
    "hi": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            # Hindi doesn't typically use abbreviations in the same way as Latin-based scripts.
        ]
    ],
}
252
+
253
+
254
def expand_abbreviations_multilingual(text, lang="en"):
    """Replace the known abbreviations for *lang* with their spoken form."""
    for pattern, expansion in _abbreviations[lang]:
        text = pattern.sub(expansion, text)
    return text
258
+
259
+
260
# Per-language symbol expansions: each language maps to a list of
# (compiled literal pattern, padded spoken replacement) tuples.
# Replacements are padded with spaces; expand_symbols_multilingual()
# collapses any resulting double spaces.
_symbols_multilingual = {
    "en": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " and "),
            ("@", " at "),
            ("%", " percent "),
            ("#", " hash "),
            ("$", " dollar "),
            ("£", " pound "),
            ("°", " degree "),
        ]
    ],
    "es": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " y "),
            ("@", " arroba "),
            ("%", " por ciento "),
            ("#", " numeral "),
            ("$", " dolar "),
            ("£", " libra "),
            ("°", " grados "),
        ]
    ],
    "fr": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " et "),
            ("@", " arobase "),
            ("%", " pour cent "),
            ("#", " dièse "),
            ("$", " dollar "),
            ("£", " livre "),
            ("°", " degrés "),
        ]
    ],
    "de": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " und "),
            ("@", " at "),
            ("%", " prozent "),
            ("#", " raute "),
            ("$", " dollar "),
            ("£", " pfund "),
            ("°", " grad "),
        ]
    ],
    "pt": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " e "),
            ("@", " arroba "),
            ("%", " por cento "),
            ("#", " cardinal "),
            ("$", " dólar "),
            ("£", " libra "),
            ("°", " graus "),
        ]
    ],
    "it": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " e "),
            ("@", " chiocciola "),
            ("%", " per cento "),
            ("#", " cancelletto "),
            ("$", " dollaro "),
            ("£", " sterlina "),
            ("°", " gradi "),
        ]
    ],
    "pl": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " i "),
            ("@", " małpa "),
            ("%", " procent "),
            ("#", " krzyżyk "),
            ("$", " dolar "),
            ("£", " funt "),
            ("°", " stopnie "),
        ]
    ],
    "ar": [
        # Arabic
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " و "),
            ("@", " على "),
            ("%", " في المئة "),
            ("#", " رقم "),
            ("$", " دولار "),
            ("£", " جنيه "),
            ("°", " درجة "),
        ]
    ],
    "ur": [
        # Urdu symbol expansions
        # NOTE(review): "@" appears twice below; only the first mapping
        # (" پر ") ever takes effect because it consumes all "@" characters
        # before the second pattern runs.
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " اور "),
            ("@", " پر "),
            ("%", " فیصد "),
            ("#", " ہیش "),
            ("$", " ڈالر "),
            ("£", " پاؤنڈ "),
            ("°", " ڈگری "),
            ("*", " ستارہ "),
            ("@", " ای میل "),
            ("!", " اچھا "),
        ]
    ],
    "zh": [
        # Chinese
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " 和 "),
            ("@", " 在 "),
            ("%", " 百分之 "),
            ("#", " 号 "),
            ("$", " 美元 "),
            ("£", " 英镑 "),
            ("°", " 度 "),
        ]
    ],
    "cs": [
        # Czech
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " a "),
            ("@", " na "),
            ("%", " procento "),
            ("#", " křížek "),
            ("$", " dolar "),
            ("£", " libra "),
            ("°", " stupně "),
        ]
    ],
    "ru": [
        # Russian
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " и "),
            ("@", " собака "),
            ("%", " процентов "),
            ("#", " номер "),
            ("$", " доллар "),
            ("£", " фунт "),
            ("°", " градус "),
        ]
    ],
    "nl": [
        # Dutch
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " en "),
            ("@", " bij "),
            ("%", " procent "),
            ("#", " hekje "),
            ("$", " dollar "),
            ("£", " pond "),
            ("°", " graden "),
        ]
    ],
    "tr": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " ve "),
            ("@", " at "),
            ("%", " yüzde "),
            ("#", " diyez "),
            ("$", " dolar "),
            ("£", " sterlin "),
            ("°", " derece "),
        ]
    ],
    "hu": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " és "),
            ("@", " kukac "),
            ("%", " százalék "),
            ("#", " kettőskereszt "),
            ("$", " dollár "),
            ("£", " font "),
            ("°", " fok "),
        ]
    ],
    "ko": [
        # Korean
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " 그리고 "),
            ("@", " 에 "),
            ("%", " 퍼센트 "),
            ("#", " 번호 "),
            ("$", " 달러 "),
            ("£", " 파운드 "),
            ("°", " 도 "),
        ]
    ],
    "hi": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " और "),
            ("@", " ऐट दी रेट "),
            ("%", " प्रतिशत "),
            ("#", " हैश "),
            ("$", " डॉलर "),
            ("£", " पाउंड "),
            ("°", " डिग्री "),
        ]
    ],
}
477
+
478
+
479
def expand_symbols_multilingual(text, lang="en"):
    """Replace symbols (&, @, %, #, $, £, °, ...) with their spoken form for *lang*.

    BUG FIX: the original called ``text.replace(" ", " ")`` — a no-op —
    despite its own comment saying it should remove double spaces.  The
    padded replacements in ``_symbols_multilingual`` routinely create
    double spaces, so collapse ``"  "`` to ``" "`` as intended.
    """
    for regex, replacement in _symbols_multilingual[lang]:
        text = re.sub(regex, replacement, text)
    text = text.replace("  ", " ")  # Ensure there are no double spaces
    return text.strip()
484
+
485
+
486
+ _ordinal_re = {
487
+ "en": re.compile(r"([0-9]+)(st|nd|rd|th)"),
488
+ "es": re.compile(r"([0-9]+)(º|ª|er|o|a|os|as)"),
489
+ "fr": re.compile(r"([0-9]+)(º|ª|er|re|e|ème)"),
490
+ "de": re.compile(r"([0-9]+)(st|nd|rd|th|º|ª|\.(?=\s|$))"),
491
+ "pt": re.compile(r"([0-9]+)(º|ª|o|a|os|as)"),
492
+ "it": re.compile(r"([0-9]+)(º|°|ª|o|a|i|e)"),
493
+ "pl": re.compile(r"([0-9]+)(º|ª|st|nd|rd|th)"),
494
+ "ar": re.compile(r"([0-9]+)(ون|ين|ث|ر|ى)"),
495
+ "cs": re.compile(r"([0-9]+)\.(?=\s|$)"), # In Czech, a dot is often used after the number to indicate ordinals.
496
+ "ru": re.compile(r"([0-9]+)(-й|-я|-е|-ое|-ье|-го)"),
497
+ "nl": re.compile(r"([0-9]+)(de|ste|e)"),
498
+ "tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"),
499
+ "hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"),
500
+ "ko": re.compile(r"([0-9]+)(번째|번|차|째)"),
501
+ "hi": re.compile(r"([0-9]+)(st|nd|rd|th)"), # To check
502
+ "ur": re.compile(r"([0-9]+)(ون|ين|ث|ر|ى)"),
503
+
504
+ }
505
+ _number_re = re.compile(r"[0-9]+")
506
+ _currency_re = {
507
+ "USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"),
508
+ "GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"),
509
+ "EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"),
510
+ }
511
+
512
+ _comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b")
513
+ _dot_number_re = re.compile(r"\b\d{1,3}(.\d{3})*(\,\d+)?\b")
514
+ _decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)")
515
+
516
+
517
+ def _remove_commas(m):
518
+ text = m.group(0)
519
+ if "," in text:
520
+ text = text.replace(",", "")
521
+ return text
522
+
523
+
524
+ def _remove_dots(m):
525
+ text = m.group(0)
526
+ if "." in text:
527
+ text = text.replace(".", "")
528
+ return text
529
+
530
+
531
def _expand_decimal_point(m, lang="en"):
    """re.sub callback: spell out a decimal number ("12.5" / "12,5") for *lang*."""
    normalized = m.group(1).replace(",", ".")
    # num2words uses the legacy "cz" code for Czech.
    return num2words(float(normalized), lang="cz" if lang == "cs" else lang)
534
+
535
+
536
def _expand_currency(m, lang="en", currency="USD"):
    """re.sub callback: spell out a matched currency amount in words for *lang*.

    The matched text is reduced to a plain float (commas treated as decimal
    points, everything but digits and dots stripped) and handed to num2words'
    currency mode.  For whole amounts, the trailing "zero cents" clause that
    num2words appends is trimmed at the language-specific joiner.
    """
    digits_only = re.sub(r"[^\d.]", "", m.group(0).replace(",", "."))
    amount = float(digits_only)
    full_amount = num2words(
        amount, to="currency", currency=currency, lang="cz" if lang == "cs" else lang
    )

    # The word/punctuation num2words inserts between the main unit and cents.
    and_equivalents = {
        "en": ", ",
        "es": " con ",
        "fr": " et ",
        "de": " und ",
        "pt": " e ",
        "it": " e ",
        "pl": ", ",
        "cs": ", ",
        "ru": ", ",
        "nl": ", ",
        "ar": ", ",
        "ur": ", ",
        "tr": ", ",
        "hu": ", ",
        "ko": ", ",
        "hi": ", ",
    }

    if amount.is_integer():
        cut = full_amount.rfind(and_equivalents[lang])
        if cut != -1:
            full_amount = full_amount[:cut]

    return full_amount
565
+
566
+
567
+ def _expand_ordinal(m, lang="en"):
568
+ if lang!="ur":
569
+ return num2words(int(m.group(1)), ordinal=True, lang=lang if lang != "cs" else "cz")
570
+ else:
571
+ return m
572
+
573
+
574
+ def _expand_number(m, lang="en"):
575
+ if lang!="ur":
576
+ return num2words(int(m.group(1)), ordinal=True, lang=lang if lang != "cs" else "cz")
577
+ else:
578
+ return m
579
+
580
+
581
+
582
def expand_numbers_multilingual(text, lang="en"):
    """Convert digits in *text* to spoken words for *lang*.

    Chinese is handled entirely by ``zh_num2words``; Urdu digits are
    deliberately left as-is; all other languages go through the regex
    pipeline (thousands separators, currencies, decimals, ordinals,
    then plain cardinals).

    BUG FIX: in the original, Chinese fell through into the generic
    pipeline after ``zh_num2words`` and crashed with KeyError because
    "zh" has no entry in ``_ordinal_re``.  The branches are now mutually
    exclusive.
    """
    if lang == "zh":
        text = zh_num2words()(text)
    elif lang == "ur":
        pass  # keep Urdu digits untouched (no num2words support)
    else:
        # Strip thousands separators first so the remaining regexes see
        # plain digit runs.
        if lang in ["en", "ru"]:
            text = re.sub(_comma_number_re, _remove_commas, text)
        else:
            text = re.sub(_dot_number_re, _remove_dots, text)
        try:
            text = re.sub(_currency_re["GBP"], lambda m: _expand_currency(m, lang, "GBP"), text)
            text = re.sub(_currency_re["USD"], lambda m: _expand_currency(m, lang, "USD"), text)
            text = re.sub(_currency_re["EUR"], lambda m: _expand_currency(m, lang, "EUR"), text)
        except Exception:
            # Best-effort: num2words lacks currency support for some languages.
            pass
        if lang != "tr":
            # Decimal expansion is skipped for Turkish (see the module test
            # cases: "decimal doesn't work for TR").
            text = re.sub(_decimal_number_re, lambda m: _expand_decimal_point(m, lang), text)
        text = re.sub(_ordinal_re[lang], lambda m: _expand_ordinal(m, lang), text)
        text = re.sub(_number_re, lambda m: _expand_number(m, lang), text)
    return text
603
+
604
def basic_cleaners(text):
    """Minimal cleaning pipeline: lowercasing only."""
    return lowercase(text)
607
def multilingual_cleaners(text, lang):
    """Full normalization pipeline for *lang*: strip quotes, lowercase,
    expand numbers, abbreviations, and symbols, then collapse whitespace."""
    text = text.replace('"', "")
    if lang == "tr":
        # Normalize Turkish dotted/accented capitals before lowercasing.
        for capital, small in (("İ", "i"), ("Ö", "ö"), ("Ü", "ü")):
            text = text.replace(capital, small)
    text = lowercase(text)
    text = expand_numbers_multilingual(text, lang)
    text = expand_abbreviations_multilingual(text, lang)
    text = expand_symbols_multilingual(text, lang=lang)
    return collapse_whitespace(text)
619
+
620
+
621
def chinese_transliterate(text):
    """Romanize Chinese text to numeric-tone pinyin (pypinyin Style.TONE3).

    Raises ImportError if the optional ``pypinyin`` dependency is missing.
    """
    try:
        import pypinyin
    except ImportError as e:
        raise ImportError("Chinese requires: pypinyin") from e
    syllables = pypinyin.pinyin(
        text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True
    )
    # pinyin() returns one candidate list per syllable; take the first of each.
    return "".join(candidates[0] for candidates in syllables)
629
+
630
+
631
def japanese_cleaners(text, katsu):
    """Romanize Japanese text with the given cutlet instance, then lowercase."""
    return lowercase(katsu.romaji(text))
635
+
636
+
637
def korean_transliterate(text):
    """Romanize Korean (hangul) text using the academic transliteration rule.

    Raises ImportError if the optional ``hangul_romanize`` dependency is
    missing.
    """
    try:
        from hangul_romanize import Transliter
        from hangul_romanize.rule import academic
    except ImportError as e:
        raise ImportError("Korean requires: hangul_romanize") from e
    return Transliter(academic).translit(text)
645
+
646
+
647
# Default BPE vocabulary shipped with the package, resolved relative to this file.
DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tokenizer.json")
648
+
649
+
650
class VoiceBpeTokenizer:
    """BPE tokenizer for XTTS: language-aware text normalization followed by
    HuggingFace ``tokenizers`` encoding/decoding.

    Text is preprocessed per language, prefixed with a ``[lang]`` tag, and
    spaces are mapped to the ``[SPACE]`` token before BPE encoding.
    """

    def __init__(self, vocab_file=None):
        # Underlying HuggingFace tokenizers.Tokenizer; stays None until a
        # vocab file is provided.
        self.tokenizer = None
        if vocab_file is not None:
            self.tokenizer = Tokenizer.from_file(vocab_file)
        # Per-language input-length limits (in characters); longer inputs
        # trigger a warning in check_input_length() as they may produce
        # truncated audio.
        self.char_limits = {
            "en": 250,
            "de": 253,
            "fr": 273,
            "es": 239,
            "it": 213,
            "pt": 203,
            "pl": 224,
            "zh": 82,
            "ar": 166,
            "cs": 186,
            "ru": 182,
            "nl": 251,
            "tr": 226,
            "ja": 71,
            "hu": 224,
            "ko": 95,
            "hi": 150,
            "ur": 150,
        }

    @cached_property
    def katsu(self):
        # Lazily construct the cutlet romanizer for Japanese; the import is
        # deferred so cutlet is only required when Japanese is actually used.
        import cutlet

        return cutlet.Cutlet()

    def check_input_length(self, txt, lang):
        # Warn (do not raise) when the text exceeds the per-language
        # character limit; unknown languages default to 250.
        lang = lang.split("-")[0]  # remove the region
        limit = self.char_limits.get(lang, 250)
        if len(txt) > limit:
            logger.warning(
                "The text length exceeds the character limit of %d for language '%s', this might cause truncated audio.",
                limit,
                lang,
            )

    def preprocess_text(self, txt, lang):
        """Normalize *txt* for *lang*: cleaners plus romanization for zh/ko/ja.

        Raises NotImplementedError for unsupported languages.
        """
        if lang in {"ar", "cs", "de", "en", "es", "fr", "hi", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko","ur"}:
            txt = multilingual_cleaners(txt, lang)
            if lang == "zh":
                txt = chinese_transliterate(txt)
            if lang == "ko":
                txt = korean_transliterate(txt)
        elif lang == "ja":
            txt = japanese_cleaners(txt, self.katsu)
        else:
            raise NotImplementedError(f"Language '{lang}' is not supported.")
        return txt

    def encode(self, txt, lang):
        """Preprocess *txt*, prefix it with a ``[lang]`` tag, and BPE-encode.

        Returns the list of token ids.
        """
        lang = lang.split("-")[0]  # remove the region
        self.check_input_length(txt, lang)
        txt = self.preprocess_text(txt, lang)
        # The vocabulary uses the "zh-cn" language tag for Chinese.
        lang = "zh-cn" if lang == "zh" else lang
        txt = f"[{lang}]{txt}"
        txt = txt.replace(" ", "[SPACE]")
        return self.tokenizer.encode(txt).ids

    def decode(self, seq):
        """Convert a token-id sequence (list or torch.Tensor) back to text,
        undoing the [SPACE] mapping and dropping [STOP]/[UNK] markers."""
        if isinstance(seq, torch.Tensor):
            seq = seq.cpu().numpy()
        txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(" ", "")
        txt = txt.replace("[SPACE]", " ")
        txt = txt.replace("[STOP]", "")
        txt = txt.replace("[UNK]", "")
        return txt

    def __len__(self):
        # Vocabulary size of the underlying tokenizer.
        return self.tokenizer.get_vocab_size()

    def get_number_tokens(self):
        # Highest token id + 1; can exceed get_vocab_size() when ids are sparse.
        return max(self.tokenizer.get_vocab().values()) + 1
729
+
730
+
731
def test_expand_numbers_multilingual():
    """Self-test: (input, expected, lang) cases for expand_numbers_multilingual()."""
    test_cases = [
        # English
        ("In 12.5 seconds.", "In twelve point five seconds.", "en"),
        ("There were 50 soldiers.", "There were fifty soldiers.", "en"),
        ("This is a 1st test", "This is a first test", "en"),
        ("That will be $20 sir.", "That will be twenty dollars sir.", "en"),
        ("That will be 20€ sir.", "That will be twenty euro sir.", "en"),
        ("That will be 20.15€ sir.", "That will be twenty euro, fifteen cents sir.", "en"),
        ("That's 100,000.5.", "That's one hundred thousand point five.", "en"),
        # French
        ("En 12,5 secondes.", "En douze virgule cinq secondes.", "fr"),
        ("Il y avait 50 soldats.", "Il y avait cinquante soldats.", "fr"),
        ("Ceci est un 1er test", "Ceci est un premier test", "fr"),
        ("Cela vous fera $20 monsieur.", "Cela vous fera vingt dollars monsieur.", "fr"),
        ("Cela vous fera 20€ monsieur.", "Cela vous fera vingt euros monsieur.", "fr"),
        ("Cela vous fera 20,15€ monsieur.", "Cela vous fera vingt euros et quinze centimes monsieur.", "fr"),
        ("Ce sera 100.000,5.", "Ce sera cent mille virgule cinq.", "fr"),
        # German
        ("In 12,5 Sekunden.", "In zwölf Komma fünf Sekunden.", "de"),
        ("Es gab 50 Soldaten.", "Es gab fünfzig Soldaten.", "de"),
        ("Dies ist ein 1. Test", "Dies ist ein erste Test", "de"),  # Issue with gender
        ("Das macht $20 Herr.", "Das macht zwanzig Dollar Herr.", "de"),
        ("Das macht 20€ Herr.", "Das macht zwanzig Euro Herr.", "de"),
        ("Das macht 20,15€ Herr.", "Das macht zwanzig Euro und fünfzehn Cent Herr.", "de"),
        # Spanish
        ("En 12,5 segundos.", "En doce punto cinco segundos.", "es"),
        ("Había 50 soldados.", "Había cincuenta soldados.", "es"),
        ("Este es un 1er test", "Este es un primero test", "es"),
        ("Eso le costará $20 señor.", "Eso le costará veinte dólares señor.", "es"),
        ("Eso le costará 20€ señor.", "Eso le costará veinte euros señor.", "es"),
        ("Eso le costará 20,15€ señor.", "Eso le costará veinte euros con quince céntimos señor.", "es"),
        # Italian
        ("In 12,5 secondi.", "In dodici virgola cinque secondi.", "it"),
        ("C'erano 50 soldati.", "C'erano cinquanta soldati.", "it"),
        ("Questo è un 1° test", "Questo è un primo test", "it"),
        ("Ti costerà $20 signore.", "Ti costerà venti dollari signore.", "it"),
        ("Ti costerà 20€ signore.", "Ti costerà venti euro signore.", "it"),
        ("Ti costerà 20,15€ signore.", "Ti costerà venti euro e quindici centesimi signore.", "it"),
        # Portuguese
        ("Em 12,5 segundos.", "Em doze vírgula cinco segundos.", "pt"),
        ("Havia 50 soldados.", "Havia cinquenta soldados.", "pt"),
        ("Este é um 1º teste", "Este é um primeiro teste", "pt"),
        ("Isso custará $20 senhor.", "Isso custará vinte dólares senhor.", "pt"),
        ("Isso custará 20€ senhor.", "Isso custará vinte euros senhor.", "pt"),
        (
            "Isso custará 20,15€ senhor.",
            "Isso custará vinte euros e quinze cêntimos senhor.",
            "pt",
        ),  # "cêntimos" should be "centavos" num2words issue
        # Polish
        ("W 12,5 sekundy.", "W dwanaście przecinek pięć sekundy.", "pl"),
        ("Było 50 żołnierzy.", "Było pięćdziesiąt żołnierzy.", "pl"),
        ("To będzie kosztować 20€ panie.", "To będzie kosztować dwadzieścia euro panie.", "pl"),
        ("To będzie kosztować 20,15€ panie.", "To będzie kosztować dwadzieścia euro, piętnaście centów panie.", "pl"),
        # Arabic
        ("في الـ 12,5 ثانية.", "في الـ اثنا عشر , خمسون ثانية.", "ar"),
        ("كان هناك 50 جنديًا.", "كان هناك خمسون جنديًا.", "ar"),
        # ("ستكون النتيجة $20 يا سيد.", 'ستكون النتيجة عشرون دولار يا سيد.', 'ar'), # $ and € are mising from num2words
        # ("ستكون النتيجة 20€ يا سيد.", 'ستكون النتيجة عشرون يورو يا سيد.', 'ar'),
        # Czech
        ("Za 12,5 vteřiny.", "Za dvanáct celá pět vteřiny.", "cs"),
        ("Bylo tam 50 vojáků.", "Bylo tam padesát vojáků.", "cs"),
        ("To bude stát 20€ pane.", "To bude stát dvacet euro pane.", "cs"),
        ("To bude 20.15€ pane.", "To bude dvacet euro, patnáct centů pane.", "cs"),
        # Russian
        ("Через 12.5 секунды.", "Через двенадцать запятая пять секунды.", "ru"),
        ("Там было 50 солдат.", "Там было пятьдесят солдат.", "ru"),
        ("Это будет 20.15€ сэр.", "Это будет двадцать евро, пятнадцать центов сэр.", "ru"),
        ("Это будет стоить 20€ господин.", "Это будет стоить двадцать евро господин.", "ru"),
        # Dutch
        ("In 12,5 seconden.", "In twaalf komma vijf seconden.", "nl"),
        ("Er waren 50 soldaten.", "Er waren vijftig soldaten.", "nl"),
        ("Dat wordt dan $20 meneer.", "Dat wordt dan twintig dollar meneer.", "nl"),
        ("Dat wordt dan 20€ meneer.", "Dat wordt dan twintig euro meneer.", "nl"),
        # Chinese (Simplified)
        ("在12.5秒内", "在十二点五秒内", "zh"),
        ("有50名士兵", "有五十名士兵", "zh"),
        # ("那将是$20先生", '那将是二十美元先生', 'zh'), currency doesn't work
        # ("那将是20€先生", '那将是二十欧元先生', 'zh'),
        # Turkish
        # ("12,5 saniye içinde.", 'On iki virgül beş saniye içinde.', 'tr'), # decimal doesn't work for TR
        ("50 asker vardı.", "elli asker vardı.", "tr"),
        ("Bu 1. test", "Bu birinci test", "tr"),
        # ("Bu 100.000,5.", 'Bu yüz bin virgül beş.', 'tr'),
        # Hungarian
        ("12,5 másodperc alatt.", "tizenkettő egész öt tized másodperc alatt.", "hu"),
        ("50 katona volt.", "ötven katona volt.", "hu"),
        ("Ez az 1. teszt", "Ez az első teszt", "hu"),
        # Korean
        ("12.5 초 안에.", "십이 점 다섯 초 안에.", "ko"),
        ("50 명의 병사가 있었다.", "오십 명의 병사가 있었다.", "ko"),
        ("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다", "ko"),
        # Hindi
        ("12.5 सेकंड में।", "साढ़े बारह सेकंड में।", "hi"),
        ("50 सैनिक थे।", "पचास सैनिक थे।", "hi"),
    ]
    for a, b, lang in test_cases:
        out = expand_numbers_multilingual(a, lang=lang)
        assert out == b, f"'{out}' vs '{b}'"
831
+
832
+
833
def test_abbreviations_multilingual():
    """Self-test: (input, expected, lang) cases for expand_abbreviations_multilingual()."""
    test_cases = [
        # English
        ("Hello Mr. Smith.", "Hello mister Smith.", "en"),
        ("Dr. Jones is here.", "doctor Jones is here.", "en"),
        # Spanish
        ("Hola Sr. Garcia.", "Hola señor Garcia.", "es"),
        ("La Dra. Martinez es muy buena.", "La doctora Martinez es muy buena.", "es"),
        # French
        ("Bonjour Mr. Dupond.", "Bonjour monsieur Dupond.", "fr"),
        ("Mme. Moreau est absente aujourd'hui.", "madame Moreau est absente aujourd'hui.", "fr"),
        # German
        ("Frau Dr. Müller ist sehr klug.", "Frau doktor Müller ist sehr klug.", "de"),
        # Portuguese
        ("Olá Sr. Silva.", "Olá senhor Silva.", "pt"),
        ("Dra. Costa, você está disponível?", "doutora Costa, você está disponível?", "pt"),
        # Italian
        ("Buongiorno, Sig. Rossi.", "Buongiorno, signore Rossi.", "it"),
        # ("Sig.ra Bianchi, posso aiutarti?", 'signora Bianchi, posso aiutarti?', 'it'), # Issue with matching that pattern
        # Polish
        ("Dzień dobry, P. Kowalski.", "Dzień dobry, pani Kowalski.", "pl"),
        ("M. Nowak, czy mogę zadać pytanie?", "pan Nowak, czy mogę zadać pytanie?", "pl"),
        # Czech
        ("P. Novák", "pan Novák", "cs"),
        ("Dr. Vojtěch", "doktor Vojtěch", "cs"),
        # Dutch
        ("Dhr. Jansen", "de heer Jansen", "nl"),
        ("Mevr. de Vries", "mevrouw de Vries", "nl"),
        # Russian
        ("Здравствуйте Г-н Иванов.", "Здравствуйте господин Иванов.", "ru"),
        ("Д-р Смирнов здесь, чтобы увидеть вас.", "доктор Смирнов здесь, чтобы увидеть вас.", "ru"),
        # Turkish
        ("Merhaba B. Yılmaz.", "Merhaba bay Yılmaz.", "tr"),
        ("Dr. Ayşe burada.", "doktor Ayşe burada.", "tr"),
        # Hungarian
        ("Dr. Szabó itt van.", "doktor Szabó itt van.", "hu"),
    ]

    for a, b, lang in test_cases:
        out = expand_abbreviations_multilingual(a, lang=lang)
        assert out == b, f"'{out}' vs '{b}'"
874
+
875
+
876
def test_symbols_multilingual():
    """Self-test: (input, expected, lang) cases for expand_symbols_multilingual()."""
    test_cases = [
        ("I have 14% battery", "I have 14 percent battery", "en"),
        ("Te veo @ la fiesta", "Te veo arroba la fiesta", "es"),
        ("J'ai 14° de fièvre", "J'ai 14 degrés de fièvre", "fr"),
        ("Die Rechnung beträgt £ 20", "Die Rechnung beträgt pfund 20", "de"),
        ("O meu email é ana&[email protected]", "O meu email é ana e joao arroba gmail.com", "pt"),
        ("linguaggio di programmazione C#", "linguaggio di programmazione C cancelletto", "it"),
        ("Moja temperatura to 36.6°", "Moja temperatura to 36.6 stopnie", "pl"),
        ("Mám 14% baterie", "Mám 14 procento baterie", "cs"),
        ("Těším se na tebe @ party", "Těším se na tebe na party", "cs"),
        ("У меня 14% заряда", "У меня 14 процентов заряда", "ru"),
        ("Я буду @ дома", "Я буду собака дома", "ru"),
        ("Ik heb 14% batterij", "Ik heb 14 procent batterij", "nl"),
        ("Ik zie je @ het feest", "Ik zie je bij het feest", "nl"),
        ("لدي 14% في البطارية", "لدي 14 في المئة في البطارية", "ar"),
        ("我的电量为 14%", "我的电量为 14 百分之", "zh"),
        ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"),
        ("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"),
        ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"),
        ("मेरे पास 14% बैटरी है।", "मेरे पास चौदह प्रतिशत बैटरी है।", "hi"),
    ]

    for a, b, lang in test_cases:
        out = expand_symbols_multilingual(a, lang=lang)
        assert out == b, f"'{out}' vs '{b}'"
902
+
903
+
904
# Run the module's self-tests when executed directly.
if __name__ == "__main__":
    test_expand_numbers_multilingual()
    test_abbreviations_multilingual()
    test_symbols_multilingual()