Spaces:
Running
Running
Upload app.py
Browse files
app.py
CHANGED
|
@@ -48,19 +48,29 @@ def parens_to_angles(s):
|
|
| 48 |
def split_num(num):
|
| 49 |
num = num.group()
|
| 50 |
if '.' in num:
|
|
|
|
| 51 |
a, b = num.split('.')
|
| 52 |
return ' point '.join([a, ' '.join(b)])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
year = int(num[:4])
|
| 54 |
if year < 1100 or year % 1000 < 10:
|
| 55 |
return num
|
| 56 |
-
left, right = num[:2], num[2:4]
|
| 57 |
s = 's' if num.endswith('s') else ''
|
| 58 |
if 100 <= year % 1000 <= 999:
|
| 59 |
-
if right ==
|
| 60 |
return f'{left} hundred{s}'
|
| 61 |
-
elif
|
| 62 |
return f'{left} oh {right}{s}'
|
| 63 |
-
return f'{left} {right}{s}'
|
| 64 |
|
| 65 |
def normalize(text):
|
| 66 |
# TODO: Custom text normalization rules?
|
|
@@ -75,11 +85,9 @@ def normalize(text):
|
|
| 75 |
text = re.sub(r'[^\S \n]', ' ', text)
|
| 76 |
text = re.sub(r' +', ' ', text)
|
| 77 |
text = re.sub(r'(?<=\n) +(?=\n)', '', text)
|
| 78 |
-
text = re.sub(r'\d*\.\d+|\b\d{4}s?\b', split_num, text)
|
| 79 |
text = re.sub(r'(?<=\d),(?=\d)', '', text)
|
| 80 |
text = re.sub(r'(?<=\d)-(?=\d)', ' to ', text) # TODO: could be minus
|
| 81 |
-
text = re.sub(r'(?<!:)\b(?:[1-9]|1[0-2]):00\b(?!:)', lambda m: m.group()[:-3] + " o'clock", text)
|
| 82 |
-
text = re.sub(r'(?<=\d):(?=\d)', ' ', text)
|
| 83 |
text = re.sub(r'(?<=\d)S', ' S', text)
|
| 84 |
text = re.sub(r"(?<=[A-Z])'?s", lambda m: m.group().upper(), text)
|
| 85 |
text = re.sub(r'(?:[A-Za-z]\.){2,} [a-z]', lambda m: m.group().replace('.', '-'), text)
|
|
|
|
| 48 |
def split_num(num) -> str:
    """Rewrite a matched decimal, clock time, or year as space-separated parts.

    Written to be used as the replacement callback of ``re.sub``.

    Args:
        num: A regex match object. Its matched text is handled as one of:
            * a decimal, if it contains ``.`` (e.g. ``3.14``, ``.5``);
            * a clock time ``H:MM``, if it contains ``:`` (e.g. ``7:05``);
            * otherwise a four-digit number with an optional trailing ``s``
              (e.g. ``1999``, ``1990s``), treated as a year.

    Returns:
        The replacement text:
            * decimal: ``'3.14'`` -> ``'3 point 1 4'`` (fraction digit by digit);
              with no integer part the result starts with a space
              (``'.5'`` -> ``' point 5'``);
            * time: ``'7:00'`` -> ``"7 o'clock"``, ``'7:05'`` -> ``'7 oh 5'``,
              ``'12:30'`` -> ``'12 30'``;
            * year: ``'1999'`` -> ``'19 99'``, ``'1900s'`` -> ``'19 hundreds'``,
              ``'1905'`` -> ``'19 oh 5'``; values below 1100 or whose last
              three digits are below 010 (e.g. ``2005``) are returned unchanged.
    """
    text = num.group()

    # Decimal: keep the integer part as-is, separate the fractional digits.
    if '.' in text:
        whole, fraction = text.split('.')
        return ' point '.join([whole, ' '.join(fraction)])

    # Time: int() drops leading zeros, so '7:05' yields minute == 5.
    if ':' in text:
        hour, minute = (int(part) for part in text.split(':'))
        if minute == 0:
            return f"{hour} o'clock"
        if minute < 10:
            return f'{hour} oh {minute}'
        return f'{hour} {minute}'

    # Year: only the first four characters are digits; a trailing 's' may follow.
    year = int(text[:4])
    if year < 1100 or year % 1000 < 10:
        return text
    century, tail = text[:2], int(text[2:4])
    plural = 's' if text.endswith('s') else ''
    if 100 <= year % 1000 <= 999:
        if tail == 0:
            return f'{century} hundred{plural}'
        if tail < 10:
            return f'{century} oh {tail}{plural}'
    return f'{century} {tail:02}{plural}'
|
| 74 |
|
| 75 |
def normalize(text):
|
| 76 |
# TODO: Custom text normalization rules?
|
|
|
|
| 85 |
text = re.sub(r'[^\S \n]', ' ', text)
|
| 86 |
text = re.sub(r' +', ' ', text)
|
| 87 |
text = re.sub(r'(?<=\n) +(?=\n)', '', text)
|
| 88 |
+
text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', split_num, text)
|
| 89 |
text = re.sub(r'(?<=\d),(?=\d)', '', text)
|
| 90 |
text = re.sub(r'(?<=\d)-(?=\d)', ' to ', text) # TODO: could be minus
|
|
|
|
|
|
|
| 91 |
text = re.sub(r'(?<=\d)S', ' S', text)
|
| 92 |
text = re.sub(r"(?<=[A-Z])'?s", lambda m: m.group().upper(), text)
|
| 93 |
text = re.sub(r'(?:[A-Za-z]\.){2,} [a-z]', lambda m: m.group().replace('.', '-'), text)
|