Browse files
@@ -58,18 +58,14 @@ def parens_to_angles(s):
58 |
def split_num(num):
59 |
num =
60 |
if '.' in num:
61 |
62 |
a, b = num.split('.')
63 |
return ' point '.join([a, ' '.join(b)])
64 |
elif ':' in num:
65 |
# Time
66 |
h, m = [int(n) for n in num.split(':')]
67 |
if m == 0:
68 |
return f"{h} o'clock"
69 |
elif m < 10:
70 |
return f'{h} oh {m}'
71 |
return f'{h} {m}'
72 |
# Year
73 |
year = int(num[:4])
74 |
if year < 1100 or year % 1000 < 10:
75 |
return num
@@ -82,6 +78,24 @@ def split_num(num):
82 |
return f'{left} oh {right}{s}'
83 |
return f'{left} {right}{s}'
84 |
85 |
def normalize(text):
86 |
# TODO: Custom text normalization rules?
87 |
text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text)
@@ -97,6 +111,8 @@ def normalize(text):
97 |
text = re.sub(r'(?<=\n) +(?=\n)', '', text)
98 |
text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', split_num, text)
99 |
text = re.sub(r'(?<=\d),(?=\d)', '', text)
100 |
text = re.sub(r'(?<=\d)-(?=\d)', ' to ', text) # TODO: could be minus
101 |
text = re.sub(r'(?<=\d)S', ' S', text)
102 |
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
58 |
def split_num(num):
59 |
num =
60 |
if '.' in num:
61 |
return num
62 |
elif ':' in num:
63 |
h, m = [int(n) for n in num.split(':')]
64 |
if m == 0:
65 |
return f"{h} o'clock"
66 |
elif m < 10:
67 |
return f'{h} oh {m}'
68 |
return f'{h} {m}'
69 |
year = int(num[:4])
70 |
if year < 1100 or year % 1000 < 10:
71 |
return num
78 |
return f'{left} oh {right}{s}'
79 |
return f'{left} {right}{s}'
80 |
81 |
def flip_money(m):
    """Rewrite a matched currency amount so the unit is spelled out after the number.

    Meant to be used as the replacement callback of ``re.sub``.

    Args:
        m: A regex match object whose matched text starts with ``$`` or ``£``
            followed by the amount, e.g. ``$5``, ``£2.50`` or ``$3 million``.

    Returns:
        The amount with the currency symbol removed and the unit written as
        words, e.g. ``'5 dollars'`` or ``'2 pounds and 50 pence'``.
    """
    # re.sub hands the callback a match object; everything below works on
    # the matched text itself.
    m = m.group()
    # Any leading symbol other than '$' is treated as pounds.
    bill = 'dollar' if m[0] == '$' else 'pound'
    if m[-1].isalpha():
        # The match ends in a word (such as a magnitude like "million"):
        # keep the amount text as-is and always use the plural unit.
        return f'{m[1:]} {bill}s'
    elif '.' not in m:
        # Whole amount: singular only for exactly "1".
        s = '' if m[1:] == '1' else 's'
        return f'{m[1:]} {bill}{s}'
    b, c = m[1:].split('.')
    s = '' if b == '1' else 's'
    # Right-pad so a single fractional digit counts as tens ("$3.5" -> 50).
    c = int(c.ljust(2, '0'))
    coins = f"cent{'' if c == 1 else 's'}" if m[0] == '$' else ('penny' if c == 1 else 'pence')
    return f'{b} {bill}{s} and {c} {coins}'
94 |
95 |
def point_num(num):
    """Spell out a matched decimal number with "point" and spaced fraction digits.

    Meant to be used as the replacement callback of ``re.sub``.

    Args:
        num: A regex match object whose matched text contains exactly one
            ``.``, e.g. ``3.14``.

    Returns:
        The text before the dot, the word ``point``, then each character
        after the dot separated by single spaces, e.g. ``'3 point 1 4'``.

    Raises:
        ValueError: If the matched text does not contain exactly one ``.``.
    """
    # re.sub hands the callback a match object; split its text at the dot.
    a, b = num.group().split('.')
    # ' '.join over a string separates its characters, so the fractional
    # digits come out one by one.
    return ' point '.join([a, ' '.join(b)])
98 |
99 |
def normalize(text):
100 |
# TODO: Custom text normalization rules?
101 |
text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text)
111 |
text = re.sub(r'(?<=\n) +(?=\n)', '', text)
112 |
text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', split_num, text)
113 |
text = re.sub(r'(?<=\d),(?=\d)', '', text)
114 |
text = re.sub(r'[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b', flip_money, text)
115 |
text = re.sub(r'\d*\.\d+', point_num, text)
116 |
text = re.sub(r'(?<=\d)-(?=\d)', ' to ', text) # TODO: could be minus
117 |
text = re.sub(r'(?<=\d)S', ' S', text)
118 |
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)