Spaces:
Build error
Build error
import argparse | |
import re | |
import inflect | |
from training import DEFAULT_ALPHABET | |
# Shared inflect engine, created once at import time and reused by
# clean_text for every number-to-words conversion.
INFLECT_ENGINE = inflect.engine()
# Numbers written with comma separators, optionally with a decimal part
# (e.g. "1,000" or "1,234.5"). At least one comma is required so that the
# integer part of a plain decimal such as "125.75" is not matched here.
COMMA_NUMBER_RE = re.compile(r"([0-9]+(?:,[0-9]+)+(?:\.[0-9]+)?)")
# Decimal numbers such as "3.14".
DECIMAL_NUMBER_RE = re.compile(r"([0-9]+\.[0-9]+)")
# Any remaining run of digits.
NUMBER_RE = re.compile(r"[0-9]+")
# Digits followed by a real ordinal suffix. A group is used rather than a
# character class: "[st|nd|rd|th]+" matches any run of those letters (and "|"),
# e.g. "3d" or "5s".
ORDINALS = re.compile(r"([0-9]+(?:st|nd|rd|th))")
# Exactly one currency symbol followed by the whole amount, including comma
# groups and a decimal part, so "$1,000.50" is captured as a single match.
CURRENCY = re.compile(r"([£$€][0-9]+(?:,[0-9]+)*(?:\.[0-9]+)?)")
# Any run of whitespace (collapsed to a single space by clean_text).
WHITESPACE_RE = re.compile(r"\s+")
# Runs of characters outside lower-case letters, space and basic punctuation.
ALLOWED_CHARACTERS_RE = re.compile("[^a-z ,.!?'-]+")
# Currency symbol -> spoken unit appended after the amount.
MONETARY_REPLACEMENT = {"$": " dollars", "£": " pounds", "€": " euros"}
# Lower-case abbreviations (including their trailing period) paired with the
# word that clean_text substitutes for them. Insertion order is preserved.
ABBREVIATION_REPLACEMENT = dict(
    [
        ("mr.", "mister"),
        ("mrs.", "misess"),
        ("dr.", "doctor"),
        ("no.", "number"),
        ("st.", "saint"),
        ("co.", "company"),
        ("jr.", "junior"),
        ("maj.", "major"),
        ("gen.", "general"),
        ("drs.", "doctors"),
        ("rev.", "reverend"),
        ("lt.", "lieutenant"),
        ("hon.", "honorable"),
        ("sgt.", "sergeant"),
        ("capt.", "captain"),
        ("esq.", "esquire"),
        ("ltd.", "limited"),
        ("col.", "colonel"),
        ("ft.", "fort"),
    ]
)
# A whitespace-delimited token that is followed by more whitespace. Used to
# look abbreviations up token by token, including the first token of the text.
_TOKEN_BEFORE_SPACE_RE = re.compile(r"(?<!\S)\S+(?=\s)")


def _number_to_words(match):
    """Return the inflect spelling of the number or ordinal in a regex match."""
    return INFLECT_ENGINE.number_to_words(match.group(0))


def _currency_to_words(match):
    """Turn a matched currency amount such as "$5" into "5 dollars"."""
    amount = match.group(0)
    # Be tolerant of a match that starts with several symbols or a stray "|":
    # the unit is taken from the symbol directly in front of the number.
    number = amount.lstrip("|" + "".join(MONETARY_REPLACEMENT))
    unit = MONETARY_REPLACEMENT.get(amount[len(amount) - len(number) - 1])
    if unit is None:
        return amount
    return number + unit


def _expand_abbreviation(match):
    """Return the expansion of a matched token, or the token itself."""
    token = match.group(0)
    return ABBREVIATION_REPLACEMENT.get(token, token)


def clean_text(text, symbols=DEFAULT_ALPHABET, remove_invalid_characters=True):
    """
    Cleans text. This includes:
    - Lower-casing and trimming the text
    - Replacing monetary terms (i.e. $5 -> five dollars)
    - Converting ordinals to full words (i.e. 1st -> first)
    - Converting numbers to their full word format (i.e. 100 -> one hundred)
    - Replacing abbreviations (i.e. dr. -> doctor)
    - Collapsing whitespace
    - Removing characters that are not in the symbols list

    Every substitution is applied per regex match with ``re.sub``. Replacing
    each ``findall`` result with ``str.replace`` would corrupt the text whenever
    one match is a substring of another (e.g. "1" inside "10").

    Parameters
    ----------
    text : str
        Text to clean
    symbols : list (optional)
        List of valid symbols in text (default is English alphabet & punctuation)
    remove_invalid_characters : bool (optional)
        Whether to remove characters not in symbols list (default is True)

    Returns
    -------
    str
        Cleaned text
    """
    text = text.strip().lower()

    # Convert currency to words ("$5" -> "5 dollars"); the digits themselves
    # are spelled out by the number passes below.
    text = CURRENCY.sub(_currency_to_words, text)

    # Convert ordinals to words before plain numbers consume their digits.
    text = ORDINALS.sub(_number_to_words, text)

    # Convert comma & decimal numbers to words.
    text = COMMA_NUMBER_RE.sub(_number_to_words, text)
    text = DECIMAL_NUMBER_RE.sub(_number_to_words, text)

    # Convert the remaining standard numbers to words.
    text = NUMBER_RE.sub(_number_to_words, text)

    # Replace abbreviations. A token only counts when it is followed by
    # whitespace, so a sentence-final word such as "no." is left alone.
    text = _TOKEN_BEFORE_SPACE_RE.sub(_expand_abbreviation, text)

    # Collapse whitespace.
    text = WHITESPACE_RE.sub(" ", text)

    # Remove banned characters.
    if remove_invalid_characters:
        allowed = set(symbols)
        text = "".join(c for c in text if c in allowed)
        # Dropping characters can leave doubled or edge spaces behind.
        text = WHITESPACE_RE.sub(" ", text).strip()

    return text
def main():
    """
    Script to clean text for training.

    Reads a file of ``filename|text`` rows, cleans the text part of every row
    with ``clean_text`` and writes the rows to the output file, one per line.

    Raises
    ------
    ValueError
        If a non-blank row does not contain the ``|`` separator
    """
    parser = argparse.ArgumentParser(description="Clean & improve text for training")
    parser.add_argument("-f", "--file", help="Text file path", type=str, required=True)
    parser.add_argument("-o", "--output", help="Output text file path", type=str, required=True)
    args = parser.parse_args()

    # The whole input is read before the output is opened, so the same path
    # may be given for both. UTF-8 is explicit because the text can contain
    # currency symbols that a locale default encoding may not decode.
    with open(args.file, encoding="utf-8") as f:
        rows = f.readlines()

    cleaned_rows = []
    for line_number, row in enumerate(rows, start=1):
        if not row.strip():
            # Blank lines (e.g. a trailing newline) carry no data.
            continue
        # Split on the first "|" only, so text containing "|" is kept whole.
        filename, separator, text = row.partition("|")
        if not separator:
            raise ValueError(
                f"Line {line_number} of {args.file} is missing the '|' separator"
            )
        cleaned_rows.append(f"{filename}|{clean_text(text)}\n")

    with open(args.output, "w", encoding="utf-8") as f:
        f.writelines(cleaned_rows)


if __name__ == "__main__":
    main()