Spaces:

0xrushi
/

Priyanka-Chopra-TTS

Build error

App Files Files

xet

Community

Priyanka-Chopra-TTS / training /clean_text.py

0xrushi

first commit

bcfd9f0 over 3 years ago

raw

history blame

3.75 kB

	import argparse
	import re

	import inflect
	from training import DEFAULT_ALPHABET

	# Shared inflect engine; clean_text calls its number_to_words() to spell out
	# ordinals and numbers.
	INFLECT_ENGINE = inflect.engine()
	# Digit runs of three or more characters that may contain commas (e.g. "1,000").
	COMMA_NUMBER_RE = re.compile(r"([0-9][0-9\,]+[0-9])")
	# Numbers with a fractional part (e.g. "3.14").
	DECIMAL_NUMBER_RE = re.compile(r"([0-9]+\.[0-9]+)")
	# Any remaining run of digits.
	NUMBER_RE = re.compile(r"[0-9]+")
	# Digits followed by exactly one ordinal suffix. The suffixes are a real
	# alternation (not a character class), so inputs such as "5s" or "3h" are not
	# mistaken for ordinals; the trailing \b keeps "1sts" from matching "1st".
	ORDINALS = re.compile(r"([0-9]+(?:st|nd|rd|th))\b")
	# One currency symbol followed by an amount, including comma groups and an
	# optional fractional part, so "$1,000.50" is captured as a single amount.
	CURRENCY = re.compile(r"([£$€][0-9]+(?:,[0-9]+)*(?:\.[0-9]+)?)")
	WHITESPACE_RE = re.compile(r"\s+")
	# Complement of the permitted character set; not referenced in the visible
	# part of this file.
	ALLOWED_CHARACTERS_RE = re.compile("[^a-z ,.!?'-]+")
	# Currency symbol -> word appended after the amount (leading space included).
	MONETARY_REPLACEMENT = {"$": " dollars", "£": " pounds", "€": " euros"}
	# Lower-case abbreviation (with trailing period) -> spoken form.
	# NOTE(review): "misess" looks like a deliberate phonetic spelling for speech
	# synthesis rather than a typo - confirm before changing.
	ABBREVIATION_REPLACEMENT = {
	    "mr.": "mister",
	    "mrs.": "misess",
	    "dr.": "doctor",
	    "no.": "number",
	    "st.": "saint",
	    "co.": "company",
	    "jr.": "junior",
	    "maj.": "major",
	    "gen.": "general",
	    "drs.": "doctors",
	    "rev.": "reverend",
	    "lt.": "lieutenant",
	    "hon.": "honorable",
	    "sgt.": "sergeant",
	    "capt.": "captain",
	    "esq.": "esquire",
	    "ltd.": "limited",
	    "col.": "colonel",
	    "ft.": "fort",
	}


	# Decimals are tried before comma-grouped numbers so that "3.14159" is consumed
	# whole instead of having its fraction digits matched as a separate long number.
	_DECIMAL_OR_COMMA_NUMBER_RE = re.compile(
	    f"{DECIMAL_NUMBER_RE.pattern}|{COMMA_NUMBER_RE.pattern}"
	)


	def _spell_out_number(match):
	    """Return the matched number or ordinal written out in words."""
	    return INFLECT_ENGINE.number_to_words(match.group(0))


	def _spell_out_currency(match):
	    """Move the currency symbol behind its amount as a word (e.g. $5 -> 5 dollars)."""
	    amount = match.group(0)
	    digits_start = next(
	        (index for index, char in enumerate(amount) if char.isdigit()), len(amount)
	    )
	    prefix, number = amount[:digits_start], amount[digits_start:]
	    for symbol in prefix:
	        if symbol in MONETARY_REPLACEMENT:
	            return number + MONETARY_REPLACEMENT[symbol]
	    # No known currency symbol in the prefix: leave the text untouched.
	    return amount


	def clean_text(text, symbols=DEFAULT_ALPHABET, remove_invalid_characters=True):
	    """
	    Cleans text. This includes:
	    - Lower-casing and stripping surrounding whitespace
	    - Replacing monetary terms (e.g. $5 -> 5 dollars)
	    - Converting ordinals to full words (e.g. 1st -> first)
	    - Converting numbers to their full word format (e.g. 100 -> one hundred)
	    - Collapsing runs of whitespace into a single space
	    - Replacing abbreviations (e.g. dr. -> doctor)
	    - Removing characters that are not in ``symbols``

	    Parameters
	    ----------
	    text : str
	        Text to clean
	    symbols : list (optional)
	        Valid symbols in text; any container supporting ``in`` works
	        (default is DEFAULT_ALPHABET)
	    remove_invalid_characters : bool (optional)
	        Whether to remove characters not in symbols list (default is True)

	    Returns
	    -------
	    str
	        Cleaned text
	    """
	    text = text.strip().lower()

	    # Every conversion is a single regex pass with a callback. Replacing the
	    # findall() results one by one with str.replace() corrupts the text when
	    # one match is a substring of another ("1 10" became "one one0").
	    text = CURRENCY.sub(_spell_out_currency, text)
	    text = ORDINALS.sub(_spell_out_number, text)
	    text = _DECIMAL_OR_COMMA_NUMBER_RE.sub(_spell_out_number, text)
	    text = NUMBER_RE.sub(_spell_out_number, text)

	    # Collapse whitespace
	    text = WHITESPACE_RE.sub(" ", text)

	    # Replace abbreviations token by token so they are also expanded at the
	    # start of the text and when two follow each other. The final token is left
	    # alone: there a trailing period is more likely the end of the sentence
	    # ("i said no.") than an abbreviation.
	    words = text.split(" ")
	    last_index = len(words) - 1
	    text = " ".join(
	        ABBREVIATION_REPLACEMENT.get(word, word) if index < last_index else word
	        for index, word in enumerate(words)
	    )

	    # Remove banned characters
	    if remove_invalid_characters:
	        text = "".join(char for char in text if char in symbols)
	    return text


	if __name__ == "__main__":
	    # Script to clean text for training. Each non-blank input line is expected
	    # to be "filename|text"; the text part is cleaned and the line is written
	    # to the output file in the same format.
	    parser = argparse.ArgumentParser(description="Clean & improve text for training")
	    parser.add_argument("-f", "--file", help="Text file path", type=str, required=True)
	    parser.add_argument("-o", "--output", help="Output text file path", type=str, required=True)
	    args = parser.parse_args()

	    cleaned_text = []

	    # Explicit UTF-8 so currency symbols such as £ and € are read the same way
	    # regardless of the platform's locale encoding.
	    with open(args.file, encoding="utf-8") as f:
	        for line_number, row in enumerate(f, start=1):
	            if not row.strip():
	                # Tolerate blank lines (e.g. a trailing newline at end of file).
	                continue
	            # Split on the first "|" only, so a pipe inside the text does not
	            # break the unpacking. ("\|" was an invalid escape sequence that
	            # split on a literal backslash followed by a pipe.)
	            filename, separator, text = row.partition("|")
	            if not separator:
	                raise ValueError(
	                    f"{args.file}:{line_number}: expected 'filename|text', got {row!r}"
	                )
	            cleaned_text.append(f"{filename}|{clean_text(text)}")

	    with open(args.output, "w", encoding="utf-8") as f:
	        f.writelines(f"{line}\n" for line in cleaned_text)