|
import re |
|
|
|
# Typographic characters that are normalised to their ASCII counterparts.
# The keys are spelled as \u escapes so that they cannot be silently
# corrupted (or collapsed into one duplicate key) by an encoding round-trip.
SYMBOLS_MAPPING = {
    "\u2018": "'",  # LEFT SINGLE QUOTATION MARK
    "\u2019": "'",  # RIGHT SINGLE QUOTATION MARK
}
|
|
|
# Alternation over every key of SYMBOLS_MAPPING. Each key is passed through
# re.escape so it is matched literally rather than as regex syntax.
REPLACE_SYMBOL_REGEX = re.compile("|".join(map(re.escape, SYMBOLS_MAPPING)))
|
|
|
|
|
# Matches one or more consecutive code points drawn from the ranges below.
EMOJI_REGEX = re.compile(
    "[{}]+".format(
        "".join(
            (
                "\U0001F600-\U0001F64F",  # emoticons
                "\U0001F300-\U0001F5FF",  # miscellaneous symbols and pictographs
                "\U0001F680-\U0001F6FF",  # transport and map symbols
                "\U0001F1E0-\U0001F1FF",  # covers the regional indicators (flags)
            )
        )
    ),
    flags=re.UNICODE,
)
|
|
|
|
|
def clean_text(text):
    """Return a normalised copy of *text*.

    The steps run in this order:

    1. Strip leading and trailing whitespace.
    2. Replace every match of ``REPLACE_SYMBOL_REGEX`` with its entry in
       ``SYMBOLS_MAPPING``.
    3. Delete every match of ``EMOJI_REGEX``.
    4. Collapse each run of two or more commas into a single comma.

    NOTE(review): *text* is presumably a ``str`` (it must provide
    ``strip()`` and be accepted by ``re``) - confirm against callers.
    """
    stripped = text.strip()

    # Look the matched symbol up in the mapping to get its replacement.
    normalised = REPLACE_SYMBOL_REGEX.sub(
        lambda match: SYMBOLS_MAPPING[match.group()],
        stripped,
    )

    without_emoji = EMOJI_REGEX.sub("", normalised)

    # Keep only the first character of each comma run, i.e. one comma.
    return re.sub(r"[,]{2,}", lambda run: run.group()[0], without_emoji)
|
|