|
import re
|
|
|
# Compiled character class matching one or more consecutive emoji / pictographic
# code points.  Each range is listed explicitly: an earlier revision used the
# single span U+24C2..U+1F251, which also swallowed CJK, Hangul, kana and most
# other non-Latin text lying between those two code points.
emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # Emoticons block
    u"\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
    u"\U0001F680-\U0001F6FF"  # Transport and Map Symbols
    u"\U0001F1E0-\U0001F1FF"  # regional indicators (flag sequences)
    u"\U00002702-\U000027B0"  # Dingbats
    u"\U00002600-\U000026FF"  # Miscellaneous Symbols (sun, umbrella, ...)
    u"\U000024C2"  # circled latin capital letter M
    u"\U0001F200-\U0001F251"  # Enclosed Ideographic Supplement
    u"\U0000FE0F"  # variation selector-16 trailing emoji-style symbols
    "]+",
    flags=re.UNICODE,
)
|
|
|
|
|
def clean_text(x):
    """Normalize free text to lowercase ASCII alphanumeric tokens.

    The steps run in this order:

    1. Lowercase the text.
    2. Drop every non-ASCII character (accented letters, emoji, ...).
    3. Remove URLs (``http://`` / ``https://``), ``@`` mentions and ``#``
       hashtags, each up to the next whitespace.
    4. Delete apostrophes so contractions stay a single token
       (``don't`` -> ``dont``).
    5. Replace every remaining run of non-alphanumeric characters,
       whitespace included, with one space.

    Args:
        x: The string to clean.

    Returns:
        The cleaned string. Tokens are separated by single spaces; one
        leading and/or trailing space may remain when the input started or
        ended with characters that were removed.
    """
    x = x.lower()
    # The ASCII round-trip discards all non-ASCII code points, emoji included,
    # so no separate emoji-removal pass is needed afterwards.
    x = x.encode("ascii", "ignore").decode()
    # Require the "://" so ordinary words that merely contain "http"
    # (e.g. "httpd") are not deleted along with real URLs.
    x = re.sub(r"https?://\S+", " ", x)
    x = re.sub(r"@\S+", " ", x)
    x = re.sub(r"#\S+", " ", x)
    x = x.replace("'", "")
    # This also collapses whitespace runs, since whitespace is non-alphanumeric.
    return re.sub("[^A-Za-z0-9]+", " ", x)
|
|