import re

import nltk
nltk.download('stopwords')   # stopword lists used by remove_stopwords
nltk.download('punkt')       # tokenizer models used by word_tokenize
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
def normalize(text):
    # Lowercase the whole string.
    return text.lower()

def remove_stopwords(text):
    # Drop English stopwords and any token that is not alphanumeric (punctuation).
    list_stopwords = stopwords.words("english")
    finalText = ' '.join(a for a in word_tokenize(text)
                         if a not in list_stopwords and a.isalnum())
    return finalText
def removenumbers(text):
    # Remove digit sequences.
    re_num = r"\d+"
    text = re.sub(re_num, "", text)
    return text
def stem_text(text):
    # Reduce every token to its stem with the Snowball stemmer.
    stemmer = SnowballStemmer("english")
    return ' '.join(stemmer.stem(a) for a in word_tokenize(text))
def preprocess(text):
    # Full pipeline: lowercase, drop stopwords/punctuation, strip digits, stem.
    text = normalize(text)
    text = remove_stopwords(text)
    text = removenumbers(text)
    text = stem_text(text)
    return text
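
# Minimal usage sketch (the sample sentence is illustrative, not from the
# original file): run the full pipeline on a short string.
if __name__ == "__main__":
    sample = "The 3 quick brown foxes were jumping over 2 lazy dogs in 2021."
    print(preprocess(sample))
    # Expected output (roughly): "quick brown fox jump lazi dog"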