Spaces:
Sleeping
Sleeping
File size: 851 Bytes
7901fc5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')
from nltk import sent_tokenize,word_tokenize
from nltk.stem.snowball import SnowballStemmer
def normalize(text):
    """Return a lowercased copy of ``text``.

    Args:
        text: The string to normalize.

    Returns:
        The result of ``text.lower()``.
    """
    lowered = text.lower()
    return lowered
def remove_stopwords(text):
    """Drop English stopwords and non-alphanumeric tokens from ``text``.

    The text is split with ``word_tokenize``. A token is kept only when it is
    not in NLTK's English stopword list and ``str.isalnum()`` is true for it,
    so tokens consisting of punctuation are discarded as well.

    The stopword comparison is an exact (case-sensitive) match; ``preprocess``
    lowercases the text before calling this function.

    Args:
        text: The string to filter.

    Returns:
        The surviving tokens joined by single spaces (an empty string when no
        token survives).
    """
    # A set gives O(1) membership tests; scanning a list for every token
    # costs O(len(stopwords)) per lookup.
    stop_set = set(stopwords.words("english"))
    kept = [
        token
        for token in word_tokenize(text)
        if token not in stop_set and token.isalnum()
    ]
    return " ".join(kept)
def removenumbers(text):
    """Delete every run of digits from ``text``.

    Only the digit characters are removed; surrounding whitespace is left
    untouched, so ``"a 1 b"`` becomes ``"a  b"``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all matches of ``\\d+`` replaced by the empty string.
    """
    # Raw string: "\d" in a normal literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on modern Python.
    re_num = r"\d+"
    return re.sub(re_num, "", text)
def stem_text(text):
    """Replace every token of ``text`` with its English Snowball stem.

    The input is split with ``word_tokenize``, each token is passed through
    ``SnowballStemmer("english").stem``, and the stems are joined with single
    spaces.

    Args:
        text: The string to stem.

    Returns:
        The space-joined stems.
    """
    english_stemmer = SnowballStemmer("english")
    stems = [english_stemmer.stem(token) for token in word_tokenize(text)]
    return " ".join(stems)
def preprocess(text):
    """Run the full cleaning pipeline on ``text``.

    The steps are applied in this order, each receiving the previous step's
    output: ``normalize``, ``remove_stopwords``, ``removenumbers``,
    ``stem_text``.

    Args:
        text: The raw input string.

    Returns:
        The string produced by the last pipeline step.
    """
    pipeline = (normalize, remove_stopwords, removenumbers, stem_text)
    for step in pipeline:
        text = step(text)
    return text
|