File size: 851 Bytes
7901fc5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import re
import nltk
# Fetch the NLTK 'stopwords' resource. This is a top-level call, so it runs
# every time this module is imported.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Fetch the NLTK 'punkt' resource (also at import time) — presumably the
# tokenizer data that sent_tokenize/word_tokenize rely on; confirm against the
# installed NLTK version.
nltk.download('punkt')
from nltk import sent_tokenize,word_tokenize
from nltk.stem.snowball import SnowballStemmer


def normalize(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

def remove_stopwords(text):
    """Drop English stopwords and non-alphanumeric tokens from ``text``.

    The text is split with ``word_tokenize``. A token is kept only when it is
    not in NLTK's English stopword list and ``str.isalnum()`` is true for it,
    so punctuation-only tokens are discarded. The stopword comparison is an
    exact string match; this function does not lowercase its input.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # A set gives O(1) membership tests; the stopword list would otherwise be
    # scanned linearly once per token.
    stopword_set = frozenset(stopwords.words("english"))
    kept_tokens = [
        token
        for token in word_tokenize(text)
        if token not in stopword_set and token.isalnum()
    ]
    return " ".join(kept_tokens)

def removenumbers(text):
    """Return ``text`` with every run of digits removed.

    Only the digit characters are deleted; surrounding characters, including
    whitespace, are left as-is, so ``"a 12 b"`` becomes ``"a  b"``.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence, which
    # Python warns about and plans to reject in a future version.
    digits_pattern = r"\d+"
    return re.sub(digits_pattern, "", text)

def stem_text(text):
    """Stem every token of ``text`` with the English Snowball stemmer.

    The text is split with ``word_tokenize`` and the stemmed tokens are joined
    back together with single spaces.
    """
    english_stemmer = SnowballStemmer("english")
    stems = [english_stemmer.stem(token) for token in word_tokenize(text)]
    return " ".join(stems)

def preprocess(text):
    """Run the full cleaning pipeline on ``text`` and return the result.

    Steps, applied in order: lowercase, remove stopwords and non-alphanumeric
    tokens, strip digits, stem.
    """
    pipeline = (normalize, remove_stopwords, removenumbers, stem_text)
    for step in pipeline:
        text = step(text)
    return text