|
import re
import string

import nltk
import spacy
from nltk.corpus import stopwords
|
|
|
# Ensure the NLTK stop-word corpus is available (no-op if already present).
nltk.download('stopwords')

# English stop words as a list of lowercase strings.
nltk_stop_words = stopwords.words('english')

# Small English spaCy pipeline; provides tokenization and lemmatization.
nlp = spacy.load("en_core_web_sm")
|
|
|
|
|
def process_text(text):
    """Normalize *text* for downstream NLP tasks.

    Steps:
        1. Lowercase the input.
        2. Tokenize and lemmatize with the module-level spaCy pipeline.
        3. Drop NLTK English stop words and punctuation tokens.
        4. Strip remaining non-alphanumeric characters from each lemma.

    Args:
        text: Raw input string.

    Returns:
        A single space-joined string of cleaned, lemmatized tokens.
    """
    doc = nlp(text.lower())

    # Set membership is O(1); the module-level stop-word list would cost
    # O(n) per token.
    stop_words = set(nltk_stop_words)

    # Compile the cleanup pattern once instead of implicitly per token.
    non_alnum = re.compile(r'[^a-zA-Z0-9]')

    # Keep only alphanumeric characters of each lemma; skip stop words
    # and single-character punctuation tokens entirely.
    processed_tokens = [
        non_alnum.sub('', token.lemma_)
        for token in doc
        if token.text not in stop_words and token.text not in string.punctuation
    ]

    # Some lemmas reduce to the empty string after stripping; drop them.
    return " ".join(word for word in processed_tokens if word)