import re
import string

import nltk
import spacy
from nltk.corpus import stopwords

nltk.download('stopwords')

# Get the list of English stop words from NLTK (a set makes membership checks fast)
nltk_stop_words = set(stopwords.words('english'))

# Load the spaCy model for English
nlp = spacy.load("en_core_web_sm")


def process_text(text):
    """
    Process text by:
    1. Lowercasing
    2. Removing punctuation and non-alphanumeric characters
    3. Removing stop words
    4. Lemmatization
    """
    # Step 1: Tokenization & processing with spaCy (lowercase first)
    doc = nlp(text.lower())

    # Step 2: Filter out stop words and punctuation, lemmatize each remaining
    # token, and strip any leftover non-alphanumeric characters
    processed_tokens = [
        re.sub(r'[^a-zA-Z0-9]', '', token.lemma_)
        for token in doc
        if token.text not in nltk_stop_words and token.text not in string.punctuation
    ]

    # Step 3: Drop empty strings left behind by the regex and join into one string
    return " ".join(word for word in processed_tokens if word)
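
# A minimal usage sketch. It assumes the "en_core_web_sm" model has been
# installed (e.g. via `python -m spacy download en_core_web_sm`); the sample
# sentence and the exact lemmas shown are illustrative and may vary slightly
# across spaCy model versions.
if __name__ == "__main__":
    sample = "The cats are running quickly through the gardens."
    print(process_text(sample))
    # Expected output along the lines of: "cat run quickly garden"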