File size: 1,097 Bytes
be76a88
 
 
63cab8a
58fe3bc
63cab8a
be76a88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import nltk
from nltk.corpus import stopwords
import spacy
import string
import re


# Module-level setup: the statements below run once, when this module is
# imported, and their results are shared by every process_text() call.

# Fetch the NLTK 'stopwords' corpus so the lookup on the next line can find it.
nltk.download('stopwords')
# English stop words from NLTK; process_text() drops tokens whose text
# appears in this collection.
nltk_stop_words = stopwords.words('english')
# Shared spaCy English pipeline used by process_text() for tokenization and
# lemmas.
# NOTE(review): assumes the "en_core_web_sm" model is already installed in the
# environment — confirm against the deployment setup.
nlp = spacy.load("en_core_web_sm")


# Matches every character that is not an ASCII letter or digit. Compiled once
# at import so the per-token loop in process_text() reuses the same pattern.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9]')


def process_text(text: str) -> str:
    """Normalize *text* into a space-separated string of cleaned lemmas.

    The text is lowercased and run through the module-level spaCy pipeline
    ``nlp``. Tokens whose *text* (not lemma) is in ``nltk_stop_words`` or
    is found in ``string.punctuation`` are dropped. Each remaining token is
    replaced by its lemma with every character outside ``[a-zA-Z0-9]``
    removed, and lemmas left empty by that stripping are discarded.

    Args:
        text: Raw input text.

    Returns:
        The surviving lemmas joined by single spaces; an empty string when
        no token survives the filters.
    """
    # Snapshot the stop words into a set on each call: membership becomes O(1)
    # per token instead of a scan of the whole list, while later changes to
    # the module-level ``nltk_stop_words`` are still honoured.
    stop_words = set(nltk_stop_words)

    doc = nlp(text.lower())

    cleaned_lemmas = (
        _NON_ALNUM_RE.sub('', token.lemma_)
        for token in doc
        # NOTE: ``in string.punctuation`` is a substring test on the whole
        # punctuation string, so it matches single punctuation characters
        # (and adjacent runs such as "()"). Other punctuation tokens pass
        # this check; the regex strip above removes their non-alphanumeric
        # characters and the empty-string filter below discards the rest.
        if token.text not in stop_words
        and token.text not in string.punctuation
    )

    # Stripping can leave a lemma empty (e.g. a symbol-only token); skip those
    # so the joined result has no doubled spaces.
    return " ".join(lemma for lemma in cleaned_lemmas if lemma)