import re
import string

import nltk
import spacy
from nltk.corpus import stopwords

nltk.download('stopwords')

# Get the list of English stop words from NLTK (a set makes membership checks fast)
nltk_stop_words = set(stopwords.words('english'))

# Load the spaCy model for English
nlp = spacy.load("en_core_web_sm")


def process_text(text):
    """
    Process text by:
    1. Lowercasing
    2. Removing punctuation and non-alphanumeric characters
    3. Removing stop words
    4. Lemmatization
    """
    # Step 1: Tokenization & processing with spaCy (lowercase first)
    doc = nlp(text.lower())

    # Step 2: Filter out stop words and punctuation, lemmatize each remaining
    # token, and strip any leftover non-alphanumeric characters
    processed_tokens = [
        re.sub(r'[^a-zA-Z0-9]', '', token.lemma_)
        for token in doc
        if token.text not in nltk_stop_words and token.text not in string.punctuation
    ]

    # Step 3: Drop empty strings left behind by the regex and join into one string
    return " ".join(word for word in processed_tokens if word)
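
# A minimal usage sketch. It assumes the "en_core_web_sm" model has been
# installed (e.g. via `python -m spacy download en_core_web_sm`); the sample
# sentence and the exact lemmas shown are illustrative and may vary slightly
# across spaCy model versions.
if __name__ == "__main__":
    sample = "The cats are running quickly through the gardens."
    print(process_text(sample))
    # Expected output along the lines of: "cat run quickly garden"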