sumesh4C committed
Commit 88e54d1 · verified · 1 Parent(s): 02700b3

Update tasks/text.py

Files changed (1): tasks/text.py (+34 -0)
tasks/text.py CHANGED
@@ -14,6 +14,40 @@ import pickle
 import torch
 import os
 
+import nltk
+from nltk.corpus import stopwords
+import spacy
+
+nltk.download('stopwords')
+# Get the list of English stop words from NLTK
+nltk_stop_words = stopwords.words('english')
+# Load the spaCy model for English
+nlp = spacy.load("en_core_web_sm")
+
+
+def process_text(text):
+    """
+    Process text by:
+    1. Lowercasing
+    2. Removing punctuation and non-alphanumeric characters
+    3. Removing stop words
+    4. Lemmatization
+    """
+    # Step 1: Tokenization & processing with spaCy
+    doc = nlp(text.lower())  # Process the lowercased text with spaCy
+
+    # Step 2: Filter out stop words, non-alphanumeric characters, punctuation, and apply lemmatization
+    processed_tokens = [
+        re.sub(r'[^a-zA-Z0-9]', '', token.lemma_)  # Remove non-alphanumeric characters
+        for token in doc
+        if token.text not in nltk_stop_words and token.text not in string.punctuation
+    ]
+
+    # Optional: filter out empty strings resulting from the regex replacement
+    processed_tokens = " ".join([word for word in processed_tokens if word])
+
+    return processed_tokens
+
 router = APIRouter()
 
 DESCRIPTION = "TF-IDF + RF"
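
Below is a minimal sketch (not part of the commit) of how process_text could be wired into the TF-IDF + RF model that DESCRIPTION refers to. It assumes the re and string modules used inside process_text are imported earlier in tasks/text.py, and the sample texts, labels, and pipeline step names are hypothetical placeholders rather than the repository's actual training code:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from tasks.text import process_text  # assumes the function from this commit is importable

# TfidfVectorizer accepts a `preprocessor` callable that receives each raw document
# and returns a cleaned string, which is exactly what process_text produces.
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(preprocessor=process_text)),
    ("rf", RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Hypothetical sample data for illustration only.
texts = ["The cats are running quickly!", "Dogs sleep all day."]
labels = [0, 1]
pipeline.fit(texts, labels)
print(pipeline.predict(["A cat runs."]))

In the actual service the fitted vectorizer and classifier are presumably loaded from disk (note the import pickle context line in the hunk header); the sketch only illustrates where process_text fits in such a pipeline.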