submission-template

Sleeping

sumesh4C committed on Jan 27

Commit

02700b3

verified ·

1 Parent(s): 91ad6bb

Update tasks/utils/preprocessing.py

Files changed (1) hide show

tasks/utils/preprocessing.py CHANGED Viewed

@@ -1,41 +1,7 @@
 import pickle
 import re
 import string
-from nltk.corpus import stopwords
-import nltk
-import spacy
 import pandas as pd
-nltk.download('stopwords')
-# Get the list of English stop words from NLTK
-nltk_stop_words = stopwords.words('english')
-# Load the spaCy model for English
-nlp = spacy.load("en_core_web_sm")
-def process_text(text):
-    """
-    Process text by:
-    1. Lowercasing
-    2. Removing punctuation and non-alphanumeric characters
-    3. Removing stop words
-    4. Lemmatization
-    """
-    # Step 1: Tokenization & Processing with spaCy
-    doc = nlp(text.lower())  # Process text with spaCy
-    # Step 2: Filter out stop words, non-alphanumeric characters, punctuation, and apply lemmatization
-    processed_tokens = [
-        re.sub(r'[^a-zA-Z0-9]', '', token.lemma_)  # Remove non-alphanumeric characters
-        for token in doc
-        if token.text not in nltk_stop_words and token.text not in string.punctuation
-    ]
-    # Optional: Filter out empty strings resulting from the regex replacement
-    processed_tokens = " ".join([word for word in processed_tokens if word])
-    return processed_tokens
 def predict(input_df: pd.DataFrame, tfidf_path: str, model_path: str):
     """

 import pickle
 import re
 import string
 import pandas as pd
 def predict(input_df: pd.DataFrame, tfidf_path: str, model_path: str):
     """