submission-template

Sleeping

sumesh4C committed on Jan 27

Commit

2fa8fbb

verified ·

1 Parent(s): ca68325

Update tasks/text.py

Files changed (1) hide show

tasks/text.py CHANGED Viewed

@@ -7,48 +7,15 @@ import random
 from .utils.evaluation import TextEvaluationRequest
 from .utils.emissions import tracker, clean_emissions_data, get_space_info
 from .utils.predict import predict
 #packages needed for inference
 import pickle
 import torch
 import os
-import nltk
-from nltk.corpus import stopwords
-import spacy
-nltk.download('stopwords')
-# Get the list of English stop words from NLTK
-nltk_stop_words = stopwords.words('english')
-# Load the spaCy model for English
-nlp = spacy.load("en_core_web_sm")
-def process_text(text):
-    """
-    Process text by:
-    1. Lowercasing
-    2. Removing punctuation and non-alphanumeric characters
-    3. Removing stop words
-    4. Lemmatization
-    """
-    # Step 1: Tokenization & Processing with spaCy
-    doc = nlp(text.lower())  # Process text with spaCy
-    # Step 2: Filter out stop words, non-alphanumeric characters, punctuation, and apply lemmatization
-    processed_tokens = [
-        re.sub(r'[^a-zA-Z0-9]', '', token.lemma_)  # Remove non-alphanumeric characters
-        for token in doc
-        if token.text not in nltk_stop_words and token.text not in string.punctuation
-    ]
-    # Optional: Filter out empty strings resulting from the regex replacement
-    processed_tokens = " ".join([word for word in processed_tokens if word])
-    return processed_tokens
 router = APIRouter()
 DESCRIPTION = "TF-IDF + RF"

 from .utils.evaluation import TextEvaluationRequest
 from .utils.emissions import tracker, clean_emissions_data, get_space_info
 from .utils.predict import predict
+from .utils.preprocessing import process_text
+print(process_text("I am better"))
 #packages needed for inference
 import pickle
 import torch
 import os
 router = APIRouter()
 DESCRIPTION = "TF-IDF + RF"