sumesh4C committed
Commit 88e54d1 · verified · 1 Parent(s): 02700b3

Update tasks/text.py

Files changed (1): tasks/text.py (+34 -0)
tasks/text.py CHANGED
@@ -14,6 +14,40 @@ import pickle
 import torch
 import os
 
+import nltk
+from nltk.corpus import stopwords
+import spacy
+
+nltk.download('stopwords')
+# Get the list of English stop words from NLTK
+nltk_stop_words = stopwords.words('english')
+# Load the spaCy model for English
+nlp = spacy.load("en_core_web_sm")
+
+
+def process_text(text):
+    """
+    Process text by:
+    1. Lowercasing
+    2. Removing punctuation and non-alphanumeric characters
+    3. Removing stop words
+    4. Lemmatization
+    """
+    # Step 1: Tokenization & processing with spaCy
+    doc = nlp(text.lower())  # Process the lowercased text with spaCy
+
+    # Step 2: Filter out stop words, non-alphanumeric characters, punctuation, and apply lemmatization
+    processed_tokens = [
+        re.sub(r'[^a-zA-Z0-9]', '', token.lemma_)  # Remove non-alphanumeric characters
+        for token in doc
+        if token.text not in nltk_stop_words and token.text not in string.punctuation
+    ]
+
+    # Optional: filter out empty strings resulting from the regex replacement
+    processed_tokens = " ".join([word for word in processed_tokens if word])
+
+    return processed_tokens
+
 router = APIRouter()
 
 DESCRIPTION = "TF-IDF + RF"
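
Below is a minimal sketch (not part of the commit) of how process_text could be wired into the TF-IDF + RF model that DESCRIPTION refers to. It assumes the re and string modules used inside process_text are imported earlier in tasks/text.py, and the sample texts, labels, and pipeline step names are hypothetical placeholders rather than the repository's actual training code:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from tasks.text import process_text  # assumes the function from this commit is importable

# TfidfVectorizer accepts a `preprocessor` callable that receives each raw document
# and returns a cleaned string, which is exactly what process_text produces.
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(preprocessor=process_text)),
    ("rf", RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Hypothetical sample data for illustration only.
texts = ["The cats are running quickly!", "Dogs sleep all day."]
labels = [0, 1]
pipeline.fit(texts, labels)
print(pipeline.predict(["A cat runs."]))

In the actual service the fitted vectorizer and classifier are presumably loaded from disk (note the import pickle context line in the hunk header); the sketch only illustrates where process_text fits in such a pipeline.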