sumesh4C committed on
Commit
3c7c0c1
·
verified ·
1 Parent(s): 59d3c8a

Update tasks/text.py

Browse files
Files changed (1) hide show
  1. tasks/text.py +0 -34
tasks/text.py CHANGED
@@ -14,40 +14,6 @@ import pickle
14
  import torch
15
  import os
16
 
17
- import nltk
18
- from nltk.corpus import stopwords
19
- import spacy
20
-
21
- nltk.download('stopwords')
22
- # Get the list of English stop words from NLTK
23
- nltk_stop_words = stopwords.words('english')
24
- # Load the spaCy model for English
25
- nlp = spacy.load("en_core_web_sm")
26
-
27
-
28
- def process_text(text):
29
- """
30
- Process text by:
31
- 1. Lowercasing
32
- 2. Removing punctuation and non-alphanumeric characters
33
- 3. Removing stop words
34
- 4. Lemmatization
35
- """
36
- # Step 1: Tokenization & Processing with spaCy
37
- doc = nlp(text.lower()) # Process text with spaCy
38
-
39
- # Step 2: Filter out stop words, non-alphanumeric characters, punctuation, and apply lemmatization
40
- processed_tokens = [
41
- re.sub(r'[^a-zA-Z0-9]', '', token.lemma_) # Remove non-alphanumeric characters
42
- for token in doc
43
- if token.text not in nltk_stop_words and token.text not in string.punctuation
44
- ]
45
-
46
- # Optional: Filter out empty strings resulting from the regex replacement
47
- processed_tokens = " ".join([word for word in processed_tokens if word])
48
-
49
- return processed_tokens
50
-
51
  router = APIRouter()
52
 
53
  DESCRIPTION = "TF-IDF + RF"
 
14
  import torch
15
  import os
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  router = APIRouter()
18
 
19
  DESCRIPTION = "TF-IDF + RF"