sumesh4C committed on
Commit
2fa8fbb
·
verified ·
1 Parent(s): ca68325

Update tasks/text.py

Browse files
Files changed (1) hide show
  1. tasks/text.py +2 -35
tasks/text.py CHANGED
@@ -7,48 +7,15 @@ import random
7
  from .utils.evaluation import TextEvaluationRequest
8
  from .utils.emissions import tracker, clean_emissions_data, get_space_info
9
  from .utils.predict import predict
 
10
 
11
 
12
-
13
  #packages needed for inference
14
  import pickle
15
  import torch
16
  import os
17
 
18
- import nltk
19
- from nltk.corpus import stopwords
20
- import spacy
21
-
22
- nltk.download('stopwords')
23
- # Get the list of English stop words from NLTK
24
- nltk_stop_words = stopwords.words('english')
25
- # Load the spaCy model for English
26
- nlp = spacy.load("en_core_web_sm")
27
-
28
-
29
- def process_text(text):
30
- """
31
- Process text by:
32
- 1. Lowercasing
33
- 2. Removing punctuation and non-alphanumeric characters
34
- 3. Removing stop words
35
- 4. Lemmatization
36
- """
37
- # Step 1: Tokenization & Processing with spaCy
38
- doc = nlp(text.lower()) # Process text with spaCy
39
-
40
- # Step 2: Filter out stop words, non-alphanumeric characters, punctuation, and apply lemmatization
41
- processed_tokens = [
42
- re.sub(r'[^a-zA-Z0-9]', '', token.lemma_) # Remove non-alphanumeric characters
43
- for token in doc
44
- if token.text not in nltk_stop_words and token.text not in string.punctuation
45
- ]
46
-
47
- # Optional: Filter out empty strings resulting from the regex replacement
48
- processed_tokens = " ".join([word for word in processed_tokens if word])
49
-
50
- return processed_tokens
51
-
52
  router = APIRouter()
53
 
54
  DESCRIPTION = "TF-IDF + RF"
 
7
  from .utils.evaluation import TextEvaluationRequest
8
  from .utils.emissions import tracker, clean_emissions_data, get_space_info
9
  from .utils.predict import predict
10
+ from .utils.preprocessing import process_text
11
 
12
 
13
+ print(process_text("I am better"))
14
  #packages needed for inference
15
  import pickle
16
  import torch
17
  import os
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  router = APIRouter()
20
 
21
  DESCRIPTION = "TF-IDF + RF"