sumesh4C committed on
Commit
02700b3
·
verified ·
1 Parent(s): 91ad6bb

Update tasks/utils/preprocessing.py

Browse files
Files changed (1) hide show
  1. tasks/utils/preprocessing.py +0 -34
tasks/utils/preprocessing.py CHANGED
@@ -1,41 +1,7 @@
1
  import pickle
2
  import re
3
  import string
4
- from nltk.corpus import stopwords
5
- import nltk
6
- import spacy
7
  import pandas as pd
8
-
9
- nltk.download('stopwords')
10
- # Get the list of English stop words from NLTK
11
- nltk_stop_words = stopwords.words('english')
12
- # Load the spaCy model for English
13
- nlp = spacy.load("en_core_web_sm")
14
-
15
-
16
def process_text(text):
    """Normalize *text* for downstream vectorization.

    Steps: lowercase the input, drop NLTK stop words and punctuation
    tokens, lemmatize via the module-level spaCy pipeline ``nlp``, and
    strip non-alphanumeric characters from each lemma. Returns the
    surviving tokens joined into a single space-separated string.

    NOTE(review): depends on the module-level ``nlp`` pipeline and
    ``nltk_stop_words`` list being initialized before this is called.
    """
    # Run the spaCy pipeline once on the lowercased input.
    doc = nlp(text.lower())

    cleaned = []
    for token in doc:
        # Skip stop words and bare punctuation tokens.
        if token.text in nltk_stop_words or token.text in string.punctuation:
            continue
        # Keep only alphanumeric characters of the lemma.
        cleaned.append(re.sub(r'[^a-zA-Z0-9]', '', token.lemma_))

    # Drop tokens emptied by the regex, then join with single spaces.
    return " ".join(word for word in cleaned if word)
38
-
39
 
40
  def predict(input_df: pd.DataFrame, tfidf_path: str, model_path: str):
41
  """
 
1
  import pickle
2
  import re
3
  import string
 
 
 
4
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  def predict(input_df: pd.DataFrame, tfidf_path: str, model_path: str):
7
  """