sumesh4C committed on
Commit
be76a88
·
verified ·
1 Parent(s): 3a8787c

Create preprocessing.py

Browse files
Files changed (1) hide show
  1. tasks/utils/preprocessing.py +33 -0
tasks/utils/preprocessing.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Standard library (re/string are used by process_text below; the original
# file referenced them without importing them, which raised NameError).
import re
import string

# Third-party
import nltk
import spacy
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus (no-op if it is already downloaded).
nltk.download('stopwords')

# Get the list of English stop words from NLTK
nltk_stop_words = stopwords.words('english')

# Load the spaCy model for English (tokenization + lemmatization pipeline).
nlp = spacy.load("en_core_web_sm")
12
def process_text(text):
    """
    Process text by:
    1. Lowercasing
    2. Removing punctuation and non-alphanumeric characters
    3. Removing stop words
    4. Lemmatization

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        Space-joined string of the cleaned, lemmatized tokens.
    """
    # Function-local imports: the original file used `re` and `string`
    # without importing them anywhere, so calling this raised NameError.
    # Importing here keeps the fix self-contained.
    import re
    import string

    # Step 1: Tokenization & Processing with spaCy
    doc = nlp(text.lower())  # Process text with spaCy

    # Step 2: Filter out stop words, single-char punctuation tokens,
    # lemmatize, and strip non-alphanumeric characters from each lemma.
    # NOTE(review): `token.text not in string.punctuation` only catches
    # one-character punctuation tokens (membership in a str is per-char);
    # multi-char tokens like "..." pass through — preserved as-is here.
    processed_tokens = [
        re.sub(r'[^a-zA-Z0-9]', '', token.lemma_)
        for token in doc
        if token.text not in nltk_stop_words and token.text not in string.punctuation
    ]

    # Filter out empty strings left behind by the regex replacement.
    return " ".join(word for word in processed_tokens if word)