sumesh4C committed on
Commit
cf772c6
·
verified ·
1 Parent(s): 04554fe

Create preprocessing.py

Browse files
Files changed (1) hide show
  1. tasks/utils/preprocessing.py +67 -0
tasks/utils/preprocessing.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pickle
import re
import string

import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
7
+
8
# English stop-word list from NLTK, built once at import time; process_text
# checks each token's text against it.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# available locally — confirm for the deployment environment.
nltk_stop_words = stopwords.words('english')

# spaCy English pipeline, loaded once at import time and reused by
# process_text for tokenization and lemmatization.
nlp = spacy.load("en_core_web_sm")
13
def process_text(text):
    """
    Normalize a piece of text into a space-separated string of lemmas.

    The text is lowercased and run through the module-level spaCy pipeline.
    Tokens whose text is an NLTK stop word or is found in
    ``string.punctuation`` are dropped. Each remaining token is replaced by
    its lemma with every non-alphanumeric character stripped; lemmas that
    become empty after stripping are discarded.

    Parameters:
    text (str): Raw input text.

    Returns:
    str: The surviving cleaned lemmas joined by single spaces.
    """
    parsed = nlp(text.lower())

    kept = []
    for tok in parsed:
        surface = tok.text
        # Skip stop words and punctuation before doing any regex work.
        if surface in nltk_stop_words or surface in string.punctuation:
            continue

        # Strip anything that is not a letter or digit from the lemma.
        cleaned = re.sub(r'[^a-zA-Z0-9]', '', tok.lemma_)
        if cleaned:
            kept.append(cleaned)

    return " ".join(kept)
35
+
36
+
37
def predict(input_df: pd.DataFrame, tfidf_path: str, model_path: str, text_column: str = "quote"):
    """
    Predict outputs for a text column using a pickled vectorizer and model.

    Parameters:
    input_df (pd.DataFrame): Table containing the text data. An object that
        exposes a ``to_pandas()`` method is also accepted and is converted
        first (presumably a Hugging Face ``Dataset`` — confirm with callers).
    tfidf_path (str): Path to the pickled vectorizer; the loaded object must
        provide ``transform``.
    model_path (str): Path to the pickled model; the loaded object must
        provide ``predict``.
    text_column (str): Name of the column holding the text. Defaults to
        "quote".

    Returns:
    Whatever ``model.predict`` returns for the transformed text.

    Raises:
    OSError: If either pickle file cannot be opened.
    KeyError: Raised by pandas when ``text_column`` is not a column of the
        input table.
    """
    # NOTE(security): pickle.load can execute arbitrary code. Only pass paths
    # to artifacts produced by a trusted training pipeline.
    with open(tfidf_path, "rb") as tfidf_file:
        tfidf_vectorizer = pickle.load(tfidf_file)

    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)

    # Accept both a plain DataFrame and dataset-like objects that need an
    # explicit conversion; a DataFrame itself has no to_pandas() method.
    to_pandas = getattr(input_df, "to_pandas", None)
    frame = to_pandas() if callable(to_pandas) else input_df

    # Honor the caller-supplied column instead of a hard-coded "quote".
    text_data = frame[text_column]
    text_features = tfidf_vectorizer.transform(text_data)

    return model.predict(text_features)
66
+
67
+