kantundpeterpan committed on
Commit
cb4572b
·
verified ·
1 Parent(s): 65adc06

Update tasks/text.py

Browse files
Files changed (1) hide show
  1. tasks/text.py +16 -0
tasks/text.py CHANGED
@@ -12,6 +12,22 @@ import joblib
12
  REPO_ID = "kantundpeterpan/frugal-ai-toy"
13
  FILENAME = "tfidf_rf.pkl"
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def lemmatize_X(X):
16
  return X.quote.apply(tokenize_quote).apply(lemmatize_tokens).apply(lambda x: " ".join(x))
17
 
 
12
  REPO_ID = "kantundpeterpan/frugal-ai-toy"
13
  FILENAME = "tfidf_rf.pkl"
14
 
15
+ import nltk
16
+ from nltk.tokenize import WordPunctTokenizer
17
+ from nltk.stem import WordNetLemmatizer
18
+ from nltk.corpus import stopwords
19
+ import string
20
+
21
# Tokens dropped by tokenize_quote: NLTK's English stop words plus every
# ASCII punctuation character.
stop = {*stopwords.words('english'), *string.punctuation}
22
+
23
def tokenize_quote(r):
    """Lower-case *r*, tokenize it, and drop stop words and punctuation.

    Args:
        r: The quote text to tokenize (must support ``.lower()``).

    Returns:
        A list of the tokens produced by ``nltk.word_tokenize`` that are not
        members of the module-level ``stop`` set, in their original order.
    """
    return [tok for tok in nltk.word_tokenize(r.lower()) if tok not in stop]
27
+
28
# Shared lemmatizer instance used by lemmatize_tokens. The visible code only
# imports WordNetLemmatizer and never binds the name ``lemmatizer``, so the
# function would raise NameError when called; bind it here at import time.
lemmatizer = WordNetLemmatizer()


def lemmatize_tokens(tokens: list):
    """Return the lemma of every token in *tokens*, preserving order.

    Args:
        tokens: Token strings, e.g. the output of ``tokenize_quote``.

    Returns:
        A new list holding ``lemmatizer.lemmatize(t)`` for each token ``t``.
    """
    return [lemmatizer.lemmatize(t) for t in tokens]
30
+
31
  def lemmatize_X(X):
32
  return X.quote.apply(tokenize_quote).apply(lemmatize_tokens).apply(lambda x: " ".join(x))
33