Update clean_data.py
clean_data.py CHANGED (+2 -0)
@@ -37,6 +37,7 @@ def cleaned_complaints(text):
 import nltk
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
+from nltk.corpus import words
 import warnings
 import re
 
@@ -76,6 +77,7 @@ def cleaned_complaints(text):
     letters_only = re.sub("[^a-zA-Z]", " ", newString) #Fetching out only letters
     lower_case = letters_only.lower() #converting all words to lowercase
     tokens = [w for w in lower_case.split() if not w in stop_words]#stopwords removal
+    tokens = [x for x in tokens if x in words.words()]
     # tokens= lower_case.split()
     newString=''
     for i in tokens:
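The added line keeps only tokens that appear in NLTK's words corpus, i.e. English dictionary words, on top of the existing stopword removal. A minimal sketch of that filter in isolation, assuming Python/NLTK as in the file; the nltk.download call and the set() conversion are assumptions for a self-contained and faster run, not part of the commit (which checks membership against the raw list returned by words.words()):

import nltk
from nltk.corpus import words

# Assumption: fetch the 'words' corpus if it is not already installed.
nltk.download("words", quiet=True)

# words.words() returns a plain list, so the committed check rescans it for
# every token; building a set once (an assumption, not in the commit) keeps
# the same membership result while making each lookup O(1).
english_vocab = set(words.words())

tokens = ["refund", "account", "xxxx"]              # already lowercased upstream
tokens = [t for t in tokens if t in english_vocab]  # drops non-dictionary tokens such as "xxxx"
print(tokens)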