Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
2 |
+
from TurkishStemmer import TurkishStemmer
|
3 |
+
import nltk
|
4 |
+
import string
|
5 |
+
# pickle is used to load the serialized Python objects (the scikit-learn model and vectorizer)
|
6 |
+
import pickle
|
7 |
+
|
8 |
+
def custom_tokenizer_with_Turkish_stemmer(text):
    """Tokenize *text* and reduce each token to its lower-cased Turkish stem.

    Processing steps:

    1. ``text.translate(trans_table)`` removes every character that the
       module-level ``trans_table`` maps to ``None``.
    2. ``nltk.word_tokenize`` splits the cleaned text into tokens.
    3. Each token is lower-cased and stemmed with the module-level
       ``stemmerTR`` instance.

    Returns the list of stems, in token order.
    """
    cleaned = text.translate(trans_table)
    return [stemmerTR.stem(token.lower()) for token in nltk.word_tokenize(cleaned)]
|
13 |
+
|
14 |
+
def _load_pickle(path):
    """Return the object unpickled from the binary file at *path*."""
    # SECURITY NOTE: unpickling can execute arbitrary code. Only load pickle
    # files that ship with this application and come from a trusted source.
    # The context manager guarantees the file is closed even if loading fails.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def predictSMSdata(test_text):
    """Classify one SMS text, print the predicted category and return it.

    The classifier is loaded from ``LinearSVC_SMS_spam_TR.pickle`` and the
    TF-IDF vectorizer from ``tfidf_vectorizer_TR.pickle``; both paths are
    relative to the current working directory and are re-read on every call.

    Args:
        test_text: The message to classify.

    Returns:
        The predicted category name, ``"legitimate"`` or ``"spam"``. The same
        value is also printed to stdout, as before.

    Raises:
        OSError: If either pickle file cannot be opened.
    """
    # Alphabetical order, so index 0 is "legitimate" and index 1 is "spam".
    # NOTE(review): this assumes the classifier predicts integer class
    # indices that follow this ordering - confirm against the training code.
    categories = sorted(["legitimate", "spam"])

    classifier = _load_pickle("LinearSVC_SMS_spam_TR.pickle")
    tfidf_vectorizer = _load_pickle("tfidf_vectorizer_TR.pickle")

    # The vectorizer is given a one-element list holding the single message.
    features = tfidf_vectorizer.transform([test_text])
    predicted = classifier.predict(features)

    label = categories[predicted[0]]
    print(label)
    return label
|
34 |
+
|
35 |
+
# Deletion table for str.translate: every ASCII punctuation character and
# digit is mapped to None, so translate() removes it from the text.
trans_table = str.maketrans("", "", string.punctuation + string.digits)
# Shared stemmer instance used by custom_tokenizer_with_Turkish_stemmer.
stemmerTR = TurkishStemmer()

# Extra test data from the training set; each message is classified in turn
# and its predicted category is printed.
for _sample_sms in (
    # legitimate - l0430.txt
    "Ahmet de gelecek mi?",
    # legitimate - l0429.txt
    "Vakifbank WebSubem girisi icin tek kullanimlik sifreniz: 160038 . Sifreniz 3 dk gecerlidir. Tarih: 14.02.2011 Saat: 13:53",
    # spam - s0003.txt
    "Aveadan SUPER bir Muzik Paketi! MAXI yaz, 5555e gonder, Maxi Muzikindir Paketi ile 150 yerli 50 Yabanci sarkiyi ayda sadece 5,99 TLye cebine indir!",
    # spam - s0359.txt
    "1-2 NISAN TARIHLERINDE;DERMALOGICA CILT BAKIMINA DAVETLISINIZ.RANDEVU ALINIZ TEL:2312840",
):
    predictSMSdata(_sample_sms)
|