akuysal committed on
Commit
4b10007
·
1 Parent(s): 45d23ce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -0
app.py CHANGED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.feature_extraction.text import TfidfVectorizer
2
+ from TurkishStemmer import TurkishStemmer
3
+ import nltk
4
+ import string
5
+ # import for loading python objects (scikit-learn models)
6
+ import pickle
7
+
8
def custom_tokenizer_with_Turkish_stemmer(text):
    """Tokenize *text* and return the Turkish stem of every token.

    The text is first passed through ``str.translate`` with the module-level
    ``trans_table`` (characters mapped to ``None`` are deleted), then split
    with ``nltk.word_tokenize``. Each token is lower-cased and stemmed with
    the module-level ``stemmerTR`` instance.

    Returns:
        A list with one stem per token, in token order.

    NOTE(review): this function is not called directly in this file;
    presumably the pickled TF-IDF vectorizer refers to it by name, so it
    likely has to stay importable under this exact name -- confirm before
    renaming or moving it.
    """
    cleaned = text.translate(trans_table)
    stems = []
    for token in nltk.word_tokenize(cleaned):
        stems.append(stemmerTR.stem(token.lower()))
    return stems
13
+
14
def predictSMSdata(test_text):
    """Classify a single SMS text and print (and return) its category.

    The classifier and the TF-IDF vectorizer are unpickled from the current
    working directory on every call, the text is vectorized as a one-element
    batch, and the predicted class index is mapped to a category name.

    Args:
        test_text: The SMS message to classify.

    Returns:
        The predicted category name, ``"legitimate"`` or ``"spam"``. The
        same value is also printed, as the original implementation did.

    Raises:
        FileNotFoundError: If either pickle file is missing.
    """
    # Sorted so that the list index lines up with the classifier's integer
    # class ids (assumes the labels were encoded in alphabetical order at
    # training time -- TODO confirm).
    categories = sorted(["legitimate", "spam"])

    # SECURITY: pickle.load can execute arbitrary code. Only ship model files
    # that come from a trusted source alongside this script.
    # `with` guarantees the handles are closed even if unpickling fails.
    with open("LinearSVC_SMS_spam_TR.pickle", "rb") as model_file:
        classifier = pickle.load(model_file)

    # The vectorizer must be the one fitted together with the classifier so
    # that the feature space matches.
    with open("tfidf_vectorizer_TR.pickle", "rb") as vectorizer_file:
        tfidf_vectorizer = pickle.load(vectorizer_file)

    # transform() is given a one-element batch holding the single message.
    features = tfidf_vectorizer.transform([test_text])
    predicted = classifier.predict(features)

    label = categories[predicted[0]]
    print(label)
    return label
34
+
35
# Deletion table for str.translate: every ASCII punctuation character and
# digit is mapped to None, i.e. removed from the text.
trans_table = str.maketrans("", "", string.punctuation + string.digits)

# Single stemmer instance shared by the tokenizer.
stemmerTR = TurkishStemmer()
37
+
38
# Extra test data from the training set, as (expected label - source file,
# message) pairs. Each message is classified in order and its predicted
# category is printed by predictSMSdata.
_SAMPLE_MESSAGES = (
    ("legitimate - l0430.txt", "Ahmet de gelecek mi?"),
    (
        "legitimate - l0429.txt",
        "Vakifbank WebSubem girisi icin tek kullanimlik sifreniz: 160038 . Sifreniz 3 dk gecerlidir. Tarih: 14.02.2011 Saat: 13:53",
    ),
    (
        "spam - s0003.txt",
        "Aveadan SUPER bir Muzik Paketi! MAXI yaz, 5555e gonder, Maxi Muzikindir Paketi ile 150 yerli 50 Yabanci sarkiyi ayda sadece 5,99 TLye cebine indir!",
    ),
    (
        "spam - s0359.txt",
        "1-2 NISAN TARIHLERINDE;DERMALOGICA CILT BAKIMINA DAVETLISINIZ.RANDEVU ALINIZ TEL:2312840",
    ),
)

for _origin, _message in _SAMPLE_MESSAGES:
    predictSMSdata(_message)