akuysal committed on
Commit
8678eea
·
1 Parent(s): 1247363

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -8
app.py CHANGED
@@ -1,22 +1,21 @@
1
  from sklearn.feature_extraction.text import TfidfVectorizer
2
- from TurkishStemmer import TurkishStemmer
3
  import string
4
  # import for loading python objects (scikit-learn models)
5
  import pickle
6
  import nltk
7
  from nltk.data import load
 
8
  import streamlit as st
9
  import sklearn
10
 
11
  nltk.download('punkt')
12
- trans_table = {ord(c): None for c in string.punctuation + string.digits}
13
 
14
  def custom_tokenizer_with_Turkish_stemmer(text):
15
  # tokenize text
16
  # tokens = text.split(" ")
17
- tokens = [word for word in nltk.word_tokenize(text.translate(trans_table))]
18
  print(tokens)
19
- stems = [stemmerTR.stem(item.lower()) for item in tokens]
20
  return stems
21
 
22
  def predictSMSdata(test_text):
@@ -24,13 +23,13 @@ def predictSMSdata(test_text):
24
  categories.sort()
25
 
26
  # load model
27
- filename1 = "LinearSVC_SMS_spam_TR.pickle"
28
  file_handle1 = open(filename1, "rb")
29
  classifier = pickle.load(file_handle1)
30
  file_handle1.close()
31
 
32
  # load tfidf_vectorizer for transforming test text data
33
- filename2 = "tfidf_vectorizer_TR.pickle"
34
  file_handle2 = open(filename2, "rb")
35
  tfidf_vectorizer = pickle.load(file_handle2)
36
  file_handle2.close()
@@ -41,10 +40,11 @@ def predictSMSdata(test_text):
41
  print(categories[predicted[0]])
42
  return categories[predicted[0]]
43
 
44
- stemmerTR = TurkishStemmer()
 
45
 
46
  # adding the text that will show in the text box
47
- default_value = "Aveadan SUPER bir Muzik Paketi! MAXI yaz, 5555e gonder"
48
  text = st.text_area("enter some text!", default_value)
49
  if text:
50
  out = predictSMSdata(text)
 
1
  from sklearn.feature_extraction.text import TfidfVectorizer
 
2
  import string
3
  # import for loading python objects (scikit-learn models)
4
  import pickle
5
  import nltk
6
  from nltk.data import load
7
+ from nltk.stem import PorterStemmer
8
  import streamlit as st
9
  import sklearn
10
 
11
  nltk.download('punkt')
 
12
 
13
  def custom_tokenizer_with_Turkish_stemmer(text):
14
  # tokenize text
15
  # tokens = text.split(" ")
16
+ tokens = [word for word in nltk.word_tokenize(text)]
17
  print(tokens)
18
+ stems = [stemmerEN.stem(item.lower()) for item in tokens]
19
  return stems
20
 
21
  def predictSMSdata(test_text):
 
23
  categories.sort()
24
 
25
  # load model
26
+ filename1 = "LinearSVC_SMS_spam_EN.pickle"
27
  file_handle1 = open(filename1, "rb")
28
  classifier = pickle.load(file_handle1)
29
  file_handle1.close()
30
 
31
  # load tfidf_vectorizer for transforming test text data
32
+ filename2 = "tfidf_vectorizer_EN.pickle"
33
  file_handle2 = open(filename2, "rb")
34
  tfidf_vectorizer = pickle.load(file_handle2)
35
  file_handle2.close()
 
40
  print(categories[predicted[0]])
41
  return categories[predicted[0]]
42
 
43
+ # Porter Stemmer for English
44
+ stemmerEN = PorterStemmer()
45
 
46
  # adding the text that will show in the text box
47
+ default_value = "ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE MINS. INDIA CUST SERVs SED YES. L8ER GOT MEGA BILL. 3 DONT GIV A SHIT. BAILIFF DUE IN DAYS. I O £250 3 WANT £800"
48
  text = st.text_area("enter some text!", default_value)
49
  if text:
50
  out = predictSMSdata(text)