add tokenize function
Browse files
    	
        app.py
    CHANGED
    
    | @@ -16,98 +16,56 @@ from underthesea import word_tokenize | |
| 16 |  | 
| 17 | 
             
            from phoBERT import BERT_predict
         | 
| 18 |  | 
| 19 | 
            -
            # Load tokenizer
         | 
| 20 | 
            -
            # fp = Path(__file__).with_name('tokenizer.pkl')
         | 
| 21 | 
            -
            # with open(fp,mode="rb") as f:
         | 
| 22 | 
            -
            #     tokenizer = pickle.load(f)
         | 
| 23 |  | 
| 24 | 
            -
            #Load LSTM
         | 
| 25 | 
            -
            #fp = Path(__file__).with_name('lstm_model.h5')
         | 
| 26 | 
             
            LSTM_model = tf.keras.models.load_model('lstm_model.tf')
         | 
| 27 |  | 
| 28 | 
            -
            #Load GRU
         | 
| 29 | 
            -
            #fp = Path(__file__).with_name('gru_model.h5')
         | 
| 30 | 
             
            GRU_model = tf.keras.models.load_model('gru_model.tf')
         | 
| 31 |  | 
| 32 | 
            -
             | 
| 33 | 
            -
            def tokenizer_pad(tokenizer,comment_text,max_length=200):
         | 
| 34 | 
            -
               
         | 
| 35 | 
            -
                comment_text = word_tokenize(comment_text, format="text")
         | 
| 36 | 
            -
                comment_text = [comment_text]
         | 
| 37 | 
            -
                tokenized_text = tokenizer.texts_to_sequences(comment_text)
         | 
| 38 | 
            -
             | 
| 39 | 
            -
                padded_sequences = pad_sequences(sequences=tokenized_text,maxlen=max_length,padding="post",truncating="post")
         | 
| 40 | 
            -
             | 
| 41 | 
            -
                return padded_sequences
         | 
| 42 | 
            -
             | 
| 43 | 
             
            def LSTM_predict(x):
         | 
| 44 | 
            -
                # x = tokenizer_pad(tokenizer=tokenizer,comment_text=x)
         | 
| 45 | 
            -
             | 
| 46 |  | 
| 47 | 
             
                pred_proba = LSTM_model.predict([x])[0]
         | 
| 48 |  | 
| 49 | 
             
                pred_proba = [round(i,2) for i in pred_proba]
         | 
| 50 |  | 
| 51 | 
            -
                #print(pred_proba)
         | 
| 52 | 
            -
             | 
| 53 | 
             
                return pred_proba
         | 
| 54 |  | 
| 55 | 
             
            def GRU_predict(x):
         | 
| 56 | 
            -
                # x = tokenizer_pad(tokenizer=tokenizer,comment_text=x)
         | 
| 57 | 
            -
             | 
| 58 |  | 
| 59 | 
             
                pred_proba = GRU_model.predict([x])[0]
         | 
| 60 |  | 
| 61 | 
             
                pred_proba = [round(i,2) for i in pred_proba]
         | 
| 62 |  | 
| 63 | 
            -
                #print(pred_proba)
         | 
| 64 | 
            -
             | 
| 65 | 
             
                return pred_proba
         | 
| 66 |  | 
| 67 | 
            -
            def  | 
| 68 | 
            -
               | 
| 69 | 
            -
               | 
| 70 | 
            -
               | 
| 71 | 
            -
              data['Điểm'] = result
         | 
| 72 | 
            -
             | 
| 73 | 
            -
              #print(data)
         | 
| 74 |  | 
| 75 | 
            -
              p = px.bar(data, x='Nhãn', y='Điểm', color='Nhãn', range_y=[0, 1] )
         | 
| 76 | 
            -
              return p
         | 
| 77 | 
            -
              pass
         | 
| 78 |  | 
| 79 | 
             
            def judge(x):
         | 
| 80 |  | 
| 81 | 
            -
              label = ['độc hại', 'cực kì độc hại', 'tục tĩu', 'đe dọa', 'xúc phạm', 'thù ghét cá nhân']
         | 
| 82 | 
             
              result = []
         | 
| 83 | 
            -
              judge_result = []
         | 
| 84 |  | 
| 85 | 
            -
              x =  | 
| 86 | 
            -
              x = word_tokenize(x, format="text")
         | 
| 87 |  | 
| 88 | 
             
              lstm_pred = LSTM_predict(x)
         | 
| 89 | 
             
              gru_pred = GRU_predict(x)
         | 
| 90 | 
            -
              #bert_pred = BERT_predict(x)
         | 
| 91 | 
            -
              #print(result)
         | 
| 92 |  | 
| 93 | 
            -
              return_result = 'Result'
         | 
| 94 | 
             
              result_lstm = np.round(lstm_pred, 2)
         | 
| 95 | 
             
              result_gru = np.round(gru_pred, 2)
         | 
| 96 | 
            -
              #result_bert = np.round(bert_pred, 2)
         | 
| 97 |  | 
| 98 | 
             
              for i in range(6):
         | 
| 99 | 
             
                result.append((result_lstm[i]+result_gru[i])/2)
         | 
| 100 |  | 
| 101 | 
             
              return (result)
         | 
| 102 |  | 
|  | |
| 103 | 
             
            def judgePlus(x):
         | 
| 104 |  | 
| 105 | 
            -
              label = ['độc hại', 'cực kì độc hại', 'tục tĩu', 'đe dọa', 'xúc phạm', 'thù ghét cá nhân']
         | 
| 106 | 
             
              result = []
         | 
| 107 | 
            -
              judge_result = []
         | 
| 108 |  | 
| 109 | 
            -
              x =  | 
| 110 | 
            -
              x = word_tokenize(x, format="text")
         | 
| 111 |  | 
| 112 | 
             
              lstm_pred = LSTM_predict(x)
         | 
| 113 | 
             
              gru_pred = GRU_predict(x)
         | 
| @@ -117,11 +75,10 @@ def judgePlus(x): | |
| 117 | 
             
                bert_pred = np.average([lstm_pred, gru_pred], axis=0)
         | 
| 118 |  | 
| 119 |  | 
| 120 | 
            -
              return_result = 'Result'
         | 
| 121 | 
             
              result_lstm = np.round(lstm_pred, 2)
         | 
| 122 | 
             
              result_gru = np.round(gru_pred, 2)
         | 
| 123 | 
             
              result_bert = np.round(bert_pred, 2)
         | 
| 124 | 
            -
             | 
| 125 | 
             
              if((result_lstm[0]+result_gru[0])<(result_bert[0]*2)):
         | 
| 126 | 
             
                for i in range(6):
         | 
| 127 | 
             
                  result.append((result_bert[i])/1)
         | 
| @@ -131,26 +88,19 @@ def judgePlus(x): | |
| 131 |  | 
| 132 | 
             
              return (result)
         | 
| 133 |  | 
|  | |
| 134 | 
             
            def judgeBert(x):
         | 
| 135 |  | 
| 136 | 
            -
              label = ['độc hại', 'cực kì độc hại', 'tục tĩu', 'đe dọa', 'xúc phạm', 'thù ghét cá nhân']
         | 
| 137 | 
             
              result = []
         | 
| 138 | 
            -
              judge_result = []
         | 
| 139 |  | 
| 140 | 
            -
              x =  | 
| 141 | 
            -
              x = word_tokenize(x, format="text")
         | 
| 142 |  | 
| 143 | 
            -
              
         | 
| 144 | 
             
              try:
         | 
| 145 | 
             
                bert_pred = BERT_predict(x)
         | 
| 146 | 
             
              except:
         | 
| 147 | 
             
                bert_pred = np.zeros(6, dtype=float)
         | 
| 148 |  | 
| 149 | 
            -
              
         | 
| 150 | 
            -
              return_result = 'Result'
         | 
| 151 | 
            -
              
         | 
| 152 | 
             
              result_bert = np.round(bert_pred, 2)
         | 
| 153 | 
            -
              #result_bert = np.round(bert_pred, 2)
         | 
| 154 |  | 
| 155 | 
             
              for i in range(6):
         | 
| 156 | 
             
                result.append((result_bert[i])/1)
         | 
|  | |
| 16 |  | 
| 17 | 
             
            from phoBERT import BERT_predict
         | 
| 18 |  | 
|  | |
|  | |
|  | |
|  | |
| 19 |  | 
|  | |
|  | |
| 20 | 
             
            LSTM_model = tf.keras.models.load_model('lstm_model.tf')
         | 
| 21 |  | 
|  | |
|  | |
| 22 | 
             
            GRU_model = tf.keras.models.load_model('gru_model.tf')
         | 
| 23 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 24 | 
             
            def LSTM_predict(x):
                """Score one input with the module-level ``LSTM_model``.

                The input is wrapped in a one-element list before being handed to
                ``LSTM_model.predict``; only the first row of the output is kept.

                Args:
                    x: The input forwarded to the model (callers in this file pass
                        the already tokenized text).

                Returns:
                    A list holding every value of the first prediction row,
                    rounded to two decimal places.
                """
                first_row = LSTM_model.predict([x])[0]
                return [round(score, 2) for score in first_row]
         | 
| 31 |  | 
| 32 | 
             
            def GRU_predict(x):
                """Score one input with the module-level ``GRU_model``.

                The input is wrapped in a one-element list before being handed to
                ``GRU_model.predict``; only the first row of the output is kept.

                Args:
                    x: The input forwarded to the model (callers in this file pass
                        the already tokenized text).

                Returns:
                    A list holding every value of the first prediction row,
                    rounded to two decimal places.
                """
                first_row = GRU_model.predict([x])[0]
                return [round(score, 2) for score in first_row]
         | 
| 39 |  | 
| 40 | 
            +
            def tokenize(x):
                """Normalize a text to NFKC and word-segment it.

                Args:
                    x: The raw text to prepare.

                Returns:
                    Whatever ``word_tokenize`` produces for the normalized text
                    when called with ``format="text"``.
                """
                # NOTE(review): `ud` is not imported in the visible part of the file;
                # presumably `import unicodedata as ud` -- confirm.
                normalized = ud.normalize('NFKC', x)
                return word_tokenize(normalized, format="text")
         | 
|  | |
|  | |
|  | |
| 44 |  | 
|  | |
|  | |
|  | |
| 45 |  | 
| 46 | 
             
            def judge(x):
                """Average the LSTM and GRU scores for one text.

                The text is passed through ``tokenize`` once, scored by
                ``LSTM_predict`` and then ``GRU_predict``, and both score vectors
                are rounded to two decimals before being averaged position by
                position.

                Args:
                    x: The raw text to score.

                Returns:
                    A list with the mean of the two models' rounded scores for
                    each of the first six positions.
                """
                prepared = tokenize(x)

                # Keep the original call order: LSTM first, then GRU.
                lstm_scores = np.round(LSTM_predict(prepared), 2)
                gru_scores = np.round(GRU_predict(prepared), 2)

                # Exactly six positions are read, as in the original loop.
                return [(lstm_scores[i] + gru_scores[i]) / 2 for i in range(6)]
         | 
| 62 |  | 
| 63 | 
            +
             | 
| 64 | 
             
            def judgePlus(x):
         | 
| 65 |  | 
|  | |
| 66 | 
             
              result = []
         | 
|  | |
| 67 |  | 
| 68 | 
            +
              x = tokenize(x)
         | 
|  | |
| 69 |  | 
| 70 | 
             
              lstm_pred = LSTM_predict(x)
         | 
| 71 | 
             
              gru_pred = GRU_predict(x)
         | 
|  | |
| 75 | 
             
                bert_pred = np.average([lstm_pred, gru_pred], axis=0)
         | 
| 76 |  | 
| 77 |  | 
|  | |
| 78 | 
             
              result_lstm = np.round(lstm_pred, 2)
         | 
| 79 | 
             
              result_gru = np.round(gru_pred, 2)
         | 
| 80 | 
             
              result_bert = np.round(bert_pred, 2)
         | 
| 81 | 
            +
             | 
| 82 | 
             
              if((result_lstm[0]+result_gru[0])<(result_bert[0]*2)):
         | 
| 83 | 
             
                for i in range(6):
         | 
| 84 | 
             
                  result.append((result_bert[i])/1)
         | 
|  | |
| 88 |  | 
| 89 | 
             
              return (result)
         | 
| 90 |  | 
| 91 | 
            +
             | 
| 92 | 
             
            def judgeBert(x):
         | 
| 93 |  | 
|  | |
| 94 | 
             
              result = []
         | 
|  | |
| 95 |  | 
| 96 | 
            +
              x = tokenize(x)
         | 
|  | |
| 97 |  | 
|  | |
| 98 | 
             
              try:
         | 
| 99 | 
             
                bert_pred = BERT_predict(x)
         | 
| 100 | 
             
              except:
         | 
| 101 | 
             
                bert_pred = np.zeros(6, dtype=float)
         | 
| 102 |  | 
|  | |
|  | |
|  | |
| 103 | 
             
              result_bert = np.round(bert_pred, 2)
         | 
|  | |
| 104 |  | 
| 105 | 
             
              for i in range(6):
         | 
| 106 | 
             
                result.append((result_bert[i])/1)
         |