|
import re
import string
import datetime

import nltk
import pandas as pd
import tensorflow as tf
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, LSTM, Embedding
from keras.models import Sequential, load_model
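
# The NLTK stopword corpus must be available locally; if it is missing,
# download it once before running this script:
#   import nltk; nltk.download('stopwords')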
|
|
|
ps = nltk.stem.PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')

# Extend the standard stopword list with a few extra tokens to drop
new_words = ('I', 'fyi', 's', 'the')
stopwords.extend(new_words)
|
|
|
|
|
def beauty(raw_text):
    # Strip HTML markup, keeping only the visible text
    res_text = BeautifulSoup(raw_text, "html.parser").get_text()
    return res_text
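# Example: beauty('<div>Forest fire near La Ronge</div>') returns
# 'Forest fire near La Ronge'. This helper is defined but never applied
# in the pipeline below; call it before remove_urls() if the raw tweets
# contain HTML tags or entities.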
|
|
|
|
|
def remove_urls(vTEXT):
    # Drop http/https URLs, then replace every non-alphabetic character with a space
    vTEXT = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', vTEXT)
    alphabet_regular_expression = re.compile("[^a-zA-Z]")
    string_without_non_alphabet = re.sub(alphabet_regular_expression, " ", vTEXT)
    return string_without_non_alphabet
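# Example: remove_urls('Fire near me http://t.co/abc123 #wildfire') keeps
# only letters and spaces, e.g. 'Fire near me    wildfire'; the stray
# whitespace is discarded later when clean_text() splits on \W+.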
|
|
|
|
|
def clean_text(text):
    # Remove punctuation, split into tokens, then stem every non-stopword token
    clean_dts = ''.join([char for char in text if char not in string.punctuation])
    tokens = re.split(r'\W+', clean_dts)
    stemtext = ' '.join([ps.stem(word) for word in tokens if word not in stopwords])
    return stemtext
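# Example: clean_text('Flooding in the streets') drops the stopwords
# 'in' and 'the' and stems the rest, returning 'flood street'
# (NLTK's PorterStemmer also lowercases each token).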
|
|
# Load the training data and drop columns the model does not use
message = pd.read_csv('train_nlp.csv')
message = message.drop(labels=["keyword", "location"], axis=1)

# Clean each tweet: strip URLs/non-letters, then stem and drop stopwords
message['textclean_uri'] = message["text"].apply(lambda x: remove_urls(x) if len(x) > 1 else 'NA')
message['textclean'] = message["textclean_uri"].apply(lambda x: clean_text(x) if len(x) > 1 else 'NA')
message.to_csv('dataclean_new_opt.csv', index=False)
|
|
|
|
|
# Apply the same cleaning pipeline to the test data. No explicit .lower()
# is needed here: PorterStemmer lowercases each token and the Keras
# Tokenizer lowercases by default.
msg_value = pd.read_csv('test_nlp.csv')
msg_value = msg_value.drop(labels=["keyword", "location"], axis=1)
msg_value['textclean_uri'] = msg_value["text"].apply(lambda x: remove_urls(x) if len(x) > 1 else 'NA')
msg_value['textclean'] = msg_value["textclean_uri"].apply(lambda x: clean_text(x) if len(x) > 1 else 'NA')
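# Train and test must pass through identical cleaning steps so the
# tokenizer fitted below maps both to the same integer vocabulary.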
|
|
|
# Hold out 5% of the training data for evaluation
X_train, X_test, y_train, y_test = train_test_split(message['textclean'], message['target'], test_size=0.05)
|
|
|
|
|
# Fit the tokenizer on the training split only, then map all texts to integer sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
x_train_seq = tokenizer.texts_to_sequences(X_train)
x_test_seq = tokenizer.texts_to_sequences(X_test)
x_final_seq = tokenizer.texts_to_sequences(msg_value['textclean'])
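# Tokenizer assigns indices by word frequency (1 = most common). Because no
# oov_token was configured, words unseen during fit_on_texts are silently
# dropped from the test and submission sequences.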
|
|
|
|
|
# Pad (or truncate) every sequence to a fixed length of 110 tokens
x_train_seq_padded = pad_sequences(x_train_seq, 110)
x_test_seq_padded = pad_sequences(x_test_seq, 110)
x_final_seq_padded = pad_sequences(x_final_seq, 110)
|
|
|
log_dir = "logs" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") |
|
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1) |
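# Inspect the training curves with:  tensorboard --logdir logs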
|
|
|
# Binary classifier: embedding -> LSTM -> dense head with sigmoid output
model = Sequential()
model.add(Embedding(len(tokenizer.index_word) + 1, 32))
model.add(LSTM(10, dropout=0, recurrent_dropout=0))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()
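# Parameter counts reported by model.summary():
#   Embedding: (vocab_size + 1) * 32
#   LSTM(10):  4 * (32 + 10 + 1) * 10 = 1720
#   Dense(10): 10 * 10 + 10 = 110
#   Dense(1):  10 + 1 = 11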
|
|
|
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(x_train_seq_padded, y_train, batch_size=110, epochs=20, shuffle=True, callbacks=[tensorboard_callback])
|
|
|
# Save the trained model to HDF5, then reload it to verify the round trip
model.save('my_model.h5')
del model
model = load_model('my_model.h5')
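# The .h5 file stores the architecture, weights and optimizer state, so the
# reloaded model can be evaluated (or even trained further) as-is.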
|
|
|
# evaluate() returns [loss, accuracy] for the held-out split
test_loss, test_accuracy = model.evaluate(x_test_seq_padded, y_test, verbose=0)
print('held-out loss: %.4f, accuracy: %.4f' % (test_loss, test_accuracy))
|
|
|
|
|
# Sigmoid probabilities for the submission set (one value per tweet)
prediction_value = model.predict(x_final_seq_padded)
print(prediction_value[0][0])
|
|
|
|
|
# Threshold the probabilities at 0.5 to produce binary labels for submission
submit = pd.DataFrame({"id": msg_value.id, "target": (prediction_value[:, 0] > 0.5).astype(int)})
submit.to_csv("hist_latest_rnn7.csv", index=False)