Commit
·
9971b85
1
Parent(s):
0119ff3
Create Seqfunction.py
Browse files- Seqfunction.py +105 -0
Seqfunction.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import string as str
|
3 |
+
import nltk
|
4 |
+
import pandas as pd
|
5 |
+
import keras
|
6 |
+
from sklearn.model_selection import train_test_split
|
7 |
+
from keras.preprocessing.text import Tokenizer
|
8 |
+
from keras.preprocessing.sequence import pad_sequences
|
9 |
+
from keras.layers import Dense, LSTM, Embedding
|
10 |
+
from keras.models import Sequential
|
11 |
+
from bs4 import BeautifulSoup
|
12 |
+
from keras.models import save_model, load_model
|
13 |
+
import tensorflow as tf
|
14 |
+
import datetime
|
15 |
+
|
16 |
+
# Shared NLP resources used by clean_text(): a Porter stemmer and the NLTK
# English stop-word list, extended with a few project-specific entries.
ps = nltk.stem.PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')

new_words = ('I', 'fyi', 's', 'the')
stopwords.extend(new_words)
|
22 |
+
|
23 |
+
|
24 |
+
def beauty(raw_text):
    """Return the text content of *raw_text* with HTML markup stripped.

    The input is parsed with BeautifulSoup's built-in "html.parser" backend
    and the result of ``get_text()`` is returned.
    """
    soup = BeautifulSoup(raw_text, "html.parser")
    return soup.get_text()
|
27 |
+
|
28 |
+
|
29 |
+
# Compiled once at import time instead of on every call.
# The scheme is optional (as in the original pattern), and the URL now runs
# up to the next whitespace so characters such as '-', '#', '~' or ':' no
# longer cut the match short and leak URL fragments into the text.
_URL_PATTERN = re.compile(r"(?:https?)?://\S*")
_NON_ALPHA_PATTERN = re.compile(r"[^a-zA-Z]")


def remove_urls(vTEXT):
    """Strip URLs from *vTEXT* and blank out every non-letter character.

    Despite its name this function does two things:

    1. removes anything that looks like ``http://...``, ``https://...`` or a
       bare ``://...`` up to the next whitespace character;
    2. replaces every remaining character outside ``a-z`` / ``A-Z`` (digits,
       punctuation, whitespace, non-ASCII letters) with a single space.

    Args:
        vTEXT: the raw text to clean; must be a ``str``.

    Returns:
        The cleaned string. Runs of spaces are not collapsed.
    """
    without_urls = _URL_PATTERN.sub('', vTEXT)
    return _NON_ALPHA_PATTERN.sub(" ", without_urls)
|
34 |
+
|
35 |
+
|
36 |
+
def clean_text(text):
    """Remove punctuation and stop words from *text* and stem what is left.

    Steps:
      1. drop every character listed in ``str.punctuation`` (``str`` is this
         file's alias for the standard ``string`` module);
      2. split on runs of non-word characters;
      3. discard empty tokens and stop words (compared case-insensitively,
         so capitalised stop words such as "The" are filtered too);
      4. stem each remaining token with the module-level Porter stemmer
         ``ps`` and join the stems with single spaces.

    Args:
        text: the string to clean.

    Returns:
        A space-separated string of stemmed tokens (may be empty).
    """
    without_punct = ''.join(ch for ch in text if ch not in str.punctuation)
    # Raw string: '\W' in a normal string literal is an invalid escape.
    tokens = re.split(r'\W+', without_punct)
    # `tok` is checked for emptiness because re.split yields '' at the edges
    # when the text starts or ends with a separator.
    return ' '.join(
        ps.stem(tok)
        for tok in tokens
        if tok and tok.lower() not in stopwords
    )
|
42 |
+
|
43 |
+
|
44 |
+
# # Create a TensorBoard logger
|
45 |
+
# logger = keras.callbacks.TensorBoard(
|
46 |
+
# log_dir='logs',
|
47 |
+
# write_graph=True,
|
48 |
+
# histogram_freq=5
|
49 |
+
# )
|
50 |
+
|
51 |
+
# ---------------------------------------------------------------------------
# Training / prediction script.
#
# Reads 'train_nlp.csv' and 'test_nlp.csv', cleans the "text" column, trains a
# small Embedding -> LSTM -> Dense binary classifier on the training data,
# evaluates it on a 5% hold-out split and writes the predictions for the test
# file to 'hist_latest_rnn7.csv'.
# ---------------------------------------------------------------------------

# Every sequence is padded / truncated to this many tokens.
MAX_SEQUENCE_LENGTH = 110


def _apply_if_text(value, func):
    """Return ``func(value)`` for strings longer than one character, else 'NA'.

    Non-string cells (e.g. NaN produced by pandas for an empty CSV field) are
    mapped to 'NA' as well instead of raising on ``len()``.
    """
    if isinstance(value, str) and len(value) > 1:
        return func(value)
    return 'NA'


def _load_and_clean(csv_path):
    """Load *csv_path* and add the 'textclean_uri' and 'textclean' columns.

    The same preprocessing is applied to the training and the test file so
    that both are tokenised from identically cleaned text.
    """
    frame = pd.read_csv(csv_path)
    frame = frame.drop(labels=["keyword", "location"], axis=1)
    frame['textclean_uri'] = frame["text"].apply(
        lambda x: _apply_if_text(x, remove_urls))
    frame['textclean'] = frame["textclean_uri"].apply(
        lambda x: _apply_if_text(x, lambda t: clean_text(t).lower()))
    return frame


# Train data
message = _load_and_clean('train_nlp.csv')
message.to_csv('dataclean_new_opt.csv', index=False)

# Test data for model validity
msg_value = _load_and_clean('test_nlp.csv')

# Hold out 5% of the labelled data for evaluation.
# NOTE(review): no random_state is set, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(
    message['textclean'], message['target'], test_size=0.05)

# Create token as input is text. The vocabulary is fitted on the training
# split only; words unseen there are dropped from the other sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
x_train_seq = tokenizer.texts_to_sequences(X_train)
x_text_seq = tokenizer.texts_to_sequences(X_test)
x_final_seq = tokenizer.texts_to_sequences(msg_value['textclean'])

# Pad all the data so that all input have same size
x_train_seq_padded = pad_sequences(x_train_seq, maxlen=MAX_SEQUENCE_LENGTH)
x_text_seq_padded = pad_sequences(x_text_seq, maxlen=MAX_SEQUENCE_LENGTH)
x_final_seq_padded = pad_sequences(x_final_seq, maxlen=MAX_SEQUENCE_LENGTH)

# NOTE(review): there is no path separator here, so each run creates a
# sibling directory named "logs<timestamp>" rather than "logs/<timestamp>".
log_dir = "logs" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# NOTE(review): the callback comes from tf.keras while the model is built
# from the standalone `keras` imports -- confirm the installed versions allow
# mixing the two.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Model Creation =======================================
model = Sequential()
# +1 because index 0 is reserved for padding.
model.add(Embedding(len(tokenizer.index_word) + 1, 32))
model.add(LSTM(10, dropout=0, recurrent_dropout=0))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(
    x_train_seq_padded,
    y_train,
    batch_size=110,
    epochs=20,
    shuffle=True,
    callbacks=[tensorboard_callback],
)

# Round-trip through disk so the evaluated model is exactly the saved one.
model.save('my_model.h5')
del model
model = load_model('my_model.h5')

# Despite its name this holds [loss, accuracy] as configured in compile().
test_error_rate = model.evaluate(x_text_seq_padded, y_test, verbose=0)
print(test_error_rate)

# Make a prediction with the neural network
prediction_value = model.predict(x_final_seq_padded)
print(prediction_value[0][0])

# NOTE(review): 'target' holds the raw sigmoid outputs (floats in [0, 1]),
# not thresholded 0/1 labels -- confirm this is what the consumer expects.
submit = pd.DataFrame({"Id": msg_value.id, 'target': prediction_value[:, 0]})
submit.to_csv("hist_latest_rnn7.csv", index=False)
|