vijaykumar0704 committed on
Commit
9971b85
·
1 Parent(s): 0119ff3

Create Seqfunction.py

Browse files
Files changed (1) hide show
  1. Seqfunction.py +105 -0
Seqfunction.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string as str
3
+ import nltk
4
+ import pandas as pd
5
+ import keras
6
+ from sklearn.model_selection import train_test_split
7
+ from keras.preprocessing.text import Tokenizer
8
+ from keras.preprocessing.sequence import pad_sequences
9
+ from keras.layers import Dense, LSTM, Embedding
10
+ from keras.models import Sequential
11
+ from bs4 import BeautifulSoup
12
+ from keras.models import save_model, load_model
13
+ import tensorflow as tf
14
+ import datetime
15
+
16
# Stemmer and stop-word list used by the text-cleaning code in this file.
ps = nltk.stem.PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')

# Extra entries appended to NLTK's English stop-word list.
new_words = ('I', 'fyi', 's', 'the')
stopwords.extend(new_words)
22
+
23
+
24
def beauty(raw_text):
    """Return the text content of *raw_text* with markup stripped.

    The input is parsed by BeautifulSoup using the ``html.parser`` backend
    and the result of ``get_text()`` is returned.
    """
    return BeautifulSoup(raw_text, "html.parser").get_text()
27
+
28
+
29
# URL matcher: an optional http/https scheme, "://", then a run of URL
# characters ending on a word boundary (so trailing punctuation such as a
# sentence-final "." is not swallowed).  The character class includes
# '-', '#', '~', ':', '+' and '@' in addition to word characters and
# "./?=&%"; without them the match stopped early and the remainder of the
# URL (e.g. "bar" from ".../foo-bar") leaked into the cleaned text as a word.
_URL_PATTERN = re.compile(r'(?:https?)?://[\w./?=&%#~:+@-]*\b')

# Anything that is not an ASCII letter.
_NON_ALPHA_PATTERN = re.compile(r'[^a-zA-Z]')


def remove_urls(vTEXT):
    """Strip URLs from *vTEXT* and blank out every non-letter character.

    Args:
        vTEXT: The raw text to clean (a ``str``).

    Returns:
        The text with each URL removed and every remaining character that
        is not ``a-z``/``A-Z`` replaced by a single space.  Replacement is
        one-for-one, so runs of spaces are not collapsed.
    """
    without_urls = _URL_PATTERN.sub('', vTEXT)
    return _NON_ALPHA_PATTERN.sub(' ', without_urls)
34
+
35
+
36
def clean_text(text):
    """Remove punctuation and stop words from *text* and stem what remains.

    Args:
        text: The text to clean (a ``str``).

    Returns:
        A single space-separated string of Porter-stemmed tokens.  Tokens
        found in the module-level ``stopwords`` list are dropped, as are
        the empty tokens that splitting produces at the string edges.
    """
    # text = remove_urls(text_uri)
    # NOTE: in this file ``str`` is the ``string`` module (imported as
    # ``str``), not the builtin type, hence ``str.punctuation``.
    without_punct = ''.join(char for char in text if char not in str.punctuation)

    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', without_punct)

    # Compare in lower case: the stop-word entries are (almost all) lower
    # case while the input is not, and the stemmer lowercases its output,
    # so a capitalised "The" used to slip through and come out as "the".
    kept = [
        ps.stem(word)
        for word in tokens
        if word and word.lower() not in stopwords
    ]
    return ' '.join(kept)
42
+
43
+
44
+ # # Create a TensorBoard logger
45
+ # logger = keras.callbacks.TensorBoard(
46
+ # log_dir='logs',
47
+ # write_graph=True,
48
+ # histogram_freq=5
49
+ # )
50
+
51
# Train data: load the CSV, drop the unused columns, then derive the
# URL-stripped and the fully cleaned text columns.
message = pd.read_csv('train_nlp.csv').drop(columns=["keyword", "location"])
message['textclean_uri'] = message["text"].apply(
    lambda x: 'NA' if len(x) <= 1 else remove_urls(x)
)
message['textclean'] = message["textclean_uri"].apply(
    lambda x: 'NA' if len(x) <= 1 else clean_text(x)
)
# Write the cleaned training frame to disk.
message.to_csv('dataclean_new_opt.csv', index=False)
57
+
58
# Test data for model validity: same cleaning steps as the training data,
# with the cleaned text additionally lower-cased.
msg_value = pd.read_csv('test_nlp.csv').drop(columns=["keyword", "location"])
msg_value['textclean_uri'] = msg_value["text"].apply(
    lambda x: 'NA' if len(x) <= 1 else remove_urls(x)
)
msg_value['textclean'] = msg_value["textclean_uri"].apply(
    lambda x: 'NA' if len(x) <= 1 else clean_text(x).lower()
)
63
+
64
# Hold out 5% of the labelled rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    message['textclean'],
    message['target'],
    test_size=0.05,
)

# Create token as input is text.  The tokenizer is fitted on the training
# split only and then applied to all three text collections.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
x_train_seq, x_text_seq, x_final_seq = [
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train, X_test, msg_value['textclean'])
]
72
+
73
# Pad all the data so that all input have same size (maxlen of 110).
x_train_seq_padded, x_text_seq_padded, x_final_seq_padded = [
    pad_sequences(sequences, maxlen=110)
    for sequences in (x_train_seq, x_text_seq, x_final_seq)
]
# print(x_final_seq_padded)

# One TensorBoard log directory per run, named by the start timestamp.
# NOTE(review): there is no separator between "logs" and the timestamp, so
# runs land in sibling "logs<timestamp>" directories rather than under
# "logs/" - confirm this is intended.
log_dir = "logs" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)
80
+ # # Model Creation =======================================
81
# Embedding -> LSTM -> Dense(relu) -> Dense(sigmoid) binary classifier.
# The embedding's input dimension is the tokenizer's vocabulary size + 1
# (presumably to leave room for the padding index 0 - TODO confirm).
model = Sequential([
    Embedding(len(tokenizer.index_word) + 1, 32),
    LSTM(10, dropout=0, recurrent_dropout=0),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
history = model.fit(
    x_train_seq_padded,
    y_train,
    batch_size=110,
    epochs=20,
    shuffle=True,
    callbacks=[tensorboard_callback],
)
90
+
91
# Save the trained model, drop the in-memory instance and reload it from
# disk; everything after this point uses the reloaded model.
model.save('my_model.h5')
del model
model = load_model('my_model.h5')

# Evaluate on the held-out split.  The result used to be computed and then
# silently discarded; report it so the run shows how the model performed.
test_error_rate = model.evaluate(x_text_seq_padded, y_test, verbose=0)
print("Hold-out evaluation (loss, then compiled metrics):", test_error_rate)
96
+
97
# Make a prediction with the neural network
prediction_value = model.predict(x_final_seq_padded)
# predictions = np.argmax(prediction_value, 1)
# mse
# Print the first predicted value.
print(prediction_value[0][0])

# print(msg_final.shape())
# Build the output frame from the test ids and the first prediction
# column, then write it to CSV without the index.
submit = pd.DataFrame(
    {
        "Id": msg_value.id,
        'target': prediction_value[:, 0],
    }
)
submit.to_csv("hist_latest_rnn7.csv", index=False)