vijaykumar0704
/

Titanic_prediction_learn

Model card Files Files and versions

xet

Community

vijaykumar0704 committed on Sep 17, 2023

Commit

9971b85

1 Parent(s): 0119ff3

Create Seqfunction.py

Browse files

Files changed (1) hide show

Seqfunction.py +105 -0

Seqfunction.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import re
+import string as str
+import nltk
+import pandas as pd
+import keras
+from sklearn.model_selection import train_test_split
+from keras.preprocessing.text import Tokenizer
+from keras.preprocessing.sequence import pad_sequences
+from keras.layers import Dense, LSTM, Embedding
+from keras.models import Sequential
+from bs4 import BeautifulSoup
+from keras.models import save_model, load_model
+import tensorflow as tf
+import datetime
+ps = nltk.stem.PorterStemmer()
+stopwords = nltk.corpus.stopwords.words('english')
+new_words = ('I', 'fyi', 's', 'the')
+for i in new_words:
+    stopwords.append(i)
+def beauty(raw_text):
+    res_text = BeautifulSoup(raw_text, "html.parser").get_text()
+    return res_text
+def remove_urls(vTEXT):
+    vTEXT = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', vTEXT)
+    alphabet_regular_expression = re.compile("[^a-zA-Z]")
+    string_without_non_alphabet = re.sub(alphabet_regular_expression, " ", vTEXT)
+    return (string_without_non_alphabet)
+def clean_text(text):
+    # text = remove_urls(text_uri)
+    clean_dts = ''.join([char for char in text if char not in str.punctuation])
+    token = re.split('\W+', clean_dts)
+    stemtext = ' '.join([ps.stem(word) for word in token if word not in stopwords])
+    return stemtext
+# # Create a TensorBoard logger
+# logger = keras.callbacks.TensorBoard(
+#     log_dir='logs',
+#     write_graph=True,
+#     histogram_freq=5
+# )
+# Train data
+message = pd.read_csv('train_nlp.csv')
+message = message.drop(labels=["keyword", "location"], axis=1)
+message['textclean_uri'] = message["text"].apply(lambda x: remove_urls(x) if len(x) > 1 else 'NA')
+message['textclean'] = message["textclean_uri"].apply(lambda x: clean_text(x) if len(x) > 1 else 'NA')
+message.to_csv('dataclean_new_opt.csv', index=False)
+# Test data for model validity
+msg_value = pd.read_csv('test_nlp.csv')
+msg_value = msg_value.drop(labels=["keyword", "location"], axis=1)
+msg_value['textclean_uri'] = msg_value["text"].apply(lambda x: remove_urls(x) if len(x) > 1 else 'NA')
+msg_value['textclean'] = msg_value["textclean_uri"].apply(lambda x: clean_text(x).lower() if len(x) > 1 else 'NA')
+X_train, X_test, y_train, y_test = train_test_split(message['textclean'], message['target'], test_size=0.05)
+# Create token as input is text
+tokenizer = Tokenizer()
+tokenizer.fit_on_texts(X_train)
+x_train_seq = tokenizer.texts_to_sequences(X_train)
+x_text_seq = tokenizer.texts_to_sequences(X_test)
+x_final_seq = tokenizer.texts_to_sequences(msg_value['textclean'])
+# Pad all the data so that all input have same size
+x_train_seq_padded = pad_sequences(x_train_seq, 110)
+x_text_seq_padded = pad_sequences(x_text_seq, 110)
+x_final_seq_padded = pad_sequences(x_final_seq, 110)
+# print(x_final_seq_padded)
+log_dir = "logs" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
+# # Model Creation =======================================
+model = Sequential()
+model.add(Embedding(len(tokenizer.index_word) + 1, 32))
+model.add(LSTM(10, dropout=0, recurrent_dropout=0))
+model.add(Dense(10, activation='relu'))
+model.add(Dense(1, activation='sigmoid'))
+model.summary()
+model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
+history = model.fit(x_train_seq_padded, y_train, batch_size=110, epochs=20, shuffle=True,callbacks=[tensorboard_callback])
+model.save('my_model.h5')
+del model
+model = load_model('my_model.h5')
+test_error_rate = model.evaluate(x_text_seq_padded, y_test, verbose=0)
+# Make a prediction with the neural network
+prediction_value = model.predict(x_final_seq_padded)
+# predictions = np.argmax(prediction_value, 1)
+# mse
+print(prediction_value[0][0])
+# print(msg_final.shape())
+submit = pd.DataFrame({"Id": msg_value.id, 'target': prediction_value[:, 0]})
+submit.to_csv("hist_latest_rnn7.csv", index=False)