Spaces:

du-lab
/

MLR-Copilot

Runtime error

App Files Files Community

MLR-Copilot / benchmarks /spaceship-titanic /env /train.py

Lim0011

Upload 251 files

85e3d20 verified about 1 year ago

raw

history blame

2.4 kB

	import seaborn as sns
	import pandas as pd
	import os
	from sklearn.linear_model import LogisticRegression

	def create_new_dataframe(data: pd.DataFrame, column_names) -> pd.DataFrame:
	    """Return a frame holding exactly ``column_names``, in that order.

	    Columns present in ``data`` are taken from it; every other requested
	    column is filled with zeros. The row index of ``data`` is always kept,
	    even when ``column_names`` is empty.

	    Args:
	        data: Source frame to take existing columns from.
	        column_names: Iterable of column labels the result must contain.

	    Returns:
	        A new DataFrame indexed like ``data`` with one column per label.
	    """
	    new_data = {
	        column: (
	            data[column]
	            if column in data.columns
	            else pd.Series(0, index=data.index)
	        )
	        for column in column_names
	    }
	    # Passing the index explicitly keeps the rows of `data` when no column
	    # is requested; otherwise the result would collapse to zero rows.
	    return pd.DataFrame(new_data, index=data.index)

	# Loading the dataset to train a binary classifier downstream
	df = pd.read_csv("train.csv")
	num_examples = df.shape[0]
	# Shuffle with a fixed seed so the 80/20 split below is reproducible.
	df = df.sample(frac=1, random_state=1)
	split_index = int(0.8 * num_examples)
	# .copy() makes both splits independent frames, so the column assignments
	# below do not write into a slice of df (avoids SettingWithCopyWarning).
	train_data = df[:split_index].copy()
	# Validation starts exactly at split_index; the former "+1" offset
	# silently dropped one row from both splits.
	val_data = df[split_index:].copy()

	# "Cabin" is split on "/" into three parts stored as Deck, Cabin_num and Side.
	train_data[["Deck", "Cabin_num", "Side"]] = train_data["Cabin"].str.split("/", expand=True)
	train_data = train_data.drop('Cabin', axis=1)

	val_data[["Deck", "Cabin_num", "Side"]] = val_data["Cabin"].str.split("/", expand=True)
	val_data = val_data.drop('Cabin', axis=1)

	TargetY = train_data["Transported"]
	TargetY_test = val_data["Transported"]

	# Expanding features to have boolean values as opposed to categorical
	# You can check all the features as column names and try to find good correlations with the target variable
	selectColumns = ["HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Side"]
	ResourceX = pd.get_dummies(train_data[selectColumns])
	# Align the validation dummies with the training columns (same labels,
	# same order): a category missing from the validation split would
	# otherwise yield a feature matrix that does not match the training one.
	ResourceX_test = create_new_dataframe(
	    pd.get_dummies(val_data[selectColumns]), ResourceX.columns
	)


	# ***********************************************
	# In this part of the code, write and train the model on the above dataset to perform the task.
	# Note that the fitted model must be stored in the `model` variable and the output accuracies
	# in the `train_accuracy` and `val_accuracy` variables; all three are used by the code below.
	# ***********************************************



	# ***********************************************
	# End of the main training module
	# ***********************************************

	print(f"Train Accuracy: {train_accuracy}")
	print(f"Validation Accuracy: {val_accuracy}")

	# Prepare the test set the same way as the training data: split "Cabin"
	# into Deck / Cabin_num / Side, then one-hot encode the selected columns.
	test_data = pd.read_csv('test.csv')
	test_data[["Deck", "Cabin_num", "Side"]] = test_data["Cabin"].str.split("/", expand=True)
	test_data = test_data.drop('Cabin', axis=1)

	# Align the test features with the training columns (same labels, same
	# order), zero-filling any dummy column absent from the test set. This
	# replaces the hard-coded insert of 'Deck_T' at position 17, which raised
	# when that column already existed or the frame had fewer columns, and
	# misaligned the features whenever the training columns differed.
	test_X = create_new_dataframe(
	    pd.get_dummies(test_data[selectColumns]), ResourceX.columns
	)

	test_preds = model.predict(test_X)


	# Write one prediction per passenger to the submission file.
	output = pd.DataFrame({'PassengerId': test_data.PassengerId,
	                       'Transported': test_preds})
	output.to_csv('submission.csv', index=False)