# NOTE(review): removed page-scrape artifacts ("Spaces:" / "Runtime error")
# that preceded the source; they were not part of the program.
#!/Users/pranab/Tools/anaconda/bin/python
# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import random
import sys
from random import randint

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import sklearn.cross_validation
import sklearn.linear_model
import sklearn.preprocessing
import sklearn.svm
from sklearn.ensemble import BaggingClassifier
from sklearn.externals import joblib

import jprops
# Require the properties file path as the single command line argument.
if len(sys.argv) < 2:
	print("usage: ./svm.py <config_properties_file>")
	# exit nonzero so shells and callers can detect the usage error
	# (bare sys.exit() reported success)
	sys.exit(1)
#train by bagging
def train_bagging():
	"""Train a bagging ensemble around the configured base SVM estimator.

	Uses module globals set from the properties file (bagging_num_estimator,
	bagging_sample_fraction, bagging_use_oob, persist_model, XC, yc).
	"""
	base = build_model()
	ensemble = BaggingClassifier(base_estimator=base, n_estimators=bagging_num_estimator,
		max_samples=bagging_sample_fraction, oob_score=bagging_use_oob)
	# fit the ensemble on the full training set
	ensemble.fit(XC, yc)
	# optionally save each member estimator to its own numbered file
	if persist_model:
		for seq, member in enumerate(ensemble.estimators_):
			model_file = model_file_directory + "/" + model_file_prefix + "_" + str(seq + 1) + ".mod"
			joblib.dump(member, model_file)
	# report training error of the ensemble itself
	score = ensemble.score(XC, yc)
	print("average error %.3f" % (1.0 - score))
#linear k fold validation
def train_kfold_validation(nfold):
	"""Run k-fold cross validation, natively via sklearn or via the
	extended contiguous-slice splitter, depending on configuration."""
	if not native_kfold_validation:
		print("extended linear kfold validation")
		train_kfold_validation_ext(nfold)
		return
	print("native linear kfold validation")
	model = build_model()
	fold_scores = sk.cross_validation.cross_val_score(model, XC, yc, cv=nfold)
	print("average error %.3f" % (1.0 - np.mean(fold_scores)))
#linear k fold validation
def train_kfold_validation_ext(nfold):
	"""Manual k-fold cross validation over contiguous slices of the data.

	Trains one model per fold, optionally persists each fold's model, and
	prints per-fold and averaged error rates (overall, false positive,
	false negative). Uses module globals dsize, persist_model, etc.
	"""
	model = build_model()
	offset = 0
	# fix: use integer division; '/' yields a float under Python 3 and
	# breaks the slice arithmetic in split_data (identical under Python 2)
	length = dsize // nfold
	errors = []
	fp_errors = []
	fn_errors = []
	for i in range(0, nfold):
		print("....Next fold %d" % (i))
		#split data
		(XV, yv, X, y) = split_data(offset, length)
		dvsize = len(XV)
		#train model
		model.fit(X, y)
		#persist model
		if persist_model:
			model_file = model_file_directory + "/" + model_file_prefix + "_" + str(i + 1) + ".mod"
			joblib.dump(model, model_file)
		#print support vectors
		print_support_vectors(model)
		#predict
		print("making predictions...")
		yp = model.predict(XV)
		#show prediction output
		(er, fp_er, fn_er) = validate(dvsize, yv, yp)
		errors.append(er)
		fp_errors.append(fp_er)
		fn_errors.append(fn_er)
		offset += length
	#average error across folds
	av_error = np.mean(errors)
	av_fp_error = np.mean(fp_errors)
	av_fn_error = np.mean(fn_errors)
	print("average error %.3f false positive error %.3f false negative error %.3f" % (av_error, av_fp_error, av_fn_error))
# random k fold validation
def train_rfold_validation(nfold, niter):
	"""Repeated random train/test split validation, natively via sklearn
	or via the extended random-window splitter, per configuration."""
	if native_rfold_validation:
		print("native random kfold validation")
		test_frac = 1.0 / nfold
		fold_scores = []
		for _ in range(0, niter):
			# fresh random seed per repetition
			seed = randint(1, 100)
			X, XV, y, yv = sk.cross_validation.train_test_split(XC, yc, test_size=test_frac, random_state=seed)
			model = build_model()
			model.fit(X, y)
			fold_scores.append(model.score(XV, yv))
		print(fold_scores)
		print("average error %.3f" % (1.0 - np.mean(fold_scores)))
	else:
		print("extended random kfold validation")
		train_rfold_validation_ext(nfold, niter)
# random k fold validation
def train_rfold_validation_ext(nfold, niter):
	"""Repeated validation on randomly positioned contiguous slices.

	Each iteration holds out a random window of size dsize // nfold,
	trains a fresh model on the remainder, optionally persists it, and
	accumulates error rates that are averaged at the end.
	"""
	# keep the window start low enough that a full-length slice fits
	max_offset_frac = 1.0 - 1.0 / nfold
	max_offset_frac -= .01
	# fix: use integer division; '/' yields a float under Python 3 and
	# breaks the slice arithmetic in split_data (identical under Python 2)
	length = dsize // nfold
	errors = []
	fp_errors = []
	fn_errors = []
	for i in range(0, niter):
		print("...Next iteration %d" % (i))
		offset = int(dsize * random.random() * max_offset_frac)
		print("offset: %d length: %d" % (offset, length))
		(XV, yv, X, y) = split_data(offset, length)
		dvsize = len(XV)
		#build model
		model = build_model()
		#train model
		model.fit(X, y)
		#persist model
		if persist_model:
			model_file = model_file_directory + "/" + model_file_prefix + "_" + str(i + 1) + ".mod"
			print("saving model file " + model_file)
			joblib.dump(model, model_file)
		#print support vectors
		print_support_vectors(model)
		#predict
		print("making predictions...")
		yp = model.predict(XV)
		#show prediction output
		(er, fp_er, fn_er) = validate(dvsize, yv, yp)
		errors.append(er)
		fp_errors.append(fp_er)
		fn_errors.append(fn_er)
	#average error across iterations
	av_error = np.mean(errors)
	av_fp_error = np.mean(fp_errors)
	av_fn_error = np.mean(fn_errors)
	print("average error %.3f false positive error %.3f false negative error %.3f" % (av_error, av_fp_error, av_fn_error))
# make predictions
def predict():
	"""Majority-vote prediction over the persisted ensemble of models.

	Loads num_models model files, lets each vote 0/1 per row of the
	global feature matrix X, and prints the winning class per row.
	Labels are assumed binary (0/1).
	"""
	psize = len(X)
	class_counts = []
	#all models
	for i in range(0, num_models):
		model_file = model_file_directory + "/" + model_file_prefix + "_" + str(i + 1) + ".mod"
		print("loading model file " + model_file)
		model = joblib.load(model_file)
		yp = model.predict(X)
		if i == 0:
			# first model creates one vote tally per row
			for label in yp:
				tally = {0: 0, 1: 0}
				if label == 0:
					tally[0] = 1
				else:
					tally[1] = 1
				class_counts.append(tally)
		else:
			# subsequent models add one vote per row
			for j in range(0, psize):
				class_counts[j][yp[j]] += 1
	# predict based on majority vote (ties go to class 1, as before)
	print("here are the predictions")
	for k in range(0, psize):
		tally = class_counts[k]
		if tally[0] > tally[1]:
			winner = 0
			majority = tally[0]
		else:
			winner = 1
			majority = tally[1]
		print(X[k])
		print("prediction %d majority count %d" % (winner, majority))
#builds model
def build_model():
	"""Construct the configured (untrained) SVM estimator.

	Reads module globals set from the properties file: algo, kernel_fun,
	poly_degree, penalty and kernel_coeff. Exits the process on an
	unknown algorithm name, as before.
	"""
	print("building model...")
	if algo == "svc":
		if kernel_fun == "poly":
			return sk.svm.SVC(C=penalty, kernel=kernel_fun, degree=poly_degree, gamma=kernel_coeff)
		if kernel_fun == "rbf" or kernel_fun == "sigmoid":
			return sk.svm.SVC(C=penalty, kernel=kernel_fun, gamma=kernel_coeff)
		return sk.svm.SVC(C=penalty, kernel=kernel_fun)
	if algo == "nusvc":
		if kernel_fun == "poly":
			return sk.svm.NuSVC(kernel=kernel_fun, degree=poly_degree, gamma=kernel_coeff)
		if kernel_fun == "rbf" or kernel_fun == "sigmoid":
			return sk.svm.NuSVC(kernel=kernel_fun, gamma=kernel_coeff)
		return sk.svm.NuSVC(kernel=kernel_fun)
	if algo == "linearsvc":
		return sk.svm.LinearSVC()
	print("invalid svm algorithm")
	sys.exit()
#splits data into training and validation sets
def split_data(offset, length):
	"""Carve [offset, offset + length) out of the global data set as the
	validation slice; everything else becomes the training set.

	Returns (XV, yv, X, y): validation features/labels, training
	features/labels. Works on copies so the globals are never mutated.
	"""
	print("splitting data...")
	#copy data
	features = np.copy(XC)
	labels = list(yc)
	# validation slice bounds, upper bound clamped to the data size
	lo = offset
	hi = min(lo + length, len(yc))
	XV = features[lo:hi:1]
	yv = labels[lo:hi:1]
	print("data size %d validation data size %d" % (dsize, len(XV)))
	#training set = everything outside the validation slice
	X = np.delete(features, np.s_[lo:hi:1], 0)
	y = np.delete(labels, np.s_[lo:hi:1], 0)
	return (XV, yv, X, y)
#print support vectors
def print_support_vectors(model):
	"""Dump the model's support vectors when enabled by configuration;
	skipped for linearsvc, which exposes none."""
	if algo == "linearsvc" or not print_sup_vectors:
		return
	print("showing support vectors...")
	print(model.support_vectors_)
	print("num of support vectors")
	print(model.n_support_)
#prints prediction output
def validate(dvsize, yv, yp):
	"""Compare predictions yp against actuals yv over the first dvsize rows.

	Prints the confusion-matrix rates and returns the tuple
	(error rate, false positive rate, false negative rate).
	Labels are assumed binary (0/1).
	"""
	print("showing predictions...")
	# fix: an empty validation slice previously raised ZeroDivisionError
	if dvsize == 0:
		return (0.0, 0.0, 0.0)
	err_count = 0
	tp = 0
	tn = 0
	fp = 0
	fn = 0
	# only the first dvsize rows participate, matching the caller's slice
	for actual, predicted in zip(yv[:dvsize], yp[:dvsize]):
		if actual != predicted:
			err_count += 1
		if predicted == 1 and actual == 1:
			tp += 1
		elif predicted == 1 and actual == 0:
			fp += 1
		elif predicted == 0 and actual == 0:
			tn += 1
		else:
			fn += 1
	er = float(err_count) / dvsize
	fp_er = float(fp) / dvsize
	fn_er = float(fn) / dvsize
	print("error %.3f" % (er))
	print("true positive : %.3f" % (float(tp) / dvsize))
	print("false positive: %.3f" % (fp_er))
	print("true negative : %.3f" % (float(tn) / dvsize))
	print("false negative: %.3f" % (fn_er))
	return (er, fp_er, fn_er)
# load configuration
def getConfigs(configFile):
	"""Read a Java-style properties file into a dict, echoing each entry."""
	print("using following configurations")
	loaded = {}
	with open(configFile) as fp:
		for key, value in jprops.iter_properties(fp):
			print("%s %s" % (key, value))
			loaded[key] = value
	return loaded
# load configuration and dispatch on mode (top-level script body)
configs = getConfigs(sys.argv[1])
mode = configs["common.mode"]
if mode == "train":
	#train
	print("running in train mode")
	data_file = configs["train.data.file"]
	feat_field_indices = configs["train.data.feature.fields"].split(",")
	feat_field_indices = [int(a) for a in feat_field_indices]
	class_field_index = int(configs["train.data.class.field"])
	preprocess = configs["common.preprocessing"]
	validation = configs["train.validation"]
	num_folds = int(configs["train.num.folds"])
	num_iter = int(configs["train.num.iter"])
	algo = configs["train.algorithm"]
	kernel_fun = configs["train.kernel.function"]
	poly_degree = int(configs["train.poly.degree"])
	penalty = float(configs["train.penalty"])
	if penalty < 0:
		# negative penalty in config means "use library default"
		penalty = 1.0
		print("using default for penalty")
	kernel_coeff = float(configs["train.gamma"])
	if kernel_coeff < 0:
		# negative gamma in config means "use library default"
		kernel_coeff = 'auto'
		print("using default for gamma")
	print_sup_vectors = configs["train.print.sup.vectors"].lower() == "true"
	persist_model = configs["train.persist.model"].lower() == "true"
	model_file_directory = configs["common.model.directory"]
	model_file_prefix = configs["common.model.file.prefix"]
	print(feat_field_indices)
	#extract feature fields
	d = np.loadtxt(data_file, delimiter=',')
	dsize = len(d)
	XC = d[:,feat_field_indices]
	#preprocess features
	if preprocess == "scale":
		XC = sk.preprocessing.scale(XC)
	elif preprocess == "normalize":
		XC = sk.preprocessing.normalize(XC, norm='l2')
	else:
		print("no preprocessing done")
	#extract output field as a flat list of int class labels
	yc = d[:,[class_field_index]]
	yc = yc.reshape(dsize)
	yc = [int(a) for a in yc]
	# train model with the configured validation strategy
	if validation == "kfold":
		native_kfold_validation = configs["train.native.kfold.validation"].lower() == "true"
		train_kfold_validation(num_folds)
	elif validation == "rfold":
		native_rfold_validation = configs["train.native.rfold.validation"].lower() == "true"
		train_rfold_validation(num_folds, num_iter)
	elif validation == "bagging":
		bagging_num_estimator = int(configs["train.bagging.num.estimators"])
		bagging_sample_fraction = float(configs["train.bagging.sample.fraction"])
		# bug fix: the oob flag was read from train.bagging.sample.fraction
		# (a float string, never equal to "true"), so oob_score was always False
		bagging_use_oob = configs["train.bagging.use.oob"].lower() == "true"
		train_bagging()
	else:
		print("invalid training validation method")
		sys.exit()
else:
	#predict
	print("running in prediction mode")
	pred_data_file = configs["pred.data.file"]
	pred_feat_field_indices = configs["pred.data.feature.fields"].split(",")
	pred_feat_field_indices = [int(a) for a in pred_feat_field_indices]
	preprocess = configs["common.preprocessing"]
	num_models = int(configs["pred.num.models"])
	model_file_directory = configs["common.model.directory"]
	model_file_prefix = configs["common.model.file.prefix"]
	#extract feature fields
	pd = np.loadtxt(pred_data_file, delimiter=',')
	pdsize = len(pd)
	X = pd[:,pred_feat_field_indices]
	#preprocess features
	if preprocess == "scale":
		X = sk.preprocessing.scale(X)
	elif preprocess == "normalize":
		X = sk.preprocessing.normalize(X, norm='l2')
	else:
		print("no preprocessing done")
	predict()