# NOTE(review): removed page-scrape artifacts ("Spaces:" / "Runtime error")
# that preceded the source; they were not part of the program.
#!/Users/pranab/Tools/anaconda/bin/python
# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import random
import sys
from random import randint

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import sklearn.cross_validation
import sklearn.linear_model
import sklearn.preprocessing
import sklearn.svm
from sklearn.ensemble import BaggingClassifier
from sklearn.externals import joblib

import jprops
# Require the properties file path as the single command line argument.
if len(sys.argv) < 2:
	print("usage: ./svm.py <config_properties_file>")
	# exit nonzero so shells and callers can detect the usage error
	# (bare sys.exit() reported success)
	sys.exit(1)
#train by bagging
def train_bagging():
	"""Train a bagging ensemble around the configured base SVM estimator.

	Uses module globals set from the properties file (bagging_num_estimator,
	bagging_sample_fraction, bagging_use_oob, persist_model, XC, yc).
	"""
	base = build_model()
	ensemble = BaggingClassifier(base_estimator=base, n_estimators=bagging_num_estimator,
		max_samples=bagging_sample_fraction, oob_score=bagging_use_oob)
	# fit the ensemble on the full training set
	ensemble.fit(XC, yc)
	# optionally save each member estimator to its own numbered file
	if persist_model:
		for seq, member in enumerate(ensemble.estimators_):
			model_file = model_file_directory + "/" + model_file_prefix + "_" + str(seq + 1) + ".mod"
			joblib.dump(member, model_file)
	# report training error of the ensemble itself
	score = ensemble.score(XC, yc)
	print("average error %.3f" % (1.0 - score))
#linear k fold validation
def train_kfold_validation(nfold):
	"""Run k-fold cross validation, natively via sklearn or via the
	extended contiguous-slice splitter, depending on configuration."""
	if not native_kfold_validation:
		print("extended linear kfold validation")
		train_kfold_validation_ext(nfold)
		return
	print("native linear kfold validation")
	model = build_model()
	fold_scores = sk.cross_validation.cross_val_score(model, XC, yc, cv=nfold)
	print("average error %.3f" % (1.0 - np.mean(fold_scores)))
#linear k fold validation
def train_kfold_validation_ext(nfold):
	"""Manual k-fold cross validation over contiguous slices of the data.

	Trains one model per fold, optionally persists each fold's model, and
	prints per-fold and averaged error rates (overall, false positive,
	false negative). Uses module globals dsize, persist_model, etc.
	"""
	model = build_model()
	offset = 0
	# fix: use integer division; '/' yields a float under Python 3 and
	# breaks the slice arithmetic in split_data (identical under Python 2)
	length = dsize // nfold
	errors = []
	fp_errors = []
	fn_errors = []
	for i in range(0, nfold):
		print("....Next fold %d" % (i))
		#split data
		(XV, yv, X, y) = split_data(offset, length)
		dvsize = len(XV)
		#train model
		model.fit(X, y)
		#persist model
		if persist_model:
			model_file = model_file_directory + "/" + model_file_prefix + "_" + str(i + 1) + ".mod"
			joblib.dump(model, model_file)
		#print support vectors
		print_support_vectors(model)
		#predict
		print("making predictions...")
		yp = model.predict(XV)
		#show prediction output
		(er, fp_er, fn_er) = validate(dvsize, yv, yp)
		errors.append(er)
		fp_errors.append(fp_er)
		fn_errors.append(fn_er)
		offset += length
	#average error across folds
	av_error = np.mean(errors)
	av_fp_error = np.mean(fp_errors)
	av_fn_error = np.mean(fn_errors)
	print("average error %.3f false positive error %.3f false negative error %.3f" % (av_error, av_fp_error, av_fn_error))
# random k fold validation
def train_rfold_validation(nfold, niter):
	"""Repeated random train/test split validation, natively via sklearn
	or via the extended random-window splitter, per configuration."""
	if native_rfold_validation:
		print("native random kfold validation")
		test_frac = 1.0 / nfold
		fold_scores = []
		for _ in range(0, niter):
			# fresh random seed per repetition
			seed = randint(1, 100)
			X, XV, y, yv = sk.cross_validation.train_test_split(XC, yc, test_size=test_frac, random_state=seed)
			model = build_model()
			model.fit(X, y)
			fold_scores.append(model.score(XV, yv))
		print(fold_scores)
		print("average error %.3f" % (1.0 - np.mean(fold_scores)))
	else:
		print("extended random kfold validation")
		train_rfold_validation_ext(nfold, niter)
# random k fold validation
def train_rfold_validation_ext(nfold, niter):
	"""Repeated validation on randomly positioned contiguous slices.

	Each iteration holds out a random window of size dsize // nfold,
	trains a fresh model on the remainder, optionally persists it, and
	accumulates error rates that are averaged at the end.
	"""
	# keep the window start low enough that a full-length slice fits
	max_offset_frac = 1.0 - 1.0 / nfold
	max_offset_frac -= .01
	# fix: use integer division; '/' yields a float under Python 3 and
	# breaks the slice arithmetic in split_data (identical under Python 2)
	length = dsize // nfold
	errors = []
	fp_errors = []
	fn_errors = []
	for i in range(0, niter):
		print("...Next iteration %d" % (i))
		offset = int(dsize * random.random() * max_offset_frac)
		print("offset: %d length: %d" % (offset, length))
		(XV, yv, X, y) = split_data(offset, length)
		dvsize = len(XV)
		#build model
		model = build_model()
		#train model
		model.fit(X, y)
		#persist model
		if persist_model:
			model_file = model_file_directory + "/" + model_file_prefix + "_" + str(i + 1) + ".mod"
			print("saving model file " + model_file)
			joblib.dump(model, model_file)
		#print support vectors
		print_support_vectors(model)
		#predict
		print("making predictions...")
		yp = model.predict(XV)
		#show prediction output
		(er, fp_er, fn_er) = validate(dvsize, yv, yp)
		errors.append(er)
		fp_errors.append(fp_er)
		fn_errors.append(fn_er)
	#average error across iterations
	av_error = np.mean(errors)
	av_fp_error = np.mean(fp_errors)
	av_fn_error = np.mean(fn_errors)
	print("average error %.3f false positive error %.3f false negative error %.3f" % (av_error, av_fp_error, av_fn_error))
# make predictions
def predict():
	"""Majority-vote prediction over the persisted ensemble of models.

	Loads num_models model files, lets each vote 0/1 per row of the
	global feature matrix X, and prints the winning class per row.
	Labels are assumed binary (0/1).
	"""
	psize = len(X)
	class_counts = []
	#all models
	for i in range(0, num_models):
		model_file = model_file_directory + "/" + model_file_prefix + "_" + str(i + 1) + ".mod"
		print("loading model file " + model_file)
		model = joblib.load(model_file)
		yp = model.predict(X)
		if i == 0:
			# first model creates one vote tally per row
			for label in yp:
				tally = {0: 0, 1: 0}
				if label == 0:
					tally[0] = 1
				else:
					tally[1] = 1
				class_counts.append(tally)
		else:
			# subsequent models add one vote per row
			for j in range(0, psize):
				class_counts[j][yp[j]] += 1
	# predict based on majority vote (ties go to class 1, as before)
	print("here are the predictions")
	for k in range(0, psize):
		tally = class_counts[k]
		if tally[0] > tally[1]:
			winner = 0
			majority = tally[0]
		else:
			winner = 1
			majority = tally[1]
		print(X[k])
		print("prediction %d majority count %d" % (winner, majority))
#builds model
def build_model():
	"""Construct the configured (untrained) SVM estimator.

	Reads module globals set from the properties file: algo, kernel_fun,
	poly_degree, penalty and kernel_coeff. Exits the process on an
	unknown algorithm name, as before.
	"""
	print("building model...")
	if algo == "svc":
		if kernel_fun == "poly":
			return sk.svm.SVC(C=penalty, kernel=kernel_fun, degree=poly_degree, gamma=kernel_coeff)
		if kernel_fun == "rbf" or kernel_fun == "sigmoid":
			return sk.svm.SVC(C=penalty, kernel=kernel_fun, gamma=kernel_coeff)
		return sk.svm.SVC(C=penalty, kernel=kernel_fun)
	if algo == "nusvc":
		if kernel_fun == "poly":
			return sk.svm.NuSVC(kernel=kernel_fun, degree=poly_degree, gamma=kernel_coeff)
		if kernel_fun == "rbf" or kernel_fun == "sigmoid":
			return sk.svm.NuSVC(kernel=kernel_fun, gamma=kernel_coeff)
		return sk.svm.NuSVC(kernel=kernel_fun)
	if algo == "linearsvc":
		return sk.svm.LinearSVC()
	print("invalid svm algorithm")
	sys.exit()
#splits data into training and validation sets
def split_data(offset, length):
	"""Carve [offset, offset + length) out of the global data set as the
	validation slice; everything else becomes the training set.

	Returns (XV, yv, X, y): validation features/labels, training
	features/labels. Works on copies so the globals are never mutated.
	"""
	print("splitting data...")
	#copy data
	features = np.copy(XC)
	labels = list(yc)
	# validation slice bounds, upper bound clamped to the data size
	lo = offset
	hi = min(lo + length, len(yc))
	XV = features[lo:hi:1]
	yv = labels[lo:hi:1]
	print("data size %d validation data size %d" % (dsize, len(XV)))
	#training set = everything outside the validation slice
	X = np.delete(features, np.s_[lo:hi:1], 0)
	y = np.delete(labels, np.s_[lo:hi:1], 0)
	return (XV, yv, X, y)
#print support vectors
def print_support_vectors(model):
	"""Dump the model's support vectors when enabled by configuration;
	skipped for linearsvc, which exposes none."""
	if algo == "linearsvc" or not print_sup_vectors:
		return
	print("showing support vectors...")
	print(model.support_vectors_)
	print("num of support vectors")
	print(model.n_support_)
#prints prediction output
def validate(dvsize, yv, yp):
	"""Compare predictions yp against actuals yv over the first dvsize rows.

	Prints the confusion-matrix rates and returns the tuple
	(error rate, false positive rate, false negative rate).
	Labels are assumed binary (0/1).
	"""
	print("showing predictions...")
	# fix: an empty validation slice previously raised ZeroDivisionError
	if dvsize == 0:
		return (0.0, 0.0, 0.0)
	err_count = 0
	tp = 0
	tn = 0
	fp = 0
	fn = 0
	# only the first dvsize rows participate, matching the caller's slice
	for actual, predicted in zip(yv[:dvsize], yp[:dvsize]):
		if actual != predicted:
			err_count += 1
		if predicted == 1 and actual == 1:
			tp += 1
		elif predicted == 1 and actual == 0:
			fp += 1
		elif predicted == 0 and actual == 0:
			tn += 1
		else:
			fn += 1
	er = float(err_count) / dvsize
	fp_er = float(fp) / dvsize
	fn_er = float(fn) / dvsize
	print("error %.3f" % (er))
	print("true positive : %.3f" % (float(tp) / dvsize))
	print("false positive: %.3f" % (fp_er))
	print("true negative : %.3f" % (float(tn) / dvsize))
	print("false negative: %.3f" % (fn_er))
	return (er, fp_er, fn_er)
# load configuration
def getConfigs(configFile):
	"""Read a Java-style properties file into a dict, echoing each entry."""
	print("using following configurations")
	loaded = {}
	with open(configFile) as fp:
		for key, value in jprops.iter_properties(fp):
			print("%s %s" % (key, value))
			loaded[key] = value
	return loaded
# load configuration and dispatch on mode (top-level script body)
configs = getConfigs(sys.argv[1])
mode = configs["common.mode"]
if mode == "train":
	#train
	print("running in train mode")
	data_file = configs["train.data.file"]
	feat_field_indices = configs["train.data.feature.fields"].split(",")
	feat_field_indices = [int(a) for a in feat_field_indices]
	class_field_index = int(configs["train.data.class.field"])
	preprocess = configs["common.preprocessing"]
	validation = configs["train.validation"]
	num_folds = int(configs["train.num.folds"])
	num_iter = int(configs["train.num.iter"])
	algo = configs["train.algorithm"]
	kernel_fun = configs["train.kernel.function"]
	poly_degree = int(configs["train.poly.degree"])
	penalty = float(configs["train.penalty"])
	if penalty < 0:
		# negative penalty in config means "use library default"
		penalty = 1.0
		print("using default for penalty")
	kernel_coeff = float(configs["train.gamma"])
	if kernel_coeff < 0:
		# negative gamma in config means "use library default"
		kernel_coeff = 'auto'
		print("using default for gamma")
	print_sup_vectors = configs["train.print.sup.vectors"].lower() == "true"
	persist_model = configs["train.persist.model"].lower() == "true"
	model_file_directory = configs["common.model.directory"]
	model_file_prefix = configs["common.model.file.prefix"]
	print(feat_field_indices)
	#extract feature fields
	d = np.loadtxt(data_file, delimiter=',')
	dsize = len(d)
	XC = d[:,feat_field_indices]
	#preprocess features
	if preprocess == "scale":
		XC = sk.preprocessing.scale(XC)
	elif preprocess == "normalize":
		XC = sk.preprocessing.normalize(XC, norm='l2')
	else:
		print("no preprocessing done")
	#extract output field as a flat list of int class labels
	yc = d[:,[class_field_index]]
	yc = yc.reshape(dsize)
	yc = [int(a) for a in yc]
	# train model with the configured validation strategy
	if validation == "kfold":
		native_kfold_validation = configs["train.native.kfold.validation"].lower() == "true"
		train_kfold_validation(num_folds)
	elif validation == "rfold":
		native_rfold_validation = configs["train.native.rfold.validation"].lower() == "true"
		train_rfold_validation(num_folds, num_iter)
	elif validation == "bagging":
		bagging_num_estimator = int(configs["train.bagging.num.estimators"])
		bagging_sample_fraction = float(configs["train.bagging.sample.fraction"])
		# bug fix: the oob flag was read from train.bagging.sample.fraction
		# (a float string, never equal to "true"), so oob_score was always False
		bagging_use_oob = configs["train.bagging.use.oob"].lower() == "true"
		train_bagging()
	else:
		print("invalid training validation method")
		sys.exit()
else:
	#predict
	print("running in prediction mode")
	pred_data_file = configs["pred.data.file"]
	pred_feat_field_indices = configs["pred.data.feature.fields"].split(",")
	pred_feat_field_indices = [int(a) for a in pred_feat_field_indices]
	preprocess = configs["common.preprocessing"]
	num_models = int(configs["pred.num.models"])
	model_file_directory = configs["common.model.directory"]
	model_file_prefix = configs["common.model.file.prefix"]
	#extract feature fields
	pd = np.loadtxt(pred_data_file, delimiter=',')
	pdsize = len(pd)
	X = pd[:,pred_feat_field_indices]
	#preprocess features
	if preprocess == "scale":
		X = sk.preprocessing.scale(X)
	elif preprocess == "normalize":
		X = sk.preprocessing.normalize(X, norm='l2')
	else:
		print("no preprocessing done")
	predict()