#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import numpy as np
from sklearn import preprocessing
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.datasets import make_classification
import random
from math import *
from decimal import Decimal
import statistics
import jprops
from Levenshtein import distance as ld

from .util import *
from .sampler import *

class Configuration:
	"""
	Configuration management. Supports default value, mandatory value and typed value.
	"""
	def __init__(self, configFile, defValues, verbose=False):
		"""
		initializer
		Parameters
			configFile : config file path
			defValues : dictionary of default values
			verbose : verbosity flag
		"""
		configs = {}
		with open(configFile) as fp:
			for key, value in jprops.iter_properties(fp):
				configs[key] = value
		self.configs = configs
		self.defValues = defValues
		self.verbose = verbose

	def override(self, configFile):
		"""
		overrides configuration from file
		Parameters
			configFile : override config file path
		"""
		with open(configFile) as fp:
			for key, value in jprops.iter_properties(fp):
				self.configs[key] = value

	def setParam(self, name, value):
		"""
		overrides individual configuration parameter
		Parameters
			name : config param name
			value : config param value
		"""
		self.configs[name] = value

	def getStringConfig(self, name):
		"""
		get string param
		Parameters
			name : config param name
		"""
		if self.isNone(name):
			val = (None, False)
		elif self.isDefault(name):
			val = (self.handleDefault(name), True)
		else:
			val = (self.configs[name], False)
		if self.verbose:
			print("{} {} {}".format(name, self.configs[name], val[0]))
		return val

	def getIntConfig(self, name):
		"""
		get int param
		Parameters
			name : config param name
		"""
		if self.isNone(name):
			val = (None, False)
		elif self.isDefault(name):
			val = (self.handleDefault(name), True)
		else:
			val = (int(self.configs[name]), False)
		if self.verbose:
			print("{} {} {}".format(name, self.configs[name], val[0]))
		return val

	def getFloatConfig(self, name):
		"""
		get float param
		Parameters
			name : config param name
		"""
		if self.isNone(name):
			val = (None, False)
		elif self.isDefault(name):
			val = (self.handleDefault(name), True)
		else:
			val = (float(self.configs[name]), False)
		if self.verbose:
			print("{} {} {:06.3f}".format(name, self.configs[name], val[0]))
		return val

	def getBooleanConfig(self, name):
		"""
		get boolean param
		Parameters
			name : config param name
		"""
		if self.isNone(name):
			val = (None, False)
		elif self.isDefault(name):
			val = (self.handleDefault(name), True)
		else:
			bVal = self.configs[name].lower() == "true"
			val = (bVal, False)
		if self.verbose:
			print("{} {} {}".format(name, self.configs[name], val[0]))
		return val

	def getIntListConfig(self, name, delim=","):
		"""
		get int list param
		Parameters
			name : config param name
			delim : delimiter
		"""
		if self.isNone(name):
			val = (None, False)
		elif self.isDefault(name):
			val = (self.handleDefault(name), True)
		else:
			delSepStr = self.getStringConfig(name)
			#specified as list or range
			intList = strListOrRangeToIntArray(delSepStr[0])
			val = (intList, delSepStr[1])
		return val

	def getFloatListConfig(self, name, delim=","):
		"""
		get float list param
		Parameters
			name : config param name
			delim : delimiter
		"""
		delSepStr = self.getStringConfig(name)
		if self.isNone(name):
			val = (None, False)
		elif self.isDefault(name):
			val = (self.handleDefault(name), True)
		else:
			flList = strToFloatArray(delSepStr[0], delim)
			val = (flList, delSepStr[1])
		return val

	def getStringListConfig(self, name, delim=","):
		"""
		get string list param
		Parameters
			name : config param name
			delim : delimiter
		"""
		delSepStr = self.getStringConfig(name)
		if self.isNone(name):
			val = (None, False)
		elif self.isDefault(name):
			val = (self.handleDefault(name), True)
		else:
			strList = delSepStr[0].split(delim)
			val = (strList, delSepStr[1])
		return val

	def handleDefault(self, name):
		"""
		handles default value
		Parameters
			name : config param name
		"""
		dVal = self.defValues[name]
		if (dVal[1] is None):
			val = dVal[0]
		else:
			raise ValueError(dVal[1])
		return val

	def isNone(self, name):
		"""
		true if the value is None
		Parameters
			name : config param name
		"""
		return self.configs[name].lower() == "none"

	def isDefault(self, name):
		"""
		true if the value is default
		Parameters
			name : config param name
		"""
		de = self.configs[name] == "_"
		return de

	def eitherOrStringConfig(self, firstName, secondName):
		"""
		returns one of two string parameters
		Parameters
			firstName : first parameter name
			secondName : second parameter name
		"""
		if not self.isNone(firstName):
			first = self.getStringConfig(firstName)[0]
			second = None
			if not self.isNone(secondName):
				raise ValueError("only one of the two parameters should be set and not both " + firstName + " " + secondName)
		else:
			if not self.isNone(secondName):
				second = self.getStringConfig(secondName)[0]
				first = None
			else:
				raise ValueError("at least one of the two parameters should be set " + firstName + " " + secondName)
		return (first, second)

	def eitherOrIntConfig(self, firstName, secondName):
		"""
		returns one of two int parameters
		Parameters
			firstName : first parameter name
			secondName : second parameter name
		"""
		if not self.isNone(firstName):
			first = self.getIntConfig(firstName)[0]
			second = None
			if not self.isNone(secondName):
				raise ValueError("only one of the two parameters should be set and not both " + firstName + " " + secondName)
		else:
			if not self.isNone(secondName):
				second = self.getIntConfig(secondName)[0]
				first = None
			else:
				raise ValueError("at least one of the two parameters should be set " + firstName + " " + secondName)
		return (first, second)
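
# Illustrative usage sketch for Configuration (not part of the library API). It writes a
# small java style properties file and reads typed parameters back; the key names and
# values below are hypothetical. "_" selects the default value supplied in defValues.
def _exampleConfiguration():
	import tempfile
	props = "train.data.file=iris.csv\ntrain.num.iter=_\ntrain.learning.rate=0.05\n"
	defValues = {"train.num.iter" : (100, None), "train.learning.rate" : (0.01, None)}
	with tempfile.NamedTemporaryFile("w", suffix=".properties", delete=False) as fp:
		fp.write(props)
		cpath = fp.name
	config = Configuration(cpath, defValues)
	print(config.getStringConfig("train.data.file")[0])	# iris.csv
	print(config.getIntConfig("train.num.iter")[0])	# 100, taken from the default
	print(config.getFloatConfig("train.learning.rate")[0])	# 0.05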

class CatLabelGenerator:
	"""
	label generator for categorical variables
	"""
	def __init__(self, catValues, delim):
		"""
		initializer
		Parameters
			catValues : dictionary of categorical values keyed by column index
			delim : delimiter
		"""
		self.encoders = {}
		self.catValues = catValues
		self.delim = delim
		for k in self.catValues.keys():
			le = preprocessing.LabelEncoder()
			le.fit(self.catValues[k])
			self.encoders[k] = le

	def processRow(self, row):
		"""
		encode row categorical values
		Parameters
			row : data row
		"""
		rowArr = row.split(self.delim)
		for i in range(len(rowArr)):
			if (i in self.catValues):
				curVal = rowArr[i]
				assert curVal in self.catValues[i], "categorical value invalid"
				encVal = self.encoders[i].transform([curVal])
				rowArr[i] = str(encVal[0])
		return self.delim.join(rowArr)

	def getOrigLabels(self, indx):
		"""
		get original labels
		Parameters
			indx : column index
		"""
		return self.encoders[indx].classes_
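
# Illustrative usage sketch for CatLabelGenerator (not part of the library API); column 1
# of the hypothetical comma separated record is categorical and gets label encoded.
def _exampleCatLabelGenerator():
	catValues = {1 : ["red", "green", "blue"]}
	gen = CatLabelGenerator(catValues, ",")
	print(gen.processRow("4.2,red,7"))	# 4.2,2,7 since classes are sorted as blue, green, red
	print(gen.getOrigLabels(1))	# ['blue' 'green' 'red']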

class SupvLearningDataGenerator:
	"""
	data generator for supervised learning
	"""
	def __init__(self, configFile):
		"""
		initializer
		Parameters
			configFile : config file path
		"""
		defValues = dict()
		defValues["common.num.samp"] = (100, None)
		defValues["common.num.feat"] = (5, None)
		defValues["common.feat.trans"] = (None, None)
		defValues["common.feat.types"] = (None, "missing feature types")
		defValues["common.cat.feat.distr"] = (None, None)
		defValues["common.output.precision"] = (3, None)
		defValues["common.error"] = (0.01, None)
		defValues["class.gen.technique"] = ("blob", None)
		defValues["class.num.feat.informative"] = (2, None)
		defValues["class.num.feat.redundant"] = (2, None)
		defValues["class.num.feat.repeated"] = (0, None)
		defValues["class.num.feat.cat"] = (0, None)
		defValues["class.num.class"] = (2, None)
		self.config = Configuration(configFile, defValues)

	def genClassifierData(self):
		"""
		generates classifier data
		"""
		nsamp = self.config.getIntConfig("common.num.samp")[0]
		nfeat = self.config.getIntConfig("common.num.feat")[0]
		nclass = self.config.getIntConfig("class.num.class")[0]
		#transform with shift and scale
		ftrans = self.config.getFloatListConfig("common.feat.trans")[0]
		feTrans = dict()
		for i in range(0, len(ftrans), 2):
			tr = (ftrans[i], ftrans[i+1])
			indx = int(i/2)
			feTrans[indx] = tr
		ftypes = self.config.getStringListConfig("common.feat.types")[0]
		#categorical feature distribution
		feCatDist = dict()
		fcatdl = self.config.getStringListConfig("common.cat.feat.distr")[0]
		for fcatds in fcatdl:
			fcatd = fcatds.split(":")
			feInd = int(fcatd[0])
			clVal = int(fcatd[1])
			key = (feInd, clVal)	#feature index and class value
			dist = list(map(lambda i : (fcatd[i], float(fcatd[i+1])), range(2, len(fcatd), 2)))
			feCatDist[key] = CategoricalRejectSampler(*dist)
		#generation technique and class label error rate
		genTechnique = self.config.getStringConfig("class.gen.technique")[0]
		error = self.config.getFloatConfig("common.error")[0]
		if genTechnique == "blob":
			features, claz = make_blobs(n_samples=nsamp, centers=nclass, n_features=nfeat)
			for i in range(nsamp):	#shift and scale
				for j in range(nfeat):
					tr = feTrans[j]
					features[i,j] = (features[i,j] + tr[0]) * tr[1]
			claz = np.array(list(map(lambda c : random.randint(0, nclass-1) if random.random() < error else c, claz)))
		elif genTechnique == "classify":
			nfeatInfo = self.config.getIntConfig("class.num.feat.informative")[0]
			nfeatRed = self.config.getIntConfig("class.num.feat.redundant")[0]
			nfeatRep = self.config.getIntConfig("class.num.feat.repeated")[0]
			shifts = list(map(lambda i : feTrans[i][0], range(nfeat)))
			scales = list(map(lambda i : feTrans[i][1], range(nfeat)))
			features, claz = make_classification(n_samples=nsamp, n_features=nfeat, n_informative=nfeatInfo, n_redundant=nfeatRed,
				n_repeated=nfeatRep, n_classes=nclass, flip_y=error, shift=shifts, scale=scales)
		else:
			raise ValueError("invalid generation technique")
		#add categorical features and format
		nCatFeat = self.config.getIntConfig("class.num.feat.cat")[0]
		prec = self.config.getIntConfig("common.output.precision")[0]
		for f, c in zip(features, claz):
			nfs = list(map(lambda i : self.numFeToStr(i, f[i], c, ftypes[i], prec), range(nfeat)))
			if nCatFeat > 0:
				cfs = list(map(lambda i : self.catFe(i, c, ftypes[i], feCatDist), range(nfeat, nfeat + nCatFeat, 1)))
				rec = ",".join(nfs) + "," + ",".join(cfs) + "," + str(c)
			else:
				rec = ",".join(nfs) + "," + str(c)
			yield rec

	def numFeToStr(self, i, fv, cv, ft, prec):
		"""
		numeric feature value to string
		Parameters
			i : col index
			fv : field value
			cv : class value
			ft : field data type
			prec : precision
		"""
		if ft == "float":
			s = formatFloat(prec, fv)
		elif ft == "int":
			s = str(int(fv))
		else:
			raise ValueError("invalid type, expecting float or int")
		return s

	def catFe(self, i, cv, ft, feCatDist):
		"""
		generate categorical feature
		Parameters
			i : col index
			cv : class value
			ft : field data type
			feCatDist : cat value distribution
		"""
		if ft == "cat":
			key = (i, cv)
			s = feCatDist[key].sample()
		else:
			raise ValueError("invalid type, expecting categorical")
		return s

class RegressionDataGenerator:
	"""
	data generator for regression, including square terms, cross terms, bias, noise, correlated variables
	and user defined function
	"""
	def __init__(self, configFile, callback=None):
		"""
		initializer
		Parameters
			configFile : config file path
			callback : user defined function
		"""
		defValues = dict()
		defValues["common.pvar.samplers"] = (None, None)
		defValues["common.pvar.ranges"] = (None, None)
		defValues["common.linear.weights"] = (None, None)
		defValues["common.square.weights"] = (None, None)
		defValues["common.crterm.weights"] = (None, None)
		defValues["common.corr.params"] = (None, None)
		defValues["common.bias"] = (0, None)
		defValues["common.noise"] = (None, None)
		defValues["common.tvar.range"] = (None, None)
		defValues["common.weight.niter"] = (20, None)
		self.config = Configuration(configFile, defValues)
		self.callback = callback

		#samplers for predictor variables
		items = self.config.getStringListConfig("common.pvar.samplers")[0]
		self.samplers = list(map(lambda s : createSampler(s), items))
		self.npvar = len(self.samplers)

		#value ranges for predictor variables
		items = self.config.getStringListConfig("common.pvar.ranges")[0]
		self.pvranges = list()
		for i in range(0, len(items), 2):
			if items[i] == "none":
				r = None
			else:
				vmin = float(items[i])
				vmax = float(items[i+1])
				r = (vmin, vmax, vmax - vmin)
			self.pvranges.append(r)
		assertEqual(len(self.pvranges), self.npvar, "no of predictor var ranges provided is invalid")

		#linear weights for predictor variables
		self.lweights = self.config.getFloatListConfig("common.linear.weights")[0]
		assertEqual(len(self.lweights), self.npvar, "no of linear weights provided is invalid")

		#square weights for predictor variables
		items = self.config.getStringListConfig("common.square.weights")[0]
		self.sqweight = dict()
		for i in range(0, len(items), 2):
			vi = int(items[i])
			assertLesser(vi, self.npvar, "invalid predictor var index")
			wt = float(items[i+1])
			self.sqweight[vi] = wt

		#cross term weights for predictor variables
		items = self.config.getStringListConfig("common.crterm.weights")[0]
		self.crweight = dict()
		for i in range(0, len(items), 3):
			vi = int(items[i])
			assertLesser(vi, self.npvar, "invalid predictor var index")
			vj = int(items[i+1])
			assertLesser(vj, self.npvar, "invalid predictor var index")
			wt = float(items[i+2])
			vp = (vi, vj)
			self.crweight[vp] = wt

		#correlated variables
		items = self.config.getStringListConfig("common.corr.params")[0]
		self.corrparams = dict()
		for co in items:
			cparam = co.split(":")
			vi = int(cparam[0])
			vj = int(cparam[1])
			k = (vi, vj)
			bias = float(cparam[2])
			wt = float(cparam[3])
			noise = float(cparam[4])
			roundoff = cparam[5] == "true"
			v = (bias, wt, noise, roundoff)
			self.corrparams[k] = v

		#bias, noise and target range values
		self.bias = self.config.getFloatConfig("common.bias")[0]
		noise = self.config.getStringListConfig("common.noise")[0]
		self.ndistr = noise[0]
		self.noise = float(noise[1])
		self.tvarlim = self.config.getFloatListConfig("common.tvar.range")[0]

		#sample to calibrate weights
		niter = self.config.getIntConfig("common.weight.niter")[0]
		yvals = list()
		for i in range(niter):
			y = self.sample()[1]
			yvals.append(y)

		#scale weights by sampled mean and target mean
		my = statistics.mean(yvals)
		myt = (self.tvarlim[1] - self.tvarlim[0]) / 2
		sc = (myt - self.bias) / (my - self.bias)
		self.lweights = list(map(lambda w : w * sc, self.lweights))
		for k in self.sqweight.keys():
			self.sqweight[k] *= sc
		for k in self.crweight.keys():
			self.crweight[k] *= sc

	def sample(self):
		"""
		sample predictor variables and target variable
		"""
		pvd = list(map(lambda s : s.sample(), self.samplers))

		#correct for correlated variables
		for k in self.corrparams.keys():
			vi = k[0]
			vj = k[1]
			v = self.corrparams[k]
			bias = v[0]
			wt = v[1]
			noise = v[2]
			roundoff = v[3]
			nv = bias + wt * pvd[vi]
			pvd[vj] = preturbScalar(nv, noise, "normal")
			if roundoff:
				pvd[vj] = round(pvd[vj])

		spvd = list()
		lsum = self.bias
		for i in range(self.npvar):
			#range limit
			if self.pvranges[i] is not None:
				pvd[i] = rangeLimit(pvd[i], self.pvranges[i][0], self.pvranges[i][1])
			spvd.append(pvd[i])
			#scale
			pvd[i] = scaleMinMaxScaData(pvd[i], self.pvranges[i])
			lsum += self.lweights[i] * pvd[i]

		#square terms
		ssum = 0
		for k in self.sqweight.keys():
			ssum += self.sqweight[k] * pvd[k] * pvd[k]

		#cross terms
		crsum = 0
		for k in self.crweight.keys():
			vi = k[0]
			vj = k[1]
			crsum += self.crweight[k] * pvd[vi] * pvd[vj]

		y = lsum + ssum + crsum
		y = preturbScalar(y, self.noise, self.ndistr)
		if self.callback is not None:
			ufy = self.callback(spvd)
			y += ufy
		r = (spvd, y)
		return r

def loadDataFile(file, delim, cols, colIndices):
	"""
	loads delimiter separated file and extracts columns
	Parameters
		file : file path
		delim : delimiter
		cols : columns to use from file
		colIndices : columns to extract
	"""
	data = np.loadtxt(file, delimiter=delim, usecols=cols)
	extrData = data[:,colIndices]
	return (data, extrData)

def loadFeatDataFile(file, delim, cols):
	"""
	loads delimiter separated file and extracts columns
	Parameters
		file : file path
		delim : delimiter
		cols : columns to use from file
	"""
	data = np.loadtxt(file, delimiter=delim, usecols=cols)
	return data

def extrColumns(arr, columns):
	"""
	extracts columns
	Parameters
		arr : 2D array
		columns : columns
	"""
	return arr[:, columns]

def subSample(featData, clsData, subSampleRate, withReplacement):
	"""
	subsamples feature and class label data
	Parameters
		featData : 2D array of feature data
		clsData : array of class labels
		subSampleRate : fraction to be sampled
		withReplacement : true if sampling with replacement
	"""
	sampSize = int(featData.shape[0] * subSampleRate)
	sampledIndx = np.random.choice(featData.shape[0], sampSize, replace=withReplacement)
	sampFeat = featData[sampledIndx]
	sampCls = clsData[sampledIndx]
	return (sampFeat, sampCls)

def euclideanDistance(x, y):
	"""
	euclidean distance
	Parameters
		x : first vector
		y : second vector
	"""
	return sqrt(sum(pow(a-b, 2) for a, b in zip(x, y)))

def squareRooted(x):
	"""
	square root of sum of squares
	Parameters
		x : data vector
	"""
	return round(sqrt(sum([a*a for a in x])), 3)

def cosineSimilarity(x, y):
	"""
	cosine similarity
	Parameters
		x : first vector
		y : second vector
	"""
	numerator = sum(a*b for a, b in zip(x, y))
	denominator = squareRooted(x) * squareRooted(y)
	return round(numerator / float(denominator), 3)

def cosineDistance(x, y):
	"""
	cosine distance
	Parameters
		x : first vector
		y : second vector
	"""
	return 1.0 - cosineSimilarity(x, y)

def manhattanDistance(x, y):
	"""
	manhattan distance
	Parameters
		x : first vector
		y : second vector
	"""
	return sum(abs(a-b) for a, b in zip(x, y))

def nthRoot(value, nRoot):
	"""
	nth root
	Parameters
		value : data value
		nRoot : root
	"""
	rootValue = 1 / float(nRoot)
	return round(Decimal(value) ** Decimal(rootValue), 3)

def minkowskiDistance(x, y, pValue):
	"""
	minkowski distance
	Parameters
		x : first vector
		y : second vector
		pValue : power factor
	"""
	return nthRoot(sum(pow(abs(a-b), pValue) for a, b in zip(x, y)), pValue)

def jaccardSimilarityX(x, y):
	"""
	jaccard similarity
	Parameters
		x : first vector
		y : second vector
	"""
	intersectionCardinality = len(set.intersection(*[set(x), set(y)]))
	unionCardinality = len(set.union(*[set(x), set(y)]))
	return intersectionCardinality / float(unionCardinality)
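
# Illustrative usage sketch for the vector distance and similarity helpers above
# (not part of the library API); the sample vectors are arbitrary.
def _exampleDistances():
	x = [1.0, 2.0, 3.0]
	y = [2.0, 4.0, 6.0]
	print(euclideanDistance(x, y))	# about 3.742
	print(cosineSimilarity(x, y))	# 1.0, the vectors are parallel
	print(manhattanDistance(x, y))	# 6.0
	print(minkowskiDistance(x, y, 3))	# cube root of 36, about 3.302
	print(jaccardSimilarityX([1, 2, 3], [2, 3, 4]))	# 0.5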

def jaccardSimilarity(x, y, wx=1.0, wy=1.0):
	"""
	weighted jaccard similarity
	Parameters
		x : first vector
		y : second vector
		wx : weight for x
		wy : weight for y
	"""
	sx = set(x)
	sy = set(y)
	sxyInt = sx.intersection(sy)
	intCardinality = len(sxyInt)
	sxIntDiff = sx.difference(sxyInt)
	syIntDiff = sy.difference(sxyInt)
	unionCardinality = len(sx.union(sy))
	return intCardinality / float(intCardinality + wx * len(sxIntDiff) + wy * len(syIntDiff))

def levenshteinSimilarity(s1, s2):
	"""
	Levenshtein similarity for strings
	Parameters
		s1 : first string
		s2 : second string
	"""
	assert type(s1) == str and type(s2) == str, "Levenshtein similarity is for strings only"
	d = ld(s1, s2)
	l = max(len(s1), len(s2))
	d = 1.0 - min(d/l, 1.0)
	return d

def norm(values, po=2):
	"""
	normalizes a vector to unit p norm
	Parameters
		values : list of values
		po : power
	"""
	no = sum(list(map(lambda v: pow(v, po), values)))
	no = pow(no, 1.0/po)
	return list(map(lambda v: v/no, values))

def createOneHotVec(size, indx=-1):
	"""
	one hot vector, at a random position if indx is negative
	Parameters
		size : vector size
		indx : one hot position
	"""
	vec = [0] * size
	s = random.randint(0, size - 1) if indx < 0 else indx
	vec[s] = 1
	return vec

def createAllOneHotVec(size):
	"""
	create all one hot vectors
	Parameters
		size : vector size and no of vectors
	"""
	vecs = list()
	for i in range(size):
		vec = [0] * size
		vec[i] = 1
		vecs.append(vec)
	return vecs

def blockShuffle(data, blockSize):
	"""
	block shuffle
	Parameters
		data : list data
		blockSize : block size
	"""
	numBlock = int(len(data) / blockSize)
	remain = len(data) % blockSize
	numBlock += (1 if remain > 0 else 0)
	shuffled = list()
	for i in range(numBlock):
		b = random.randint(0, numBlock-1)
		beg = b * blockSize
		if (b < numBlock-1):
			end = beg + blockSize
			shuffled.extend(data[beg:end])
		else:
			shuffled.extend(data[beg:])
	return shuffled

def shuffle(data, numShuffle):
	"""
	shuffle data by random pairwise swapping
	Parameters
		data : list data
		numShuffle : no of pairwise swaps
	"""
	sz = len(data)
	if numShuffle is None:
		numShuffle = int(sz / 2)
	for i in range(numShuffle):
		fi = random.randint(0, sz - 1)
		se = random.randint(0, sz - 1)
		tmp = data[fi]
		data[fi] = data[se]
		data[se] = tmp

def randomWalk(size, start, lowStep, highStep):
	"""
	random walk generator
	Parameters
		size : no of steps
		start : initial position
		lowStep : step min
		highStep : step max
	"""
	cur = start
	for i in range(size):
		yield cur
		cur += randomFloat(lowStep, highStep)

def binaryEcodeCategorical(values, value):
	"""
	one hot binary encoding
	Parameters
		values : list of values
		value : value to be replaced with 1
	"""
	size = len(values)
	vec = [0] * size
	for i in range(size):
		if (values[i] == value):
			vec[i] = 1
	return vec

def createLabeledSeq(inputData, tw):
	"""
	creates feature, label pairs from sequence data, where tw features are followed by the output label
	Parameters
		inputData : list containing features and labels
		tw : no of features
	"""
	features = list()
	labels = list()
	l = len(inputData)
	for i in range(l - tw):
		trainSeq = inputData[i:i+tw]
		trainLabel = inputData[i+tw]
		features.append(trainSeq)
		labels.append(trainLabel)
	return (features, labels)

def createLabeledSeqFromFile(filePath, delim, index, tw):
	"""
	creates feature, label pairs from 1D sequence data in a file
	Parameters
		filePath : file path
		delim : delimiter
		index : column index
		tw : no of features
	"""
	seqData = getFileColumnAsFloat(filePath, delim, index)
	return createLabeledSeq(seqData, tw)

def fromMultDimSeqToTabular(data, inpSize, seqLen):
	"""
	input shape (nrow, inpSize * seqLen), output shape (nrow * seqLen, inpSize)
	Parameters
		data : 2D array
		inpSize : each input size in sequence
		seqLen : sequence length
	"""
	nrow = data.shape[0]
	assert data.shape[1] == inpSize * seqLen, "invalid input size or sequence length"
	return data.reshape(nrow * seqLen, inpSize)

def fromTabularToMultDimSeq(data, inpSize, seqLen):
	"""
	input shape (nrow * seqLen, inpSize), output shape (nrow, inpSize * seqLen)
	Parameters
		data : 2D array
		inpSize : each input size in sequence
		seqLen : sequence length
	"""
	nrow = int(data.shape[0] / seqLen)
	assert data.shape[1] == inpSize, "invalid input size"
	return data.reshape(nrow, seqLen * inpSize)

def difference(data, interval=1):
	"""
	takes difference in time series data
	Parameters
		data : list data
		interval : interval for difference
	"""
	diff = list()
	for i in range(interval, len(data)):
		value = data[i] - data[i - interval]
		diff.append(value)
	return diff
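
# Illustrative usage sketch (not part of the library API): a hypothetical batch of 2 rows,
# each a flattened sequence of 3 steps with 2 values per step, converted to tabular form and back.
def _exampleSeqReshape():
	data = np.arange(12).reshape(2, 6)	# (nrow, inpSize * seqLen) with inpSize=2, seqLen=3
	tab = fromMultDimSeqToTabular(data, 2, 3)	# shape (6, 2)
	seq = fromTabularToMultDimSeq(tab, 2, 3)	# back to shape (2, 6)
	print(tab.shape, seq.shape)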

def normalizeMatrix(data, norm, axis=1):
	"""
	normalizes each row or column of the matrix
	Parameters
		data : 2D data
		norm : normalization method
		axis : row or column
	"""
	normalized = preprocessing.normalize(data, norm=norm, axis=axis)
	return normalized

def standardizeMatrix(data, axis=0):
	"""
	standardizes each column of the matrix with mean and std deviation
	Parameters
		data : 2D data
		axis : row or column
	"""
	standardized = preprocessing.scale(data, axis=axis)
	return standardized

def asNumpyArray(data):
	"""
	converts to numpy array
	Parameters
		data : array
	"""
	return np.array(data)

def perfMetric(metric, yActual, yPred, clabels=None):
	"""
	predictive model accuracy metric
	Parameters
		metric : accuracy metric
		yActual : actual values array
		yPred : predicted values array
		clabels : class labels
	"""
	if metric == "rsquare":
		score = metrics.r2_score(yActual, yPred)
	elif metric == "mae":
		score = metrics.mean_absolute_error(yActual, yPred)
	elif metric == "mse":
		score = metrics.mean_squared_error(yActual, yPred)
	elif metric == "acc":
		yPred = np.rint(yPred)
		score = metrics.accuracy_score(yActual, yPred)
	elif metric == "mlAcc":
		yPred = np.argmax(yPred, axis=1)
		score = metrics.accuracy_score(yActual, yPred)
	elif metric == "prec":
		yPred = np.argmax(yPred, axis=1)
		score = metrics.precision_score(yActual, yPred)
	elif metric == "rec":
		yPred = np.argmax(yPred, axis=1)
		score = metrics.recall_score(yActual, yPred)
	elif metric == "fone":
		yPred = np.argmax(yPred, axis=1)
		score = metrics.f1_score(yActual, yPred)
	elif metric == "confm":
		yPred = np.argmax(yPred, axis=1)
		score = metrics.confusion_matrix(yActual, yPred)
	elif metric == "clarep":
		yPred = np.argmax(yPred, axis=1)
		score = metrics.classification_report(yActual, yPred)
	elif metric == "bce":
		if clabels is None:
			clabels = [0, 1]
		score = metrics.log_loss(yActual, yPred, labels=clabels)
	elif metric == "ce":
		assert clabels is not None, "labels must be provided"
		score = metrics.log_loss(yActual, yPred, labels=clabels)
	else:
		exitWithMsg("invalid prediction performance metric " + metric)
	return score

def scaleData(data, method):
	"""
	scales feature data column wise
	Parameters
		data : 2D array
		method : scaling method
	"""
	if method == "minmax":
		scaler = preprocessing.MinMaxScaler()
		data = scaler.fit_transform(data)
	elif method == "zscale":
		data = preprocessing.scale(data)
	else:
		raise ValueError("invalid scaling method")
	return data

def scaleDataWithParams(data, method, scParams):
	"""
	scales feature data column wise with supplied scaling parameters
	Parameters
		data : 2D array
		method : scaling method
		scParams : scaling parameters
	"""
	if method == "minmax":
		data = scaleMinMaxTabData(data, scParams)
	elif method == "zscale":
		raise ValueError("zscale scaling with parameters is not supported")
	else:
		raise ValueError("invalid scaling method")
	return data

def scaleMinMaxScaData(data, minMax):
	"""
	min max scales scalar data
	Parameters
		data : scalar data
		minMax : min, max and range
	"""
	sd = (data - minMax[0]) / minMax[2]
	return sd

def scaleMinMaxTabData(tdata, minMax):
	"""
	scales tabular feature data column wise using min max values for each field
	Parameters
		tdata : 2D array
		minMax : min, max and range for each column
	"""
	stdata = list()
	for r in tdata:
		srdata = list()
		for i, c in enumerate(r):
			sd = (c - minMax[i][0]) / minMax[i][2]
			srdata.append(sd)
		stdata.append(srdata)
	return stdata

def scaleMinMax(rdata, minMax):
	"""
	scales a data row column wise using min max values for each field
	Parameters
		rdata : data array
		minMax : min, max and range for each column
	"""
	srdata = list()
	for i in range(len(rdata)):
		d = rdata[i]
		sd = (d - minMax[i][0]) / minMax[i][2]
		srdata.append(sd)
	return srdata
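
# Illustrative usage sketch for scaleMinMax (not part of the library API); minMax holds
# (min, max, range) per column, as the functions above expect.
def _exampleScaleMinMax():
	minMax = [(0.0, 10.0, 10.0), (100.0, 200.0, 100.0)]
	print(scaleMinMax([5.0, 150.0], minMax))	# [0.5, 0.5]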

def harmonicNum(n):
	"""
	harmonic number
	Parameters
		n : number
	"""
	h = 0
	for i in range(1, n+1, 1):
		h += 1.0 / i
	return h

def digammaFun(n):
	"""
	digamma function
	Parameters
		n : number
	"""
	#Euler Mascheroni constant
	ec = 0.577216
	return harmonicNum(n - 1) - ec

def getDataPartitions(tdata, types, columns=None):
	"""
	partitions data with the given columns and random split points defined with predicates
	Parameters
		tdata : 2D array
		types : data types
		columns : column indexes
	"""
	(dtypes, cvalues) = extractTypesFromString(types)
	if columns is None:
		ncol = len(tdata[0])
		columns = list(range(ncol))
	ncol = len(columns)

	#partition predicates
	partitions = None
	for c in columns:
		dtype = dtypes[c]
		pred = list()
		if dtype == "int" or dtype == "float":
			(vmin, vmax) = getColMinMax(tdata, c)
			r = vmax - vmin
			rmin = vmin + .2 * r
			rmax = vmax - .2 * r
			sp = randomFloat(rmin, rmax)
			if dtype == "int":
				sp = int(sp)
			else:
				sp = "{:.3f}".format(sp)
				sp = float(sp)
			pred.append([c, "LT", sp])
			pred.append([c, "GE", sp])
		elif dtype == "cat":
			cv = cvalues[c]
			card = len(cv)
			if card < 3:
				num = 1
			else:
				num = randomInt(1, card - 1)
			sp = selectRandomSubListFromList(cv, num)
			sp = " ".join(sp)
			pred.append([c, "IN", sp])
			pred.append([c, "NOTIN", sp])

		if partitions is None:
			partitions = pred.copy()
		else:
			#extend each existing partition with both new predicates
			tparts = list()
			for p in partitions:
				l1 = p.copy()
				l1.extend(pred[0])
				l2 = p.copy()
				l2.extend(pred[1])
				tparts.append(l1)
				tparts.append(l2)
			partitions = tparts
	return partitions

def genAlmostUniformDistr(size, nswap=50):
	"""
	generates an almost uniform probability distribution
	Parameters
		size : distr size
		nswap : no of mass swaps
	"""
	un = 1.0 / size
	distr = [un] * size
	distr = mutDistr(distr, 0.1 * un, nswap)
	return distr

def mutDistr(distr, shift, nswap=50):
	"""
	mutates a probability distribution
	Parameters
		distr : distribution
		shift : amount of shift for swap
		nswap : no of mass swaps
	"""
	size = len(distr)
	for _ in range(nswap):
		fi = randomInt(0, size - 1)
		si = randomInt(0, size - 1)
		while fi == si:
			fi = randomInt(0, size - 1)
			si = randomInt(0, size - 1)
		shift = randomFloat(0, shift)
		t = distr[fi]
		distr[fi] -= shift
		if (distr[fi] < 0):
			distr[fi] = 0.0
			shift = t
		distr[si] += shift
	return distr

def generateBinDistribution(size, ntrue):
	"""
	generates binary array with some elements set to 1
	Parameters
		size : distr size
		ntrue : no of true values
	"""
	distr = [0] * size
	idxs = selectRandomSubListFromList(list(range(size)), ntrue)
	for i in idxs:
		distr[i] = 1
	return distr

def mutBinaryDistr(distr, nmut):
	"""
	mutates binary distribution by flipping some elements
	Parameters
		distr : distr
		nmut : no of mutations
	"""
	idxs = selectRandomSubListFromList(list(range(len(distr))), nmut)
	for i in idxs:
		distr[i] = distr[i] ^ 1
	return distr
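
# Illustrative usage sketch (not part of the library API): a random binary membership
# vector of size 8 with 3 ones, then 2 randomly chosen positions flipped.
def _exampleBinDistr():
	distr = generateBinDistribution(8, 3)
	mutated = mutBinaryDistr(distr.copy(), 2)
	print(distr, mutated)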

def fileSelFieldSubSeqModifierGen(filePath, column, offset, seqLen, modifier, precision, delim=","):
	"""
	file record generator that superimposes given data on the specified segment of a column
	Parameters
		filePath : file path
		column : column index
		offset : offset into column values
		seqLen : length of subsequence
		modifier : data to be superimposed, either a list or a sampler object
		precision : floating point precision
		delim : delimiter
	"""
	beg = offset
	end = beg + seqLen
	isList = type(modifier) == list
	i = 0
	for rec in fileRecGen(filePath, delim):
		if i >= beg and i < end:
			va = float(rec[column])
			if isList:
				va += modifier[i - beg]
			else:
				va += modifier.sample()
			rec[column] = formatFloat(precision, va)
		yield delim.join(rec)
		i += 1

class ShiftedDataGenerator:
	"""
	transforms data to create distribution shift
	"""
	def __init__(self, types, tdata, addFact, multFact):
		"""
		initializer
		Parameters
			types : data types
			tdata : 2D array
			addFact : factor for data shift
			multFact : factor for data scaling
		"""
		(self.dtypes, self.cvalues) = extractTypesFromString(types)
		self.limits = dict()
		for k, v in self.dtypes.items():
			if v == "int" or v == "float":
				(vmin, vmax) = getColMinMax(tdata, k)
				self.limits[k] = vmax - vmin
		self.addMin = -addFact / 2
		self.addMax = addFact / 2
		self.multMin = 1.0 - multFact / 2
		self.multMax = 1.0 + multFact / 2

	def transform(self, tdata):
		"""
		linearly transforms data to create distribution shift with random shift and scale
		Parameters
			tdata : 2D array
		"""
		transforms = dict()
		for k, v in self.dtypes.items():
			if v == "int" or v == "float":
				shift = randomFloat(self.addMin, self.addMax) * self.limits[k]
				scale = randomFloat(self.multMin, self.multMax)
				trns = (shift, scale)
				transforms[k] = trns
			elif v == "cat":
				transforms[k] = isEventSampled(50)

		ttdata = list()
		for rec in tdata:
			nrec = rec.copy()
			for c in range(len(rec)):
				if c in self.dtypes:
					dtype = self.dtypes[c]
					if dtype == "int" or dtype == "float":
						(shift, scale) = transforms[c]
						nval = shift + rec[c] * scale
						if dtype == "int":
							nrec[c] = int(nval)
						else:
							nrec[c] = nval
					elif dtype == "cat":
						cv = self.cvalues[c]
						if transforms[c]:
							nval = selectOtherRandomFromList(cv, rec[c])
							nrec[c] = nval
			ttdata.append(nrec)
		return ttdata

	def transformSpecified(self, tdata, sshift, scale):
		"""
		linearly transforms data to create distribution shift with specified shift and scale
		Parameters
			tdata : 2D array
			sshift : shift factor
			scale : scale factor
		"""
		transforms = dict()
		for k, v in self.dtypes.items():
			if v == "int" or v == "float":
				shift = sshift * self.limits[k]
				trns = (shift, scale)
				transforms[k] = trns
			elif v == "cat":
				transforms[k] = isEventSampled(50)

		ttdata = self.__scaleShift(tdata, transforms)
		return ttdata

	def __scaleShift(self, tdata, transforms):
		"""
		shifts and scales tabular data
		Parameters
			tdata : 2D array
			transforms : transforms to apply
		"""
		ttdata = list()
		for rec in tdata:
			nrec = rec.copy()
			for c in range(len(rec)):
				if c in self.dtypes:
					dtype = self.dtypes[c]
					if dtype == "int" or dtype == "float":
						(shift, scale) = transforms[c]
						nval = shift + rec[c] * scale
						if dtype == "int":
							nrec[c] = int(nval)
						else:
							nrec[c] = nval
					elif dtype == "cat":
						cv = self.cvalues[c]
						if transforms[c]:
							#nval = selectOtherRandomFromList(cv, rec[c])
							#nrec[c] = nval
							pass
			ttdata.append(nrec)
		return ttdata

class RollingStat(object):
	"""
	stats for a rolling window
	"""
	def __init__(self, wsize):
		"""
		initializer
		Parameters
			wsize : window size
		"""
		self.window = list()
		self.wsize = wsize
		self.mean = None
		self.sd = None

	def add(self, value):
		"""
		adds a value
		Parameters
			value : value to add
		"""
		self.window.append(value)
		if len(self.window) > self.wsize:
			self.window = self.window[1:]

	def getStat(self):
		"""
		gets rolling window mean and std deviation
		"""
		assertGreater(len(self.window), 0, "window is empty")
		if len(self.window) == 1:
			self.mean = self.window[0]
			self.sd = 0
		else:
			self.mean = statistics.mean(self.window)
			self.sd = statistics.stdev(self.window, xbar=self.mean)
		re = (self.mean, self.sd)
		return re

	def getSize(self):
		"""
		returns current number of values in the window
		"""
		return len(self.window)
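
# Illustrative usage sketch for RollingStat (not part of the library API) with a window of size 3;
# only the last 3 values contribute to the mean and std deviation.
def _exampleRollingStat():
	rstat = RollingStat(3)
	for v in [2.0, 4.0, 6.0, 8.0]:
		rstat.add(v)
	mean, sd = rstat.getStat()
	print(mean, sd, rstat.getSize())	# 6.0 2.0 3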