Spaces:
Runtime error
Runtime error
| #!/usr/local/bin/python3 | |
| # avenir-python: Machine Learning | |
| # Author: Pranab Ghosh | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); you | |
| # may not use this file except in compliance with the License. You may | |
| # obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |
| # implied. See the License for the specific language governing | |
| # permissions and limitations under the License. | |
| # Package imports | |
| import os | |
| import sys | |
| import matplotlib.pyplot as plt | |
| import matplotlib | |
| import random | |
| from random import randint | |
| from itertools import compress | |
| import numpy as np | |
| import torch | |
| from torch import nn | |
| from torch.nn import Linear | |
| from torch.autograd import Variable | |
| from torch.utils.data import DataLoader | |
| from torchvision import transforms | |
| from torch_geometric.nn import GCNConv | |
| from torch_geometric.nn import MessagePassing | |
| from torch_geometric.data import Data | |
| import sklearn as sk | |
| import jprops | |
| sys.path.append(os.path.abspath("../lib")) | |
| from util import * | |
| from mlutil import * | |
| from tnn import FeedForwardNetwork | |
| """ | |
| Graph convolution network | |
| """ | |
| class GraphConvoNetwork(nn.Module): | |
| def __init__(self, configFile): | |
| """ | |
| initilizer | |
| Parameters | |
| configFile : config file path | |
| """ | |
| defValues = dict() | |
| defValues["common.model.directory"] = ("model", None) | |
| defValues["common.model.file"] = (None, None) | |
| defValues["common.preprocessing"] = (None, None) | |
| defValues["common.scaling.method"] = ("zscale", None) | |
| defValues["common.scaling.minrows"] = (50, None) | |
| defValues["common.scaling.param.file"] = (None, None) | |
| defValues["common.verbose"] = (False, None) | |
| defValues["common.device"] = ("cpu", None) | |
| defValues["train.data.file"] = (None, "missing training data file") | |
| defValues["train.data.num.nodes.total"] = (None, None) | |
| defValues["train.data.num.nodes.training"] = (None, None) | |
| defValues["train.data.splits"] = ([.75,.15,.10], None) | |
| defValues["train.layer.data"] = (None, "missing layer data") | |
| defValues["train.input.size"] = (None, "missing output size") | |
| defValues["train.output.size"] = (None, "missing output size") | |
| defValues["train.loss.reduction"] = ("mean", None) | |
| defValues["train.num.iterations"] = (500, None) | |
| defValues["train.lossFn"] = ("mse", None) | |
| defValues["train.optimizer"] = ("sgd", None) | |
| defValues["train.opt.learning.rate"] = (.0001, None) | |
| defValues["train.opt.weight.decay"] = (0, None) | |
| defValues["train.opt.momentum"] = (0, None) | |
| defValues["train.opt.eps"] = (1e-08, None) | |
| defValues["train.opt.dampening"] = (0, None) | |
| defValues["train.opt.momentum.nesterov"] = (False, None) | |
| defValues["train.opt.betas"] = ([0.9, 0.999], None) | |
| defValues["train.opt.alpha"] = (0.99, None) | |
| defValues["train.save.model"] = (False, None) | |
| defValues["train.track.error"] = (False, None) | |
| defValues["train.epoch.intv"] = (5, None) | |
| defValues["train.print.weights"] = (False, None) | |
| defValues["valid.accuracy.metric"] = (None, None) | |
| defValues["predict.create.mask"] = (False, None) | |
| defValues["predict.use.saved.model"] = (True, None) | |
| self.config = Configuration(configFile, defValues) | |
| super(GraphConvoNetwork, self).__init__() | |
| def getConfig(self): | |
| """ | |
| return config | |
| """ | |
| return self.config | |
| def buildModel(self): | |
| """ | |
| Loads configuration and builds the various piecess necessary for the model | |
| """ | |
| torch.manual_seed(9999) | |
| self.verbose = self.config.getBooleanConfig("common.verbose")[0] | |
| numinp = self.config.getIntConfig("train.input.size")[0] | |
| self.outputSize = self.config.getIntConfig("train.output.size")[0] | |
| self.numIter = self.config.getIntConfig("train.num.iterations")[0] | |
| optimizer = self.config.getStringConfig("train.optimizer")[0] | |
| self.lossFnStr = self.config.getStringConfig("train.lossFn")[0] | |
| self.accMetric = self.config.getStringConfig("valid.accuracy.metric")[0] | |
| self.trackErr = self.config.getBooleanConfig("train.track.error")[0] | |
| self.restored = False | |
| self.clabels = list(range(self.outputSize)) if self.outputSize > 1 else None | |
| #build network | |
| layers = list() | |
| ninp = numinp | |
| trData = self.config.getStringConfig("train.layer.data")[0].split(",") | |
| for ld in trData: | |
| lde = ld.split(":") | |
| ne = len(lde) | |
| assert ne == 5 or ne == 6, "expecting 5 or 6 items for layer data" | |
| gconv = False | |
| if ne == 6: | |
| if lde[0] == "gconv": | |
| gconv == True | |
| lde = lde[1:] | |
| #num of units, activation, whether batch normalize, whether batch normalize after activation, dropout fraction | |
| nunit = int(lde[0]) | |
| actStr = lde[1] | |
| act = FeedForwardNetwork.createActivation(actStr) if actStr != "none" else None | |
| bnorm = lde[2] == "true" | |
| afterAct = lde[3] == "true" | |
| dpr = float(lde[4]) | |
| if gconv: | |
| layers.append(GCNConv(ninp, nunit)) | |
| else: | |
| layers.append(Linear(ninp, nunit)) | |
| if bnorm: | |
| #with batch norm | |
| if afterAct: | |
| safeAppend(layers, act) | |
| layers.append(torch.nn.BatchNorm1d(nunit)) | |
| else: | |
| layers.append(torch.nn.BatchNorm1d(nunit)) | |
| safeAppend(layers, act) | |
| else: | |
| #without batch norm | |
| safeAppend(layers, act) | |
| if dpr > 0: | |
| layers.append(torch.nn.Dropout(dpr)) | |
| ninp = nunit | |
| self.layers = torch.nn.ModuleList(layers) | |
| self.device = FeedForwardNetwork.getDevice(self) | |
| self.to(self.device) | |
| self.loadData() | |
| self.lossFn = FeedForwardNetwork.createLossFunction(self, self.lossFnStr) | |
| self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizer) | |
| self.trained = False | |
| def loadData(self): | |
| """ | |
| load node and edge data | |
| """ | |
| dataFilePath = self.config.getStringConfig("train.data.file")[0] | |
| numNodes = self.config.getIntConfig("train.data.num.nodes.total")[0] | |
| numLabeled = self.config.getIntConfig("train.data.num.nodes.training")[0] | |
| splits = self.config.getFloatListConfig("train.data.splits")[0] | |
| crPredMask = self.config.getBooleanConfig("predict.create.mask")[0] | |
| dx = list() | |
| dy = list() | |
| edges = list() | |
| mask = None | |
| for rec in fileRecGen(dataFilePath, ","): | |
| if len(rec) > 2: | |
| x = rec[1 :-1] | |
| x = toFloatList(x) | |
| y = int(rec[-1]) | |
| dx.append(x) | |
| dy.append(y) | |
| elif len(rec) == 2: | |
| e = toIntList(rec) | |
| edges.append(e) | |
| elif len(rec) == 1: | |
| items = rec[0].split() | |
| assertEqual(items[0], "mask", "invalid mask data") | |
| numNodes = int(items[1]) | |
| print(numNodes) | |
| mask = list() | |
| for r in range(2, len(items), 1): | |
| ri = items[r].split(":") | |
| #print(ri) | |
| ms = list(range(int(ri[0]), int(ri[1]), 1)) | |
| mask.extend(ms) | |
| #scale node features | |
| if (self.config.getStringConfig("common.preprocessing")[0] == "scale"): | |
| scalingMethod = self.config.getStringConfig("common.scaling.method")[0] | |
| dx = scaleData(dx, scalingMethod) | |
| dx = torch.tensor(dx, dtype=torch.float) | |
| dy = torch.tensor(dy, dtype=torch.long) | |
| edges = torch.tensor(edges, dtype=torch.long) | |
| edges = edges.t().contiguous() | |
| dx = dx.to(self.device) | |
| dy = dy.to(self.device) | |
| edges = edges.to(self.device) | |
| self.data = Data(x=dx, edge_index=edges, y=dy) | |
| #maks | |
| if mask is None: | |
| #trainiug data in the beginning | |
| trStart = 0 | |
| vaStart = int(splits[0] * numLabeled) | |
| teStart = vaStart + int(splits[1] * numLabeled) | |
| trMask = [False] * numNodes | |
| trMask[0:vaStart] = [True] * vaStart | |
| vaMask = [False] * numNodes | |
| vaMask[vaStart:teStart] = [True] * (teStart - vaStart) | |
| teMask = [False] * numNodes | |
| teMask[teStart:] = [True] * (numNodes - teStart) | |
| else: | |
| #training data anywhere | |
| if crPredMask: | |
| prMask = [True] * numNodes | |
| for i in mask: | |
| prMask[i] = False | |
| self.prMask = torch.tensor(prMask, dtype=torch.bool) | |
| nshuffle = int(len(mask) / 2) | |
| shuffle(mask, nshuffle) | |
| #print(mask) | |
| lmask = len(mask) | |
| trme = int(splits[0] * lmask) | |
| vame = int((splits[0] + splits[1]) * lmask) | |
| teme = lmask | |
| trMask = [False] * numNodes | |
| for i in mask[:trme]: | |
| trMask[i] = True | |
| vaMask = [False] * numNodes | |
| for i in mask[trme:vame]: | |
| vaMask[i] = True | |
| teMask = [False] * numNodes | |
| for i in mask[vame:]: | |
| teMask[i] = True | |
| #print(vaMask) | |
| trMask = torch.tensor(trMask, dtype=torch.bool) | |
| trMask = trMask.to(self.device) | |
| self.data.train_mask = trMask | |
| vaMask = torch.tensor(vaMask, dtype=torch.bool) | |
| vaMask = vaMask.to(self.device) | |
| self.data.val_mask = vaMask | |
| teMask = torch.tensor(teMask, dtype=torch.bool) | |
| teMask = teMask.to(self.device) | |
| self.data.test_mask = teMask | |
| def descData(self): | |
| """ | |
| describe data | |
| """ | |
| print(f'Number of nodes: {self.data.num_nodes}') | |
| print(f'Number of edges: {self.data.num_edges}') | |
| print(f'Number of node features: {self.data.num_node_features}') | |
| print(f'Number of training nodes: {self.data.train_mask.sum()}') | |
| print(f'Training node label rate: {int(self.data.train_mask.sum()) / data.num_nodes:.2f}') | |
| print(f'Number of validation nodes: {self.data.val_mask.sum()}') | |
| print(f'Number of test nodes: {self.data.test_mask.sum()}') | |
| print(f'Is undirected: {self.data.is_undirected()}') | |
| print("Data attributes") | |
| print(self.data.keys) | |
| print("Data types") | |
| print(type(self.data.x)) | |
| print(type(self.data.y)) | |
| print(type(self.data.edge_index)) | |
| print(type(self.data.train_mask)) | |
| print("Sample data") | |
| print("x", self.data.x[:4]) | |
| print("y", self.data.y[:4]) | |
| print("edge", self.data.edge_index[:4]) | |
| print("train mask", self.data.train_mask[:4]) | |
| print("test mask", self.data.test_mask[:4]) | |
| print("Any isolated node? " , self.data.has_isolated_nodes()) | |
| print("Any self loop? ", self.data.has_self_loops()) | |
| print("Is graph directed? ", self.data.is_directed()) | |
| def forward(self): | |
| """ | |
| forward prop | |
| """ | |
| x, edges = self.data.x, self.data.edge_index | |
| for l in self.layers: | |
| if isinstance(l, MessagePassing): | |
| x = l(x, edges) | |
| else: | |
| x = l(x) | |
| return x | |
| def trainModel(model): | |
| """ | |
| train with batch data | |
| Parameters | |
| model : torch model | |
| """ | |
| epochIntv = model.config.getIntConfig("train.epoch.intv")[0] | |
| model.train() | |
| if model.trackErr: | |
| trErr = list() | |
| vaErr = list() | |
| for epoch in range(model.numIter): | |
| out = model() | |
| loss = model.lossFn(out[model.data.train_mask], model.data.y[model.data.train_mask]) | |
| #error tracking at batch level | |
| if model.trackErr: | |
| trErr.append(loss.item()) | |
| vErr = GraphConvoNetwork.evaluateModel(model) | |
| vaErr.append(vErr) | |
| if model.verbose and epoch % epochIntv == 0: | |
| print("epoch {} loss {:.6f} val error {:.6f}".format(epoch, loss.item(), vErr)) | |
| model.optimizer.zero_grad() | |
| loss.backward() | |
| model.optimizer.step() | |
| #acc = GraphConvoNetwork.evaluateModel(model, True) | |
| #print(acc) | |
| modelSave = model.config.getBooleanConfig("train.model.save")[0] | |
| if modelSave: | |
| FeedForwardNetwork.saveCheckpt(model) | |
| if model.trackErr: | |
| FeedForwardNetwork.errorPlot(model, trErr, vaErr) | |
| model.trained = True | |
| def evaluateModel(model, verbose=False): | |
| """ | |
| evaluate model | |
| Parameters | |
| model : torch model | |
| verbose : if True additional output | |
| """ | |
| model.eval() | |
| with torch.no_grad(): | |
| out = model() | |
| if verbose: | |
| print(out) | |
| yPred = out[model.data.val_mask].data.cpu().numpy() | |
| yActual = model.data.y[model.data.val_mask].data.cpu().numpy() | |
| if verbose: | |
| for pa in zip(yPred, yActual): | |
| print(pa) | |
| #correct = yPred == yActual | |
| #score = int(correct.sum()) / int(model.data.val_mask.sum()) | |
| score = perfMetric(model.lossFnStr, yActual, yPred, model.clabels) | |
| model.train() | |
| return score | |
| def validateModel(model, retPred=False): | |
| """ | |
| model validation | |
| Parameters | |
| model : torch model | |
| retPred : if True return prediction | |
| """ | |
| model.eval() | |
| with torch.no_grad(): | |
| out = model() | |
| yPred = out.argmax(dim=1) | |
| yPred = yPred[model.data.test_mask].data.cpu().numpy() | |
| yActual = model.data.y[model.data.test_mask].data.cpu().numpy() | |
| #correct = yPred == yActual | |
| #score = int(correct.sum()) / int(model.data.val_mask.sum()) | |
| score = perfMetric(model.accMetric, yActual, yPred) | |
| print(formatFloat(3, score, "test #perf score")) | |
| return score | |
| def modelPrediction(model, inclData=True): | |
| """ | |
| make prediction | |
| Parameters | |
| model : torch model | |
| inclData : True to include input data | |
| """ | |
| cmask = model.config.getBooleanConfig("predict.create.mask")[0] | |
| if not cmask: | |
| print("create prediction mask property needs to be set to True") | |
| return None | |
| useSavedModel = model.config.getBooleanConfig("predict.use.saved.model")[0] | |
| if useSavedModel: | |
| FeedForwardNetwork.restoreCheckpt(model) | |
| else: | |
| if not model.trained: | |
| GraphConvoNetwork.trainModel(model) | |
| model.eval() | |
| with torch.no_grad(): | |
| out = model() | |
| yPred = out.argmax(dim=1) | |
| yPred = yPred[model.prMask].data.cpu().numpy() | |
| if inclData: | |
| dataFilePath = model.config.getStringConfig("train.data.file")[0] | |
| filt = lambda r : len(r) > 2 | |
| ndata = list(fileFiltRecGen(dataFilePath, filt)) | |
| prMask = model.prMask.data.cpu().numpy() | |
| assertEqual(len(ndata), prMask.shape[0], "data and mask lengths are not equal") | |
| precs = list(compress(ndata, prMask)) | |
| precs = list(map(lambda r : r[:-1], precs)) | |
| assertEqual(len(precs), yPred.shape[0], "data and mask lengths are not equal") | |
| res = zip(precs, yPred) | |
| else: | |
| res = yPred | |
| return res | |