"""
This file is intended to perform certain machine learning tasks based on numpy.
We are trying to keep it lean, that's why no sklearn is involved yet.
"""
from __future__ import division
from __future__ import print_function

import numpy as np


class ML:
    """Small helpers for selecting rows and columns out of record-style data."""

    @staticmethod
    def Filter(attr, value, data):
        """
        Select the records whose attribute matches a value.

        Every item of ``data`` is indexed as ``item[0][attr]``, i.e. the record
        is the first element of each item.

        @param attr     key looked up in each record
        @param value    value the attribute must be equal to
        @param data     iterable of items whose first element is the record
        @return         list of the matching records (``item[0]``)
        """
        #
        # @TODO: Make sure this approach works across all transport classes
        # We may have a potential issue of how the data is stored ... it may not scale
        #
        return [item[0] for item in data if item[0][attr] == value]

    @staticmethod
    def Extract(lattr, data):
        """
        Project every row onto the given attributes.

        @param lattr    list of keys (or indexes) to read from each row
        @param data     iterable of rows supporting ``row[key]``
        @return         list of lists, one per row, ordered like ``lattr``
        """
        return [[row[name] for name in lattr] for row in data]


class AnomalyDetection:
    """
    Implements a multivariate anomaly detection
    @TODO: determine epsilon computationally
    """

    def split(self, data, index=-1, threshold=0.7):
        """
        Split a sequence into a leading training part and a trailing test part.

        @param data         sliceable sequence
        @param index        not used by this implementation (kept for callers)
        @param threshold    fraction of ``data`` assigned to the training set
        @return             {"train": data[:end], "test": data[end:]} where
                            end = int(len(data) * threshold)
        """
        end = int(len(data) * threshold)
        return {"train": data[:end], "test": data[end:]}

    def learn(self, data, key, value, features, label):
        """
        Filter the data, fit the gaussian parameters on the training split and
        print the performance obtained on the test split.

        @param data         raw items, in the layout expected by ML.Filter
        @param key          field name by which the data will be filtered
        @param value        field value for the filter
        @param features     features to be used in the analysis
        @param label        used to assess performance; ``label['name']`` is
                            the field holding the label and ``label['1']`` the
                            values considered positive (see getLabel)
        @TODO: Map/Reduce does a good job at filtering
        """
        records = ML.Filter(key, value, data)
        raw_labels = ML.Extract([label['name']], records)
        observations = ML.Extract(features, records)
        labels = self.getLabel(raw_labels, label)

        xo = self.split(observations)
        yo = self.split(labels)

        p = self.gParameters(xo['train'])
        px = self.gPx(p['mean'], p['cov'], xo['test'])
        # print() with a single argument behaves the same on python 2 and 3
        print(self.gPerformance(px, yo['test']))

    def getLabel(self, yo, label_conf):
        """
        Turn extracted label rows into 0/1 flags.

        @param yo           list of rows (each an iterable of label values)
        @param label_conf   mapping whose "1" entry lists the positive values
        @return             list with 1 where the row shares at least one value
                            with ``label_conf["1"]`` and 0 otherwise
        """
        # Built once rather than once per row
        positives = set(label_conf["1"])
        return [int(len(set(item) & positives) > 0) for item in yo]

    def gPx(self, xu, sigma, data, EPSILON=0.05):
        """
        This function will compute the probability density function given a
        particular event/set of events, using the multivariate gaussian

            p(x) = exp(-0.5 * (x-u)' * inv(sigma) * (x-u))
                   / ((2*pi)**(n/2) * det(sigma)**0.5)

        @pre len(xu) == sigma.shape[0] == sigma.shape[1] == len(data[0])
        @param xu       mean vector
        @param sigma    covariance matrix (a scalar is accepted for n == 1)
        @param data     list of rows to evaluate
        @param EPSILON  density below which a row is flagged as an anomaly
        @return         list of [px, flag] with flag = int(px < EPSILON);
                        an empty list when ``data`` is empty
        @raise numpy.linalg.LinAlgError if sigma can not be inverted
        """
        if len(data) == 0:
            return []

        rows = np.asarray(data, dtype=float)
        mean = np.asarray(xu, dtype=float)
        # np.cov returns a 0-d array for a single feature; det/inv need 2-d
        cov = np.atleast_2d(np.asarray(sigma, dtype=float))
        n = rows.shape[1]

        # Normalisation constant: (2*pi)**(n/2), not 2*(pi**(n/2))
        norm = ((2 * np.pi) ** (n / 2)) * np.linalg.det(cov) ** 0.5
        # The inverse does not depend on the row, compute it once
        inverse = np.linalg.inv(cov)

        deltas = rows - mean
        # Row-wise quadratic form (x-u)' * inv(sigma) * (x-u)
        distances = np.sum(np.dot(deltas, inverse) * deltas, axis=1)
        densities = np.exp(-0.5 * distances) / norm

        return [[float(px), int(px < EPSILON)] for px in densities]

    def gPerformance(self, test, labels):
        """
        This function computes performance metrics i.e precision, recall and f-score
        for details visit https://en.wikipedia.org/wiki/Precision_and_recall

        A metric whose denominator is zero (no predicted positive, no actual
        positive, or precision + recall == 0) is reported as 0.0.

        @param test     list of [px, flag] as returned by gPx
        @param labels   expected 0/1 flags, at least as long as ``test``
        @return         {"precision": ..., "recall": ..., "fscore": ...}
        """
        tp = 0  # true positive
        fp = 0  # false positive
        fn = 0  # false negative
        for i, row in enumerate(test):
            predicted = row[1]
            matches = predicted == labels[i]
            if predicted == 1:
                if matches:
                    tp += 1
                else:
                    fp += 1
            elif predicted == 0 and not matches:
                fn += 1

        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        total = precision + recall
        fscore = (2 * precision * recall) / total if total else 0.0
        return {"precision": precision, "recall": recall, "fscore": fscore}

    def gParameters(self, train):
        """
        This function returns gaussian parameters i.e means and covariance
        The information will be used to compute probabilities

        @param train    non-empty list of rows (one value per feature)
        @return         {"cov": covariance of the standardized features,
                         "mean": per-feature mean of the raw values}
        """
        # One row per feature, one column per observation
        m = np.transpose(np.array(train))
        u = m.mean(axis=1)
        r = m.std(axis=1)
        #
        # -- Normalizing the matrix then we will compute covariance matrix
        #
        # NOTE(review): the covariance is computed on standardized features
        # while the returned mean is in raw units, and gPx subtracts that raw
        # mean from raw rows. The two only agree when the features already
        # have unit variance -- confirm this is intended. A feature with zero
        # variance also yields NaN here.
        m = (m - u[:, np.newaxis]) / r[:, np.newaxis]
        sigma = np.cov(m)
        return {"cov": sigma, "mean": u}