""" This file is intended to perfom certain machine learning tasks based on numpy We are trying to keep it lean that's why no sklearn involved yet @TODO: Create factory method for the learners implemented here Improve preconditions (size of the dataset, labels) """ from __future__ import division import numpy as np class ML: @staticmethod def Filter (attr,value,data) : # # @TODO: Make sure this approach works across all transport classes # We may have a potential issue of how the data is stored ... it may not scale # #return [item[0] for item in data if item and attr in item[0] and item[0][attr] == value] return [[item for item in row if item[attr] == value] for row in data] @staticmethod def Extract(lattr,data): if isinstance(lattr,basestring): lattr = [lattr] return [[row[id] for id in lattr] for row in data] """ Implements a multivariate anomaly detection @TODO: determine computationally determine epsilon """ class AnomalyDetection: def split(self,data,index=-1,threshold=0.8) : N = len(data) # if N < LIMIT: # return None end = int(N*threshold) train = data[:end] test = data[end:] return {"train":train,"test":test} """ @param key field name by which the data will be filtered @param value field value for the filter @param features features to be used in the analysis @param labels used to assess performance @TODO: Map/Reduce does a good job at filtering """ def learn(self,data,key,value,features,label): xo = ML.Filter(key,value,data) print key,value, len(xo) if not xo or len(xo) < 100: return None #if len(xo) < 100 : #return None # attr = conf['features'] # label= conf['label'] yo= ML.Extract([label['name']],xo) xo = ML.Extract(features,xo) yo = self.getLabel(yo,label) xo = self.split(xo) yo = self.split(yo) if xo['train'] : E = 0.01 fscore = 0 for i in range(0,10): Epsilon = E + (2*E*i) p = self.gParameters(xo['train']) if p is None : return None px = self.gPx(p['mean'],p['cov'],xo['test'],Epsilon) perf = self.gPerformance(px,yo['test']) if fscore == 0 : fscore = perf['fscore'] elif perf['fscore'] > fscore and perf['fscore'] > 0.5 : perf['epsilon'] = Epsilon return {"label":value,"parameters":p,"performance":perf} return None def getLabel(self,yo,label_conf): return [ int(len(set(item) & set(label_conf["1"]))>0) for item in yo ] """ This function will compute the probability density function given a particular event/set of events The return value is [px,yo] @pre xu.shape[0] == sigma[0] == sigma[1] """ def gPx(self,xu,sigma,data,EPSILON=0.01): n = len(data[0]) r = [] a = (2*(np.pi)**(n/2))*np.linalg.det(sigma)**0.5 # EPSILON = np.float64(EPSILON) test = np.array(data) for row in test: row = np.array(row) d = np.matrix(row - xu) d.shape = (n,1) b = np.exp((-0.5*np.transpose(d)) * (np.linalg.inv(sigma)*d)) px = float(b/a) r.append([px,int(px < EPSILON)]) return r """ This function uses stored learnt information to predict on raw data In this case it will determin if we have an anomaly or not @param xo raw observations (matrix) @param info stored information about this """ def predict(self,xo,info): xo = ML.Extract(info['features'],xo) if not xo : return None sigma = info['parameters']['cov'] xu = info['parameters']['mean'] epsilon = info['performance']['epsilon'] return self.gPx(xu,sigma,xo,epsilon) """ This function computes performance metrics i.e precision, recall and f-score for details visit https://en.wikipedia.org/wiki/Precision_and_recall """ def gPerformance(self,test,labels) : N = len(test) tp = 0 # true positive fp = 0 # false positive fn = 0 # false negative tn = 0 # true negative for i in range(0,N): tp += 1 if (test[i][1]==labels[i] and test[i][1] == 1) else 0 fp += 1 if (test[i][1] != labels[i] and test[i][1] == 1) else 0 fn += 1 if (test[i][1] != labels[i] and test[i][1] == 0) else 0 tn += 1 if (test[i][1] == labels[i] and test[i][1] == 0) else 0 precision = tp / (tp + fp) if tp + fp > 0 else 1 recall = tp / (tp + fn) if tp + fp > 0 else 1 fscore = (2 * precision * recall)/ (precision + recall) return {"precision":precision,"recall":recall,"fscore":fscore} """ This function returns gaussian parameters i.e means and covariance The information will be used to compute probabilities """ def gParameters(self,train) : n = len(train[0]) m = np.transpose(np.array(train)) u = np.array([ np.mean(m[i][:]) for i in range(0,n)]) if np.sum(u) == 0: return None r = np.array([ np.sqrt(np.var(m[i,:])) for i in range(0,n)]) # #-- Normalizing the matrix then we will compute covariance matrix # m = np.array([ (m[i,:] - u[i])/r[i] for i in range(0,n)]) sigma = np.cov(m) sigma = [ list(row) for row in sigma] return {"cov":sigma,"mean":list(u)} class Regression: parameters = {} @staticmethod def predict(xo): pass def __init__(self,config): pass