import numpy as np

# NOTE: the ML helper class (ML.Filter / ML.Extract) used by
# AnomalyDetection.learn is defined earlier in src/utils/ml.py.


class AnomalyDetection:
    """
    Implements a multivariate Gaussian anomaly detection.

    A Gaussian (mean + covariance) is fitted on a training partition and
    every test row whose density falls below EPSILON is flagged as an anomaly.

    @TODO: determine epsilon computationally
    """

    # Minimum number of rows split() accepts. 3 is the smallest size for which
    # the default 70/30 split leaves two training rows (needed for a sample
    # covariance) and one test row. Override on the class/instance to tune.
    LIMIT = 3

    def split(self, data, index=-1, threshold=0.7):
        """
        Split data into a leading training part and a trailing test part.

        @param data         sequence of rows (anything that supports len/slicing)
        @param index        unused; kept so existing callers keep working
        @param threshold    fraction of rows that goes to the training part
        @return             None when there are fewer than LIMIT rows, otherwise
                            {"train": ..., "test": ..., "labels": []}
        """
        N = len(data)
        if N < self.LIMIT:
            return None

        end = int(N * threshold)
        # Always return the partitions: the previous version only did so when
        # index > 0, so the default call returned None.
        return {"train": data[:end], "test": data[end:], "labels": []}

    def learn(self, data, conf):
        """
        Fit the model on the training part of data and score the test part.

        @param data     list of records (indexable by the names in conf)
        @param conf     dict with 'features' (list of attribute names),
                        'label' (attribute holding the 0/1 anomaly flag) and an
                        optional 'filter' {'key':..., 'value':...}
        @return         dict produced by gPerformance, or None when there are
                        too few rows to split
        """
        if 'filter' in conf:
            rule = conf['filter']
            data = ML.Filter(rule['key'], rule['value'], data)
        attr = conf['features']
        label = conf['label']
        # ML.Extract returns one list per row, so extracting a single column
        # yields [[v], ...]; unwrap it so labels compare equal to 0/1 flags.
        labels = [row[0] for row in ML.Extract([label], data)]
        data = ML.Extract(attr, data)

        r = self.split(data)
        labels = self.split(labels)
        if r is None or labels is None:
            return None

        p = self.gParameters(r['train'])
        test = self.gPx(p['mean'], p['cov'], r['test'])
        return self.gPerformance(test, labels['test'])

    def gPx(self, xu, sigma, data, EPSILON=0.05):
        """
        Compute the multivariate normal density of every row in data.

        @pre xu.shape[0] == sigma.shape[0] == sigma.shape[1]
        @param xu       mean vector (n values)
        @param sigma    n x n covariance matrix (must be invertible,
                        numpy raises LinAlgError otherwise)
        @param data     list of rows, each with n values
        @param EPSILON  density below which a row is flagged as an anomaly
        @return         list of [density, flag] with flag == 1 for anomalies
        """
        if len(data) == 0:
            return []

        xu = np.asarray(xu, dtype=float).ravel()
        sigma = np.atleast_2d(np.asarray(sigma, dtype=float))
        n = xu.shape[0]

        # Normalising constant is (2*pi)^(n/2) * |sigma|^(1/2); the previous
        # 2 * pi^(n/2) only matches it when n == 2.
        norm = ((2.0 * np.pi) ** (n / 2.0)) * np.sqrt(np.linalg.det(sigma))
        # The inverse does not depend on the row, so compute it once.
        inv = np.linalg.inv(sigma)

        r = []
        for row in np.asarray(data, dtype=float):
            d = row - xu
            px = float(np.exp(-0.5 * d.dot(inv).dot(d)) / norm)
            r.append([px, int(px < EPSILON)])
        return r

    def gPerformance(self, test, labels):
        """
        Compute performance metrics i.e precision, recall and f-score.
        For details visit https://en.wikipedia.org/wiki/Precision_and_recall

        @param test     output of gPx, i.e. list of [density, flag]
        @param labels   expected 0/1 flags, aligned with test
        @return         {"precision":..., "recall":..., "fscore":...}; a metric
                        whose denominator is zero is reported as 0.0
        @raise ValueError when test and labels differ in length
        """
        if len(test) != len(labels):
            raise ValueError("test and labels must have the same length")

        tp = 0  # true positive
        fp = 0  # false positive
        fn = 0  # false negative
        for outcome, label in zip(test, labels):
            flag = outcome[1]
            if flag == label:
                if flag == 1:
                    tp += 1
            elif flag == 1:
                fp += 1
            elif flag == 0:
                fn += 1

        # float() keeps true division even without `from __future__ import division`
        precision = tp / float(tp + fp) if (tp + fp) else 0.0
        recall = tp / float(tp + fn) if (tp + fn) else 0.0
        if precision + recall:
            fscore = (2 * precision * recall) / (precision + recall)
        else:
            fscore = 0.0
        return {"precision": precision, "recall": recall, "fscore": fscore}

    def gParameters(self, train):
        """
        Return the gaussian parameters i.e means and covariance of train.
        The information will be used to compute probabilities (see gPx).

        @param train    list of rows, one value per feature
        @return         {"cov": n x n covariance matrix, "mean": n means}
        @raise ValueError when train is not a non-empty table of rows
        """
        m = np.asarray(train, dtype=float)
        if m.ndim != 2 or m.shape[0] == 0:
            raise ValueError("train must be a non-empty list of rows")

        u = m.mean(axis=0)
        # Covariance is taken on the raw features: gPx measures raw deviations
        # from the raw mean, so a covariance of z-scored data would not match.
        sigma = np.atleast_2d(np.cov(m, rowvar=False))
        return {"cov": sigma, "mean": u}


def _demo():
    """Compare AnomalyDetection.gPx with scipy's reference density on sample data."""
    from scipy.stats import multivariate_normal

    mo = [[0.0, 4.5], [0.0, 4.5], [11.6, 4.4], [12.2, 4.3], [1.4, 3.9],
          [1.4, 3.9], [2.5, 3.8], [0.1, 3.8], [0.5, 5.1], [0.7, 5.2],
          [0.7, 5.1], [0.0, 4.6], [0.0, 4.6]]
    test = [2.4, 3.1]
    e = np.float64(0.05)

    ml = AnomalyDetection()
    # gParameters returns a dict; unpacking it as a tuple would yield its keys.
    p = ml.gParameters(mo)
    u, cx = p["mean"], p["cov"]
    print(u.shape)
    print(cx.shape)

    r = ml.gPx(u, cx, [test], 0.05)
    yo = r[0][0]
    xo = multivariate_normal.pdf(np.array(test), u, cx)
    print([yo, yo < e])
    print([xo, xo < e])

    end = int(len(mo) * .7)
    for i in range(0, len(r)):
        print("%s %s %s" % (' *** ', mo[(i + end)], r[i]))


if __name__ == "__main__":
    _demo()