finished up anomaly detection, with precision/recall @TODO: testing

master
Steve L. Nyemba 8 years ago
parent 44674bb83c
commit 886a9e1d76

src/utils/ml.py

@@ -2,6 +2,7 @@
 This file is intended to perform certain machine learning tasks based on numpy
 We are trying to keep it lean that's why no sklearn involved yet
 """
+from __future__ import division
 import numpy as np
 class ML:
@@ -15,18 +16,91 @@ class ML:
     @staticmethod
     def Extract(lattr,data):
         return [[row[id] for id in lattr] for row in data]
-    def init(self,lattr,data):
-        self.lattr = attr
-        self.data = data
-        self.X = []
-        self.Xmeans = []
-        for id in lattr:
-            xvalues = [item for item in self.data[id]]
-            self.Xmeans.append(np.mean(xvalues))
-            self.X.append(xvalues)
-        slef.Xcov = np.cov(self.X)
-    #
-    # Let's get the covariance matrix here ...
-    #
+"""
+    Implements a multivariate anomaly detection
+    @TODO: determine epsilon computationally
+"""
+class AnomalyDetection:
+    LIMIT = 10  # minimum number of records required before splitting (assumed default)
+    def split(self,data,index=-1,threshold=0.7):
+        N = len(data)
+        if N < self.LIMIT:
+            return None
+        end = int(N*threshold)
+        train = data[:end]
+        test = data[end:]
+        return {"train":train,"test":test,"labels":[]}
+    def learn(self,data,conf):
+        if 'filter' in conf:
+            filter = conf['filter']
+            data = ML.Filter(filter['key'],filter['value'],data)
+        attr = conf['features']
+        label = conf['label']
+        # Extract returns one list per record; flatten the labels to scalars
+        labels = [row[0] for row in ML.Extract([label],data)]
+        data = ML.Extract(attr,data)
+        r = self.split(data)
+        labels = self.split(labels)
+        p = self.gParameters(r['train'])
+        test = self.gPx(p['mean'],p['cov'],r['test'])
+        return self.gPerformance(test,labels['test'])
+    """
+        This function will compute the probability density function given a particular event/set of events
+        @pre xu.shape[0] == sigma[0] == sigma[1]
+    """
+    def gPx(self,xu,sigma,data,EPSILON=0.05):
+        n = len(data[0])
+        r = []
+        # normalization constant of the multivariate gaussian: (2*pi)^(n/2) * |sigma|^0.5
+        a = ((2*np.pi)**(n/2))*np.linalg.det(sigma)**0.5
+        # EPSILON = np.float64(EPSILON)
+        test = np.array(data)
+        for row in test:
+            row = np.array(row)
+            d = np.matrix(row - xu)
+            d.shape = (n,1)
+            b = np.exp((-0.5*np.transpose(d)) * (np.linalg.inv(sigma)*d))
+            px = float(b/a)
+            r.append([px,int(px < EPSILON)])
+        return r
+    """
+        This function computes performance metrics i.e precision, recall and f-score
+        for details visit https://en.wikipedia.org/wiki/Precision_and_recall
+    """
+    def gPerformance(self,test,labels):
+        N = len(test)
+        tp = 0 # true positive
+        fp = 0 # false positive
+        fn = 0 # false negative
+        for i in range(0,N):
+            tp += 1 if test[i][1] == labels[i] and test[i][1] == 1 else 0
+            fp += 1 if test[i][1] != labels[i] and test[i][1] == 1 else 0
+            fn += 1 if test[i][1] != labels[i] and test[i][1] == 0 else 0
+        precision = tp / (tp + fp)
+        recall = tp / (tp + fn)
+        fscore = (2 * precision * recall)/ (precision + recall)
+        return {"precision":precision,"recall":recall,"fscore":fscore}
+    """
+        This function returns gaussian parameters i.e means and covariance
+        The information will be used to compute probabilities
+    """
+    def gParameters(self,train):
+        n = len(train[0])
+        m = np.transpose(np.array(train))
+        u = np.array([ np.mean(m[i,:]) for i in range(0,n)])
+        r = np.array([ np.sqrt(np.var(m[i,:])) for i in range(0,n)])
+        #
+        #-- Normalizing the matrix then we will compute covariance matrix
+        #
+        m = np.array([ (m[i,:] - u[i])/r[i] for i in range(0,n)])
+        sigma = np.cov(m)
+        return {"cov":sigma,"mean":u}

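The heart of the class above is the multivariate Gaussian density that gParameters and gPx implement: p(x) = exp(-0.5*(x-u)'*inv(Sigma)*(x-u)) / ((2*pi)^(n/2) * |Sigma|^0.5), with a record flagged as an anomaly whenever p(x) < EPSILON. The stand-alone sketch below reproduces that check with plain numpy and cross-checks it against scipy.stats.multivariate_normal, the same way demo.py does; the helper names are illustrative and not part of this repository, and unlike gParameters the covariance here is taken over the raw features rather than the z-scored ones.

import numpy as np
from scipy.stats import multivariate_normal

# Illustrative helpers -- not part of the repository's API
def gaussian_params(rows):
    X = np.array(rows, dtype=float)          # one record per row
    return X.mean(axis=0), np.cov(X, rowvar=False)

def density(x, mu, sigma):
    n = len(mu)
    d = np.asarray(x, dtype=float) - mu
    norm = ((2 * np.pi) ** (n / 2.0)) * np.linalg.det(sigma) ** 0.5
    return float(np.exp(-0.5 * d.dot(np.linalg.inv(sigma)).dot(d)) / norm)

# same data points demo.py uses
rows = [[0.0, 4.5], [0.0, 4.5], [11.6, 4.4], [12.2, 4.3], [1.4, 3.9], [1.4, 3.9],
        [2.5, 3.8], [0.1, 3.8], [0.5, 5.1], [0.7, 5.2], [0.7, 5.1], [0.0, 4.6], [0.0, 4.6]]
mu, sigma = gaussian_params(rows)

EPSILON = 0.05                               # same threshold the demo passes to gPx
px = density([2.4, 3.1], mu, sigma)
print([px, int(px < EPSILON)])               # 1 when the point is flagged as an anomaly
print(multivariate_normal.pdf([2.4, 3.1], mu, sigma))   # should agree with px

The scale of the density depends on the units and spread of the features, so EPSILON has to be tuned per dataset; that is what the class docstring's @TODO about determining epsilon computationally refers to.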
demo.py

@@ -1,8 +1,10 @@
+from __future__ import division
 import numpy as np
-m = [[0.0, 4.5], [0.0, 4.5], [11.6, 4.4], [12.2, 4.3], [1.4, 3.9], [1.4, 3.9], [2.5, 3.8], [0.1, 3.8], [0.5, 5.1], [0.7, 5.2], [0.7, 5.1], [0.0, 4.6], [0.0, 4.6]]
-m = np.transpose(np.array(m))
-xu_ = np.mean(m[1,:])
-yu_ = np.mean(m[0,:])
+from utils.ml import AnomalyDetection
+mo = [[0.0, 4.5], [0.0, 4.5], [11.6, 4.4], [12.2, 4.3], [1.4, 3.9], [1.4, 3.9], [2.5, 3.8], [0.1, 3.8], [0.5, 5.1], [0.7, 5.2], [0.7, 5.1], [0.0, 4.6], [0.0, 4.6]]
+m = np.transpose(np.array(mo))
+xu_ = np.mean(m[0,:])
+yu_ = np.mean(m[1,:])
 xr_ = np.sqrt(np.var(m[0,:]))
 yr_ = np.sqrt(np.var(m[1,:]))
@@ -10,21 +12,34 @@ yr_ = np.sqrt(np.var(m[1,:]))
 # -- normalizing the matrix before computing covariance
 #
 mn = np.array([list( (m[0,:]-xu_)/xr_),list( (m[1,:]-yu_)/yr_)])
 cx = np.cov(mn)
 n = m.shape[0]
-x = np.array([2.4,3.1])
+test=[2.4,3.1]
+x = np.array(test)
 u = np.array([xu_,yu_])
 d = np.matrix(x - u)
 d.shape = (n,1)
 a = (2*(np.pi)**(n/2))*np.linalg.det(cx)**0.5
 b = np.exp((-0.5*np.transpose(d)) * (np.linalg.inv(cx)*d))
+print u.shape
+print cx.shape
 from scipy.stats import multivariate_normal
 xo= multivariate_normal.pdf(x,u,cx)
 yo= (b/a)[0,0]
-e= 0.001
+e= np.float64(0.05)
 print [yo,yo < e]
 print [xo,xo < e]
+ml = AnomalyDetection()
+end = int(len(mo)*.7)
+# gParameters returns a dict of parameters; score the held-out rows mo[end:] with them
+p = ml.gParameters(mo)
+r = ml.gPx(p['mean'],p['cov'],mo[end:],0.05)
+for i in range(0,len(r)):
+    print ' *** ', mo[(i+end)], r[i]
 #for row in np.transpose(m):
 #    print ",".join([str(value) for value in row])
 #-- We are ready to perform anomaly detection ...

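demo.py only scores points; the learn() path in utils/ml.py additionally splits off labels and reports precision, recall and F-score through gPerformance, where precision = tp/(tp+fp), recall = tp/(tp+fn) and fscore = 2*precision*recall/(precision+recall). A minimal sketch of that bookkeeping, using made-up prediction/label pairs rather than output from this code:

# predictions: 1 means flagged as an anomaly (density < EPSILON), as in gPx's output
# labels: hypothetical ground truth, not data from this repository
predictions = [1, 0, 1, 0, 0, 1]
labels      = [1, 0, 0, 0, 1, 1]

tp = sum(1 for p, y in zip(predictions, labels) if p == 1 and y == 1)   # 2
fp = sum(1 for p, y in zip(predictions, labels) if p == 1 and y == 0)   # 1
fn = sum(1 for p, y in zip(predictions, labels) if p == 0 and y == 1)   # 1

precision = tp / float(tp + fp)                                 # 2/3
recall    = tp / float(tp + fn)                                 # 2/3
fscore    = 2 * precision * recall / (precision + recall)       # 2/3
print({"precision": precision, "recall": recall, "fscore": fscore})

The second element of each [px, flag] pair returned by gPx plays the role of predictions here, and the dict printed at the end mirrors the one gPerformance returns.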