Finished up anomaly detection, with precision/recall. @TODO: testing (git stage ../src/utils/ml.py demo.py)

master
Steve L. Nyemba 8 years ago
parent 44674bb83c
commit 886a9e1d76

@ -2,6 +2,7 @@
This file is intended to perform certain machine learning tasks based on numpy.
We are trying to keep it lean, which is why sklearn is not involved yet.
"""
from __future__ import division
import numpy as np
class ML:
@ -15,18 +16,91 @@ class ML:
@staticmethod
def Extract(lattr,data):
    """
    Projects every row of data onto the attributes listed in lattr
    @param lattr    keys (or positions) to pull out of each row, in output order
    @param data     iterable of rows; each row must support row[key]
    @return         list with one list per row, holding the selected values
    """
    # 'key' rather than 'id' so the builtin id() is not shadowed
    return [[row[key] for key in lattr] for row in data]
"""
Implements a multivariate anomaly detection
@TODO: determine epsilon computationally
"""
class AnomalyDetection:
    """
    Multivariate gaussian anomaly detection built on numpy only.

    learn() drives the whole process: it extracts features and labels, splits
    the rows into a training and a testing segment, estimates the gaussian
    parameters on the training segment, scores the testing segment and
    reports precision, recall and f-score.
    """

    def init(self,lattr,data):
        """
        Stores per-attribute values, their means and their covariance on the instance
        @param lattr    attribute identifiers to load
        @param data     container where data[key] is the iterable of values of attribute key
        """
        self.lattr = lattr
        self.data = data
        self.X = [list(self.data[key]) for key in lattr]
        self.Xmeans = [np.mean(values) for values in self.X]
        self.Xcov = np.cov(self.X)

    def split(self,data,index=-1,threshold=0.7) :
        """
        Splits data into a leading training segment and a trailing testing segment
        @param data         sequence of rows (must support len() and slicing)
        @param index        kept for backward compatibility, it does not change the result
        @param threshold    fraction of the rows assigned to the training segment
        @return             {"train":...,"test":...,"labels":[]} or None if there are too few rows
        """
        #
        # LIMIT is not defined in the visible part of this module: it is honoured
        # when the module declares it, otherwise no minimum size is enforced
        #
        limit = globals().get('LIMIT',0)
        N = len(data)
        if N < limit:
            return None
        end = int(N*threshold)
        return {"train":data[:end],"test":data[end:],"labels":[]}

    def learn(self,data,conf):
        """
        Trains on the leading part of data and evaluates on the remainder
        @param data     iterable of rows understood by ML.Filter and ML.Extract
        @param conf     dictionary with 'features' (list of attributes), 'label' (attribute)
                        and optionally 'filter' ({'key':...,'value':...})
        @return         the dictionary returned by gPerformance, or None if there is not enough data
        """
        if 'filter' in conf:
            criteria = conf['filter']
            data = ML.Filter(criteria['key'],criteria['value'],data)
        features = ML.Extract(conf['features'],data)
        #
        # Extract returns one-element rows, they are unwrapped so the labels
        # can be compared against the 0/1 flags computed by gPx
        #
        labels = [row[0] for row in ML.Extract([conf['label']],data)]
        r = self.split(features)
        labels = self.split(labels)
        if r is None or labels is None or not r['train'] or not r['test']:
            return None
        p = self.gParameters(r['train'])
        test = self.gPx(p['mean'],p['cov'],r['test'])
        return self.gPerformance(test,labels['test'])

    def gPx(self,xu,sigma,data,EPSILON=0.05):
        """
        This function will compute the probability density function given a particular event/set of events
        @pre xu.shape[0] == sigma.shape[0] == sigma.shape[1]
        @param xu       mean vector (one entry per feature)
        @param sigma    covariance matrix
        @param data     rows to score, one value per feature
        @param EPSILON  density under which a row is flagged as an anomaly
        @return         list of [density, flag] with flag = 1 when density < EPSILON
        """
        if len(data) == 0:
            return []
        n = len(data[0])
        points = np.array(data,dtype=float).reshape(len(data),n)
        mean = np.asarray(xu,dtype=float).reshape(n)
        cov = np.asarray(sigma,dtype=float).reshape(n,n)
        #
        # The normalizing constant is (2*pi)^(n/2) * |sigma|^(1/2); the inverse
        # does not depend on the row so it is computed once
        #
        a = ((2*np.pi)**(n/2.0))*np.linalg.det(cov)**0.5
        inverse = np.linalg.inv(cov)
        d = points - mean
        # row-wise d' * inverse * d
        distance = np.sum(np.dot(d,inverse)*d,axis=1)
        density = np.exp(-0.5*distance)/a
        return [[float(px),int(px < EPSILON)] for px in density]

    def gPerformance(self,test,labels) :
        """
        This function computes performance metrics i.e precision, recall and f-score
        for details visit https://en.wikipedia.org/wiki/Precision_and_recall
        @param test     list of [density, flag] as returned by gPx
        @param labels   expected flags, either scalars or one-element rows
        @return         {"precision":...,"recall":...,"fscore":...}, a metric is 0.0 when it is undefined
        """
        tp = 0 # true positive
        fp = 0 # false positive
        fn = 0 # false negative
        for row,label in zip(test,labels):
            flag = row[1]
            if isinstance(label,(list,tuple,np.ndarray)):
                label = label[0]
            if flag == label:
                if flag == 1:
                    tp += 1
            elif flag == 1:
                fp += 1
            elif flag == 0:
                fn += 1
        # guard against empty denominators (no predicted and/or no actual positives)
        precision = float(tp)/(tp + fp) if (tp + fp) > 0 else 0.0
        recall = float(tp)/(tp + fn) if (tp + fn) > 0 else 0.0
        if precision + recall > 0:
            fscore = (2 * precision * recall)/(precision + recall)
        else:
            fscore = 0.0
        return {"precision":precision,"recall":recall,"fscore":fscore}

    def gParameters(self,train) :
        """
        This function returns gaussian parameters i.e means and covariance
        The information will be used to compute probabilities
        @param train    rows of the training set, one value per feature
        @return         {"cov":covariance of the standardized features,"mean":mean of the raw features}
        """
        m = np.transpose(np.array(train,dtype=float))
        u = m.mean(axis=1)
        r = m.std(axis=1)
        #
        # Let's get the covariance matrix here ...
        #-- Normalizing the matrix then we will compute covariance matrix
        # NOTE(review): the covariance is computed on standardized values while the
        # mean is the raw one, confirm this is what the callers of gPx expect
        #
        m = (m - u[:,None])/r[:,None]
        sigma = np.cov(m)
        return {"cov":sigma,"mean":u}

@ -1,8 +1,10 @@
from __future__ import division
import numpy as np
#
# Demo: the density of a single test point is computed by hand, then compared
# against scipy and against utils.ml.AnomalyDetection
#
from scipy.stats import multivariate_normal
from utils.ml import AnomalyDetection
mo = [[0.0, 4.5], [0.0, 4.5], [11.6, 4.4], [12.2, 4.3], [1.4, 3.9], [1.4, 3.9], [2.5, 3.8], [0.1, 3.8], [0.5, 5.1], [0.7, 5.2], [0.7, 5.1], [0.0, 4.6], [0.0, 4.6]]
m = np.transpose(np.array(mo))
xu_ = np.mean(m[0,:])
yu_ = np.mean(m[1,:])
xr_ = np.sqrt(np.var(m[0,:]))
yr_ = np.sqrt(np.var(m[1,:]))
#
# -- normalizing the matrix before computing covariance
#
mn = np.array([list( (m[0,:]-xu_)/xr_),list( (m[1,:]-yu_)/yr_)])
cx = np.cov(mn)
n = m.shape[0]
test=[2.4,3.1]
x = np.array(test)
u = np.array([xu_,yu_])
d = x - u
# normalizing constant of the multivariate gaussian: (2*pi)^(n/2) * |cx|^(1/2)
a = ((2*np.pi)**(n/2.0))*np.linalg.det(cx)**0.5
b = np.exp(-0.5*np.dot(d,np.dot(np.linalg.inv(cx),d)))
print(u.shape)
print(cx.shape)
xo= multivariate_normal.pdf(x,u,cx)
yo= b/a
e= np.float64(0.05)
print([yo,yo < e])
print([xo,xo < e])
ml = AnomalyDetection()
rows = [test]
# gParameters returns a dictionary, the entries are read by key
params = ml.gParameters(mo)
r = ml.gPx(params['mean'],params['cov'],rows,0.05)
for row,result in zip(rows,r) :
    # each result is printed next to the row that was actually scored
    print(' ***  %s %s' % (row,result))
#for row in np.transpose(m):
#	print ",".join([str(value) for value in row])
#-- We are ready to perform anomaly detection ...

Loading…
Cancel
Save