You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

120 lines
3.5 KiB
Python

"""
This file is intended to perform certain machine learning tasks based on numpy.
We are trying to keep it lean; that is why sklearn is not involved yet.
"""
from __future__ import division
import numpy as np
class ML:
    """
    Lean numpy-era machine-learning helpers (no sklearn dependency).
    """

    @staticmethod
    def Filter(attr, value, data):
        """
        Return the first element of every item whose first element maps
        attr -> value (rows are stored as [dict, ...] records).

        @param attr  field name to match
        @param value field value to match
        @param data  iterable of records; each record's first element is a dict
        @return list of the matching dicts

        @TODO: Make sure this approach works across all transport classes.
        We may have a potential issue of how the data is stored ... it may
        not scale.
        """
        return [item[0] for item in data if item[0][attr] == value]

    @staticmethod
    def Extract(lattr, data):
        """
        Project each row (a dict-like) onto the given list of attributes.

        @param lattr list of attribute names to extract, in order
        @param data  iterable of dict-like rows
        @return list of value lists, one per row
        """
        # renamed loop variable: the original used `id`, shadowing the builtin
        return [[row[key] for key in lattr] for row in data]
"""
Implements a multivariate anomaly detection
@TODO: determine computationally determine epsilon
"""
class AnomalyDetection:
def split(self,data,index=-1,threshold=0.7) :
N = len(data)
# if N < LIMIT:
# return None
end = int(N*threshold)
train = data[:end]
test = data[end:]
return {"train":train,"test":test}
"""
@param key field name by which the data will be filtered
@param value field value for the filter
@param features features to be used in the analysis
@param labels used to assess performance
@TODO: Map/Reduce does a good job at filtering
"""
def learn(self,data,key,value,features,label):
xo = ML.Filter(key,value,data)
# attr = conf['features']
# label= conf['label']
yo= ML.Extract([label['name']],xo)
xo = ML.Extract(features,xo)
yo = self.getLabel(yo,label)
xo = self.split(xo)
yo = self.split(yo)
p = self.gParameters(xo['train'])
px = self.gPx(p['mean'],p['cov'],xo['test'])
print self.gPerformance(px,yo['test'])
def getLabel(self,yo,label_conf):
return [ int(len(set(item) & set(label_conf["1"]))>0) for item in yo ]
"""
This function will compute the probability density function given a particular event/set of events
@pre xu.shape[0] == sigma[0] == sigma[1]
"""
def gPx(self,xu,sigma,data,EPSILON=0.05):
n = len(data[0])
r = []
a = (2*(np.pi)**(n/2))*np.linalg.det(sigma)**0.5
# EPSILON = np.float64(EPSILON)
test = np.array(data)
for row in test:
row = np.array(row)
d = np.matrix(row - xu)
d.shape = (n,1)
b = np.exp((-0.5*np.transpose(d)) * (np.linalg.inv(sigma)*d))
px = float(b/a)
r.append([px,int(px < EPSILON)])
return r
"""
This function computes performance metrics i.e precision, recall and f-score
for details visit https://en.wikipedia.org/wiki/Precision_and_recall
"""
def gPerformance(self,test,labels) :
N = len(test)
tp = 0 # true positive
fp = 0 # false positive
fn = 0 # false negative
tn = 0 # true negative
for i in range(0,N):
tp += 1 if (test[i][1]==labels[i] and test[i][1] == 1) else 0
fp += 1 if (test[i][1] != labels[i] and test[i][1] == 1) else 0
fn += 1 if (test[i][1] != labels[i] and test[i][1] == 0) else 0
tn += 1 if (test[i][1] == labels[i] and test[i][1] == 0) else 0
precision = tp / (tp + fp)
recall = tp / (tp + fn)
fscore = (2 * precision * recall)/ (precision + recall)
return {"precision":precision,"recall":recall,"fscore":fscore}
"""
This function returns gaussian parameters i.e means and covariance
The information will be used to compute probabilities
"""
def gParameters(self,train) :
n = len(train[0])
m = np.transpose(np.array(train))
u = np.array([ np.mean(m[i][:]) for i in range(0,n)])
r = np.array([ np.sqrt(np.var(m[i,:])) for i in range(0,n)])
#
#-- Normalizing the matrix then we will compute covariance matrix
#
m = np.array([ (m[i,:] - u[i])/r[i] for i in range(0,n)])
sigma = np.cov(m)
return {"cov":sigma,"mean":u}