You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

185 lines
5.0 KiB
Python

"""
This file is intended to perform certain machine learning tasks based on numpy.
We are trying to keep it lean, which is why sklearn is not involved yet.
@TODO:
Create factory method for the learners implemented here
Improve preconditions (size of the dataset, labels)
"""
from __future__ import division
import numpy as np
class ML:
    """
    Small helpers to slice raw observations before they are handed to a learner.
    """
    # `basestring` only exists on Python 2; fall back to `str` everywhere else
    # so that Extract keeps working regardless of the interpreter.
    try:
        _STRING_TYPES = basestring  # noqa: F821
    except NameError:
        _STRING_TYPES = str

    @staticmethod
    def Filter(attr, value, data):
        """
        Keep, within every row, the items whose `attr` entry equals `value`.
        @param attr     key looked up on every item (item[attr])
        @param value    value the entry must be equal to
        @param data     iterable of rows, each row being an iterable of items
        @return         one list per input row (possibly empty), in the same order

        @TODO: Make sure this approach works across all transport classes
        We may have a potential issue of how the data is stored ... it may not scale
        """
        return [[item for item in row if item[attr] == value] for row in data]

    @staticmethod
    def Extract(lattr, data):
        """
        Project every row onto a list of attributes.
        @param lattr    attribute name, or list of attribute names
        @param data     iterable of rows supporting row[name]
        @return         list of lists, values ordered like `lattr`
        """
        # A single name is accepted as a convenience and treated as a one-element list
        if isinstance(lattr, ML._STRING_TYPES):
            lattr = [lattr]
        return [[row[name] for name in lattr] for row in data]
"""
Implements a multivariate anomaly detection
@TODO: determine epsilon computationally
"""
class AnomalyDetection:
    """
    Multivariate gaussian anomaly detection: gaussian parameters are estimated
    on a training slice and an observation is flagged as an anomaly when its
    density falls below a threshold (epsilon).
    """

    def split(self, data, index=-1, threshold=0.8):
        """
        Split a sequence into a leading training part and a trailing test part.
        @param data         sequence supporting len() and slicing
        @param index        unused, kept so existing callers keep working
        @param threshold    fraction of the data assigned to the training part
        @return             {"train": ..., "test": ...}
        """
        end = int(len(data) * threshold)
        return {"train": data[:end], "test": data[end:]}

    def learn(self, data, key, value, features, label):
        """
        Learn gaussian parameters and an epsilon for the observations matching key == value.
        @param data     raw observations, filtered with ML.Filter
        @param key      field name by which the data will be filtered
        @param value    field value for the filter
        @param features features to be used in the analysis
        @param label    used to assess performance: label['name'] is the field holding
                        the label and label['1'] the values that count as positive
        @return         {"label","parameters","performance"} where performance carries
                        the retained "epsilon", or None if no model could be retained
        @TODO: Map/Reduce does a good job at filtering

        NOTE(review): ML.Filter returns one filtered list per row whereas ML.Extract
        indexes every row by attribute name; confirm the shape of `data` makes the
        two compatible.
        NOTE(review): the layout of the epsilon search was reconstructed from source
        without indentation; the search stops at the first epsilon that beats the
        baseline f-score and 0.5 -- confirm this is the intended behaviour.
        """
        xo = ML.Filter(key, value, data)
        # Same output as the former print statement, valid on Python 2 and 3
        print("%s %s %s" % (key, value, len(xo)))
        # Not enough observations to learn anything meaningful
        if not xo or len(xo) < 100:
            return None

        yo = ML.Extract([label['name']], xo)
        xo = ML.Extract(features, xo)
        yo = self.getLabel(yo, label)
        xo = self.split(xo)
        yo = self.split(yo)
        if not xo['train']:
            return None

        # The gaussian parameters do not depend on epsilon: compute them once
        p = self.gParameters(xo['train'])
        if p is None:
            return None

        E = 0.01
        fscore = 0
        for i in range(0, 10):
            Epsilon = E + (2 * E * i)
            px = self.gPx(p['mean'], p['cov'], xo['test'], Epsilon)
            perf = self.gPerformance(px, yo['test'])
            if fscore == 0:
                # The first non-zero f-score is the baseline to beat
                fscore = perf['fscore']
            elif perf['fscore'] > fscore and perf['fscore'] > 0.5:
                perf['epsilon'] = Epsilon
                return {"label": value, "parameters": p, "performance": perf}
        return None

    def getLabel(self, yo, label_conf):
        """
        Turn extracted label values into 0/1 flags.
        @param yo           list of rows of label values
        @param label_conf   configuration whose "1" entry lists the positive values
        @return             list with 1 where a row shares a value with label_conf["1"], 0 otherwise
        """
        positives = set(label_conf["1"])
        return [int(len(set(item) & positives) > 0) for item in yo]

    def gPx(self, xu, sigma, data, EPSILON=0.01):
        """
        This function will compute the probability density function given a particular event/set of events
        @param xu       mean vector
        @param sigma    covariance matrix
        @param data     observations, one row per event
        @param EPSILON  density threshold under which an event is flagged
        @return         list of [px, flag], flag being 1 when px < EPSILON
        @pre len(xu) == number of rows of sigma == number of columns of sigma
        """
        if len(data) == 0:
            return []
        observations = np.array(data, dtype=float)
        n = observations.shape[1]
        mean = np.asarray(xu, dtype=float)
        cov = np.asarray(sigma, dtype=float)
        # Both the inverse and the normalizing constant are the same for every row
        inverse = np.linalg.inv(cov)
        # (2*pi)^(n/2) * |sigma|^(1/2): the whole 2*pi term is raised to n/2
        a = ((2 * np.pi) ** (n / 2.0)) * np.sqrt(np.linalg.det(cov))
        r = []
        for row in observations:
            d = row - mean
            b = np.exp(-0.5 * d.dot(inverse).dot(d))
            px = float(b / a)
            r.append([px, int(px < EPSILON)])
        return r

    def predict(self, xo, info):
        """
        This function uses stored learnt information to predict on raw data
        In this case it will determine if we have an anomaly or not
        @param xo   raw observations (matrix)
        @param info stored information about the model: info['features'],
                    info['parameters'] ('mean','cov') and info['performance']['epsilon'];
                    'features' is not part of what learn() returns and must be
                    supplied by the caller
        @return     output of gPx, or None when there is nothing to predict on
        """
        xo = ML.Extract(info['features'], xo)
        if not xo:
            return None
        sigma = info['parameters']['cov']
        xu = info['parameters']['mean']
        epsilon = info['performance']['epsilon']
        return self.gPx(xu, sigma, xo, epsilon)

    def gPerformance(self, test, labels):
        """
        This function computes performance metrics i.e precision, recall and f-score
        for details visit https://en.wikipedia.org/wiki/Precision_and_recall
        @param test     output of gPx i.e list of [px, flag]
        @param labels   expected flags (0/1), aligned with test
        @return         {"precision","recall","fscore"}
        """
        tp = 0  # true positive
        fp = 0  # false positive
        fn = 0  # false negative
        # True negatives do not contribute to any of the returned metrics
        for i, row in enumerate(test):
            predicted = row[1]
            if predicted == 1:
                if predicted == labels[i]:
                    tp += 1
                else:
                    fp += 1
            elif predicted == 0 and predicted != labels[i]:
                fn += 1
        # A metric with an empty denominator defaults to 1; each metric is
        # guarded by its own denominator
        precision = tp / float(tp + fp) if tp + fp > 0 else 1
        recall = tp / float(tp + fn) if tp + fn > 0 else 1
        if precision + recall > 0:
            fscore = (2 * precision * recall) / (precision + recall)
        else:
            # No true positive at all: the harmonic mean is defined as 0
            fscore = 0.0
        return {"precision": precision, "recall": recall, "fscore": fscore}

    def gParameters(self, train):
        """
        This function returns gaussian parameters i.e means and covariance
        The information will be used to compute probabilities
        @param train    training observations, one row per event
        @return         {"cov","mean"} as plain lists, or None when the parameters
                        can not be computed (no data, means summing to zero or a
                        feature without any variance)

        NOTE(review): the covariance is computed on standardized features while
        gPx subtracts the raw mean without rescaling; confirm this is intended.
        """
        if len(train) == 0:
            return None
        m = np.array(train, dtype=float)
        u = m.mean(axis=0)
        if np.sum(u) == 0:
            return None
        r = m.std(axis=0)
        # A constant feature can not be standardized (division by zero)
        if np.any(r == 0):
            return None
        #
        #-- Normalizing the matrix then we will compute covariance matrix
        #
        m = (m - u) / r
        # np.cov collapses to a scalar for a single feature, keep it a matrix
        sigma = np.atleast_2d(np.cov(m, rowvar=False))
        return {"cov": sigma.tolist(), "mean": u.tolist()}
class Regression:
    """
    Placeholder for a regression learner; nothing is implemented yet.
    """
    # Class-level storage, shared by every instance
    parameters = {}

    def __init__(self, config):
        """
        @param config   learner configuration, currently ignored
        """
        return None

    @staticmethod
    def predict(xo):
        """
        @param xo   raw observations, currently ignored
        @return     always None until a model is implemented
        """
        return None