From 4184d0bdc7ea32933405edb59dbac67fd2aeea9b Mon Sep 17 00:00:00 2001 From: steve Date: Fri, 27 Jan 2017 14:36:56 -0600 Subject: [PATCH] bug fix around filter/extract @TODO: Work on overfitting --- src/utils/ml.py | 50 +++++++++++++++++++++++++++++++++++-------------- test/TestML.py | 17 ++++++++++++----- 2 files changed, 48 insertions(+), 19 deletions(-) diff --git a/src/utils/ml.py b/src/utils/ml.py index 580f198..9a76162 100644 --- a/src/utils/ml.py +++ b/src/utils/ml.py @@ -18,7 +18,7 @@ class ML: # #return [item[0] for item in data if item and attr in item[0] and item[0][attr] == value] - return [[item for item in row if item[attr] == value] for row in data] + return [[item for item in row if item[attr] == value][0] for row in data] @staticmethod def Extract(lattr,data): if isinstance(lattr,basestring): @@ -32,7 +32,7 @@ class ML: """ class AnomalyDetection: - def split(self,data,index=-1,threshold=0.8) : + def split(self,data,index=-1,threshold=0.9) : N = len(data) # if N < LIMIT: # return None @@ -53,7 +53,6 @@ class AnomalyDetection: """ def learn(self,data,key,value,features,label): xo = ML.Filter(key,value,data) - print key,value, len(xo) if not xo or len(xo) < 100: return None @@ -69,25 +68,47 @@ class AnomalyDetection: xo = self.split(xo) yo = self.split(yo) - - if xo['train'] : - E = 0.01 + p = self.gParameters(xo['train']) + has_cov = np.linalg.det(p['cov']) #-- making sure the matrix is invertible + if xo['train'] and has_cov : + E = 0.001 fscore = 0 + # + # We need to find an appropriate epsilon for the predictions + # The appropriate epsilon is one that yields an f-score [0.5,1[ + # + + __operf__ = None + perf = None for i in range(0,10): Epsilon = E + (2*E*i) - p = self.gParameters(xo['train']) + if p is None : return None + # + # At this point we've got enough data for the parameters + # We should try to fine tune epsilon for better results + # + px = self.gPx(p['mean'],p['cov'],xo['test'],Epsilon) - perf = self.gPerformance(px,yo['test']) - if fscore == 0 : - fscore = perf['fscore'] - elif perf['fscore'] > fscore and perf['fscore'] > 0.5 : - - perf['epsilon'] = Epsilon + + __operf__ = self.gPerformance(px,yo['test']) + print __operf__ + if __operf__['fscore'] == 1 : + break + if perf is None : + perf = __operf__['fscore'] + elif perf['fscore'] < __perf__['fscore'] and __operf__['fscore']> 0.5 : + perf = __operf__ + + perf['epsilon'] = Epsilon - return {"label":value,"parameters":p,"performance":perf} + + if perf and perf['fscore'] > 0.5 : + return {"label":value,"parameters":p,"performance":perf} + else: + return None return None def getLabel(self,yo,label_conf): return [ int(len(set(item) & set(label_conf["1"]))>0) for item in yo ] @@ -109,6 +130,7 @@ class AnomalyDetection: row = np.array(row) d = np.matrix(row - xu) d.shape = (n,1) + b = np.exp((-0.5*np.transpose(d)) * (np.linalg.inv(sigma)*d)) px = float(b/a) diff --git a/test/TestML.py b/test/TestML.py index e3ea79b..d4cc8cf 100644 --- a/test/TestML.py +++ b/test/TestML.py @@ -25,14 +25,20 @@ class TestML(unittest.TestCase): def test_Filter(self): r = self.greader.read() r = r['apps'] - x = ML.Filter('label','Google Chrome',r) + # + # To make this test case extensible we need to pull apps from the configuration + # + app = CONFIG['monitor']['processes']['config']['apps'][0] + x = ML.Filter('label',app,r) for row in x: - self.assertTrue(row['label'] == 'Google Chrome') + self.assertTrue(row['label'] == app) def test_Extract(self): r = self.greader.read() r = r['apps'] - x = ML.Filter('label','Google Chrome',r) + app = CONFIG['monitor']['processes']['config']['apps'][0] + x = ML.Filter('label',app,r) x_ = ML.Extract(['cpu_usage','memory_usage'], x) + self.assertTrue (len (x) == len(x_)) pass def test_Learn(self): @@ -43,11 +49,12 @@ class TestML(unittest.TestCase): data = greader.read() data = data['apps'] + app = CONFIG['monitor']['processes']['config']['apps'][1] lhandler = AnomalyDetection() features = CONFIG['learner']['anomalies']['features'] label = CONFIG['learner']['anomalies']['label'] - lhandler.learn(data,'label','Google Chrome',features,label) - + x = lhandler.learn(data,'label',app,features,label) + print x if __name__ == '__main__' :