Enabling preconditions to be effective (variance) @TODO: dimensionality reduction

master
Steve L. Nyemba 8 years ago
parent 4184d0bdc7
commit 0afbb1c072

@ -16,7 +16,7 @@ class ML:
# @TODO: Make sure this approach works across all transport classes # @TODO: Make sure this approach works across all transport classes
# We may have a potential issue of how the data is stored ... it may not scale # We may have a potential issue of how the data is stored ... it may not scale
# #
value = ML.CleanupName(value)
#return [item[0] for item in data if item and attr in item[0] and item[0][attr] == value] #return [item[0] for item in data if item and attr in item[0] and item[0][attr] == value]
return [[item for item in row if item[attr] == value][0] for row in data] return [[item for item in row if item[attr] == value][0] for row in data]
@staticmethod @staticmethod
@ -24,7 +24,9 @@ class ML:
if isinstance(lattr,basestring): if isinstance(lattr,basestring):
lattr = [lattr] lattr = [lattr]
return [[row[id] for id in lattr] for row in data] return [[row[id] for id in lattr] for row in data]
@staticmethod
def CleanupName(value) :
return value.replace('$','').replace('.+','')
""" """
Implements a multivariate anomaly detection Implements a multivariate anomaly detection
@ -32,7 +34,7 @@ class ML:
""" """
class AnomalyDetection: class AnomalyDetection:
def split(self,data,index=-1,threshold=0.9) : def split(self,data,index=-1,threshold=0.65) :
N = len(data) N = len(data)
# if N < LIMIT: # if N < LIMIT:
# return None # return None
@ -52,13 +54,13 @@ class AnomalyDetection:
@TODO: Map/Reduce does a good job at filtering @TODO: Map/Reduce does a good job at filtering
""" """
def learn(self,data,key,value,features,label): def learn(self,data,key,value,features,label):
xo = ML.Filter(key,value,data)
if not xo or len(xo) < 100:
return None
#if len(xo) < 100 : if len(data) < 10:
#return None return None
xo = ML.Filter(key,value,data)
if len(xo) < 10 :
return None
# attr = conf['features'] # attr = conf['features']
# label= conf['label'] # label= conf['label']
@ -69,9 +71,10 @@ class AnomalyDetection:
xo = self.split(xo) xo = self.split(xo)
yo = self.split(yo) yo = self.split(yo)
p = self.gParameters(xo['train']) p = self.gParameters(xo['train'])
has_cov = np.linalg.det(p['cov']) #-- making sure the matrix is invertible has_cov = np.linalg.det(p['cov']) if p else False #-- making sure the matrix is invertible
if xo['train'] and has_cov : if xo['train'] and has_cov :
E = 0.001 E = 0.001
ACCEPTABLE_FSCORE = 0.6
fscore = 0 fscore = 0
# #
# We need to find an appropriate epsilon for the predictions # We need to find an appropriate epsilon for the predictions
@ -94,22 +97,31 @@ class AnomalyDetection:
__operf__ = self.gPerformance(px,yo['test']) __operf__ = self.gPerformance(px,yo['test'])
print __operf__
if __operf__['fscore'] == 1 : if __operf__['fscore'] == 1 :
break continue
if perf is None : if perf is None :
perf = __operf__['fscore']
elif perf['fscore'] < __perf__['fscore'] and __operf__['fscore']> 0.5 :
perf = __operf__ perf = __operf__
elif perf['fscore'] < __operf__['fscore'] and __operf__['fscore'] > ACCEPTABLE_FSCORE :
perf = __operf__
perf['epsilon'] = Epsilon perf['epsilon'] = Epsilon
#
# At this point we are assuming we came out of the whole thing with an acceptable performance
# The understanding is that error drives performance thus we reject fscore==1
#
if perf and perf['fscore'] > ACCEPTABLE_FSCORE :
if perf and perf['fscore'] > 0.5 :
return {"label":value,"parameters":p,"performance":perf} return {"label":value,"parameters":p,"performance":perf}
else: else:
return None return None
return None return None
"""
This function determines if the preconditions for learning are met
For that parameters are passed to the function
p
"""
def canLearn(self,p) :
pass
def getLabel(self,yo,label_conf): def getLabel(self,yo,label_conf):
return [ int(len(set(item) & set(label_conf["1"]))>0) for item in yo ] return [ int(len(set(item) & set(label_conf["1"]))>0) for item in yo ]
@ -188,8 +200,15 @@ class AnomalyDetection:
return None return None
r = np.array([ np.sqrt(np.var(m[i,:])) for i in range(0,n)]) r = np.array([ np.sqrt(np.var(m[i,:])) for i in range(0,n)])
# #
# Before we normalize the data we must insure there's is some level of movement in this application
# A lack of movement suggests we may not bave enough information to do anything
#
if 0 in r :
return None
#
#-- Normalizing the matrix then we will compute covariance matrix #-- Normalizing the matrix then we will compute covariance matrix
# #
m = np.array([ (m[i,:] - u[i])/r[i] for i in range(0,n)]) m = np.array([ (m[i,:] - u[i])/r[i] for i in range(0,n)])
sigma = np.cov(m) sigma = np.cov(m)
sigma = [ list(row) for row in sigma] sigma = [ list(row) for row in sigma]

@ -521,7 +521,10 @@ class CouchdbWriter(Couchdb,Writer):
def __init__(self,**args): def __init__(self,**args):
uri = args['uri'] uri = args['uri']
self.uid = args['uid'] self.uid = args['uid']
self.filename = args['filename'] if 'filename' in args:
self.filename = args['filename']
else:
self.filename = None
dbname = args['dbname'] dbname = args['dbname']
self.server = Server(uri=uri) self.server = Server(uri=uri)
self.dbase = self.server.get_db(dbname) self.dbase = self.server.get_db(dbname)

@ -1,6 +1,7 @@
#import multiprocessing #import multiprocessing
from threading import Thread, RLock from threading import Thread, RLock
from utils import transport #from utils import transport
from utils.transport import *
from utils.ml import AnomalyDetection,ML from utils.ml import AnomalyDetection,ML
import time import time
import monitor import monitor
@ -17,7 +18,7 @@ class Top(Thread):
self.reader_class = _config['store']['class']['read'] self.reader_class = _config['store']['class']['read']
self.write_class = _config['store']['class']['write'] self.write_class = _config['store']['class']['write']
self.rw_args = _config['store']['args'] self.rw_args = _config['store']['args']
self.factory = transport.DataSourceFactory() self.factory = DataSourceFactory()
self.name = 'Zulu-Top' self.name = 'Zulu-Top'
self.quit = False self.quit = False
@ -36,6 +37,7 @@ class Top(Thread):
apps = self.config[label] apps = self.config[label]
self.handler.init(apps) self.handler.init(apps)
r = self.handler.composite() r = self.handler.composite()
gwriter.write(label=label,row=r) gwriter.write(label=label,row=r)
time.sleep(5) time.sleep(5)
self.lock.release() self.lock.release()
@ -44,7 +46,7 @@ class Top(Thread):
# This suggests we are in development mode # This suggests we are in development mode
# #
break break
ELLAPSED_TIME = 60*30 ELLAPSED_TIME = 60*20
time.sleep(ELLAPSED_TIME) time.sleep(ELLAPSED_TIME)
print "Exiting ",self.name print "Exiting ",self.name
@ -64,7 +66,7 @@ class Learner(Thread) :
self.features = config['learner']['anomalies']['features'] self.features = config['learner']['anomalies']['features']
self.yo = config['learner']['anomalies']['label'] self.yo = config['learner']['anomalies']['label']
self.apps = config['learner']['anomalies']['apps'] self.apps = config['learner']['anomalies']['apps']
self.factory = transport.DataSourceFactory() self.factory = DataSourceFactory()
self.quit = False self.quit = False
def stop(self): def stop(self):
@ -192,3 +194,13 @@ class Factory :
else: else:
return None return None
if __name__ =='__main__' :
import utils.params as SYS_ARGS
import json
PARAMS = SYS_ARGS.PARAMS
f = open(PARAMS['path'])
CONFIG = json.loads(f.read())
f.close()
ThreadManager.start(CONFIG)
Loading…
Cancel
Save