parent f12c1467a0
commit 8e7cad9a11
@@ -1,38 +0,0 @@
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText


class MailAgent:
    def __init__(self, conf):
        self.uid = conf['uid']
        try:
            self.handler = smtplib.SMTP_SSL(conf['host'], conf['port'])
            r = self.handler.login(self.uid, conf['password'])
            #
            # @TODO: Check the status of the authentication.
            # If not authenticated, the preconditions have failed.
            #
        except Exception as e:
            print(e)
            self.handler = None

    def send(self, **args):
        subject = args['subject']
        message = args['message']
        to = args['to']
        # Crude HTML detection: angle brackets in the body switch the payload to HTML
        if '<' in message and '>' in message:
            message = MIMEText(message, 'html')
        else:
            message = MIMEText(message, 'plain')
        message['From'] = self.uid
        message['To'] = to
        message['Subject'] = subject
        return self.handler.sendmail(self.uid, to, message.as_string())

    def close(self):
        self.handler.quit()
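
#
# Minimal usage sketch (assumed, not part of the original module): the config
# keys mirror what __init__ reads, but the host, port and addresses below are
# illustrative placeholders.
#
if __name__ == '__main__':
    conf = {"uid": "alerts@example.com", "password": "secret",
            "host": "smtp.example.com", "port": 465}
    agent = MailAgent(conf)
    if agent.handler is not None:
        # angle brackets in the body make send() use the 'html' subtype
        agent.send(to="ops@example.com", subject="test", message="<b>hello</b>")
        agent.close()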
@@ -1,312 +0,0 @@
"""
This file is intended to perform certain machine learning tasks based on numpy.
We are trying to keep it lean, which is why sklearn is not involved yet.

@TODO:
    Create a factory method for the learners implemented here
    Improve preconditions (size of the dataset, labels)
"""
import numpy as np


class ML:
    @staticmethod
    def Filter(attr, value, data):
        #
        # @TODO: Make sure this approach works across all transport classes.
        # We may have a potential issue with how the data is stored ... it may not scale.
        #
        value = ML.CleanupName(value)
        #
        # The loops are written out in full to make the filtering resilient:
        # if an item doesn't have the attribute we skip it rather than raise.
        #
        r = []
        for row in data:
            if isinstance(row, list):
                for item in row:
                    if attr in item and item[attr] == value:
                        r.append(item)
            else:
                #
                # We are dealing with a vector of objects
                #
                if attr in row and row[attr] == value:
                    r.append(row)
        return r

    @staticmethod
    def Extract(lattr, data):
        if isinstance(lattr, str):
            lattr = [lattr]
        r = [[row[id] for id in lattr] for row in data]
        if len(lattr) == 1:
            # A single attribute yields a flat list rather than rows of one
            return [x[0] for x in r]
        else:
            return r

    @staticmethod
    def CleanupName(value):
        # Strip characters that would interfere with lookups / regular expressions
        return value.replace('$', '').replace('.+', '')

    @staticmethod
    def distribution(xo, lock, scale=False):
        d = []  # histogram as [value, count] pairs
        m = {}  # maps a value to its index in d
        if scale:
            xu = np.mean(xo)
            sd = np.sqrt(np.var(xo))
        for xi in xo:
            value = round(xi, 2)
            if scale:
                value = round((value - xu) / sd, 2)
            id = str(value)
            lock.acquire()
            if id in m:
                index = m[id]
                d[index][1] += 1
            else:
                m[id] = len(d)
                d.append([value, 1])
            lock.release()
        del m
        return d
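
    # Usage sketch (assumed threading context; the lock only guards the bins):
    #   from threading import RLock
    #   ML.distribution([1.01, 1.013, 2.5], RLock())  ->  [[1.01, 2], [2.5, 1]]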


"""
Implements a multivariate anomaly detection
@TODO: computationally determine epsilon
"""
class AnomalyDetection:
    def __init__(self):
        pass

    def split(self, data, index=-1, threshold=0.65):
        # Split the dataset into train/test partitions at the threshold mark
        N = len(data)
        end = int(N * threshold)
        train = data[:end]
        test = data[end:]
        return {"train": train, "test": test}

    """
    @param key      field name by which the data will be filtered
    @param value    field value for the filter
    @param features features to be used in the analysis
    @param label    label used to assess performance
    @TODO: Map/Reduce does a good job at filtering
    """
    def learn(self, data, key, value, features, label):
        if len(data) < 10:
            return None
        xo = ML.Filter(key, value, data)
        if len(xo) < 10:
            return None

        yo = ML.Extract([label['name']], xo)
        xo = ML.Extract(features, xo)
        yo = self.getLabel(yo, label)
        #
        # @TODO: Ensure this can be fine-tuned; training size matters for
        # learning and is not obvious to define upfront.
        #
        xo = self.split(xo)
        yo = self.split(yo)
        p = self.gParameters(xo['train'])
        if p is None:
            return None
        has_cov = np.linalg.det(p['cov'])  # -- making sure the matrix is invertible
        if xo['train'] and has_cov:
            E = 0.001
            ACCEPTABLE_FSCORE = 0.6
            #
            # We need to find an appropriate epsilon for the predictions.
            # An appropriate epsilon is one that yields an f-score in [0.5,1[.
            #
            perf = None
            for i in range(0, 10):
                Epsilon = E + (2 * E * i)
                #
                # At this point we've got enough data for the parameters;
                # we try successive epsilons for better results.
                #
                px = self.gPx(p['mean'], p['cov'], xo['test'], Epsilon)
                __operf__ = self.gPerformance(px, yo['test'])
                print(value, __operf__)
                if __operf__['fscore'] == 1:
                    continue
                if perf is None or (perf['fscore'] < __operf__['fscore'] and __operf__['fscore'] > ACCEPTABLE_FSCORE):
                    perf = __operf__
                    perf['epsilon'] = Epsilon  # record the epsilon that produced this score
            #
            # At this point we assume we came out of the loop with an acceptable
            # performance. The understanding is that error drives performance,
            # hence we reject fscore == 1 as too good to be true.
            #
            if perf and perf['fscore'] > ACCEPTABLE_FSCORE:
                return {"label": value, "parameters": p, "performance": perf}
            else:
                return None
        return None

    """
    This function determines whether the preconditions for learning are met;
    the parameters p are passed in for that purpose.
    """
    def canLearn(self, p):
        pass

    def getLabel(self, yo, label_conf):
        # A row maps to 1 if it shares at least one value with label_conf["1"]
        return [int(len(set(item) & set(label_conf["1"])) > 0) for item in yo]
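
    # Worked example (hypothetical label configuration):
    #   getLabel([["crash"], ["ok"]], {"1": ["crash"]})  ->  [1, 0]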

    """
    This function computes the probability density function for a particular event/set of events.
    The return value is a list of [px, is_anomaly] pairs.
    @pre xu.shape[0] == sigma.shape[0] == sigma.shape[1]
    """
    def gPx(self, xu, sigma, data, EPSILON=0.01):
        n = len(data[0])
        r = []
        # Normalizer of the multivariate Gaussian: (2*pi)^(n/2) * |sigma|^(1/2)
        a = ((2 * np.pi) ** (n / 2)) * np.linalg.det(sigma) ** 0.5
        inv_sigma = np.linalg.inv(sigma)
        for row in np.array(data):
            d = (row - np.array(xu)).reshape(n, 1)
            quad = (np.transpose(d) @ inv_sigma @ d).item()  # Mahalanobis quadratic form
            px = float(np.exp(-0.5 * quad) / a)
            r.append([px, int(px < EPSILON)])
        return r
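
    # Sketch (2-d, identity covariance): gPx([0, 0], [[1, 0], [0, 1]], [[0, 0]], 0.01)
    # returns roughly [[0.159, 0]] -- the density 1/(2*pi) sits above epsilon,
    # so the point is not flagged as an anomaly.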

    """
    This function uses stored learnt information to predict on raw data;
    in this case it will determine whether we have an anomaly or not.
    @param xo   raw observations (matrix)
    @param info stored information about what was learnt
    """
    def predict(self, xo, info):
        xo = ML.Extract(info['features'], xo)
        if not xo:
            return None
        sigma = info['parameters']['cov']
        xu = info['parameters']['mean']
        epsilon = info['performance']['epsilon']
        return self.gPx(xu, sigma, xo, epsilon)

    """
    This function computes performance metrics, i.e. precision, recall and f-score.
    For details visit https://en.wikipedia.org/wiki/Precision_and_recall
    """
    def gPerformance(self, test, labels):
        N = len(test)
        tp = 0  # true positives
        fp = 0  # false positives
        fn = 0  # false negatives
        tn = 0  # true negatives
        for i in range(0, N):
            tp += 1 if (test[i][1] == labels[i] and test[i][1] == 1) else 0
            fp += 1 if (test[i][1] != labels[i] and test[i][1] == 1) else 0
            fn += 1 if (test[i][1] != labels[i] and test[i][1] == 0) else 0
            tn += 1 if (test[i][1] == labels[i] and test[i][1] == 0) else 0
        precision = tp / ((tp + fp) if tp + fp > 0 else 1)
        recall = tp / ((tp + fn) if tp + fn > 0 else 1)
        fscore = (2 * precision * recall) / ((precision + recall) if (precision + recall) > 0 else 1)
        return {"precision": precision, "recall": recall, "fscore": fscore}

    """
    This function returns the Gaussian parameters, i.e. means and covariance.
    The information will be used to compute probabilities.
    """
    def gParameters(self, train):
        n = len(train[0])
        m = np.transpose(np.array(train))
        u = np.array([np.mean(m[i, :]) for i in range(0, n)])
        if np.sum(u) == 0:
            return None
        r = np.array([np.sqrt(np.var(m[i, :])) for i in range(0, n)])
        #
        # Before we normalize the data we must ensure there is some level of
        # movement in this application. A lack of movement suggests we may not
        # have enough information to do anything.
        #
        if 0 in r:
            return None
        #
        # -- Normalize the matrix, then compute the covariance matrix
        #
        m = np.array([(m[i, :] - u[i]) / r[i] for i in range(0, n)])
        sigma = np.cov(m)
        sigma = [list(row) for row in sigma]
        return {"cov": sigma, "mean": list(u)}


class AnalyzeAnomaly(AnomalyDetection):
    def __init__(self):
        AnomalyDetection.__init__(self)

    """
    This analysis function will include a predicted status, because an anomaly can either be:
        - a downtime, i.e. end of day
        - a spike, and thus a potential imminent crash
    @param xo   matrix of variables
    @param info information about what was learnt
    """
    def predict(self, xo, info):
        x = xo[len(xo) - 1]
        r = AnomalyDetection.predict(self, [x], info)
        #
        # In order to determine what the anomaly is, we compute the slope (idle or crash).
        # The slope is computed using the covariance / variance of the features.
        #
        if r is not None:
            N = len(info['features'])
            xy = ML.Extract(info['features'], xo)
            xy = np.array(xy)
            vxy = np.array([np.var(xy[:, i]) for i in range(0, N)])
            cxy = np.array(info['parameters']['cov'])
            if np.sum(vxy) == 0:
                vxy = cxy
            alpha = cxy / vxy
            r = {"anomaly": r[0][1], "slope": list(alpha[:, 0])}
        return r


class Regression:
    parameters = {}

    @staticmethod
    def predict(xo):
        pass

    def __init__(self, config):
        pass
@@ -1,266 +0,0 @@
from threading import Thread, RLock
from utils.transport import *
from utils.ml import AnomalyDetection, ML
import time
import monitor
import sys
import os
import datetime


class BasicWorker(Thread):
    def __init__(self, config, lock):
        Thread.__init__(self)
        self.reader_class = config['store']['class']['read']
        self.write_class = config['store']['class']['write']
        self.rw_args = config['store']['args']
        self.factory = DataSourceFactory()
        self.lock = lock


"""
This class is intended to collect data given a configuration
"""
class Top(Thread):
    def __init__(self, _config, lock):
        Thread.__init__(self)
        self.lock = lock
        self.reader_class = _config['store']['class']['read']
        self.write_class = _config['store']['class']['write']
        self.rw_args = _config['store']['args']
        self.factory = DataSourceFactory()

        self.name = 'Zulu-Top'
        self.quit = False

        # Instantiate the configured process-monitor class from the monitor module
        className = ''.join(['monitor.', _config['monitor']['processes']['class'], '()'])
        self.handler = eval(className)
        self.config = _config['monitor']['processes']['config']

    def stop(self):
        self.quit = True

    def run(self):
        while self.quit == False:
            print(' ** ', self.name, datetime.datetime.today())
            for label in self.config:
                self.lock.acquire()
                gwriter = self.factory.instance(type=self.write_class, args=self.rw_args)
                apps = self.config[label]
                self.handler.init(apps)
                r = self.handler.composite()
                gwriter.write(label=label, row=r)
                time.sleep(5)
                self.lock.release()
            if 'MONITOR_CONFIG_PATH' in os.environ:
                #
                # This suggests we are in development mode
                #
                break
            ELAPSED_TIME = 60 * 20
            time.sleep(ELAPSED_TIME)
        print("Exiting ", self.name)


class Learner(Thread):
    """
    This class expects the platform config (store, learner);
    it will leverage the store and the learner in order to operate.
    """
    def __init__(self, config, lock):
        Thread.__init__(self)
        self.name = 'Zulu-Learner'
        self.lock = lock
        self.reader_class = config['store']['class']['read']
        self.write_class = config['store']['class']['write']
        self.rw_args = config['store']['args']
        self.features = config['learner']['anomalies']['features']
        self.yo = config['learner']['anomalies']['label']
        self.apps = config['learner']['anomalies']['apps']
        self.factory = DataSourceFactory()
        self.quit = False

    def stop(self):
        self.quit = True

    """
    This function will initiate learning every x hours.
    If there is nothing to learn the app will simply go to sleep.
    """
    def run(self):
        reader = self.factory.instance(type=self.reader_class, args=self.rw_args)
        data = reader.read()
        #
        # Let's make sure we extract that which has already been learnt
        #
        if 'learn' in data:
            r = data['learn']
            del data['learn']
            r = ML.Extract('label', r)
            logs = [row[0] for row in r]
            logs = list(set(logs))
        else:
            logs = []
        #
        # In order to address the inefficiencies below, we adopt the following
        # policy: we don't learn that which is already learnt. This measure
        # consists in filtering out the apps that already have learning data.
        #
        self.apps = list(set(self.apps) - set(logs))
        while self.quit == False:
            r = {}
            lapps = list(self.apps)
            print(' ** ', self.name, datetime.datetime.today())
            for key in data:
                logs = data[key]
                #
                # There is poor design at this point: things already tested
                # should not get tested again, as this creates inefficiencies
                # (a cartesian product).
                #
                for app in list(lapps):  # iterate over a copy since lapps is mutated below
                    handler = AnomalyDetection()
                    value = handler.learn(logs, 'label', app, self.features, self.yo)
                    if value is not None:
                        if key not in r:
                            r[key] = {}
                        r[key][app] = value
                        i = lapps.index(app)
                        del lapps[i]
                        #
                        # This offers a clean write to the data store as soon as a
                        # value is retrieved; removing the application improves
                        # efficiency (among other things).
                        #
                        value = dict(value, **{"features": self.features})
                        self.lock.acquire()
                        writer = self.factory.instance(type=self.write_class, args=self.rw_args)
                        writer.write(label='learn', row=value)
                        self.lock.release()
            #
            # Usually this is used for development.
            # @TODO: Remove this and find a healthy way to stop the server
            #
            if 'MONITOR_CONFIG_PATH' in os.environ:
                #
                # This suggests we are in development mode
                #
                break
            TIME_ELAPSED = 60 * 120  # -- every 2 hours
            time.sleep(TIME_ELAPSED)
        print("Exiting ", self.name)


class FileWatchWorker(BasicWorker):
    def __init__(self, config, lock):
        BasicWorker.__init__(self, config, lock)
        self.name = "Zulu-FileWatch"
        self.config = config
        self.folder_config = config['monitor']['folders']['config']
        self.quit = False

    def stop(self):
        self.quit = True

    def run(self):
        TIME_ELAPSED = 60 * 10
        handler = monitor.FileWatch()
        ml_handler = ML()
        while self.quit == False:
            r = []
            print(' ** ', self.name, datetime.datetime.today())
            for id in self.folder_config:
                folders = self.folder_config[id]
                handler.init(folders)
                xo = handler.composite()
                #
                # We perform a distribution analysis of the details in order
                # to have usable data
                #
                xrow = {}
                xrow[id] = []
                for xo_row in xo:
                    xo_age = [row['age'] for row in xo_row['details']]
                    xo_size = [row['size'] for row in xo_row['details']]
                    xo_row['details'] = {"age": ML.distribution(xo_age, self.lock),
                                         "size": ML.distribution(xo_size, self.lock)}
                    xo_row['id'] = id
                    xrow[id].append(xo_row)
                #
                # Now we can save the file
                #
                self.lock.acquire()
                writer = self.factory.instance(type=self.write_class, args=self.rw_args)
                writer.write(label='folders', row=xrow)
                self.lock.release()
            if 'MONITOR_CONFIG_PATH' in os.environ:
                #
                # This suggests we are in development mode
                #
                break
            time.sleep(TIME_ELAPSED)
        print('Exiting ', self.name)


"""
This class is a singleton designed to start and stop the dependent threads:
    * monitor is designed to act as a data collection agent
    * learner is designed to be a learner, i.e. machine learning model(s)
@TODO:
    - Move these to processes that can be seen by the OS (that would allow us to eat our own dog food)
    - We also need a pruning thread to control the volume of data we have to deal with; this instills the "will to live" in the application
"""
class ThreadManager:

    Pool = {}

    @staticmethod
    def start(config):
        lock = RLock()
        ThreadManager.Pool['monitor'] = Top(config, lock)
        ThreadManager.Pool['learner'] = Learner(config, lock)
        if 'folders' not in config:
            ThreadManager.Pool['file-watch'] = FileWatchWorker(config, lock)
        for id in ThreadManager.Pool:
            thread = ThreadManager.Pool[id]
            thread.start()

    @staticmethod
    def stop():
        for id in ThreadManager.Pool:
            thread = ThreadManager.Pool[id]
            thread.stop()

    @staticmethod
    def status():
        # Report liveness for each managed thread
        r = {}
        for id in ThreadManager.Pool:
            thread = ThreadManager.Pool[id]
            r[id] = thread.is_alive()
        return r


class Factory:
    """
    Returns an instance of an object specified in the configuration file.
    """
    @staticmethod
    def instance(id, config):
        if id in config['monitor']:
            className = config['monitor'][id]['class']
            ref = "".join(["monitor.", className, "()"])
            ref = eval(ref)
            return {"class": ref, "config": config['monitor'][id]["config"]}
        else:
            return None
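
#
# Design note: eval() on a config-derived string executes arbitrary code if the
# configuration is ever attacker-controlled. A sketch of a safer equivalent
# (same behavior, assuming the class lives in the monitor module) would be:
#
#   ref = getattr(monitor, className)()
#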


if __name__ == '__main__':
    import utils.params as SYS_ARGS
    import json

    PARAMS = SYS_ARGS.PARAMS
    f = open(PARAMS['path'])
    CONFIG = json.loads(f.read())
    f.close()
    ThreadManager.start(CONFIG)
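
#
# Sketch of the configuration shape this entry point expects. The key names are
# taken from the readers above; the concrete values are illustrative assumptions,
# and note that ThreadManager.start() checks for 'folders' at the top level while
# FileWatchWorker reads it under 'monitor'.
#
#   {
#     "store":   {"class": {"read": "...", "write": "..."}, "args": {}},
#     "monitor": {"processes": {"class": "...", "config": {}},
#                 "folders":   {"config": {}}},
#     "learner": {"anomalies": {"features": ["cpu", "mem"],
#                               "label": {"name": "status", "1": ["crash"]},
#                               "apps": ["app1"]}}
#   }
#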