DC - Housekeeping work, removing unused files

data-collector
Steve L. Nyemba 7 years ago
parent f12c1467a0
commit 8e7cad9a11

@@ -1,38 +0,0 @@
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
class MailAgent :
def __init__(self,conf) :
self.uid = conf['uid']
try:
self.handler = smtplib.SMTP_SSL(conf['host'],conf['port'])
r = self.handler.login(self.uid,conf['password'])
#
# @TODO: Check the status of the authentication
# If not authenticated the preconditions have failed
#
except Exception,e:
print e
self.handler = None
pass
def send(self,**args) :
subject = args['subject']
message = args['message']
to = args['to']
if '<' in message and '>' in message :
message = MIMEText(message,'html')
else:
message = MIMEText(message,'plain')
message['From'] = self.uid
message['To'] = to
message['Subject'] = subject
return self.handler.sendmail(self.uid,to,message.as_string())
def close(self):
self.handler.quit()
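#
# A minimal usage sketch (hypothetical host and credentials, not part of the original file):
# conf = {"uid":"alerts@example.com","password":"***","host":"smtp.example.com","port":465}
# agent = MailAgent(conf)
# agent.send(to="ops@example.com",subject="monitor alert",message="<b>disk usage is high</b>")
# agent.close()
#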

@@ -1,312 +0,0 @@
"""
This file is intended to perform certain machine learning tasks based on numpy
We are trying to keep it lean, which is why sklearn is not involved yet
@TODO:
Create factory method for the learners implemented here
Improve preconditions (size of the dataset, labels)
"""
from __future__ import division
import numpy as np
class ML:
@staticmethod
def Filter (attr,value,data) :
#
# @TODO: Make sure this approach works across all transport classes
# There is a potential issue with how the data is stored ... it may not scale
#
value = ML.CleanupName(value)
#return [item[0] for item in data if item and attr in item[0] and item[0][attr] == value]
#return [[item for item in row if item[attr] == value][0] for row in data]
#
# We are making the filtering more resilient, i.e. if an item doesn't exist we don't have to throw an exception
# This is why we expanded the loops ... fully expressive but resilient
#
r = []
for row in data :
if isinstance(row,list) :
for item in row :
if attr in item and item[attr] == value:
r.append(item)
else:
#
# We are dealing with a vector of objects
#
if attr in row and row[attr] == value:
r.append(row)
return r
@staticmethod
def Extract(lattr,data):
if isinstance(lattr,basestring):
lattr = [lattr]
# return [[row[id] for id in lattr] for row in data]
r = [[row[id] for id in lattr] for row in data]
if len(lattr) == 1 :
return [x[0] for x in r]
else:
return r
@staticmethod
def CleanupName(value) :
return value.replace('$','').replace('.+','')
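#
# Usage sketch for Filter/Extract (hypothetical rows; the field names are for illustration only):
# rows = [{"label":"kate","cpu":1.2,"mem":0.4},{"label":"firefox","cpu":9.1,"mem":2.3}]
# ML.Filter('label','kate',rows) #-- rows whose label is 'kate'
# ML.Extract(['cpu','mem'],ML.Filter('label','kate',rows)) #-- [[1.2, 0.4]]
#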
@staticmethod
def distribution(xo,lock,scale=False) :
d = []
m = {}
if scale :
xu = np.mean(xo)
sd = np.sqrt(np.var(xo))
for xi in xo :
value = round(xi,2)
if scale :
value = round((value - xu)/sd,2)
id = str(value)
lock.acquire()
if id in m :
index = m[id]
d[index][1] += 1
else:
m[id] = len(d)
d.append([value,1])
lock.release()
del m
return d
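#
# Usage sketch for distribution: the lock argument exists because worker threads share state.
# A minimal call (hypothetical values) would be:
# from threading import RLock
# ML.distribution([1.0,1.0,2.5],RLock()) #-- [[1.0, 2], [2.5, 1]]
#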
"""
Implements multivariate anomaly detection
@TODO: computationally determine epsilon
"""
class AnomalyDetection:
def __init__(self):
pass
def split(self,data,index=-1,threshold=0.65) :
N = len(data)
# if N < LIMIT:
# return None
end = int(N*threshold)
train = data[:end]
test = data[end:]
return {"train":train,"test":test}
"""
@param key field name by which the data will be filtered
@param value field value for the filter
@param features features to be used in the analysis
@param labels used to assess performance
@TODO: Map/Reduce does a good job at filtering
"""
def learn(self,data,key,value,features,label):
if len(data) < 10:
return None
xo = ML.Filter(key,value,data)
if len(xo) < 10 :
return None
# attr = conf['features']
# label= conf['label']
yo= ML.Extract([label['name']],xo)
xo = ML.Extract(features,xo)
yo = self.getLabel(yo,label)
#
# @TODO: Ensure this can be fine-tuned; training size matters for learning and is not obvious to define upfront
#
xo = self.split(xo)
yo = self.split(yo)
p = self.gParameters(xo['train'])
has_cov = np.linalg.det(p['cov']) if p else False #-- making sure the matrix is invertible
if xo['train'] and has_cov :
E = 0.001
ACCEPTABLE_FSCORE = 0.6
fscore = 0
#
# We need to find an appropriate epsilon for the predictions
# The appropriate epsilon is one that yields an f-score in [0.5, 1)
#
__operf__ = None
perf = None
for i in range(0,10):
Epsilon = E + (2*E*i)
if p is None :
return None
#
# At this point we've got enough data for the parameters
# We should try to fine tune epsilon for better results
#
px = self.gPx(p['mean'],p['cov'],xo['test'],Epsilon)
__operf__ = self.gPerformance(px,yo['test'])
print value,__operf__
if __operf__['fscore'] == 1 :
continue
if perf is None :
perf = __operf__
elif perf['fscore'] < __operf__['fscore'] and __operf__['fscore'] > ACCEPTABLE_FSCORE :
perf = __operf__
perf['epsilon'] = Epsilon
#
# At this point we assume the loop produced an acceptable performance
# The understanding is that some error drives learning, which is why we reject fscore == 1
#
if perf and perf['fscore'] > ACCEPTABLE_FSCORE :
return {"label":value,"parameters":p,"performance":perf}
else:
return None
return None
"""
This function determines whether the preconditions for learning are met
@param p parameters against which the preconditions are evaluated
"""
def canLearn(self,p) :
pass
def getLabel(self,yo,label_conf):
return [ int(len(set(item) & set(label_conf["1"]))>0) for item in yo ]
"""
This function will compute the probability density function for a given event/set of events
The return value is a list of [px, predicted_label] rows
@pre xu.shape[0] == sigma.shape[0] == sigma.shape[1]
"""
def gPx(self,xu,sigma,data,EPSILON=0.01):
n = len(data[0])
r = []
a = (2*np.pi)**(n/2)*np.linalg.det(sigma)**0.5 #-- normalization constant of the multivariate gaussian
# EPSILON = np.float64(EPSILON)
test = np.array(data)
for row in test:
row = np.array(row)
d = np.matrix(row - xu)
d.shape = (n,1)
b = np.exp((-0.5*np.transpose(d)) * (np.linalg.inv(sigma)*d))
px = float(b/a)
r.append([px,int(px < EPSILON)])
return r
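#
# A quick sanity check of gPx (toy values, not part of the original module): with a zero mean,
# an identity covariance and a test point at the mean, the density of a standard 2-D gaussian
# is 1/(2*pi) ~ 0.159, well above the epsilon passed here, so the point is not flagged:
# AnomalyDetection().gPx(np.array([0.0,0.0]),np.matrix([[1.0,0.0],[0.0,1.0]]),[[0.0,0.0]],0.01)
# #-- [[0.159, 0]]
#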
"""
This function uses stored learnt information to predict on raw data
In this case it will determine whether we have an anomaly or not
@param xo raw observations (matrix)
@param info stored information (learnt parameters and performance) for this application
"""
def predict(self,xo,info):
xo = ML.Extract(info['features'],xo)
if not xo :
return None
sigma = info['parameters']['cov']
xu = info['parameters']['mean']
epsilon = info['performance']['epsilon']
return self.gPx(xu,sigma,xo,epsilon)
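#
# predict() consumes what learn() stored; a hypothetical record (feature names are illustrative):
# info = {"features":["cpu_usage","memory_usage"],
#         "parameters":{"mean":[...],"cov":[[...],[...]]},
#         "performance":{"epsilon":0.003}}
# rows = [{"cpu_usage":0.9,"memory_usage":0.7}]
# AnomalyDetection().predict(rows,info) #-- [[px, is_anomaly]] or None
#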
"""
This function computes performance metrics i.e precision, recall and f-score
for details visit https://en.wikipedia.org/wiki/Precision_and_recall
"""
def gPerformance(self,test,labels) :
N = len(test)
tp = 0 # true positive
fp = 0 # false positive
fn = 0 # false negative
tn = 0 # true negative
for i in range(0,N):
tp += 1 if (test[i][1]==labels[i] and test[i][1] == 1) else 0
fp += 1 if (test[i][1] != labels[i] and test[i][1] == 1) else 0
fn += 1 if (test[i][1] != labels[i] and test[i][1] == 0) else 0
tn += 1 if (test[i][1] == labels[i] and test[i][1] == 0) else 0
precision = tp /( (tp + fp) if tp + fp > 0 else 1)
recall = tp / ((tp + fn) if tp + fn > 0 else 1)
fscore = (2 * precision * recall)/ ((precision + recall) if (precision + recall) > 0 else 1)
return {"precision":precision,"recall":recall,"fscore":fscore}
"""
This function returns the gaussian parameters, i.e. the mean vector and covariance matrix
The information will be used to compute probabilities
"""
def gParameters(self,train) :
n = len(train[0])
m = np.transpose(np.array(train))
u = np.array([ np.mean(m[i][:]) for i in range(0,n)])
if np.sum(u) == 0:
return None
r = np.array([ np.sqrt(np.var(m[i,:])) for i in range(0,n)])
#
# Before we normalize the data we must ensure there is some level of movement in this application
# A lack of movement suggests we may not have enough information to do anything
#
if 0 in r :
return None
#
#-- Normalize the matrix, then compute the covariance matrix
#
m = np.array([ (m[i,:] - u[i])/r[i] for i in range(0,n)])
sigma = np.cov(m)
sigma = [ list(row) for row in sigma]
return {"cov":sigma,"mean":list(u)}
class AnalyzeAnomaly(AnomalyDetection):
def __init__(self):
AnomalyDetection.__init__(self)
"""
This analysis function will include a predicted status because an anomaly can be either
- a downtime, i.e. end of day, or
- a spike, and thus a potentially imminent crash
@param xo matrix of variables
@param info information about what was learnt
"""
def predict(self,xo,info):
x = xo[len(xo)-1]
r = AnomalyDetection.predict(self,[x],info)
#
# In order to determine what the anomaly is we compute the slope (idle or crash)
# The slope is computed using the covariance / variance of features
#
if r is not None:
N = len(info['features'])
xy = ML.Extract(info['features'],xo)
xy = np.array(xy)
vxy= np.array([ np.var(xy[:,i]) for i in range(0,N)])
cxy=np.array(info['parameters']['cov'])
#cxy=np.cov(np.transpose(xy))
if np.sum(vxy) == 0:
vxy = cxy
alpha = cxy/vxy
r = {"anomaly":r[0][1],"slope":list(alpha[:,0])}
return r
class Regression:
parameters = {}
@staticmethod
def predict(xo):
pass
def __init__(self,config):
pass

@@ -1,266 +0,0 @@
#import multiprocessing
from threading import Thread, RLock
#from utils import transport
from utils.transport import *
from utils.ml import AnomalyDetection,ML
import time
import monitor
import sys
import os
import datetime
class BasicWorker(Thread):
def __init__(self,config,lock):
Thread.__init__(self)
self.reader_class = config['store']['class']['read']
self.write_class = config['store']['class']['write']
self.rw_args = config['store']['args']
self.factory = DataSourceFactory()
self.lock = lock
"""
This class is intended to collect data given a configuration
"""
class Top(Thread):
def __init__(self,_config,lock):
Thread.__init__(self)
self.lock = lock
self.reader_class = _config['store']['class']['read']
self.write_class = _config['store']['class']['write']
self.rw_args = _config['store']['args']
self.factory = DataSourceFactory()
self.name = 'Zulu-Top'
self.quit = False
className = ''.join(['monitor.',_config['monitor']['processes']['class'],'()'])
self.handler = eval(className)
self.config = _config['monitor']['processes']['config']
def stop(self):
self.quit = True
def run(self):
while self.quit == False:
print ' ** ',self.name,datetime.datetime.today()
for label in self.config :
self.lock.acquire()
gwriter = self.factory.instance(type=self.write_class,args=self.rw_args)
apps = self.config[label]
self.handler.init(apps)
r = self.handler.composite()
gwriter.write(label=label,row=r)
time.sleep(5)
self.lock.release()
if 'MONITOR_CONFIG_PATH' in os.environ:
#
# This suggests we are in development mode
#
break
ELAPSED_TIME = 60*20 #-- every 20 minutes
time.sleep(ELAPSED_TIME)
print "Exiting ",self.name
class Learner(Thread) :
"""
This class expects the platform config (store, learner)
It will leverage store and learner in order to operate
"""
def __init__(self,config,lock):
Thread.__init__(self)
self.name = 'Zulu-Learner'
self.lock = lock
self.reader_class = config['store']['class']['read']
self.write_class = config['store']['class']['write']
self.rw_args = config['store']['args']
self.features = config['learner']['anomalies']['features']
self.yo = config['learner']['anomalies']['label']
self.apps = config['learner']['anomalies']['apps']
self.factory = DataSourceFactory()
self.quit = False
def stop(self):
self.quit = True
"""
This function will initiate learning every x hours
If there is nothing to learn the app will simply go to sleep
"""
def run(self):
reader = self.factory.instance(type=self.reader_class,args=self.rw_args)
data = reader.read()
#
# Let's make sure we extract that which has already been learnt
#
if 'learn' in data:
r = data['learn']
del data['learn']
r = ML.Extract('label',r)
logs = [row[0] for row in r]
logs = list(set(logs))
else:
logs = []
#
# In order to address the inefficiencies below, we adopt the following policy:
# we don't re-learn what has already been learnt. This consists in filtering out the apps that already have learning data
#
self.apps = list(set(self.apps) - set(logs))
while self.quit == False:
r = {}
lapps = list(self.apps)
print ' ** ',self.name,datetime.datetime.today()
for key in data :
logs = data[key]
#
# There is poor design at this point: we need to make sure things already tested don't get tested again
# This creates inefficiencies (a cartesian product)
#
for app in lapps:
handler = AnomalyDetection()
value = handler.learn(logs,'label',app,self.features,self.yo)
if value is not None:
if key not in r:
r[key] = {}
r[key][app] = value
i = lapps.index(app)
del lapps[i]
#
# This offers a clean write to the data store as soon as a value is retrieved
# Removing the application from the list enables us to improve efficiency (among other things)
#
value = dict(value,**{"features":self.features})
self.lock.acquire()
writer = self.factory.instance(type=self.write_class,args=self.rw_args)
writer.write(label='learn',row=value)
self.lock.release()
#
# Usually this is used for development
# @TODO : Remove this and find a healthy way to stop the server
#
if 'MONITOR_CONFIG_PATH' in os.environ:
#
# This suggests we are in development mode
#
break
TIME_ELAPSED = 60*120 #-- every 2 hours
time.sleep(TIME_ELAPSED)
print "Exiting ",self.name
class FileWatchWorker(BasicWorker):
def __init__(self,config,lock):
BasicWorker.__init__(self,config,lock)
self.name = "Zulu-FileWatch"
self.config = config
self.folder_config = config['monitor']['folders']['config']
self.quit = False
def stop(self):
self.quit = True
def run(self):
TIME_ELAPSED = 60 * 10
handler = monitor.FileWatch()
ml_handler = ML()
while self.quit == False :
r = []
print ' ** ',self.name,datetime.datetime.today()
for id in self.folder_config :
folders = self.folder_config[id]
handler.init(folders)
xo = handler.composite()
#
# We should perform a distribution analysis of the details in order to have usable data
#
xrow = {}
xrow[id] = []
for xo_row in xo:
xo_age = [row['age'] for row in xo_row['details']]
xo_size= [row['size'] for row in xo_row['details']]
xo_row['details'] = {"age":ML.distribution(xo_age,self.lock),"size":ML.distribution(xo_size,self.lock)}
xo_row['id'] = id
xrow[id].append(xo_row)
#
# Now we can save the file
#
self.lock.acquire()
writer = self.factory.instance(type=self.write_class,args=self.rw_args)
writer.write(label='folders',row=xrow)
self.lock.release()
if 'MONITOR_CONFIG_PATH' in os.environ:
#
# This suggests we are in development mode
#
break
time.sleep(TIME_ELAPSED)
print 'Exiting ',self.name
"""
This class is a singleton designed to start/stop the dependent threads
* monitor is designed to act as a data collection agent
* learner is designed to train the machine learning model(s)
@TODO:
- How to move them to processes that can be read by the os (that would allow us to eat our own dog-food)
- Additionally we need a pruning thread to control the volume of data we have to deal with. This instills the "will to live" in the application
"""
class ThreadManager:
Pool = {}
@staticmethod
def start(config):
lock = RLock()
ThreadManager.Pool['monitor'] = Top(config,lock)
ThreadManager.Pool['learner'] = Learner(config,lock)
if 'folders' not in config :
ThreadManager.Pool['file-watch'] = FileWatchWorker(config,lock)
for id in ThreadManager.Pool :
thread = ThreadManager.Pool[id]
thread.start()
@staticmethod
def stop():
for id in ThreadManager.Pool :
thread = ThreadManager.Pool[id]
thread.stop()
@staticmethod
def status():
r = {}
for id in ThreadManager.Pool :
thread = ThreadManager.Pool[id]
r[id] = thread.isAlive()
return r
class Factory :
"""
This function will return an instance of the class specified in the configuration file
"""
@staticmethod
def instance(id,config):
if id in config['monitor'] :
className = config['monitor'][id]['class']
ref = "".join(["monitor.",className,"()"])
ref = eval(ref)
return {"class":ref,"config":config['monitor'][id]["config"]}
else:
return None
if __name__ =='__main__' :
import utils.params as SYS_ARGS
import json
PARAMS = SYS_ARGS.PARAMS
f = open(PARAMS['path'])
CONFIG = json.loads(f.read())
f.close()
ThreadManager.start(CONFIG)
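#
# A minimal configuration sketch (illustrative class and field names; the keys mirror what the
# threads above actually read):
# {
#   "store":{"class":{"read":"<ReaderClass>","write":"<WriterClass>"},"args":{...}},
#   "monitor":{"processes":{"class":"<MonitorClass>","config":{"apps":["kate","firefox"]}},
#              "folders":{"config":{"logs":["/var/log"]}}},
#   "learner":{"anomalies":{"features":["cpu_usage","memory_usage"],
#                           "label":{"name":"status","1":["crash"]},
#                           "apps":["kate","firefox"]}}
# }
#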