Bug fix around filter/extract. @TODO: Work on overfitting

9 years ago · 4184d0bdc7
parent 3f8f975528
commit 4184d0bdc7
2 changed files with 48 additions and 19 deletions
--- a/src/utils/ml.py
+++ b/src/utils/ml.py
@ -18,7 +18,7 @@ class ML:
 		#
 		
 		#return [item[0] for item in data if item and attr in item[0] and item[0][attr] == value]
-		return [[item for item in row if item[attr] == value] for row in data]
+		return [[item for item in row if item[attr] == value][0] for row in data]
 	@staticmethod
 	def Extract(lattr,data):
 		if isinstance(lattr,basestring):
@ -32,7 +32,7 @@ class ML:
 """
 class AnomalyDetection:
 		
-	def split(self,data,index=-1,threshold=0.8) :
+	def split(self,data,index=-1,threshold=0.9) :
 		N	= len(data)
 		# if N < LIMIT:
 		# 	return None
@ -53,7 +53,6 @@ class AnomalyDetection:
 	"""
 	def learn(self,data,key,value,features,label):
 		xo = ML.Filter(key,value,data)
-		print key,value, len(xo)
 		
 		if not xo or len(xo) < 100:
 			return None
@ -69,25 +68,47 @@ class AnomalyDetection:
 		
 		xo = self.split(xo)
 		yo = self.split(yo)
-
-		if xo['train'] :
-			E = 0.01
+		p = self.gParameters(xo['train'])
+		has_cov =  np.linalg.det(p['cov']) #-- making sure the matrix is invertible
+		if xo['train'] and has_cov :
+			E = 0.001
 			fscore = 0
+			#
+			# We need to find an appropriate epsilon for the predictions
+			# The appropriate epsilon is one that yields an f-score [0.5,1[
+			#
+			
+			__operf__ = None
+			perf = None
 			for i in range(0,10):
 				Epsilon = E + (2*E*i)
-				p = self.gParameters(xo['train'])
+				
 				if p is None :
 					return None
+				#
+				# At this point we've got enough data for the parameters
+				# We should try to fine tune epsilon for better results
+				#
+				
 				px =  self.gPx(p['mean'],p['cov'],xo['test'],Epsilon)
 				
-				perf = self.gPerformance(px,yo['test'])
-				if fscore == 0 :
-					fscore = perf['fscore']
-				elif perf['fscore'] > fscore and perf['fscore'] > 0.5 :
-					
-					perf['epsilon'] = Epsilon
+				
+				__operf__ = self.gPerformance(px,yo['test'])
+				print __operf__
+				if __operf__['fscore'] == 1 :
+					break
+				if perf is None :
+					perf = __operf__['fscore']
+				elif perf['fscore'] < __perf__['fscore'] and __operf__['fscore']> 0.5 :
+					perf = __operf__
+				
+				perf['epsilon'] = Epsilon
 			
-			return {"label":value,"parameters":p,"performance":perf}
+			
+			if perf and perf['fscore'] > 0.5 :
+				return {"label":value,"parameters":p,"performance":perf}
+			else:
+				return None
 		return None
 	def getLabel(self,yo,label_conf):
 		return [ int(len(set(item) & set(label_conf["1"]))>0) for item in yo ]
@ -109,6 +130,7 @@ class AnomalyDetection:
 			row = np.array(row)
 			d = np.matrix(row - xu)
 			d.shape = (n,1)
+			
 			b = np.exp((-0.5*np.transpose(d)) * (np.linalg.inv(sigma)*d))
 			
 			px = float(b/a)
--- a/test/TestML.py
+++ b/test/TestML.py
@ -25,14 +25,20 @@ class TestML(unittest.TestCase):
 	def test_Filter(self):
 		r = self.greader.read()
 		r = r['apps']
-		x = ML.Filter('label','Google Chrome',r)
+		#
+		# To make this test case extensible we need to pull apps from the configuration
+		#
+		app = CONFIG['monitor']['processes']['config']['apps'][0]
+		x = ML.Filter('label',app,r)
 		for row in x:
-			self.assertTrue(row['label'] == 'Google Chrome')
+			self.assertTrue(row['label'] == app)
 	def test_Extract(self):
 		r = self.greader.read()
 		r = r['apps']
-		x = ML.Filter('label','Google Chrome',r)
+		app = CONFIG['monitor']['processes']['config']['apps'][0]
+		x = ML.Filter('label',app,r)
 		x_ = ML.Extract(['cpu_usage','memory_usage'], x)
+		
 		self.assertTrue (len (x) == len(x_))
 		pass
 	def test_Learn(self):
@ -43,11 +49,12 @@ class TestML(unittest.TestCase):
 		data = greader.read()
 		
 		data = data['apps']
+		app = CONFIG['monitor']['processes']['config']['apps'][1]
 		lhandler = AnomalyDetection()
 		features = CONFIG['learner']['anomalies']['features']
 		label	= CONFIG['learner']['anomalies']['label']
-		lhandler.learn(data,'label','Google Chrome',features,label)
-		
+		x = lhandler.learn(data,'label',app,features,label)
+		print x
 		

 if __name__ == '__main__' :