smart-top/src/utils/ml.py

"""
	This file is intended to perfom certain machine learning tasks based on numpy
	We are trying to keep it lean that's why no sklearn involved yet
"""
from __future__ import division
import numpy as np

class ML:
	@staticmethod
	def Filter (attr,value,data) :
		#
		# @TODO: Make sure this approach works across all transport classes
		# We may have a potential issue of how the data is stored ... it may not scale
		#
		return [item[0] for item in data if item[0][attr] == value]
	@staticmethod
	def Extract(lattr,data):
		return [[row[id] for id in lattr] for row in data]
"""
	Implements a multivariate anomaly detection
	@TODO: determine computationally determine epsilon
"""
class AnomalyDetection:
	def split(self,data,index=-1,threshold=0.7) :
		N	= len(data)
		# if N < LIMIT:
		# 	return None

		end 	= int(N*threshold)
		train	= data[:end]
		test	= data[end:]

		return {"train":train,"test":test}
	"""

		@param key 	field name by which the data will be filtered
		@param value 	field value for the filter
		@param features	features to be used in the analysis
		@param labels	used to assess performance
	@TODO: Map/Reduce does a good job at filtering
	"""
	def learn(self,data,key,value,features,label):
		xo = ML.Filter(key,value,data)

		# attr = conf['features']
		# label= conf['label']
		yo= ML.Extract([label['name']],xo)
		xo = ML.Extract(features,xo)
		yo = self.getLabel(yo,label)

		xo = self.split(xo)
		yo = self.split(yo)

		p = self.gParameters(xo['train'])

		px =  self.gPx(p['mean'],p['cov'],xo['test'])

		print self.gPerformance(px,yo['test'])
	def getLabel(self,yo,label_conf):
		return [ int(len(set(item) & set(label_conf["1"]))>0) for item in yo ]


	"""
		This function will compute the probability density function given a particular event/set of events
		@pre xu.shape[0] == sigma[0] == sigma[1]
	"""
	def gPx(self,xu,sigma,data,EPSILON=0.05):
		n = len(data[0])

		r = []
		a  = (2*(np.pi)**(n/2))*np.linalg.det(sigma)**0.5
		# EPSILON = np.float64(EPSILON)
		test = np.array(data)
		for row in test:
			row = np.array(row)
			d = np.matrix(row - xu)
			d.shape = (n,1)
			b = np.exp((-0.5*np.transpose(d)) * (np.linalg.inv(sigma)*d))
			px = float(b/a)
			r.append([px,int(px < EPSILON)])
		return r
	"""
		This function computes performance metrics i.e precision, recall and f-score
		for details visit https://en.wikipedia.org/wiki/Precision_and_recall

	"""
	def gPerformance(self,test,labels) :
		N = len(test)
		tp = 0 # true positive
		fp = 0 # false positive
		fn = 0 # false negative
		tn = 0 # true negative
		for i in range(0,N):
			tp += 1 if (test[i][1]==labels[i] and test[i][1] == 1) else 0
			fp += 1 if (test[i][1] != labels[i] and test[i][1] == 1) else 0
			fn += 1 if (test[i][1] != labels[i] and test[i][1] == 0) else 0
			tn += 1 if (test[i][1] == labels[i] and test[i][1] == 0) else 0
		precision = tp / (tp + fp)
		recall	= tp / (tp + fn)
		fscore 	= (2 * precision * recall)/ (precision + recall)
		return {"precision":precision,"recall":recall,"fscore":fscore}

	"""
		This function returns gaussian parameters i.e means and covariance
		The information will be used to compute probabilities
	"""
	def gParameters(self,train) :

		n = len(train[0])
		m = np.transpose(np.array(train))

		u = np.array([ np.mean(m[i][:]) for i in range(0,n)])
		r = np.array([ np.sqrt(np.var(m[i,:])) for i in range(0,n)])
		#
		#-- Normalizing the matrix then we will compute covariance matrix
		#
		m = np.array([ (m[i,:] - u[i])/r[i] for i in range(0,n)])
		sigma = np.cov(m)
		return {"cov":sigma,"mean":u}