bug fix redesign

7 years ago · ccc52fdaea
parent 8ecca833b3
commit ccc52fdaea
2 changed files with 195 additions and 415 deletions
--- a/src/data-collector.py
+++ b/src/data-collector.py
@ -7,146 +7,125 @@ h="""
 		python data-collector.py --path config.json
 	The configuration file is structured as JSON object as follows :
 	{
-		id: node identifier
-		key: customer's identification key
-		apps:"app_1,app_2,...",
-		folders:"path_1,path_2, ..."
+		id : identifier
+		key: customer's identification key,
+		api: http://localhost/monitor/1/client
+		folders:[]
 	}
+	NOTE: You can download a sample configuration file from https://the-phi.com/smart-top
 """
 from utils.params import PARAMS as SYS_ARGS, Logger
+import os
 import requests
-import pickle
 import json
-from threading import Thread, RLock
-import monitor
-import utils.agents.actor as actor
-from utils.agents.manager import Manager
-SYS_ARGS['host']='localhost'
-ENDPOINT="http://:host/monitor".replace(":host",SYS_ARGS['host'])
-class Collector(Thread) :
+# from threading import Thread, RLock
+from monitor import Apps, Folders
+import time
+from datetime import datetime
+class Collector :
 	def __init__(self) :
-		Thread.__init__(self)
 		"""
-			This function initializes the data collector with critical information
-			The process will include validating the user's account and plan
-			@param key	customer's key
-			@param id	node identifier
-			
+			The configuration file is passed to the class for basic initialization of variables
 		"""
-		#
-		# Let's open the file with the key (nothing else should be in the file
-		#f = open(SYS_ARGS['key'])
-		#SYS_ARGS['key'] = f.read()
-		#f.close()
-		
-		headers = {"key":SYS_ARGS["key"],"id":SYS_ARGS["id"]} #,"scope":json.dumps(scope)}
-		self.plan = None
-		self.store= None
+		self.httpclient = requests.Session()
 		
-		#headers['content-type'] = 'application/json'
-		try:
-			self.key = SYS_ARGS['key']
-			# Logger.log(subject='Collector',object='api',action='request',value=ENDPOINT)
-			# url 	= "/".join([ENDPOINT,"init/collector"])
-			Logger.log(subject='Collector',object='api',action='request',value=SYS_ARGS['api'])
-			url 	= "/".join([SYS_ARGS['api'],"init/collector"])
-			
-			r	= requests.post(url,headers=headers)
+	def init(self):
+		# if 'folders' not in SYS_ARGS :
+		# 	#
+		# 	# If nothing is set it will monitor the temporary directory
+		# 	self.locations = [os.environ[name] for name in ['TEMP','TMP','TMPDIR'] if name in os.environ]
+		# else:
+		# 	self.locations = SYS_ARGS['folders']
 		
-			r 	= json.loads(r.text)
+		#
+		# -- let's get the list of features we are interested .
+		url = SYS_ARGS['api']+'/1/client/login'
+		key = SYS_ARGS['key']
+		id = SYS_ARGS['id'] if 'id' in SYS_ARGS else os.environ['HOSTNAME']
+		headers = {"key":key,"id":id}
 		
-			if r :
 		#
-				# Persisting plan and data-store ...
-				self.plan = r['plan']
-				self.store = r['store']
-			info = {"store":self.store,"plan":self.plan}
+		#-- what features are allowed
+		r = self.httpclient.post(url,headers=headers)	
 		
-			if info['plan'] is not None and info['store'] is not None:
-				info['plan'] = self.plan['name']			
-				info['store'] = self.store['args']['dbname']
-				_action = 'init'
-				self.initialize()
+		if r.status_code == 200 :	
+			r = r.json()
+			
+			self.features = r['features']
+			self.config = r['config']	#-- contains apps and folders
+			Logger.log(action="login",value=r)
 		else:
-				info['plan'] = self.plan is not None
-				info['store']= self.store is not None
-				_action = 'init.error'
-			Logger.log(subject='collector',object='api',action=_action,value=info)
-		except Exception as e:			
-			print(e)
-			Logger.log(subject='collector',object='api',action='init.error',value=str(e))
-			self.monitor = None
-	def initialize(self):
+			self.features = None
+			self.config = None
+			Logger.log(action='login',value='error')
+	def callback(self,channel,method,header,stream):
+		pass
+	def listen(self):
+		
+		factory	= DataSourceFactory()
+		
+		# self.qlistener = factory.instance(type="QueueListener",args=_args)
+		# self.qlistener.callback = self.callback
+		# self.qlistener.init(SYS_ARGS['id'])
+	def post(self,**args) :
 		"""
-			This function creates a monitoring object with the associated parameters from the plan
-			plan.metadata = {"agents":...,"folder_size":...,"delay":...,"limit":...,"actors":...}
+			This function will post data to the endpoint
 		"""
-		_agents = [monitor.DetailProcess(),monitor.FileWatch()]
-		_actors = [actor.Apps(),actor.Folders(),actor.Mailer()]
-		# Initialiing the agents with the parameter values we know of
-		r = []
-		for agent in _agents :
-			
-			if agent.getName() in SYS_ARGS :
-				values = SYS_ARGS[agent.getName()]
-				# print (["init ",agent.getName(),values])
-				Logger.log(subject='Collector',object=agent.getName(),action='init.agent',value=values )
-				agent.init(values)
-				r.append(agent)
-		_agents = r 
-
-		
-		
-		config = {"store":self.store,"plan":self.plan}
-		#@TODO: add SYS_ARGS content so apps that are configured reboot/delete folders that are marked
-		# This is an important security measure!
-		#
-		self.manager = Manager()		
-		self.manager.init(node=SYS_ARGS['id'],agents=_agents,actors=_actors,config=config,key=self.key,host=SYS_ARGS['host'])
+		url = SYS_ARGS['api']+'/1/client/log'
+		key = SYS_ARGS['key']
+		id = SYS_ARGS['id'] if 'id' in SYS_ARGS else os.environ['HOSTNAME']
+		headers = {"key":key,"id":id,"context":args['context'],"content-type":"application/json"}
+		body = args['data'].to_json(orient='records')
+		if args.shape[0] > 0 :
+			r = self.httpclient.post(url,headers=headers,data=body)				
+			Logger.log(action="post."+args['context'],value=r.status_code)
+		else:
+			Logger.log(action="data.error",value="no data")
 		
 	def run(self):
 		"""
-			This funtion runs the authorized features and 
+			This function will execute the basic functions to monitor folders and apps running on the system
+			given the configuration specified on the server .
 		"""
-		#self.monitor.start()
+		while True :
+			try:
+				self.init()
+				if self.config and self.features :
+					ELAPSED_TIME = 60* int(self.features['schedule'].replace("min","").strip())
+					if 'apps' in self.config :					
+						self.post( data=(Apps()).get(filter=self.config['apps']),context="apps")
+					if 'folders' in self.config and self.config['folders'] :
+						folder = Folders()
+						f = folder.get(path=self.config['folders'])
+						self.post(data = f ,context="folders")
+				
+				Logger.log(action='sleeping',value=ELAPSED_TIME)
+				#
+				# In case no configuration is provided, the system will simply fall asleep and wait
+				# @TODO: Evaluate whether to wake up the system or not (security concerns)!
+				#
+				time.sleep(ELAPSED_TIME)

-		# Logger.log(subject='Collector',object='monitor',action='rpc',value=(self.manager is None) )
-		thread = Thread(target=self.manager.run)
-		thread.start()
-		# print self.manager
-		# print self.monitor.config['store']
-		# for agent in self.pool :
-		# 	try:
-		# 		agent =  pickle.loads(agent)
-		# 		agent.start()
+			except Exception,e:
+				Logger.log(action='error',value=e.message)
+				print e
+			break

-		# 		#p = pickle.loads(self.pool[key])
-		# 		#p.init(SYS_ARGS[key].split(','))
-		# 		#p.start()
-		# 	except Exception,e:
-		# 		print e

+		
+			
+			pass
 if __name__ == '__main__' :
 	#
 	#
 	if 'path' in SYS_ARGS :	
 		path = SYS_ARGS['path']
 		f = open(path)
-		p = json.loads(f.read())
+		SYS_ARGS = json.loads(f.read())
 		f.close()
 		
-	else:
-		for id in ['apps','folders']:
-			if id in SYS_ARGS :
-				SYS_ARGS[id] = SYS_ARGS[id].split(',')
-	
-		p = dict(SYS_ARGS)
 	Logger.init('data-collector')
-	SYS_ARGS = dict(SYS_ARGS,** p)
-	if 'apps' in SYS_ARGS :
-		SYS_ARGS['apps'] += [__file__]
-	
-	thread = Collector()
-	thread.start()
+	collector = Collector()
+	collector.run()
 else:
 	print (h)
--- a/src/monitor.py
+++ b/src/monitor.py
@ -1,275 +1,113 @@
 """
-	This program is designed to inspect an application environment
-	This program should only be run on unix friendly systems
+	Steve L. Nyemba <steve@the-phi.com>
+	The Phi Technology - Smart Top

-	We enable the engines to be able to run a several configurations
-	Similarly to what a visitor design-pattern would do
+	This program is the core for evaluating folders and applications. Each class is specialized to generate a report in a pandas data-frame
+	The classes will focus on Apps, Folders and Protocols
+		- SmartTop.get(**args)
+	@TODO:
+		Protocols (will be used in anomaly detection)
 """
 from __future__  import division
 import os
 import subprocess
-from sets import Set
-import re
-import datetime
-import urllib2 as http, base64
-from threading import Thread, RLock
-import time
 import numpy as np
-from utils.ml import ML
 import sys
-from scipy import stats
-
-class Analysis:
-	def __init__(self):
-		self.logs = []
-		pass
-	def post(self,object):
-		self.logs.append(object)
-	def init(self):
-		d = datetime.datetime.now()
-		self.now = {"month":d.month,"year":d.year, "day":d.day,"hour":d.hour,"minute":d.minute}
-	def getNow(self):
-		d = datetime.datetime.now()
-		return {"month":d.month,"year":d.year, "day":d.day,"hour":d.hour,"minute":d.minute,"second":d.second}
-	def getName(self):
-		return self.__class__.__name__
-	def reboot(self,row,conf) :
-		return False
-	def cleanup(self,text):
-		return re.sub('[^a-zA-Z0-9\s:]',' ',str(text)).strip()
-		
-
-"""
-	This class is designed to analyze environment variables. Environment variables can either be folders, files or simple values
-	The class returns a quantifiable assessment of the environment variables (expected 100%)
-"""
-class Env(Analysis):
-	def __init__(self):
-		Analysis.__init__(self)
-	def init(self,values):
-		#Analysis.init(self)
-		self.values = values
-	"""
-		This function evaluate the validity of an environment variable by returning a 1 or 0 (computable)
-		The function will use propositional logic (https://en.wikipedia.org/wiki/Propositional_calculus)
-	"""
-	def evaluate(self,id):
-		
-		if id in os.environ :
-			#
-			# We can inspect to make sure the environment variable is not a path or filename.
-			# Using propositional logic we proceed as follows:
-			# 	- (p) We determine if the value is an folder or file name (using regex)
-			# 	- (q) In case of a file or folder we check for existance
-			# The final result is a conjuction of p and q
-			#
-			value = os.environ[id]
-			expressions = [os.sep,'(\\.\w+)$']
-			p = sum([ re.search(xchar,value) is not None for xchar in expressions])
-			q = os.path.exists(value)
-			
-			return int(p and q)
-		else:
-			return 0
-	
-	def composite (self):
-		#Analysis.init(self)
-		r = [ self.evaluate(id) for id in self.values] ;		
-		N = len(r)
-		n = sum(r)
-		value = 100 * round(n/N,2)
-
-		missing = [self.values[i] for i in range(0,N) if r[i] == 0]
-		return dict(self.getNow(),**{"value":value,"missing":missing})
-"""
-	This class is designed to handle analaysis of the a python virtual environment i.e deltas between requirments file and a virtualenv
-	@TODO: update the virtual environment
-"""
-class Sandbox(Analysis):
-	def __init__(self):
-		Analysis.__init__(self)
-	def init(self,conf):
-		#Analysis.init(self)		
-		if os.path.exists(conf['sandbox']) :
-			self.sandbox_path = conf['sandbox']
-		else:
-			self.sandbox_path = None
-		if os.path.exists(conf['requirements']) :
-			self.requirements_path = conf['requirements']
-		else:
-			self.requirements_path = None
-
-	def get_requirements (self):
-		f = open(self.requirements_path)
-		return [ name.replace('-',' ').replace('_',' ') for name in f.read().split('\n') if name != '']
-	"""
-		This function will return the modules installed in the sandbox (virtual environment)
-	"""
-	def get_sandbox_requirements(self):
-		cmd = ['freeze']
-		xchar = ''.join([os.sep]*2)
-		pip_vm = ''.join([self.sandbox_path,os.sep,'bin',os.sep,'pip']).replace(xchar,os.sep)
-		cmd = [pip_vm]+cmd
-		r = subprocess.check_output(cmd).split('\n')
-		return [row.replace('-',' ').replace('_',' ') for row in r if row.strip() != '']	
-	def evaluate(self):
-		pass
-	def reboot(self,rows,limit=None) :
-		return sum([ len(item['missing']) for item in rows ]) > 0
-	"""
-		This function returns the ratio of existing modules relative to the ones expected
-	"""
-	def composite(self):
-		Analysis.init(self)
-		if self.sandbox_path and self.requirements_path :
-			required_modules= self.get_requirements()
-			sandbox_modules	= self.get_sandbox_requirements()
-			N = len(required_modules)
-			n = len(Set(required_modules) - Set(sandbox_modules))
-			value = round(1 - (n/N),2)*100
-			missing = list(Set(required_modules) - Set(sandbox_modules))
-			
-			return dict(self.getNow(),**{"value":value,"missing":missing})
-		else:
+import pandas as pd
+import datetime
+class SmartTop:
+	def get(self,**args):
 		return None

-"""
-	This class performs the analysis of a list of processes and determines
-	The class provides a quantifiable measure of how many processes it found over all
-"""
-class ProcessCounter(Analysis):
+class Apps(SmartTop) :
 	def __init__(self):
-		Analysis.__init__(self)
-	def init(self,names):
-		#Analysis.init(self)
-		self.names = names
-	def evaluate(self,name):
-		cmd  = "".join(['ps -eo comm |grep ',name,' |wc -l'])
-		handler = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE)
-		
-		return int(handler.communicate()[0].replace("\n","") )
-	def composite(self):
-		#Analysis.init(self)
-		r = {}
-		for name in self.names :
-			r[name] = self.evaluate(name)
-		
-		#N = len(r)
-		#n = sum(r)
-		#return n/N
-		return dict(self.getNow(),**r)
-	
 		"""
-	This class returns an application's both memory and cpu usage
+			This class will process a system command and parse the outpout accordingly given a parser
+			@param parse is a parser pointer
 		"""
-class DetailProcess(Analysis):
-	def __init__(self):
-		Analysis.__init__(self)
+		self.cmd = "ps -eo pid,user,pmem,pcpu,stat,etime,args|awk 'OFS=\";\" {$1=$1; if($5 > 9) print }'"
+		self.xchar = ';'
+	def get_app(self,stream):
+		index =  1	if os.path.exists(" ".join(stream[:1])) else len(stream)-1

-	def init (self,names):
-		#Analysis.init(self)
-		self.names = names;
-	def getName(self):
-		return "apps"
-	def split(self,name,stream):
+		cmd = " ".join(stream[:index]) if index > 0 else " ".join(stream)
 		
-		pattern = "(\d+.{0,1}\d*)\x20*(\d+.{0,1}\d*)\x20*(\d+.{0,1}\d*)".replace(":name",name).strip()
-		g = re.match(pattern,stream.strip())
-		if g :
-			return list(g.groups())+['1']+[name]
+		if ' ' in cmd.split('/')[len(cmd.split('/'))-1] :
+			p = cmd.split('/')[len(cmd.split('/'))-1].split(' ')
+			name = p[0]
+			args = " ".join(p[1:])
 		else:
-			return ''
-	def reboot(self,rows,conf=None) :
-		return np.sum([int(item['label']=='crash') for item in rows]) > 0
-	def parse(self,row,fields):
-		"""
-			The last field should be the command in its integrity
-			@pre len(fields) > len(row)
-		"""
-		r = {}
-		
-		now = self.getNow()
-		r['date'] = now
-		row = [term for term in row.split() if term.strip() != '']
-		for name in fields :
-			index = fields.index(name)
-
-			r[name] = row[index] if row else 0
-			if name not in ['user','cmd','status','pid'] :
-				r[name] = float(r[name])
-		r[name] = row[index: ] if row else []
-		#
-		# Let's set the status give the data extracted
-		#
-		if r['status'] == 0 :
-			r['status'] = 'crash'
-		elif 'Z' in r['status'] :
-			r['status'] = 'zombie'
-		elif r['memory_usage'] > 0 and r['cpu_usage'] > 0:
-			r['status'] = 'running'
+			name = cmd.split('/')[len(cmd.split('/'))-1]
+			args = " ".join(stream[index:]) if index > 0 else ""
+		return [name,cmd,args]
+	def to_pandas(self,m):
+		"""
+			This function will convert the output of ps to a data-frame
+			@param m raw matrix i.e list of values like a csv
+			
+		"""
+		m = [item for item in m if len(item) != len (m[0])]
+		m = "\n".join(m[1:])    
+		df = pd.read_csv(pd.compat.StringIO(m),sep=self.xchar)
+		df.columns =['pid','user','mem','cpu','status','started','name','cmd','args']
+		return df
+	def empty(self,name):
+		return pd.DataFrame([{"pid":None,"user":None,"mem":0,"cpu":0,"status":"X","started":None,"name":name,"cmd":None,"args":None}])
+	def parse(self,rows):
+		m = []
+		TIME_INDEX = 5
+		ARGS_INDEX = 6
+		
+		for item in rows :
+			if rows.index(item) != 0 :
+				parts = item.split(self.xchar)
+				row = parts[:TIME_INDEX]
+				row.append(' '.join(parts[TIME_INDEX:ARGS_INDEX]))
+				row += self.get_app(parts[ARGS_INDEX:])
 			else:
-			r['status'] = 'idle'
+				row = item.split(self.xchar)
+			row = (self.xchar.join(row)).strip()
+			if len(row.replace(";","")) > 0 :
+				m.append(row)		
+		
+		return m
+	def get(self,**args):
+		"""
+			This function returns a the output of a command to the calling code that is piped into the class			
+			The output will be stored in a data frame with columns
+			@
+		"""
+		try:
+
+			handler = subprocess.Popen(self.cmd,shell=True,stdout=subprocess.PIPE)
+			stream = handler.communicate()[0]
+			rows = stream.split('\n')
+			df =  self.to_pandas(self.parse(rows))
+			r = pd.DataFrame()
+			if 'filter' in args :
+				pattern = "|".join(args['filter']) 				
+				r =  df[df.name.str.contains(pattern)]
+				for name in args['filter'] :
+					filter = "name == '"+name+"'"
+					if r.query(filter).size == 0 :
+						r = r.append(self.empty(name))
 			return r
+		except Exception,e:
+			print (e)
+		return None

-	def evaluate(self,name=None) :
-		if name is None :
-			name = ".*"
-		fields = ["user","pid","memory_usage","cpu_usage","memory_available","status","cmd"]
-		cmd = "ps -eo user,pid,pmem,pcpu,vsize,stat,command|grep -Ei \":app\"".replace(":app",name)
-		handler = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE)
-		logs	= handler.communicate()[0].split('\n')
-		logs	= [row for row in logs if (row.strip() != '') and ('grep -Ei' in row )== False ]
-		
-		if len(logs) == 0:
-			return [dict(self.parse('',fields),**{'label':name}) ]
-		else :
-			return [dict(self.parse(row,fields),**{'label':name}) for row in logs  if row.strip() != '' and 'grep' not in row and '-Ei' not in row]
-
-	def status(self,row):
-		x = row['memory_usage']
-		y = row['cpu_usage']
-		z = row['memory_available']
-		if z :
-			if y :
-				return "running"
-			return "idle"
-		else:
-			return "crash"
-	#def format(self,row):
-	#	r= {"memory_usage":row[0],"cpu_usage":row[1],"memory_available":row[2]/1000,"proc_count":row[3],"label":self.cleanup(row[4])}
-	#	status = self.status(r)
-	#	r['status'] = status
-	#	return r
-
-	def composite(self):
-		ma = []
-		for name in self.names:
-			row = self.evaluate(name)
-			ma += row

-		return ma
+class Folders(SmartTop):
 	"""
-	This class evaluates a list of folders and provides detailed informaiton about age/size of each file
-	Additionally the the details are summarized in terms of global size, and oldest file.
+		This class will assess a folder and produce a report in a data-frame that can be later on used for summary statistics
 	"""
-class FileWatch(Analysis):
 	def __init__(self):
 		pass
-	def init(self,folders):		
-		self.folders = folders;
-		self.cache = []
-	def getName(self):
-		return "folders"
-
-	def evaluate(self,dir_path,r=[]):
-		"""
-			This function will recursively scan a folder and retrieve file sizes and age of the files.
-			The data will be returned as an array of {size,age,label} items
-		"""
+	def _get(self,dir_path,r=[]):
 		for child in os.listdir(dir_path):
 			path = os.path.join(dir_path, child)
 			if os.path.isdir(path):				
-				self.evaluate(path,r)
+				self._get(path,r)

 			else:
 				size 		= os.path.getsize(path)
@ -277,62 +115,25 @@ class FileWatch(Analysis):
 				file_date	= datetime.datetime.fromtimestamp(file_date)
 				now 		= datetime.datetime.now()
 				age 		= (now - file_date ).days
-				r.append({"label":path,"size":size,"age":age,"date":self.getNow()})
-		return r
-
-	def reboot(self,rows,limit) :
-		return np.sum([ 1 for item in rows if rows['size'] > limit ]) > 0
-	def composite(self):
-		out = []
-		for folder in self.folders :
-			
-			r = self.evaluate(folder,[])
-			file_count 	= len(r)
-			age_mode = [item[0] for item in stats.mode([item['age'] for item in r])]
-			size_mode= [item[0] for item in stats.mode([item['size'] for item in r])]
-			age 		= {"mode":age_mode,"median":np.median([item['age'] for item in r] ),"mean":np.mean([item['age'] for item in r] ),"var":np.mean([item['age'] for item in r])}
-			size 		= {"mode":size_mode,"median":np.median([item['size'] for item in r] ), "mean":np.mean([item['size'] for item in r] ),"var":np.mean([item['size'] for item in r])}
-			out.append({"label":folder,"date":self.getNow(),"stats":{"age":age,"size":size,"file_count":file_count},"logs":r})
-		return out
-
-
-# class Monitor (Thread):
-# 	def __init__(self,pConfig,pWriter,id='processes') :
-# 		Thread.__init__(self)
-		
-# 		self.config 	= pConfig[id]
-# 		self.writer	= pWriter;
-# 		self.logs	= []
-# 		self.handler = self.config['class']
-# 		self.mconfig = self.config['config']
-		
 				
+				name		= os.path.basename(path)
+				r.append({"name":name,"path":path,"size":size,"age":age})
+		return r

-# 	def stop(self):
-# 		self.keep_running = False
-# 	def run(self):
-# 		r = {}
-# 		self.keep_running = True
-# 		lock = RLock()
-# 		while self.keep_running:
-# 			lock.acquire()
-# 			for label in self.mconfig:
-				
-# 				self.handler.init(self.mconfig[label])
-# 				r = self.handler.composite()
-# 				self.writer.write(label=label,row = r)
-				
-# 				time.sleep(2)
-# 			lock.release()		
-			
-# 			self.prune()
-# 			TIME_LAPSE = 60*2
-# 			time.sleep(TIME_LAPSE)
-# 		print "Stopped ..."
-# 	def prune(self) :
-		
-# 		MAX_ENTRIES = 100
-# 		if len(self.logs) > MAX_ENTRIES :
-# 			BEG = len(self.logs) - MAX_SIZE -1 
-# 			self.logs = self.logs[BEG:]
+	def get(self,**args):
+		# path = args['path']
 		
+		if isinstance(args['path'],list) == False:
+			paths = [args['path']]
+		else:
+			paths = args['path']
+		_out = pd.DataFrame()
+		for path in paths :
+			if os.path.exists(path) :
+				#
+				# If the folder does NOT exists it should not be treated.
+				#
+				r =  pd.DataFrame(self._get(path))		
+				r =  pd.DataFrame([{"name":path,"files":r.shape[0],"age_in_days":r.age.mean(),"size_in_kb":r.size.sum()}])
+				_out = _out.append(r)
+		return _out