parent 1a9c4b6630
commit 0a0e606456
@@ -0,0 +1,37 @@
"""
"""
from setuptools import setup, find_packages
import os
import sys

def read(fname):
    return open(os.path.join(os.path.dirname(__file__), fname)).read()

args = {
    "name":"smart-top",
    "version":"1.0.0",
    "author":"The Phi Technology LLC","author_email":"info@the-phi.com",
    "license":"MIT",
    "packages":["smart"]}
args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite']
args["install_requires"] = ['pandas','numpy','requests']
args["url"] = "https://dev.the-phi.com/git/steve/smart-top.git"

if sys.version_info[0] == 2 :
    args['use_2to3'] = True
    args['use_2to3_exclude_fixers']=['lib2to3.fixes.fix_import']
setup(**args)
# setup(
#     name = "data-transport",
#     version = "1.0",
#     author = "The Phi Technology LLC",
#     author_email = "steve@the-phi.com",
#     license = "MIT",
#     packages=['transport'],
#     keywords=['mongodb','couchdb','rabbitmq','file','read','write','s3'],
#     install_requires = ['pymongo','numpy','cloudant','pika','boto','flask-session','smart_open'],
#     url="https://dev.the-phi.com/git/steve/data-transport.git",
#     use_2to3=True,
#     long_description=read('README.md'),
#     convert_2to3_doctests=['README.md'],
#     #use_2to3_fixers=['your.fixers'],
#     use_2to3_exclude_fixers=['lib2to3.fixes.fix_import'],
# )
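The read() helper is not used by the active args dict; the commented-out block shows it feeding long_description. A minimal sketch of wiring it back in, assuming a README.md sits next to this setup script:

# sketch only: reuse read() for the package long description
# (the presence of README.md is an assumption carried over from the commented-out setup call)
if os.path.exists(os.path.join(os.path.dirname(__file__), 'README.md')):
    args['long_description'] = read('README.md')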
@@ -0,0 +1,49 @@
"""
This framework allows data to be logged to a given data store, i.e.:
    - disk, cloud (google, dropbox, box, sugarsync or s3) or a queue server
The framework is intended to work either standalone or embedded in code as a logging framework.
usage:

dependencies :
    data-transport	pip install git+https://dev.the-phi.com/git/steve/data-transport.git
"""
import smart.top
import smart.folder
import smart.logger
# from transport import factory

# class logger :
#     """
#     This class is a basic logger; it will log data regardless of the type of data. Subclasses will implement various data extraction schemas:
#         - processes (top),
#
#     """
#     def __init__(self,**args):
#         """
#         :store	data store (disk,mongo,couch,google,dropbox)
#         :args	arguments to pass for the data-store (read transport documentation)
#         :notify	function that returns true/false for notification
#         """
#         self.store = factory.instance(type=store,args=args['args'])
#         if 'notify' in args :
#             self.notify = args
#         pass

#     def log(self,row):
#         """
#         This function will log data to a data store
#         :row	row to be stored
#         """
#         self.store.write(row=row)
#         if(hasattr(self,'notify')):
#             if (self.notify(row)) :
#                 #
#                 # Let us notify the backend by generating a report and submitting it
#                 #
#                 stream = self.get.report()
#                 pass
#         else:
#             pass
#     def report(self) :
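The usage: stanza in the docstring above is left blank; a minimal sketch of embedded use, based on the sample scripts later in this commit (the process name is a placeholder):

import smart

# capture processes named "firefox" and hand each snapshot to the console logger
rows = smart.top.read(name='firefox', logger=smart.logger.log)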
@@ -0,0 +1,15 @@
"""
This file will submit an alert to a mailbox or a queue server given a set of parameters; it can operate:
    - as-a-service
    - embedded
"""

check = None
def post(**args):
    """
    This function will submit a report to a given target provided some input
    :key	will perform as-a-service
    :data	data that will be submitted to the smtp/queue server
    :smtp	will send the file to a mailbox
    """
    pass
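post() is still a stub; a sketch of the call shape the docstring implies (all values below are placeholders, not part of this commit):

# hypothetical call; post() currently performs no I/O
post(key='<service-key>', data={'cpu': 1.5}, smtp='alerts@example.com')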
@@ -0,0 +1,63 @@
"""
This file is designed to retrieve information on a folder
    {files,size,hash}
"""
import subprocess
import sys
import re
import os
import pandas as pd
import io
import datetime

class Util :
    def size(self,stream):
        """
        Parse a du -sh style size string (e.g. "1.2G") and normalize the value to MB
        """
        PATTERN = '(^.+)([A-Z]+$)'
        value,units = re.match(PATTERN,stream).groups()
        value = float(value)
        if 'G' == units :
            value *= 1000
            units = 'MB'
        elif 'K' == units:
            value /= 1000
            units = 'MB'
        return {"size":value,"units":units}
    def content(self,stream):
        #
        # md5sum prints "<hash>  <target>"; keep only the hash
        return {"content":stream.split(' ')[0].strip()}


def read(**args):
    """
    The path can also take in regular expressions
    """
    cmd = {"size":"du -sh :path","content":"find :path -type f -exec md5sum {} + | sort -z|md5sum"}
    r = {}
    util = Util()
    for key in cmd :
        _cmd = cmd[key]
        handler = subprocess.Popen(_cmd.replace(':path',args['path']),shell=True,stdout=subprocess.PIPE,encoding='utf-8')
        stream = handler.communicate()[0]

        if sys.version_info[0] > 2 :
            rows = str(stream).split('\n')
        else:
            rows = stream.split('\n')
        if key == 'size' :
            rows = rows[0]
            rows = util.size(rows.split('\t')[0])
        elif key == 'content' :
            #
            # There is a hash key that is generated and should be extracted
            rows = rows[0]
            rows = util.content(rows)

        r = dict(r, **rows)
    r['path'] = args['path']
    r['name'] = args['path'].split(os.sep)[-1:][0]
    r['node'] = os.uname()[1]
    r['date'] = datetime.datetime.now().strftime('%m-%d-%Y')
    r['time'] = datetime.datetime.now().strftime('%H:%M:%S')

    return pd.DataFrame([r])
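A quick sketch of what Util.size yields for typical du -sh strings, assuming this module is importable as smart.folder:

from smart.folder import Util   # assumption: the file above is smart.folder

u = Util()
print (u.size('2G'))     # {'size': 2000.0, 'units': 'MB'}
print (u.size('500K'))   # {'size': 0.5, 'units': 'MB'}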
@@ -0,0 +1,2 @@
import smart.folder
print (smart.folder.read(path='/home/steve/dev/data/vumc/aou'))
@@ -0,0 +1,52 @@
import pandas as pd
import numpy as np
import transport
import datetime
import io
import json
import requests

def subscribe (**args) :
    """
    This function will subscribe an email to a given service (report,notification). If already subscribed no further action will be performed
    :email	provide a valid email for the free plan. Upgrades will be done via the website
    :id	service identifier; accepted values are GOOGLE,DROPBOX,BOX,ONE_DRIVE

    """
    url = "https://the-phi.com/store/smart-top/subscribe"
    SERVICES=['GOOGLE','DROPBOX','BOX','ONE_DRIVE']
    if args['id'].upper() in SERVICES :
        data = {"email":args['email']}
        requests.post(url,data=data)
    pass

def log(**args) :
    """
    This function will write to a designated location provided a set of inputs
    :store	mongo,file,couch,api
    """
    #
    # @TODO: Provide facility to write to a given cloud store (google,one-drive ...)
    # 	This will have to be supported by some sort of subscription service
    #
    STORE_MAP = {"mongo":"MongoWriter","disk":"DiskWriter","couch":"CouchWriter",'sqlite':'SQLiteWriter'}
    if 'store' not in args :
        _id = 'console'
    else:
        _id = 'disk' if args['store'] == 'file' else args['store']
        _id = 'disk' if _id == 'sqlite' else _id
    if _id == 'console' :
        """
        We are going to print whatever we have to the console ... using the tool in cli mode
        """
        print()
        print (args['data'])
        print ()
        # stream = args['memory']
        # stream.write(json.dumps(args['row']) if isinstance(args['row'],dict) else args['row'])
        # stream.write("\n")
    else:
        store_type = ".".join([args['store'],STORE_MAP[_id]])
        store_args = args['params']
        store = transport.factory.instance(type=store_type,args=store_args)
        store.write( args['row'])
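In console mode (no store argument) log simply prints whatever is passed as data; a minimal sketch, assuming the package is installed as laid out in setup.py (the payload dict is a placeholder):

import smart

# no 'store' key, so the console branch is taken and the payload is printed
smart.logger.log(data={'cpu': 1.5, 'mem': 0.8})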
Binary file not shown.
@@ -0,0 +1,124 @@
"""
This file contains classes and functions that extract data from running processes (like top) and store them into a data store of the calling code's choice
dependencies:
    - top (on the os)
@TODO:
    Test this thing on windows to see if it works
"""
import pandas as pd
import numpy as np
import subprocess
import os
import io
import datetime
# from transport import factory
import sys
import hashlib

class Util:

    def app(self,stream):
        """
        Formatting of the application name; sometimes the name has parameters or separators ...
        """
        index = 1 if os.path.exists(" ".join(stream[:1])) else len(stream)-1
        cmd = " ".join(stream[:index]) if index > 0 else " ".join(stream)

        if ' ' in cmd.split(os.sep)[len(cmd.split(os.sep))-1] :
            p = cmd.split(os.sep)[len(cmd.split(os.sep))-1].split(' ')
            name = p[0]
            args = " ".join(p[1:])
        else:
            name = cmd.split('/')[len(cmd.split(os.sep))-1]
            args = " ".join(stream[index:]) if index > 0 else ""

        return [name,cmd,args]

    def parse(self,rows,xchar=';'):
        """
        This function parses the output returned by the execution of the command and returns rows that will later be formatted
        """
        m = []
        TIME_INDEX = 5
        ARGS_INDEX = 6

        for item in rows :
            if rows.index(item) != 0 :
                parts = item.split(xchar)
                row = parts[:TIME_INDEX]
                row.append(' '.join(parts[TIME_INDEX:ARGS_INDEX]))
                row += self.app(parts[ARGS_INDEX:])
            else:
                row = item.split(xchar)
            row = (xchar.join(row)).strip()
            if len(row.replace(";","")) > 0 :
                m.append(row)
        return m


def read(**args) :
    """
    This function will perform the actual read of process information.
    @return {user,pid,start,status, name, args, mem,cpu}
    """
    cmd = "ps -eo pid,user,pmem,pcpu,stat,etime,args|awk 'OFS=\";\" {$1=$1; if($5 > 9) print }'"
    xchar = ";"
    try:
        handler = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE)
        stream = handler.communicate()[0]
        if sys.version_info[0] > 2 :
            rows = str(stream).split('\\n')
        else:
            rows = stream.split('\n')

        formatter = Util()
        m = formatter.parse(rows)

        d = datetime.datetime.now().strftime('%m-%d-%Y')
        t = datetime.datetime.now().strftime('%H:%M:%S')
        m = [item for item in m if len(item) != len (m[0])]
        m = "\n".join(m[1:])
        df = pd.read_csv(io.StringIO(m),sep=xchar)
        df['date'] = np.repeat(d,df.shape[0])
        df['time'] = np.repeat(t,df.shape[0])
        df['node'] = np.repeat(os.uname()[1],df.shape[0])
        df.columns =['pid','user','mem','cpu','status','started','name','cmd','args','date','time','node']

        #
        # We should filter the name of the apps we are interested in here (returning the full logs )
        # @TODO: Add filter here to handle filter on different columns
        #

        if 'name' in args :
            names = args['name'].split(',')
            r = pd.DataFrame()
            for name in names :
                tmp = df[df.name == name.strip()]
                if tmp.shape[0] :
                    r = pd.concat([r,tmp])
            df = r
        #
        # For security reasons let's hash the args column with an MD5 or sha256
        #

        df.args = [hashlib.md5(str(value).encode('utf-8')).hexdigest() for value in df.args.tolist()]
        STATUS = {'R':'RUNNING','Z':'DEAD','D':'STASIS','S':'SLEEP','Sl':'SLEEP','Ss':'SLEEP','W':'PAGING','T':'DEAD'}
        df.status = df.status.apply(lambda value: STATUS.get(value,'UNKNOWN'))
        if 'cols' in args :
            _cols = list(set(df.columns.tolist()) & set(args['cols']))
            if _cols :
                df = df[_cols]
        #
        # we return a list of objects (no data-frames)
        if 'logger' in args and args['logger'] != None :
            logger = args['logger']
            logger(data=df)
        return df.to_dict(orient='records')

    except Exception as e:
        print (e)
        pass

if __name__ == '__main__' :
    #
    # Being directly called (external use of the )
    print(read())
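Beyond the name filter used in the sample script below, read() also accepts a cols projection and a logger callback; a small sketch (the process name is a placeholder):

import smart

# project a few columns and hand the snapshot to the console logger
rows = smart.top.read(name='python', cols=['pid','name','cpu','mem'], logger=smart.logger.log)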
@@ -0,0 +1,5 @@
import smart.top
import pandas as pd
df = pd.DataFrame (smart.top.read(name='firefox,code'))

print (df.groupby(['user'])[['cpu','mem']].sum())
@@ -0,0 +1,45 @@
"""
"""
import smart
import sys
import json
import pandas as pd
import numpy as np
from multiprocessing import Process


SYS_ARGS = {}
if len(sys.argv) > 1:

    N = len(sys.argv)
    for i in range(1,N):
        value = None
        if sys.argv[i].startswith('--'):
            key = sys.argv[i][2:] #.replace('-','')
            SYS_ARGS[key] = 1
            if i + 1 < N:
                value = sys.argv[i + 1] = sys.argv[i+1].strip()
            if key and value:
                SYS_ARGS[key] = value


        i += 2

#
# Let's determine what data to return ...
# --folders <names> and/or --apps <names>
def format(prefix,pointer,logger,container):
    return [{'args':{prefix:term.strip(),'logger':logger},'pointer':pointer} for term in container]

folders = format('path',smart.folder.read,smart.logger.log,SYS_ARGS['folders'].split(',')) if 'folders' in SYS_ARGS else []
apps = format('name',smart.top.read,smart.logger.log,SYS_ARGS['apps'].split(',')) if 'apps' in SYS_ARGS else []
if 'cols' in SYS_ARGS :
    cols = [name.strip() for name in SYS_ARGS['cols'].split(',')]
else:
    cols = []
nodes = folders + apps
for node in nodes :
    if cols :
        node['args']['cols'] = cols
    # smart.top.read and smart.folder.read take keyword arguments, so pass the args dict as kwargs
    pthread = Process(target=node['pointer'],kwargs=node['args'])
    pthread.start()
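Assuming this script is the CLI entry point (its file name is not shown in this view), the flags map straight into SYS_ARGS; for example:

# $ python <this-script>.py --apps firefox,code --cols pid,name,cpu,mem
# would roughly yield:
#   SYS_ARGS = {'apps': 'firefox,code', 'cols': 'pid,name,cpu,mem'}
# and spawn one Process per app name, each logging its snapshot to the console via smart.logger.log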