commit 59d6cc50c0

@@ -0,0 +1 @@
pipeline.py

@@ -0,0 +1,377 @@
#!/usr/bin/env python3
"""
This file performs basic tasks to finalize the GAN process, namely:
    - basic stats & analytics
    - rebuild io to another dataset
"""
import pandas as pd
import numpy as np
from multiprocessing import Process, Lock
from google.oauth2 import service_account
from google.cloud import bigquery as bq
import transport
from data.params import SYS_ARGS
import json

import pandas as pd
import numpy as np
from google.oauth2 import service_account
import json

# path = '../curation-prod.json'
# credentials = service_account.Credentials.from_service_account_file(path)
# df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard')
filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config']
f = open(filename)
config = json.loads(f.read())
args = config['pipeline']
f.close()

def _formatSQL(**_args):
    """
    This function will build the _map query for a given segment of identifiers
    """
    sql = """
    select DISTINCT x.person_id synthetic,y.person_id original
    FROM :synthetic.:table x
    INNER JOIN :original.:table y on x.person_id in (:ids)
    AND x.person_id <> y.person_id AND x.gender_source_value = y.gender_source_value
    AND x.year_of_birth = y.year_of_birth
    ORDER BY 1
    """
    table = _args['table']
    original,synthetic = _args['schema']['original'],_args['schema']['synthetic']
    _ids = np.array(_args['ids']).astype(str)
    return sql.replace(":ids",",".join(_ids)).replace(":synthetic",synthetic).replace(":original",original).replace(":table",table)
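#
# Illustrative example (schema/table names below are hypothetical, not from the config):
#   _formatSQL(table='person',schema={'original':'prod','synthetic':'prod_io'},ids=[10,11])
# yields a query of the form
#   select DISTINCT x.person_id synthetic,y.person_id original
#   FROM prod_io.person x
#   INNER JOIN prod.person y on x.person_id in (10,11)
#   AND x.person_id <> y.person_id AND x.gender_source_value = y.gender_source_value
#   AND x.year_of_birth = y.year_of_birth
#   ORDER BY 1
#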
def _addCounts(**_args) :
    store   = _args['store']
    sql     = _args['sql']
    reader = transport.factory.instance(**store['source'])
    _df = reader.read(sql=sql)
    _ids = _df.synthetic.unique()
    _counts = [ np.sum(_df.synthetic == value) for value in _ids]
    original = [_df[_df.synthetic == value].iloc[np.random.choice(np.arange(_counts[_ids.tolist().index(value)]),1),:].original.values[0] for value in _ids]
    _df = pd.DataFrame({"synthetic":_ids,"original":original,"counts":_counts})

    #
    # We can post this to the backend ...
    #
    table = '_map'  #-- Yes this is hard-coded
    writer = transport.factory.instance(**dict(store['target'],**{"parallel":True,"table":table}))
    # if writer.has(table=table) is False:
    #     writer.write(_df)
    # else:
    _schema = [{"name":name,"type":"INTEGER"} for name in _df.columns]
    writer.write(_df,schema=_schema)


def Init(**_args) :
    """
    This function will build a map of the synthetic to real individuals.
    The assumption is that the synthesized data is stored in the same data-store as the original; the parameters provided are:
    :param  store       object from the configuration file with source,target entries
    :param  table       name of the original/synthetic tables (they should be the same)
    :param  feat.       features/attributes ... demographics to account for
    """
    store = _args['store']
    reader = transport.factory.instance(**store['source'])
    original,synthetic = _args['schema']['original'],_args['schema']['synthetic']
    table = _args['table']
    sql = _args['sql'].replace(':synthetic',synthetic).replace(':original',original).replace(':table',table)

    _map = reader.read(sql=sql)

    k = _args['k'] if 'k' in _args else 2
    # _iodf = reader.read(table=table)
    # _ids = _iodf['person_id'].unique().tolist()
    # x_  = np.array_split(_ids,1000)
    jobs = []
    # for _items in x_ :
    #     _p = {"ids":_items,"schema":_args['schema'],'store':store,'table':table}
    #     sql = _formatSQL(**_p)
    #     _p['sql'] = sql
    #     _apply = lambda params: _addCounts(**params)
    #     thread = Process(target=_apply,args=(_p,))
    #     thread.start()
    #     jobs.append(thread)

    # return jobs
    #
    # We have performed a m:m (many-to-many) relationship with original participants and synthetic participants
    # The goal is to obtain a singular map against which records will be migrated
    #
    print (['... computing counts (k)'])
    _ids = _map.synthetic.unique()
    _counts = [ np.sum(_map.synthetic == value) for value in _ids]
    original = [_map[_map.synthetic == value].iloc[np.random.choice(np.arange(_counts[_ids.tolist().index(value)]),1),:].original.values[0] for value in _ids]
    print (['Building k-classes/groups'])
    _mdf = pd.DataFrame({"synthetic":_ids,"original":original,"counts":_counts})
    i = _mdf.apply(lambda row: row.counts >= k,axis=1)
    _mdf = _mdf[i]
    #
    # Log what just happened here so we know about the equivalence classes,
    # {"module":"binder","action":"map-generation","input":{"k":k,"rows":{"synthetic":_mdf.shape[0],"original":len(_counts)}}}

    return _mdf
    #
    # now we are posting this to target storage ...
    #
def ApplyOn (**_args):
    """
    This function will rewrite the SQL that applies the synthetic identifier to the entries of the pipeline.
    We assume that the _map table has two attributes (synthetic and original).
    :param  store
    :param  _config
    """
    store_args = _args['store']
    _config = _args['config']

    table = _config['from']
    reader  = transport.factory.instance(**dict(store_args['source'],**{"table":table}))
    attr = reader.read(limit=1).columns.tolist()
    original_key = _args['original_key'] #-- assuming referential integrity

    # synthetic_key= columns['synthetic']
    # mapped_original=columns['original']
    fields = list(set(attr) - set([original_key]))
    sql = "select _map.synthetic as :original_key,:fields from :original_schema.:table inner join :synthetic_schema._map on _map.original = :table.:original_key"
    sql = sql.replace(":table",table).replace(":fields",",".join(fields))
    sql = sql.replace(":original_key",original_key)
    _schema = _args['schema']
    sql = sql.replace(":original_schema",_schema['original']).replace(":synthetic_schema",_schema['synthetic'])

    return reader.read (sql=sql)
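    #
    # Illustrative sketch (hypothetical names): with table='measurement', original_key='person_id',
    # fields=['value_as_number','unit_source_value'] and schema={'original':'prod','synthetic':'prod_io'},
    # the rewritten query takes the form
    #   select _map.synthetic as person_id,value_as_number,unit_source_value
    #   from prod.measurement inner join prod_io._map on _map.original = measurement.person_id
    #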

if __name__ == '__main__' :
    pass
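    #
    # Possible driver sketch, kept as a comment. The argument names mirror the functions
    # above; the top-level config keys ('store','schema','sql') and table name are assumptions:
    #
    #   _store  = config['store']
    #   _schema = config['schema']
    #   _mdf    = Init(store=_store,schema=_schema,table='person',sql=config['sql'],k=2)
    #   for _item in args :
    #       _df = ApplyOn(store=_store,config=_item,schema=_schema,original_key='person_id')
    #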

# class Analytics :
#     """
#     This class will compile basic analytics about a given dataset i.e compare original/synthetic
#     """
#     @staticmethod
#     def distribution(**args):
#         context = args['context']
#         df = args['data']
#         #
#         #-- This data frame counts unique values for each feature (space)
#         df_counts = pd.DataFrame(df.apply(lambda col: col.unique().size),columns=['counts']).T  # unique counts
#         #
#         #-- Get the distributions for common values
#         #
#         names   = [name for name in df_counts.columns.tolist() if name.endswith('_io') == False]
#         ddf     = df.apply(lambda col: pd.DataFrame(col.values,columns=[col.name]).groupby([col.name]).size() ).fillna(0)
#         ddf[context] = ddf.index

#         pass
#     def distance(**args):
#         """
#         This function will measure the distance between
#         """
#         pass
# class Utils :
#     @staticmethod
#     def log(**args):
#         logger = transport.factory.instance(type="mongo.MongoWriter",args={"dbname":"aou","doc":"logs"})
#         logger.write(args)
#         logger.close()
#     class get :
#         @staticmethod
#         def pipeline(table,path) :
#             # contexts    = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts']
#             config = json.loads((open(path)).read())
#             pipeline    = config['pipeline']
#             # return [ item for item in pipeline if item['context'] in contexts]
#             pipeline =  [item for item in pipeline if 'from' in item and item['from'].strip() == table]
#             Utils.log(module=table,action='init',input={"pipeline":pipeline})
#             return pipeline
#         @staticmethod
#         def sql(**args) :
#             """
#             This function is intended to build SQL query for the remainder of the table that was not synthesized
#             :config configuration entries
#             :from   source of the table name
#             :dataset    name of the source dataset
#             """
#             SQL = ["SELECT * FROM :from "]
#             SQL_FILTER = []
#             NO_FILTERS_FOUND = True
#             # pipeline = Utils.get.config(**args)
#             pipeline = args['pipeline']
#             REVERSE_QUALIFIER = {'IN':'NOT IN','NOT IN':'IN','=':'<>','<>':'='}
#             for item in pipeline :

#                 if 'filter' in item :
#                     if NO_FILTERS_FOUND :
#                         NO_FILTERS_FOUND = False
#                         SQL  += ['WHERE']
#                     #
#                     # Let us load the filter in the SQL Query
#                     FILTER = item['filter']
#                     QUALIFIER = REVERSE_QUALIFIER[FILTER['qualifier'].upper()]
#                     SQL_FILTER += [" ".join([FILTER['field'], QUALIFIER,'(',FILTER['value'],')']).replace(":dataset",args['dataset'])]
#             src = ".".join([args['dataset'],args['from']])
#             SQL += [" AND ".join(SQL_FILTER)]
#             #
#             # let's pull the field schemas out of the table definition
#             #
#             Utils.log(module=args['from'],action='sql',input={"sql":" ".join(SQL) })
#             return " ".join(SQL).replace(":from",src)

# def mk(**args) :
#     dataset  = args['dataset']
#     client  = args['client'] if 'client' in args else bq.Client.from_service_account_file(args['private_key'])
#     #
#     # let us see if we have a dataset handy here
#     #
#     datasets = list(client.list_datasets())
#     found = [item for item in datasets if item.dataset_id == dataset]

#     if not found :
#         return client.create_dataset(dataset)
#     return found[0]

# def move (args):
#     """
#     This function will move a table from the synthetic dataset into a designated location
#     This is the simplest case for finalizing a synthetic data set
#     :private_key
#     """
#     pipeline   = Utils.get.pipeline(args['from'],args['config'])
#     _args = json.loads((open(args['config'])).read())
#     _args['pipeline'] = pipeline
#     # del _args['pipeline']
#     args = dict(args,**_args)
#     # del args['pipeline']
#     # private_key = args['private_key']
#     client      = bq.Client.from_service_account_json(args['private_key'])

#     dataset     = args['dataset']
#     if pipeline :
#         SQL         = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in pipeline]
#         SQL         += [Utils.get.sql(**args)]
#         SQL         =  ('\n UNION ALL \n'.join(SQL).replace(':dataset','io'))
#     else:
#         #
#         # moving a table to a designated location
#         tablename = args['from']
#         if 'sql' not in args :
#             SQL = "SELECT * FROM :dataset.:table"
#         else:
#             SQL = args['sql']
#         SQL = SQL.replace(":dataset",dataset).replace(":table",tablename)
#     Utils.log(module=args['from'],action='sql',input={'sql':SQL})
#     #
#     # At this point we have gathered all the tables in the io folder and we should now see if we need to merge with the remainder from the original table
#     #

#     odataset    = mk(dataset=dataset+'_io',client=client)
#     # SQL =       "SELECT * FROM io.:context_full_io".replace(':context',context)
#     config = bq.QueryJobConfig()
#     config.destination = client.dataset(odataset.dataset_id).table(args['from'])
#     config.use_query_cache = True
#     config.allow_large_results = True
#     config.priority = 'INTERACTIVE'
#     #
#     #

#     schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema
#     fields = [" ".join(["CAST (",item.name,"AS",item.field_type.replace("INTEGER","INT64").replace("FLOAT","FLOAT64"),") ",item.name]) for item in schema]
#     SQL = SQL.replace("*"," , ".join(fields))
#     # print (SQL)
#     out = client.query(SQL,location='US',job_config=config)
#     Utils.log(module=args['from'],action='move',input={'job':out.job_id})
#     return (out.job_id)


# import pandas as pd
# import numpy as np
# from google.oauth2 import service_account
# import json

# # path = '../curation-prod.json'
# # credentials = service_account.Credentials.from_service_account_file(path)
# # df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard')
# filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config']
# f = open(filename)
# config = json.loads(f.read())
# args = config['pipeline']
# f.close()


# if __name__ == '__main__' :
#     """
#     Usage :
#         finalize --<move|stats> --contexts <c1,c2,...c3> --from <table>
#     """
#     if 'move' in SYS_ARGS :

#         if 'init' in SYS_ARGS :
#             dep = config['dep'] if 'dep' in config else {}
#             info = []

#             if 'queries' in dep :
#                 info += dep['queries']
#                 print ('________')
#             if 'tables' in dep :
#                 info += dep['tables']
#             args = {}
#             jobs = []
#             for item in info :
#                 args = {}
#                 if type(item) == str :
#                     args['from'] = item
#                     name = item
#                 else:
#                     args = item
#                     name = item['from']
#                 args['config'] = SYS_ARGS['config']
#                 # args['pipeline'] = []
#                 job = Process(target=move,args=(args,))
#                 job.name = name
#                 jobs.append(job)
#                 job.start()

#             # while len(jobs) > 0 :
#             #     jobs = [job for job in jobs if job.is_alive()]
#             #     time.sleep(1)

#         else:
#             move(SYS_ARGS)
#         # # table = SYS_ARGS['from']
#         # # args = dict(config,**{"private_key":"../curation-prod.json"})
#         # args = dict(args,**SYS_ARGS)
#         # contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']]
#         # log = []
#         # if contexts :
#         #     args['contexts'] = contexts
#         #     log = move(**args)

#         # else:
#         #     tables = args['from'].split(',')
#         #     for name in tables :
#         #         name = name.strip()
#         #         args['from'] = name
#         #         log += [move(**args)]
#         # print ("\n".join(log))

#     else:
#         print ("NOT YET READY !")

@@ -1,2 +1,6 @@
import data.params as params

# import data.params as params
from data.params import SYS_ARGS
import transport
from multiprocessing import Process, Queue
from data.maker import prepare
from data.maker import state

File diff suppressed because it is too large
@@ -1,10 +0,0 @@
import pandas as pd
import data.maker

df      = pd.read_csv('sample.csv')
column  = 'gender'
id      = 'id'
context = 'demo'
store = {"type":"mongo.MongoWriter","args":{"host":"localhost:27017","dbname":"GAN"}}
max_epochs = 11
data.maker.train(store=store,max_epochs=max_epochs,context=context,data=df,column=column,id=id,logs='foo')

@@ -0,0 +1,76 @@
"""
This file is designed to specify the application of pre/post-processing code.
    The pre-processing code gets applied after the data has been loaded
    The post-processing code gets applied after the data has been generated, for instance:
        - approximation code/logic; date shifting; suppression; adding noise
"""
import numpy as np
from datetime import datetime, timedelta
import time

class Phase:
    def __init__(self,**_args):
        self._df = _args['data']
        self.callback = _args['callback']
    def apply(self,**_args):
        """
            :param  data        data-frame
            :param  _info       arguments needed to be applied
            :param  callback    callback function once done
        """
        raise Exception ("Function needs to be Implemented")
class Pre(Phase):
    pass
class Post(Phase):
    def __init__(self,**_args):
        super().__init__(**_args)
    pass

class Date(Post):
    def __init__(self,**_args):
        super().__init__(**_args)
    def make(self,**_args):
        """
        This function generates a random date given a year and, optionally, a set of day offsets applied to the generated date
        :param year     initial value of a year
        :param offset   list of day offsets applied cumulatively to the initial date
        """
        if _args['year'] in ['',None,np.nan] :
            return None
        year = int(_args['year'])

        offset = _args['offset'] if 'offset' in _args else 0
        month   = np.random.randint(1,13)
        if month == 2:
            _end = 28 if year % 4 != 0 else 29
        else:
            _end = 31 if month in [1,3,5,7,8,10,12] else 30
        day = np.random.randint(1,_end)

        #-- synthetic date
        _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0)
        FORMAT =  '%Y-%m-%d' if 'format' not in _args else _args['format']

        # print ([_name,FORMAT, _date.strftime(FORMAT)])
        r = []
        if offset :
            r = [_date.strftime(FORMAT)]
            for _delta in offset :
                _date = _date + timedelta(_delta)
                r.append(_date.strftime(FORMAT))
            return r
        else:
            return _date.strftime(FORMAT)
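        # Illustrative example: make(year=1980) returns a single '1980-MM-DD' string, while
        # make(year=1980,offset=[30,10]) returns the base date plus cumulatively shifted dates,
        # e.g. ['1980-03-14','1980-04-13','1980-04-23'].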

    def apply(self,**_args):
        """
        """
        pass
class Approximate(Post):
    def apply(**_args):
        pass
    def applyWithRange(**_args):
        pass

@@ -0,0 +1,284 @@
"""
(c) 2018 - 2021, Vanderbilt University Medical Center
Steve L. Nyemba, steve.l.nyemba@vumc.org

This file is designed to handle preconditions for a generative adversarial network:
    - The file will read/get data from a source specified by transport (or a data-frame)
    - The class will convert the data to a binary vector
    - The class will also help rebuild the data from a binary matrix.
Usage :

"""
import transport
import json
import pandas as pd
import numpy as np
# import cupy as cp
import sys
import os
#
# The following is to address the issue over creating a large matrix ...
#
# from multiprocessing import Process, Queue

# if 'GPU' in os.environ :
#     import cupy as np
# else:
#     import numpy as np
class void:
    pass
class Hardware :
    """
    This class is intended to allow the use of hardware i.e GPU, index or CPU
    """
    pass

class Input :
    class NOVALUES :
        RANDOM,IGNORE,ALWAYS = ['random','ignore','always']
    """
    This class is designed to read data from a source and perform a variety of operations :
        - provide a feature space, and rows (matrix profile)
        - a data index map
    """

    def __init__(self,**_args):
        """
        :param data
        :param store    data-store parameters/configuration
        :param sql      sql query that pulls a representative sample of the data
        """
        self._schema = _args['schema'] if 'schema' in _args else {}
        #
        # schema data should be in a hash map for these purposes
        #
        # if self._schema :
        #     r = {}
        #     for _item in self._schema :
        #         r[_item['name']] = r[_item['type']]
        # self._schema = r

        self.df = _args['data']
        if 'sql' not in _args :
            self._initdata(**_args)
            #
            pass
        else:
            self._initsql(**_args)
        #
        # We need to have a means to map values, columns and vector positions in order
        # to perform convert and revert to and from binary
        #
        self._map = {} if 'map' not in _args else _args['map']

    def _initsql(self,**_args):
        """
        This function will initialize the class on the basis of a data-store and optionally pre-defined columns to be synthesized
        :param store        data-store configuration
        :param columns      list of columns to be synthesized
        """

        if 'columns' not in _args :
            self._initcols(data=self.df)
        else:
            self._initcols(data=self.df,columns=_args['columns'])

        pass
    def _init_map(self,values):
        self._map = dict(zip(np.arange(len(values)),values))
        for key in self._map :
            self._map[key] = self._map[key].tolist()
    def _initcols (self,**_args) :
        """
        This function will initialize the columns to be synthesized and/or determine which ones can be synthesized
        :param data         data-frame that holds the data (matrix)
        :param columns      optional columns to be synthesized
        """
        # df = _args['data'].copy()
        row_count = self.df.shape[0]
        cols    = None if 'columns' not in _args else _args['columns']
        self.columns = self.df.columns.tolist()
        self._io = []

        if 'columns' in _args :
            self._columns = _args['columns']
        # else:
        #
        # We will look into the count and make a judgment call
        try:
            # _df = pd.DataFrame(self.df.apply(lambda col: col.dropna().unique().size )).T
            # MIN_SPACE_SIZE = 2
            # self._columns  = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist()
            # self._io = _df.to_dict(orient='records')
            _df  = pd.DataFrame(self.df.nunique().T / self.df.shape[0]).T
            self._io =  (_df.to_dict(orient='records'))

        except Exception as e:
            print (e)
            self._io = []
    def _initdata(self,**_args):
        """
        This function will initialize the class with a data-frame and columns of interest (if any)
        :param data       data-frame that holds the data
        :param columns    columns that need to be synthesized if any
        """
        self._initcols(**_args)

    def _convert(self,**_args):
        """
        This function will convert a data-frame into a binary matrix and provide a map to be able to map the values back to the matrix
        :param columns  in case we specify the columns to account for (just in case the original assumptions don't hold)
        """
        if 'columns' in _args or 'column' in _args :
            columns = _args['columns'] if 'columns' in _args else [_args['column']]
        else:
            columns = self._columns
        _df = self.df if 'data' not in _args else _args['data']
        #
        # At this point we have the list of features we want to use
        i = 0

        _m = np.array([])
        _values  = []
        for name in columns :
            #
            # In case we have a dataset with an incomplete value space, we should still be able to generate something meaningful
            #
            values = None if name not in self._map else list(self._map[name]['values'])
            _type = self._schema[name] if name in self._schema else _df[name].dtype
            cols, _matrix = self.tobinary(_df[name],values)
            _beg,_end = i,i+len(cols)
            if name not in self._map :
                self._map[name] = {"beg":_beg,"end":_end ,"values":cols.tolist()}
            i += len(cols)
            if not _m.shape[0]:
                _m = _matrix ;
            else:
                _m = np.concatenate((_m,_matrix),axis=1)
            if values :
                _values += list(values)
        #
        # @NOTE:
        # The map should allow us to be able to convert or reconvert the binary matrix to whatever we want ...
        #
        # self._matrix = _m

        return _values,_m

    def _revert(self,**_args) :
        """
        This function will take in a binary matrix and, based on the map of values, repopulate it with values
        :param _matrix  binary matrix
        :param column|columns   column name or columns if the column is specified
        """
        _column = _args['column'] if 'column' in _args else None

        matrix = _args['matrix']
        row_count = matrix.shape[0]
        r = {}
        for key in self._map :
            if _column and key != _column :
                continue
            _item = self._map[key]
            _beg = _item['beg']
            _end = _item['end']
            columns = np.array(_item['values'])
            #
            # @NOTE: We are accessing matrices in terms of [row,col],
            # The beg,end variables are for the columns in the matrix (mini matrix)
            #
            # if not _column :
            #     _matrix = matrix[:,_beg:_end] #-- The understanding is that _end is not included
            # else:
            #     _matrix = matrix
            _matrix = matrix[:,_beg:_end]
            #
            # vectorize the matrix to replace the bits by their actual values (accounting for the data-types)
            # @TODO: Find ways to do this on a GPU (for big data) or across threads
            #
            row_count = _matrix.shape[0]
            # r[key] = [columns[np.where(row == 1) [0][0] ] for row in _matrix[:,_beg:_end]]

            r[key] = [columns[np.where(row==1)[0][0]] if np.where(row==1)[0].size > 0 else '' for row in _matrix]
        #
        # we should consider decoding the matrix if possible
        #

        return pd.DataFrame(r)

    def tobinary(self,rows,cols=None) :
        """
        This function will compile a binary (one-hot) matrix from a column of values; hopefully this can be done in parallel, and the function can be vectorized
        :param rows     np.array or list/vector of values
        :param cols     a space of values if it were to be different from the current sample.
        """
        if not cols:
            #
            # In the event the sample rows do NOT cover the full value space
            cols = rows.unique()
        cols = np.array(cols)
        row_count = np.int64(len(rows))
        # if 'GPU' not in os.environ :
        # _matrix = np.zeros([row_count,cols.size],dtype=int)
        #
        # @NOTE: For some reason, there is an out of memory error created here, this seems to fix it (go figure)
        #
        _matrix = np.array([np.repeat(0,cols.size) for i in range(0,row_count)])

        [np.put(_matrix[i], np.where(cols ==  rows[i])  ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0]
        # else:
        #     _matrix = cp.zeros([row_count,cols.size])
        #     [cp.put(_matrix[i], cp.where(cols ==  rows[i]),1)for i in cp.arange(row_count) ]
        #     _matrix = _matrix.asnumpy()

        return cols,_matrix
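        # Worked example (illustrative): tobinary(pd.Series(['M','F','M'])) returns
        #   cols    -> array(['M','F'])
        #   _matrix -> [[1,0],[0,1],[1,0]]
        # i.e. one one-hot row per input value over the value space cols.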
    def convert(self,**_args):
        if 'columns' in _args or 'column' in _args :
            columns = _args['columns'] if 'columns' in _args else [_args['column']]
        else:
            columns = self._columns
        _df = self.df if 'data' not in _args else _args['data']
        _values,_matrix = self.encode(_df,columns)
        _, _matrix =  self.tobinary(_matrix)
        self._init_map(_values)
        return _values,_matrix  #-- matrix has been updated !
    def revert(self,**_args):
        # _columns = _args['column'] if 'column' in _args else None
        _matrix = _args['matrix']
        # print (_matrix)
        return self.decode(_matrix,columns=self._columns)
        pass
    def encode(self,df,columns) :
        _df = df[columns].drop_duplicates()
        _values = _df.values.tolist()
        _encoded = df[columns].apply(lambda row: _values.index( list(row)) ,axis=1)
        return np.array(_values),_encoded
    def decode (self,_matrix,**_args):
        #
        # _matrix   binary matrix
        #
        columns = _args['columns']
        _values = np.array( list(self._map.values()))
        _matrix = pd.DataFrame(_matrix) #if type(_matrix) != pd.DataFrame else _matrix
        # x = _matrix.apply(lambda row: _values[row.values == 1 ].tolist()[0] if row.values.sum() > 0 else None, axis=1).tolist()
        #@TODO: Provide random values for things that are missing

        # x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.repeat(None,len(self._columns)) ,axis=1).tolist()
        #
        # @TODO: Provide a parameter to either:
        #   - missing = {outlier,random,none}
        #   -   outlier: select an outlier, random: randomly select a value, none: do nothing ...
        #
        if np.random.choice([0,1],1)[0] :
            novalues = _values[np.random.choice( len(_values),1)[0]].tolist()
        else:
            novalues = np.repeat(None,len(self._columns))
        x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else novalues ,axis=1).tolist()
        return pd.DataFrame(x,columns=columns)
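    # Round-trip sketch (hypothetical column names):
    #   _input  = Input(data=df,columns=['gender','race'])
    #   _values,_matrix = _input.convert()
    #   df_back = _input.revert(matrix=_matrix)   #-- frame with the same columns
    # convert() one-hot encodes the distinct (gender,race) combinations and
    # revert()/decode() maps each row of the binary matrix back to one of them.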


@@ -0,0 +1 @@
__init__.py

@@ -0,0 +1,105 @@
"""
This file handles the state-space of the data training/generation process, i.e. the specification of the pre/post conditions.
The specifications for this are as follows (within an entry of the configuration) :
    {
        "state":{
            "pre":[{"approximate":{"field":"int"}},{"newdate":{"field":"format"}}],"post":[{"limit":10}]
        }
    }
"""
import importlib
import importlib.util
import sys
from datetime import datetime
from data.maker.state.default import *
import os


class State :
    @staticmethod
    def apply(_data,lpointers):
        """
        This function applies a pipeline against a given data-frame; the calling code must decide whether it is a pre or a post phase
        :_data  data-frame
        :_lpointers function pointers returned by instance (module,_args)
        """
        for _item in lpointers :
            if _item is None :
                continue

            pointer = _item['module']
            _args = _item['args']

            _data = pointer(_data,_args)
        return _data
    @staticmethod
    def instance(_args):
        pre = []
        post=[]

        out  = {}
        for key in _args :
            #
            # If an item carries its own path property it is used as-is, otherwise the section-level path is applied
            path  = _args[key]['path'] if 'path' in _args[key] else ''
            out[key] = [ State._build(dict(_item,**{'path':path})) if 'path' not in _item else State._build(_item) for _item in _args[key]['pipeline']]

        return out
        # if 'pre' in _args:
        #     path  = _args['pre']['path'] if 'path' in _args['pre'] else ''

        #     pre = [ State._build(dict(_item,**{'path':path})) for _item in _args['pre']['pipeline']]
        # else:
        #     path  = _args['post']['path'] if 'path' in _args['post'] else ''

        #     post = [ State._build(dict(_item,**{'path':path})) for _item in _args['post']['pipeline']]
        # return {'pre':pre,'post':post}
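        # Usage sketch, mirroring the configuration format in the module docstring
        # (pass the value found under the "state" key):
        #   _config   = {"post":{"pipeline":[{"limit":10}]}}
        #   _handlers = State.instance(_config)
        #   _df       = State.apply(_df,_handlers['post'])
        # Each entry resolves to a function in data.maker.state.default unless a
        # 'path' entry points at a file that provides it.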

    @staticmethod
    def _extract(_entry):

        _name = list(set(_entry.keys()) - set(['path']) )
        _name = _name[0]
        path = _entry['path'] if 'path' in _entry and os.path.exists(_entry['path']) else ''
        return {"module": _name,"args": _entry[_name],'name':_name,'path':path}
        pass
    @staticmethod
    def _build(_args):

        _info = State._extract(_args)
        # _info = dict(_args,**_info)

        _info['module'] = State._instance(_info)
        return _info if _info['module'] is not None else None

    @staticmethod
    def _instance(_args):
        """
            :path   optional path of the file on disk
            :module   name of the function
        """

        _name = _args['module']

        if 'path' in _args and os.path.exists(_args['path']):
            path= _args['path']

            spec = importlib.util.spec_from_file_location(_name, path)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
        else:
            #
            # Probably calling a built-in module (should be in this file)

            module = sys.modules['data.maker.state.default']

        return getattr(module,_name) if hasattr(module,_name) else None

#
# Adding a few custom functions that should be able to help ....
# These functions can be called without specifying a path
#

@@ -0,0 +1,116 @@
"""
This file contains default functions applied to a data-frame/dataset as pre/post-processing jobs.
The functions are organized as a pipeline, i.e. the data is passed through each function in turn.

Custom functions :
    functions must take 2 arguments (_data,_args) : where _data is a data-frame and _args is an object describing the input parameters
"""
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
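
#
# Sketch of a custom function (illustrative only, not part of the defaults): any callable
# with the (_data,_args) signature below can be referenced from the configuration,
# either by name (if defined in this module) or through a 'path' entry pointing at a file.
#
# def suppress(_data,_args):
#     """
#     :_data  data-frame
#     :_args  list of column names to drop, e.g. ['ssn']
#     """
#     return _data.drop(columns=[name for name in _args if name in _data.columns])
#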


def limit(_data,size):
    """
        ...,{limit:size}
    """

    # size = int(_args['limit'])
    return _data.iloc[:size]
def format(_data,_schema):
    """
    This function enforces a schema against a data-frame; this may or may not work depending on the persistence storage
    :_data  data-frame containing all data
    :_args  schema to enforce on the data, we are expecting the format as a list of {name,type,description}
    """
    return _data

def approximate(_data,_args):
    """
    :_args  Object of {field:type}
    This function will approximate n fields in the data given its distribution
    """
    _m = {'int':int,'float':float,'integer':int,'double':float}
    columns = list(_args.keys())
    for _name in columns :
        if _name not in _data :
            continue
        otype = _args[_name]
        otype = str if otype not in _m else _m[otype]
        _data.loc[:,_name] = np.random.uniform(_data[_name].values).astype(otype)

    return _data
def split_date(_data,_args):
    """
    This function takes a field and applies the format from other fields
    :_data  data-frame
    :_config    configuration entry {column:{format:{in,out},column,type}}
    """
    _columns = list(_args.keys())
    _m = {'int':int,'float':float,'integer':int,'double':float}
    for _name in _columns :
        _iname = _args[_name]['column']
        _iformat = _args[_name]['format']['in']
        _oformat = _args[_name]['format']['out']
        _otype = str if 'type' not in _args[_name]  else _args[_name]['type']
        _data.loc[:,_name] = _data[_iname].apply(lambda _date: datetime.strftime(datetime.strptime(str(_date),_iformat),_oformat)).astype(_otype)
    return _data
def newdate(_data,_args):
    """
    This function creates a new date on a given column from another
    :_data  data frame
    :_args  configuration column:{format,column}
    """
    _columns = list(_args.keys())
    for _name in _columns :

        format = _args[_name]['format']
        ROW_COUNT = _data[_name].size
        if 'column' in _args[_name] :
            srcName = _args[_name]['column']
            years = _data[srcName].values
        else:
            years = np.random.choice(np.arange(datetime.now().year- 90,datetime.now().year),ROW_COUNT)
        _data.loc[:,_name] = [ _makedate(year = years[_index],format = format) for _index in np.arange(ROW_COUNT)]

    return _data
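# Example _args for newdate (illustrative field names):
#   {"birth_datetime":{"format":"%Y-%m-%d","column":"year_of_birth"}}
# rewrites the existing birth_datetime column (the target column must already exist) with a
# random date in each row's year_of_birth; without 'column', years are drawn from the last 90 years.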
def _makedate(**_args):
    """
    This function creates a new date and applies it to a column
    :_data  data-frame with columns
    :_args  arguments for col1:format
    """
    _columns = list(_args.keys())

    # if _args['year'] in ['',None,np.nan] :
    #     year = np.random.choice(np.arange(1920,222),1)
    # else:
    #     year = int(_args['year'])
    year = int(_args['year'])
    offset = _args['offset'] if 'offset' in _args else 0
    month   = np.random.randint(1,13)
    if month == 2:
        _end = 28 if year % 4 != 0 else 29
    else:
        _end = 31 if month in [1,3,5,7,8,10,12] else 30
    day = np.random.randint(1,_end)

    #-- synthetic date
    _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0)
    FORMAT =  '%Y-%m-%d'

    if 'format' in _args:
        FORMAT = _args['format']

    # print ([_name,FORMAT, _date.strftime(FORMAT)])
    r = []
    if offset :
        r = [_date.strftime(FORMAT)]
        for _delta in offset :
            _date = _date + timedelta(_delta)
            r.append(_date.strftime(FORMAT))
        return r
    else:
        return _date.strftime(FORMAT)

@@ -0,0 +1,303 @@
#!/usr/bin/env python3
import json
from transport import factory
import numpy as np
import os
from multiprocessing import Process
import pandas as pd
from google.oauth2 import service_account
import data.maker

from data.params import SYS_ARGS

#
# The configuration array is now loaded and we will execute the pipeline as follows
DATASET='combined20190510'

class Components :

	@staticmethod
	def get(args):
		"""
		This function returns a data-frame provided a bigquery sql statement with conditions (and limits for testing purposes).
		The function should be wrapped in a lambda; this makes testing easier and changing data stores transparent to the rest of the code. (Vital when testing)
		:sql	basic sql statement
		:condition	optional condition and filters
		"""
		SQL = args['sql']
		if 'condition' in args :
			condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')'])
			SQL = " ".join([SQL,'WHERE',condition])

		SQL = SQL.replace(':dataset',args['dataset']) #+ " LIMIT 1000 "
		if 'limit' in args :
			SQL = SQL + ' LIMIT ' + args['limit']
		credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
		df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna()
		return df

		# return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna()
	@staticmethod
	def split(X,MAX_ROWS=3,PART_SIZE=3):

		return list(pd.cut( np.arange(X.shape[0]+1),PART_SIZE).categories)
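	# Illustrative note: for a frame with 9 rows and PART_SIZE=3, pd.cut over np.arange(10)
	# yields 3 intervals whose right edges (~3, 6, 9) become the partition upper bounds used
	# in train() below; MAX_ROWS is not used inside this helper.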
 | 
				
			||||
 | 
				
			||||
	def train(self,**args):
 | 
				
			||||
		"""
 | 
				
			||||
		This function will perform training on the basis of a given pointer that reads data
 | 
				
			||||
 | 
				
			||||
		"""
 | 
				
			||||
		#
 | 
				
			||||
		# @TODO: we need to log something here about the parameters being passed
 | 
				
			||||
		pointer  = args['reader'] if 'reader' in args else lambda: Components.get(**args)
 | 
				
			||||
		df = pointer()
 | 
				
			||||
 | 
				
			||||
		#
 | 
				
			||||
		# Now we can parse the arguments and submit the entire thing to training
 | 
				
			||||
		#
 | 
				
			||||
		
 | 
				
			||||
		logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
 | 
				
			||||
		log_folder = args['logs'] if 'logs' in args else 'logs'
 | 
				
			||||
		_args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger}
 | 
				
			||||
		_args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs'])
 | 
				
			||||
		_args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 
 | 
				
			||||
 | 
				
			||||
		MAX_ROWS = args['max_rows'] 	if 'max_rows' in args else 0
 | 
				
			||||
		PART_SIZE = args['part_size'] 	if 'part_size' in args else 0
 | 
				
			||||
		
 | 
				
			||||
		if df.shape[0] > MAX_ROWS and 'partition' not in args:
 | 
				
			||||
			lbound = 0
 | 
				
			||||
			bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories)
 | 
				
			||||
			# bounds = Components.split(df,MAX_ROWS,PART_SIZE)
 | 
				
			||||
			
 | 
				
			||||
			qwriter = factory.instance(type='queue.QueueWriter',args={'queue':'aou.io'})
 | 
				
			||||
			
 | 
				
			||||
			for b in bounds :
 | 
				
			||||
				part_index = bounds.index(b)
 | 
				
			||||
				ubound = int(b.right)
 | 
				
			||||
				
 | 
				
			||||
					
 | 
				
			||||
				_data =  df.iloc[lbound:ubound][args['columns']]
 | 
				
			||||
				lbound = ubound
 | 
				
			||||
				
 | 
				
			||||
				# _args['logs'] = os.sep.join([log_folder,str(part_index)])
 | 
				
			||||
				_args['partition'] = str(part_index)
 | 
				
			||||
				_args['logger'] = {'args':{'dbname':'aou','doc':args['context']},'type':'mongo.MongoWriter'}
 | 
				
			||||
				#
 | 
				
			||||
				# We should post the partitions to a queue server (at least the processing instructions):
 | 
				
			||||
				#	- where to get the data
 | 
				
			||||
				#	- the arguments to use (partition #, columns, gpu, epochs)
 | 
				
			||||
				#
 | 
				
			||||
				info = {"rows":_data.shape[0],"cols":_data.shape[1], "paritition":part_index,"logs":_args['logs']}
 | 
				
			||||
				p = {"args":_args,"data":_data.to_dict(orient="records"),"info":info}
 | 
				
			||||
				qwriter.write(p)
 | 
				
			||||
				#
 | 
				
			||||
				# @TODO:
 | 
				
			||||
				#	- Notify that information was just posted to the queue
 | 
				
			||||
				info['max_rows'] 	= MAX_ROWS
 | 
				
			||||
				info['part_size'] 	= PART_SIZE
 | 
				
			||||
				logger.write({"module":"train","action":"setup-partition","input":info})
 | 
				
			||||
			
 | 
				
			||||
			pass
 | 
				
			||||
		else:
 | 
				
			||||
			partition = args['partition'] if 'partition' in args else ''
 | 
				
			||||
			log_folder = os.sep.join([log_folder,args['context'],partition])
 | 
				
			||||
			_args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger}
 | 
				
			||||
			_args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs'])
 | 
				
			||||
			_args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 
 | 
				
			||||
			os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0'
 | 
				
			||||
			
 | 
				
			||||
			_args['data'] = df
 | 
				
			||||
			#
 | 
				
			||||
			# @log :
 | 
				
			||||
			#	Logging information about the training process for this partition (or not)
 | 
				
			||||
			#
 | 
				
			||||
			info = {"rows":df.shape[0],"cols":df.shape[1], "partition":partition,"logs":_args['logs']}
 | 
				
			||||
			logger.write({"module":"train","action":"train","input":info})
 | 
				
			||||
			data.maker.train(**_args)
 | 
				
			||||
 | 
				
			||||
		pass
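	# For reference (illustrative only): each message posted to the 'aou.io' queue above is a
	# plain dict of the form
	#   {"args": {...training arguments, including 'partition' and 'logger'...},
	#    "data": [...records of the data slice...],
	#    "info": {"rows": n, "cols": m, "partition": k, "logs": "..."}}
	# and callback() below rebuilds the DataFrame from "data" before re-entering train().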
 | 
				
			||||
		
 | 
				
			||||
	# @staticmethod
 | 
				
			||||
	def generate(self,args):
 | 
				
			||||
		"""
 | 
				
			||||
		This function will generate data and store it in the target data store (CSV files or BigQuery tables).
 | 
				
			||||
		"""
 | 
				
			||||
		logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
 | 
				
			||||
		log_folder = args['logs'] if 'logs' in args else 'logs'
 | 
				
			||||
		partition = args['partition'] if 'partition' in args else ''
 | 
				
			||||
		log_folder = os.sep.join([log_folder,args['context'],partition])
 | 
				
			||||
		_args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger}
 | 
				
			||||
		_args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs'])
 | 
				
			||||
		_args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 
 | 
				
			||||
		os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0'
 | 
				
			||||
		_args['no_value']= args['no_value']
 | 
				
			||||
		MAX_ROWS = args['max_rows']  	if 'max_rows' in args else 0
 | 
				
			||||
		PART_SIZE = args['part_size'] 	if 'part_size' in args else 0
 | 
				
			||||
		
 | 
				
			||||
		# credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
 | 
				
			||||
		# _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna()
 | 
				
			||||
		reader = args['reader']
 | 
				
			||||
		df = reader()
 | 
				
			||||
		if 'partition' in args :
 | 
				
			||||
			bounds = Components.split(df,MAX_ROWS,PART_SIZE)
 | 
				
			||||
			# bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories)
 | 
				
			||||
			lbound = int(bounds[int(partition)].left)
 | 
				
			||||
			ubound = int(bounds[int(partition)].right)
 | 
				
			||||
			df = df.iloc[lbound:ubound]
 | 
				
			||||
		_args['data'] = df
 | 
				
			||||
			# _args['data'] = reader()
 | 
				
			||||
		#_args['data'] = _args['data'].astype(object)
 | 
				
			||||
		_args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 
 | 
				
			||||
		_dc = data.maker.generate(**_args) 
 | 
				
			||||
		#
 | 
				
			||||
		# We need to post the generated data in order to:
 | 
				
			||||
		#	1. compare immediately
 | 
				
			||||
		#	2. synthetic copy
 | 
				
			||||
		#
 | 
				
			||||
		
 | 
				
			||||
		cols = _dc.columns.tolist()
 | 
				
			||||
		
 | 
				
			||||
		data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io')				#-- will be used for comparison (store this in big query)
 | 
				
			||||
		base_cols = list(set(_args['data'].columns) - set(args['columns']))	#-- rebuild the dataset (and store it)
 | 
				
			||||
		
 | 
				
			||||
		for name in cols :
 | 
				
			||||
			_args['data'][name] = _dc[name]
 | 
				
			||||
			info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}}
 | 
				
			||||
			if partition != '' :
 | 
				
			||||
				info['partition'] = partition
 | 
				
			||||
			logger.write(info)
 | 
				
			||||
			# filename = os.sep.join([log_folder,'output',name+'.csv'])
 | 
				
			||||
			# data_comp[[name]].to_csv(filename,index=False)
 | 
				
			||||
 | 
				
			||||
		#
 | 
				
			||||
		#-- Let us store all of this into bigquery
 | 
				
			||||
		prefix = args['notify']+'.'+_args['context']
 | 
				
			||||
		table = '_'.join([prefix,partition,'io']).replace('__','_')
 | 
				
			||||
		folder = os.sep.join([args['logs'],args['context'],partition,'output']) 
 | 
				
			||||
		if 'file' in args :
 | 
				
			||||
			
 | 
				
			||||
			_fname = os.sep.join([folder,table.replace('_io','_full_io.csv')])
 | 
				
			||||
			_pname = os.sep.join([folder,table])+'.csv'
 | 
				
			||||
			data_comp.to_csv( _pname,index=False)
 | 
				
			||||
			_args['data'].to_csv(_fname,index=False)
 | 
				
			||||
			
 | 
				
			||||
			
 | 
				
			||||
		else:
 | 
				
			||||
			credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
 | 
				
			||||
			_pname = os.sep.join([folder,table+'.csv'])
 | 
				
			||||
			_fname = table.replace('_io','_full_io')
 | 
				
			||||
			data_comp.to_gbq(destination_table=table,credentials=credentials,if_exists='replace',chunksize=50000)
 | 
				
			||||
			data_comp.to_csv(_pname,index=False)
 | 
				
			||||
			INSERT_FLAG = 'replace' if 'partition' not in args else 'append'	
 | 
				
			||||
			_args['data'].to_gbq(destination_table=_fname,credentials=credentials,if_exists=INSERT_FLAG,chunksize=50000)
 | 
				
			||||
		
 | 
				
			||||
		info = {"full":{"path":_fname,"rows":_args['data'].shape[0]},"compare":{"name":_pname,"rows":data_comp.shape[0]} }
 | 
				
			||||
		if partition :
 | 
				
			||||
			info ['partition'] = partition
 | 
				
			||||
		logger.write({"module":"generate","action":"write","info":info} )
 | 
				
			||||
	@staticmethod
 | 
				
			||||
	def callback(channel,method,header,stream):
 | 
				
			||||
		
 | 
				
			||||
		info = json.loads(stream)
 | 
				
			||||
		logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':SYS_ARGS['context']})
 | 
				
			||||
		
 | 
				
			||||
		logger.write({'module':'process','action':'read-partition','input':info['info']})
 | 
				
			||||
		df = pd.DataFrame(info['data'])
 | 
				
			||||
		args = info['args']
 | 
				
			||||
		if int(args['num_gpu']) > 1 and args['gpu'] > 0:
 | 
				
			||||
			args['gpu'] = args['gpu'] + args['num_gpu']
 | 
				
			||||
		args['reader'] = lambda: df
 | 
				
			||||
		#
 | 
				
			||||
		# @TODO: Fix
 | 
				
			||||
		# 	There is an inconsistency in column/columns ... fix this shit!
 | 
				
			||||
		#
 | 
				
			||||
		args['columns'] = args['column']		
 | 
				
			||||
		(Components()).train(**args)		
 | 
				
			||||
		logger.write({"module":"process","action":"exit","info":info["info"]})
 | 
				
			||||
		channel.close()
 | 
				
			||||
		channel.connection.close()
 | 
				
			||||
		pass
 | 
				
			||||
	
 | 
				
			||||
if __name__ == '__main__' :
 | 
				
			||||
	filename = SYS_ARGS['config'] if 'config' in SYS_ARGS else 'config.json'
 | 
				
			||||
	f = open (filename)
 | 
				
			||||
	PIPELINE = json.loads(f.read())
 | 
				
			||||
	f.close()	
 | 
				
			||||
	index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else 0
 | 
				
			||||
	
 | 
				
			||||
	args =  (PIPELINE[index])
 | 
				
			||||
	args['dataset'] = 'combined20190510'
 | 
				
			||||
	args = dict(args,**SYS_ARGS)
 | 
				
			||||
	args['max_rows'] = int(args['max_rows']) 	if 'max_rows' in args else 3
 | 
				
			||||
	args['part_size']= int(args['part_size']) 	if 'part_size' in args else 3
 | 
				
			||||
 | 
				
			||||
	#
 | 
				
			||||
	# @TODO:
 | 
				
			||||
	#	Log what was initiated so we have context of this processing ...
 | 
				
			||||
	#
 | 
				
			||||
	if 'listen' not in SYS_ARGS :
 | 
				
			||||
		if 'file' in args :
 | 
				
			||||
			reader = lambda: pd.read_csv(args['file'])
 | 
				
			||||
		else:
 | 
				
			||||
			reader = lambda: Components().get(args)
 | 
				
			||||
		args['reader'] = reader
 | 
				
			||||
	
 | 
				
			||||
	if 'generate' in SYS_ARGS :
 | 
				
			||||
		#
 | 
				
			||||
		# Let us see if we have partitions given the log folder
 | 
				
			||||
		
 | 
				
			||||
		content = os.listdir( os.sep.join([args['logs'],args['context']]))
 | 
				
			||||
		generator = Components()
 | 
				
			||||
		if ''.join(content).isnumeric() :
 | 
				
			||||
			#
 | 
				
			||||
			# we have partitions we are working with
 | 
				
			||||
			
 | 
				
			||||
			for id in ''.join(content) :
 | 
				
			||||
				args['partition'] = id
 | 
				
			||||
				
 | 
				
			||||
				generator.generate(args)
 | 
				
			||||
		else:
 | 
				
			||||
			generator.generate(args)
 | 
				
			||||
		# Components.generate(args)
 | 
				
			||||
	elif 'listen' in args :
 | 
				
			||||
		#
 | 
				
			||||
		# This will start a worker just in case to listen to a queue
 | 
				
			||||
		if 'read' in SYS_ARGS :
 | 
				
			||||
			QUEUE_TYPE = 'queue.QueueReader'
 | 
				
			||||
			pointer = lambda qreader: qreader.read(1)
 | 
				
			||||
		else:
 | 
				
			||||
			QUEUE_TYPE = 'queue.QueueListener'
 | 
				
			||||
			pointer = lambda qlistener: qlistener.listen()
 | 
				
			||||
		N = int(SYS_ARGS['jobs']) if 'jobs' in SYS_ARGS else 1 
 | 
				
			||||
		
 | 
				
			||||
		qhandlers = [factory.instance(type=QUEUE_TYPE,args={'queue':'aou.io'}) for i in np.arange(N)]
 | 
				
			||||
		jobs = []
 | 
				
			||||
		for qhandler in qhandlers :
 | 
				
			||||
			qhandler.callback = Components.callback
 | 
				
			||||
			job = Process(target=pointer,args=(qhandler,))
 | 
				
			||||
			job.start()
 | 
				
			||||
			jobs.append(job)
 | 
				
			||||
		#
 | 
				
			||||
		# let us wait for the jobs
 | 
				
			||||
		print (["Started ",len(jobs)," trainers"])
 | 
				
			||||
		while len(jobs) > 0 :
 | 
				
			||||
			
 | 
				
			||||
			jobs = [job for job in jobs if job.is_alive()]
 | 
				
			||||
		
 | 
				
			||||
		# pointer(qhandler)
 | 
				
			||||
 | 
				
			||||
 | 
				
			||||
		# qreader.read(1)
 | 
				
			||||
		pass
 | 
				
			||||
	else:
 | 
				
			||||
		
 | 
				
			||||
		trainer = Components()
 | 
				
			||||
		trainer.train(**args)
 | 
				
			||||
		# Components.train(**args)
 | 
				
			||||
#for args in PIPELINE :
 | 
				
			||||
	#args['dataset'] = 'combined20190510'
 | 
				
			||||
	#process = Process(target=Components.train,args=(args,))
 | 
				
			||||
	#process.name = args['context']
 | 
				
			||||
	#process.start()
 | 
				
			||||
#	Components.train(args)
 | 
				
			||||
@ -0,0 +1,692 @@
 | 
				
			||||
#!/usr/bin/env python3
 | 
				
			||||
import json
 | 
				
			||||
from transport import factory
 | 
				
			||||
import numpy as np
 | 
				
			||||
import time
 | 
				
			||||
import os
 | 
				
			||||
from multiprocessing import Process, Lock
 | 
				
			||||
import pandas as pd
 | 
				
			||||
from google.oauth2 import service_account
 | 
				
			||||
from google.cloud import bigquery as bq
 | 
				
			||||
import data.maker
 | 
				
			||||
import copy
 | 
				
			||||
from data.params import SYS_ARGS 
 | 
				
			||||
                 
 | 
				
			||||
#
 | 
				
			||||
# The configuration array is now loaded and we will execute the pipe line as follows
 | 
				
			||||
 | 
				
			||||
class Components :
 | 
				
			||||
	lock = Lock()
 | 
				
			||||
	class KEYS :
 | 
				
			||||
		PIPELINE_KEY = 'pipeline'
 | 
				
			||||
		SQL_FILTER = 'filter'
 | 
				
			||||
	@staticmethod
 | 
				
			||||
	def get_filter (**args):
 | 
				
			||||
		if args['qualifier'] == 'IN' :
 | 
				
			||||
			return ' '.join([args['field'],args['qualifier'],'(',args['value'],')'])
 | 
				
			||||
		else:
 | 
				
			||||
			return ' '.join([args['field'],args['qualifier'],args['value']])
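	# Example (sketch): get_filter(field='person_id',qualifier='IN',value='1,2,3') renders
	# "person_id IN ( 1,2,3 )", while get_filter(field='year_of_birth',qualifier='>',value='1980')
	# renders "year_of_birth > 1980"; get() below AND-joins several such filters.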
 | 
				
			||||
	@staticmethod
 | 
				
			||||
	def get_logger(**args) :
 | 
				
			||||
		return factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
 | 
				
			||||
	@staticmethod
 | 
				
			||||
	def get(args):
 | 
				
			||||
		"""
 | 
				
			||||
		This function returns a data frame given a BigQuery SQL statement with optional conditions (and limits for testing purposes)
 | 
				
			||||
		The function should be wrapped in a lambda; this makes testing easier and keeps the choice of data store transparent to the rest of the code. (Vital when testing)
 | 
				
			||||
		:sql	basic sql statement
 | 
				
			||||
		:condition	optional condition and filters
 | 
				
			||||
		"""
 | 
				
			||||
		SQL = args['sql']
 | 
				
			||||
		if Components.KEYS.SQL_FILTER in args :
 | 
				
			||||
			FILTER_KEY = Components.KEYS.SQL_FILTER
 | 
				
			||||
			SQL_FILTER = args[FILTER_KEY] if type(args[FILTER_KEY]) == list else [args[FILTER_KEY]]
 | 
				
			||||
			# condition = ' '.join([args[FILTER_KEY]['field'],args[FILTER_KEY]['qualifier'],'(',args[FILTER_KEY]['value'],')'])
 | 
				
			||||
			
 | 
				
			||||
			condition = ' AND '.join([Components.get_filter(**item) for item in SQL_FILTER])
 | 
				
			||||
			SQL = " ".join([SQL,'WHERE',condition])
 | 
				
			||||
 | 
				
			||||
		SQL = SQL.replace(':dataset',args['dataset']) #+ " LI "
 | 
				
			||||
		
 | 
				
			||||
		if 'limit' in args :
 | 
				
			||||
			SQL = SQL + ' LIMIT ' + args['limit']
 | 
				
			||||
		#
 | 
				
			||||
		# let's log the sql query that has been performed here
 | 
				
			||||
		logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
 | 
				
			||||
		logger.write({"module":"bigquery","action":"read","input":{"sql":SQL}})
 | 
				
			||||
		credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
 | 
				
			||||
		df = pd.read_gbq(SQL,credentials=credentials,dialect='standard')
 | 
				
			||||
		return df
 | 
				
			||||
		
 | 
				
			||||
		# return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna()
 | 
				
			||||
	@staticmethod
 | 
				
			||||
	def split(X,MAX_ROWS=3,PART_SIZE=3):
 | 
				
			||||
		
 | 
				
			||||
		return list(pd.cut( np.arange(X.shape[0]+1),PART_SIZE).categories)
 | 
				
			||||
	def format_schema(self,schema):
 | 
				
			||||
		_schema = {}
 | 
				
			||||
		for _item in schema :                
 | 
				
			||||
			_type = int
 | 
				
			||||
			_value = 0
 | 
				
			||||
			if _item.field_type == 'FLOAT' :
 | 
				
			||||
				_type =float
 | 
				
			||||
			elif _item.field_type != 'INTEGER' :
 | 
				
			||||
				_type = str
 | 
				
			||||
				_value = ''
 | 
				
			||||
			_schema[_item.name] = _type	
 | 
				
			||||
		return _schema	
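	# Sketch (field names are illustrative): for a BigQuery schema like
	#   [person_id:INTEGER, value_as_number:FLOAT, value_source_value:STRING]
	# format_schema returns {'person_id': int, 'value_as_number': float, 'value_source_value': str};
	# any type other than INTEGER/FLOAT maps to str.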
 | 
				
			||||
	def get_ignore(self,**_args) :
 | 
				
			||||
		if 'columns' in _args and 'data' in _args :
 | 
				
			||||
			_df = _args['data']
 | 
				
			||||
			terms = _args['columns']
 | 
				
			||||
			return [name for name in _df.columns if  np.sum( [int(field in  name )for field in terms ])  ]
 | 
				
			||||
			
 | 
				
			||||
		return []
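	# Sketch: matching is by substring, e.g. get_ignore(data=df, columns=['_source_value']) on a
	# frame with columns ['person_id','gender_source_value','year_of_birth'] returns
	# ['gender_source_value']; without both 'data' and 'columns' it returns [].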
 | 
				
			||||
	def set_gpu(self,**_args) :
 | 
				
			||||
		if 'gpu' in _args :
 | 
				
			||||
			gpu = _args['gpu'] if type(_args['gpu']) != str else [_args['gpu']]
 | 
				
			||||
			_index = str(gpu[0])
 | 
				
			||||
			os.environ['CUDA_VISIBLE_DEVICES'] = _index
 | 
				
			||||
			return gpu
 | 
				
			||||
		else :
 | 
				
			||||
			return None
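	# Sketch: set_gpu(gpu=[1]) (or gpu='1') pins the process to that chip by exporting
	# CUDA_VISIBLE_DEVICES='1' and returns the list; with no 'gpu' argument it is a no-op returning None.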
 | 
				
			||||
	def train(self,**args):
 | 
				
			||||
		"""
 | 
				
			||||
		This function will perform training on the basis of a given pointer that reads data
 | 
				
			||||
 | 
				
			||||
		"""
 | 
				
			||||
		schema = None
 | 
				
			||||
		if 'file' in args :
 | 
				
			||||
			
 | 
				
			||||
			df = pd.read_csv(args['file'])
 | 
				
			||||
			del args['file']
 | 
				
			||||
		elif 'data' not in args :
 | 
				
			||||
			
 | 
				
			||||
			reader = factory.instance(**args['store']['source'])
 | 
				
			||||
			
 | 
				
			||||
			
 | 
				
			||||
			if 'row_limit' in args :
 | 
				
			||||
				df = reader.read(sql=args['sql'],limit=args['row_limit'])
 | 
				
			||||
			else:
 | 
				
			||||
				df = reader.read(sql=args['sql'])		
 | 
				
			||||
			schema = reader.meta(table=args['from']) if hasattr(reader,'meta') and 'from' in args else None
 | 
				
			||||
		else:
 | 
				
			||||
			df = args['data']
 | 
				
			||||
		
 | 
				
			||||
		#
 | 
				
			||||
		#
 | 
				
			||||
		# df = df.fillna('')
 | 
				
			||||
		if schema :
 | 
				
			||||
			_schema = []
 | 
				
			||||
			for _item in schema :                
 | 
				
			||||
				_type = int
 | 
				
			||||
				_value = 0
 | 
				
			||||
				if _item.field_type == 'FLOAT' :
 | 
				
			||||
					_type =float
 | 
				
			||||
				elif _item.field_type != 'INTEGER' :
 | 
				
			||||
					_type = str
 | 
				
			||||
					_value = ''
 | 
				
			||||
				_schema += [{"name":_item.name,"type":_item.field_type}]
 | 
				
			||||
				df[_item.name] = df[_item.name].fillna(_value).astype(_type)
 | 
				
			||||
			args['schema'] = _schema
 | 
				
			||||
		# 		df[_item.name] = df[_item.name].astype(_type)
 | 
				
			||||
		_args = copy.deepcopy(args)
 | 
				
			||||
		# _args['store']  = args['store']['source']
 | 
				
			||||
		_args['data'] = df
 | 
				
			||||
		#
 | 
				
			||||
		# The columns that are continuous should also be skipped because they don't need to be synthesized the same way
 | 
				
			||||
		if 'continuous' in args :
 | 
				
			||||
			x_cols = args['continuous']
 | 
				
			||||
		else:
 | 
				
			||||
			x_cols = []
 | 
				
			||||
 | 
				
			||||
		if 'ignore' in args and 'columns' in args['ignore'] :
 | 
				
			||||
			_cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
 | 
				
			||||
			_args['data'] = df[ list(set(df.columns)- set(_cols))]
 | 
				
			||||
		#
 | 
				
			||||
		# We need to make sure that continuous columns are removed 
 | 
				
			||||
		if x_cols :
 | 
				
			||||
			_args['data'] = _args['data'][list(set(_args['data'].columns) - set(x_cols))]
 | 
				
			||||
		if 'gpu' in args :
 | 
				
			||||
			_args['gpu'] = self.set_gpu(gpu=args['gpu'])
 | 
				
			||||
		if 'partition' in args :
 | 
				
			||||
			_args['partition'] = args['partition']
 | 
				
			||||
		if df.shape[0] and df.shape[1] :
 | 
				
			||||
			#
 | 
				
			||||
			# We have a full blown matrix to be processed 
 | 
				
			||||
			print ('-- Training --')
 | 
				
			||||
			data.maker.train(**_args)
 | 
				
			||||
		else:
 | 
				
			||||
			print ("... skipping training !!")
 | 
				
			||||
		
 | 
				
			||||
		if 'autopilot' in ( list(args.keys())) :
 | 
				
			||||
			
 | 
				
			||||
			args['data'] = df
 | 
				
			||||
			print (['autopilot mode enabled ....',args['context']])
 | 
				
			||||
			self.generate(args)
 | 
				
			||||
 | 
				
			||||
		pass
 | 
				
			||||
 | 
				
			||||
	def approximate(self,values):
 | 
				
			||||
		"""
 | 
				
			||||
		:param values	array of values to be approximated
 | 
				
			||||
		"""
 | 
				
			||||
		if values.dtype in [int,float] :
 | 
				
			||||
			#
 | 
				
			||||
			# @TODO: create bins?
 | 
				
			||||
			r = np.random.dirichlet(values+.001) #-- dirichlet doesn't work on values with zeros
 | 
				
			||||
			_sd = values[values > 0].std()
 | 
				
			||||
			_me = values[values > 0].mean()
 | 
				
			||||
			_mi = values.min()
 | 
				
			||||
			x = []
 | 
				
			||||
			_type = values.dtype
 | 
				
			||||
			for index in np.arange(values.size) :
 | 
				
			||||
				
 | 
				
			||||
				if np.random.choice([0,1],1)[0] :
 | 
				
			||||
					value = values[index] + (values[index] * r[index])
 | 
				
			||||
					
 | 
				
			||||
				else :
 | 
				
			||||
					value = values[index] - (values[index] * r[index])
 | 
				
			||||
				#
 | 
				
			||||
				# randomly shifting the measurements 
 | 
				
			||||
				if np.random.choice([0,1],1)[0] and _me > _sd :
 | 
				
			||||
					if np.random.choice([0,1],1)[0] :
 | 
				
			||||
						value = value * np.divide(_me,_sd)
 | 
				
			||||
					else:
 | 
				
			||||
						value = value + (np.divide(_me,_sd))
 | 
				
			||||
				value = int(value) if _type == int else np.round(value,2)
 | 
				
			||||
				x.append( value)
 | 
				
			||||
			np.random.shuffle(x)
 | 
				
			||||
			return np.array(x)
 | 
				
			||||
		else:
 | 
				
			||||
			return values
 | 
				
			||||
		pass
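	# Sketch (numbers are illustrative): approximate(np.array([10,20,30])) perturbs each entry by a
	# Dirichlet-weighted fraction of itself (randomly added or subtracted), occasionally rescales by
	# mean/sd, casts back to int (or rounds to 2 decimals for floats) and shuffles the result, so the
	# output keeps the rough scale of the input without reproducing the original values or order.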
 | 
				
			||||
			
 | 
				
			||||
	def shuffle(self,args):
 | 
				
			||||
		if 'data' in args :
 | 
				
			||||
			df = args['data']
 | 
				
			||||
		else:
 | 
				
			||||
			reader = factory.instance(**args['store']['source'])
 | 
				
			||||
			if 'file' in args :
 | 
				
			||||
				df = pd.read_csv(args['file'])
 | 
				
			||||
			elif 'data' in args :
 | 
				
			||||
				df = args['data']
 | 
				
			||||
			else:
 | 
				
			||||
				if 'row_limit' in args and 'sql' in args:
 | 
				
			||||
					df = reader.read(sql=args['sql'],limit=args['row_limit'])
 | 
				
			||||
				else:
 | 
				
			||||
					df = reader.read(sql=args['sql'])
 | 
				
			||||
			schema = None
 | 
				
			||||
			if 'schema' not in args and hasattr(reader,'meta') and 'file' not in args:
 | 
				
			||||
				schema = reader.meta(table=args['from'])
 | 
				
			||||
				schema = [{"name":_item.name,"type":_item.field_type} for _item in schema]
 | 
				
			||||
		#
 | 
				
			||||
		# We are shuffling the designated columns and approximating the others
 | 
				
			||||
		#
 | 
				
			||||
		x_cols 	= []	#-- columns to be approximated.
 | 
				
			||||
		_cols 	= []	#-- columns to be ignored
 | 
				
			||||
		if 'continuous' in args :
 | 
				
			||||
			x_cols = args['continuous']
 | 
				
			||||
		if 'ignore' in args and 'columns' in args['ignore'] :
 | 
				
			||||
			_cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
 | 
				
			||||
		
 | 
				
			||||
		columns = args['columns'] if 'columns' in args else df.columns
 | 
				
			||||
		columns = list(set(columns) - set(_cols))
 | 
				
			||||
		for name in columns:
 | 
				
			||||
			i = np.arange(df.shape[0])
 | 
				
			||||
			np.random.shuffle(i)
 | 
				
			||||
			if name in x_cols :
 | 
				
			||||
				if df[name].unique().size > 0 :
 | 
				
			||||
					df[name] = self.approximate(df.iloc[i][name].fillna(0).values)
 | 
				
			||||
			# df[name] = df[name].astype(str)
 | 
				
			||||
		# 	pass
 | 
				
			||||
			
 | 
				
			||||
		df.index = np.arange(df.shape[0])		
 | 
				
			||||
		self.post(data=df,schema=schema,store=args['store']['target'])	
 | 
				
			||||
	def post(self,**_args) :
 | 
				
			||||
		table = _args['from'] if 'from' in _args else _args['store']['table']
 | 
				
			||||
		_schema = _args['schema'] if 'schema' in _args else None
 | 
				
			||||
		writer 	= factory.instance(**_args['store'])
 | 
				
			||||
		_df 	= _args['data']
 | 
				
			||||
		if _schema :
 | 
				
			||||
			columns = []
 | 
				
			||||
			for _item in _schema :
 | 
				
			||||
				name = _item['name']
 | 
				
			||||
				_type = str
 | 
				
			||||
				_value = 0
 | 
				
			||||
				if _item['type'] in ['DATE','TIMESTAMP','DATETIMESTAMP','DATETIME'] :
 | 
				
			||||
					if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] :
 | 
				
			||||
						#
 | 
				
			||||
						# There is an issue with missing dates that needs to be resolved.
 | 
				
			||||
						# for some reason a missing date/time here will cause the types to turn into timestamp (problem)
 | 
				
			||||
						#	The following is a hack to address the issue (alas) assuming 10 digit dates and 'NaT' replaces missing date values (pandas specifications)
 | 
				
			||||
						#
 | 
				
			||||
						_df[name] = _df[name].apply(lambda value: None if str(value) == 'NaT' else (str(value)[:10]) if _item['type'] in ['DATE','DATETIME'] else str(value))
 | 
				
			||||
						#_df[name] = _df[name].dt.date
 | 
				
			||||
						# _df[name] = pd.to_datetime(_df[name].fillna(''),errors='coerce')
 | 
				
			||||
					else:
 | 
				
			||||
 | 
				
			||||
						_df[name] = pd.to_datetime(_df[name])
 | 
				
			||||
				else:
 | 
				
			||||
					_value = 0
 | 
				
			||||
					if _item['type'] == 'INTEGER' :
 | 
				
			||||
						_type = np.int64
 | 
				
			||||
					elif _item['type'] in ['FLOAT','NUMERIC']:
 | 
				
			||||
						_type = np.float64
 | 
				
			||||
					else:
 | 
				
			||||
						
 | 
				
			||||
						_value = ''
 | 
				
			||||
					_df[name] = _df[name].fillna(_value) #.astype(_type)
 | 
				
			||||
				columns.append(name)
 | 
				
			||||
		
 | 
				
			||||
		fields = _df.columns.tolist()
 | 
				
			||||
		if not writer.has(table=table) and _args['store']['provider'] != 'bigquery':
 | 
				
			||||
			
 | 
				
			||||
			_map = {'STRING':'VARCHAR(256)','INTEGER':'BIGINT'} if 'provider' in _args['store'] and _args['store']['provider'] != 'bigquery' else {}
 | 
				
			||||
			_params = {'map':_map,'table':table}
 | 
				
			||||
			if _schema :
 | 
				
			||||
				_params['schema'] = _schema
 | 
				
			||||
				
 | 
				
			||||
			else:
 | 
				
			||||
				_params['fields'] = fields
 | 
				
			||||
			
 | 
				
			||||
			writer.make(**_params)
 | 
				
			||||
			
 | 
				
			||||
		fields = _df.columns.tolist()
 | 
				
			||||
		_df = _df[fields]
 | 
				
			||||
		# writer.fields = fields
 | 
				
			||||
		if _args['store']['provider'] == 'bigquery' :
 | 
				
			||||
			print (['_______ POSTING ______________ ',table])
 | 
				
			||||
			print (['_______________ ',_df.shape[0],' ___________________'])
 | 
				
			||||
			writer.write(_df.astype(object),schema=_schema,table=table)
 | 
				
			||||
		else:
 | 
				
			||||
			writer.table = table	
 | 
				
			||||
			writer.write(_df)
 | 
				
			||||
		# else:
 | 
				
			||||
		# 	writer.write(_df,table=args['from'])
 | 
				
			||||
		
 | 
				
			||||
 | 
				
			||||
	def finalize(self,args):
 | 
				
			||||
		"""
 | 
				
			||||
		This function performs post-processing operations on a synthetic table, i.e.:
 | 
				
			||||
			- remove duplicate keys
 | 
				
			||||
			- remove orphaned keys
 | 
				
			||||
		"""
 | 
				
			||||
		reader = factory.instance(**args['store']['source'])
 | 
				
			||||
		logger = factory.instance(**args['store']['logs'])
 | 
				
			||||
		
 | 
				
			||||
		target = args['store']['target']['args']['dataset'] 
 | 
				
			||||
		source = args['store']['source']['args']['dataset'] 
 | 
				
			||||
		table = args['from']
 | 
				
			||||
		schema = reader.meta(table=args['from'])
 | 
				
			||||
		#
 | 
				
			||||
		# keys :
 | 
				
			||||
		unique_field = "_".join([args['from'],'id']) if 'unique_fields' not in args else args['unique_fields']
 | 
				
			||||
		fields = [ item.name if item.name != unique_field else "y."+item.name for item in schema]
 | 
				
			||||
		SQL = [
 | 
				
			||||
			"SELECT :fields FROM ",
 | 
				
			||||
			"(SELECT ROW_NUMBER() OVER() AS row_number,* FROM :target.:table) x","INNER JOIN",
 | 
				
			||||
			"(SELECT ROW_NUMBER() OVER() AS row_number, :unique_field FROM :source.:table ORDER BY RAND()) y",
 | 
				
			||||
			"ON y.row_number = x.row_number"
 | 
				
			||||
		]
 | 
				
			||||
		SQL = " ".join(SQL).replace(":fields",",".join(fields)).replace(":table",table).replace(":source",source).replace(":target",target)
 | 
				
			||||
		SQL = SQL.replace(":unique_field",unique_field)
 | 
				
			||||
		#
 | 
				
			||||
		# Use a native job to get this done ...
 | 
				
			||||
		#
 | 
				
			||||
		client      = bq.Client.from_service_account_json(args['store']['source']['args']["private_key"])
 | 
				
			||||
		job = bq.QueryJobConfig()
 | 
				
			||||
		job.destination = client.dataset(target).table(table)
 | 
				
			||||
		job.use_query_cache = True
 | 
				
			||||
		job.allow_large_results = True
 | 
				
			||||
		# job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
 | 
				
			||||
		job.write_disposition = "WRITE_TRUNCATE"
 | 
				
			||||
		job.priority = 'BATCH'
 | 
				
			||||
		r = client.query(SQL,location='US',job_config=job)		
 | 
				
			||||
		logger.write({"job":r.job_id,"action":"finalize", "args":{"sql":SQL,"source":"".join([source,table]),"destimation":".".join([target,table])}})
 | 
				
			||||
		#
 | 
				
			||||
		# Keep a log of what just happened...
 | 
				
			||||
		#
 | 
				
			||||
		otable = ".".join([args['store']['source']['args']['dataset'],args['from']])
 | 
				
			||||
		dtable = ".".join([args['store']['target']['args']['dataset'],args['from']])
 | 
				
			||||
	def generate(self,args):
 | 
				
			||||
		"""
 | 
				
			||||
		This function will generate data and write it to the configured target data store.
 | 
				
			||||
		"""
 | 
				
			||||
		store = args['store']['logs']
 | 
				
			||||
		if 'args' in store :
 | 
				
			||||
			store['args']['doc'] = args['context']
 | 
				
			||||
		else:
 | 
				
			||||
			store['doc'] = args['context']
 | 
				
			||||
		logger = factory.instance(**store) #type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
 | 
				
			||||
 | 
				
			||||
		ostore = args['store']['target']
 | 
				
			||||
		writer = factory.instance(**ostore)
 | 
				
			||||
 | 
				
			||||
		schema = args['schema'] if 'schema' in args else None
 | 
				
			||||
		if 'data' in args :
 | 
				
			||||
			
 | 
				
			||||
			df = args['data']
 | 
				
			||||
		else:
 | 
				
			||||
 | 
				
			||||
			reader = factory.instance(**args['store']['source'])
 | 
				
			||||
			if 'row_limit' in args :
 | 
				
			||||
				df = reader.read(sql=args['sql'],limit=args['row_limit'])
 | 
				
			||||
			else:
 | 
				
			||||
				df = reader.read(sql=args['sql'])
 | 
				
			||||
			if 'schema' not in args and hasattr(reader,'meta'):
 | 
				
			||||
				schema = reader.meta(table=args['from'])
 | 
				
			||||
				schema = [{"name":_item.name,"type":_item.field_type} for _item in schema]
 | 
				
			||||
 | 
				
			||||
			# else:
 | 
				
			||||
			# 	#
 | 
				
			||||
			# 	# This will account for autopilot mode ...
 | 
				
			||||
			# 	df = args['data']
 | 
				
			||||
		_cast = {}
 | 
				
			||||
		if schema :
 | 
				
			||||
			
 | 
				
			||||
			for _item in schema :
 | 
				
			||||
				dtype = str
 | 
				
			||||
				name = _item['name']
 | 
				
			||||
				novalue = 0
 | 
				
			||||
				if _item['type'] in ['INTEGER','NUMERIC']:
 | 
				
			||||
					dtype = np.int64
 | 
				
			||||
					
 | 
				
			||||
				elif _item['type'] == 'FLOAT' :
 | 
				
			||||
					dtype = np.float64
 | 
				
			||||
				else:
 | 
				
			||||
					novalue = ''
 | 
				
			||||
				# _cast[schema['name']] = dtype
 | 
				
			||||
				df[name] = df[name].fillna(novalue).astype(dtype)
 | 
				
			||||
 | 
				
			||||
		_info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[1]},"schema":schema}
 | 
				
			||||
		logger.write(_info)
 | 
				
			||||
		
 | 
				
			||||
				
 | 
				
			||||
		_dc = pd.DataFrame()
 | 
				
			||||
		# for mdf in df :
 | 
				
			||||
		args['data'] = df.copy()
 | 
				
			||||
		#
 | 
				
			||||
		# The columns that are continuous should also be skipped because they don't need to be synthesized the same way
 | 
				
			||||
		if 'continuous' in args :
 | 
				
			||||
			x_cols = args['continuous']
 | 
				
			||||
		else:
 | 
				
			||||
			x_cols = []
 | 
				
			||||
 | 
				
			||||
		if 'ignore' in args and 'columns' in args['ignore'] :
 | 
				
			||||
			_cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
 | 
				
			||||
			args['data'] = args['data'][ list(set(df.columns)- set(_cols))]
 | 
				
			||||
		#
 | 
				
			||||
		# We need to remove the continuous columns from the data-frame
 | 
				
			||||
		# @TODO: Abstract this !!
 | 
				
			||||
		#
 | 
				
			||||
		real_df = pd.DataFrame()
 | 
				
			||||
		if x_cols :
 | 
				
			||||
			args['data'] = args['data'][list(set(args['data'].columns) - set(x_cols))]
 | 
				
			||||
			real_df = df[x_cols].copy()
 | 
				
			||||
		
 | 
				
			||||
		args['candidates']	= 1 if 'candidates' not in args else int(args['candidates'])
 | 
				
			||||
		if 'gpu' in args :
 | 
				
			||||
			args['gpu'] = self.set_gpu(gpu=args['gpu'])
 | 
				
			||||
		# if 'partition' in args :
 | 
				
			||||
		# 	args['logs'] = os.sep.join([args['logs'],str(args['partition'])])
 | 
				
			||||
		
 | 
				
			||||
		_info = {"module":"gan-prep","action":"prune","shape":{"rows":args['data'].shape[0],"columns":args['data'].shape[1]}}
 | 
				
			||||
		logger.write(_info)
 | 
				
			||||
		if args['data'].shape[0] > 0 and args['data'].shape[1] > 0 :
 | 
				
			||||
			candidates = (data.maker.generate(**args))
 | 
				
			||||
			
 | 
				
			||||
		else:
 | 
				
			||||
			candidates = [df]
 | 
				
			||||
		
 | 
				
			||||
		# if 'sql.BQWriter' in ostore['type'] :
 | 
				
			||||
		_columns = None
 | 
				
			||||
		skip_columns = []
 | 
				
			||||
		_schema = schema
 | 
				
			||||
		if schema :
 | 
				
			||||
			cols = [_item['name'] for _item in _schema]
 | 
				
			||||
		else:
 | 
				
			||||
			cols = df.columns.tolist()
 | 
				
			||||
		_info = {"module":"gan-prep","action":"selection","input":{"candidates":len(candidates),"features":cols}}
 | 
				
			||||
		logger.write(_info)
 | 
				
			||||
		for _df in candidates :
 | 
				
			||||
			#
 | 
				
			||||
			# we need to format the fields here to make sure we have something cohesive
 | 
				
			||||
			#
 | 
				
			||||
 | 
				
			||||
			if not skip_columns :
 | 
				
			||||
				if 'ignore' in args and 'columns' in args['ignore'] :
 | 
				
			||||
					skip_columns = self.get_ignore(data=_df,columns=args['ignore']['columns'])
 | 
				
			||||
			#
 | 
				
			||||
			# We perform a series of set operations to ensure that the following conditions are met:
 | 
				
			||||
			#	- the synthetic dataset only has fields that need to be synthesized
 | 
				
			||||
			#	- The original dataset has all the fields except those that need to be synthesized
 | 
				
			||||
			#
 | 
				
			||||
			
 | 
				
			||||
			_df = _df[list(set(_df.columns)  - set(skip_columns))].copy()
 | 
				
			||||
			if x_cols :
 | 
				
			||||
				_approx = {}
 | 
				
			||||
				for _col in x_cols :
 | 
				
			||||
					if real_df[_col].unique().size > 0 :
 | 
				
			||||
						
 | 
				
			||||
 | 
				
			||||
						_df[_col] = self.approximate(real_df[_col].values)
 | 
				
			||||
						_approx[_col] = {
 | 
				
			||||
							"io":{"min":_df[_col].min().astype(float),"max":_df[_col].max().astype(float),"mean":_df[_col].mean().astype(float),"sd":_df[_col].values.std().astype(float),"missing": _df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":_df[_col].where(_df[_col] == 0).dropna().count().astype(float)},
 | 
				
			||||
							"real":{"min":real_df[_col].min().astype(float),"max":real_df[_col].max().astype(float),"mean":real_df[_col].mean().astype(float),"sd":real_df[_col].values.std().astype(float),"missing": real_df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count().astype(float)}
 | 
				
			||||
						}
 | 
				
			||||
					else:
 | 
				
			||||
						_df[_col] = -1
 | 
				
			||||
				logger.write({"module":"gan-generate","action":"approximate","status":_approx})				
 | 
				
			||||
			if set(df.columns) & set(_df.columns) :
 | 
				
			||||
				_columns = list(set(df.columns) - set(_df.columns))											
 | 
				
			||||
				df = df[_columns]
 | 
				
			||||
 | 
				
			||||
			#
 | 
				
			||||
			# Let us merge the datasets here and have a comprehensive dataset
 | 
				
			||||
 | 
				
			||||
			_df = pd.DataFrame.join(df,_df)
 | 
				
			||||
			_params = {'data':_df,'store' : ostore,'from':args['from']}
 | 
				
			||||
			if _schema :
 | 
				
			||||
				_params ['schema'] = _schema
 | 
				
			||||
			_info = {"module":"gan-prep","action":"write","input":{"rows":_df.shape[0],"cols":_df.shape[1]}}
 | 
				
			||||
			logger.write(_info)
 | 
				
			||||
			self.post(**_params)
 | 
				
			||||
			# print (['_______ posting _________________',_df.shape])
 | 
				
			||||
			break
 | 
				
			||||
			
 | 
				
			||||
			
 | 
				
			||||
			pass
 | 
				
			||||
		# else:
 | 
				
			||||
		# 	pass
 | 
				
			||||
	def bind(self,**_args):
 | 
				
			||||
		print (_args)
 | 
				
			||||
	
 | 
				
			||||
	
 | 
				
			||||
if __name__ == '__main__' :
 | 
				
			||||
	filename = SYS_ARGS['config'] if 'config' in SYS_ARGS else 'config.json'
 | 
				
			||||
	f = open (filename)
 | 
				
			||||
	_config = json.loads(f.read())
 | 
				
			||||
	f.close()	
 | 
				
			||||
	PIPELINE = _config['pipeline']
 | 
				
			||||
	index = SYS_ARGS['index']
 | 
				
			||||
	if index.isnumeric() :
 | 
				
			||||
		index = int(SYS_ARGS['index']) 
 | 
				
			||||
	else:
 | 
				
			||||
		#
 | 
				
			||||
		# The index provided is a key to a pipeline entry mainly the context
 | 
				
			||||
		#
 | 
				
			||||
		N = len(PIPELINE)
 | 
				
			||||
		f = [i for i in range(0,N) if PIPELINE[i]['context'] == index]
 | 
				
			||||
		index = f[0] if f else 0
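	# Sketch: the pipeline entry can be selected either by position (index=2) or by its context
	# name (index=person); in the latter case the first entry whose 'context' matches is used,
	# falling back to entry 0. (Exact flag syntax depends on how data.params.SYS_ARGS parses argv.)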
 | 
				
			||||
	#
 | 
				
			||||
	
 | 
				
			||||
	print ("..::: ",PIPELINE[index]['context'],':::..')
 | 
				
			||||
	args =  (PIPELINE[index])
 | 
				
			||||
	for key in _config :
 | 
				
			||||
		if key == 'pipeline' or key in args:
 | 
				
			||||
			#
 | 
				
			||||
			# skip in case of pipeline or if key exists in the selected pipeline (provided by index)
 | 
				
			||||
			# 
 | 
				
			||||
			continue
 | 
				
			||||
		args[key] = _config[key]
 | 
				
			||||
	
 | 
				
			||||
	args = dict(args,**SYS_ARGS)
 | 
				
			||||
	if 'matrix_size' in args :
 | 
				
			||||
		args['matrix_size'] = int(args['matrix_size'])
 | 
				
			||||
	if 'batch_size' not in args :
 | 
				
			||||
		args['batch_size']	= 2000 #if 'batch_size' not in args else int(args['batch_size'])
 | 
				
			||||
	if 'dataset' not in args :
 | 
				
			||||
		args['dataset'] = 'combined20191004v2_deid'
 | 
				
			||||
	args['logs'] = args['logs'] if 'logs' in args else 'logs'
 | 
				
			||||
	PART_SIZE = int(args['part_size']) if 'part_size' in args else 8
 | 
				
			||||
	#
 | 
				
			||||
	# @TODO:
 | 
				
			||||
	#	Log what was initiated so we have context of this processing ...
 | 
				
			||||
	#
 | 
				
			||||
 | 
				
			||||
	GPU_CHIPS = args['gpu'] if 'gpu' in args else None
 | 
				
			||||
	if GPU_CHIPS and type(GPU_CHIPS) != list :				
 | 
				
			||||
		GPU_CHIPS = [int(_id.strip()) for _id in GPU_CHIPS.split(',')] if type(GPU_CHIPS) == str else [GPU_CHIPS]
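	# Sketch: a gpu argument like the string '0,1' is normalized here to [0, 1] and a bare integer
	# becomes a one-element list; with 'all-chips' below, one process is spawned per chip and the
	# chip id doubles as the partition number.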
 | 
				
			||||
	if 'gpu' in SYS_ARGS :
 | 
				
			||||
		args['gpu'] = GPU_CHIPS
 | 
				
			||||
	jobs = []
 | 
				
			||||
	if 'generate' in SYS_ARGS :
 | 
				
			||||
		#
 | 
				
			||||
		# Let us see if we have partitions given the log folder
 | 
				
			||||
		
 | 
				
			||||
		content = os.listdir( os.sep.join([args['logs'],'train',args['context']]))
 | 
				
			||||
		if 'all-chips' in SYS_ARGS and GPU_CHIPS:
 | 
				
			||||
			index = 0
 | 
				
			||||
			jobs = []
 | 
				
			||||
			for _gpu in GPU_CHIPS :
 | 
				
			||||
				_args = copy.deepcopy(args)
 | 
				
			||||
				_args['gpu'] = [int(_gpu)]
 | 
				
			||||
				_args['partition'] = int(_gpu) #index
 | 
				
			||||
				index += 1
 | 
				
			||||
				make = lambda _params: (Components()).generate(_params)
 | 
				
			||||
				job = Process(target=make,args=( dict(_args),))
 | 
				
			||||
				job.name = 'Trainer # ' + str(index)
 | 
				
			||||
				job.start()
 | 
				
			||||
				jobs.append(job)
 | 
				
			||||
			pass
 | 
				
			||||
		else:
 | 
				
			||||
			
 | 
				
			||||
			generator = Components()
 | 
				
			||||
			generator.generate(args)
 | 
				
			||||
	elif 'bind' in SYS_ARGS :
 | 
				
			||||
		import binder
 | 
				
			||||
		_args = _config['_map']
 | 
				
			||||
		_args['store'] = copy.deepcopy(_config['store'])
 | 
				
			||||
		if 'init' in SYS_ARGS :
 | 
				
			||||
			#
 | 
				
			||||
			# Creating and persisting the map ...
 | 
				
			||||
			print (['.... Binding Initialization'])
 | 
				
			||||
			# jobs = binder.Init(**_args)
 | 
				
			||||
			_mapped = binder.Init(**_args)
 | 
				
			||||
			
 | 
				
			||||
			
 | 
				
			||||
			_schema = [{"name":_name,"type":"INTEGER"} for _name in _mapped.columns.tolist()]
 | 
				
			||||
			publisher = lambda _params: (Components()).post(**_params)
 | 
				
			||||
			_args = {'data':_mapped,'store':_config['store']['target']}
 | 
				
			||||
			_args['store']['table'] = '_map'
 | 
				
			||||
			if _args['store']['provider'] =='bigquery' :
 | 
				
			||||
				_args['schema'] = _schema
 | 
				
			||||
			
 | 
				
			||||
			job = Process (target = publisher,args=(_args,))
 | 
				
			||||
			job.start()
 | 
				
			||||
			jobs = [job]
 | 
				
			||||
		else:
 | 
				
			||||
			#
 | 
				
			||||
			# Applying the map of k on a particular dataset
 | 
				
			||||
			#
 | 
				
			||||
			index = int(SYS_ARGS['index'])
 | 
				
			||||
			_args['config'] = _config['pipeline'][index]
 | 
				
			||||
			_args['original_key'] = _config['original_key'] if 'original_key' in _config else 'person_id'
 | 
				
			||||
			table = _config['pipeline'][index]['from']
 | 
				
			||||
			_df =  binder.ApplyOn(**_args)
 | 
				
			||||
			_df = np.array_split(_df,PART_SIZE)
 | 
				
			||||
			jobs = []
 | 
				
			||||
			print (['Publishing ',PART_SIZE,' PARTITION'])
 | 
				
			||||
			for data in _df :
 | 
				
			||||
				publisher = lambda _params: ( Components() ).post(**_params)
 | 
				
			||||
				_args = {'data':data,'store':_config['store']['target']}
 | 
				
			||||
				_args['store']['table'] = table
 | 
				
			||||
				print (_args['store'])
 | 
				
			||||
				job = Process(target = publisher,args=(_args,))
 | 
				
			||||
				job.name = "Publisher "+str(len(jobs)+1)
 | 
				
			||||
				job.start()
 | 
				
			||||
				jobs.append(job)
 | 
				
			||||
			
 | 
				
			||||
	elif 'shuffle' in SYS_ARGS :
 | 
				
			||||
		index = 0
 | 
				
			||||
		if GPU_CHIPS and 'all-chips' in SYS_ARGS:
 | 
				
			||||
 | 
				
			||||
			for index in GPU_CHIPS :
 | 
				
			||||
				publisher = lambda _params: ( Components() ).shuffle(_params)
 | 
				
			||||
				job = Process (target = publisher,args=( args,))
 | 
				
			||||
				job.name = 'Shuffler #' + str(index)
 | 
				
			||||
				job.start()
 | 
				
			||||
				jobs.append(job)
 | 
				
			||||
		else:
 | 
				
			||||
			shuffler = Components()
 | 
				
			||||
			shuffler.shuffle(args)
 | 
				
			||||
		pass
 | 
				
			||||
	elif 'train' in SYS_ARGS:
 | 
				
			||||
		
 | 
				
			||||
		# DATA  = np.array_split(DATA,PART_SIZE)
 | 
				
			||||
		#
 | 
				
			||||
		# Let us create n jobs across n GPUs. The assumption here is that the data produced constitutes a partition
 | 
				
			||||
		# @TODO: Find better name for partition
 | 
				
			||||
		#
 | 
				
			||||
		
 | 
				
			||||
		if GPU_CHIPS and 'all-chips' in SYS_ARGS:
 | 
				
			||||
			index = 0
 | 
				
			||||
			print (['... launching ',len(GPU_CHIPS),' jobs',args['context']])
 | 
				
			||||
			for _gpu in GPU_CHIPS :
 | 
				
			||||
				_args = copy.deepcopy(args)
 | 
				
			||||
				_args['gpu'] = [int(_gpu)]
 | 
				
			||||
				_args['partition'] = int(_gpu) #index
 | 
				
			||||
				index += 1
 | 
				
			||||
				make = lambda _params: (Components()).train(**_params)
 | 
				
			||||
				job = Process(target=make,args=( _args,))
 | 
				
			||||
				job.name = 'Trainer # ' + str(index)
 | 
				
			||||
				job.start()
 | 
				
			||||
				jobs.append(job)
 | 
				
			||||
			
 | 
				
			||||
 | 
				
			||||
 | 
				
			||||
 | 
				
			||||
		else:
 | 
				
			||||
			#
 | 
				
			||||
			# The choice of the chip will be made internally
 | 
				
			||||
			
 | 
				
			||||
			agent = Components()
 | 
				
			||||
			agent.train(**args)
 | 
				
			||||
		#
 | 
				
			||||
		# If we have any jobs we should wait till they finish
 | 
				
			||||
		#
 | 
				
			||||
	DIRTY = 0
 | 
				
			||||
	if (len(jobs)) :
 | 
				
			||||
		print (['.... waiting on ',len(jobs),' jobs'])
 | 
				
			||||
	while len(jobs)> 0 :
 | 
				
			||||
		DIRTY =1
 | 
				
			||||
		jobs = [job for job in jobs if job.is_alive()]
 | 
				
			||||
		time.sleep(2)
 | 
				
			||||
	if DIRTY:
 | 
				
			||||
		print (["..:: jobs finished "])
 | 
				
			||||
	#
 | 
				
			||||
	# We need to harmonize the keys, if any; we do this for shuffle or generate operations
 | 
				
			||||
	# This holds true for bigquery-to-bigquery only
 | 
				
			||||
	IS_BIGQUERY = _config['store']['source']['provider'] == _config['store']['target']['provider'] and _config['store']['source']['provider'] == 'bigquery'
 | 
				
			||||
 | 
				
			||||
	# if 'bind' not in SYS_ARGS and IS_BIGQUERY and ('autopilot' in SYS_ARGS or 'finalize' in SYS_ARGS  or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS)) :
 | 
				
			||||
	# 	#
 | 
				
			||||
	# 	# We should pull all the primary keys and regenerate them in order to insure some form of consistency
 | 
				
			||||
	# 	#
 | 
				
			||||
 | 
				
			||||
	# 	#
 | 
				
			||||
	# 	#
 | 
				
			||||
		
 | 
				
			||||
	# 	print (["..:: Finalizing process"])
 | 
				
			||||
	# 	(Components()).finalize(args)
 | 
				
			||||