diff --git a/README.md b/README.md
index f3c92ed..32224c2 100644
--- a/README.md
+++ b/README.md
@@ -13,17 +13,19 @@ This package is designed to generate synthetic data from a dataset from an origi
 After installing the easiest way to get started is as follows (using pandas). The process is as follows:
 
+Read about [data-transport on github](https://github.com/lnyemba/data-transport) or on [healthcareio.the-phi.com/git/code/transport](https://healthcareio.the-phi.com/git/code/transport.git)
+
 **Train the GAN on the original/raw dataset**
 
+1. We define the data sources
+
+The sources consist of a source, a target and a logger.
 
     import pandas as pd
     import data.maker
+    import transport
+    from transport import providers
 
-    df = pd.read_csv('sample.csv')
-    column = 'gender'
-    id = 'id'
-    context = 'demo'
-    data.maker.train(context=context,data=df,column=column,id=id,logs='logs')
 
 The trainer will store the data on disk (for now) in a structured folder that will hold training models that will be used to generate the synthetic data.
diff --git a/bin/data-maker b/bin/data-maker
new file mode 120000
index 0000000..f63f773
--- /dev/null
+++ b/bin/data-maker
@@ -0,0 +1 @@
+pipeline.py
\ No newline at end of file
diff --git a/binder.py b/binder.py
new file mode 100644
index 0000000..5379d62
--- /dev/null
+++ b/binder.py
@@ -0,0 +1,377 @@
+#!/usr/bin/env python3
+"""
+This file will perform basic tasks to finalize the GAN process by performing the following :
+    - basic stats & analytics
+    - rebuild io to another dataset
+"""
+import pandas as pd
+import numpy as np
+from multiprocessing import Process, Lock
+from google.oauth2 import service_account
+from google.cloud import bigquery as bq
+import transport
+from data.params import SYS_ARGS
+import json
+
+import pandas as pd
+import numpy as np
+from google.oauth2 import service_account
+import json
+
+# path = '../curation-prod.json'
+# credentials = service_account.Credentials.from_service_account_file(path)
+# df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard')
+filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config']
+f = open(filename)
+config = json.loads(f.read())
+args = config['pipeline']
+f.close()
+
+def _formatSQL(**_args):
+    """
+    This function will build the _map for a given segment
+    """
+    sql = """
+    select DISTINCT x.person_id synthetic,y.person_id original
+    FROM :synthetic.:table x
+    INNER JOIN :original.:table y on x.person_id in (:ids)
+    AND x.person_id <> y.person_id AND x.gender_source_value = y.gender_source_value
+    AND x.year_of_birth = y.year_of_birth
+    ORDER BY 1
+    """
+    table= _args['table']
+    original,synthetic = _args['schema']['original'],_args['schema']['synthetic']
+    _ids = np.array(_args['ids']).astype(str)
+    return sql.replace(":ids",",".join(_ids)).replace(":synthetic",synthetic).replace(":original",original).replace(":table",table)
+def _addCounts(**_args) :
+    store = _args['store']
+    sql = _args['sql']
+    reader = transport.factory.instance(**store['source'])
+    _df = reader.read(sql=sql)
+    _ids = _df.synthetic.unique()
+    _counts = [ np.sum(_df.synthetic == value) for value in _ids]
+    original = [_df[_df.synthetic == value].iloc[np.random.choice(np.arange(_counts[_ids.tolist().index(value)]),1),:].original.values[0] for value in _ids]
+    _df = pd.DataFrame({"synthetic":_ids,"original":original,"counts":_counts})
+
+    #
+    # We can post this to the backend ... 
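+    # (_df now holds one row per synthetic person_id: a randomly chosen matching original person_id and the number of candidate matches in `counts`)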
+ # + table = '_map' #-- Yes this is hard-coded + writer = transport.factory.instance(**dict(store['target'],**{"parallel":True,"table":table})) + # if writer.has(table=table) is False: + # writer.write(_df) + # else: + _schema = [{"name":name,"type":"INTEGER"} for name in _df.columns] + writer.write(_df,schema=_schema) + + + + + +def Init(**_args) : + """ + This function will build a map of the synthetic to real individuals. + The assumption is that the synthesized data is stored in the same data-store as the original the parameters provided are : + :param store object from the configuration file with source,target entries + :param table name of the original/synthetic tables (they should be the same) + :param feat. featuress/attributes ... demographics to account for + """ + store = _args['store'] + reader = transport.factory.instance(**store['source']) + original,synthetic = _args['schema']['original'],_args['schema']['synthetic'] + table = _args['table'] + sql = _args['sql'].replace(':synthetic',synthetic).replace(':original',original).replace(':table',table) + + _map = reader.read(sql=sql) + + + + k = _args['k'] if 'k' in _args else 2 + # _iodf = reader.read(table=table) + # _ids = _iodf['person_id'].unique().tolist() + # x_ = np.array_split(_ids,1000) + jobs = [] + # for _items in x_ : + # _p = {"ids":_items,"schema":_args['schema'],'store':store,'table':table} + # sql = _formatSQL(**_p) + # _p['sql'] = sql + # _apply = lambda params: _addCounts(**params) + # thread = Process(target=_apply,args=(_p,)) + # thread.start() + # jobs.append(thread) + + # return jobs + # + # We have performed a m:m (many-to-many) relationship with original participants and synthetic participants + # The goal is to obtain a singular map against which records will be migrated + # + print (['... computing counts (k)']) + _ids = _map.synthetic.unique() + _counts = [ np.sum(_map.synthetic == value) for value in _ids] + original = [_map[_map.synthetic == value].iloc[np.random.choice(np.arange(_counts[_ids.tolist().index(value)]),1),:].original.values[0] for value in _ids] + print (['Building k-classes/groups']) + _mdf = pd.DataFrame({"synthetic":_ids,"original":original,"counts":_counts}) + i = _mdf.apply(lambda row: row.counts >= k,axis=1) + _mdf = _mdf[i] + # + # Log what just happened here so we know about the equivalence classes, + # {"module":"binder","action":"map-generation","input":{"k":k,"rows":{"synthetic":_mdf.shape[0],"original":len(_counts)}}} + + return _mdf + # + # now we are posting this to target storage ... 
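+    # (nothing is written here: Init only computes and returns the k-anonymized map)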
+ # +def ApplyOn (**_args): + """ + This function will rewrite SQL that applies the synthetic identifier to the entries of the pipeline + We assume that the _map has two attributes (synthetic and original) + :param store + :param _config + """ + store_args = _args['store'] + _config = _args['config'] + + table = _config['from'] + reader = transport.factory.instance(**dict(store_args['source'],**{"table":table})) + attr = reader.read(limit=1).columns.tolist() + original_key = _args['original_key'] #-- assuming referential integrity + + # synthetic_key= columns['synthetic'] + # mapped_original=columns['orginal'] + fields = list(set(attr) - set([original_key])) + sql = "select _map.synthetic as :original_key,:fields from :original_schema.:table inner join :synthetic_schema._map on _map.original = :table.:original_key" + sql = sql.replace(":table",table).replace(":fields",",".join(fields)) + sql = sql.replace(":original_key",original_key) + _schema = _args['schema'] + sql = sql.replace(":original_schema",_schema['original']).replace(":synthetic_schema",_schema['synthetic']) + + return reader.read (sql=sql) + +if __name__ == '__main__' : + pass + +# class Analytics : +# """ +# This class will compile basic analytics about a given dataset i.e compare original/synthetic +# """ +# @staticmethod +# def distribution(**args): +# context = args['context'] +# df = args['data'] +# # +# #-- This data frame counts unique values for each feature (space) +# df_counts = pd.DataFrame(df.apply(lambda col: col.unique().size),columns=['counts']).T # unique counts +# # +# #-- Get the distributions for common values +# # +# names = [name for name in df_counts.columns.tolist() if name.endswith('_io') == False] +# ddf = df.apply(lambda col: pd.DataFrame(col.values,columns=[col.name]).groupby([col.name]).size() ).fillna(0) +# ddf[context] = ddf.index + +# pass +# def distance(**args): +# """ +# This function will measure the distance between +# """ +# pass +# class Utils : +# @staticmethod +# def log(**args): +# logger = transport.factory.instance(type="mongo.MongoWriter",args={"dbname":"aou","doc":"logs"}) +# logger.write(args) +# logger.close() +# class get : +# @staticmethod +# def pipeline(table,path) : +# # contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts'] +# config = json.loads((open(path)).read()) +# pipeline = config['pipeline'] +# # return [ item for item in pipeline if item['context'] in contexts] +# pipeline = [item for item in pipeline if 'from' in item and item['from'].strip() == table] +# Utils.log(module=table,action='init',input={"pipeline":pipeline}) +# return pipeline +# @staticmethod +# def sql(**args) : +# """ +# This function is intended to build SQL query for the remainder of the table that was not synthesized +# :config configuration entries +# :from source of the table name +# :dataset name of the source dataset + +# """ +# SQL = ["SELECT * FROM :from "] +# SQL_FILTER = [] +# NO_FILTERS_FOUND = True +# # pipeline = Utils.get.config(**args) +# pipeline = args['pipeline'] +# REVERSE_QUALIFIER = {'IN':'NOT IN','NOT IN':'IN','=':'<>','<>':'='} +# for item in pipeline : + + +# if 'filter' in item : +# if NO_FILTERS_FOUND : +# NO_FILTERS_FOUND = False +# SQL += ['WHERE'] +# # +# # Let us load the filter in the SQL Query +# FILTER = item['filter'] +# QUALIFIER = REVERSE_QUALIFIER[FILTER['qualifier'].upper()] +# SQL_FILTER += [" ".join([FILTER['field'], QUALIFIER,'(',FILTER['value'],')']).replace(":dataset",args['dataset'])] +# src = 
".".join([args['dataset'],args['from']]) +# SQL += [" AND ".join(SQL_FILTER)] +# # +# # let's pull the field schemas out of the table definition +# # +# Utils.log(module=args['from'],action='sql',input={"sql":" ".join(SQL) }) +# return " ".join(SQL).replace(":from",src) + + +# def mk(**args) : +# dataset = args['dataset'] +# client = args['client'] if 'client' in args else bq.Client.from_service_account_file(args['private_key']) +# # +# # let us see if we have a dataset handy here +# # +# datasets = list(client.list_datasets()) +# found = [item for item in datasets if item.dataset_id == dataset] + +# if not found : + +# return client.create_dataset(dataset) +# return found[0] + +# def move (args): +# """ +# This function will move a table from the synthetic dataset into a designated location +# This is the simplest case for finalizing a synthetic data set +# :private_key +# """ +# pipeline = Utils.get.pipeline(args['from'],args['config']) +# _args = json.loads((open(args['config'])).read()) +# _args['pipeline'] = pipeline +# # del _args['pipeline'] +# args = dict(args,**_args) +# # del args['pipeline'] +# # private_key = args['private_key'] +# client = bq.Client.from_service_account_json(args['private_key']) + +# dataset = args['dataset'] +# if pipeline : +# SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in pipeline] +# SQL += [Utils.get.sql(**args)] +# SQL = ('\n UNION ALL \n'.join(SQL).replace(':dataset','io')) +# else: +# # +# # moving a table to a designated location +# tablename = args['from'] +# if 'sql' not in args : +# SQL = "SELECT * FROM :dataset.:table" +# else: +# SQL = args['sql'] +# SQL = SQL.replace(":dataset",dataset).replace(":table",tablename) +# Utils.log(module=args['from'],action='sql',input={'sql':SQL}) +# # +# # At this point we have gathered all the tables in the io folder and we should now see if we need to merge with the remainder from the original table +# # + + + +# odataset = mk(dataset=dataset+'_io',client=client) +# # SQL = "SELECT * FROM io.:context_full_io".replace(':context',context) +# config = bq.QueryJobConfig() +# config.destination = client.dataset(odataset.dataset_id).table(args['from']) +# config.use_query_cache = True +# config.allow_large_results = True +# config.priority = 'INTERACTIVE' +# # +# # + +# schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema +# fields = [" ".join(["CAST (",item.name,"AS",item.field_type.replace("INTEGER","INT64").replace("FLOAT","FLOAT64"),") ",item.name]) for item in schema] +# SQL = SQL.replace("*"," , ".join(fields)) +# # print (SQL) +# out = client.query(SQL,location='US',job_config=config) +# Utils.log(module=args['from'],action='move',input={'job':out.job_id}) +# return (out.job_id) + + + + +# import pandas as pd +# import numpy as np +# from google.oauth2 import service_account +# import json + +# # path = '../curation-prod.json' +# # credentials = service_account.Credentials.from_service_account_file(path) +# # df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard') +# filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config'] +# f = open(filename) +# config = json.loads(f.read()) +# args = config['pipeline'] +# f.close() + + +# if __name__ == '__main__' : +# """ +# Usage : +# finalize -- --contexts --from +# """ + +# if 'move' in SYS_ARGS : + +# if 'init' in SYS_ARGS : +# dep = config['dep'] if 'dep' in config else {} +# info = [] + +# if 'queries' in dep : +# info += dep['queries'] +# print 
('________') +# if 'tables' in dep : +# info += dep['tables'] +# args = {} +# jobs = [] +# for item in info : +# args = {} +# if type(item) == str : +# args['from'] = item +# name = item +# else: +# args = item +# name = item['from'] +# args['config'] = SYS_ARGS['config'] +# # args['pipeline'] = [] +# job = Process(target=move,args=(args,)) +# job.name = name +# jobs.append(job) +# job.start() + + +# # while len(jobs) > 0 : +# # jobs = [job for job in jobs if job.is_alive()] +# # time.sleep(1) + + +# else: +# move(SYS_ARGS) +# # # table = SYS_ARGS['from'] +# # # args = dict(config,**{"private_key":"../curation-prod.json"}) +# # args = dict(args,**SYS_ARGS) +# # contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']] +# # log = [] +# # if contexts : +# # args['contexts'] = contexts +# # log = move(**args) + +# # else: +# # tables = args['from'].split(',') +# # for name in tables : +# # name = name.strip() +# # args['from'] = name +# # log += [move(**args)] +# # print ("\n".join(log)) + + + +# else: +# print ("NOT YET READY !") \ No newline at end of file diff --git a/data/__init__.py b/data/__init__.py index 98124f1..91b566d 100644 --- a/data/__init__.py +++ b/data/__init__.py @@ -1,2 +1,6 @@ -import data.params as params - +# import data.params as params +from data.params import SYS_ARGS +import transport +from multiprocessing import Process, Queue +from data.maker import prepare +from data.maker import state diff --git a/data/bridge.py b/data/bridge.py index fa323af..3116a4b 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -153,21 +153,28 @@ class Binary : """ This is a utility class to import and export a data to/from a binary matrix """ - def __stream(self,column) : + def __stream(self,column,size=-1) : """ This function will convert a column into a binary matrix with the value-space representing each column of the resulting matrix :column a column vector i.e every item is a row """ # values = np.unique(column) - values = column.dropna().unique() - values.sort() + # values = column.dropna().unique() + + # values.sort() + # column = column.values + values = self.get_column(column,size) + column = column.values # # Let's treat the case of missing values i.e nulls # row_count,col_count = column.size,values.size + # if row_count * col_count > size and row_count < size: + + - matrix = [ np.zeros(col_count) for i in np.arange(row_count)] + matrix = [ np.zeros(col_count,dtype=np.float32) for i in np.arange(row_count)] # # let's create a binary matrix of the feature that was passed in # The indices of the matrix are inspired by classical x,y axis @@ -176,14 +183,52 @@ class Binary : for yi in np.arange(row_count) : value = column[yi] - if value not in values : - continue - xi = np.where(values == value) - xi = xi[0][0] #-- column index - matrix[yi][xi] = 1 + # if value not in values : + # continue + xi = np.where(values == value) + if xi and xi[0].size > 0: + xi = xi[0][0] #-- column index + matrix[yi][xi] = 1 - return matrix - def Export(self,df) : + return pd.DataFrame(matrix,columns=values) + def apply(self,column,size): + return self.__stream(column,size) + def get_column(self,column,size=-1): + """ + This function will return the columns that are available for processing ... 
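+        :column a pandas Series (the raw column values)
+        :size   optional cap on how many distinct values are kept (most frequent first); -1 keeps all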
+ """ + values = column.dropna().value_counts().index.values + + if size > 0 and column.size > size: + values = values[:size] + values.sort() + return values + def get_missing(self,column,size=-1): + values = column.dropna().value_counts().index.values + if size > 0 and column.size > size : + values = values[size:] + else: + values = np.array([]) + values.sort() + return values.tolist(); + + def _get_column_values(self,column,size=-1): + values = column.dropna().unique() + values.sort() + + # + # Let's treat the case of missing values i.e nulls + # + row_count,col_count = column.size,values.size + if col_count > size and size > 0: + # N = np.divide(size,row_count).astype(int) + # N = + i = np.random.choice(col_count,size) + values = values[-i] + col_count = size + return values + + def _Export(self,df) : """ This function will convert a data-frame to a binary matrix :return _map,matrix @@ -191,12 +236,14 @@ class Binary : # # This will give us a map of how each column was mapped to a bitstream - _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0) + # _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0) + # _map = df.fillna(np.nan).apply(lambda column: column,axis=0) + print (df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0)) # # We will merge this to have a healthy matrix _matrix = _map.apply(lambda row: list(list(itertools.chain(*row.values.tolist()))),axis=1) - _matrix = np.matrix([list(item) for item in _matrix]) + _matrix = np.matrix([list(item) for item in _matrix]).astype(np.float32) # # let's format the map so we don't have an unreasonable amount of data # @@ -210,7 +257,8 @@ class Binary : _m[name] = {"start":beg,"end":end} beg = end - return _m,_matrix.astype(np.float32) + # return _m,_matrix.astype(np.float32) + return _matrix def Import(self,df,values,_map): """ @@ -237,37 +285,41 @@ if __name__ == '__main__' : --pseudo will create pseudonyms for a given --export will export data to a specified location """ - has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys() - has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys() - if has_basic and has_action : - builder = Builder() - if 'export' in SYS_ARGS : - print () - print ("exporting ....") - if not os.path.exists(SYS_ARGS['export']) : - os.mkdir(SYS_ARGS['export']) - SQL = builder.encode(**SYS_ARGS) - # - # Assuming the user wants to filter the records returned : - # + df = pd.read_csv('sample.csv') + print ( df.race.value_counts()) + print ( (Binary()).apply(df['race'], 3)) + + # has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys() + # has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys() + # if has_basic and has_action : + # builder = Builder() + # if 'export' in SYS_ARGS : + # print () + # print ("exporting ....") + # if not os.path.exists(SYS_ARGS['export']) : + # os.mkdir(SYS_ARGS['export']) + # SQL = builder.encode(**SYS_ARGS) + # # + # # Assuming the user wants to filter the records returned : + # # - credentials = service_account.Credentials.from_service_account_file(SYS_ARGS['key']) - df = pd.read_gbq(SQL,credentials =credentials,dialect='standard') - FILENAME = os.sep.join([SYS_ARGS['export'],SYS_ARGS['table']+'.csv']) - # - # This would allow us to export it to wherever we see fit - print (FILENAME) - df.to_csv(FILENAME,index=False) - f = open(FILENAME.replace('.csv','.sql'),'w+') - f.write(SQL) - f.close() - elif 'pseudo' in 
SYS_ARGS : - builder.process(**SYS_ARGS) - else: - print ("") - print (SYS_ARGS.keys()) - print ("has basic ",has_basic) - print ("has action ",has_action) + # credentials = service_account.Credentials.from_service_account_file(SYS_ARGS['key']) + # df = pd.read_gbq(SQL,credentials =credentials,dialect='standard') + # FILENAME = os.sep.join([SYS_ARGS['export'],SYS_ARGS['table']+'.csv']) + # # + # # This would allow us to export it to wherever we see fit + # print (FILENAME) + # df.to_csv(FILENAME,index=False) + # f = open(FILENAME.replace('.csv','.sql'),'w+') + # f.write(SQL) + # f.close() + # elif 'pseudo' in SYS_ARGS : + # builder.process(**SYS_ARGS) + # else: + # print ("") + # print (SYS_ARGS.keys()) + # print ("has basic ",has_basic) + # print ("has action ",has_action) # pseudonym.apply(table='person',dataset='wgan_original',key='./curation-test-2.json') # args = {"dataset":"wgan_original","table":"observation","key":"./curation-test-2.json"} # builder = Builder() diff --git a/data/gan.py b/data/gan.py index 43d15ae..d2cc3ea 100644 --- a/data/gan.py +++ b/data/gan.py @@ -1,11 +1,28 @@ """ -usage : - optional : - --num_gpu number of gpus to use will default to 1 - --epoch steps per epoch default to 256 +This code was originally writen by Ziqi Zhang in order to generate synthetic data. +The code is an implementation of a Generative Adversarial Network that uses the Wasserstein Distance (WGAN). +It is intended to be used in 2 modes (embedded in code or using CLI) + +USAGE : + +The following parameters should be provided in a configuration file (JSON format) +python data/maker --config + +CONFIGURATION FILE STRUCTURE : + + context what it is you are loading (stroke, hypertension, ...) + data path of the file to be loaded + logs folder to store training model and meta data about learning + max_epochs number of iterations in learning + num_gpu number of gpus to be used (will still run if the GPUs are not available) + +EMBEDDED IN CODE : + """ import tensorflow as tf -from tensorflow.contrib.layers import l2_regularizer +# from tensorflow.contrib.layers import l2_regularizer +from tensorflow.keras import layers +from tensorflow.keras.regularizers import L2 as l2_regularizer import numpy as np import pandas as pd import time @@ -19,565 +36,798 @@ import pickle os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ['CUDA_VISIBLE_DEVICES'] = "0" os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' - -# STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256 -# NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu']) +tf.compat.v1.disable_eager_execution() +# STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256 +# NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu']) # BATCHSIZE_PER_GPU = 2000 -# TOTAL_BATCHSIZE = BATCHSIZE_PER_GPU * NUM_GPUS +# TOTAL_BATCHSIZE = BATCHSIZE_PER_GPU * NUM_GPUS class void : - pass + pass class GNet : - """ - This is the base class of a generative network functions, the details will be implemented in the subclasses. 
- An instance of this class is accessed as follows - object.layers.normalize applies batch normalization or otherwise - obect.get.variables instanciate variables on cpu and return a reference (tensor) - """ - def __init__(self,**args): - self.layers = void() - self.layers.normalize = self.normalize - - - self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] - - - self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854 - self.G_STRUCTURE = [128,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE] - self.D_STRUCTURE = [self.X_SPACE_SIZE,256,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE*2, self.X_SPACE_SIZE] #-- change 854 to number of diagnosis - # self.NUM_LABELS = 8 if 'label' not in args elif len(args['label'].shape) args['label'].shape[1] - if 'label' in args and len(args['label'].shape) == 2 : - self.NUM_LABELS = args['label'].shape[1] - elif 'label' in args and len(args['label']) == 1 : - self.NUM_LABELS = args['label'].shape[0] - else: - self.NUM_LABELS = 8 - self.Z_DIM = 128 #self.X_SPACE_SIZE - self.BATCHSIZE_PER_GPU = args['real'].shape[0] if 'real' in args else 256 - self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS - self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) - self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) - self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100 - self.CONTEXT = args['context'] - self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} - self._REAL = args['real'] if 'real' in args else None - self._LABEL = args['label'] if 'label' in args else None - - self.get = void() - self.get.variables = self._variable_on_cpu - self.get.suffix = lambda : "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - self.logger = args['logger'] if 'logger' in args and args['logger'] else None - self.init_logs(**args) - - def init_logs(self,**args): - self.log_dir = args['logs'] if 'logs' in args else 'logs' - self.mkdir(self.log_dir) - # - # - for key in ['train','output'] : - self.mkdir(os.sep.join([self.log_dir,key])) - self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT])) - - self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) - self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) - - def load_meta(self,column): + def log(self,**args): + self.logs = dict(args,**self.logs) + + """ - This function is designed to accomodate the uses of the sub-classes outside of a strict dependency model. - Because prediction and training can happen independently + This is the base class of a generative network functions, the details will be implemented in the subclasses. 
+ An instance of this class is accessed as follows + object.layers.normalize applies batch normalization or otherwise + obect.get.variables instanciate variables on cpu and return a reference (tensor) """ - # suffix = "-".join(column) if isinstance(column,list)else column - suffix = self.get.suffix() - _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json']) - if os.path.exists(_name) : - attr = json.loads((open(_name)).read()) - for key in attr : - value = attr[key] - setattr(self,key,value) - self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) - self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + def __init__(self,**args): + self.layers = void() + self.layers.normalize = self.normalize + self.logs = {} + + # self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] + self.GPU_CHIPS = None if 'gpu' not in args else [args['gpu']] + if self.GPU_CHIPS is None: + self.GPU_CHIPS = [0] + if 'CUDA_VISIBLE_DEVICES' in os.environ : + os.environ.pop('CUDA_VISIBLE_DEVICES') + self.NUM_GPUS = 0 + else: + self.NUM_GPUS = len(self.GPU_CHIPS) + # os.environ['CUDA_VISIBLE_DEVICES'] = str(self.GPU_CHIPS[0]) + + self.PARTITION = args['partition'] if 'partition' in args else None + # if self.NUM_GPUS > 1 : + # os.environ['CUDA_VISIBLE_DEVICES'] = "4" + + self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854 + self.G_STRUCTURE = [128,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE] + self.D_STRUCTURE = [self.X_SPACE_SIZE,256,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE*2, self.X_SPACE_SIZE] #-- change 854 to number of diagnosis + # self.NUM_LABELS = 8 if 'label' not in args elif len(args['label'].shape) args['label'].shape[1] + + if 'label' in args and len(args['label'].shape) == 2 : + self.NUM_LABELS = args['label'].shape[1] + elif 'label' in args and len(args['label']) == 1 : + self.NUM_LABELS = args['label'].shape[0] + else: + self.NUM_LABELS = None + # self.Z_DIM = 128 #self.X_SPACE_SIZE + self.Z_DIM = 128 #-- used as rows down stream + self.G_STRUCTURE = [self.Z_DIM,self.Z_DIM] + PROPOSED_BATCH_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size']) + self.BATCHSIZE_PER_GPU = PROPOSED_BATCH_PER_GPU + if 'real' in args : + self.D_STRUCTURE = [args['real'].shape[1],256,self.Z_DIM] + + if args['real'].shape[0] < PROPOSED_BATCH_PER_GPU : + self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) + # self.BATCHSIZE_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size']) + self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS + self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) + self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) + CHECKPOINT_SKIPS = int(args['checkpoint_skips']) if 'checkpoint_skips' in args else int(self.MAX_EPOCHS/10) + + CHECKPOINT_SKIPS = 1 if CHECKPOINT_SKIPS < 1 else CHECKPOINT_SKIPS + # if self.MAX_EPOCHS < 2*CHECKPOINT_SKIPS : + # CHECKPOINT_SKIPS = 2 + # self.CHECKPOINTS = [1,self.MAX_EPOCHS] + np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist() + self.CHECKPOINTS = np.repeat(CHECKPOINT_SKIPS, self.MAX_EPOCHS/ CHECKPOINT_SKIPS).cumsum().astype(int).tolist() + + self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100 + self.CONTEXT = args['context'] + self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} + self._REAL = args['real'] if 'real' in args else None + self._LABEL = args['label'] if 'label' in args else 
None + + self.get = void() + self.get.variables = self._variable_on_cpu + self.get.suffix = lambda : "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + self.logger = args['logger'] if 'logger' in args and args['logger'] else None + self.init_logs(**args) + + def init_logs(self,**args): + self.log_dir = args['logs'] if 'logs' in args else 'logs' + self.mkdir(self.log_dir) + # + # + for key in ['train','output'] : + self.mkdir(os.sep.join([self.log_dir,key])) + self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT])) + # if 'partition' in args : + # self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT,str(args['partition'])])) + self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) + self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + if 'partition' in args : + self.train_dir = os.sep.join([self.train_dir,str(args['partition'])]) + self.out_dir = os.sep.join([self.out_dir,str(args['partition'])]) + + for checkpoint in self.CHECKPOINTS : + self.mkdir (os.sep.join([self.train_dir,str(checkpoint)])) + self.mkdir (os.sep.join([self.out_dir,str(checkpoint)])) + + # if self.logger : + + # We will clear the logs from the data-store + + # column = self.ATTRIBUTES['synthetic'] + # db = self.logger.db + # if db[column].count() > 0 : + # db.backup.insert({'name':column,'logs':list(db[column].find()) }) + # db[column].drop() + + def load_meta(self,**args): + """ + This function is designed to accomodate the uses of the sub-classes outside of a strict dependency model. + Because prediction and training can happen independently + """ + # suffix = "-".join(column) if isinstance(column,list)else column + suffix = self.CONTEXT #self.get.suffix() + _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json']) + if os.path.exists(_name) : + attr = json.loads((open(_name)).read()) + for key in attr : + value = attr[key] + if not hasattr(self,key): + setattr(self,key,value) + self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) + self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + # if 'partition' in args : + # self.train_dir = os.sep.join([self.train_dir,str(args['partition'])]) + # self.out_dir = os.sep.join([self.out_dir,str(args['partition'])]) + + + def log_meta(self,**args) : + + _object = { + # '_id':'meta', + 'CONTEXT':self.CONTEXT, + 'ATTRIBUTES':self.ATTRIBUTES, + 'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU, + 'Z_DIM':self.Z_DIM, + "X_SPACE_SIZE":self.X_SPACE_SIZE, + "D_STRUCTURE":self.D_STRUCTURE, + "G_STRUCTURE":self.G_STRUCTURE, + "NUM_GPUS":self.NUM_GPUS, + "GPU_CHIPS":self.GPU_CHIPS, + "NUM_LABELS":self.NUM_LABELS, + "MAX_EPOCHS":self.MAX_EPOCHS, + "ROW_COUNT":self.ROW_COUNT + } + if args and 'key' in args and 'value' in args : + key = args['key'] + value= args['value'] + object[key] = value + # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column + suffix = self.CONTEXT #self.get.suffix() + _name = os.sep.join([self.out_dir,'meta-'+suffix]) + + # f = open(_name+'.json','w') + # f.write(json.dumps(_object)) + # f.close() + + for _info in [{"name":os.sep.join([self.out_dir,'meta-'+suffix+'.json']),"data":_object},{"name":os.sep.join([self.out_dir,'epochs.json']),"data":self.logs['epochs'] if 'epochs' in self.logs else []}] : + f = open(_info['name'],'w') + f.write(json.dumps(_info['data'])) + f.close() + return _object + def mkdir (self,path): + if not os.path.exists(path) : + if os.sep in path : + pass + root = [] + + for loc in 
path.strip().split(os.sep) : + if loc == '' : + root.append(os.sep) + root.append(loc) + if not os.path.exists(os.sep.join(root)) : + os.mkdir(os.sep.join(root)) + + elif not os.path.exists(path): + os.mkdir(path) - - def log_meta(self,**args) : - _object = { - 'CONTEXT':self.CONTEXT, - 'ATTRIBUTES':self.ATTRIBUTES, - 'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU, - 'Z_DIM':self.Z_DIM, - "X_SPACE_SIZE":self.X_SPACE_SIZE, - "D_STRUCTURE":self.D_STRUCTURE, - "G_STRUCTURE":self.G_STRUCTURE, - "NUM_GPUS":self.NUM_GPUS, - "NUM_LABELS":self.NUM_LABELS, - "MAX_EPOCHS":self.MAX_EPOCHS, - "ROW_COUNT":self.ROW_COUNT - } - if args and 'key' in args and 'value' in args : - key = args['key'] - value= args['value'] - object[key] = value - # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column - suffix = self.get.suffix() - _name = os.sep.join([self.out_dir,'meta-'+suffix]) - - f = open(_name+'.json','w') - f.write(json.dumps(_object)) - return _object - def mkdir (self,path): - if not os.path.exists(path) : - os.mkdir(path) - - - def normalize(self,**args): - """ - This function will perform a batch normalization on an network layer - inputs input layer of the neural network - name name of the scope the - labels labels (attributes not synthesized) by default None - n_labels number of labels default None - """ - inputs = args['inputs'] - name = args['name'] - labels = None if 'labels' not in args else args['labels'] - n_labels= None if 'n_labels' not in args else args['n_labels'] - shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing - mean, var = tf.nn.moments(inputs, shift, keep_dims=True) - shape = inputs.shape[1].value - offset_m = self.get.variables(shape=[n_labels,shape], name='offset'+name, - initializer=tf.zeros_initializer) - scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, - initializer=tf.ones_initializer) - - offset = tf.nn.embedding_lookup(offset_m, labels) - scale = tf.nn.embedding_lookup(scale_m, labels) - result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-8) - return result - - def _variable_on_cpu(self,**args): - """ - This function makes sure variables/tensors are not created on the GPU but rather on the CPU - """ - name = args['name'] - shape = args['shape'] - initializer=None if 'initializer' not in args else args['initializer'] - with tf.device('/cpu:0') : - cpu_var = tf.compat.v1.get_variable(name,shape,initializer= initializer) - return cpu_var - def average_gradients(self,tower_grads): - average_grads = [] - for grad_and_vars in zip(*tower_grads): - grads = [] - for g, _ in grad_and_vars: - expanded_g = tf.expand_dims(g, 0) - grads.append(expanded_g) - - grad = tf.concat(axis=0, values=grads) - grad = tf.reduce_mean(grad, 0) - - v = grad_and_vars[0][1] - grad_and_var = (grad, v) - average_grads.append(grad_and_var) - return average_grads + def normalize(self,**args): + """ + This function will perform a batch normalization on an network layer + inputs input layer of the neural network + name name of the scope the + labels labels (attributes not synthesized) by default None + n_labels number of labels default None + """ + inputs = args['inputs'] + name = args['name'] + labels = None if 'labels' not in args else args['labels'] + n_labels= None if 'n_labels' not in args else args['n_labels'] + shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing + # mean, var = tf.nn.moments(inputs, shift, keep_dims=True) + mean, var = 
tf.nn.moments(inputs, shift,keepdims=True) + # shape = inputs.shape[1].value + shape = inputs.shape[1] + + if labels is not None: + offset_m = self.get.variables(shape=[1,shape], name='offset'+name,initializer=tf.zeros_initializer) + scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name,initializer=tf.ones_initializer) + offset = tf.nn.embedding_lookup(offset_m, labels) + scale = tf.nn.embedding_lookup(scale_m, labels) + + else: + offset = None + scale = None + + result = tf.nn.batch_normalization(inputs, mean, var,offset,scale, 1e-8) + return result + + def _variable_on_cpu(self,**args): + """ + This function makes sure variables/tensors are not created on the GPU but rather on the CPU + """ + + name = args['name'] + shape = args['shape'] + initializer=None if 'initializer' not in args else args['initializer'] + with tf.device('/cpu:0') : + cpu_var = tf.compat.v1.get_variable(name,shape,initializer= initializer) + return cpu_var + def average_gradients(self,tower_grads): + average_grads = [] + for grad_and_vars in zip(*tower_grads): + grads = [] + for g, _ in grad_and_vars: + expanded_g = tf.expand_dims(g, 0) + grads.append(expanded_g) + + grad = tf.concat(axis=0, values=grads) + grad = tf.reduce_mean(grad, 0) + + v = grad_and_vars[0][1] + grad_and_var = (grad, v) + average_grads.append(grad_and_var) + return average_grads class Generator (GNet): - """ - This class is designed to handle generation of candidate datasets for this it will aggregate a discriminator, this allows the generator not to be random - - """ - def __init__(self,**args): - GNet.__init__(self,**args) - self.discriminator = Discriminator(**args) - def loss(self,**args): - fake = args['fake'] - label = args['label'] - y_hat_fake = self.discriminator.network(inputs=fake, label=label) - #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - loss = -tf.reduce_mean(y_hat_fake) + sum(all_regs) - #tf.add_to_collection('glosses', loss) - tf.compat.v1.add_to_collection('glosses', loss) - return loss, loss - def load_meta(self, column): - super().load_meta(column) - self.discriminator.load_meta(column) - def network(self,**args) : """ - This function will build the network that will generate the synthetic candidates - :inputs matrix of data that we need - :dim dimensions of ... 
- """ - x = args['inputs'] - tmp_dim = self.Z_DIM if 'dim' not in args else args['dim'] - label = args['label'] + This class is designed to handle generation of candidate datasets for this it will aggregate a discriminator, this allows the generator not to be random - with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): - for i, dim in enumerate(self.G_STRUCTURE[:-1]): - kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim]) - h1 = self.normalize(inputs=tf.matmul(x, kernel),shift=0, name='cbn' + str(i), labels=label, n_labels=self.NUM_LABELS) - h2 = tf.nn.relu(h1) - x = x + h2 - tmp_dim = dim - i = len(self.G_STRUCTURE) - 1 - # - # This seems to be an extra hidden layer: - # It's goal is to map continuous values to discrete values (pre-trained to do this) - kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, self.G_STRUCTURE[-1]]) - h1 = self.normalize(inputs=tf.matmul(x, kernel), name='cbn' + str(i), - labels=label, n_labels=self.NUM_LABELS) - h2 = tf.nn.tanh(h1) - x = x + h2 - # This seems to be the output layer - # - kernel = self.get.variables(name='W_' + str(i+1), shape=[self.Z_DIM, self.X_SPACE_SIZE]) - bias = self.get.variables(name='b_' + str(i+1), shape=[self.X_SPACE_SIZE]) - x = tf.nn.sigmoid(tf.add(tf.matmul(x, kernel), bias)) - return x + """ + def __init__(self,**args): + GNet.__init__(self,**args) + self.discriminator = Discriminator(**args) + def loss(self,**args): + fake = args['fake'] + label = args['label'] + y_hat_fake = self.discriminator.network(inputs=fake, label=label) + #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) + loss = -tf.reduce_mean(y_hat_fake) + sum(all_regs) + #tf.add_to_collection('glosses', loss) + tf.compat.v1.add_to_collection('glosses', loss) + return loss, loss + def load_meta(self, **args): + # super().load_meta(**args) + self.discriminator.load_meta(**args) + + + def network(self,**args) : + """ + This function will build the network that will generate the synthetic candidates + :inputs matrix of data that we need + :dim dimensions of ... 
+ """ + x = args['inputs'] + tmp_dim = self.Z_DIM if 'dim' not in args else args['dim'] + label = args['label'] + + with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): + for i, dim in enumerate(self.G_STRUCTURE[:-1]): + kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim]) + h1 = self.normalize(inputs=tf.matmul(x, kernel),shift=0, name='cbn' + str(i), labels=label, n_labels=self.NUM_LABELS) + h2 = tf.nn.relu(h1) + x = x + h2 + tmp_dim = dim + i = len(self.G_STRUCTURE) - 1 + # + # This seems to be an extra hidden layer: + # It's goal is to map continuous values to discrete values (pre-trained to do this) + kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, self.G_STRUCTURE[-1]]) + h1 = self.normalize(inputs=tf.matmul(x, kernel), name='cbn' + str(i), + labels=label, n_labels=self.NUM_LABELS) + h2 = tf.nn.tanh(h1) + x = x + h2 + # This seems to be the output layer + # + kernel = self.get.variables(name='W_' + str(i+1), shape=[self.Z_DIM, self.X_SPACE_SIZE]) + bias = self.get.variables(name='b_' + str(i+1), shape=[self.X_SPACE_SIZE]) + x = tf.nn.sigmoid(tf.add(tf.matmul(x, kernel), bias)) + return x class Discriminator(GNet): - def __init__(self,**args): - GNet.__init__(self,**args) - def network(self,**args): - """ - This function will apply a computational graph on a dataset passed in with the associated labels and the last layer must have a single output (neuron) - :inputs - :label - """ - x = args['inputs'] - label = args['label'] - with tf.compat.v1.variable_scope('D', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): - for i, dim in enumerate(self.D_STRUCTURE[1:]): - kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[i], dim]) - bias = self.get.variables(name='b_' + str(i), shape=[dim]) - # print (["\t",bias,kernel]) - x = tf.nn.relu(tf.add(tf.matmul(x, kernel), bias)) - x = self.normalize(inputs=x, name='cln' + str(i), shift=1,labels=label, n_labels=self.NUM_LABELS) - i = len(self.D_STRUCTURE) - kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[-1], 1]) - bias = self.get.variables(name='b_' + str(i), shape=[1]) - y = tf.add(tf.matmul(x, kernel), bias) - return y - - def loss(self,**args) : - """ - This function compute the loss of - :real - :fake - :label - """ - real = args['real'] - fake = args['fake'] - label = args['label'] - epsilon = tf.random.uniform(shape=[self.BATCHSIZE_PER_GPU,1],minval=0,maxval=1) - - x_hat = real + epsilon * (fake - real) - y_hat_fake = self.network(inputs=fake, label=label) + def __init__(self,**args): + GNet.__init__(self,**args) + def network(self,**args): + """ + This function will apply a computational graph on a dataset passed in with the associated labels and the last layer must have a single output (neuron) + :inputs + :label + """ + x = args['inputs'] + label = args['label'] + with tf.compat.v1.variable_scope('D', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): + for i, dim in enumerate(self.D_STRUCTURE[1:]): + kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[i], dim]) + bias = self.get.variables(name='b_' + str(i), shape=[dim]) + # print (["\t",bias,kernel]) + x = tf.nn.relu(tf.add(tf.matmul(x, kernel), bias)) + x = self.normalize(inputs=x, name='cln' + str(i), shift=1,labels=label, n_labels=self.NUM_LABELS) + i = len(self.D_STRUCTURE) + kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[-1], 1]) + bias = self.get.variables(name='b_' + 
str(i), shape=[1]) + y = tf.add(tf.matmul(x, kernel), bias) + return y - y_hat_real = self.network(inputs=real, label=label) - y_hat = self.network(inputs=x_hat, label=label) - - grad = tf.gradients(y_hat, [x_hat])[0] - slopes = tf.sqrt(tf.reduce_sum(tf.square(grad), 1)) - gradient_penalty = tf.reduce_mean((slopes - 1.) ** 2) - #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - w_distance = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake) - loss = w_distance + 10 * gradient_penalty + sum(all_regs) - #tf.add_to_collection('dlosses', loss) - tf.compat.v1.add_to_collection('dlosses', loss) - - return w_distance, loss + def loss(self,**args) : + """ + This function compute the loss of + :real + :fake + :label + """ + real = args['real'] + fake = args['fake'] + label = args['label'] + epsilon = tf.random.uniform(shape=[self.BATCHSIZE_PER_GPU,1],minval=0,maxval=1) + + x_hat = real + epsilon * (fake - real) + y_hat_fake = self.network(inputs=fake, label=label) + + y_hat_real = self.network(inputs=real, label=label) + y_hat = self.network(inputs=x_hat, label=label) + + grad = tf.gradients(y_hat, [x_hat])[0] + slopes = tf.sqrt(tf.reduce_sum(tf.square(grad), 1)) + gradient_penalty = tf.reduce_mean((slopes - 1.) ** 2) + #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) + w_distance = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake) + loss = w_distance + 10 * gradient_penalty + sum(all_regs) + #tf.add_to_collection('dlosses', loss) + tf.compat.v1.add_to_collection('dlosses', loss) + + return w_distance, loss class Train (GNet): - def __init__(self,**args): - GNet.__init__(self,**args) - self.generator = Generator(**args) - self.discriminator = Discriminator(**args) - self._REAL = args['real'] - self._LABEL= args['label'] - self.column = args['column'] - # print ([" *** ",self.BATCHSIZE_PER_GPU]) - - self.meta = self.log_meta() - def load_meta(self, column): - """ - This function will delegate the calls to load meta data to it's dependents - column name - """ - super().load_meta(column) - self.generator.load_meta(column) - self.discriminator.load_meta(column) - def loss(self,**args): - """ - This function will compute a "tower" loss of the generated candidate against real data - Training will consist in having both generator and discriminators - :scope - :stage - :real - :label - """ + def __init__(self,**args): + GNet.__init__(self,**args) + self.generator = Generator(**args) + self.discriminator = Discriminator(**args) + self._REAL = args['real'] + self._LABEL= args['label'] if 'label' in args else None + # self.column = args['column'] + # print ([" *** ",self.BATCHSIZE_PER_GPU]) + + self.meta = self.log_meta() + if(self.logger): + + self.logger.write({"module":"gan-train","action":"start","input":{"partition":self.PARTITION,"meta":self.meta} } ) + + + # self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta) + def load_meta(self, column): + """ + This function will delegate the calls to load meta data to it's dependents + column name + """ + super().load_meta(column) + self.generator.load_meta(column) + self.discriminator.load_meta(column) + def loss(self,**args): + """ + This function will compute a "tower" loss of the generated candidate against real data + Training will consist in having both generator and discriminators + :scope + 
:stage + :real + :label + """ + + scope = args['scope'] + stage = args['stage'] + real = args['real'] + label = args['label'] - scope = args['scope'] - stage = args['stage'] - real = args['real'] - label = args['label'] - label = tf.cast(label, tf.int32) - # - # @TODO: Ziqi needs to explain what's going on here - m = [[i] for i in np.arange(self._LABEL.shape[1]-2)] - label = label[:, 1] * len(m) + tf.squeeze( - tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32)) - ) - # label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] ) - z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) + + if label is not None : + label = tf.cast(label, tf.int32) + # + # @TODO: Ziqi needs to explain what's going on here + m = [[i] for i in np.arange(self._LABEL.shape[1]-2)] + label = label[:, 1] * len(m) + tf.squeeze( + tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32)) + ) + # label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] ) + z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) + + fake = self.generator.network(inputs=z, label=label) + if stage == 'D': + w, loss = self.discriminator.loss(real=real, fake=fake, label=label) + #losses = tf.get_collection('dlosses', scope) + flag = 'dlosses' + losses = tf.compat.v1.get_collection('dlosses', scope) + else: + w, loss = self.generator.loss(fake=fake, label=label) + #losses = tf.get_collection('glosses', scope) + flag = 'glosses' + losses = tf.compat.v1.get_collection('glosses', scope) + # losses = tf.compat.v1.get_collection(flag, scope) + + total_loss = tf.add_n(losses, name='total_loss') + # print (total_loss) + return total_loss, w + def input_fn(self): + """ + This function seems to produce + """ + features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32) + LABEL_SHAPE = [None,None] if self._LABEL is None else self._LABEL.shape + labels_placeholder = tf.compat.v1.placeholder(shape=LABEL_SHAPE, dtype=tf.float32) + if self._LABEL is not None : + dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) + else : + dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) + # labels_placeholder = None + dataset = dataset.repeat(800000) + + dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) + dataset = dataset.prefetch(1) + # iterator = dataset.make_initializable_iterator() + iterator = tf.compat.v1.data.make_initializable_iterator(dataset) + return iterator, features_placeholder, labels_placeholder - fake = self.generator.network(inputs=z, label=label) - if stage == 'D': - w, loss = self.discriminator.loss(real=real, fake=fake, label=label) - #losses = tf.get_collection('dlosses', scope) - losses = tf.compat.v1.get_collection('dlosses', scope) - else: - w, loss = self.generator.loss(fake=fake, label=label) - #losses = tf.get_collection('glosses', scope) - losses = tf.compat.v1.get_collection('glosses', scope) - - total_loss = tf.add_n(losses, name='total_loss') - - return total_loss, w - def input_fn(self): - """ - This function seems to produce - """ - features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32) - labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32) - dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) - dataset = dataset.repeat(10000) - dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) - dataset = dataset.prefetch(1) - iterator = dataset.make_initializable_iterator() - # next_element = 
iterator.get_next() - # init_op = iterator.initializer - return iterator, features_placeholder, labels_placeholder - - def network(self,**args): - # def graph(stage, opt): - # global_step = tf.get_variable(stage+'_step', [], initializer=tf.constant_initializer(0), trainable=False) - stage = args['stage'] - opt = args['opt'] - tower_grads = [] - per_gpu_w = [] - iterator, features_placeholder, labels_placeholder = self.input_fn() - with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()): - for i in range(self.NUM_GPUS): - with tf.device('/gpu:%d' % i): - with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: - (real, label) = iterator.get_next() - loss, w = self.loss(scope=scope, stage=stage, real=self._REAL, label=self._LABEL) - #tf.get_variable_scope().reuse_variables() - tf.compat.v1.get_variable_scope().reuse_variables() - #vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage) - vars_ = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=stage) - grads = opt.compute_gradients(loss, vars_) - tower_grads.append(grads) - per_gpu_w.append(w) - - grads = self.average_gradients(tower_grads) - apply_gradient_op = opt.apply_gradients(grads) - - mean_w = tf.reduce_mean(per_gpu_w) - train_op = apply_gradient_op - return train_op, mean_w, iterator, features_placeholder, labels_placeholder - def apply(self,**args): - # max_epochs = args['max_epochs'] if 'max_epochs' in args else 10 - REAL = self._REAL - LABEL= self._LABEL - with tf.device('/cpu:0'): - opt_d = tf.compat.v1.train.AdamOptimizer(1e-4) - opt_g = tf.compat.v1.train.AdamOptimizer(1e-4) - - train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d) - train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g) - # saver = tf.train.Saver() - saver = tf.compat.v1.train.Saver() - # init = tf.global_variables_initializer() - init = tf.compat.v1.global_variables_initializer() - logs = [] - #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: - with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: - sess.run(init) - sess.run(iterator_d.initializer, - feed_dict={features_placeholder_d: REAL, labels_placeholder_d: LABEL}) - sess.run(iterator_g.initializer, - feed_dict={features_placeholder_g: REAL, labels_placeholder_g: LABEL}) - - for epoch in range(1, self.MAX_EPOCHS + 1): - start_time = time.time() - w_sum = 0 - for i in range(self.STEPS_PER_EPOCH): - for _ in range(2): - _, w = sess.run([train_d, w_distance]) - w_sum += w - sess.run(train_g) - duration = time.time() - start_time - - assert not np.isnan(w_sum), 'Model diverged with loss = NaN' - - format_str = 'epoch: %d, w_distance = %f (%.1f)' - print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) - # print (dir (w_distance)) - - logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) - - if epoch % self.MAX_EPOCHS == 0: - # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - suffix = self.get.suffix() - _name = os.sep.join([self.train_dir,suffix]) - # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) - saver.save(sess, _name, write_meta_graph=False, global_step=epoch) + def network(self,**args): + stage = args['stage'] + opt = args['opt'] + tower_grads = [] + per_gpu_w = [] + 
iterator, features_placeholder, labels_placeholder = self.input_fn() + with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()): # + # @TODO: Find a way to handle this across multiple CPU in case the GPU are not available + # - abstract hardware specification + # - determine if the GPU/CPU are busy # - if self.logger : - row = {"logs":logs} #,"model":pickle.dump(sess)} - - self.logger.write(row=row) - + for i in self.GPU_CHIPS : #range(self.NUM_GPUS): + + with tf.device('/gpu:%d' % i): + with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: + if self._LABEL is not None : + (real, label) = iterator.get_next() + else: + + real = iterator.get_next() + label= None + loss, w = self.loss(scope=scope, stage=stage, real=real, label=label) + + #tf.get_variable_scope().reuse_variables() + tf.compat.v1.get_variable_scope().reuse_variables() + #vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage) + vars_ = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=stage) + grads = opt.compute_gradients(loss, vars_) + tower_grads.append(grads) + per_gpu_w.append(w) + + grads = self.average_gradients(tower_grads) + apply_gradient_op = opt.apply_gradients(grads) + + mean_w = tf.reduce_mean(per_gpu_w) + train_op = apply_gradient_op + return train_op, mean_w, iterator, features_placeholder, labels_placeholder + def apply(self,**args): + # max_epochs = args['max_epochs'] if 'max_epochs' in args else 10 + REAL = self._REAL + LABEL= self._LABEL + if (self.logger): + pass + + with tf.device('/cpu:0'): + opt_d = tf.compat.v1.train.AdamOptimizer(1e-4) + opt_g = tf.compat.v1.train.AdamOptimizer(1e-4) + + train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d) + train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g) + # saver = tf.train.Saver() + saver = tf.compat.v1.train.Saver(max_to_keep=len(self.CHECKPOINTS)) + # init = tf.global_variables_initializer() + init = tf.compat.v1.global_variables_initializer() + logs = [] + self.logs['epochs'] = [] + #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: + with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: + + sess.run(init) + + sess.run(iterator_d.initializer, + feed_dict={features_placeholder_d: REAL}) + sess.run(iterator_g.initializer, + feed_dict={features_placeholder_g: REAL}) + + for epoch in range(1, self.MAX_EPOCHS + 1): + start_time = time.time() + w_sum = 0 + for i in range(self.STEPS_PER_EPOCH): + for _ in range(2): + _, w = sess.run([train_d, w_distance]) + w_sum += w + sess.run(train_g) + duration = time.time() - start_time + + assert not np.isnan(w_sum), 'Model diverged with loss = NaN' + + format_str = 'epoch: %d, w_distance = %f (%.1f)' + print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) + # print (dir (w_distance)) + + # logs.append({"epoch": int(epoch),"distance":float(-w_sum/(self.STEPS_PER_EPOCH*2)) }) + + suffix = str(self.CONTEXT) + _name = os.sep.join([self.train_dir,str(epoch),suffix]) if epoch in self.CHECKPOINTS else '' + _logentry = {"path":_name,"epochs":int(epoch),"loss":float(-w_sum/(self.STEPS_PER_EPOCH*2))} + # if epoch % self.MAX_EPOCHS == 0: + # if epoch in [5,10,20,50,75, self.MAX_EPOCHS] : + if epoch in self.CHECKPOINTS : + # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else 
self.ATTRIBUTES['synthetic'] + # suffix = self.CONTEXT #self.get.suffix() + # _name = os.sep.join([self.train_dir,str(epoch),suffix]) + # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) + saver.save(sess, _name, write_meta_graph=False, global_step=np.int64(epoch)) + + # + # + + # logs = [] + # if self.logger : + # # row = {"module":"gan-train","action":"epochs","input":{"logs":logs}} #,"model":pickle.dump(sess)} + # # self.logger.write(row) + # self.logs['epochs'] += logs + # # + # # @TODO: + # # We should upload the files in the checkpoint + # # This would allow the learnt model to be portable to another system + # + self.logs['epochs'].append(_logentry) + tf.compat.v1.reset_default_graph() + # + # let's sort the epochs we've logged thus far (if any) + # Take on the last five checkpoints https://stackoverflow.com/questions/41018454/tensorflow-checkpoint-models-getting-deleted + # + # self.logs['epochs'] = self.logs['epochs'][-5:] + + if self.logger : + _log = {'module':'gan-train','context':self.CONTEXT,'action':'epochs','input':self.logs['epochs']} + self.logger.write(_log) + + # + # @TODO: + # Make another copy of this on disk to be able to load it should we not have a logger setup + # + self.log_meta() class Predict(GNet): - """ - This class uses synthetic data given a learned model - """ - def __init__(self,**args): - GNet.__init__(self,**args) - self.generator = Generator(**args) - self.values = args['values'] - def load_meta(self, column): - super().load_meta(column) - self.generator.load_meta(column) - def apply(self,**args): - # print (self.train_dir) - # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - suffix = self.get.suffix() - model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) - demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] - tf.compat.v1.reset_default_graph() - z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) - y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32) - ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] - label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) - - fake = self.generator.network(inputs=z, label=label) - init = tf.compat.v1.global_variables_initializer() - saver = tf.compat.v1.train.Saver() - with tf.compat.v1.Session() as sess: - - # sess.run(init) - saver.restore(sess, model_dir) - labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) - - labels= demo - f = sess.run(fake,feed_dict={y:labels}) - # - # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes - # - - df = ( pd.DataFrame(np.round(f).astype(np.int32))) - # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms - # df = (i * df).sum(axis=1) - # - # In case we are dealing with actual values like diagnosis codes we can perform - # - columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] - - r = np.zeros((self.ROW_COUNT,len(columns))) - for col in df : - i = np.where(df[col])[0] - r[i] = col - - df = pd.DataFrame(r,columns=columns) - - df[df.columns] = (df.apply(lambda value: self.values[ int(value)],axis=1)) - return df.to_dict(orient='lists') - # return df.to_dict(orient='list') - # count = str(len(os.listdir(self.out_dir))) - # _name = 
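# --- Illustrative aside ------------------------------------------------------
# Every checkpoint epoch above appends an entry of the form
# {"path": ..., "epochs": ..., "loss": ...} to self.logs['epochs']; entries
# with an empty path were not saved. Further down, in data/maker/__init__.py,
# Trainer.run filters these entries and keeps the lowest-loss checkpoint for
# generation. That selection in isolation (the paths below are made up):
def pick_checkpoint(epoch_logs):
    saved = [e for e in epoch_logs if e['path'] != '']
    return min(saved, key=lambda e: e['loss']) if saved else None

logs = [{'path': '', 'epochs': 1, 'loss': 0.90},
        {'path': 'logs/demo/10', 'epochs': 10, 'loss': 0.21},
        {'path': 'logs/demo/20', 'epochs': 20, 'loss': 0.34}]
print(pick_checkpoint(logs))                 # the epoch-10 entry (lowest loss)
# -----------------------------------------------------------------------------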
os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv']) - # df.to_csv(_name,index=False) - - - # output.extend(np.round(f)) - - # for m in range(2): - # for n in range(2, self.NUM_LABELS): - # idx1 = (demo[:, m] == 1) - # idx2 = (demo[:, n] == 1) - # idx = [idx1[j] and idx2[j] for j in range(len(idx1))] - # num = np.sum(idx) - # print ("___________________list__") - # print (idx1) - # print (idx2) - # print (idx) - # print (num) - # print ("_____________________") - # nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU)) - # label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS)) - # label_input[:, n] = 1 - # label_input[:, m] = 1 - # output = [] - # for i in range(nbatch): - # f = sess.run(fake,feed_dict={y: label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]}) - # output.extend(np.round(f)) - # output = np.array(output)[:num] - # print ([m,n,output]) - - # np.save(self.out_dir + str(m) + str(n), output) - - -if __name__ == '__main__' : - # - # Now we get things done ... - column = SYS_ARGS['column'] - column_id = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id' - column_id = column_id.split(',') if ',' in column_id else column_id - df = pd.read_csv(SYS_ARGS['raw-data']) - LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - - context = SYS_ARGS['raw-data'].split(os.sep)[-1:][0][:-4] - if set(['train','learn']) & set(SYS_ARGS.keys()): - - df = pd.read_csv(SYS_ARGS['raw-data']) - - # cols = SYS_ARGS['column'] - # _map,_df = (Binary()).Export(df) - # i = np.arange(_map[column]['start'],_map[column]['end']) - max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10 - # REAL = _df[:,i] - REAL = pd.get_dummies(df[column]).astype(np.float32).values - LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - trainer = Train(context=context,max_epochs=max_epochs,real=REAL,label=LABEL,column=column,column_id=column_id) - trainer.apply() - - - - - # - # We should train upon this data - # - # -- we need to convert the data-frame to binary matrix, given a column - # - pass - elif 'generate' in SYS_ARGS: - values = df[column].unique().tolist() - values.sort() - - p = Predict(context=context,label=LABEL,values=values,column=column) - p.load_meta(column) - r = p.apply() - print (df) - print () - df[column] = r[column] - print (df) - + """ + This class uses synthetic data given a learned model + """ + def __init__(self,**args): + GNet.__init__(self,**args) + self.generator = Generator(**args) + self.values = args['values'] + self.ROW_COUNT = args['row_count'] + self.oROW_COUNT = self.ROW_COUNT + + # self.MISSING_VALUES = np.nan_to_num(np.nan) + # if 'no_value' in args and args['no_value'] not in ['na','','NA'] : + # self.MISSING_VALUES = args['no_value'] + self.MISSING_VALUES = args['missing'] if 'missing' in args else [] + + + # self.MISSING_VALUES = args['no_value'] + # self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value'] + def load_meta(self, **args): + super().load_meta(**args) + self.generator.load_meta(**args) + self.ROW_COUNT = self.oROW_COUNT + # + # updating the input/output for the generator, so it points properly + # + + for object in [self,self.generator] : + _train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT,str(self.MAX_EPOCHS)]) + _out_dir= os.sep.join([self.log_dir,'output',self.CONTEXT,str(self.MAX_EPOCHS)]) + setattr(object,'train_dir',_train_dir) + setattr(object,'out_dir',_out_dir) + def 
apply(self,**args): + suffix = self.CONTEXT #self.get.suffix() + model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) + # model_dir = os.sep.join([self.train_dir,str(self.MAX_EPOCHS)]) + + demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] + # + # setup computational graph + tf.compat.v1.reset_default_graph() + z = tf.random.normal(shape=[self.ROW_COUNT, self.Z_DIM]) + + y = tf.compat.v1.placeholder(shape=[self.ROW_COUNT, self.NUM_LABELS], dtype=tf.int32) + if self._LABEL is not None : + ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] + label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) + else: + label = None + + fake = self.generator.network(inputs=z, label=label) + init = tf.compat.v1.global_variables_initializer() + saver = tf.compat.v1.train.Saver() + df = pd.DataFrame() + CANDIDATE_COUNT = args['candidates'] if 'candidates' in args else 1 #0 if self.ROW_COUNT < 1000 else 100 + candidates = [] - else: - print (SYS_ARGS.keys()) - print (__doc__) - pass + with tf.compat.v1.Session() as sess: + saver.restore(sess, model_dir) + if self._LABEL is not None : + # labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) + labels= demo + else: + labels = None + + for i in np.arange(CANDIDATE_COUNT) : + if labels : + _matrix = sess.run(fake,feed_dict={y:labels}) + else: + _matrix = sess.run(fake) + # + # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes + # The code below will insure we have some acceptable cardinal relationships between id and synthetic values + # + + # df = pd.DataFrame(np.round(f)).astype(np.int32) + # candidates.append (np.round(_matrix).astype(np.int64)) + candidates.append(np.array([np.round(row).astype(int) for row in _matrix])) + # return candidates[0] if len(candidates) == 1 else candidates + + return [candidates [0]] + + def _apply(self,**args): + # print (self.train_dir) + # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + suffix = self.CONTEXT #self.get.suffix() + model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) + demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] + tf.compat.v1.reset_default_graph() + z = tf.random.normal(shape=[self.ROW_COUNT, self.Z_DIM]) + + y = tf.compat.v1.placeholder(shape=[self.ROW_COUNT, self.NUM_LABELS], dtype=tf.int32) + if self._LABEL is not None : + ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] + label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) + else: + label = None + + fake = self.generator.network(inputs=z, label=label) + init = tf.compat.v1.global_variables_initializer() + saver = tf.compat.v1.train.Saver() + df = pd.DataFrame() + CANDIDATE_COUNT = 5 #0 if self.ROW_COUNT < 1000 else 100 + NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0] + with tf.compat.v1.Session() as sess: + + # sess.run(init) + + saver.restore(sess, model_dir) + if self._LABEL is not None : + labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) + labels= demo + else: + labels = None + + found = [] + ratio = [] + __x__ = None + __ratio=0 + for i in np.arange(CANDIDATE_COUNT) : + if labels : + _matrix = sess.run(fake,feed_dict={y:labels}) + else: + _matrix = sess.run(fake) + # + # if we are dealing with numeric values only we can perform a simple 
marginal sum against the indexes + # The code below will insure we have some acceptable cardinal relationships between id and synthetic values + # + + # df = pd.DataFrame(np.round(f)).astype(np.int32) + found.append (np.round(_matrix).astype(np.int64)) + # df = pd.DataFrame(np.round(_matrix),dtype=int) + p = 0 not in df.sum(axis=1).values + # x = df.sum(axis=1).values + + # if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size : + # ratio.append(np.divide( np.sum(x), x.size)) + # found.append(df) + + # # break + # if len(found) == CANDIDATE_COUNT: + + # break + # else: + # __x__ = df if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __x__ + # __ratio = np.divide( np.sum(x), x.size) if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __ratio + # continue + + # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms + # df = (i * df).sum(axis=1) + # + # In case we are dealing with actual values like diagnosis codes we can perform + # + # N = len(found) + # _index = [i for i in range(0,N) if found[i].shape[1] == len(self.values)] + # if not _index and not found : + # df = __x__ + # INDEX = -1 + # else : + # if not _index : + # INDEX = np.random.choice(np.arange(len(found)),1)[0] + # INDEX = ratio.index(np.max(ratio)) + # else: + # INDEX = _index[0] + + + # df = found[INDEX] + # columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] + + # r = np.zeros((self.ROW_COUNT,len(columns))) + # r = np.zeros(self.ROW_COUNT) + + # if self.logger : + # info = {"found":len(found),"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)} + # if df.shape[1] > len(self.values) : + # df = df.iloc[:len(self.values)] + # if INDEX > 0 : + # info =dict(info ,**{"selected":INDEX, "ratio": ratio[INDEX] }) + # else : + + # info['selected'] = -1 + # info['ratio'] = __ratio + # info['partition'] = self.PARTITION + # self.logger.write({"module":"gan-generate","action":"generate","input":info}) + # # df.columns = self.values + # if len(found) or df.columns.size <= len(self.values): + # ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1) + # missing = [] + # if ii.sum() > 0 : + # # + # # If the generator had a reductive effect we should be able to get random values from either : + # # - The space of outliers + # # - existing values for smaller spaces that have suffered over training + # # + + # N = ii.sum() + # missing_values = self.MISSING_VALUES if self.MISSING_VALUES else self.values + # missing = np.random.choice(missing_values,N) + # # missing = [] + # # + # # @TODO: + # # Log the findings here in terms of ratio, missing, candidate count + # # print ([np.max(ratio),len(missing),len(found),i]) + # i = np.where(ii == 0)[0] + + + # df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) + # df.columns = columns + # df = df[columns[0]].append(pd.Series(missing)) + + + # if self.logger : + + # info= {"missing": i.size,"rows":df.shape[0],"cols":1,'partition':self.PARTITION} + # self.logger.write({"module":"gan-generate","action":"compile.io","input":info}) + + + + # print(df.head()) + tf.compat.v1.reset_default_graph() + # df = pd.DataFrame(df) + # df.columns = columns + # np.random.shuffle(df[columns[0]].values) + # return df.to_dict(orient='list') + return _matrix diff --git a/data/maker/__init__.py b/data/maker/__init__.py index f97e5f3..7b3a347 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py 
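# --- Illustrative aside (before moving on to data/maker/__init__.py) ---------
# Predict.apply above turns each sampled generator output into an integer
# matrix with np.round(...).astype(int) and collects the results as candidate
# datasets. The same post-processing step on a small fake float matrix (the
# real matrix comes from sess.run(fake)):
import numpy as np

def to_candidate(matrix):
    return np.array([np.round(row).astype(int) for row in matrix])

fake_output = np.array([[0.10, 0.92, 0.00],
                        [0.70, 0.20, 0.05]])
print(to_candidate(fake_output))             # [[0 1 0]
                                             #  [1 0 0]]
# -----------------------------------------------------------------------------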
@@ -11,68 +11,592 @@ This package is designed to generate synthetic data from a dataset from an origi import pandas as pd import numpy as np import data.gan as gan -from transport import factory -def train (**args) : - """ - This function is intended to train the GAN in order to learn about the distribution of the features - :column columns that need to be synthesized (discrete) - :logs where the output of the (location on disk) - :id identifier of the dataset - :data data-frame to be synthesized - :context label of what we are synthesizing +import transport +# from data.bridge import Binary +import threading +from data.maker import prepare +from data.maker.state import State +import copy +import os +import nujson as json +from multiprocessing import Process, RLock +from datetime import datetime, timedelta +from multiprocessing import Queue + +import time + + + +class Learner(Process): + + def __init__(self,**_args): + + + super(Learner, self).__init__() + self.ndx = 0 + self._queue = Queue() + self.lock = RLock() + if 'gpu' in _args : + + os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu']) + self.gpu = int(_args['gpu']) + else: + self.gpu = None + + self.info = _args['info'] + self.columns = self.info['columns'] if 'columns' in self.info else None + self.store = _args['store'] + + if 'network_args' not in _args : + self.network_args ={ + 'context':self.info['context'] , + 'logs':_args['logs'] if 'logs' in _args else 'logs', + 'max_epochs':int(_args['epochs']) if 'epochs' in _args else 2, + 'batch_size':int (_args['batch']) if 'batch' in _args else 2000 + } + else: + self.network_args = _args['network_args'] + self._encoder = None + self._map = None + self._df = _args['data'] if 'data' in _args else None + + self.name = self.__class__.__name__ + + # + # @TODO: allow for verbose mode so we have a sens of what is going on within the newtork + # + + _log = {'action':'init','gpu':(self.gpu if self.gpu is not None else -1)} + self.log(**_log) + self.cache = [] + # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' + # sel.max_epoc + self.logger = None + if 'logger' in self.store : + self.logger = transport.factory.instance(**self.store['logger']) + self.autopilot = False #-- to be set by caller + self._initStateSpace() + def _initStateSpace(self): + """ + Initializing state-space for the data-maker, The state-space functions are used as pre-post processing functions applied to the data accordingly i.e + - Trainer -> pre-processing + - Generation -> post processing + The specifications of a state space in the configuration file is as such + state:{pre:{path,pipeline:[]}, post:{path,pipeline:[]}} + """ + self._states = None + + if 'state' in self.info : + try: + _config = self.info ['state'] + self._states = State.instance(_config) + except Exception as e: + print (e) + pass + finally: + # __info = (pd.DataFrame(self._states)[['name','path','args']]).to_dict(orient='records') + if self._states : + __info = {} + + for key in self._states : + __info[key] = [{"name":_item['name'],"args":_item['args'],"path":_item['path']} for _item in self._states[key]] + self.log(object='state-space',action='load',input=__info) + + + + def log(self,**_args): + + try: + _context = self.info['context'] + _label = self.info['info'] if 'info' in self.info else _context + # logger = + _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_label,**_args}) + if 'logger' in self.store : + logger = transport.factory.instance(**self.store['logger']) if 'logger' 
in self.store else transport.factory.instance(provider=transport.providers.CONSOLE,context='write',lock=True) + logger.write(_args) + self.ndx += 1 + # if hasattr(logger,'close') : + # logger.close() + pass + except Exception as e: + # print () + # print (_args) + # print (e) + pass + finally: + + pass + def get_schema(self): + # if self.store['source']['provider'] != 'bigquery' : + # return [] #{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] + # else: + # reader = transport.factory.instance(**self.store['source']) + # return reader.meta(table=self.info['from']) + reader = transport.factory.instance(**self.store['source']) + return reader.meta(table=self.info['from']) + + def initalize(self): + reader = transport.factory.instance(**self.store['source']) + _read_args= self.info + if self._df is None : + self._df = reader.read(**_read_args) + # + # NOTE : PRE + # At this point we apply pre-processing of the data if there were ever a need for it + # + _log = {} + HAS_STATES = self._states is not None and 'pre' in self._states + NOT_GENERATING = self.name in ['Trainer','Shuffle'] + IS_AUTOPILOT = self.autopilot + # + # allow calling pre-conditions if either of the conditions is true + # 1. states and not generating + # 2. IS_GENERATING and states and not autopilot + _ALLOW_PRE_CALL = (HAS_STATES and NOT_GENERATING) or (NOT_GENERATING is False and HAS_STATES and IS_AUTOPILOT is False) + if _ALLOW_PRE_CALL : + # if HAS_STATES and NOT_GENERATING or (HAS_STATES and IS_AUTOPILOT is False and NOT_GENERATING is False): + _logs = {'action':'status','input':{'pre':self._states['pre']}} + _beg = list(self._df.shape) + self._df = State.apply(self._df,self._states['pre']) + _end = list(self._df.shape) + _logs['input']['size'] = _beg,_end + self.log(**_log) + + # + # + + columns = self.columns if self.columns else self._df.columns + # + # Below is a source of inefficiency, unfortunately python's type inference doesn't work well in certain cases + # - The code below tries to address the issue (Perhaps better suited for the reading components) + + for name in columns : + # + # randomly sampling 5 elements to make sense of data-types + if self._df[name].size < 5 : + continue + _index = np.random.choice(np.arange(self._df[name].size),5,False) + no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]] + no_value = 0 if np.sum(no_value) > 0 else '' + try: + self._df[name] = self._df[name].fillna(no_value) + finally: + pass + + _log[name] = self._df[name].dtypes.name + _log = {'action':'structure','input':_log} + self.log(**_log) + # + # convert the data to binary here ... 
+ _schema = self.get_schema() + _args = {"schema":_schema,"data":self._df,"columns":columns} + if self._map : + _args['map'] = self._map + self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None + _log = {'action':'data-prep','input':{'rows':int(self._df.shape[0]),'cols':int(self._df.shape[1]) } } + self.log(**_log) + def get(self): + + if self.cache : + return self.cache if len(self.cache) > 0 else(self.cache if not self.cache else self.cache[0]) + else: + return self._queue.get() if self._queue.qsize() > 0 else [] + + def listen(self): + while True : + _info = self._queue.get() + self.cache.append(_info) + self._queue.task_done() + def publish(self,caller): + if hasattr(caller,'_queue') : + _queue = caller._queue + _queue.put(self.cache) + + # _queue.join() + pass +class Trainer(Learner): """ - column = args['column'] - - column_id = args['id'] - df = args['data'] - logs = args['logs'] - real = pd.get_dummies(df[column]).astype(np.float32).values - labels = pd.get_dummies(df[column_id]).astype(np.float32).values - num_gpu = 1 if 'num_gpu' not in args else args['num_gpu'] - max_epochs = 10 if 'max_epochs' not in args else args['max_epochs'] - context = args['context'] - if 'store' in args : - args['store']['args']['doc'] = context - logger = factory.instance(**args['store']) - - else: - logger = None - - trainer = gan.Train(context=context,max_epochs=max_epochs,num_gpu=num_gpu,real=real,label=labels,column=column,column_id=column_id,logger = logger,logs=logs) - return trainer.apply() - -def generate(**args): + This will perform training using a GAN """ - This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset - @return pandas.DataFrame + def __init__(self,**_args): + super().__init__(**_args) + # self.info = _args['info'] + self.limit = int(_args['limit']) if 'limit' in _args else None + + self.autopilot = _args['autopilot'] if 'autopilot' in _args else False + self.generate = None + self.candidates = int(_args['candidates']) if 'candidates' in _args else 1 + self.checkpoint_skips = _args['checkpoint_skips'] if 'checkpoint_skips' in _args else None + def run(self): + self.initalize() + if self._encoder is None : + # + # @TODO Log that the dataset was empty or not statistically relevant + return + _space,_matrix = self._encoder.convert() + + _args = self.network_args + if self.gpu : + _args['gpu'] = self.gpu + _args['real'] = _matrix + _args['candidates'] = self.candidates + if 'logger' in self.store : + _args['logger'] = transport.factory.instance(**self.store['logger']) + if self.checkpoint_skips : + _args['checkpoint_skips'] = self.checkpoint_skips + # + # At this point we have the binary matrix, we can initiate training + # + beg = datetime.now() #.strftime('%Y-%m-%d %H:%M:%S') + gTrain = gan.Train(**_args) + gTrain.apply() + + writer = transport.factory.instance(provider=transport.providers.FILE,context='write',path=os.sep.join([gTrain.out_dir,'map.json'])) + + writer.write(self._encoder._map,overwrite=True) + writer.close() + + # + # @TODO: At this point we need to generate another some other objects + # + _args = {"network_args":self.network_args,"store":self.store,"info":self.info,"candidates":self.candidates,"data":self._df} + _args['logs'] = self.network_args['logs'] + _args['autopilot'] = self.autopilot + if self.gpu : + _args['gpu'] = self.gpu + + # + # Let us find the smallest, the item is sorted by loss on disk + # + _epochs = [_e for _e in gTrain.logs['epochs'] if _e['path'] != ''] + 
_epochs.sort(key=lambda _item: _item['loss'],reverse=False) + + _args['network_args']['max_epochs'] = _epochs[0]['epochs'] + self.log(action='autopilot',input={'epoch':_epochs[0]}) + g = Generator(**_args) + # g.run() + + end = datetime.now() #.strftime('%Y-%m-%d %H:%M:%S') + _min = float((end-beg).seconds/ 60) + _logs = {'action':'train','input':{'start':beg.strftime('%Y-%m-%d %H:%M:%S'),'minutes':_min,"unique_counts":self._encoder._io[0]}} + self.log(**_logs) + self._g = g + if self.autopilot : + self._g.run() + # + #@TODO Find a way to have the data in the object .... + + + + def generate (self): + if self.autopilot : + print( "Autopilot is set ... No need to call this function") + else: + raise Exception( "Autopilot has not been, Wait till training is finished. Use is_alive function on process object") + +class Generator (Learner): + def __init__(self,**_args): + super().__init__(**_args) + # + # We need to load the mapping information for the space we are working with ... + # + self.network_args['candidates'] = int(_args['candidates']) if 'candidates' in _args else 1 + filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'map.json']) + self.log(**{'action':'init-map','input':{'filename':filename,'exists':os.path.exists(filename)}}) + if os.path.exists(filename): + file = open(filename) + self._map = json.loads(file.read()) + file.close() + else: + self._map = {} + self.autopilot = False if 'autopilot' not in _args else _args['autopilot'] + def run(self): + self.initalize() + if self._encoder is None : + # + # @TODO Log that the dataset was empty or not statistically relevant + return + + # + # The values will be returned because we have provided _map information from the constructor + # + + values,_matrix = self._encoder.convert() + _args = self.network_args + _args['map'] = self._map + _args['values'] = np.array(values) + _args['row_count'] = self._df.shape[0] + if self.gpu : + _args['gpu'] = self.gpu + if 'logger' in self.store : + _args['logger'] = transport.factory.instance(**self.store['logger']) + gHandler = gan.Predict(**_args) + gHandler.load_meta(columns=None) + _iomatrix = gHandler.apply() + _candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix] + + _size = np.sum([len(_item) for _item in _iomatrix]) + _log = {'action':'io-data','input':{'candidates':len(_candidates),'rows':int(_size)}} + self.log(**_log) + # self.cache = _candidates + self.post(_candidates) + def approximate(self,_df): + _columns = self.info['approximate'] + + for name in _columns : + if _df[name].size > 100 : + BATCH_SIZE = 10 + + else: + BATCH_SIZE = 1 + batches = np.array_split(_df[name].fillna(np.nan).values,BATCH_SIZE) + _type = np.int64 if 'int' in self.info['approximate'][name]else np.float64 + x = [] + _log = {'action':'approximate','input':{'batch':BATCH_SIZE,'col':name}} + for values in batches : + + index = [ _x not in ['',None,np.nan] for _x in values] + + if np.sum(index) == 0: + # + # Sometimes messy data has unpleasant surprises + continue + + _values = np.random.rand( len(values[index])) + _values += np.std(values[index]) / 4 + + values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values) + values[index] = values[index].astype(_type) + x += values.tolist() + + if x : + _log['input']['identical_percentage'] = 100 * (np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size)) + + _df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64) + 
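# --- Illustrative aside ------------------------------------------------------
# approximate() perturbs a numeric column by adding (or subtracting) a random
# offset scaled by the column's standard deviation, then casts back to the
# declared type. The core of that idea, without the batching and logging:
import numpy as np

def jitter(values, out_type=np.int64):
    values = np.array(values, dtype=np.float64)
    noise = np.random.rand(values.size) + np.std(values) / 4
    signed = noise if np.random.randint(0, 2) else -noise
    return (values + signed).astype(out_type)

print(jitter([120, 135, 150, 142]))          # close to, but not equal to, the originals
# -----------------------------------------------------------------------------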
self.log(**_log) + return _df + def make_date(self,**_args) : + """ + :param year initial value + """ + if _args['year'] in ['',None,np.nan] : + return None + year = int(_args['year']) + + offset = _args['offset'] if 'offset' in _args else 0 + month = np.random.randint(1,13) + if month == 2: + _end = 28 if year % 4 != 0 else 29 + else: + _end = 31 if month in [1,3,5,7,8,10,12] else 30 + day = np.random.randint(1,_end) + + #-- synthetic date + _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0) + FORMAT = '%Y-%m-%d' + _name = _args['field'] if 'field' in _args else None + if 'format' in self.info and _name in self.info['format']: + # _name = _args['field'] + FORMAT = self.info['format'][_name] + + + # print ([_name,FORMAT, _date.strftime(FORMAT)]) + r = [] + if offset : + r = [_date.strftime(FORMAT)] + for _delta in offset : + _date = _date + timedelta(_delta) + r.append(_date.strptime(FORMAT)) + return r + else: + return _date.strftime(FORMAT) + + pass + def format(self,_df,_schema): + r = {} + + for _item in _schema : + name = _item['name'] + + if _item['type'].upper() in ['DATE','DATETIME','TIMESTAMP'] : + FORMAT = '%Y-%m-%d' + + try: + # + #-- Sometimes data isn't all it's meant to be + SIZE = -1 + if 'format' in self.info and name in self.info['format'] : + FORMAT = self.info['format'][name] + SIZE = 10 + elif _item['type'] in ['DATETIME','TIMESTAMP'] : + FORMAT = '%Y-%m-%-d %H:%M:%S' + SIZE = 19 + + if SIZE > 0 : + + values = pd.to_datetime(_df[name], format=FORMAT).astype(np.datetime64) + # _df[name] = [_date[:SIZE].strip() for _date in values] + + + # _df[name] = _df[name].astype(str) + r[name] = FORMAT + # _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') + if _item['type'] in ['DATETIME','TIMESTAMP']: + pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]') + + except Exception as e: + pass + finally: + pass + else: + + # + # Because types are inferred on the basis of the sample being processed they can sometimes be wrong + # To help disambiguate we add the schema information + _type = None + + if 'int' in _df[name].dtypes.name or 'int' in _item['type'].lower(): + _type = np.int + + elif 'float' in _df[name].dtypes.name or 'float' in _item['type'].lower(): + _type = np.float + if _type : + + _df[name] = _df[name].fillna(0).replace(' ',0).replace('',0).replace('NA',0).replace('nan',0).astype(_type) + # else: + # _df[name] = _df[name].astype(str) + # _df = _df.replace('NaT','').replace('NA','') + + if r : + self.log(**{'action':'format','input':r}) + return _df + + pass + def post(self,_candidates): + + if 'target' in self.store : + _store = self.store['target'] if 'target' in self.store else {'provider':'console'} + _store['lock'] = True + _store['context'] = 'write' #-- Just in case + if 'table' not in _store : + _store['table'] = self.info['from'] + else: + _store = None + N = 0 + for _iodf in _candidates : + _df = self._df.copy() + _df[self.columns] = _iodf[self.columns] + N += _df.shape[0] + if self._states and 'post' in self._states: + _df = State.apply(_df,self._states['post']) + # # + # #@TODO: + # # Improve formatting with better post-processing pipeline + # if 'approximate' in self.info : + # _df = self.approximate(_df) + # if 'make_date' in self.info : + # for name in self.info['make_date'] : + # # iname = self.info['make_date']['init_field'] + # iname = self.info['make_date'][name] + + # years = _df[iname] + # _dates = [self.make_date(year=_year,field=name) for _year in years] + # if _dates : + # _df[name] = 
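# --- Illustrative aside ------------------------------------------------------
# make_date() above draws a random month/day for the given year (with a simple
# year % 4 leap check for February) and, when an offset list is provided, also
# emits dates shifted by those day counts. A standalone sketch of the same
# idea; note it uses strftime when appending offset dates, where the patch
# calls _date.strptime(FORMAT), which looks like a typo for strftime:
from datetime import datetime, timedelta
import numpy as np

def make_random_date(year, offset=None, fmt='%Y-%m-%d'):
    month = int(np.random.randint(1, 13))
    if month == 2:
        end = 29 if year % 4 == 0 else 28
    else:
        end = 31 if month in [1, 3, 5, 7, 8, 10, 12] else 30
    day = int(np.random.randint(1, end))
    date = datetime(year=year, month=month, day=day)
    if not offset:
        return date.strftime(fmt)
    out = [date.strftime(fmt)]
    for delta in offset:
        date = date + timedelta(days=delta)
        out.append(date.strftime(fmt))
    return out

print(make_random_date(1984))
print(make_random_date(1984, offset=[30, 90]))
# -----------------------------------------------------------------------------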
_dates + + + + _schema = self.get_schema() + + _df = self.format(_df,_schema) + _log = [{"name":_schema[i]['name'],"dataframe":_df[_df.columns[i]].dtypes.name,"schema":_schema[i]['type']} for i in np.arange(len(_schema)) ] + self.log(**{"action":"consolidate","input":_log}) - :data data-frame to be synthesized - :column columns that need to be synthesized (discrete) - :id column identifying an entity - :logs location on disk where the learnt knowledge of the dataset is + + if _store : + writer = transport.factory.instance(**_store) + if _store['provider'] == 'bigquery': + writer.write(_df,schema=[],table=self.info['from']) + else: + writer.write(_df,table=self.info['from']) + else: + self.cache.append(_df) + + + + + + self.log(**{'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) +class Shuffle(Generator): """ - df = args['data'] - - column = args['column'] - column_id = args['id'] - logs = args['logs'] - context = args['context'] - num_gpu = 1 if 'num_gpu' not in args else args['num_gpu'] - max_epochs = 10 if 'max_epochs' not in args else args['max_epochs'] - - # - #@TODO: - # If the identifier is not present, we should fine a way to determine or make one - # - #ocolumns= list(set(df.columns.tolist())- set(columns)) - - values = df[column].unique().tolist() - values.sort() - - labels = pd.get_dummies(df[column_id]).astype(np.float32).values - handler = gan.Predict (context=context,label=labels,max_epochs=max_epochs,num_gpu=num_gpu,values=values,column=column,logs=logs) - handler.load_meta(column) - r = handler.apply() - _df = df.copy() - _df[column] = r[column] - return _df \ No newline at end of file + This is a method that will yield data with low utility + """ + def __init__(self,**_args): + super().__init__(**_args) + def run(self): + + np.random.seed(1) + self.initalize() + _index = np.arange(self._df.shape[0]) + np.random.shuffle(_index) + np.random.shuffle(_index) + _iocolumns = self.info['columns'] + _ocolumns = list(set(self._df.columns) - set(_iocolumns) ) + # _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(_index.size)) + _iodf = pd.DataFrame(self._df[_iocolumns].copy(),index = np.arange(_index.size)) + # self._df = self._df.loc[_index][_ocolumns].join(_iodf) + self._df = self._df.loc[_index][_ocolumns] + self._df.index = np.arange(self._df.shape[0]) + self._df = self._df.join(_iodf) + # + # The following is a full shuffle + self._df = self._df.loc[_index] + self._df.index = np.arange(self._df.shape[0]) + + + _log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}} + self.log(**_log) + try: + self.post([self._df]) + self.log(**{'action':'completed','input':{'candidates':1,'rows':int(self._df.shape[0])}}) + except Exception as e : + # print (e) + self.log(**{'action':'failed','input':{'msg':e,'info':self.info}}) +class apply : + TRAIN,GENERATE,RANDOM = 'train','generate','random' +class factory : + _infocache = {} + @staticmethod + def instance(**_args): + """ + An instance of an object that trains and generates candidate datasets + :param gpu (optional) index of the gpu to be used if using one + :param store {source,target} if no target is provided console will be output + :param epochs (default 2) number of epochs to train + :param candidates(default 1) number of candidates to generate + :param info {columns,sql,from} + :param autopilot will generate output automatically + :param batch (default 2k) size of the batch + + """ + + + if _args['apply'] in [apply.RANDOM] : + pthread = 
Shuffle(**_args) + elif _args['apply'] == apply.GENERATE : + pthread = Generator(**_args) + else: + pthread= Trainer(**_args) + if 'start' in _args and _args['start'] == True : + pthread.start() + return pthread + +class plugins: + @staticmethod + def load(_config): + """ + This function attempts to load the plugins to insure they are valid + _config configuration for plugin specifications {pre:{pipeline,path},post:{pipeline,path}} + """ + + diff --git a/data/maker/__main__.py b/data/maker/__main__.py deleted file mode 100644 index e77bf0a..0000000 --- a/data/maker/__main__.py +++ /dev/null @@ -1,10 +0,0 @@ -import pandas as pd -import data.maker - -df = pd.read_csv('sample.csv') -column = 'gender' -id = 'id' -context = 'demo' -store = {"type":"mongo.MongoWriter","args":{"host":"localhost:27017","dbname":"GAN"}} -max_epochs = 11 -data.maker.train(store=store,max_epochs=max_epochs,context=context,data=df,column=column,id=id,logs='foo') \ No newline at end of file diff --git a/data/maker/apply.py b/data/maker/apply.py new file mode 100644 index 0000000..bb6a085 --- /dev/null +++ b/data/maker/apply.py @@ -0,0 +1,76 @@ +""" +This file is designed to specify the appliction of pre/post-processing code. + The pre-processing code gets applied after the data has been loaded + The post-processing code get applied after the data has been generated for instance: + -approximation code/logic; date shifting; suppression; adding noise + - +""" +import numpy as np +from datetime import datetime, timedelta +import time + +class Phase: + def __init__(self,**_args): + self._df = _args['data'] + self.callback = _args['callback'] + def apply(self,**_args): + """ + :param data data-frame + :param _info arguments needed to be applied + :param callback callback function once done + """ + raise Exception ("Function needs to be Implemented") +class Pre(Phase): + pass +class Post(Phase): + def __init__(self,**_args): + super().__init__(**_args) + pass + +class Date(Post): + def __init__(self,**_args): + super().__init__(**_args) + def make(self,**_args): + """ + This function generates a random date given a year and optionally a set of days from the randomly generated date + :param year initial value of a year + :param offset list of days between initial date + """ + if _args['year'] in ['',None,np.nan] : + return None + year = int(_args['year']) + + offset = _args['offset'] if 'offset' in _args else 0 + month = np.random.randint(1,13) + if month == 2: + _end = 28 if year % 4 != 0 else 29 + else: + _end = 31 if month in [1,3,5,7,8,10,12] else 30 + day = np.random.randint(1,_end) + + #-- synthetic date + _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0) + FORMAT = '%Y-%m-%d' if 'format' not in _args else _args['format'] + + + + # print ([_name,FORMAT, _date.strftime(FORMAT)]) + r = [] + if offset : + r = [_date.strftime(FORMAT)] + for _delta in offset : + _date = _date + timedelta(_delta) + r.append(_date.strptime(FORMAT)) + return r + else: + return _date.strftime(FORMAT) + + def apply(self,**_args): + """ + + """ + pass +class Approximate(Post): + def apply(**_args): + pass + def applyWithRange(**_args): diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py new file mode 100644 index 0000000..b11be57 --- /dev/null +++ b/data/maker/prepare/__init__.py @@ -0,0 +1,284 @@ +""" +(c) 2018 - 2021, Vanderbilt University Medical Center +Steve L. 
Nyemba, steve.l.nyemba@vumc.org + +This file is designed to handle preconditions for a generative adversarial network: + - The file will read/get data from a source specified by transport (or data-frame) + - The class will convert the data to a binary vector + - The class will also help rebuild the data from a binary matrix. +Usage : + +""" +import transport +import json +import pandas as pd +import numpy as np +# import cupy as cp +import sys +import os +# +# The following is to address the issue over creating a large matrix ... +# +# from multiprocessing import Process, Queue + +# if 'GPU' in os.environ : +# import cupy as np +# else: +# import numpy as np +class void: + pass +class Hardware : + """ + This class is intended to allow the use of hardware i.e GPU, index or CPU + """ + pass + +class Input : + class NOVALUES : + RANDOM,IGNORE,ALWAYS = ['random','ignore','always'] + """ + This class is designed to read data from a source and and perform a variet of operations : + - provide a feature space, and rows (matrix profile) + - a data index map + """ + + def __init__(self,**_args): + """ + :param data + :param store data-store parameters/configuration + :param sql sql query that pulls a representative sample of the data + """ + self._schema = _args['schema'] if 'schema' in _args else {} + # + # schema data should be in a hash map for these purposes + # + # if self._schema : + # r = {} + # for _item in self._schema : + # r[_item['name']] = r[_item['type']] + # self._schema = r + + self.df = _args['data'] + if 'sql' not in _args : + self._initdata(**_args) + # + pass + else: + self._initsql(**_args) + # + # We need to have a means to map of values,columns and vector positions in order + # to perform convert and revert to and from binary + # + self._map = {} if 'map' not in _args else _args['map'] + + + def _initsql(self,**_args): + """ + This function will initialize the class on the basis of a data-store and optionally pre-defined columns to be used to be synthesized + :param store data-store configuration + :param columns list of columns to be + """ + + if 'columns' not in _args : + self._initcols(data=self.df) + else: + self._initcols(data=self.df,columns=_args['columns']) + + pass + def _init_map(self,values): + self._map = dict(zip(np.arange(len(values)),values)) + for key in self._map : + self._map[key] = self._map[key].tolist() + def _initcols (self,**_args) : + """ + This function will initialize the columns to be synthesized and/or determine which ones can be synthesized + :param data data-frame that holds the data (matrix) + :param columns optional columns to be synthesized + """ + # df = _args['data'].copy() + row_count = self.df.shape[0] + cols = None if 'columns' not in _args else _args['columns'] + self.columns = self.df.columns.tolist() + self._io = [] + + if 'columns' in _args : + self._columns = _args['columns'] + # else: + # + # We will look into the count and make a judgment call + try: + # _df = pd.DataFrame(self.df.apply(lambda col: col.dropna().unique().size )).T + # MIN_SPACE_SIZE = 2 + # self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() + # self._io = _df.to_dict(orient='records') + _df = pd.DataFrame(self.df.nunique().T / self.df.shape[0]).T + self._io = (_df.to_dict(orient='records')) + + except Exception as e: + print (e) + self._io = [] + def _initdata(self,**_args): + """ + This function will initialize the class with a data-frame and columns of interest (if any) + 
:param data data-frame that holds the data + :param columns columns that need to be synthesized if any + """ + self._initcols(**_args) + + def _convert(self,**_args): + """ + This function will convert a data-frame into a binary matrix and provide a map to be able to map the values back to the matrix + :param columns in case we specify the columns to account for (just in case the original assumptions don't hold) + """ + if 'columns' in _args or 'column' in _args : + columns = _args['columns'] if 'columns' in _args else [_args['column']] + else: + columns = self._columns + _df = self.df if 'data' not in _args else _args['data'] + # + # At this point we have the list of features we want to use + i = 0 + + _m = np.array([]) + _values = [] + for name in columns : + # + # In case we have dataset with incomplete value space, we should still be able to generate something meaningful + # + values = None if name not in self._map else list(self._map[name]['values']) + _type = self._schema[name] if name in self._schema else _df[name].dtype + cols, _matrix = self.tobinary(_df[name],values) + _beg,_end = i,i+len(cols) + if name not in self._map : + self._map[name] = {"beg":_beg,"end":_end ,"values":cols.tolist()} + i += len(cols) + if not _m.shape[0]: + _m = _matrix ; + else: + _m = np.concatenate((_m,_matrix),axis=1) + if values : + _values += list(values) + # + # @NOTE: + # The map should allow us to be able to convert or reconvert the binary matrix to whatever we want ... + # + # self._matrix = _m + + return _values,_m + + def _revert(self,**_args) : + """ + This function will take in a binary matrix and based on the map of values it will repopulate it with values + :param _matrix binary matrix + :param column|columns column name or columns if the column is specified + """ + _column = _args['column'] if 'column' in _args else None + + + matrix = _args['matrix'] + row_count = matrix.shape[0] + r = {} + for key in self._map : + if _column and key != _column : + continue + _item = self._map[key] + _beg = _item['beg'] + _end = _item['end'] + columns = np.array(_item['values']) + # + # @NOTE: We are accessing matrices in terms of [row,col], + # The beg,end variables are for the columns in the matrix (mini matrix) + # + # if not _column : + # _matrix = matrix[:,_beg:_end] #-- The understanding is that _end is not included + # else: + # _matrix = matrix + _matrix = matrix[:,_beg:_end] + # + # vectorize the matrix to replace the bits by their actual values (accounting for the data-types) + # @TODO: Find ways to do this on a GPU (for big data) or across threads + # + row_count = _matrix.shape[0] + # r[key] = [columns[np.where(row == 1) [0][0] ] for row in _matrix[:,_beg:_end]] + + r[key] = [columns[np.where(row==1)[0][0]] if np.where(row==1)[0].size > 0 else '' for row in _matrix] + # + # we should consider decoding the matrix if possible + # + + return pd.DataFrame(r) + + def tobinary(self,rows,cols=None) : + """ + This function will compile a binary matrix from a row of values this allows hopefully this can be done in parallel, this function can be vectorized and processed + :param rows np.array or list of vector of values + :param cols a space of values if it were to be different fromt he current sample. 
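# --- Illustrative aside ------------------------------------------------------
# _convert()/_revert() concatenate one one-hot block per column and keep a map
# of the form {column: {"beg": i, "end": j, "values": [...]}} so each block
# can be sliced back out of the full matrix. A toy example of reverting one
# column with such a map:
import numpy as np

_map = {'gender': {'beg': 0, 'end': 2, 'values': ['F', 'M']},
        'race':   {'beg': 2, 'end': 5, 'values': ['A', 'B', 'C']}}
matrix = np.array([[1, 0, 0, 0, 1],
                   [0, 1, 1, 0, 0]])

def revert_column(matrix, _map, column):
    item = _map[column]
    block = matrix[:, item['beg']:item['end']]
    values = np.array(item['values'])
    return [str(values[np.where(row == 1)[0][0]]) if np.where(row == 1)[0].size > 0 else ''
            for row in block]

print(revert_column(matrix, _map, 'gender'))          # ['F', 'M']
print(revert_column(matrix, _map, 'race'))            # ['C', 'A']
# -----------------------------------------------------------------------------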
+ """ + if not cols: + # + # In the advent the sample rows do NOT have the values of the + cols = rows.unique() + cols = np.array(cols) + row_count = np.int64(len(rows)) + # if 'GPU' not in os.environ : + # _matrix = np.zeros([row_count,cols.size],dtype=int) + # + # @NOTE: For some reason, there is an out of memory error created here, this seems to fix it (go figure) + # + _matrix = np.array([np.repeat(0,cols.size) for i in range(0,row_count)]) + + [np.put(_matrix[i], np.where(cols == rows[i]) ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0] + # else: + # _matrix = cp.zeros([row_count,cols.size]) + # [cp.put(_matrix[i], cp.where(cols == rows[i]),1)for i in cp.arange(row_count) ] + # _matrix = _matrix.asnumpy() + + + return cols,_matrix + def convert(self,**_args): + if 'columns' in _args or 'column' in _args : + columns = _args['columns'] if 'columns' in _args else [_args['column']] + else: + columns = self._columns + _df = self.df if 'data' not in _args else _args['data'] + _values,_matrix = self.encode(_df,columns) + _, _matrix = self.tobinary(_matrix) + self._init_map(_values) + return _values,_matrix #-- matrix has been updated ! + def revert(self,**_args): + # _columns = _args['column'] if 'column' in _args else None + _matrix = _args['matrix'] + # print (_matrix) + return self.decode(_matrix,columns=self._columns) + pass + def encode(self,df,columns) : + _df = df[columns].drop_duplicates() + _values = _df.values.tolist() + _encoded = df[columns].apply(lambda row: _values.index( list(row)) ,axis=1) + return np.array(_values),_encoded + def decode (self,_matrix,**_args): + # + # _matrix binary matrix + # + + columns = _args['columns'] + _values = np.array( list(self._map.values())) + _matrix = pd.DataFrame(_matrix) #if type(_matrix) != pd.DataFrame else _matrix + # x = _matrix.apply(lambda row: _values[row.values == 1 ].tolist()[0] if row.values.sum() > 0 else None, axis=1).tolist() + #@TODO: Provide random values for things that are missing + + # x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.repeat(None,len(self._columns)) ,axis=1).tolist() + # + # @TODO: Provide a parameter to either: + # - missing = {outlier,random,none} + # - outlier: select an outlier, random: randomly select a value, none: do nothing ... 
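# --- Illustrative aside ------------------------------------------------------
# tobinary()/decode() move a categorical column into a one-hot matrix and
# back, using the column's value space as the matrix columns. A compact round
# trip built by hand, mirroring the np.put / np.where pattern above:
import numpy as np
import pandas as pd

values = pd.Series(['F', 'M', 'F', 'F', 'M'])
cols = np.array(values.unique())                     # value space -> matrix columns
matrix = np.zeros((values.size, cols.size), dtype=int)
for i, v in enumerate(values):
    matrix[i, np.where(cols == v)[0][0]] = 1         # one bit per row

decoded = [str(cols[np.where(row == 1)[0][0]]) if np.where(row == 1)[0].size > 0 else ''
           for row in matrix]
print(matrix)
print(decoded)                                        # ['F', 'M', 'F', 'F', 'M']
# -----------------------------------------------------------------------------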
+ # + if np.random.choice([0,1],1)[0] : + novalues = _values[np.random.choice( len(_values),1)[0]].tolist() + else: + novalues = np.repeat(None,len(self._columns)) + x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else novalues ,axis=1).tolist() + return pd.DataFrame(x,columns=columns) + + + diff --git a/data/maker/prepare/__main__.py b/data/maker/prepare/__main__.py new file mode 120000 index 0000000..93f5256 --- /dev/null +++ b/data/maker/prepare/__main__.py @@ -0,0 +1 @@ +__init__.py \ No newline at end of file diff --git a/data/maker/state/__init__.py b/data/maker/state/__init__.py new file mode 100644 index 0000000..adf9837 --- /dev/null +++ b/data/maker/state/__init__.py @@ -0,0 +1,105 @@ +""" +This file handles state-space of the data training/generation process i.e Upon specification of the pre/post conditiions +""" +""" +This file handles state-space of the data training/generation process i.e Upon specification of the pre/post conditions, +The specifications for this are as follows (within an entry of the configuration) + { + "state":{ + "pre":[{"approximate":{"field":"int"}},{"newdate":{"field":"format"}}],"post":[{"limit":10}] + } + } +""" +import importlib +import importlib.util +import sys +from datetime import datetime +from data.maker.state.default import * +import os + + +class State : + @staticmethod + def apply(_data,lpointers): + """ + This function applies a pipeline against a given data-frame, the calling code must decide whether it is a pre/post + :_data data-frame + :_lpointers functions modules returned by instance (module,_args) + """ + for _item in lpointers : + if _item is None : + continue + + pointer = _item['module'] + _args = _item['args'] + + _data = pointer(_data,_args) + return _data + @staticmethod + def instance(_args): + pre = [] + post=[] + + out = {} + for key in _args : + # + # If the item has a path property is should be ignored + path = _args[key]['path'] if 'path' in _args[key] else '' + out[key] = [ State._build(dict(_item,**{'path':path})) if 'path' not in _item else State._build(_item) for _item in _args[key]['pipeline']] + + return out + # if 'pre' in _args: + # path = _args['pre']['path'] if 'path' in _args['pre'] else '' + + # pre = [ State._build(dict(_item,**{'path':path})) for _item in _args['pre']['pipeline']] + # else: + # path = _args['post']['path'] if 'path' in _args['post'] else '' + + # post = [ State._build(dict(_item,**{'path':path})) for _item in _args['post']['pipeline']] + # return {'pre':pre,'post':post} + + @staticmethod + def _extract(_entry): + + _name = list(set(_entry.keys()) - set(['path']) ) + _name = _name[0] + path = _entry['path'] if 'path' in _entry and os.path.exists(_entry['path']) else '' + return {"module": _name,"args": _entry[_name],'name':_name,'path':path} + pass + @staticmethod + def _build(_args): + + _info = State._extract(_args) + # _info = dict(_args,**_info) + + _info['module'] = State._instance(_info) + return _info if _info['module'] is not None else None + + @staticmethod + def _instance(_args): + """ + :path optional path of the file on disk + :module name of the function + """ + + _name = _args['module'] + + if 'path' in _args and os.path.exists(_args['path']): + path= _args['path'] + + spec = importlib.util.spec_from_file_location(_name, path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + else: + # + # Probably calling a built-in module (should be in this file) + + module = 
sys.modules['data.maker.state.default'] + + return getattr(module,_name) if hasattr(module,_name) else None + +# +# Adding a few custom functions that should be able to help .... +# These functions can be called without specifying a path +# + diff --git a/data/maker/state/default.py b/data/maker/state/default.py new file mode 100644 index 0000000..75c2c4b --- /dev/null +++ b/data/maker/state/default.py @@ -0,0 +1,116 @@ +""" +This file contains default functions applied to a data-frame/dataset as pre/post processing jobs. +The functions are organized in a pipeline i.e the data will be applied to each function + +Custom functions : + functions must tak 2 arguments (_data,_args) : where _data is a data frame and _arg is a object describing the input parameters +""" +import pandas as pd +import numpy as np +from datetime import datetime, timedelta + + +def limit(_data,size): + """ + ...,{limit:size} + """ + + # size = int(_args['limit']) + return _data.iloc[:size] +def format(_data,_schema): + """ + This function enforces a schema against a data-frame, this may or may not work depending on the persistence storage + :_data data-frame containing all data + :_args schema to enforce the data, we are expecting the format as a list of {name,type,description} + """ + return _data + +def approximate(_data,_args): + """ + :_args Object of {field:type} + This function will approximate n-fields in the data given it's distribution + """ + _m = {'int':int,'float':float,'integer':int,'double':float} + columns = list(_args.keys()) + for _name in columns : + if _name not in _data : + continue + otype = _args[_name] + otype = str if otype not in _m else _m[otype] + _data.loc[:,_name] = np.random.uniform(_data[_name].values).astype(otype) + + return _data +def split_date(_data,_args): + """ + This function takes a field and applies the format from other fields + :_data data-frame + :_config configuration entry {column:{format,column:format,type}} + """ + _columns = list(_args.keys()) + _m = {'int':int,'float':float,'integer':int,'double':float} + for _name in _columns : + _iname = _args[_name]['column'] + _iformat = _args[_name]['format']['in'] + _oformat = _args[_name]['format']['out'] + _otype = str if 'type' not in _args[_name] else _args[_name]['type'] + _data.loc[:,_name] = _data[_iname].apply(lambda _date: datetime.strftime(datetime.strptime(str(_date),_iformat),_oformat)).astype(_otype) + return _data +def newdate(_data,_args): + """ + This function creates a new data on a given column from another + :_data data frame + :_args configuration column:{format,column} + """ + _columns = list(_args.keys()) + for _name in _columns : + + format = _args[_name]['format'] + ROW_COUNT = _data[_name].size + if 'column' in _args[_name] : + srcName = _args[_name]['column'] + years = _data[srcName].values + else: + years = np.random.choice(np.arange(datetime.now().year- 90,datetime.now().year),ROW_COUNT) + _data.loc[:,_name] = [ _makedate(year = years[_index],format = format) for _index in np.arange(ROW_COUNT)] + + return _data +def _makedate(**_args): + """ + This function creates a new date and applies it to a column + :_data data-frame with columns + :_args arguments for col1:format + """ + _columns = list(_args.keys()) + + # if _args['year'] in ['',None,np.nan] : + # year = np.random.choice(np.arange(1920,222),1) + # else: + # year = int(_args['year']) + year = int(_args['year']) + offset = _args['offset'] if 'offset' in _args else 0 + month = np.random.randint(1,13) + if month == 2: + _end = 28 if year % 4 != 0 
else 29 + else: + _end = 31 if month in [1,3,5,7,8,10,12] else 30 + day = np.random.randint(1,_end) + + #-- synthetic date + _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0) + FORMAT = '%Y-%m-%d' + + if 'format' in _args: + FORMAT = _args['format'] + + + # print ([_name,FORMAT, _date.strftime(FORMAT)]) + r = [] + if offset : + r = [_date.strftime(FORMAT)] + for _delta in offset : + _date = _date + timedelta(_delta) + r.append(_date.strptime(FORMAT)) + return r + else: + return _date.strftime(FORMAT) + diff --git a/data/params.py b/data/params.py index 999b919..f2c3536 100644 --- a/data/params.py +++ b/data/params.py @@ -1,6 +1,6 @@ import sys -SYS_ARGS = {'context':''} +SYS_ARGS = {} if len(sys.argv) > 1: N = len(sys.argv) @@ -9,8 +9,10 @@ if len(sys.argv) > 1: if sys.argv[i].startswith('--'): key = sys.argv[i][2:] #.replace('-','') SYS_ARGS[key] = 1 - if i + 1 < N: + if i + 1 < N and not sys.argv[i + 1].startswith('--'): value = sys.argv[i + 1] = sys.argv[i+1].strip() + else: + value = None if key and value: SYS_ARGS[key] = value diff --git a/drive/pipeline.py b/drive/pipeline.py new file mode 100644 index 0000000..04658da --- /dev/null +++ b/drive/pipeline.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python3 +import json +from transport import factory +import numpy as np +import os +from multiprocessing import Process +import pandas as pd +from google.oauth2 import service_account +import data.maker + +from data.params import SYS_ARGS + +# +# The configuration array is now loaded and we will execute the pipe line as follows +DATASET='combined20190510' + +class Components : + + @staticmethod + def get(args): + """ + This function returns a data-frame provided a bigquery sql statement with conditions (and limits for testing purposes) + The function must be wrapped around a lambda this makes testing easier and changing data stores transparent to the rest of the code. 
(Vital when testing) + :sql basic sql statement + :condition optional condition and filters + """ + SQL = args['sql'] + if 'condition' in args : + condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')']) + SQL = " ".join([SQL,'WHERE',condition]) + + SQL = SQL.replace(':dataset',args['dataset']) #+ " LIMIT 1000 " + if 'limit' in args : + SQL = SQL + 'LIMIT ' + args['limit'] + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() + return df + + # return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna() + @staticmethod + def split(X,MAX_ROWS=3,PART_SIZE=3): + + return list(pd.cut( np.arange(X.shape[0]+1),PART_SIZE).categories) + + def train(self,**args): + """ + This function will perform training on the basis of a given pointer that reads data + + """ + # + # @TODO: we need to log something here about the parameters being passed + pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) + df = pointer() + + # + # Now we can parse the arguments and submit the entire thing to training + # + + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + log_folder = args['logs'] if 'logs' in args else 'logs' + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + + MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 + PART_SIZE = args['part_size'] if 'part_size' in args else 0 + + if df.shape[0] > MAX_ROWS and 'partition' not in args: + lbound = 0 + bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) + # bounds = Components.split(df,MAX_ROWS,PART_SIZE) + + qwriter = factory.instance(type='queue.QueueWriter',args={'queue':'aou.io'}) + + for b in bounds : + part_index = bounds.index(b) + ubound = int(b.right) + + + _data = df.iloc[lbound:ubound][args['columns']] + lbound = ubound + + # _args['logs'] = os.sep.join([log_folder,str(part_index)]) + _args['partition'] = str(part_index) + _args['logger'] = {'args':{'dbname':'aou','doc':args['context']},'type':'mongo.MongoWriter'} + # + # We should post the the partitions to a queue server (at least the instructions on ): + # - where to get the data + # - and athe arguments to use (partition #,columns,gpu,epochs) + # + info = {"rows":_data.shape[0],"cols":_data.shape[1], "paritition":part_index,"logs":_args['logs']} + p = {"args":_args,"data":_data.to_dict(orient="records"),"info":info} + qwriter.write(p) + # + # @TODO: + # - Notify that information was just posted to the queue + info['max_rows'] = MAX_ROWS + info['part_size'] = PART_SIZE + logger.write({"module":"train","action":"setup-partition","input":info}) + + pass + else: + partition = args['partition'] if 'partition' in args else '' + log_folder = os.sep.join([log_folder,args['context'],partition]) + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + 
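# --- Illustrative aside ------------------------------------------------------
# The training branch above splits a large frame into PART_SIZE row ranges
# with pd.cut and walks the resulting interval bounds to slice the partitions
# that get posted to the queue. The slicing logic in isolation:
import numpy as np
import pandas as pd

def partition(df, part_size):
    bounds = list(pd.cut(np.arange(df.shape[0] + 1), part_size).categories)
    lbound, parts = 0, []
    for b in bounds:
        ubound = int(b.right)
        parts.append(df.iloc[lbound:ubound])
        lbound = ubound
    return parts

frame = pd.DataFrame({'person_id': np.arange(10), 'gender': ['F', 'M'] * 5})
for p in partition(frame, 3):
    print(p.shape[0], 'rows')                # 3, 3, 4 rows
# -----------------------------------------------------------------------------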
os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0'
+
+            _args['data'] = df
+            #
+            # @log :
+            # Logging information about the training process for this partition (or not)
+            #
+            info = {"rows":df.shape[0],"cols":df.shape[1], "partition":partition,"logs":_args['logs']}
+            logger.write({"module":"train","action":"train","input":info})
+            data.maker.train(**_args)
+
+        pass
+
+    # @staticmethod
+    def generate(self,args):
+        """
+        This function will generate data and store it to a given data store
+        """
+        logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
+        log_folder = args['logs'] if 'logs' in args else 'logs'
+        partition = args['partition'] if 'partition' in args else ''
+        log_folder = os.sep.join([log_folder,args['context'],partition])
+        _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger}
+        _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs'])
+        _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1
+        os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0'
+        _args['no_value']= args['no_value']
+        MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0
+        PART_SIZE = args['part_size'] if 'part_size' in args else 0
+
+        # credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
+        # _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna()
+        reader = args['reader']
+        df = reader()
+        if 'partition' in args :
+            bounds = Components.split(df,MAX_ROWS,PART_SIZE)
+            # bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories)
+            lbound = int(bounds[int(partition)].left)
+            ubound = int(bounds[int(partition)].right)
+            df = df.iloc[lbound:ubound]
+        _args['data'] = df
+        # _args['data'] = reader()
+        #_args['data'] = _args['data'].astype(object)
+        _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1
+        _dc = data.maker.generate(**_args)
+        #
+        # We need to post the generated data in order to :
+        # 1. compare immediately
+        # 2.
synthetic copy + # + + cols = _dc.columns.tolist() + + data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) + base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) + + for name in cols : + _args['data'][name] = _dc[name] + info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} + if partition != '' : + info['partition'] = partition + logger.write(info) + # filename = os.sep.join([log_folder,'output',name+'.csv']) + # data_comp[[name]].to_csv(filename,index=False) + + # + #-- Let us store all of this into bigquery + prefix = args['notify']+'.'+_args['context'] + table = '_'.join([prefix,partition,'io']).replace('__','_') + folder = os.sep.join([args['logs'],args['context'],partition,'output']) + if 'file' in args : + + _fname = os.sep.join([folder,table.replace('_io','_full_io.csv')]) + _pname = os.sep.join([folder,table])+'.csv' + data_comp.to_csv( _pname,index=False) + _args['data'].to_csv(_fname,index=False) + + + else: + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + _pname = os.sep.join([folder,table+'.csv']) + _fname = table.replace('_io','_full_io') + data_comp.to_gbq(if_exists='replace',destination_table=_pname,credentials='credentials',chunk_size=50000) + data_comp.to_csv(_pname,index=False) + INSERT_FLAG = 'replace' if 'partition' not in args else 'append' + _args['data'].to_gbq(if_exists=INSERT_FLAG,destination_table=_fname,credentials='credentials',chunk_size=50000) + + info = {"full":{"path":_fname,"rows":_args['data'].shape[0]},"compare":{"name":_pname,"rows":data_comp.shape[0]} } + if partition : + info ['partition'] = partition + logger.write({"module":"generate","action":"write","info":info} ) + @staticmethod + def callback(channel,method,header,stream): + + info = json.loads(stream) + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':SYS_ARGS['context']}) + + logger.write({'module':'process','action':'read-partition','input':info['info']}) + df = pd.DataFrame(info['data']) + args = info['args'] + if int(args['num_gpu']) > 1 and args['gpu'] > 0: + args['gpu'] = args['gpu'] + args['num_gpu'] + args['reader'] = lambda: df + # + # @TODO: Fix + # There is an inconsistency in column/columns ... fix this shit! + # + args['columns'] = args['column'] + (Components()).train(**args) + logger.write({"module":"process","action":"exit","info":info["info"]}) + channel.close() + channel.connection.close() + pass + +if __name__ == '__main__' : + filename = SYS_ARGS['config'] if 'config' in SYS_ARGS else 'config.json' + f = open (filename) + PIPELINE = json.loads(f.read()) + f.close() + index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else 0 + + args = (PIPELINE[index]) + args['dataset'] = 'combined20190510' + args = dict(args,**SYS_ARGS) + args['max_rows'] = int(args['max_rows']) if 'max_rows' in args else 3 + args['part_size']= int(args['part_size']) if 'part_size' in args else 3 + + # + # @TODO: + # Log what was initiated so we have context of this processing ... 
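+    # Illustrative invocations (hypothetical config file name; each --flag is parsed into SYS_ARGS by data/params.py):
+    #   python drive/pipeline.py --config config.json --index 0 --generate
+    #   python drive/pipeline.py --config config.json --listen --jobs 2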
+ # + if 'listen' not in SYS_ARGS : + if 'file' in args : + reader = lambda: pd.read_csv(args['file']) ; + else: + reader = lambda: Components().get(args) + args['reader'] = reader + + if 'generate' in SYS_ARGS : + # + # Let us see if we have partitions given the log folder + + content = os.listdir( os.sep.join([args['logs'],args['context']])) + generator = Components() + if ''.join(content).isnumeric() : + # + # we have partitions we are working with + + for id in ''.join(content) : + args['partition'] = id + + generator.generate(args) + else: + generator.generate(args) + # Components.generate(args) + elif 'listen' in args : + # + # This will start a worker just in case to listen to a queue + if 'read' in SYS_ARGS : + QUEUE_TYPE = 'queue.QueueReader' + pointer = lambda qreader: qreader.read(1) + else: + QUEUE_TYPE = 'queue.QueueListener' + pointer = lambda qlistener: qlistener.listen() + N = int(SYS_ARGS['jobs']) if 'jobs' in SYS_ARGS else 1 + + qhandlers = [factory.instance(type=QUEUE_TYPE,args={'queue':'aou.io'}) for i in np.arange(N)] + jobs = [] + for qhandler in qhandlers : + qhandler.callback = Components.callback + job = Process(target=pointer,args=(qhandler,)) + job.start() + jobs.append(job) + # + # let us wait for the jobs + print (["Started ",len(jobs)," trainers"]) + while len(jobs) > 0 : + + jobs = [job for job in jobs if job.is_alive()] + + # pointer(qhandler) + + + # qreader.read(1) + pass + else: + + trainer = Components() + trainer.train(**args) + # Components.train(**args) +#for args in PIPELINE : + #args['dataset'] = 'combined20190510' + #process = Process(target=Components.train,args=(args,)) + #process.name = args['context'] + #process.start() +# Components.train(args) diff --git a/pipeline.py b/pipeline.py new file mode 100644 index 0000000..9d095d9 --- /dev/null +++ b/pipeline.py @@ -0,0 +1,692 @@ +#!/usr/bin/env python3 +import json +from transport import factory +import numpy as np +import time +import os +from multiprocessing import Process, Lock +import pandas as pd +from google.oauth2 import service_account +from google.cloud import bigquery as bq +import data.maker +import copy +from data.params import SYS_ARGS + +# +# The configuration array is now loaded and we will execute the pipe line as follows + +class Components : + lock = Lock() + class KEYS : + PIPELINE_KEY = 'pipeline' + SQL_FILTER = 'filter' + @staticmethod + def get_filter (**args): + if args['qualifier'] == 'IN' : + return ' '.join([args['field'],args['qualifier'],'(',args['value'],')']) + else: + return ' '.join([args['field'],args['qualifier'],args['value']]) + @staticmethod + def get_logger(**args) : + return factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + @staticmethod + def get(args): + """ + This function returns a data-frame provided a bigquery sql statement with conditions (and limits for testing purposes) + The function must be wrapped around a lambda this makes testing easier and changing data stores transparent to the rest of the code. 
(Vital when testing) + :sql basic sql statement + :condition optional condition and filters + """ + SQL = args['sql'] + if Components.KEYS.SQL_FILTER in args : + FILTER_KEY = Components.KEYS.SQL_FILTER + SQL_FILTER = args[FILTER_KEY] if type(args[FILTER_KEY]) == list else [args[FILTER_KEY]] + # condition = ' '.join([args[FILTER_KEY]['field'],args[FILTER_KEY]['qualifier'],'(',args[FILTER_KEY]['value'],')']) + + condition = ' AND '.join([Components.get_filter(**item) for item in SQL_FILTER]) + SQL = " ".join([SQL,'WHERE',condition]) + + SQL = SQL.replace(':dataset',args['dataset']) #+ " LI " + + if 'limit' in args : + SQL = SQL + ' LIMIT ' + args['limit'] + # + # let's log the sql query that has been performed here + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + logger.write({"module":"bigquery","action":"read","input":{"sql":SQL}}) + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + df = pd.read_gbq(SQL,credentials=credentials,dialect='standard') + return df + + # return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna() + @staticmethod + def split(X,MAX_ROWS=3,PART_SIZE=3): + + return list(pd.cut( np.arange(X.shape[0]+1),PART_SIZE).categories) + def format_schema(self,schema): + _schema = {} + for _item in schema : + _type = int + _value = 0 + if _item.field_type == 'FLOAT' : + _type =float + elif _item.field_type != 'INTEGER' : + _type = str + _value = '' + _schema[_item.name] = _type + return _schema + def get_ignore(self,**_args) : + if 'columns' in _args and 'data' in _args : + _df = _args['data'] + terms = _args['columns'] + return [name for name in _df.columns if np.sum( [int(field in name )for field in terms ]) ] + + return [] + def set_gpu(self,**_args) : + if 'gpu' in _args : + gpu = _args['gpu'] if type(_args['gpu']) != str else [_args['gpu']] + _index = str(gpu[0]) + os.environ['CUDA_VISIBLE_DEVICES'] = _index + return gpu + else : + return None + def train(self,**args): + """ + This function will perform training on the basis of a given pointer that reads data + + """ + schema = None + if 'file' in args : + + df = pd.read_csv(args['file']) + del args['file'] + elif 'data' not in args : + + reader = factory.instance(**args['store']['source']) + + + if 'row_limit' in args : + df = reader.read(sql=args['sql'],limit=args['row_limit']) + else: + df = reader.read(sql=args['sql']) + schema = reader.meta(table=args['from']) if hasattr(reader,'meta') and 'from' in args else None + else: + df = args['data'] + + # + # + # df = df.fillna('') + if schema : + _schema = [] + for _item in schema : + _type = int + _value = 0 + if _item.field_type == 'FLOAT' : + _type =float + elif _item.field_type != 'INTEGER' : + _type = str + _value = '' + _schema += [{"name":_item.name,"type":_item.field_type}] + df[_item.name] = df[_item.name].fillna(_value).astype(_type) + args['schema'] = _schema + # df[_item.name] = df[_item.name].astype(_type) + _args = copy.deepcopy(args) + # _args['store'] = args['store']['source'] + _args['data'] = df + # + # The columns that are continuous should also be skipped because they don't need to be synthesied (like-that) + if 'continuous' in args : + x_cols = args['continuous'] + else: + x_cols = [] + + if 'ignore' in args and 'columns' in args['ignore'] : + _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) + _args['data'] = df[ list(set(df.columns)- set(_cols))] + # + # We need to make sure 
that continuous columns are removed + if x_cols : + _args['data'] = _args['data'][list(set(_args['data'].columns) - set(x_cols))] + if 'gpu' in args : + _args['gpu'] = self.set_gpu(gpu=args['gpu']) + if 'partition' in args : + _args['partition'] = args['partition'] + if df.shape[0] and df.shape[0] : + # + # We have a full blown matrix to be processed + print ('-- Training --') + data.maker.train(**_args) + else: + print ("... skipping training !!") + + if 'autopilot' in ( list(args.keys())) : + + args['data'] = df + print (['autopilot mode enabled ....',args['context']]) + self.generate(args) + + pass + + def approximate(self,values): + """ + :param values array of values to be approximated + """ + if values.dtype in [int,float] : + # + # @TODO: create bins? + r = np.random.dirichlet(values+.001) #-- dirichlet doesn't work on values with zeros + _sd = values[values > 0].std() + _me = values[values > 0].mean() + _mi = values.min() + x = [] + _type = values.dtype + for index in np.arange(values.size) : + + if np.random.choice([0,1],1)[0] : + value = values[index] + (values[index] * r[index]) + + else : + value = values[index] - (values[index] * r[index]) + # + # randomly shifting the measurements + if np.random.choice([0,1],1)[0] and _me > _sd : + if np.random.choice([0,1],1)[0] : + value = value * np.divide(_me,_sd) + else: + value = value + (np.divide(_me,_sd)) + value = int(value) if _type == int else np.round(value,2) + x.append( value) + np.random.shuffle(x) + return np.array(x) + else: + return values + pass + + def shuffle(self,_args): + if 'data' in args : + df = data['data'] + else: + reader = factory.instance(**args['store']['source']) + if 'file' in args : + df = pd.read_csv(args['file']) + elif 'data' in _args : + df = _args['data'] + else: + if 'row_limit' in args and 'sql' in args: + df = reader.read(sql=args['sql'],limit=args['row_limit']) + else: + df = reader.read(sql=args['sql']) + schema = None + if 'schema' not in args and hasattr(reader,'meta') and 'file' not in args: + schema = reader.meta(table=args['from']) + schema = [{"name":_item.name,"type":_item.field_type} for _item in schema] + # + # We are shufling designated colmns and will be approximating the others + # + x_cols = [] #-- coumns tobe approximated. + _cols = [] #-- columns to be ignored + if 'continuous' in args : + x_cols = args['continuous'] + if 'ignore' in args and 'columns' in args['ignore'] : + _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) + + columns = args['columns'] if 'columns' in args else df.columns + columns = list(set(columns) - set(_cols)) + for name in columns: + i = np.arange(df.shape[0]) + np.random.shuffle(i) + if name in x_cols : + if df[name].unique().size > 0 : + df[name] = self.approximate(df.iloc[i][name].fillna(0).values) + # df[name] = df[name].astype(str) + # pass + + df.index = np.arange(df.shape[0]) + self.post(data=df,schema=schema,store=args['store']['target']) + def post(self,**_args) : + table = _args['from'] if 'from' in _args else _args['store']['table'] + _schema = _args['schema'] if 'schema' in _args else None + writer = factory.instance(**_args['store']) + _df = _args['data'] + if _schema : + columns = [] + for _item in _schema : + name = _item['name'] + _type = str + _value = 0 + if _item['type'] in ['DATE','TIMESTAMP','DATETIMESTAMP','DATETIME'] : + if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : + # + # There is an issue with missing dates that needs to be resolved. 
+ # for some reason a missing date/time here will cause the types to turn into timestamp (problem) + # The following is a hack to address the issue (alas) assuming 10 digit dates and 'NaT' replaces missing date values (pandas specifications) + # + _df[name] = _df[name].apply(lambda value: None if str(value) == 'NaT' else (str(value)[:10]) if _item['type'] in ['DATE','DATETIME'] else str(value)) + #_df[name] = _df[name].dt.date + # _df[name] = pd.to_datetime(_df[name].fillna(''),errors='coerce') + else: + pass + _df[name] = pd.to_datetime(_df[name]) + else: + value = 0 + if _item['type'] == 'INTEGER' : + _type = np.int64 + elif _item['type'] in ['FLOAT','NUMERIC']: + _type = np.float64 + else: + + _value = '' + _df[name] = _df[name].fillna(_value) #.astype(_type) + columns.append(name) + + fields = _df.columns.tolist() + if not writer.has(table=table) and _args['store']['provider'] != 'bigquery': + + _map = {'STRING':'VARCHAR(256)','INTEGER':'BIGINT'} if 'provider' in _args['store'] and _args['store']['provider'] != 'bigquery' else {} + _params = {'map':_map,'table':args['from']} + if _schema : + _params['schema'] = _schema + + else: + _params['fields'] = fields + + writer.make(**_params) + + fields = _df.columns.tolist() + _df = _df[fields] + # writer.fields = fields + if _args['store']['provider'] == 'bigquery' : + print (['_______ POSTING ______________ ',table]) + print (['_______________ ',_df.shape[0],' ___________________']) + writer.write(_df.astype(object),schema=_schema,table=table) + else: + writer.table = table + writer.write(_df) + # else: + # writer.write(_df,table=args['from']) + + + def finalize(self,args): + """ + This function performs post-processing opertions on a synthetic table i.e : + - remove duplicate keys + - remove orphaned keys i.e + """ + reader = factory.instance(**args['store']['source']) + logger = factory.instance(**args['store']['logs']) + + target = args['store']['target']['args']['dataset'] + source = args['store']['source']['args']['dataset'] + table = args['from'] + schema = reader.meta(table=args['from']) + # + # keys : + unique_field = "_".join([args['from'],'id']) if 'unique_fields' not in args else args['unique_fields'] + fields = [ item.name if item.name != unique_field else "y."+item.name for item in schema] + SQL = [ + "SELECT :fields FROM ", + "(SELECT ROW_NUMBER() OVER() AS row_number,* FROM :target.:table) x","INNER JOIN", + "(SELECT ROW_NUMBER() OVER() AS row_number, :unique_field FROM :source.:table ORDER BY RAND()) y", + "ON y.row_number = x.row_number" + ] + SQL = " ".join(SQL).replace(":fields",",".join(fields)).replace(":table",table).replace(":source",source).replace(":target",target) + SQL = SQL.replace(":unique_field",unique_field) + # + # Use a native job to get this done ... + # + client = bq.Client.from_service_account_json(args['store']['source']['args']["private_key"]) + job = bq.QueryJobConfig() + job.destination = client.dataset(target).table(table) + job.use_query_cache = True + job.allow_large_results = True + # job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY) + job.write_disposition = "WRITE_TRUNCATE" + job.priority = 'BATCH' + r = client.query(SQL,location='US',job_config=job) + logger.write({"job":r.job_id,"action":"finalize", "args":{"sql":SQL,"source":"".join([source,table]),"destimation":".".join([target,table])}}) + # + # Keep a log of what just happened... 
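+        # The entry written above records the BigQuery job id along with the source and
+        # destination tables, so the finalize step can be audited after the fact.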
+ # + otable = ".".join([args['store']['source']['args']['dataset'],args['from']]) + dtable = ".".join([args['store']['target']['args']['dataset'],args['from']]) + def generate(self,args): + """ + This function will generate data and store it to a given, + """ + store = args['store']['logs'] + if 'args' in store : + store['args']['doc'] = args['context'] + else: + store['doc'] = args['context'] + logger = factory.instance(**store) #type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + + ostore = args['store']['target'] + writer = factory.instance(**ostore) + + schema = args['schema'] if 'schema' in args else None + if 'data' in args : + + df = args['data'] + else: + + reader = factory.instance(**args['store']['source']) + if 'row_limit' in args : + df = reader.read(sql=args['sql'],limit=args['row_limit']) + else: + df = reader.read(sql=args['sql']) + if 'schema' not in args and hasattr(reader,'meta'): + schema = reader.meta(table=args['from']) + schema = [{"name":_item.name,"type":_item.field_type} for _item in schema] + + # else: + # # + # # This will account for autopilot mode ... + # df = args['data'] + _cast = {} + if schema : + + for _item in schema : + dtype = str + name = _item['name'] + novalue = 0 + if _item['type'] in ['INTEGER','NUMERIC']: + dtype = np.int64 + + elif _item['type'] == 'FLOAT' : + dtype = np.float64 + else: + novalue = '' + # _cast[schema['name']] = dtype + df[name] = df[name].fillna(novalue).astype(dtype) + + _info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[1]},"schema":schema} + logger.write(_info) + + + _dc = pd.DataFrame() + # for mdf in df : + args['data'] = df.copy() + # + # The columns that are continuous should also be skipped because they don't need to be synthesied (like-that) + if 'continuous' in args : + x_cols = args['continuous'] + else: + x_cols = [] + + if 'ignore' in args and 'columns' in args['ignore'] : + _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) + args['data'] = args['data'][ list(set(df.columns)- set(_cols))] + # + # We need to remove the continuous columns from the data-frame + # @TODO: Abstract this !! 
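+        # Continuous features are set aside in real_df below and re-injected later through
+        # approximate(), instead of being synthesized by the GAN.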
+ # + real_df = pd.DataFrame() + if x_cols : + args['data'] = args['data'][list(set(args['data'].columns) - set(x_cols))] + real_df = df[x_cols].copy() + + args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) + if 'gpu' in args : + args['gpu'] = self.set_gpu(gpu=args['gpu']) + # if 'partition' in args : + # args['logs'] = os.sep.join([args['logs'],str(args['partition'])]) + + _info = {"module":"gan-prep","action":"prune","shape":{"rows":args['data'].shape[0],"columns":args['data'].shape[1]}} + logger.write(_info) + if args['data'].shape[0] > 0 and args['data'].shape[1] > 0 : + candidates = (data.maker.generate(**args)) + + else: + candidates = [df] + + # if 'sql.BQWriter' in ostore['type'] : + _columns = None + skip_columns = [] + _schema = schema + if schema : + cols = [_item['name'] for _item in _schema] + else: + cols = df.columns.tolist() + _info = {"module":"gan-prep","action":"selection","input":{"candidates":len(candidates),"features":cols}} + logger.write(_info) + for _df in candidates : + # + # we need to format the fields here to make sure we have something cohesive + # + + if not skip_columns : + if 'ignore' in args and 'columns' in args['ignore'] : + skip_columns = self.get_ignore(data=_df,columns=args['ignore']['columns']) + # + # We perform a series of set operations to insure that the following conditions are met: + # - the synthetic dataset only has fields that need to be synthesized + # - The original dataset has all the fields except those that need to be synthesized + # + + _df = _df[list(set(_df.columns) - set(skip_columns))].copy() + if x_cols : + _approx = {} + for _col in x_cols : + if real_df[_col].unique().size > 0 : + + + _df[_col] = self.approximate(real_df[_col].values) + _approx[_col] = { + "io":{"min":_df[_col].min().astype(float),"max":_df[_col].max().astype(float),"mean":_df[_col].mean().astype(float),"sd":_df[_col].values.std().astype(float),"missing": _df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":_df[_col].where(_df[_col] == 0).dropna().count().astype(float)}, + "real":{"min":real_df[_col].min().astype(float),"max":real_df[_col].max().astype(float),"mean":real_df[_col].mean().astype(float),"sd":real_df[_col].values.std().astype(float),"missing": real_df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count().astype(float)} + } + else: + _df[_col] = -1 + logger.write({"module":"gan-generate","action":"approximate","status":_approx}) + if set(df.columns) & set(_df.columns) : + _columns = list(set(df.columns) - set(_df.columns)) + df = df[_columns] + + # + # Let us merge the dataset here and and have a comprehensive dataset + + _df = pd.DataFrame.join(df,_df) + _params = {'data':_df,'store' : ostore,'from':args['from']} + if _schema : + _params ['schema'] = _schema + _info = {"module":"gan-prep","action":"write","input":{"rows":_df.shape[0],"cols":_df.shape[1]}} + logger.write(_info) + self.post(**_params) + # print (['_______ posting _________________',_df.shape]) + break + + + pass + # else: + # pass + def bind(self,**_args): + print (_args) + + +if __name__ == '__main__' : + filename = SYS_ARGS['config'] if 'config' in SYS_ARGS else 'config.json' + f = open (filename) + _config = json.loads(f.read()) + f.close() + PIPELINE = _config['pipeline'] + index = SYS_ARGS['index'] + if index.isnumeric() : + index = int(SYS_ARGS['index']) + else: + # + # The index provided is a key to a pipeline entry mainly the context + # + N = len(PIPELINE) + f = 
[i for i in range(0,N) if PIPELINE[i]['context'] == index] + index = f[0] if f else 0 + # + + print ("..::: ",PIPELINE[index]['context'],':::..') + args = (PIPELINE[index]) + for key in _config : + if key == 'pipeline' or key in args: + # + # skip in case of pipeline or if key exists in the selected pipeline (provided by index) + # + continue + args[key] = _config[key] + + args = dict(args,**SYS_ARGS) + if 'matrix_size' in args : + args['matrix_size'] = int(args['matrix_size']) + if 'batch_size' not in args : + args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size']) + if 'dataset' not in args : + args['dataset'] = 'combined20191004v2_deid' + args['logs'] = args['logs'] if 'logs' in args else 'logs' + PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 + # + # @TODO: + # Log what was initiated so we have context of this processing ... + # + + GPU_CHIPS = args['gpu'] if 'gpu' in args else None + if GPU_CHIPS and type(GPU_CHIPS) != list : + GPU_CHIPS = [int(_id.strip()) for _id in GPU_CHIPS.split(',')] if type(GPU_CHIPS) == str else [GPU_CHIPS] + if 'gpu' in SYS_ARGS : + args['gpu'] = GPU_CHIPS + jobs = [] + if 'generate' in SYS_ARGS : + # + # Let us see if we have partitions given the log folder + + content = os.listdir( os.sep.join([args['logs'],'train',args['context']])) + if 'all-chips' in SYS_ARGS and GPU_CHIPS: + index = 0 + jobs = [] + for _gpu in GPU_CHIPS : + _args = copy.deepcopy(args) + _args['gpu'] = [int(_gpu)] + _args['partition'] = int(_gpu) #index + index += 1 + make = lambda _params: (Components()).generate(_params) + job = Process(target=make,args=( dict(_args),)) + job.name = 'Trainer # ' + str(index) + job.start() + jobs.append(job) + pass + else: + + generator = Components() + generator.generate(args) + elif 'bind' in SYS_ARGS : + import binder + _args = _config['_map'] + _args['store'] = copy.deepcopy(_config['store']) + if 'init' in SYS_ARGS : + # + # Creating and persisting the map ... + print (['.... 
Binding Initialization']) + # jobs = binder.Init(**_args) + _mapped = binder.Init(**_args) + + + _schema = [{"name":_name,"type":"INTEGER"} for _name in _mapped.columns.tolist()] + publisher = lambda _params: (Components()).post(**_params) + _args = {'data':_mapped,'store':_config['store']['target']} + _args['store']['table'] = '_map' + if _args['store']['provider'] =='bigquery' : + _args['schema'] = _schema + + job = Process (target = publisher,args=(_args,)) + job.start() + jobs = [job] + else: + # + # Applying the map of k on a particular dataset + # + index = int(SYS_ARGS['index']) + _args['config'] = _config['pipeline'][index] + _args['original_key'] = 'person_id' if 'original_key' in _config else 'person_id' + table = _config['pipeline'][index]['from'] + _df = binder.ApplyOn(**_args) + _df = np.array_split(_df,PART_SIZE) + jobs = [] + print (['Publishing ',PART_SIZE,' PARTITION']) + for data in _df : + publisher = lambda _params: ( Components() ).post(**_params) + _args = {'data':data,'store':_config['store']['target']} + _args['store']['table'] = table + print (_args['store']) + job = Process(target = publisher,args=(_args,)) + job.name = "Publisher "+str(len(jobs)+1) + job.start() + jobs.append(job) + + elif 'shuffle' in SYS_ARGS : + index = 0 + if GPU_CHIPS and 'all-chips' in SYS_ARGS: + + for index in GPU_CHIPS : + publisher = lambda _params: ( Components() ).shuffle(_params) + job = Process (target = publisher,args=( args,)) + job.name = 'Shuffler #' + str(index) + job.start() + jobs.append(job) + else: + shuffler = Components() + shuffler.shuffle(args) + pass + elif 'train' in SYS_ARGS: + + # DATA = np.array_split(DATA,PART_SIZE) + # + # Let us create n-jobs across n-gpus, The assumption here is the data that is produced will be a partition + # @TODO: Find better name for partition + # + + if GPU_CHIPS and 'all-chips' in SYS_ARGS: + index = 0 + print (['... launching ',len(GPU_CHIPS),' jobs',args['context']]) + for _gpu in GPU_CHIPS : + _args = copy.deepcopy(args) + _args['gpu'] = [int(_gpu)] + _args['partition'] = int(_gpu) #index + index += 1 + make = lambda _params: (Components()).train(**_params) + job = Process(target=make,args=( _args,)) + job.name = 'Trainer # ' + str(index) + job.start() + jobs.append(job) + + + + + else: + # + # The choice of the chip will be made internally + + agent = Components() + agent.train(**args) + # + # If we have any obs we should wait till they finish + # + DIRTY = 0 + if (len(jobs)) : + print (['.... 
waiting on ',len(jobs),' jobs'])
+    while len(jobs)> 0 :
+        DIRTY =1
+        jobs = [job for job in jobs if job.is_alive()]
+        time.sleep(2)
+    if DIRTY:
+        print (["..:: jobs finished "])
+    #
+    # We need to harmonize the keys, if any; we do this for the shuffle or generate operations.
+    # This holds true for bigquery - bigquery only
+    IS_BIGQUERY = _config['store']['source']['provider'] == _config['store']['target']['provider'] and _config['store']['source']['provider'] == 'bigquery'
+
+    # if 'bind' not in SYS_ARGS and IS_BIGQUERY and ('autopilot' in SYS_ARGS or 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS)) :
+    #     #
+    #     # We should pull all the primary keys and regenerate them in order to ensure some form of consistency
+    #     #
+
+    #     #
+    #     #
+
+    #     print (["..:: Finalizing process"])
+    #     (Components()).finalize(args)
diff --git a/setup.py b/setup.py
index db4029b..8ad1b09 100644
--- a/setup.py
+++ b/setup.py
@@ -4,9 +4,10 @@ import sys
 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 
-args = {"name":"data-maker","version":"1.0.5","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
+args = {"name":"data-maker","version":"1.6.4",
+    "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT",
 "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
-args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
+args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow']
 args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git'
 
 if sys.version_info[0] == 2 :