From dcc55eb1fbab75f32f8953d9b150dfe8fd567448 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 10 Jan 2020 13:12:58 -0600 Subject: [PATCH 001/250] bug fixes --- data/gan.py | 25 ++++++++++++++++++++----- data/maker/__init__.py | 39 +++++++++++++++++++++++---------------- data/maker/__main__.py | 33 +++++++++++++++++++++++++-------- 3 files changed, 68 insertions(+), 29 deletions(-) diff --git a/data/gan.py b/data/gan.py index 43d15ae..46ecb18 100644 --- a/data/gan.py +++ b/data/gan.py @@ -1,8 +1,23 @@ """ -usage : - optional : - --num_gpu number of gpus to use will default to 1 - --epoch steps per epoch default to 256 +This code was originally writen by Ziqi Zhang in order to generate synthetic data. +The code is an implementation of a Generative Adversarial Network that uses the Wasserstein Distance (WGAN). +It is intended to be used in 2 modes (embedded in code or using CLI) + +USAGE : + +The following parameters should be provided in a configuration file (JSON format) +python data/maker --config + +CONFIGURATION FILE STRUCTURE : + + context what it is you are loading (stroke, hypertension, ...) + data path of the file to be loaded + logs folder to store training model and meta data about learning + max_epochs number of iterations in learning + num_gpu number of gpus to be used (will still run if the GPUs are not available) + +EMBEDDED IN CODE : + """ import tensorflow as tf from tensorflow.contrib.layers import l2_regularizer @@ -426,7 +441,7 @@ class Train (GNet): print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) # print (dir (w_distance)) - logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) + logs.append({"epoch":epoch,"distance":-w_sum }) if epoch % self.MAX_EPOCHS == 0: # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] diff --git a/data/maker/__init__.py b/data/maker/__init__.py index f97e5f3..e0ca55d 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -24,21 +24,25 @@ def train (**args) : column = args['column'] column_id = args['id'] - df = args['data'] - logs = args['logs'] - real = pd.get_dummies(df[column]).astype(np.float32).values - labels = pd.get_dummies(df[column_id]).astype(np.float32).values - num_gpu = 1 if 'num_gpu' not in args else args['num_gpu'] - max_epochs = 10 if 'max_epochs' not in args else args['max_epochs'] + df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) + # logs = args['logs'] + # real = pd.get_dummies(df[column]).astype(np.float32).values + # labels = pd.get_dummies(df[column_id]).astype(np.float32).values + args['real'] = pd.get_dummies(df[column]).astype(np.float32).values + args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values + # num_gpu = 1 if 'num_gpu' not in args else args['num_gpu'] + # max_epochs = 10 if 'max_epochs' not in args else args['max_epochs'] context = args['context'] + if 'store' in args : args['store']['args']['doc'] = context logger = factory.instance(**args['store']) + args['logger'] = logger else: logger = None - - trainer = gan.Train(context=context,max_epochs=max_epochs,num_gpu=num_gpu,real=real,label=labels,column=column,column_id=column_id,logger = logger,logs=logs) + trainer = gan.Train(**args) + # trainer = gan.Train(context=context,max_epochs=max_epochs,num_gpu=num_gpu,real=real,label=labels,column=column,column_id=column_id,logger = logger,logs=logs) return trainer.apply() def generate(**args): @@ -51,14 +55,14 @@ def 
generate(**args): :id column identifying an entity :logs location on disk where the learnt knowledge of the dataset is """ - df = args['data'] - + # df = args['data'] + df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) column = args['column'] column_id = args['id'] - logs = args['logs'] - context = args['context'] - num_gpu = 1 if 'num_gpu' not in args else args['num_gpu'] - max_epochs = 10 if 'max_epochs' not in args else args['max_epochs'] + # logs = args['logs'] + # context = args['context'] + # num_gpu = 1 if 'num_gpu' not in args else args['num_gpu'] + # max_epochs = 10 if 'max_epochs' not in args else args['max_epochs'] # #@TODO: @@ -69,8 +73,11 @@ def generate(**args): values = df[column].unique().tolist() values.sort() - labels = pd.get_dummies(df[column_id]).astype(np.float32).values - handler = gan.Predict (context=context,label=labels,max_epochs=max_epochs,num_gpu=num_gpu,values=values,column=column,logs=logs) + # labels = pd.get_dummies(df[column_id]).astype(np.float32).values + args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values + args['values'] = values + # handler = gan.Predict (context=context,label=labels,max_epochs=max_epochs,num_gpu=num_gpu,values=values,column=column,logs=logs) + handler = gan.Predict (**args) handler.load_meta(column) r = handler.apply() _df = df.copy() diff --git a/data/maker/__main__.py b/data/maker/__main__.py index e77bf0a..56defec 100644 --- a/data/maker/__main__.py +++ b/data/maker/__main__.py @@ -1,10 +1,27 @@ import pandas as pd import data.maker - -df = pd.read_csv('sample.csv') -column = 'gender' -id = 'id' -context = 'demo' -store = {"type":"mongo.MongoWriter","args":{"host":"localhost:27017","dbname":"GAN"}} -max_epochs = 11 -data.maker.train(store=store,max_epochs=max_epochs,context=context,data=df,column=column,id=id,logs='foo') \ No newline at end of file +from data.params import SYS_ARGS +import json +from scipy.stats import wasserstein_distance as wd +import risk +import numpy as np +if 'config' in SYS_ARGS : + ARGS = json.loads(open(SYS_ARGS['config']).read()) + if 'generate' not in SYS_ARGS : + data.maker.train(**ARGS) + else: + # + # + _df = data.maker.generate(**ARGS) + odf = pd.read_csv (ARGS['data']) + odf.columns = [name.lower() for name in odf.columns] + column = [ARGS['column'] ] #+ ARGS['id'] + print (column) + print (_df[column].risk.evaluate()) + print (odf[column].risk.evaluate()) + _x = pd.get_dummies(_df[column]).values + y = pd.get_dummies(odf[column]).values + N = _df.shape[0] + print (np.mean([ wd(_x[i],y[i])for i in range(0,N)])) + # column = SYS_ARGS['column'] + # odf = open(SYS_ARGS['data']) \ No newline at end of file From 63a7f1a968293ad5a4e70d71b8be62ba1f97c9ea Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 10 Jan 2020 13:16:11 -0600 Subject: [PATCH 002/250] version # update --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index db4029b..5f800d9 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.0.5","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.0.6","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} 
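A minimal sketch of the JSON configuration consumed by the CLI mode (python data/maker --config <path>), assuming the keys documented at the top of data/gan.py plus the 'column' and 'id' fields that data/maker/__main__.py forwards to train()/generate(); every value below is illustrative only:

import json

# 'context', 'data', 'logs', 'max_epochs' and 'num_gpu' are the keys documented in data/gan.py;
# 'column' and 'id' name the attribute to synthesize and the entity identifier.
config = {
    "context": "demo",        # label of what is being synthesized (stroke, hypertension, ...)
    "data": "sample.csv",     # path of the CSV file to be loaded
    "logs": "logs",           # folder for training checkpoints and meta data
    "max_epochs": 10,         # number of learning iterations
    "num_gpu": 1,             # number of GPUs to use (training still runs if none are available)
    "column": "gender",       # attribute to synthesize
    "id": "id"                # column identifying an entity
}
with open("config.json", "w") as f:
    json.dump(config, f, indent=2)
# then: python data/maker --config config.json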
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' From 31ca5886f0f6c53b77c4a6e001aee8a995cd7f78 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 11 Feb 2020 12:00:16 -0600 Subject: [PATCH 003/250] not sure about the changes (oops) --- data/gan.py | 99 ++++++++++++++++++++++++++++++++---------- data/maker/__init__.py | 89 ++++++++++++++++++++----------------- data/maker/__main__.py | 18 ++++---- 3 files changed, 137 insertions(+), 69 deletions(-) diff --git a/data/gan.py b/data/gan.py index 46ecb18..3f22740 100644 --- a/data/gan.py +++ b/data/gan.py @@ -43,6 +43,10 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' class void : pass class GNet : + def log(self,**args): + self.logs = dict(args,**self.logs) + + """ This is the base class of a generative network functions, the details will be implemented in the subclasses. An instance of this class is accessed as follows @@ -52,7 +56,7 @@ class GNet : def __init__(self,**args): self.layers = void() self.layers.normalize = self.normalize - + self.logs = {} self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] @@ -95,6 +99,15 @@ class GNet : self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + if self.logger : + # + # We will clear the logs from the data-store + # + column = self.ATTRIBUTES['synthetic'] + db = self.logger.db + if db[column].count() > 0 : + db.backup.insert({'name':column,'logs':list(db[column].find()) }) + db[column].drop() def load_meta(self,column): """ @@ -114,7 +127,9 @@ class GNet : def log_meta(self,**args) : + _object = { + '_id':'meta', 'CONTEXT':self.CONTEXT, 'ATTRIBUTES':self.ATTRIBUTES, 'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU, @@ -314,6 +329,11 @@ class Train (GNet): # print ([" *** ",self.BATCHSIZE_PER_GPU]) self.meta = self.log_meta() + if(self.logger): + + self.logger.write( row=self.meta ) + + self.log (real_shape=list(self._REAL.shape),label_shape = list(self._LABEL.shape),meta_data=self.meta) def load_meta(self, column): """ This function will delegate the calls to load meta data to it's dependents @@ -350,11 +370,14 @@ class Train (GNet): if stage == 'D': w, loss = self.discriminator.loss(real=real, fake=fake, label=label) #losses = tf.get_collection('dlosses', scope) + flag = 'dlosses' losses = tf.compat.v1.get_collection('dlosses', scope) else: w, loss = self.generator.loss(fake=fake, label=label) #losses = tf.get_collection('glosses', scope) + flag = 'glosses' losses = tf.compat.v1.get_collection('glosses', scope) + # losses = tf.compat.v1.get_collection(flag, scope) total_loss = tf.add_n(losses, name='total_loss') @@ -369,7 +392,8 @@ class Train (GNet): dataset = dataset.repeat(10000) dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) dataset = dataset.prefetch(1) - iterator = dataset.make_initializable_iterator() + # iterator = dataset.make_initializable_iterator() + iterator = tf.compat.v1.data.make_initializable_iterator(dataset) # next_element = iterator.get_next() # init_op = iterator.initializer return iterator, features_placeholder, labels_placeholder @@ -405,7 +429,10 @@ class Train (GNet): def apply(self,**args): # max_epochs = args['max_epochs'] if 'max_epochs' in args else 10 REAL = self._REAL - LABEL= self._LABEL + LABEL= self._LABEL + if (self.logger): + pass + with tf.device('/cpu:0'): opt_d = 
tf.compat.v1.train.AdamOptimizer(1e-4) opt_g = tf.compat.v1.train.AdamOptimizer(1e-4) @@ -441,7 +468,7 @@ class Train (GNet): print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) # print (dir (w_distance)) - logs.append({"epoch":epoch,"distance":-w_sum }) + logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) if epoch % self.MAX_EPOCHS == 0: # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] @@ -452,9 +479,14 @@ class Train (GNet): # # if self.logger : - row = {"logs":logs} #,"model":pickle.dump(sess)} - + row = {"logs":logs} #,"model":pickle.dump(sess)} self.logger.write(row=row) + # + # @TODO: + # We should upload the files in the checkpoint + # This would allow the learnt model to be portable to another system + # + tf.compat.v1.reset_default_graph() class Predict(GNet): """ @@ -479,38 +511,61 @@ class Predict(GNet): ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) - fake = self.generator.network(inputs=z, label=label) + fake = self.generator.network(inputs=z, label=label) init = tf.compat.v1.global_variables_initializer() - saver = tf.compat.v1.train.Saver() + saver = tf.compat.v1.train.Saver() + df = pd.DataFrame() + CANDIDATE_COUNT = 1000 + NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0] with tf.compat.v1.Session() as sess: # sess.run(init) saver.restore(sess, model_dir) labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) + found = [] labels= demo - f = sess.run(fake,feed_dict={y:labels}) - # - # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes - # - - df = ( pd.DataFrame(np.round(f).astype(np.int32))) + for i in np.arange(CANDIDATE_COUNT) : + + f = sess.run(fake,feed_dict={y:labels}) + # + # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes + # The code below will insure we have some acceptable cardinal relationships between id and synthetic values + # + df = ( pd.DataFrame(np.round(f).astype(np.int32))) + p = 0 not in df.sum(axis=1).values + + if p: + found.append(df) + if len(found) == NTH_VALID_CANDIDATE or i == CANDIDATE_COUNT: + break + else: + continue + # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms # df = (i * df).sum(axis=1) # # In case we are dealing with actual values like diagnosis codes we can perform # + df = found[np.random.choice(np.arange(len(found)),1)[0]] columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] - r = np.zeros((self.ROW_COUNT,len(columns))) - for col in df : - i = np.where(df[col])[0] - r[i] = col - - df = pd.DataFrame(r,columns=columns) + # r = np.zeros((self.ROW_COUNT,len(columns))) + r = np.zeros(self.ROW_COUNT) + df.columns = self.values + if len(found): + print (len(found),NTH_VALID_CANDIDATE) + # x = df * self.values + + df = pd.DataFrame( df.apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) + df.columns = columns + + - df[df.columns] = (df.apply(lambda value: self.values[ int(value)],axis=1)) - return df.to_dict(orient='lists') + + tf.compat.v1.reset_default_graph() + + return df.to_dict(orient='list') # return df.to_dict(orient='list') # count = str(len(os.listdir(self.out_dir))) # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv']) diff --git a/data/maker/__init__.py 
b/data/maker/__init__.py index e0ca55d..f1a9537 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -12,6 +12,7 @@ import pandas as pd import numpy as np import data.gan as gan from transport import factory +import threading as thread def train (**args) : """ This function is intended to train the GAN in order to learn about the distribution of the features @@ -21,30 +22,42 @@ def train (**args) : :data data-frame to be synthesized :context label of what we are synthesizing """ - column = args['column'] + column = args['column'] if (isinstance(args['column'],list)) else [args['column']] column_id = args['id'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) - # logs = args['logs'] - # real = pd.get_dummies(df[column]).astype(np.float32).values - # labels = pd.get_dummies(df[column_id]).astype(np.float32).values - args['real'] = pd.get_dummies(df[column]).astype(np.float32).values - args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values - # num_gpu = 1 if 'num_gpu' not in args else args['num_gpu'] - # max_epochs = 10 if 'max_epochs' not in args else args['max_epochs'] - context = args['context'] - - if 'store' in args : - args['store']['args']['doc'] = context - logger = factory.instance(**args['store']) - args['logger'] = logger - - else: - logger = None - trainer = gan.Train(**args) - # trainer = gan.Train(context=context,max_epochs=max_epochs,num_gpu=num_gpu,real=real,label=labels,column=column,column_id=column_id,logger = logger,logs=logs) - return trainer.apply() + df.columns = [name.lower() for name in df.columns] + # + # If we have several columns we will proceed one at a time (it could be done in separate threads) + # @TODO : Consider performing this task on several threads/GPUs simulataneously + # + args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values + for col in column : + args['real'] = pd.get_dummies(df[col]).astype(np.float32).values + args['column'] = col + args['context'] = col + context = args['context'] + if 'store' in args : + args['store']['args']['doc'] = context + logger = factory.instance(**args['store']) + args['logger'] = logger + + else: + logger = None + trainer = gan.Train(**args) + trainer.apply() +def post(**args): + """ + This uploads the tensorflow checkpoint to a data-store (mongodb, biguqery, s3) + + """ + pass +def get(**): + """ + This function will restore a checkpoint from a persistant storage on to disk + """ + pass def generate(**args): """ This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset @@ -57,29 +70,27 @@ def generate(**args): """ # df = args['data'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) - column = args['column'] + + column = args['column'] if (isinstance(args['column'],list)) else [args['column']] column_id = args['id'] - # logs = args['logs'] - # context = args['context'] - # num_gpu = 1 if 'num_gpu' not in args else args['num_gpu'] - # max_epochs = 10 if 'max_epochs' not in args else args['max_epochs'] - # #@TODO: # If the identifier is not present, we should fine a way to determine or make one # - #ocolumns= list(set(df.columns.tolist())- set(columns)) - - values = df[column].unique().tolist() - values.sort() - - # labels = pd.get_dummies(df[column_id]).astype(np.float32).values args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values - args['values'] = values - # handler = gan.Predict 
(context=context,label=labels,max_epochs=max_epochs,num_gpu=num_gpu,values=values,column=column,logs=logs) - handler = gan.Predict (**args) - handler.load_meta(column) - r = handler.apply() - _df = df.copy() - _df[column] = r[column] + _df = df.copy() + for col in column : + args['context'] = col + args['column'] = col + values = df[col].unique().tolist() + # values.sort() + args['values'] = values + # + # we can determine the cardinalities here so we know what to allow or disallow + handler = gan.Predict (**args) + handler.load_meta(col) + r = handler.apply() + # print (r) + _df[col] = r[col] + # break return _df \ No newline at end of file diff --git a/data/maker/__main__.py b/data/maker/__main__.py index 56defec..63b464b 100644 --- a/data/maker/__main__.py +++ b/data/maker/__main__.py @@ -15,13 +15,15 @@ if 'config' in SYS_ARGS : _df = data.maker.generate(**ARGS) odf = pd.read_csv (ARGS['data']) odf.columns = [name.lower() for name in odf.columns] - column = [ARGS['column'] ] #+ ARGS['id'] - print (column) - print (_df[column].risk.evaluate()) - print (odf[column].risk.evaluate()) - _x = pd.get_dummies(_df[column]).values - y = pd.get_dummies(odf[column]).values - N = _df.shape[0] - print (np.mean([ wd(_x[i],y[i])for i in range(0,N)])) + column = ARGS['column'] if isinstance(ARGS['column'],list) else [ARGS['column']] + print(pd.merge(odf,_df, on='id')) + # print (_df[column].risk.evaluate(flag='synth')) + # print (odf[column].risk.evaluate(flag='original')) + # _x = pd.get_dummies(_df[column]).values + # y = pd.get_dummies(odf[column]).values + # N = _df.shape[0] + # print (np.mean([ wd(_x[i],y[i])for i in range(0,N)])) + # print (wd(_x[0],y[0]) ) + # column = SYS_ARGS['column'] # odf = open(SYS_ARGS['data']) \ No newline at end of file From 37ba836a7ea50e6e5334c8fe6a5f52eeb7ca27f9 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 Feb 2020 12:41:01 -0600 Subject: [PATCH 004/250] bug fix ... need to design porting/loading models on the fly --- data/maker/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index f1a9537..2becbe2 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -53,7 +53,7 @@ def post(**args): """ pass -def get(**): +def get(**,args): """ This function will restore a checkpoint from a persistant storage on to disk """ diff --git a/setup.py b/setup.py index 5f800d9..2fea026 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.0.6","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.0.7","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' From 6c12cf0b2a561f88539128fe1e2f1ec5500c52b1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 Feb 2020 12:43:30 -0600 Subject: [PATCH 005/250] bug fix ... 
need to design porting/loading models on the fly --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 2becbe2..12abc8d 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -53,7 +53,7 @@ def post(**args): """ pass -def get(**,args): +def get(**args): """ This function will restore a checkpoint from a persistant storage on to disk """ From 725e32b160ff1788b447a92131b91a0a263e70fd Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 Feb 2020 13:46:20 -0600 Subject: [PATCH 006/250] bug fix ... need to design porting/loading models on the fly --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2fea026..8034249 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.0.7","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.0.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' From 383d7b7e64989d1b900a2b5a90931313f1942e87 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 Feb 2020 13:49:05 -0600 Subject: [PATCH 007/250] bug fix ... need to design porting/loading models on the fly --- data/gan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/gan.py b/data/gan.py index 3f22740..439b52a 100644 --- a/data/gan.py +++ b/data/gan.py @@ -331,7 +331,7 @@ class Train (GNet): self.meta = self.log_meta() if(self.logger): - self.logger.write( row=self.meta ) + self.logger.write( self.meta ) self.log (real_shape=list(self._REAL.shape),label_shape = list(self._LABEL.shape),meta_data=self.meta) def load_meta(self, column): @@ -480,7 +480,7 @@ class Train (GNet): # if self.logger : row = {"logs":logs} #,"model":pickle.dump(sess)} - self.logger.write(row=row) + self.logger.write(row) # # @TODO: # We should upload the files in the checkpoint From 4024e508a82cce6473849dd2ca7c44722560fd7f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 Feb 2020 13:57:28 -0600 Subject: [PATCH 008/250] bug fix ... 
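A sketch of the "EMBEDDED IN CODE" usage mentioned at the top of data/gan.py, assuming the values from the pre-patch data/maker/__main__.py and the 'store' handling in data/maker/__init__.py; the 'store' document is what transport.factory.instance() turns into the logger whose writes these commits adjust, and all values are illustrative:

import pandas as pd
import data.maker

df = pd.read_csv('sample.csv')
# Optional data-store; factory.instance(**store) becomes the logger that receives
# the training meta data and per-epoch logs. Omit 'store' to train without logging.
store = {"type": "mongo.MongoWriter",
         "args": {"host": "localhost:27017", "dbname": "GAN"}}
data.maker.train(store=store, max_epochs=10, context='demo',
                 data=df, column='gender', id='id', logs='logs')
# Generation reuses the checkpoint written under logs/train/<context>.
_df = data.maker.generate(data=df, column='gender', id='id',
                          context='demo', max_epochs=10, logs='logs')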
--- data/gan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index 439b52a..e54daa8 100644 --- a/data/gan.py +++ b/data/gan.py @@ -129,7 +129,7 @@ class GNet : def log_meta(self,**args) : _object = { - '_id':'meta', + # '_id':'meta', 'CONTEXT':self.CONTEXT, 'ATTRIBUTES':self.ATTRIBUTES, 'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU, From ce55848cc8d8fa06aad95ce8f75274ae968e657d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 13 Feb 2020 17:30:56 -0600 Subject: [PATCH 009/250] bug fix with dimensions @TODO: GPU workload --- data/gan.py | 1185 ++++++++++++++++++++++++++------------------------- setup.py | 2 +- 2 files changed, 598 insertions(+), 589 deletions(-) diff --git a/data/gan.py b/data/gan.py index e54daa8..367d63c 100644 --- a/data/gan.py +++ b/data/gan.py @@ -10,11 +10,11 @@ python data/maker --config CONFIGURATION FILE STRUCTURE : - context what it is you are loading (stroke, hypertension, ...) - data path of the file to be loaded - logs folder to store training model and meta data about learning - max_epochs number of iterations in learning - num_gpu number of gpus to be used (will still run if the GPUs are not available) + context what it is you are loading (stroke, hypertension, ...) + data path of the file to be loaded + logs folder to store training model and meta data about learning + max_epochs number of iterations in learning + num_gpu number of gpus to be used (will still run if the GPUs are not available) EMBEDDED IN CODE : @@ -35,619 +35,628 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ['CUDA_VISIBLE_DEVICES'] = "0" os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' -# STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256 -# NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu']) +# STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256 +# NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu']) # BATCHSIZE_PER_GPU = 2000 -# TOTAL_BATCHSIZE = BATCHSIZE_PER_GPU * NUM_GPUS +# TOTAL_BATCHSIZE = BATCHSIZE_PER_GPU * NUM_GPUS class void : - pass + pass class GNet : - def log(self,**args): - self.logs = dict(args,**self.logs) - - - """ - This is the base class of a generative network functions, the details will be implemented in the subclasses. 
- An instance of this class is accessed as follows - object.layers.normalize applies batch normalization or otherwise - obect.get.variables instanciate variables on cpu and return a reference (tensor) - """ - def __init__(self,**args): - self.layers = void() - self.layers.normalize = self.normalize - self.logs = {} - - self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] - - - self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854 - self.G_STRUCTURE = [128,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE] - self.D_STRUCTURE = [self.X_SPACE_SIZE,256,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE*2, self.X_SPACE_SIZE] #-- change 854 to number of diagnosis - # self.NUM_LABELS = 8 if 'label' not in args elif len(args['label'].shape) args['label'].shape[1] - if 'label' in args and len(args['label'].shape) == 2 : - self.NUM_LABELS = args['label'].shape[1] - elif 'label' in args and len(args['label']) == 1 : - self.NUM_LABELS = args['label'].shape[0] - else: - self.NUM_LABELS = 8 - self.Z_DIM = 128 #self.X_SPACE_SIZE - self.BATCHSIZE_PER_GPU = args['real'].shape[0] if 'real' in args else 256 - self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS - self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) - self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) - self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100 - self.CONTEXT = args['context'] - self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} - self._REAL = args['real'] if 'real' in args else None - self._LABEL = args['label'] if 'label' in args else None - - self.get = void() - self.get.variables = self._variable_on_cpu - self.get.suffix = lambda : "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - self.logger = args['logger'] if 'logger' in args and args['logger'] else None - self.init_logs(**args) - - def init_logs(self,**args): - self.log_dir = args['logs'] if 'logs' in args else 'logs' - self.mkdir(self.log_dir) - # - # - for key in ['train','output'] : - self.mkdir(os.sep.join([self.log_dir,key])) - self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT])) - - self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) - self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) - if self.logger : - # - # We will clear the logs from the data-store - # - column = self.ATTRIBUTES['synthetic'] - db = self.logger.db - if db[column].count() > 0 : - db.backup.insert({'name':column,'logs':list(db[column].find()) }) - db[column].drop() - - def load_meta(self,column): + def log(self,**args): + self.logs = dict(args,**self.logs) + + """ - This function is designed to accomodate the uses of the sub-classes outside of a strict dependency model. - Because prediction and training can happen independently + This is the base class of a generative network functions, the details will be implemented in the subclasses. 
+ An instance of this class is accessed as follows + object.layers.normalize applies batch normalization or otherwise + obect.get.variables instanciate variables on cpu and return a reference (tensor) """ - # suffix = "-".join(column) if isinstance(column,list)else column - suffix = self.get.suffix() - _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json']) - if os.path.exists(_name) : - attr = json.loads((open(_name)).read()) - for key in attr : - value = attr[key] - setattr(self,key,value) - self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) - self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + def __init__(self,**args): + self.layers = void() + self.layers.normalize = self.normalize + self.logs = {} + + self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] + if self.NUM_GPUS > 1 : + os.environ['CUDA_VISIBLE_DEVICES'] = "4" + + self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854 + self.G_STRUCTURE = [128,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE] + self.D_STRUCTURE = [self.X_SPACE_SIZE,256,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE*2, self.X_SPACE_SIZE] #-- change 854 to number of diagnosis + # self.NUM_LABELS = 8 if 'label' not in args elif len(args['label'].shape) args['label'].shape[1] + + if 'label' in args and len(args['label'].shape) == 2 : + self.NUM_LABELS = args['label'].shape[1] + elif 'label' in args and len(args['label']) == 1 : + self.NUM_LABELS = args['label'].shape[0] + else: + self.NUM_LABELS = 8 + # self.Z_DIM = 128 #self.X_SPACE_SIZE + self.Z_DIM = 128 #-- used as rows down stream + self.G_STRUCTURE = [self.Z_DIM,self.Z_DIM] + if 'real' in args : + self.D_STRUCTURE = [args['real'].shape[1],256,self.Z_DIM] + + self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) if 'real' in args else 256 + self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS + self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) + self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) + self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100 + self.CONTEXT = args['context'] + self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} + self._REAL = args['real'] if 'real' in args else None + self._LABEL = args['label'] if 'label' in args else None + + self.get = void() + self.get.variables = self._variable_on_cpu + self.get.suffix = lambda : "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + self.logger = args['logger'] if 'logger' in args and args['logger'] else None + self.init_logs(**args) + + def init_logs(self,**args): + self.log_dir = args['logs'] if 'logs' in args else 'logs' + self.mkdir(self.log_dir) + # + # + for key in ['train','output'] : + self.mkdir(os.sep.join([self.log_dir,key])) + self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT])) + + self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) + self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + if self.logger : + # + # We will clear the logs from the data-store + # + column = self.ATTRIBUTES['synthetic'] + db = self.logger.db + if db[column].count() > 0 : + db.backup.insert({'name':column,'logs':list(db[column].find()) }) + db[column].drop() + + def load_meta(self,column): + """ + This function is designed to accomodate the uses of the sub-classes outside of a strict dependency model. 
+ Because prediction and training can happen independently + """ + # suffix = "-".join(column) if isinstance(column,list)else column + suffix = self.get.suffix() + _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json']) + if os.path.exists(_name) : + attr = json.loads((open(_name)).read()) + for key in attr : + value = attr[key] + setattr(self,key,value) + self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) + self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + + + def log_meta(self,**args) : + + _object = { + # '_id':'meta', + 'CONTEXT':self.CONTEXT, + 'ATTRIBUTES':self.ATTRIBUTES, + 'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU, + 'Z_DIM':self.Z_DIM, + "X_SPACE_SIZE":self.X_SPACE_SIZE, + "D_STRUCTURE":self.D_STRUCTURE, + "G_STRUCTURE":self.G_STRUCTURE, + "NUM_GPUS":self.NUM_GPUS, + "NUM_LABELS":self.NUM_LABELS, + "MAX_EPOCHS":self.MAX_EPOCHS, + "ROW_COUNT":self.ROW_COUNT + } + if args and 'key' in args and 'value' in args : + key = args['key'] + value= args['value'] + object[key] = value + # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column + suffix = self.get.suffix() + _name = os.sep.join([self.out_dir,'meta-'+suffix]) + + f = open(_name+'.json','w') + f.write(json.dumps(_object)) + return _object + def mkdir (self,path): + if not os.path.exists(path) : + os.mkdir(path) - - def log_meta(self,**args) : - - _object = { - # '_id':'meta', - 'CONTEXT':self.CONTEXT, - 'ATTRIBUTES':self.ATTRIBUTES, - 'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU, - 'Z_DIM':self.Z_DIM, - "X_SPACE_SIZE":self.X_SPACE_SIZE, - "D_STRUCTURE":self.D_STRUCTURE, - "G_STRUCTURE":self.G_STRUCTURE, - "NUM_GPUS":self.NUM_GPUS, - "NUM_LABELS":self.NUM_LABELS, - "MAX_EPOCHS":self.MAX_EPOCHS, - "ROW_COUNT":self.ROW_COUNT - } - if args and 'key' in args and 'value' in args : - key = args['key'] - value= args['value'] - object[key] = value - # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column - suffix = self.get.suffix() - _name = os.sep.join([self.out_dir,'meta-'+suffix]) - - f = open(_name+'.json','w') - f.write(json.dumps(_object)) - return _object - def mkdir (self,path): - if not os.path.exists(path) : - os.mkdir(path) - - - def normalize(self,**args): - """ - This function will perform a batch normalization on an network layer - inputs input layer of the neural network - name name of the scope the - labels labels (attributes not synthesized) by default None - n_labels number of labels default None - """ - inputs = args['inputs'] - name = args['name'] - labels = None if 'labels' not in args else args['labels'] - n_labels= None if 'n_labels' not in args else args['n_labels'] - shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing - mean, var = tf.nn.moments(inputs, shift, keep_dims=True) - shape = inputs.shape[1].value - offset_m = self.get.variables(shape=[n_labels,shape], name='offset'+name, - initializer=tf.zeros_initializer) - scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, - initializer=tf.ones_initializer) - - offset = tf.nn.embedding_lookup(offset_m, labels) - scale = tf.nn.embedding_lookup(scale_m, labels) - result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-8) - return result - - def _variable_on_cpu(self,**args): - """ - This function makes sure variables/tensors are not created on the GPU but rather on the CPU - """ - name = args['name'] - shape = args['shape'] - initializer=None if 'initializer' not in args else args['initializer'] - 
with tf.device('/cpu:0') : - cpu_var = tf.compat.v1.get_variable(name,shape,initializer= initializer) - return cpu_var - def average_gradients(self,tower_grads): - average_grads = [] - for grad_and_vars in zip(*tower_grads): - grads = [] - for g, _ in grad_and_vars: - expanded_g = tf.expand_dims(g, 0) - grads.append(expanded_g) - - grad = tf.concat(axis=0, values=grads) - grad = tf.reduce_mean(grad, 0) - - v = grad_and_vars[0][1] - grad_and_var = (grad, v) - average_grads.append(grad_and_var) - return average_grads + def normalize(self,**args): + """ + This function will perform a batch normalization on an network layer + inputs input layer of the neural network + name name of the scope the + labels labels (attributes not synthesized) by default None + n_labels number of labels default None + """ + inputs = args['inputs'] + name = args['name'] + labels = None if 'labels' not in args else args['labels'] + n_labels= None if 'n_labels' not in args else args['n_labels'] + shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing + mean, var = tf.nn.moments(inputs, shift, keep_dims=True) + shape = inputs.shape[1].value + offset_m = self.get.variables(shape=[n_labels,shape], name='offset'+name, + initializer=tf.zeros_initializer) + scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, + initializer=tf.ones_initializer) + + offset = tf.nn.embedding_lookup(offset_m, labels) + scale = tf.nn.embedding_lookup(scale_m, labels) + result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-8) + return result + + def _variable_on_cpu(self,**args): + """ + This function makes sure variables/tensors are not created on the GPU but rather on the CPU + """ + + name = args['name'] + shape = args['shape'] + initializer=None if 'initializer' not in args else args['initializer'] + with tf.device('/cpu:0') : + cpu_var = tf.compat.v1.get_variable(name,shape,initializer= initializer) + return cpu_var + def average_gradients(self,tower_grads): + average_grads = [] + for grad_and_vars in zip(*tower_grads): + grads = [] + for g, _ in grad_and_vars: + expanded_g = tf.expand_dims(g, 0) + grads.append(expanded_g) + + grad = tf.concat(axis=0, values=grads) + grad = tf.reduce_mean(grad, 0) + + v = grad_and_vars[0][1] + grad_and_var = (grad, v) + average_grads.append(grad_and_var) + return average_grads class Generator (GNet): - """ - This class is designed to handle generation of candidate datasets for this it will aggregate a discriminator, this allows the generator not to be random - - """ - def __init__(self,**args): - GNet.__init__(self,**args) - self.discriminator = Discriminator(**args) - def loss(self,**args): - fake = args['fake'] - label = args['label'] - y_hat_fake = self.discriminator.network(inputs=fake, label=label) - #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - loss = -tf.reduce_mean(y_hat_fake) + sum(all_regs) - #tf.add_to_collection('glosses', loss) - tf.compat.v1.add_to_collection('glosses', loss) - return loss, loss - def load_meta(self, column): - super().load_meta(column) - self.discriminator.load_meta(column) - def network(self,**args) : - """ - This function will build the network that will generate the synthetic candidates - :inputs matrix of data that we need - :dim dimensions of ... 
""" - x = args['inputs'] - tmp_dim = self.Z_DIM if 'dim' not in args else args['dim'] - label = args['label'] + This class is designed to handle generation of candidate datasets for this it will aggregate a discriminator, this allows the generator not to be random - with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): - for i, dim in enumerate(self.G_STRUCTURE[:-1]): - kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim]) - h1 = self.normalize(inputs=tf.matmul(x, kernel),shift=0, name='cbn' + str(i), labels=label, n_labels=self.NUM_LABELS) - h2 = tf.nn.relu(h1) - x = x + h2 - tmp_dim = dim - i = len(self.G_STRUCTURE) - 1 - # - # This seems to be an extra hidden layer: - # It's goal is to map continuous values to discrete values (pre-trained to do this) - kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, self.G_STRUCTURE[-1]]) - h1 = self.normalize(inputs=tf.matmul(x, kernel), name='cbn' + str(i), - labels=label, n_labels=self.NUM_LABELS) - h2 = tf.nn.tanh(h1) - x = x + h2 - # This seems to be the output layer - # - kernel = self.get.variables(name='W_' + str(i+1), shape=[self.Z_DIM, self.X_SPACE_SIZE]) - bias = self.get.variables(name='b_' + str(i+1), shape=[self.X_SPACE_SIZE]) - x = tf.nn.sigmoid(tf.add(tf.matmul(x, kernel), bias)) - return x + """ + def __init__(self,**args): + GNet.__init__(self,**args) + self.discriminator = Discriminator(**args) + def loss(self,**args): + fake = args['fake'] + label = args['label'] + y_hat_fake = self.discriminator.network(inputs=fake, label=label) + #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) + loss = -tf.reduce_mean(y_hat_fake) + sum(all_regs) + #tf.add_to_collection('glosses', loss) + tf.compat.v1.add_to_collection('glosses', loss) + return loss, loss + def load_meta(self, column): + super().load_meta(column) + self.discriminator.load_meta(column) + def network(self,**args) : + """ + This function will build the network that will generate the synthetic candidates + :inputs matrix of data that we need + :dim dimensions of ... 
+ """ + x = args['inputs'] + tmp_dim = self.Z_DIM if 'dim' not in args else args['dim'] + label = args['label'] + + with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): + for i, dim in enumerate(self.G_STRUCTURE[:-1]): + kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim]) + h1 = self.normalize(inputs=tf.matmul(x, kernel),shift=0, name='cbn' + str(i), labels=label, n_labels=self.NUM_LABELS) + h2 = tf.nn.relu(h1) + x = x + h2 + tmp_dim = dim + i = len(self.G_STRUCTURE) - 1 + # + # This seems to be an extra hidden layer: + # It's goal is to map continuous values to discrete values (pre-trained to do this) + kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, self.G_STRUCTURE[-1]]) + h1 = self.normalize(inputs=tf.matmul(x, kernel), name='cbn' + str(i), + labels=label, n_labels=self.NUM_LABELS) + h2 = tf.nn.tanh(h1) + x = x + h2 + # This seems to be the output layer + # + kernel = self.get.variables(name='W_' + str(i+1), shape=[self.Z_DIM, self.X_SPACE_SIZE]) + bias = self.get.variables(name='b_' + str(i+1), shape=[self.X_SPACE_SIZE]) + x = tf.nn.sigmoid(tf.add(tf.matmul(x, kernel), bias)) + return x class Discriminator(GNet): - def __init__(self,**args): - GNet.__init__(self,**args) - def network(self,**args): - """ - This function will apply a computational graph on a dataset passed in with the associated labels and the last layer must have a single output (neuron) - :inputs - :label - """ - x = args['inputs'] - label = args['label'] - with tf.compat.v1.variable_scope('D', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): - for i, dim in enumerate(self.D_STRUCTURE[1:]): - kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[i], dim]) - bias = self.get.variables(name='b_' + str(i), shape=[dim]) - # print (["\t",bias,kernel]) - x = tf.nn.relu(tf.add(tf.matmul(x, kernel), bias)) - x = self.normalize(inputs=x, name='cln' + str(i), shift=1,labels=label, n_labels=self.NUM_LABELS) - i = len(self.D_STRUCTURE) - kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[-1], 1]) - bias = self.get.variables(name='b_' + str(i), shape=[1]) - y = tf.add(tf.matmul(x, kernel), bias) - return y - - def loss(self,**args) : - """ - This function compute the loss of - :real - :fake - :label - """ - real = args['real'] - fake = args['fake'] - label = args['label'] - epsilon = tf.random.uniform(shape=[self.BATCHSIZE_PER_GPU,1],minval=0,maxval=1) + def __init__(self,**args): + GNet.__init__(self,**args) + def network(self,**args): + """ + This function will apply a computational graph on a dataset passed in with the associated labels and the last layer must have a single output (neuron) + :inputs + :label + """ + x = args['inputs'] + label = args['label'] + with tf.compat.v1.variable_scope('D', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): + for i, dim in enumerate(self.D_STRUCTURE[1:]): + kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[i], dim]) + bias = self.get.variables(name='b_' + str(i), shape=[dim]) + # print (["\t",bias,kernel]) + x = tf.nn.relu(tf.add(tf.matmul(x, kernel), bias)) + x = self.normalize(inputs=x, name='cln' + str(i), shift=1,labels=label, n_labels=self.NUM_LABELS) + i = len(self.D_STRUCTURE) + kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[-1], 1]) + bias = self.get.variables(name='b_' + str(i), shape=[1]) + y = tf.add(tf.matmul(x, kernel), bias) + return y - x_hat = real + epsilon * 
(fake - real) - y_hat_fake = self.network(inputs=fake, label=label) - - y_hat_real = self.network(inputs=real, label=label) - y_hat = self.network(inputs=x_hat, label=label) - - grad = tf.gradients(y_hat, [x_hat])[0] - slopes = tf.sqrt(tf.reduce_sum(tf.square(grad), 1)) - gradient_penalty = tf.reduce_mean((slopes - 1.) ** 2) - #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - w_distance = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake) - loss = w_distance + 10 * gradient_penalty + sum(all_regs) - #tf.add_to_collection('dlosses', loss) - tf.compat.v1.add_to_collection('dlosses', loss) - - return w_distance, loss + def loss(self,**args) : + """ + This function compute the loss of + :real + :fake + :label + """ + real = args['real'] + fake = args['fake'] + label = args['label'] + epsilon = tf.random.uniform(shape=[self.BATCHSIZE_PER_GPU,1],minval=0,maxval=1) + + x_hat = real + epsilon * (fake - real) + y_hat_fake = self.network(inputs=fake, label=label) + + y_hat_real = self.network(inputs=real, label=label) + y_hat = self.network(inputs=x_hat, label=label) + + grad = tf.gradients(y_hat, [x_hat])[0] + slopes = tf.sqrt(tf.reduce_sum(tf.square(grad), 1)) + gradient_penalty = tf.reduce_mean((slopes - 1.) ** 2) + #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) + w_distance = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake) + loss = w_distance + 10 * gradient_penalty + sum(all_regs) + #tf.add_to_collection('dlosses', loss) + tf.compat.v1.add_to_collection('dlosses', loss) + + return w_distance, loss class Train (GNet): - def __init__(self,**args): - GNet.__init__(self,**args) - self.generator = Generator(**args) - self.discriminator = Discriminator(**args) - self._REAL = args['real'] - self._LABEL= args['label'] - self.column = args['column'] - # print ([" *** ",self.BATCHSIZE_PER_GPU]) - - self.meta = self.log_meta() - if(self.logger): - - self.logger.write( self.meta ) - - self.log (real_shape=list(self._REAL.shape),label_shape = list(self._LABEL.shape),meta_data=self.meta) - def load_meta(self, column): - """ - This function will delegate the calls to load meta data to it's dependents - column name - """ - super().load_meta(column) - self.generator.load_meta(column) - self.discriminator.load_meta(column) - def loss(self,**args): - """ - This function will compute a "tower" loss of the generated candidate against real data - Training will consist in having both generator and discriminators - :scope - :stage - :real - :label - """ - - scope = args['scope'] - stage = args['stage'] - real = args['real'] - label = args['label'] - label = tf.cast(label, tf.int32) - # - # @TODO: Ziqi needs to explain what's going on here - m = [[i] for i in np.arange(self._LABEL.shape[1]-2)] - label = label[:, 1] * len(m) + tf.squeeze( - tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32)) - ) - # label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] ) - z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) + def __init__(self,**args): + GNet.__init__(self,**args) + self.generator = Generator(**args) + self.discriminator = Discriminator(**args) + self._REAL = args['real'] + self._LABEL= args['label'] + self.column = args['column'] + # print ([" *** ",self.BATCHSIZE_PER_GPU]) + + self.meta = self.log_meta() + if(self.logger): + + self.logger.write( self.meta ) + + 
self.log (real_shape=list(self._REAL.shape),label_shape = list(self._LABEL.shape),meta_data=self.meta) + def load_meta(self, column): + """ + This function will delegate the calls to load meta data to it's dependents + column name + """ + super().load_meta(column) + self.generator.load_meta(column) + self.discriminator.load_meta(column) + def loss(self,**args): + """ + This function will compute a "tower" loss of the generated candidate against real data + Training will consist in having both generator and discriminators + :scope + :stage + :real + :label + """ + + scope = args['scope'] + stage = args['stage'] + real = args['real'] + label = args['label'] + label = tf.cast(label, tf.int32) + # + # @TODO: Ziqi needs to explain what's going on here + m = [[i] for i in np.arange(self._LABEL.shape[1]-2)] + label = label[:, 1] * len(m) + tf.squeeze( + tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32)) + ) + # label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] ) + z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) + + fake = self.generator.network(inputs=z, label=label) + if stage == 'D': + w, loss = self.discriminator.loss(real=real, fake=fake, label=label) + #losses = tf.get_collection('dlosses', scope) + flag = 'dlosses' + losses = tf.compat.v1.get_collection('dlosses', scope) + else: + w, loss = self.generator.loss(fake=fake, label=label) + #losses = tf.get_collection('glosses', scope) + flag = 'glosses' + losses = tf.compat.v1.get_collection('glosses', scope) + # losses = tf.compat.v1.get_collection(flag, scope) + + total_loss = tf.add_n(losses, name='total_loss') + + return total_loss, w + def input_fn(self): + """ + This function seems to produce + """ + features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32) + labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32) + dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) + dataset = dataset.repeat(10000) + dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) + dataset = dataset.prefetch(1) + # iterator = dataset.make_initializable_iterator() + iterator = tf.compat.v1.data.make_initializable_iterator(dataset) + # next_element = iterator.get_next() + # init_op = iterator.initializer + return iterator, features_placeholder, labels_placeholder - fake = self.generator.network(inputs=z, label=label) - if stage == 'D': - w, loss = self.discriminator.loss(real=real, fake=fake, label=label) - #losses = tf.get_collection('dlosses', scope) - flag = 'dlosses' - losses = tf.compat.v1.get_collection('dlosses', scope) - else: - w, loss = self.generator.loss(fake=fake, label=label) - #losses = tf.get_collection('glosses', scope) - flag = 'glosses' - losses = tf.compat.v1.get_collection('glosses', scope) - # losses = tf.compat.v1.get_collection(flag, scope) - - total_loss = tf.add_n(losses, name='total_loss') + def network(self,**args): + # def graph(stage, opt): + # global_step = tf.get_variable(stage+'_step', [], initializer=tf.constant_initializer(0), trainable=False) + stage = args['stage'] + opt = args['opt'] + tower_grads = [] + per_gpu_w = [] + iterator, features_placeholder, labels_placeholder = self.input_fn() + with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()): + for i in range(self.NUM_GPUS): + with tf.device('/gpu:%d' % i): + with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: + (real, label) = iterator.get_next() + loss, w = self.loss(scope=scope, stage=stage, real=self._REAL, 
label=self._LABEL) + #tf.get_variable_scope().reuse_variables() + tf.compat.v1.get_variable_scope().reuse_variables() + #vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage) + vars_ = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=stage) + grads = opt.compute_gradients(loss, vars_) + tower_grads.append(grads) + per_gpu_w.append(w) + + grads = self.average_gradients(tower_grads) + apply_gradient_op = opt.apply_gradients(grads) + + mean_w = tf.reduce_mean(per_gpu_w) + train_op = apply_gradient_op + return train_op, mean_w, iterator, features_placeholder, labels_placeholder + def apply(self,**args): + # max_epochs = args['max_epochs'] if 'max_epochs' in args else 10 + REAL = self._REAL + LABEL= self._LABEL + if (self.logger): + pass + + with tf.device('/cpu:0'): + opt_d = tf.compat.v1.train.AdamOptimizer(1e-4) + opt_g = tf.compat.v1.train.AdamOptimizer(1e-4) + + train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d) + train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g) + # saver = tf.train.Saver() + saver = tf.compat.v1.train.Saver() + # init = tf.global_variables_initializer() + init = tf.compat.v1.global_variables_initializer() + logs = [] + #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: + with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: + sess.run(init) + sess.run(iterator_d.initializer, + feed_dict={features_placeholder_d: REAL, labels_placeholder_d: LABEL}) + sess.run(iterator_g.initializer, + feed_dict={features_placeholder_g: REAL, labels_placeholder_g: LABEL}) + + for epoch in range(1, self.MAX_EPOCHS + 1): + start_time = time.time() + w_sum = 0 + for i in range(self.STEPS_PER_EPOCH): + for _ in range(2): + _, w = sess.run([train_d, w_distance]) + w_sum += w + sess.run(train_g) + duration = time.time() - start_time + + assert not np.isnan(w_sum), 'Model diverged with loss = NaN' + + format_str = 'epoch: %d, w_distance = %f (%.1f)' + print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) + # print (dir (w_distance)) + + logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) + + if epoch % self.MAX_EPOCHS == 0: + # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + suffix = self.get.suffix() + _name = os.sep.join([self.train_dir,suffix]) + # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) + saver.save(sess, _name, write_meta_graph=False, global_step=epoch) + # + # + if self.logger : + row = {"logs":logs} #,"model":pickle.dump(sess)} + self.logger.write(row) + # + # @TODO: + # We should upload the files in the checkpoint + # This would allow the learnt model to be portable to another system + # + tf.compat.v1.reset_default_graph() - return total_loss, w - def input_fn(self): +class Predict(GNet): """ - This function seems to produce + This class uses synthetic data given a learned model """ - features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32) - labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32) - dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) - dataset = dataset.repeat(10000) - dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) - 
dataset = dataset.prefetch(1) - # iterator = dataset.make_initializable_iterator() - iterator = tf.compat.v1.data.make_initializable_iterator(dataset) - # next_element = iterator.get_next() - # init_op = iterator.initializer - return iterator, features_placeholder, labels_placeholder - - def network(self,**args): - # def graph(stage, opt): - # global_step = tf.get_variable(stage+'_step', [], initializer=tf.constant_initializer(0), trainable=False) - stage = args['stage'] - opt = args['opt'] - tower_grads = [] - per_gpu_w = [] - iterator, features_placeholder, labels_placeholder = self.input_fn() - with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()): - for i in range(self.NUM_GPUS): - with tf.device('/gpu:%d' % i): - with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: - (real, label) = iterator.get_next() - loss, w = self.loss(scope=scope, stage=stage, real=self._REAL, label=self._LABEL) - #tf.get_variable_scope().reuse_variables() - tf.compat.v1.get_variable_scope().reuse_variables() - #vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage) - vars_ = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=stage) - grads = opt.compute_gradients(loss, vars_) - tower_grads.append(grads) - per_gpu_w.append(w) - - grads = self.average_gradients(tower_grads) - apply_gradient_op = opt.apply_gradients(grads) - - mean_w = tf.reduce_mean(per_gpu_w) - train_op = apply_gradient_op - return train_op, mean_w, iterator, features_placeholder, labels_placeholder - def apply(self,**args): - # max_epochs = args['max_epochs'] if 'max_epochs' in args else 10 - REAL = self._REAL - LABEL= self._LABEL - if (self.logger): - pass - - with tf.device('/cpu:0'): - opt_d = tf.compat.v1.train.AdamOptimizer(1e-4) - opt_g = tf.compat.v1.train.AdamOptimizer(1e-4) - - train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d) - train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g) - # saver = tf.train.Saver() - saver = tf.compat.v1.train.Saver() - # init = tf.global_variables_initializer() - init = tf.compat.v1.global_variables_initializer() - logs = [] - #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: - with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: - sess.run(init) - sess.run(iterator_d.initializer, - feed_dict={features_placeholder_d: REAL, labels_placeholder_d: LABEL}) - sess.run(iterator_g.initializer, - feed_dict={features_placeholder_g: REAL, labels_placeholder_g: LABEL}) - - for epoch in range(1, self.MAX_EPOCHS + 1): - start_time = time.time() - w_sum = 0 - for i in range(self.STEPS_PER_EPOCH): - for _ in range(2): - _, w = sess.run([train_d, w_distance]) - w_sum += w - sess.run(train_g) - duration = time.time() - start_time - - assert not np.isnan(w_sum), 'Model diverged with loss = NaN' - - format_str = 'epoch: %d, w_distance = %f (%.1f)' - print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) - # print (dir (w_distance)) - - logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) - - if epoch % self.MAX_EPOCHS == 0: - # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - suffix = self.get.suffix() - _name = os.sep.join([self.train_dir,suffix]) - # saver.save(sess, self.train_dir, write_meta_graph=False, 
global_step=epoch) - saver.save(sess, _name, write_meta_graph=False, global_step=epoch) + def __init__(self,**args): + GNet.__init__(self,**args) + self.generator = Generator(**args) + self.values = args['values'] + def load_meta(self, column): + super().load_meta(column) + self.generator.load_meta(column) + def apply(self,**args): + # print (self.train_dir) + # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + suffix = self.get.suffix() + model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) + demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] + tf.compat.v1.reset_default_graph() + z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) + y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32) + ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] + label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) + + fake = self.generator.network(inputs=z, label=label) + init = tf.compat.v1.global_variables_initializer() + saver = tf.compat.v1.train.Saver() + df = pd.DataFrame() + CANDIDATE_COUNT = 1000 + NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0] + with tf.compat.v1.Session() as sess: + + # sess.run(init) + saver.restore(sess, model_dir) + labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) + + found = [] + labels= demo + for i in np.arange(CANDIDATE_COUNT) : + + f = sess.run(fake,feed_dict={y:labels}) + # + # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes + # The code below will insure we have some acceptable cardinal relationships between id and synthetic values + # + df = ( pd.DataFrame(np.round(f).astype(np.int32))) + print (df.head()) + print () + p = 0 not in df.sum(axis=1).values + + if p: + found.append(df) + if len(found) == NTH_VALID_CANDIDATE or i == CANDIDATE_COUNT: + break + else: + continue + + # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms + # df = (i * df).sum(axis=1) # + # In case we are dealing with actual values like diagnosis codes we can perform # - if self.logger : - row = {"logs":logs} #,"model":pickle.dump(sess)} - self.logger.write(row) - # - # @TODO: - # We should upload the files in the checkpoint - # This would allow the learnt model to be portable to another system - # - tf.compat.v1.reset_default_graph() + df = found[np.random.choice(np.arange(len(found)),1)[0]] + columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] + + # r = np.zeros((self.ROW_COUNT,len(columns))) + r = np.zeros(self.ROW_COUNT) + df.columns = self.values + if len(found): + print (len(found),NTH_VALID_CANDIDATE) + # x = df * self.values + + df = pd.DataFrame( df.apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) + df.columns = columns + + + + + tf.compat.v1.reset_default_graph() + + return df.to_dict(orient='list') + # return df.to_dict(orient='list') + # count = str(len(os.listdir(self.out_dir))) + # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv']) + # df.to_csv(_name,index=False) + + + # output.extend(np.round(f)) + + # for m in range(2): + # for n in range(2, self.NUM_LABELS): + # idx1 = (demo[:, m] == 1) + # idx2 = (demo[:, n] == 1) + # idx = [idx1[j] and idx2[j] for j in range(len(idx1))] + # num = np.sum(idx) + # print 
("___________________list__") + # print (idx1) + # print (idx2) + # print (idx) + # print (num) + # print ("_____________________") + # nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU)) + # label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS)) + # label_input[:, n] = 1 + # label_input[:, m] = 1 + # output = [] + # for i in range(nbatch): + # f = sess.run(fake,feed_dict={y: label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]}) + # output.extend(np.round(f)) + # output = np.array(output)[:num] + # print ([m,n,output]) + + # np.save(self.out_dir + str(m) + str(n), output) + -class Predict(GNet): - """ - This class uses synthetic data given a learned model - """ - def __init__(self,**args): - GNet.__init__(self,**args) - self.generator = Generator(**args) - self.values = args['values'] - def load_meta(self, column): - super().load_meta(column) - self.generator.load_meta(column) - def apply(self,**args): - # print (self.train_dir) - # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - suffix = self.get.suffix() - model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) - demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] - tf.compat.v1.reset_default_graph() - z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) - y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32) - ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] - label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) +if __name__ == '__main__' : + # + # Now we get things done ... + column = SYS_ARGS['column'] + column_id = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id' + column_id = column_id.split(',') if ',' in column_id else column_id + df = pd.read_csv(SYS_ARGS['raw-data']) + LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - fake = self.generator.network(inputs=z, label=label) - init = tf.compat.v1.global_variables_initializer() - saver = tf.compat.v1.train.Saver() - df = pd.DataFrame() - CANDIDATE_COUNT = 1000 - NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0] - with tf.compat.v1.Session() as sess: - - # sess.run(init) - saver.restore(sess, model_dir) - labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) - - found = [] - labels= demo - for i in np.arange(CANDIDATE_COUNT) : + context = SYS_ARGS['raw-data'].split(os.sep)[-1:][0][:-4] + if set(['train','learn']) & set(SYS_ARGS.keys()): + + df = pd.read_csv(SYS_ARGS['raw-data']) + + # cols = SYS_ARGS['column'] + # _map,_df = (Binary()).Export(df) + # i = np.arange(_map[column]['start'],_map[column]['end']) + max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10 + # REAL = _df[:,i] + REAL = pd.get_dummies(df[column]).astype(np.float32).values + LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values + trainer = Train(context=context,max_epochs=max_epochs,real=REAL,label=LABEL,column=column,column_id=column_id) + trainer.apply() - f = sess.run(fake,feed_dict={y:labels}) + + + + # + # We should train upon this data # - # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes - # The code below will insure we have some acceptable cardinal relationships between id and synthetic values + # -- we need to convert the data-frame to binary matrix, given a column # - df = ( 
pd.DataFrame(np.round(f).astype(np.int32))) - p = 0 not in df.sum(axis=1).values + pass + elif 'generate' in SYS_ARGS: + values = df[column].unique().tolist() + values.sort() - if p: - found.append(df) - if len(found) == NTH_VALID_CANDIDATE or i == CANDIDATE_COUNT: - break - else: - continue - - # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms - # df = (i * df).sum(axis=1) - # - # In case we are dealing with actual values like diagnosis codes we can perform - # - df = found[np.random.choice(np.arange(len(found)),1)[0]] - columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] - - # r = np.zeros((self.ROW_COUNT,len(columns))) - r = np.zeros(self.ROW_COUNT) - df.columns = self.values - if len(found): - print (len(found),NTH_VALID_CANDIDATE) - # x = df * self.values + p = Predict(context=context,label=LABEL,values=values,column=column) + p.load_meta(column) + r = p.apply() + print (df) + print () + df[column] = r[column] + print (df) - df = pd.DataFrame( df.apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) - df.columns = columns - - - - tf.compat.v1.reset_default_graph() - - return df.to_dict(orient='list') - # return df.to_dict(orient='list') - # count = str(len(os.listdir(self.out_dir))) - # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv']) - # df.to_csv(_name,index=False) - - - # output.extend(np.round(f)) - - # for m in range(2): - # for n in range(2, self.NUM_LABELS): - # idx1 = (demo[:, m] == 1) - # idx2 = (demo[:, n] == 1) - # idx = [idx1[j] and idx2[j] for j in range(len(idx1))] - # num = np.sum(idx) - # print ("___________________list__") - # print (idx1) - # print (idx2) - # print (idx) - # print (num) - # print ("_____________________") - # nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU)) - # label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS)) - # label_input[:, n] = 1 - # label_input[:, m] = 1 - # output = [] - # for i in range(nbatch): - # f = sess.run(fake,feed_dict={y: label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]}) - # output.extend(np.round(f)) - # output = np.array(output)[:num] - # print ([m,n,output]) - - # np.save(self.out_dir + str(m) + str(n), output) - - -if __name__ == '__main__' : - # - # Now we get things done ... 
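
For orientation, the command-line block being reworked in this hunk reduces to the flow below when the same classes are driven from Python directly. This is a minimal illustrative sketch, assuming the module is importable as data.gan; 'sample.csv', 'gender' and 'person_id' are placeholder names, not values taken from the patch.

    # Illustrative sketch only: programmatic equivalent of the train / generate
    # branches of the __main__ block shown in this hunk.
    import numpy as np
    import pandas as pd
    from data.gan import Train, Predict   # same classes, used outside this module

    df = pd.read_csv('sample.csv')                                      # stands in for --raw-data
    REAL = pd.get_dummies(df['gender']).astype(np.float32).values       # column to synthesize
    LABEL = pd.get_dummies(df['person_id']).astype(np.float32).values   # identifier column

    # 'train' / 'learn' branch
    Train(context='sample', max_epochs=10, real=REAL, label=LABEL,
          column='gender', column_id='person_id').apply()

    # 'generate' branch
    values = sorted(df['gender'].unique().tolist())
    p = Predict(context='sample', label=LABEL, values=values, column='gender')
    p.load_meta('gender')
    r = p.apply()
    df['gender'] = r['gender']   # replace the real column with synthesized values
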
- column = SYS_ARGS['column'] - column_id = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id' - column_id = column_id.split(',') if ',' in column_id else column_id - df = pd.read_csv(SYS_ARGS['raw-data']) - LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - - context = SYS_ARGS['raw-data'].split(os.sep)[-1:][0][:-4] - if set(['train','learn']) & set(SYS_ARGS.keys()): - - df = pd.read_csv(SYS_ARGS['raw-data']) - - # cols = SYS_ARGS['column'] - # _map,_df = (Binary()).Export(df) - # i = np.arange(_map[column]['start'],_map[column]['end']) - max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10 - # REAL = _df[:,i] - REAL = pd.get_dummies(df[column]).astype(np.float32).values - LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - trainer = Train(context=context,max_epochs=max_epochs,real=REAL,label=LABEL,column=column,column_id=column_id) - trainer.apply() - - - - - # - # We should train upon this data - # - # -- we need to convert the data-frame to binary matrix, given a column - # + else: + print (SYS_ARGS.keys()) + print (__doc__) pass - elif 'generate' in SYS_ARGS: - values = df[column].unique().tolist() - values.sort() - - p = Predict(context=context,label=LABEL,values=values,column=column) - p.load_meta(column) - r = p.apply() - print (df) - print () - df[column] = r[column] - print (df) - - - else: - print (SYS_ARGS.keys()) - print (__doc__) - pass diff --git a/setup.py b/setup.py index 8034249..a0b96c7 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.0.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.0.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' From 0f0c2642c2e8d1d3a2463c6945c18441a7392691 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 18 Feb 2020 02:59:39 -0600 Subject: [PATCH 010/250] bug fix with binary matrix generation --- data/bridge.py | 8 +++++--- data/gan.py | 8 +------- data/maker/__init__.py | 12 +++++++++--- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/data/bridge.py b/data/bridge.py index fa323af..019f065 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -191,12 +191,13 @@ class Binary : # # This will give us a map of how each column was mapped to a bitstream - _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0) + # _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0) + _map = df.fillna('').apply(lambda column: self.__stream(column),axis=0) # # We will merge this to have a healthy matrix _matrix = _map.apply(lambda row: list(list(itertools.chain(*row.values.tolist()))),axis=1) - _matrix = np.matrix([list(item) for item in _matrix]) + _matrix = np.matrix([list(item) for item in _matrix]).astype(np.float32) # # let's format the map so we don't have an unreasonable amount of data # @@ -210,7 +211,8 @@ class Binary : _m[name] = {"start":beg,"end":end} beg = end - return _m,_matrix.astype(np.float32) + # return 
_m,_matrix.astype(np.float32) + return _matrix def Import(self,df,values,_map): """ diff --git a/data/gan.py b/data/gan.py index 367d63c..3d600a3 100644 --- a/data/gan.py +++ b/data/gan.py @@ -397,17 +397,13 @@ class Train (GNet): labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32) dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) dataset = dataset.repeat(10000) - dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) + dataset = dataset.batch(batch_size=3000) dataset = dataset.prefetch(1) # iterator = dataset.make_initializable_iterator() iterator = tf.compat.v1.data.make_initializable_iterator(dataset) - # next_element = iterator.get_next() - # init_op = iterator.initializer return iterator, features_placeholder, labels_placeholder def network(self,**args): - # def graph(stage, opt): - # global_step = tf.get_variable(stage+'_step', [], initializer=tf.constant_initializer(0), trainable=False) stage = args['stage'] opt = args['opt'] tower_grads = [] @@ -540,8 +536,6 @@ class Predict(GNet): # The code below will insure we have some acceptable cardinal relationships between id and synthetic values # df = ( pd.DataFrame(np.round(f).astype(np.int32))) - print (df.head()) - print () p = 0 not in df.sum(axis=1).values if p: diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 12abc8d..74ae718 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -12,6 +12,7 @@ import pandas as pd import numpy as np import data.gan as gan from transport import factory +from data.bridge import Binary import threading as thread def train (**args) : """ @@ -32,9 +33,12 @@ def train (**args) : # If we have several columns we will proceed one at a time (it could be done in separate threads) # @TODO : Consider performing this task on several threads/GPUs simulataneously # - args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values + handler = Binary() + # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values + args['label'] = handler.Export(df[[column_id]]) for col in column : - args['real'] = pd.get_dummies(df[col]).astype(np.float32).values + # args['real'] = pd.get_dummies(df[col]).astype(np.float32).values + args['real'] = handler.Export(df[[col]]) args['column'] = col args['context'] = col context = args['context'] @@ -77,7 +81,9 @@ def generate(**args): #@TODO: # If the identifier is not present, we should fine a way to determine or make one # - args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values + # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values + bwrangler = Binary() + args['label'] = bwrangler.Export(df[[column_id]]) _df = df.copy() for col in column : args['context'] = col From dab3ab7bf732504f0205536402a1976c18ca3df0 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 18 Feb 2020 03:09:47 -0600 Subject: [PATCH 011/250] version stuff --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a0b96c7..fcc12c1 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.0.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.1.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", 
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' From 4a25af6b1345223d9acb20f6eef74b09dd083eeb Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 18 Feb 2020 12:25:47 -0600 Subject: [PATCH 012/250] removing conditions, it blows up computational space --- data/gan.py | 89 +++++++++++++++++++++++++++--------------- data/maker/__init__.py | 7 ++-- setup.py | 2 +- 3 files changed, 62 insertions(+), 36 deletions(-) diff --git a/data/gan.py b/data/gan.py index 3d600a3..77fcf3d 100644 --- a/data/gan.py +++ b/data/gan.py @@ -72,7 +72,7 @@ class GNet : elif 'label' in args and len(args['label']) == 1 : self.NUM_LABELS = args['label'].shape[0] else: - self.NUM_LABELS = 8 + self.NUM_LABELS = None # self.Z_DIM = 128 #self.X_SPACE_SIZE self.Z_DIM = 128 #-- used as rows down stream self.G_STRUCTURE = [self.Z_DIM,self.Z_DIM] @@ -180,14 +180,19 @@ class GNet : shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing mean, var = tf.nn.moments(inputs, shift, keep_dims=True) shape = inputs.shape[1].value - offset_m = self.get.variables(shape=[n_labels,shape], name='offset'+name, - initializer=tf.zeros_initializer) - scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, - initializer=tf.ones_initializer) - - offset = tf.nn.embedding_lookup(offset_m, labels) - scale = tf.nn.embedding_lookup(scale_m, labels) - result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-8) + if labels is not None: + offset_m = self.get.variables(shape=[1,shape], name='offset'+name, + initializer=tf.zeros_initializer) + scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, + initializer=tf.ones_initializer) + offset = tf.nn.embedding_lookup(offset_m, labels) + scale = tf.nn.embedding_lookup(scale_m, labels) + + else: + offset = None + scale = None + + result = tf.nn.batch_normalization(inputs, mean, var,offset,scale, 1e-8) return result def _variable_on_cpu(self,**args): @@ -248,7 +253,7 @@ class Generator (GNet): x = args['inputs'] tmp_dim = self.Z_DIM if 'dim' not in args else args['dim'] label = args['label'] - + print (self.NUM_LABELS) with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): for i, dim in enumerate(self.G_STRUCTURE[:-1]): kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim]) @@ -331,7 +336,7 @@ class Train (GNet): self.generator = Generator(**args) self.discriminator = Discriminator(**args) self._REAL = args['real'] - self._LABEL= args['label'] + self._LABEL= args['label'] if 'label' in args else None self.column = args['column'] # print ([" *** ",self.BATCHSIZE_PER_GPU]) @@ -340,7 +345,7 @@ class Train (GNet): self.logger.write( self.meta ) - self.log (real_shape=list(self._REAL.shape),label_shape = list(self._LABEL.shape),meta_data=self.meta) + # self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta) def load_meta(self, column): """ This function will delegate the calls to load meta data to it's dependents @@ -363,13 +368,16 @@ class Train (GNet): stage = args['stage'] real = args['real'] label = args['label'] - label = tf.cast(label, tf.int32) - # - # @TODO: Ziqi needs to explain what's going on here - m = [[i] for i in 
np.arange(self._LABEL.shape[1]-2)] - label = label[:, 1] * len(m) + tf.squeeze( - tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32)) - ) + + + if label is not None : + label = tf.cast(label, tf.int32) + # + # @TODO: Ziqi needs to explain what's going on here + m = [[i] for i in np.arange(self._LABEL.shape[1]-2)] + label = label[:, 1] * len(m) + tf.squeeze( + tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32)) + ) # label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] ) z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) @@ -394,8 +402,13 @@ class Train (GNet): This function seems to produce """ features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32) - labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32) - dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) + LABEL_SHAPE = [None,None] if self._LABEL is None else self._LABEL.shape + labels_placeholder = tf.compat.v1.placeholder(shape=LABEL_SHAPE, dtype=tf.float32) + if self._LABEL is not None : + dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) + else : + dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) + # labels_placeholder = None dataset = dataset.repeat(10000) dataset = dataset.batch(batch_size=3000) dataset = dataset.prefetch(1) @@ -413,7 +426,10 @@ class Train (GNet): for i in range(self.NUM_GPUS): with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: - (real, label) = iterator.get_next() + if self._LABEL is not None : + (real, label) = iterator.get_next() + else: + real = iterator.get_next() loss, w = self.loss(scope=scope, stage=stage, real=self._REAL, label=self._LABEL) #tf.get_variable_scope().reuse_variables() tf.compat.v1.get_variable_scope().reuse_variables() @@ -450,11 +466,12 @@ class Train (GNet): #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: sess.run(init) + sess.run(iterator_d.initializer, - feed_dict={features_placeholder_d: REAL, labels_placeholder_d: LABEL}) + feed_dict={features_placeholder_d: REAL}) sess.run(iterator_g.initializer, - feed_dict={features_placeholder_g: REAL, labels_placeholder_g: LABEL}) - + feed_dict={features_placeholder_g: REAL}) + for epoch in range(1, self.MAX_EPOCHS + 1): start_time = time.time() w_sum = 0 @@ -511,9 +528,11 @@ class Predict(GNet): tf.compat.v1.reset_default_graph() z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32) - ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] - label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) - + if self._LABEL is not None : + ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] + label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) + else: + label = None fake = self.generator.network(inputs=z, label=label) init = tf.compat.v1.global_variables_initializer() saver = tf.compat.v1.train.Saver() @@ -524,13 +543,19 @@ class Predict(GNet): # sess.run(init) saver.restore(sess, model_dir) - labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) + if self._LABEL is not None : + labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) + labels= demo + else: + labels = None found = 
[] - labels= demo + for i in np.arange(CANDIDATE_COUNT) : - - f = sess.run(fake,feed_dict={y:labels}) + if labels : + f = sess.run(fake,feed_dict={y:labels}) + else: + f = sess.run(fake) # # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes # The code below will insure we have some acceptable cardinal relationships between id and synthetic values diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 74ae718..71fdc68 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -25,7 +25,7 @@ def train (**args) : """ column = args['column'] if (isinstance(args['column'],list)) else [args['column']] - column_id = args['id'] + # column_id = args['id'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) df.columns = [name.lower() for name in df.columns] @@ -35,7 +35,8 @@ def train (**args) : # handler = Binary() # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values - args['label'] = handler.Export(df[[column_id]]) + # args['label'] = handler.Export(df[[column_id]]) + # args['label'] = np.ones(df.shape[0]).reshape(df.shape[0],1) for col in column : # args['real'] = pd.get_dummies(df[col]).astype(np.float32).values args['real'] = handler.Export(df[[col]]) @@ -83,7 +84,7 @@ def generate(**args): # # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values bwrangler = Binary() - args['label'] = bwrangler.Export(df[[column_id]]) + # args['label'] = bwrangler.Export(df[[column_id]]) _df = df.copy() for col in column : args['context'] = col diff --git a/setup.py b/setup.py index fcc12c1..50155cc 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ def read(fname): args = {"name":"data-maker","version":"1.1.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] -args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' +args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' if sys.version_info[0] == 2 : args['use_2to3'] = False From cac2dd293def20f8343ef0e3647e2f83c9f6c461 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 18 Feb 2020 16:56:24 -0600 Subject: [PATCH 013/250] bug fix with dimensionalities and removing conditions --- data/gan.py | 9 ++++++--- setup.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/data/gan.py b/data/gan.py index 77fcf3d..c18277c 100644 --- a/data/gan.py +++ b/data/gan.py @@ -79,7 +79,8 @@ class GNet : if 'real' in args : self.D_STRUCTURE = [args['real'].shape[1],256,self.Z_DIM] - self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) if 'real' in args else 256 + # self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) if 'real' in args else 256 + self.BATCHSIZE_PER_GPU = 3000 if 'batch_size' not in args else int(args['batch_size']) self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) @@ -410,7 +411,7 @@ class Train (GNet): dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) # labels_placeholder = None dataset = dataset.repeat(10000) - dataset = dataset.batch(batch_size=3000) + dataset = 
dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) dataset = dataset.prefetch(1) # iterator = dataset.make_initializable_iterator() iterator = tf.compat.v1.data.make_initializable_iterator(dataset) @@ -430,7 +431,8 @@ class Train (GNet): (real, label) = iterator.get_next() else: real = iterator.get_next() - loss, w = self.loss(scope=scope, stage=stage, real=self._REAL, label=self._LABEL) + label= None + loss, w = self.loss(scope=scope, stage=stage, real=real, label=label) #tf.get_variable_scope().reuse_variables() tf.compat.v1.get_variable_scope().reuse_variables() #vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage) @@ -465,6 +467,7 @@ class Train (GNet): logs = [] #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: + sess.run(init) sess.run(iterator_d.initializer, diff --git a/setup.py b/setup.py index 50155cc..8d41539 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.1.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.1.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From f63ede2fc58c983635b4c5a89ef33031938232d9 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 18 Feb 2020 17:23:13 -0600 Subject: [PATCH 014/250] tweak with batch size/gpu (bug with small data) --- data/gan.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/data/gan.py b/data/gan.py index c18277c..ed8facd 100644 --- a/data/gan.py +++ b/data/gan.py @@ -59,8 +59,8 @@ class GNet : self.logs = {} self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] - if self.NUM_GPUS > 1 : - os.environ['CUDA_VISIBLE_DEVICES'] = "4" + # if self.NUM_GPUS > 1 : + # os.environ['CUDA_VISIBLE_DEVICES'] = "4" self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854 self.G_STRUCTURE = [128,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE] @@ -78,9 +78,12 @@ class GNet : self.G_STRUCTURE = [self.Z_DIM,self.Z_DIM] if 'real' in args : self.D_STRUCTURE = [args['real'].shape[1],256,self.Z_DIM] - - # self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) if 'real' in args else 256 - self.BATCHSIZE_PER_GPU = 3000 if 'batch_size' not in args else int(args['batch_size']) + PROPOSED_BATCH_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size']) + if args['real'].shape[0] < PROPOSED_BATCH_PER_GPU : + self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) + else: + self.BATCHSIZE_PER_GPU = PROPOSED_BATCH_PER_GPU + # self.BATCHSIZE_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size']) self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) @@ -254,7 +257,7 @@ class Generator (GNet): x = args['inputs'] tmp_dim = 
self.Z_DIM if 'dim' not in args else args['dim'] label = args['label'] - print (self.NUM_LABELS) + with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): for i, dim in enumerate(self.G_STRUCTURE[:-1]): kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim]) From 74c1f9d511a494bcf5e6d40c3d0e9690e088cea4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 20 Feb 2020 09:52:53 -0600 Subject: [PATCH 015/250] bug fix with class hierarchy --- data/gan.py | 12 ++++++------ data/maker/__init__.py | 4 ++-- setup.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/data/gan.py b/data/gan.py index ed8facd..fd30070 100644 --- a/data/gan.py +++ b/data/gan.py @@ -76,13 +76,13 @@ class GNet : # self.Z_DIM = 128 #self.X_SPACE_SIZE self.Z_DIM = 128 #-- used as rows down stream self.G_STRUCTURE = [self.Z_DIM,self.Z_DIM] - if 'real' in args : - self.D_STRUCTURE = [args['real'].shape[1],256,self.Z_DIM] PROPOSED_BATCH_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size']) - if args['real'].shape[0] < PROPOSED_BATCH_PER_GPU : - self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) - else: - self.BATCHSIZE_PER_GPU = PROPOSED_BATCH_PER_GPU + self.BATCHSIZE_PER_GPU = PROPOSED_BATCH_PER_GPU + if 'real' in args : + self.D_STRUCTURE = [args['real'].shape[1],256,self.Z_DIM] + + if args['real'].shape[0] < PROPOSED_BATCH_PER_GPU : + self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) # self.BATCHSIZE_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size']) self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 71fdc68..cbd1ea9 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -38,8 +38,8 @@ def train (**args) : # args['label'] = handler.Export(df[[column_id]]) # args['label'] = np.ones(df.shape[0]).reshape(df.shape[0],1) for col in column : - # args['real'] = pd.get_dummies(df[col]).astype(np.float32).values - args['real'] = handler.Export(df[[col]]) + args['real'] = pd.get_dummies(df[col]).astype(np.float32).values + # args['real'] = handler.Export(df[[col]]) args['column'] = col args['context'] = col context = args['context'] diff --git a/setup.py b/setup.py index 8d41539..a7e0642 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.1.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.1.2","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From cd88a9660a64f6115f27065a52551ca9fa8dd35e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 20 Feb 2020 22:37:25 -0600 Subject: [PATCH 016/250] bug fix , generator --- data/gan.py | 1 + data/maker/__init__.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index fd30070..2083f69 100644 --- a/data/gan.py +++ b/data/gan.py @@ -581,6 +581,7 @@ 
class Predict(GNet): # # In case we are dealing with actual values like diagnosis codes we can perform # + df = found[np.random.choice(np.arange(len(found)),1)[0]] columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] diff --git a/data/maker/__init__.py b/data/maker/__init__.py index cbd1ea9..3c04b57 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -38,7 +38,7 @@ def train (**args) : # args['label'] = handler.Export(df[[column_id]]) # args['label'] = np.ones(df.shape[0]).reshape(df.shape[0],1) for col in column : - args['real'] = pd.get_dummies(df[col]).astype(np.float32).values + args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values # args['real'] = handler.Export(df[[col]]) args['column'] = col args['context'] = col From c1a500fe4c3d18fa0606b2e68ad980515cd30f52 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 20 Feb 2020 23:08:35 -0600 Subject: [PATCH 017/250] bug fix , generator --- data/gan.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/data/gan.py b/data/gan.py index 2083f69..4c05566 100644 --- a/data/gan.py +++ b/data/gan.py @@ -568,8 +568,9 @@ class Predict(GNet): # df = ( pd.DataFrame(np.round(f).astype(np.int32))) p = 0 not in df.sum(axis=1).values - - if p: + x = df.sum(axis=1).values + print ( [np.sum(x),x.size]) + if np.divide( np.sum(x), x.size) : found.append(df) if len(found) == NTH_VALID_CANDIDATE or i == CANDIDATE_COUNT: break From 553ee75a0681a80ea95fd0bdcf7920d383510c8d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Feb 2020 11:41:40 -0600 Subject: [PATCH 018/250] bug fix around shape of candidate data to generate --- data/gan.py | 49 +++++++++++++++++++++++++++++++----------- data/maker/__init__.py | 8 +++---- 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/data/gan.py b/data/gan.py index 4c05566..6e6454e 100644 --- a/data/gan.py +++ b/data/gan.py @@ -166,7 +166,15 @@ class GNet : return _object def mkdir (self,path): if not os.path.exists(path) : - os.mkdir(path) + if os.sep in path : + pass + root = [] + for loc in path.split(os.sep) : + root.append(loc) + os.mkdir(os.sep.join(root)) + + else: + os.mkdir(path) def normalize(self,**args): @@ -520,8 +528,10 @@ class Predict(GNet): """ def __init__(self,**args): GNet.__init__(self,**args) - self.generator = Generator(**args) - self.values = args['values'] + self.generator = Generator(**args) + self.values = args['values'] + self.ROW_COUNT = args['row_count'] + self.MISSING_VALUES = args['no_value'] def load_meta(self, column): super().load_meta(column) self.generator.load_meta(column) @@ -532,8 +542,8 @@ class Predict(GNet): model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] tf.compat.v1.reset_default_graph() - z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) - y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32) + z = tf.random.normal(shape=[self.ROW_COUNT, self.Z_DIM]) + y = tf.compat.v1.placeholder(shape=[self.ROW_COUNT, self.NUM_LABELS], dtype=tf.int32) if self._LABEL is not None : ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) @@ -556,7 +566,7 @@ class Predict(GNet): labels = None found = [] - + ratio = [] for i in np.arange(CANDIDATE_COUNT) : if labels : f = 
sess.run(fake,feed_dict={y:labels}) @@ -569,10 +579,11 @@ class Predict(GNet): df = ( pd.DataFrame(np.round(f).astype(np.int32))) p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values - print ( [np.sum(x),x.size]) - if np.divide( np.sum(x), x.size) : + + if np.divide( np.sum(x), x.size) > .9 or p: + ratio.append(np.divide( np.sum(x), x.size)) found.append(df) - if len(found) == NTH_VALID_CANDIDATE or i == CANDIDATE_COUNT: + if i == CANDIDATE_COUNT: break else: continue @@ -582,8 +593,9 @@ class Predict(GNet): # # In case we are dealing with actual values like diagnosis codes we can perform # - - df = found[np.random.choice(np.arange(len(found)),1)[0]] + INDEX = np.random.choice(np.arange(len(found)),1)[0] + INDEX = ratio.index(np.max(ratio)) + df = found[INDEX] columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] # r = np.zeros((self.ROW_COUNT,len(columns))) @@ -592,9 +604,20 @@ class Predict(GNet): if len(found): print (len(found),NTH_VALID_CANDIDATE) # x = df * self.values - - df = pd.DataFrame( df.apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) + # + # let's get the missing rows (if any) ... + # + ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1) + if ii : + # + #@TODO Have this be a configurable variable + missing = np.repeat(0, np.where(ii==1)[0].size) + else: + missing = [] + i = np.where(ii == 0)[0] + df = pd.DataFrame( df.iloc.apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) df.columns = columns + df = df[columns[0]].append(pd.Series(missing)) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 3c04b57..6205b78 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -77,25 +77,23 @@ def generate(**args): df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) column = args['column'] if (isinstance(args['column'],list)) else [args['column']] - column_id = args['id'] + # column_id = args['id'] # #@TODO: # If the identifier is not present, we should fine a way to determine or make one # - # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values - bwrangler = Binary() - # args['label'] = bwrangler.Export(df[[column_id]]) _df = df.copy() for col in column : args['context'] = col args['column'] = col values = df[col].unique().tolist() - # values.sort() args['values'] = values + args['row_count'] = df.shape[0] # # we can determine the cardinalities here so we know what to allow or disallow handler = gan.Predict (**args) handler.load_meta(col) + # handler.ROW_COUNT = df[col].shape[0] r = handler.apply() # print (r) _df[col] = r[col] From 1db182a528ce89d9bf6215f285446499124e122f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Feb 2020 11:44:30 -0600 Subject: [PATCH 019/250] update version # --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a7e0642..aefd6d0 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.1.2","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.1.3","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} 
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 994e71160e54b039f87bcf456adfe6664452eb0e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Feb 2020 11:54:27 -0600 Subject: [PATCH 020/250] bug fix with directory --- data/gan.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/data/gan.py b/data/gan.py index 6e6454e..0d449d2 100644 --- a/data/gan.py +++ b/data/gan.py @@ -171,9 +171,10 @@ class GNet : root = [] for loc in path.split(os.sep) : root.append(loc) - os.mkdir(os.sep.join(root)) + if not os.path.exists(os.sep.join(root)) : + os.mkdir(os.sep.join(root)) - else: + elif not os.path.exists(path): os.mkdir(path) From 334915894a68e80981f45491ab0357cdcc26aa9e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Feb 2020 12:08:09 -0600 Subject: [PATCH 021/250] bug fix ... --- data/params.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/params.py b/data/params.py index 999b919..55b3109 100644 --- a/data/params.py +++ b/data/params.py @@ -1,6 +1,6 @@ import sys -SYS_ARGS = {'context':''} +# SYS_ARGS = {'context':''} if len(sys.argv) > 1: N = len(sys.argv) From b1a9a9fcb977867bf7c880be572f3b5ad6b27faa Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Feb 2020 12:09:04 -0600 Subject: [PATCH 022/250] - --- data/params.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/params.py b/data/params.py index 55b3109..c667063 100644 --- a/data/params.py +++ b/data/params.py @@ -1,6 +1,6 @@ import sys -# SYS_ARGS = {'context':''} +SYS_ARGS = {} if len(sys.argv) > 1: N = len(sys.argv) From 93176a2d09e3234b9937865bc3033214a40d0a23 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Feb 2020 12:23:40 -0600 Subject: [PATCH 023/250] bug fix: ambiguous thruth value of series --- data/gan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index 0d449d2..621cea9 100644 --- a/data/gan.py +++ b/data/gan.py @@ -609,7 +609,7 @@ class Predict(GNet): # let's get the missing rows (if any) ... 
# ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1) - if ii : + if ii.shape[0] == 0 : # #@TODO Have this be a configurable variable missing = np.repeat(0, np.where(ii==1)[0].size) From 94780281d076eb95a225353326d12515c036caee Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Feb 2020 12:25:03 -0600 Subject: [PATCH 024/250] bug fix: iloc index missing on generate samples --- data/gan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index 621cea9..382dc41 100644 --- a/data/gan.py +++ b/data/gan.py @@ -616,7 +616,7 @@ class Predict(GNet): else: missing = [] i = np.where(ii == 0)[0] - df = pd.DataFrame( df.iloc.apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) + df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) df.columns = columns df = df[columns[0]].append(pd.Series(missing)) From 0656474ca890c62f30cef940a0e513afa8006db3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Feb 2020 12:27:45 -0600 Subject: [PATCH 025/250] bug fix: data-frame should be returned (not series) --- data/gan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index 382dc41..f2f3cdb 100644 --- a/data/gan.py +++ b/data/gan.py @@ -624,7 +624,7 @@ class Predict(GNet): tf.compat.v1.reset_default_graph() - + df = pd.DataFrame(df) return df.to_dict(orient='list') # return df.to_dict(orient='list') # count = str(len(os.listdir(self.out_dir))) From 1cfd2059a472b59d6780e71d680c4ce7c4dbf0db Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Feb 2020 12:33:44 -0600 Subject: [PATCH 026/250] bug fix: generated sample structure --- data/gan.py | 1 + 1 file changed, 1 insertion(+) diff --git a/data/gan.py b/data/gan.py index f2f3cdb..d1b3123 100644 --- a/data/gan.py +++ b/data/gan.py @@ -625,6 +625,7 @@ class Predict(GNet): tf.compat.v1.reset_default_graph() df = pd.DataFrame(df) + df.columns = columns return df.to_dict(orient='list') # return df.to_dict(orient='list') # count = str(len(os.listdir(self.out_dir))) From a51be50a862ef3c93436f667dd13d133644671ac Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 26 Feb 2020 09:25:13 -0600 Subject: [PATCH 027/250] bug fix: missing values when generated --- data/gan.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index d1b3123..204f8af 100644 --- a/data/gan.py +++ b/data/gan.py @@ -594,6 +594,7 @@ class Predict(GNet): # # In case we are dealing with actual values like diagnosis codes we can perform # + INDEX = np.random.choice(np.arange(len(found)),1)[0] INDEX = ratio.index(np.max(ratio)) df = found[INDEX] @@ -609,7 +610,9 @@ class Predict(GNet): # let's get the missing rows (if any) ... 
# ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1) - if ii.shape[0] == 0 : + # print ([' **** ',ii.sum()]) + + if ii.shape[0] > 0 : # #@TODO Have this be a configurable variable missing = np.repeat(0, np.where(ii==1)[0].size) From 98a1062a3044b0fd662240540996808f80e73411 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 26 Feb 2020 09:32:29 -0600 Subject: [PATCH 028/250] bug fixes with missing values --- data/maker/__init__.py | 4 +- data/maker/__main__.py | 5 +- gan.py | 705 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 712 insertions(+), 2 deletions(-) create mode 100644 gan.py diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 6205b78..d5a4308 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -95,7 +95,9 @@ def generate(**args): handler.load_meta(col) # handler.ROW_COUNT = df[col].shape[0] r = handler.apply() - # print (r) + # print (r) + # + print ([_df.shape,len(r[col])]) _df[col] = r[col] # break return _df \ No newline at end of file diff --git a/data/maker/__main__.py b/data/maker/__main__.py index 63b464b..583be60 100644 --- a/data/maker/__main__.py +++ b/data/maker/__main__.py @@ -12,11 +12,14 @@ if 'config' in SYS_ARGS : else: # # + ARGS['no_value'] = '' _df = data.maker.generate(**ARGS) odf = pd.read_csv (ARGS['data']) odf.columns = [name.lower() for name in odf.columns] column = ARGS['column'] if isinstance(ARGS['column'],list) else [ARGS['column']] - print(pd.merge(odf,_df, on='id')) + print (odf.head()) + print (_df.head()) + # print(pd.merge(odf,_df,rsuffix='_io')) # print (_df[column].risk.evaluate(flag='synth')) # print (odf[column].risk.evaluate(flag='original')) # _x = pd.get_dummies(_df[column]).values diff --git a/gan.py b/gan.py new file mode 100644 index 0000000..2e4d503 --- /dev/null +++ b/gan.py @@ -0,0 +1,705 @@ +""" +This code was originally writen by Ziqi Zhang in order to generate synthetic data. +The code is an implementation of a Generative Adversarial Network that uses the Wasserstein Distance (WGAN). +It is intended to be used in 2 modes (embedded in code or using CLI) + +USAGE : + +The following parameters should be provided in a configuration file (JSON format) +python data/maker --config + +CONFIGURATION FILE STRUCTURE : + + context what it is you are loading (stroke, hypertension, ...) + data path of the file to be loaded + logs folder to store training model and meta data about learning + max_epochs number of iterations in learning + num_gpu number of gpus to be used (will still run if the GPUs are not available) + +EMBEDDED IN CODE : + +""" +import tensorflow as tf +from tensorflow.contrib.layers import l2_regularizer +import numpy as np +import pandas as pd +import time +import os +import sys +from data.params import SYS_ARGS +from data.bridge import Binary +import json +import pickle + +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ['CUDA_VISIBLE_DEVICES'] = "0" +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' + +# STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256 +# NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu']) +# BATCHSIZE_PER_GPU = 2000 +# TOTAL_BATCHSIZE = BATCHSIZE_PER_GPU * NUM_GPUS + +class void : + pass +class GNet : + def log(self,**args): + self.logs = dict(args,**self.logs) + + + """ + This is the base class of a generative network functions, the details will be implemented in the subclasses. 
+ An instance of this class is accessed as follows + object.layers.normalize applies batch normalization or otherwise + obect.get.variables instanciate variables on cpu and return a reference (tensor) + """ + def __init__(self,**args): + self.layers = void() + self.layers.normalize = self.normalize + self.logs = {} + + self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] + # if self.NUM_GPUS > 1 : + # os.environ['CUDA_VISIBLE_DEVICES'] = "4" + + self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854 + self.G_STRUCTURE = [128,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE] + self.D_STRUCTURE = [self.X_SPACE_SIZE,256,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE*2, self.X_SPACE_SIZE] #-- change 854 to number of diagnosis + # self.NUM_LABELS = 8 if 'label' not in args elif len(args['label'].shape) args['label'].shape[1] + + if 'label' in args and len(args['label'].shape) == 2 : + self.NUM_LABELS = args['label'].shape[1] + elif 'label' in args and len(args['label']) == 1 : + self.NUM_LABELS = args['label'].shape[0] + else: + self.NUM_LABELS = None + # self.Z_DIM = 128 #self.X_SPACE_SIZE + self.Z_DIM = 128 #-- used as rows down stream + self.G_STRUCTURE = [self.Z_DIM,self.Z_DIM] + PROPOSED_BATCH_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size']) + self.BATCHSIZE_PER_GPU = PROPOSED_BATCH_PER_GPU + if 'real' in args : + self.D_STRUCTURE = [args['real'].shape[1],256,self.Z_DIM] + + if args['real'].shape[0] < PROPOSED_BATCH_PER_GPU : + self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) + # self.BATCHSIZE_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size']) + self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS + self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) + self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) + self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100 + self.CONTEXT = args['context'] + self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} + self._REAL = args['real'] if 'real' in args else None + self._LABEL = args['label'] if 'label' in args else None + + self.get = void() + self.get.variables = self._variable_on_cpu + self.get.suffix = lambda : "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + self.logger = args['logger'] if 'logger' in args and args['logger'] else None + self.init_logs(**args) + + def init_logs(self,**args): + self.log_dir = args['logs'] if 'logs' in args else 'logs' + self.mkdir(self.log_dir) + # + # + for key in ['train','output'] : + self.mkdir(os.sep.join([self.log_dir,key])) + self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT])) + + self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) + self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + if self.logger : + # + # We will clear the logs from the data-store + # + column = self.ATTRIBUTES['synthetic'] + db = self.logger.db + if db[column].count() > 0 : + db.backup.insert({'name':column,'logs':list(db[column].find()) }) + db[column].drop() + + def load_meta(self,column): + """ + This function is designed to accomodate the uses of the sub-classes outside of a strict dependency model. 
+ Because prediction and training can happen independently + """ + # suffix = "-".join(column) if isinstance(column,list)else column + suffix = self.get.suffix() + _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json']) + if os.path.exists(_name) : + attr = json.loads((open(_name)).read()) + for key in attr : + value = attr[key] + setattr(self,key,value) + self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) + self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + + + def log_meta(self,**args) : + + _object = { + # '_id':'meta', + 'CONTEXT':self.CONTEXT, + 'ATTRIBUTES':self.ATTRIBUTES, + 'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU, + 'Z_DIM':self.Z_DIM, + "X_SPACE_SIZE":self.X_SPACE_SIZE, + "D_STRUCTURE":self.D_STRUCTURE, + "G_STRUCTURE":self.G_STRUCTURE, + "NUM_GPUS":self.NUM_GPUS, + "NUM_LABELS":self.NUM_LABELS, + "MAX_EPOCHS":self.MAX_EPOCHS, + "ROW_COUNT":self.ROW_COUNT + } + if args and 'key' in args and 'value' in args : + key = args['key'] + value= args['value'] + object[key] = value + # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column + suffix = self.get.suffix() + _name = os.sep.join([self.out_dir,'meta-'+suffix]) + + f = open(_name+'.json','w') + f.write(json.dumps(_object)) + return _object + def mkdir (self,path): + if not os.path.exists(path) : + os.mkdir(path) + + + def normalize(self,**args): + """ + This function will perform a batch normalization on an network layer + inputs input layer of the neural network + name name of the scope the + labels labels (attributes not synthesized) by default None + n_labels number of labels default None + """ + inputs = args['inputs'] + name = args['name'] + labels = None if 'labels' not in args else args['labels'] + n_labels= None if 'n_labels' not in args else args['n_labels'] + shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing + mean, var = tf.nn.moments(inputs, shift, keep_dims=True) + shape = inputs.shape[1].value + if labels is not None: + offset_m = self.get.variables(shape=[1,shape], name='offset'+name, + initializer=tf.zeros_initializer) + scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, + initializer=tf.ones_initializer) + offset = tf.nn.embedding_lookup(offset_m, labels) + scale = tf.nn.embedding_lookup(scale_m, labels) + + else: + offset = None + scale = None + + result = tf.nn.batch_normalization(inputs, mean, var,offset,scale, 1e-8) + return result + + def _variable_on_cpu(self,**args): + """ + This function makes sure variables/tensors are not created on the GPU but rather on the CPU + """ + + name = args['name'] + shape = args['shape'] + initializer=None if 'initializer' not in args else args['initializer'] + with tf.device('/cpu:0') : + cpu_var = tf.compat.v1.get_variable(name,shape,initializer= initializer) + return cpu_var + def average_gradients(self,tower_grads): + average_grads = [] + for grad_and_vars in zip(*tower_grads): + grads = [] + for g, _ in grad_and_vars: + expanded_g = tf.expand_dims(g, 0) + grads.append(expanded_g) + + grad = tf.concat(axis=0, values=grads) + grad = tf.reduce_mean(grad, 0) + + v = grad_and_vars[0][1] + grad_and_var = (grad, v) + average_grads.append(grad_and_var) + return average_grads + + +class Generator (GNet): + """ + This class is designed to handle generation of candidate datasets for this it will aggregate a discriminator, this allows the generator not to be random + + """ + def __init__(self,**args): + GNet.__init__(self,**args) + self.discriminator 
= Discriminator(**args) + def loss(self,**args): + fake = args['fake'] + label = args['label'] + y_hat_fake = self.discriminator.network(inputs=fake, label=label) + #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) + loss = -tf.reduce_mean(y_hat_fake) + sum(all_regs) + #tf.add_to_collection('glosses', loss) + tf.compat.v1.add_to_collection('glosses', loss) + return loss, loss + def load_meta(self, column): + super().load_meta(column) + self.discriminator.load_meta(column) + def network(self,**args) : + """ + This function will build the network that will generate the synthetic candidates + :inputs matrix of data that we need + :dim dimensions of ... + """ + x = args['inputs'] + tmp_dim = self.Z_DIM if 'dim' not in args else args['dim'] + label = args['label'] + + with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): + for i, dim in enumerate(self.G_STRUCTURE[:-1]): + kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim]) + h1 = self.normalize(inputs=tf.matmul(x, kernel),shift=0, name='cbn' + str(i), labels=label, n_labels=self.NUM_LABELS) + h2 = tf.nn.relu(h1) + x = x + h2 + tmp_dim = dim + i = len(self.G_STRUCTURE) - 1 + # + # This seems to be an extra hidden layer: + # It's goal is to map continuous values to discrete values (pre-trained to do this) + kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, self.G_STRUCTURE[-1]]) + h1 = self.normalize(inputs=tf.matmul(x, kernel), name='cbn' + str(i), + labels=label, n_labels=self.NUM_LABELS) + h2 = tf.nn.tanh(h1) + x = x + h2 + # This seems to be the output layer + # + kernel = self.get.variables(name='W_' + str(i+1), shape=[self.Z_DIM, self.X_SPACE_SIZE]) + bias = self.get.variables(name='b_' + str(i+1), shape=[self.X_SPACE_SIZE]) + x = tf.nn.sigmoid(tf.add(tf.matmul(x, kernel), bias)) + return x + +class Discriminator(GNet): + def __init__(self,**args): + GNet.__init__(self,**args) + def network(self,**args): + """ + This function will apply a computational graph on a dataset passed in with the associated labels and the last layer must have a single output (neuron) + :inputs + :label + """ + x = args['inputs'] + label = args['label'] + with tf.compat.v1.variable_scope('D', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): + for i, dim in enumerate(self.D_STRUCTURE[1:]): + kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[i], dim]) + bias = self.get.variables(name='b_' + str(i), shape=[dim]) + # print (["\t",bias,kernel]) + x = tf.nn.relu(tf.add(tf.matmul(x, kernel), bias)) + x = self.normalize(inputs=x, name='cln' + str(i), shift=1,labels=label, n_labels=self.NUM_LABELS) + i = len(self.D_STRUCTURE) + kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[-1], 1]) + bias = self.get.variables(name='b_' + str(i), shape=[1]) + y = tf.add(tf.matmul(x, kernel), bias) + return y + + def loss(self,**args) : + """ + This function compute the loss of + :real + :fake + :label + """ + real = args['real'] + fake = args['fake'] + label = args['label'] + epsilon = tf.random.uniform(shape=[self.BATCHSIZE_PER_GPU,1],minval=0,maxval=1) + + x_hat = real + epsilon * (fake - real) + y_hat_fake = self.network(inputs=fake, label=label) + + y_hat_real = self.network(inputs=real, label=label) + y_hat = self.network(inputs=x_hat, label=label) + + grad = tf.gradients(y_hat, [x_hat])[0] + slopes = 
tf.sqrt(tf.reduce_sum(tf.square(grad), 1)) + gradient_penalty = tf.reduce_mean((slopes - 1.) ** 2) + #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) + w_distance = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake) + loss = w_distance + 10 * gradient_penalty + sum(all_regs) + #tf.add_to_collection('dlosses', loss) + tf.compat.v1.add_to_collection('dlosses', loss) + + return w_distance, loss +class Train (GNet): + def __init__(self,**args): + GNet.__init__(self,**args) + self.generator = Generator(**args) + self.discriminator = Discriminator(**args) + self._REAL = args['real'] + self._LABEL= args['label'] if 'label' in args else None + self.column = args['column'] + # print ([" *** ",self.BATCHSIZE_PER_GPU]) + + self.meta = self.log_meta() + if(self.logger): + + self.logger.write( self.meta ) + + # self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta) + def load_meta(self, column): + """ + This function will delegate the calls to load meta data to it's dependents + column name + """ + super().load_meta(column) + self.generator.load_meta(column) + self.discriminator.load_meta(column) + def loss(self,**args): + """ + This function will compute a "tower" loss of the generated candidate against real data + Training will consist in having both generator and discriminators + :scope + :stage + :real + :label + """ + + scope = args['scope'] + stage = args['stage'] + real = args['real'] + label = args['label'] + + + if label is not None : + label = tf.cast(label, tf.int32) + # + # @TODO: Ziqi needs to explain what's going on here + m = [[i] for i in np.arange(self._LABEL.shape[1]-2)] + label = label[:, 1] * len(m) + tf.squeeze( + tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32)) + ) + # label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] ) + z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) + + fake = self.generator.network(inputs=z, label=label) + if stage == 'D': + w, loss = self.discriminator.loss(real=real, fake=fake, label=label) + #losses = tf.get_collection('dlosses', scope) + flag = 'dlosses' + losses = tf.compat.v1.get_collection('dlosses', scope) + else: + w, loss = self.generator.loss(fake=fake, label=label) + #losses = tf.get_collection('glosses', scope) + flag = 'glosses' + losses = tf.compat.v1.get_collection('glosses', scope) + # losses = tf.compat.v1.get_collection(flag, scope) + + total_loss = tf.add_n(losses, name='total_loss') + + return total_loss, w + def input_fn(self): + """ + This function seems to produce + """ + features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32) + LABEL_SHAPE = [None,None] if self._LABEL is None else self._LABEL.shape + labels_placeholder = tf.compat.v1.placeholder(shape=LABEL_SHAPE, dtype=tf.float32) + if self._LABEL is not None : + dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) + else : + dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) + # labels_placeholder = None + dataset = dataset.repeat(10000) + dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) + dataset = dataset.prefetch(1) + # iterator = dataset.make_initializable_iterator() + iterator = tf.compat.v1.data.make_initializable_iterator(dataset) + return iterator, features_placeholder, labels_placeholder + + def network(self,**args): + stage = args['stage'] + opt = args['opt'] + tower_grads = [] + per_gpu_w = 
[] + iterator, features_placeholder, labels_placeholder = self.input_fn() + with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()): + for i in range(self.NUM_GPUS): + with tf.device('/gpu:%d' % i): + with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: + if self._LABEL is not None : + (real, label) = iterator.get_next() + else: + real = iterator.get_next() + label= None + loss, w = self.loss(scope=scope, stage=stage, real=real, label=label) + #tf.get_variable_scope().reuse_variables() + tf.compat.v1.get_variable_scope().reuse_variables() + #vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage) + vars_ = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=stage) + grads = opt.compute_gradients(loss, vars_) + tower_grads.append(grads) + per_gpu_w.append(w) + + grads = self.average_gradients(tower_grads) + apply_gradient_op = opt.apply_gradients(grads) + + mean_w = tf.reduce_mean(per_gpu_w) + train_op = apply_gradient_op + return train_op, mean_w, iterator, features_placeholder, labels_placeholder + def apply(self,**args): + # max_epochs = args['max_epochs'] if 'max_epochs' in args else 10 + REAL = self._REAL + LABEL= self._LABEL + if (self.logger): + pass + + with tf.device('/cpu:0'): + opt_d = tf.compat.v1.train.AdamOptimizer(1e-4) + opt_g = tf.compat.v1.train.AdamOptimizer(1e-4) + + train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d) + train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g) + # saver = tf.train.Saver() + saver = tf.compat.v1.train.Saver() + # init = tf.global_variables_initializer() + init = tf.compat.v1.global_variables_initializer() + logs = [] + #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: + with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: + + sess.run(init) + + sess.run(iterator_d.initializer, + feed_dict={features_placeholder_d: REAL}) + sess.run(iterator_g.initializer, + feed_dict={features_placeholder_g: REAL}) + + for epoch in range(1, self.MAX_EPOCHS + 1): + start_time = time.time() + w_sum = 0 + for i in range(self.STEPS_PER_EPOCH): + for _ in range(2): + _, w = sess.run([train_d, w_distance]) + w_sum += w + sess.run(train_g) + duration = time.time() - start_time + + assert not np.isnan(w_sum), 'Model diverged with loss = NaN' + + format_str = 'epoch: %d, w_distance = %f (%.1f)' + print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) + # print (dir (w_distance)) + + logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) + + if epoch % self.MAX_EPOCHS == 0: + # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + suffix = self.get.suffix() + _name = os.sep.join([self.train_dir,suffix]) + # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) + saver.save(sess, _name, write_meta_graph=False, global_step=epoch) + # + # + if self.logger : + row = {"logs":logs} #,"model":pickle.dump(sess)} + self.logger.write(row) + # + # @TODO: + # We should upload the files in the checkpoint + # This would allow the learnt model to be portable to another system + # + tf.compat.v1.reset_default_graph() + +class Predict(GNet): + """ + This class uses synthetic data given a learned model + """ + def __init__(self,**args): + GNet.__init__(self,**args) + 
self.generator = Generator(**args) + self.values = args['values'] + def load_meta(self, column): + super().load_meta(column) + self.generator.load_meta(column) + def apply(self,**args): + # print (self.train_dir) + # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + suffix = self.get.suffix() + model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) + demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] + tf.compat.v1.reset_default_graph() + #z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) + z = tf.random.normal(shape=[self._REAL.shape[0], self.Z_DIM]) + y = tf.compat.v1.placeholder(shape=[self._REAL.shape[0], self.NUM_LABELS], dtype=tf.int32) + #y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32) + if self._LABEL is not None : + ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] + label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) + else: + label = None + fake = self.generator.network(inputs=z, label=label) + init = tf.compat.v1.global_variables_initializer() + saver = tf.compat.v1.train.Saver() + df = pd.DataFrame() + CANDIDATE_COUNT = 10000 + NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0] + with tf.compat.v1.Session() as sess: + + # sess.run(init) + saver.restore(sess, model_dir) + if self._LABEL is not None : + labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) + labels= demo + else: + labels = None + + found = [] + + for i in np.arange(CANDIDATE_COUNT) : + if labels : + f = sess.run(fake,feed_dict={y:labels}) + else: + f = sess.run(fake) + # + # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes + # The code below will insure we have some acceptable cardinal relationships between id and synthetic values + # + df = ( pd.DataFrame(np.round(f).astype(np.int32))) + p = 0 not in df.sum(axis=1).values + x = df.sum(axis=1).values + if np.divide( np.sum(x), x.size) > .9: + found.append(df) + if len(found) == NTH_VALID_CANDIDATE or i == CANDIDATE_COUNT: + break + else: + continue + + # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms + # df = (i * df).sum(axis=1) + # + # In case we are dealing with actual values like diagnosis codes we can perform + # + INDEX =np.random.choice(np.arange(len(found)),1)[0] + #df = found[np.random.choice(np.arange(len(found)),1)[0]] + df = found[INDEX] + columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] + + # r = np.zeros((self.ROW_COUNT,len(columns))) + r = np.zeros(self.ROW_COUNT) + df.columns = self.values + if len(found): + print (len(found),NTH_VALID_CANDIDATE) + # x = df * self.values + # + # let's get the rows with no values synthesized (for whatever reason) + # + ii = df.apply(lambda row: np.sum(row) == 0,axis=1) + if np.sum(ii) > 0 : + missing = np.repeat(np.nan, np.where(ii==1)[0].size) + else: + missing = [] + print (len (missing), df.shape) + i = np.where(ii == 0)[0] + df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row == 1)[0],1)[0]] ,axis=1)) + df.columns = columns + df = df[columns[0]].append(pd.Series(missing)) + + + + + + tf.compat.v1.reset_default_graph() + df = pd.DataFrame(df) + df.columns = columns + print (df.head()) + print (df.shape) + return df.to_dict(orient='list') + # return 
df.to_dict(orient='list') + # count = str(len(os.listdir(self.out_dir))) + # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv']) + # df.to_csv(_name,index=False) + + + # output.extend(np.round(f)) + + # for m in range(2): + # for n in range(2, self.NUM_LABELS): + # idx1 = (demo[:, m] == 1) + # idx2 = (demo[:, n] == 1) + # idx = [idx1[j] and idx2[j] for j in range(len(idx1))] + # num = np.sum(idx) + # print ("___________________list__") + # print (idx1) + # print (idx2) + # print (idx) + # print (num) + # print ("_____________________") + # nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU)) + # label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS)) + # label_input[:, n] = 1 + # label_input[:, m] = 1 + # output = [] + # for i in range(nbatch): + # f = sess.run(fake,feed_dict={y: label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]}) + # output.extend(np.round(f)) + # output = np.array(output)[:num] + # print ([m,n,output]) + + # np.save(self.out_dir + str(m) + str(n), output) + + +if __name__ == '__main__' : + # + # Now we get things done ... + column = SYS_ARGS['column'] + column_id = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id' + column_id = column_id.split(',') if ',' in column_id else column_id + df = pd.read_csv(SYS_ARGS['raw-data']) + LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values + + context = SYS_ARGS['raw-data'].split(os.sep)[-1:][0][:-4] + if set(['train','learn']) & set(SYS_ARGS.keys()): + + df = pd.read_csv(SYS_ARGS['raw-data']) + + # cols = SYS_ARGS['column'] + # _map,_df = (Binary()).Export(df) + # i = np.arange(_map[column]['start'],_map[column]['end']) + max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10 + # REAL = _df[:,i] + REAL = pd.get_dummies(df[column]).astype(np.float32).values + LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values + trainer = Train(context=context,max_epochs=max_epochs,real=REAL,label=LABEL,column=column,column_id=column_id) + trainer.apply() + + + + + # + # We should train upon this data + # + # -- we need to convert the data-frame to binary matrix, given a column + # + pass + elif 'generate' in SYS_ARGS: + values = df[column].unique().tolist() + values.sort() + + p = Predict(context=context,label=LABEL,values=values,column=column) + p.load_meta(column) + r = p.apply() + print (df) + print () + df[column] = r[column] + print (df) + + + else: + print (SYS_ARGS.keys()) + print (__doc__) + pass + From d5a343da8401f9a6873c2dbcd0ef0428f7bcc1b3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 26 Feb 2020 09:33:35 -0600 Subject: [PATCH 029/250] house keeping work --- gan.py | 705 --------------------------------------------------------- 1 file changed, 705 deletions(-) delete mode 100644 gan.py diff --git a/gan.py b/gan.py deleted file mode 100644 index 2e4d503..0000000 --- a/gan.py +++ /dev/null @@ -1,705 +0,0 @@ -""" -This code was originally writen by Ziqi Zhang in order to generate synthetic data. -The code is an implementation of a Generative Adversarial Network that uses the Wasserstein Distance (WGAN). -It is intended to be used in 2 modes (embedded in code or using CLI) - -USAGE : - -The following parameters should be provided in a configuration file (JSON format) -python data/maker --config - -CONFIGURATION FILE STRUCTURE : - - context what it is you are loading (stroke, hypertension, ...) 
- data path of the file to be loaded - logs folder to store training model and meta data about learning - max_epochs number of iterations in learning - num_gpu number of gpus to be used (will still run if the GPUs are not available) - -EMBEDDED IN CODE : - -""" -import tensorflow as tf -from tensorflow.contrib.layers import l2_regularizer -import numpy as np -import pandas as pd -import time -import os -import sys -from data.params import SYS_ARGS -from data.bridge import Binary -import json -import pickle - -os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" -os.environ['CUDA_VISIBLE_DEVICES'] = "0" -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' - -# STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256 -# NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu']) -# BATCHSIZE_PER_GPU = 2000 -# TOTAL_BATCHSIZE = BATCHSIZE_PER_GPU * NUM_GPUS - -class void : - pass -class GNet : - def log(self,**args): - self.logs = dict(args,**self.logs) - - - """ - This is the base class of a generative network functions, the details will be implemented in the subclasses. - An instance of this class is accessed as follows - object.layers.normalize applies batch normalization or otherwise - obect.get.variables instanciate variables on cpu and return a reference (tensor) - """ - def __init__(self,**args): - self.layers = void() - self.layers.normalize = self.normalize - self.logs = {} - - self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] - # if self.NUM_GPUS > 1 : - # os.environ['CUDA_VISIBLE_DEVICES'] = "4" - - self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854 - self.G_STRUCTURE = [128,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE] - self.D_STRUCTURE = [self.X_SPACE_SIZE,256,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE*2, self.X_SPACE_SIZE] #-- change 854 to number of diagnosis - # self.NUM_LABELS = 8 if 'label' not in args elif len(args['label'].shape) args['label'].shape[1] - - if 'label' in args and len(args['label'].shape) == 2 : - self.NUM_LABELS = args['label'].shape[1] - elif 'label' in args and len(args['label']) == 1 : - self.NUM_LABELS = args['label'].shape[0] - else: - self.NUM_LABELS = None - # self.Z_DIM = 128 #self.X_SPACE_SIZE - self.Z_DIM = 128 #-- used as rows down stream - self.G_STRUCTURE = [self.Z_DIM,self.Z_DIM] - PROPOSED_BATCH_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size']) - self.BATCHSIZE_PER_GPU = PROPOSED_BATCH_PER_GPU - if 'real' in args : - self.D_STRUCTURE = [args['real'].shape[1],256,self.Z_DIM] - - if args['real'].shape[0] < PROPOSED_BATCH_PER_GPU : - self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) - # self.BATCHSIZE_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size']) - self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS - self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) - self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) - self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100 - self.CONTEXT = args['context'] - self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} - self._REAL = args['real'] if 'real' in args else None - self._LABEL = args['label'] if 'label' in args else None - - self.get = void() - self.get.variables = self._variable_on_cpu - self.get.suffix = lambda : "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - self.logger = args['logger'] 
if 'logger' in args and args['logger'] else None - self.init_logs(**args) - - def init_logs(self,**args): - self.log_dir = args['logs'] if 'logs' in args else 'logs' - self.mkdir(self.log_dir) - # - # - for key in ['train','output'] : - self.mkdir(os.sep.join([self.log_dir,key])) - self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT])) - - self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) - self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) - if self.logger : - # - # We will clear the logs from the data-store - # - column = self.ATTRIBUTES['synthetic'] - db = self.logger.db - if db[column].count() > 0 : - db.backup.insert({'name':column,'logs':list(db[column].find()) }) - db[column].drop() - - def load_meta(self,column): - """ - This function is designed to accomodate the uses of the sub-classes outside of a strict dependency model. - Because prediction and training can happen independently - """ - # suffix = "-".join(column) if isinstance(column,list)else column - suffix = self.get.suffix() - _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json']) - if os.path.exists(_name) : - attr = json.loads((open(_name)).read()) - for key in attr : - value = attr[key] - setattr(self,key,value) - self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) - self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) - - - def log_meta(self,**args) : - - _object = { - # '_id':'meta', - 'CONTEXT':self.CONTEXT, - 'ATTRIBUTES':self.ATTRIBUTES, - 'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU, - 'Z_DIM':self.Z_DIM, - "X_SPACE_SIZE":self.X_SPACE_SIZE, - "D_STRUCTURE":self.D_STRUCTURE, - "G_STRUCTURE":self.G_STRUCTURE, - "NUM_GPUS":self.NUM_GPUS, - "NUM_LABELS":self.NUM_LABELS, - "MAX_EPOCHS":self.MAX_EPOCHS, - "ROW_COUNT":self.ROW_COUNT - } - if args and 'key' in args and 'value' in args : - key = args['key'] - value= args['value'] - object[key] = value - # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column - suffix = self.get.suffix() - _name = os.sep.join([self.out_dir,'meta-'+suffix]) - - f = open(_name+'.json','w') - f.write(json.dumps(_object)) - return _object - def mkdir (self,path): - if not os.path.exists(path) : - os.mkdir(path) - - - def normalize(self,**args): - """ - This function will perform a batch normalization on an network layer - inputs input layer of the neural network - name name of the scope the - labels labels (attributes not synthesized) by default None - n_labels number of labels default None - """ - inputs = args['inputs'] - name = args['name'] - labels = None if 'labels' not in args else args['labels'] - n_labels= None if 'n_labels' not in args else args['n_labels'] - shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing - mean, var = tf.nn.moments(inputs, shift, keep_dims=True) - shape = inputs.shape[1].value - if labels is not None: - offset_m = self.get.variables(shape=[1,shape], name='offset'+name, - initializer=tf.zeros_initializer) - scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, - initializer=tf.ones_initializer) - offset = tf.nn.embedding_lookup(offset_m, labels) - scale = tf.nn.embedding_lookup(scale_m, labels) - - else: - offset = None - scale = None - - result = tf.nn.batch_normalization(inputs, mean, var,offset,scale, 1e-8) - return result - - def _variable_on_cpu(self,**args): - """ - This function makes sure variables/tensors are not created on the GPU but rather on the CPU - """ - - name = args['name'] - shape = 
args['shape'] - initializer=None if 'initializer' not in args else args['initializer'] - with tf.device('/cpu:0') : - cpu_var = tf.compat.v1.get_variable(name,shape,initializer= initializer) - return cpu_var - def average_gradients(self,tower_grads): - average_grads = [] - for grad_and_vars in zip(*tower_grads): - grads = [] - for g, _ in grad_and_vars: - expanded_g = tf.expand_dims(g, 0) - grads.append(expanded_g) - - grad = tf.concat(axis=0, values=grads) - grad = tf.reduce_mean(grad, 0) - - v = grad_and_vars[0][1] - grad_and_var = (grad, v) - average_grads.append(grad_and_var) - return average_grads - - -class Generator (GNet): - """ - This class is designed to handle generation of candidate datasets for this it will aggregate a discriminator, this allows the generator not to be random - - """ - def __init__(self,**args): - GNet.__init__(self,**args) - self.discriminator = Discriminator(**args) - def loss(self,**args): - fake = args['fake'] - label = args['label'] - y_hat_fake = self.discriminator.network(inputs=fake, label=label) - #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - loss = -tf.reduce_mean(y_hat_fake) + sum(all_regs) - #tf.add_to_collection('glosses', loss) - tf.compat.v1.add_to_collection('glosses', loss) - return loss, loss - def load_meta(self, column): - super().load_meta(column) - self.discriminator.load_meta(column) - def network(self,**args) : - """ - This function will build the network that will generate the synthetic candidates - :inputs matrix of data that we need - :dim dimensions of ... - """ - x = args['inputs'] - tmp_dim = self.Z_DIM if 'dim' not in args else args['dim'] - label = args['label'] - - with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): - for i, dim in enumerate(self.G_STRUCTURE[:-1]): - kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim]) - h1 = self.normalize(inputs=tf.matmul(x, kernel),shift=0, name='cbn' + str(i), labels=label, n_labels=self.NUM_LABELS) - h2 = tf.nn.relu(h1) - x = x + h2 - tmp_dim = dim - i = len(self.G_STRUCTURE) - 1 - # - # This seems to be an extra hidden layer: - # It's goal is to map continuous values to discrete values (pre-trained to do this) - kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, self.G_STRUCTURE[-1]]) - h1 = self.normalize(inputs=tf.matmul(x, kernel), name='cbn' + str(i), - labels=label, n_labels=self.NUM_LABELS) - h2 = tf.nn.tanh(h1) - x = x + h2 - # This seems to be the output layer - # - kernel = self.get.variables(name='W_' + str(i+1), shape=[self.Z_DIM, self.X_SPACE_SIZE]) - bias = self.get.variables(name='b_' + str(i+1), shape=[self.X_SPACE_SIZE]) - x = tf.nn.sigmoid(tf.add(tf.matmul(x, kernel), bias)) - return x - -class Discriminator(GNet): - def __init__(self,**args): - GNet.__init__(self,**args) - def network(self,**args): - """ - This function will apply a computational graph on a dataset passed in with the associated labels and the last layer must have a single output (neuron) - :inputs - :label - """ - x = args['inputs'] - label = args['label'] - with tf.compat.v1.variable_scope('D', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): - for i, dim in enumerate(self.D_STRUCTURE[1:]): - kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[i], dim]) - bias = self.get.variables(name='b_' + str(i), shape=[dim]) - # print (["\t",bias,kernel]) - x = 
tf.nn.relu(tf.add(tf.matmul(x, kernel), bias)) - x = self.normalize(inputs=x, name='cln' + str(i), shift=1,labels=label, n_labels=self.NUM_LABELS) - i = len(self.D_STRUCTURE) - kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[-1], 1]) - bias = self.get.variables(name='b_' + str(i), shape=[1]) - y = tf.add(tf.matmul(x, kernel), bias) - return y - - def loss(self,**args) : - """ - This function compute the loss of - :real - :fake - :label - """ - real = args['real'] - fake = args['fake'] - label = args['label'] - epsilon = tf.random.uniform(shape=[self.BATCHSIZE_PER_GPU,1],minval=0,maxval=1) - - x_hat = real + epsilon * (fake - real) - y_hat_fake = self.network(inputs=fake, label=label) - - y_hat_real = self.network(inputs=real, label=label) - y_hat = self.network(inputs=x_hat, label=label) - - grad = tf.gradients(y_hat, [x_hat])[0] - slopes = tf.sqrt(tf.reduce_sum(tf.square(grad), 1)) - gradient_penalty = tf.reduce_mean((slopes - 1.) ** 2) - #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - w_distance = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake) - loss = w_distance + 10 * gradient_penalty + sum(all_regs) - #tf.add_to_collection('dlosses', loss) - tf.compat.v1.add_to_collection('dlosses', loss) - - return w_distance, loss -class Train (GNet): - def __init__(self,**args): - GNet.__init__(self,**args) - self.generator = Generator(**args) - self.discriminator = Discriminator(**args) - self._REAL = args['real'] - self._LABEL= args['label'] if 'label' in args else None - self.column = args['column'] - # print ([" *** ",self.BATCHSIZE_PER_GPU]) - - self.meta = self.log_meta() - if(self.logger): - - self.logger.write( self.meta ) - - # self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta) - def load_meta(self, column): - """ - This function will delegate the calls to load meta data to it's dependents - column name - """ - super().load_meta(column) - self.generator.load_meta(column) - self.discriminator.load_meta(column) - def loss(self,**args): - """ - This function will compute a "tower" loss of the generated candidate against real data - Training will consist in having both generator and discriminators - :scope - :stage - :real - :label - """ - - scope = args['scope'] - stage = args['stage'] - real = args['real'] - label = args['label'] - - - if label is not None : - label = tf.cast(label, tf.int32) - # - # @TODO: Ziqi needs to explain what's going on here - m = [[i] for i in np.arange(self._LABEL.shape[1]-2)] - label = label[:, 1] * len(m) + tf.squeeze( - tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32)) - ) - # label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] ) - z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) - - fake = self.generator.network(inputs=z, label=label) - if stage == 'D': - w, loss = self.discriminator.loss(real=real, fake=fake, label=label) - #losses = tf.get_collection('dlosses', scope) - flag = 'dlosses' - losses = tf.compat.v1.get_collection('dlosses', scope) - else: - w, loss = self.generator.loss(fake=fake, label=label) - #losses = tf.get_collection('glosses', scope) - flag = 'glosses' - losses = tf.compat.v1.get_collection('glosses', scope) - # losses = tf.compat.v1.get_collection(flag, scope) - - total_loss = tf.add_n(losses, name='total_loss') - - return total_loss, w - def input_fn(self): - """ - This function seems to produce - """ - 
features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32) - LABEL_SHAPE = [None,None] if self._LABEL is None else self._LABEL.shape - labels_placeholder = tf.compat.v1.placeholder(shape=LABEL_SHAPE, dtype=tf.float32) - if self._LABEL is not None : - dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) - else : - dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) - # labels_placeholder = None - dataset = dataset.repeat(10000) - dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) - dataset = dataset.prefetch(1) - # iterator = dataset.make_initializable_iterator() - iterator = tf.compat.v1.data.make_initializable_iterator(dataset) - return iterator, features_placeholder, labels_placeholder - - def network(self,**args): - stage = args['stage'] - opt = args['opt'] - tower_grads = [] - per_gpu_w = [] - iterator, features_placeholder, labels_placeholder = self.input_fn() - with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()): - for i in range(self.NUM_GPUS): - with tf.device('/gpu:%d' % i): - with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: - if self._LABEL is not None : - (real, label) = iterator.get_next() - else: - real = iterator.get_next() - label= None - loss, w = self.loss(scope=scope, stage=stage, real=real, label=label) - #tf.get_variable_scope().reuse_variables() - tf.compat.v1.get_variable_scope().reuse_variables() - #vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage) - vars_ = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=stage) - grads = opt.compute_gradients(loss, vars_) - tower_grads.append(grads) - per_gpu_w.append(w) - - grads = self.average_gradients(tower_grads) - apply_gradient_op = opt.apply_gradients(grads) - - mean_w = tf.reduce_mean(per_gpu_w) - train_op = apply_gradient_op - return train_op, mean_w, iterator, features_placeholder, labels_placeholder - def apply(self,**args): - # max_epochs = args['max_epochs'] if 'max_epochs' in args else 10 - REAL = self._REAL - LABEL= self._LABEL - if (self.logger): - pass - - with tf.device('/cpu:0'): - opt_d = tf.compat.v1.train.AdamOptimizer(1e-4) - opt_g = tf.compat.v1.train.AdamOptimizer(1e-4) - - train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d) - train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g) - # saver = tf.train.Saver() - saver = tf.compat.v1.train.Saver() - # init = tf.global_variables_initializer() - init = tf.compat.v1.global_variables_initializer() - logs = [] - #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: - with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: - - sess.run(init) - - sess.run(iterator_d.initializer, - feed_dict={features_placeholder_d: REAL}) - sess.run(iterator_g.initializer, - feed_dict={features_placeholder_g: REAL}) - - for epoch in range(1, self.MAX_EPOCHS + 1): - start_time = time.time() - w_sum = 0 - for i in range(self.STEPS_PER_EPOCH): - for _ in range(2): - _, w = sess.run([train_d, w_distance]) - w_sum += w - sess.run(train_g) - duration = time.time() - start_time - - assert not np.isnan(w_sum), 'Model diverged with loss = NaN' - - format_str = 'epoch: %d, w_distance = %f (%.1f)' - print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) - # print (dir (w_distance)) - - 
logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) - - if epoch % self.MAX_EPOCHS == 0: - # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - suffix = self.get.suffix() - _name = os.sep.join([self.train_dir,suffix]) - # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) - saver.save(sess, _name, write_meta_graph=False, global_step=epoch) - # - # - if self.logger : - row = {"logs":logs} #,"model":pickle.dump(sess)} - self.logger.write(row) - # - # @TODO: - # We should upload the files in the checkpoint - # This would allow the learnt model to be portable to another system - # - tf.compat.v1.reset_default_graph() - -class Predict(GNet): - """ - This class uses synthetic data given a learned model - """ - def __init__(self,**args): - GNet.__init__(self,**args) - self.generator = Generator(**args) - self.values = args['values'] - def load_meta(self, column): - super().load_meta(column) - self.generator.load_meta(column) - def apply(self,**args): - # print (self.train_dir) - # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - suffix = self.get.suffix() - model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) - demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] - tf.compat.v1.reset_default_graph() - #z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) - z = tf.random.normal(shape=[self._REAL.shape[0], self.Z_DIM]) - y = tf.compat.v1.placeholder(shape=[self._REAL.shape[0], self.NUM_LABELS], dtype=tf.int32) - #y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32) - if self._LABEL is not None : - ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] - label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) - else: - label = None - fake = self.generator.network(inputs=z, label=label) - init = tf.compat.v1.global_variables_initializer() - saver = tf.compat.v1.train.Saver() - df = pd.DataFrame() - CANDIDATE_COUNT = 10000 - NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0] - with tf.compat.v1.Session() as sess: - - # sess.run(init) - saver.restore(sess, model_dir) - if self._LABEL is not None : - labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) - labels= demo - else: - labels = None - - found = [] - - for i in np.arange(CANDIDATE_COUNT) : - if labels : - f = sess.run(fake,feed_dict={y:labels}) - else: - f = sess.run(fake) - # - # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes - # The code below will insure we have some acceptable cardinal relationships between id and synthetic values - # - df = ( pd.DataFrame(np.round(f).astype(np.int32))) - p = 0 not in df.sum(axis=1).values - x = df.sum(axis=1).values - if np.divide( np.sum(x), x.size) > .9: - found.append(df) - if len(found) == NTH_VALID_CANDIDATE or i == CANDIDATE_COUNT: - break - else: - continue - - # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms - # df = (i * df).sum(axis=1) - # - # In case we are dealing with actual values like diagnosis codes we can perform - # - INDEX =np.random.choice(np.arange(len(found)),1)[0] - #df = found[np.random.choice(np.arange(len(found)),1)[0]] - df = found[INDEX] - columns = self.ATTRIBUTES['synthetic'] if 
isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] - - # r = np.zeros((self.ROW_COUNT,len(columns))) - r = np.zeros(self.ROW_COUNT) - df.columns = self.values - if len(found): - print (len(found),NTH_VALID_CANDIDATE) - # x = df * self.values - # - # let's get the rows with no values synthesized (for whatever reason) - # - ii = df.apply(lambda row: np.sum(row) == 0,axis=1) - if np.sum(ii) > 0 : - missing = np.repeat(np.nan, np.where(ii==1)[0].size) - else: - missing = [] - print (len (missing), df.shape) - i = np.where(ii == 0)[0] - df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row == 1)[0],1)[0]] ,axis=1)) - df.columns = columns - df = df[columns[0]].append(pd.Series(missing)) - - - - - - tf.compat.v1.reset_default_graph() - df = pd.DataFrame(df) - df.columns = columns - print (df.head()) - print (df.shape) - return df.to_dict(orient='list') - # return df.to_dict(orient='list') - # count = str(len(os.listdir(self.out_dir))) - # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv']) - # df.to_csv(_name,index=False) - - - # output.extend(np.round(f)) - - # for m in range(2): - # for n in range(2, self.NUM_LABELS): - # idx1 = (demo[:, m] == 1) - # idx2 = (demo[:, n] == 1) - # idx = [idx1[j] and idx2[j] for j in range(len(idx1))] - # num = np.sum(idx) - # print ("___________________list__") - # print (idx1) - # print (idx2) - # print (idx) - # print (num) - # print ("_____________________") - # nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU)) - # label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS)) - # label_input[:, n] = 1 - # label_input[:, m] = 1 - # output = [] - # for i in range(nbatch): - # f = sess.run(fake,feed_dict={y: label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]}) - # output.extend(np.round(f)) - # output = np.array(output)[:num] - # print ([m,n,output]) - - # np.save(self.out_dir + str(m) + str(n), output) - - -if __name__ == '__main__' : - # - # Now we get things done ... 
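(The command-line block below is removed along with the rest of this standalone script; the embedded entry point in data/maker/__init__.py covers the same training flow. A hedged sketch of that usage follows; every file, column and context name in it is a placeholder, not a value taken from this repository.)

    # hedged sketch, not part of the patch: all names below are placeholders
    import data.maker

    data.maker.train(data='observations.csv',   # a CSV path or a pandas DataFrame
                     column='marital_status',   # attribute whose distribution is learned
                     context='marital_status',  # tags the checkpoint and meta files
                     logs='logs',               # folder holding the train/ and output/ artifacts
                     max_epochs=10)
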
- column = SYS_ARGS['column'] - column_id = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id' - column_id = column_id.split(',') if ',' in column_id else column_id - df = pd.read_csv(SYS_ARGS['raw-data']) - LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - - context = SYS_ARGS['raw-data'].split(os.sep)[-1:][0][:-4] - if set(['train','learn']) & set(SYS_ARGS.keys()): - - df = pd.read_csv(SYS_ARGS['raw-data']) - - # cols = SYS_ARGS['column'] - # _map,_df = (Binary()).Export(df) - # i = np.arange(_map[column]['start'],_map[column]['end']) - max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10 - # REAL = _df[:,i] - REAL = pd.get_dummies(df[column]).astype(np.float32).values - LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - trainer = Train(context=context,max_epochs=max_epochs,real=REAL,label=LABEL,column=column,column_id=column_id) - trainer.apply() - - - - - # - # We should train upon this data - # - # -- we need to convert the data-frame to binary matrix, given a column - # - pass - elif 'generate' in SYS_ARGS: - values = df[column].unique().tolist() - values.sort() - - p = Predict(context=context,label=LABEL,values=values,column=column) - p.load_meta(column) - r = p.apply() - print (df) - print () - df[column] = r[column] - print (df) - - - else: - print (SYS_ARGS.keys()) - print (__doc__) - pass - From bd6fb03f8d228028d2be53a33ad50cceb77fdb94 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 26 Feb 2020 09:34:01 -0600 Subject: [PATCH 030/250] updating version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index aefd6d0..477c48a 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.1.3","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.1.4","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 3fbd68309fb57b467063e9ee0b79eb06ff35c7d7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 28 Feb 2020 21:37:26 -0600 Subject: [PATCH 031/250] Handling of continous values --- data/gan.py | 8 +-- data/maker/__init__.py | 114 ++++++++++++++++++++++++++++++++++------- data/maker/__main__.py | 6 +-- 3 files changed, 103 insertions(+), 25 deletions(-) diff --git a/data/gan.py b/data/gan.py index 204f8af..c2aadb5 100644 --- a/data/gan.py +++ b/data/gan.py @@ -604,7 +604,7 @@ class Predict(GNet): r = np.zeros(self.ROW_COUNT) df.columns = self.values if len(found): - print (len(found),NTH_VALID_CANDIDATE) + # print (len(found),NTH_VALID_CANDIDATE) # x = df * self.values # # let's get the missing rows (if any) ... 
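
(The data/maker/__init__.py hunks later in this patch add a ContinuousToDiscrete helper: continuous columns are cut into interval bins, bin membership is one-hot encoded for the GAN, and synthetic rows are decoded by sampling uniformly inside the selected bin. A self-contained sketch of that round-trip, using made-up numbers and the 4-bin default, is shown here for reference only.)

    # illustration only, not part of the patch
    import numpy as np
    import pandas as pd

    x = pd.Series([23.0, 31.5, 47.2, 52.9, 64.1])   # made-up continuous values
    bins = pd.cut(x, 4)                              # one interval per value
    cats = bins.cat.categories                       # the 4 interval bounds
    one_hot = pd.get_dummies(bins).values            # what the GAN actually trains on
    decoded = [np.random.uniform(cats[row.argmax()].left,   # lower bound of the active bin
                                 cats[row.argmax()].right)  # upper bound of the active bin
               for row in one_hot]
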
@@ -704,10 +704,10 @@ if __name__ == '__main__' : p = Predict(context=context,label=LABEL,values=values,column=column) p.load_meta(column) r = p.apply() - print (df) - print () + # print (df) + # print () df[column] = r[column] - print (df) + # print (df) else: diff --git a/data/maker/__init__.py b/data/maker/__init__.py index d5a4308..6114ad2 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -14,6 +14,68 @@ import data.gan as gan from transport import factory from data.bridge import Binary import threading as thread +class ContinuousToDiscrete : + @staticmethod + def binary(X,n=4) : + """ + This function will convert a continous stream of information into a variety a bit stream of bins + """ + # BOUNDS = np.repeat(np.divide(X.max(),n),n).cumsum().tolist() + + BOUNDS = ContinuousToDiscrete.bounds(X,n) + + # _map = [{"index":BOUNDS.index(i),"ubound":i} for i in BOUNDS] + _matrix = [] + m = [] + for value in X : + x_ = np.zeros(n) + _matrix.append(x_) + for row in BOUNDS : + + if value>= row.left and value <= row.right : + index = BOUNDS.index(row) + x_[index] = 1 + break + + return _matrix + + @staticmethod + def bounds(x,n): + return list(pd.cut(np.array(x),n).categories) + + + + @staticmethod + def continuous(X,BIN_SIZE=4) : + """ + This function will approximate a binary vector given boundary information + :X binary matrix + :BIN_SIZE + """ + BOUNDS = ContinuousToDiscrete.bounds(X,BIN_SIZE) + + values = [] + _BINARY= ContinuousToDiscrete.binary(X,BIN_SIZE) + # # print (BOUNDS) + + # values = [] + for row in _BINARY : + # ubound = BOUNDS[row.index(1)] + index = np.where(row == 1)[0][0] + + ubound = BOUNDS[ index ].right + lbound = BOUNDS[ index ].left + + x_ = np.round(np.random.uniform(lbound,ubound),3).astype(float) + values.append(x_) + + lbound = ubound + + return values + + + + def train (**args) : """ This function is intended to train the GAN in order to learn about the distribution of the features @@ -24,22 +86,30 @@ def train (**args) : :context label of what we are synthesizing """ column = args['column'] if (isinstance(args['column'],list)) else [args['column']] - + CONTINUOUS = args['continuous'] if 'continuous' in args else [] # column_id = args['id'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) df.columns = [name.lower() for name in df.columns] - + # + # @TODO: + # Consider sequential training of sub population for extremely large datasets + # + # # If we have several columns we will proceed one at a time (it could be done in separate threads) # @TODO : Consider performing this task on several threads/GPUs simulataneously # - handler = Binary() - # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values - # args['label'] = handler.Export(df[[column_id]]) - # args['label'] = np.ones(df.shape[0]).reshape(df.shape[0],1) - for col in column : - args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values - # args['real'] = handler.Export(df[[col]]) + for col in column : + # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values + # if 'float' not in df[col].dtypes.name : + # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values + if 'float' in df[col].dtypes.name and col in CONTINUOUS: + BIN_SIZE = 10 if 'bin_size' not in args else int(args['bin_size']) + args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32) + else: + args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values + + args['column'] = col 
args['context'] = col context = args['context'] @@ -75,7 +145,7 @@ def generate(**args): """ # df = args['data'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) - + CONTINUOUS = args['continous'] if 'continuous' in args else [] column = args['column'] if (isinstance(args['column'],list)) else [args['column']] # column_id = args['id'] # @@ -86,18 +156,26 @@ def generate(**args): for col in column : args['context'] = col args['column'] = col - values = df[col].unique().tolist() - args['values'] = values - args['row_count'] = df.shape[0] + + if 'float' in df[col].dtypes.name or col in CONTINUOUS : + # + # We should create the bins for the values we are observing here + BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) + values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE) + else: + values = df[col].unique().tolist() + + args['values'] = values + args['row_count'] = df.shape[0] # # we can determine the cardinalities here so we know what to allow or disallow handler = gan.Predict (**args) handler.load_meta(col) - # handler.ROW_COUNT = df[col].shape[0] - r = handler.apply() - # print (r) - # - print ([_df.shape,len(r[col])]) + r = handler.apply() _df[col] = r[col] + # + # @TODO: log basic stats about the synthetic attribute + # + # break return _df \ No newline at end of file diff --git a/data/maker/__main__.py b/data/maker/__main__.py index 583be60..d71d400 100644 --- a/data/maker/__main__.py +++ b/data/maker/__main__.py @@ -17,9 +17,9 @@ if 'config' in SYS_ARGS : odf = pd.read_csv (ARGS['data']) odf.columns = [name.lower() for name in odf.columns] column = ARGS['column'] if isinstance(ARGS['column'],list) else [ARGS['column']] - print (odf.head()) - print (_df.head()) - # print(pd.merge(odf,_df,rsuffix='_io')) + # print (odf.head()) + # print (_df.head()) + print(odf.join(_df[column],rsuffix='_io')) # print (_df[column].risk.evaluate(flag='synth')) # print (odf[column].risk.evaluate(flag='original')) # _x = pd.get_dummies(_df[column]).values From a2988a59720101eeb4648434938e6540a212fcfa Mon Sep 17 00:00:00 2001 From: "Steve L. Nyemba" Date: Sun, 1 Mar 2020 12:07:02 -0600 Subject: [PATCH 032/250] pipeline --- pipeline.py | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 pipeline.py diff --git a/pipeline.py b/pipeline.py new file mode 100644 index 0000000..e6e1225 --- /dev/null +++ b/pipeline.py @@ -0,0 +1,126 @@ +import json +from transport import factory +import os +from multiprocessing import Process +import pandas as pd +from google.oauth2 import service_account +import data.maker + +from data.params import SYS_ARGS + +f = open ('config.json') +PIPELINE = json.loads(f.read()) +f.close() +# +# The configuration array is now loaded and we will execute the pipe line as follows +DATASET='combined20190510_deid' + +class Components : + @staticmethod + def get(args): + SQL = args['sql'] + if 'condition' in args : + condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')']) + SQL = " ".join([SQL,'WHERE',condition]) + + SQL = SQL.replace(':dataset',args['dataset']) #+ " LIMIT 1000 " + return SQL #+ " LIMIT 10000 " + + @staticmethod + def train(args): + """ + This function will instanciate a worker that will train given a message that is provided to it + This is/will be a separate process that will + """ + print (['starting .... 
',args['notify'],args['context']] ) + #SQL = args['sql'] + #if 'condition' in args : + # condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')']) + # SQL = " ".join([SQL,'WHERE',condition]) + print ( args['context']) + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + log_folder = os.sep.join(["logs",args['context']]) + _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":250,"num_gpus":2,"column":args['columns'],"id":"person_id","logger":logger} + os.environ['CUDA_VISIBLE_DEVICES'] = args['gpu'] + #SQL = SQL.replace(':dataset',args['dataset']) #+ " LIMIT 1000 " + SQL = Components.get(args) + if 'limit' in args : + SQL = ' '.join([SQL,'limit',args['limit'] ]) + _args['max_epochs'] = 250 if 'max_epochs' not in args else args['max_epochs'] + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard') + #_args['data'] = _args['data'].astype(object) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + data.maker.train(**_args) + @staticmethod + def generate(args): + """ + This function will generate data and store it to a given, + """ + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + log_folder = os.sep.join(["logs",args['context']]) + _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":250,"num_gpus":2,"column":args['columns'],"id":"person_id","logger":logger} + os.environ['CUDA_VISIBLE_DEVICES'] = args['gpu'] + SQL = Components.get(args) + if 'limit' in args : + SQL = " ".join([SQL ,'limit', args['limit'] ]) + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').fillna('') + #_args['data'] = _args['data'].astype(object) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + + _args['max_epochs'] = 250 if 'max_epochs' not in args else args['max_epochs'] + + _args['no_value'] = args['no_value'] if 'no_value' in args else '' + #credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + #_args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard') + #_args['data'] = _args['data'].astype(object) + _dc = data.maker.generate(**_args) + # + # We need to post the generate the data in order to : + # 1. compare immediately + # 2. 
synthetic copy + # + cols = _dc.columns.tolist() + print (args['columns']) + data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) + base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) + print (_args['data'].shape) + print (_args['data'].shape) + for name in cols : + _args['data'][name] = _dc[name] + # filename = os.sep.join([log_folder,'output',name+'.csv']) + # data_comp[[name]].to_csv(filename,index=False) + + # + #-- Let us store all of this into bigquery + prefix = args['notify']+'.'+_args['context'] + table = '_'.join([prefix,'compare','io']) + data_comp.to_gbq(if_exists='replace',destination_table=table,credentials=credentials,chunksize=50000) + _args['data'].to_gbq(if_exists='replace',destination_table=table.replace('compare','full'),credentials=credentials,chunksize=50000) + data_comp.to_csv(os.sep.join([log_folder,table+'.csv']),index=False) + + +if __name__ == '__main__' : + index = int(SYS_ARGS['index']) + + args = (PIPELINE[index]) + #if 'limit' in SYS_ARGS : + # args['limit'] = SYS_ARGS['limit'] + #args['dataset'] = 'combined20190510' + SYS_ARGS['dataset'] = 'combined20190510_deid' if 'dataset' not in SYS_ARGS else SYS_ARGS['dataset'] + #if 'max_epochs' in SYS_ARGS : + # args['max_epochs'] = SYS_ARGS['max_epochs'] + args = dict(args,**SYS_ARGS) + if 'generate' in SYS_ARGS : + Components.generate(args) + + else: + + Components.train(args) +#for args in PIPELINE : + #args['dataset'] = 'combined20190510' + #process = Process(target=Components.train,args=(args,)) + #process.name = args['context'] + #process.start() +# Components.train(args) From 8e722d5bf1e12b7694589ca9dc3b716c55841584 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 11:49:18 -0600 Subject: [PATCH 033/250] bug fix and upgrades to base functionalities --- data/gan.py | 19 +- data/maker/__init__.py | 35 ++-- pipeline.py | 384 ++++++++++++++++++++++++++++++----------- 3 files changed, 313 insertions(+), 125 deletions(-) diff --git a/data/gan.py b/data/gan.py index c2aadb5..b3b9cf8 100644 --- a/data/gan.py +++ b/data/gan.py @@ -431,9 +431,9 @@ class Train (GNet): def network(self,**args): stage = args['stage'] - opt = args['opt'] + opt = args['opt'] tower_grads = [] - per_gpu_w = [] + per_gpu_w = [] iterator, features_placeholder, labels_placeholder = self.input_fn() with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()): for i in range(self.NUM_GPUS): @@ -550,6 +550,7 @@ class Predict(GNet): label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) else: label = None + fake = self.generator.network(inputs=z, label=label) init = tf.compat.v1.global_variables_initializer() saver = tf.compat.v1.train.Saver() @@ -577,11 +578,13 @@ class Predict(GNet): # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes # The code below will insure we have some acceptable cardinal relationships between id and synthetic values # - df = ( pd.DataFrame(np.round(f).astype(np.int32))) + df = pd.DataFrame(np.round(f).astype(np.int32)) + + p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values - if np.divide( np.sum(x), x.size) > .9 or p: + if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size: ratio.append(np.divide( np.sum(x), x.size)) found.append(df) if i == CANDIDATE_COUNT: @@ -597,11 +600,13 @@ class Predict(GNet): INDEX = np.random.choice(np.arange(len(found)),1)[0] 
INDEX = ratio.index(np.max(ratio)) + df = found[INDEX] columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] # r = np.zeros((self.ROW_COUNT,len(columns))) - r = np.zeros(self.ROW_COUNT) + # r = np.zeros(self.ROW_COUNT) + df.columns = self.values if len(found): # print (len(found),NTH_VALID_CANDIDATE) @@ -618,6 +623,10 @@ class Predict(GNet): missing = np.repeat(0, np.where(ii==1)[0].size) else: missing = [] + # + # @TODO: + # Log the findings here in terms of ratio, missing, candidate count + # print ([np.max(ratio),len(missing),len(found),i]) i = np.where(ii == 0)[0] df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) df.columns = columns diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 6114ad2..080939c 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -15,6 +15,7 @@ from transport import factory from data.bridge import Binary import threading as thread class ContinuousToDiscrete : + ROUND_UP = 2 @staticmethod def binary(X,n=4) : """ @@ -22,7 +23,7 @@ class ContinuousToDiscrete : """ # BOUNDS = np.repeat(np.divide(X.max(),n),n).cumsum().tolist() - BOUNDS = ContinuousToDiscrete.bounds(X,n) + BOUNDS = ContinuousToDiscrete.bounds(np.round(X,ContinuousToDiscrete.ROUND_UP),n) # _map = [{"index":BOUNDS.index(i),"ubound":i} for i in BOUNDS] _matrix = [] @@ -41,7 +42,7 @@ class ContinuousToDiscrete : @staticmethod def bounds(x,n): - return list(pd.cut(np.array(x),n).categories) + return list(pd.cut(np.array( np.round(x,ContinuousToDiscrete.ROUND_UP) ),n).categories) @@ -66,7 +67,7 @@ class ContinuousToDiscrete : ubound = BOUNDS[ index ].right lbound = BOUNDS[ index ].left - x_ = np.round(np.random.uniform(lbound,ubound),3).astype(float) + x_ = np.round(np.random.uniform(lbound,ubound),ContinuousToDiscrete.ROUND_UP).astype(float) values.append(x_) lbound = ubound @@ -104,10 +105,10 @@ def train (**args) : # if 'float' not in df[col].dtypes.name : # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values if 'float' in df[col].dtypes.name and col in CONTINUOUS: - BIN_SIZE = 10 if 'bin_size' not in args else int(args['bin_size']) + BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32) else: - args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values + args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values args['column'] = col @@ -157,25 +158,27 @@ def generate(**args): args['context'] = col args['column'] = col - if 'float' in df[col].dtypes.name or col in CONTINUOUS : - # - # We should create the bins for the values we are observing here - BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE) - else: - values = df[col].unique().tolist() + # if 'float' in df[col].dtypes.name or col in CONTINUOUS : + # # + # # We should create the bins for the values we are observing here + # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) + # values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE) + # # values = np.unique(values).tolist() + # else: + values = df[col].unique().tolist() args['values'] = values args['row_count'] = df.shape[0] # # we can determine the cardinalities here so we know what to allow or disallow - handler = gan.Predict (**args) + handler = gan.Predict (**args) 
handler.load_meta(col) - r = handler.apply() - _df[col] = r[col] + r = handler.apply() + BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) + _df[col] = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if 'float' in df[col].dtypes.name or col in CONTINUOUS else r[col] # # @TODO: log basic stats about the synthetic attribute # - + # print (r)s # break return _df \ No newline at end of file diff --git a/pipeline.py b/pipeline.py index e6e1225..134ca8c 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,5 +1,6 @@ import json from transport import factory +import numpy as np import os from multiprocessing import Process import pandas as pd @@ -8,119 +9,294 @@ import data.maker from data.params import SYS_ARGS -f = open ('config.json') -PIPELINE = json.loads(f.read()) -f.close() # # The configuration array is now loaded and we will execute the pipe line as follows -DATASET='combined20190510_deid' +DATASET='combined20190510' class Components : - @staticmethod - def get(args): - SQL = args['sql'] - if 'condition' in args : - condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')']) - SQL = " ".join([SQL,'WHERE',condition]) - SQL = SQL.replace(':dataset',args['dataset']) #+ " LIMIT 1000 " - return SQL #+ " LIMIT 10000 " + @staticmethod + def get(args): + """ + This function returns a data-frame provided a bigquery sql statement with conditions (and limits for testing purposes) + The function must be wrapped around a lambda this makes testing easier and changing data stores transparent to the rest of the code. (Vital when testing) + :sql basic sql statement + :condition optional condition and filters + """ + SQL = args['sql'] + if 'condition' in args : + condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')']) + SQL = " ".join([SQL,'WHERE',condition]) - @staticmethod - def train(args): - """ - This function will instanciate a worker that will train given a message that is provided to it - This is/will be a separate process that will - """ - print (['starting .... 
',args['notify'],args['context']] ) - #SQL = args['sql'] - #if 'condition' in args : - # condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')']) - # SQL = " ".join([SQL,'WHERE',condition]) - print ( args['context']) - logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) - log_folder = os.sep.join(["logs",args['context']]) - _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":250,"num_gpus":2,"column":args['columns'],"id":"person_id","logger":logger} - os.environ['CUDA_VISIBLE_DEVICES'] = args['gpu'] - #SQL = SQL.replace(':dataset',args['dataset']) #+ " LIMIT 1000 " - SQL = Components.get(args) - if 'limit' in args : - SQL = ' '.join([SQL,'limit',args['limit'] ]) - _args['max_epochs'] = 250 if 'max_epochs' not in args else args['max_epochs'] - credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard') - #_args['data'] = _args['data'].astype(object) - _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 - data.maker.train(**_args) - @staticmethod - def generate(args): - """ - This function will generate data and store it to a given, - """ - logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) - log_folder = os.sep.join(["logs",args['context']]) - _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":250,"num_gpus":2,"column":args['columns'],"id":"person_id","logger":logger} - os.environ['CUDA_VISIBLE_DEVICES'] = args['gpu'] - SQL = Components.get(args) - if 'limit' in args : - SQL = " ".join([SQL ,'limit', args['limit'] ]) - credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').fillna('') - #_args['data'] = _args['data'].astype(object) - _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 - - _args['max_epochs'] = 250 if 'max_epochs' not in args else args['max_epochs'] + SQL = SQL.replace(':dataset',args['dataset']) #+ " LIMIT 1000 " + if 'limit' in args : + SQL = SQL + 'LIMIT ' + args['limit'] + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() + return df + + # return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna() + @staticmethod + def split(X,MAX_ROWS=3,PART_SIZE=3): + + return list(pd.cut( np.arange(X.shape[0]+1),PART_SIZE).categories) - _args['no_value'] = args['no_value'] if 'no_value' in args else '' - #credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - #_args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard') - #_args['data'] = _args['data'].astype(object) - _dc = data.maker.generate(**_args) - # - # We need to post the generate the data in order to : - # 1. compare immediately - # 2. 
synthetic copy - # - cols = _dc.columns.tolist() - print (args['columns']) - data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) - base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) - print (_args['data'].shape) - print (_args['data'].shape) - for name in cols : - _args['data'][name] = _dc[name] - # filename = os.sep.join([log_folder,'output',name+'.csv']) - # data_comp[[name]].to_csv(filename,index=False) + def train(self,**args): + """ + This function will perform training on the basis of a given pointer that reads data - # - #-- Let us store all of this into bigquery - prefix = args['notify']+'.'+_args['context'] - table = '_'.join([prefix,'compare','io']) - data_comp.to_gbq(if_exists='replace',destination_table=table,credentials=credentials,chunksize=50000) - _args['data'].to_gbq(if_exists='replace',destination_table=table.replace('compare','full'),credentials=credentials,chunksize=50000) - data_comp.to_csv(os.sep.join([log_folder,table+'.csv']),index=False) - + """ + # + # @TODO: we need to log something here about the parameters being passed + pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) + df = pointer() + # + # Now we can parse the arguments and submit the entire thing to training + # + + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + log_folder = args['logs'] if 'logs' in args else 'logs' + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + + MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 + PART_SIZE = args['part_size'] if 'part_size' in args else 0 + + if df.shape[0] > MAX_ROWS and 'partition' not in args: + lbound = 0 + bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) + # bounds = Components.split(df,MAX_ROWS,PART_SIZE) + + qwriter = factory.instance(type='queue.QueueWriter',args={'queue':'aou.io'}) + + for b in bounds : + part_index = bounds.index(b) + ubound = int(b.right) + + + _data = df.iloc[lbound:ubound][args['columns']] + lbound = ubound + + # _args['logs'] = os.sep.join([log_folder,str(part_index)]) + _args['partition'] = str(part_index) + _args['logger'] = {'args':{'dbname':'aou','doc':args['context']},'type':'mongo.MongoWriter'} + # + # We should post the the partitions to a queue server (at least the instructions on ): + # - where to get the data + # - and athe arguments to use (partition #,columns,gpu,epochs) + # + info = {"rows":_data.shape[0],"cols":_data.shape[1], "paritition":part_index,"logs":_args['logs']} + p = {"args":_args,"data":_data.to_dict(orient="records"),"info":info} + qwriter.write(p) + # + # @TODO: + # - Notify that information was just posted to the queue + info['max_rows'] = MAX_ROWS + info['part_size'] = PART_SIZE + logger.write({"module":"train","action":"setup-partition","input":info}) + + pass + else: + partition = args['partition'] if 'partition' in args else '' + log_folder = os.sep.join([log_folder,args['context'],partition]) + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args['max_epochs'] = 150 if 'max_epochs' not in args else 
int(args['max_epochs']) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' + + _args['data'] = df + # + # @log : + # Logging information about the training process for this partition (or not) + # + info = {"rows":df.shape[0],"cols":df.shape[1], "partition":partition,"logs":_args['logs']} + logger.write({"module":"train","action":"train","input":info}) + data.maker.train(**_args) + + pass + + # @staticmethod + def generate(self,args): + """ + This function will generate data and store it to a given, + """ + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + log_folder = args['logs'] if 'logs' in args else 'logs' + partition = args['partition'] if 'partition' in args else '' + log_folder = os.sep.join([log_folder,args['context'],partition]) + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' + _args['no_value']= args['no_value'] + MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 + PART_SIZE = args['part_size'] if 'part_size' in args else 0 + + # credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + # _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() + reader = args['reader'] + df = reader() + if 'partition' in args : + bounds = Components.split(df,MAX_ROWS,PART_SIZE) + # bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) + lbound = int(bounds[int(partition)].left) + ubound = int(bounds[int(partition)].right) + df = df.iloc[lbound:ubound] + _args['data'] = df + # _args['data'] = reader() + #_args['data'] = _args['data'].astype(object) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + _dc = data.maker.generate(**_args) + # + # We need to post the generate the data in order to : + # 1. compare immediately + # 2. 
synthetic copy + # + + cols = _dc.columns.tolist() + + data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) + base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) + + for name in cols : + _args['data'][name] = _dc[name] + info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} + if partition != '' : + info['partition'] = partition + logger.write(info) + # filename = os.sep.join([log_folder,'output',name+'.csv']) + # data_comp[[name]].to_csv(filename,index=False) + + # + #-- Let us store all of this into bigquery + prefix = args['notify']+'.'+_args['context'] + table = '_'.join([prefix,partition,'io']).replace('__','_') + folder = os.sep.join([args['logs'],args['context'],partition,'output']) + if 'file' in args : + + _fname = os.sep.join([folder,table.replace('_io','_full_io.csv')]) + _pname = os.sep.join([folder,table])+'.csv' + data_comp.to_csv( _pname,index=False) + _args['data'].to_csv(_fname,index=False) + + + else: + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + _pname = os.sep.join([folder,table+'.csv']) + _fname = table.replace('_io','_full_io') + data_comp.to_gbq(if_exists='replace',destination_table=_pname,credentials='credentials',chunk_size=50000) + data_comp.to_csv(_pname,index=False) + INSERT_FLAG = 'replace' if 'partition' not in args else 'append' + _args['data'].to_gbq(if_exists=INSERT_FLAG,destination_table=_fname,credentials='credentials',chunk_size=50000) + + info = {"full":{"path":_fname,"rows":_args['data'].shape[0]},"compare":{"name":_pname,"rows":data_comp.shape[0]} } + if partition : + info ['partition'] = partition + logger.write({"module":"generate","action":"write","info":info} ) + @staticmethod + def callback(channel,method,header,stream): + + info = json.loads(stream) + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':SYS_ARGS['context']}) + + logger.write({'module':'process','action':'read-partition','input':info['info']}) + df = pd.DataFrame(info['data']) + args = info['args'] + if int(args['num_gpu']) > 1 and args['gpu'] > 0: + args['gpu'] = args['gpu'] + args['num_gpu'] + args['reader'] = lambda: df + # + # @TODO: Fix + # There is an inconsistency in column/columns ... fix this shit! + # + args['columns'] = args['column'] + (Components()).train(**args) + logger.write({"module":"process","action":"exit","info":info["info"]}) + channel.close() + channel.connection.close() + pass + if __name__ == '__main__' : - index = int(SYS_ARGS['index']) + filename = SYS_ARGS['config'] if 'config' in SYS_ARGS else 'config.json' + f = open (filename) + PIPELINE = json.loads(f.read()) + f.close() + index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else 0 + + args = (PIPELINE[index]) + args['dataset'] = 'combined20190510' + args = dict(args,**SYS_ARGS) + args['max_rows'] = int(args['max_rows']) if 'max_rows' in args else 3 + args['part_size']= int(args['part_size']) if 'part_size' in args else 3 + + # + # @TODO: + # Log what was initiated so we have context of this processing ... 
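Since part_size controls how the frame is carved up, the partition logic used throughout this patch (Components.split plus the iloc slice) can be restated compactly; this is an illustrative simplification assuming only pandas and numpy:

import numpy as np
import pandas as pd

def take_partition(df, part_index, part_size=3):
    # cut the row index into part_size intervals, then return the rows inside the requested one
    bounds = list(pd.cut(np.arange(df.shape[0] + 1), part_size).categories)
    lbound = int(bounds[int(part_index)].left)
    ubound = int(bounds[int(part_index)].right)
    return df.iloc[lbound:ubound]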
+ # + if 'listen' not in SYS_ARGS : + if 'file' in args : + reader = lambda: pd.read_csv(args['file']) ; + else: + reader = lambda: Components().get(args) + args['reader'] = reader + + if 'generate' in SYS_ARGS : + # + # Let us see if we have partitions given the log folder + + content = os.listdir( os.sep.join([args['logs'],args['context']])) + generator = Components() + if ''.join(content).isnumeric() : + # + # we have partitions we are working with + + for id in ''.join(content) : + args['partition'] = id + + generator.generate(args) + else: + generator.generate(args) + # Components.generate(args) + elif 'listen' in args : + # + # This will start a worker just in case to listen to a queue + if 'read' in SYS_ARGS : + QUEUE_TYPE = 'queue.QueueReader' + pointer = lambda qreader: qreader.read(1) + else: + QUEUE_TYPE = 'queue.QueueListener' + pointer = lambda qlistener: qlistener.listen() + N = int(SYS_ARGS['jobs']) if 'jobs' in SYS_ARGS else 1 + + qhandlers = [factory.instance(type=QUEUE_TYPE,args={'queue':'aou.io'}) for i in np.arange(N)] + jobs = [] + for qhandler in qhandlers : + qhandler.callback = Components.callback + job = Process(target=pointer,args=(qhandler,)) + job.start() + jobs.append(job) + # + # let us wait for the jobs + print (["Started ",len(jobs)," trainers"]) + while len(jobs) > 0 : + + jobs = [job for job in jobs if job.is_alive()] + + # pointer(qhandler) + - args = (PIPELINE[index]) - #if 'limit' in SYS_ARGS : - # args['limit'] = SYS_ARGS['limit'] - #args['dataset'] = 'combined20190510' - SYS_ARGS['dataset'] = 'combined20190510_deid' if 'dataset' not in SYS_ARGS else SYS_ARGS['dataset'] - #if 'max_epochs' in SYS_ARGS : - # args['max_epochs'] = SYS_ARGS['max_epochs'] - args = dict(args,**SYS_ARGS) - if 'generate' in SYS_ARGS : - Components.generate(args) - - else: - - Components.train(args) + # qreader.read(1) + pass + else: + + trainer = Components() + trainer.train(**args) + # Components.train(**args) #for args in PIPELINE : - #args['dataset'] = 'combined20190510' - #process = Process(target=Components.train,args=(args,)) - #process.name = args['context'] - #process.start() -# Components.train(args) + #args['dataset'] = 'combined20190510' + #process = Process(target=Components.train,args=(args,)) + #process.name = args['context'] + #process.start() +# Components.train(args) From eb1abdf9892cabf6654f55855be0c0aba9a36772 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 12:16:50 -0600 Subject: [PATCH 034/250] bug fix with installer within branch --- pipeline.py | 1 + setup.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 134ca8c..04658da 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import json from transport import factory import numpy as np diff --git a/setup.py b/setup.py index 477c48a..e6b988a 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.1.4","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.1.6","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = 
['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' @@ -12,4 +12,5 @@ args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' if sys.version_info[0] == 2 : args['use_2to3'] = False args['use_2to3_exclude_fixers'] = ['lib2to3.fixes.fix_import'] +args['scripts']=['pipeline.py'] setup(**args) From 076db1ec2ccf391656f906e360eb8b2cd9ec8521 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 13:22:25 -0600 Subject: [PATCH 035/250] bug fix --- pipeline.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 04658da..fd5a28e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -33,7 +33,7 @@ class Components : if 'limit' in args : SQL = SQL + 'LIMIT ' + args['limit'] credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() + df = pd.read_gbq(SQL,credentials=credentials,dialect='standard') return df # return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna() @@ -51,7 +51,9 @@ class Components : # @TODO: we need to log something here about the parameters being passed pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) df = pointer() - + if df.shape[0] == 0 : + print ("CAN NOT TRAIN EMPTY DATASET ") + return # # Now we can parse the arguments and submit the entire thing to training # From 14cce1ef09ed96ebb16cf9b785494ea4a08df096 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 13:40:26 -0600 Subject: [PATCH 036/250] bug fix with worker ... 
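For context on the worker mode being adjusted in the next few patches, the listen branch shown earlier reduces to a fan-out-and-poll pattern. The sketch below keeps only the standard-library part; the queue handlers and the callback come from the transport factory and are treated as opaque placeholders here:

import time
from multiprocessing import Process

def run_jobs(handlers, pointer):
    # start one process per queue handler, then poll until they have all exited,
    # mirroring the job loop in pipeline.py's __main__
    jobs = []
    for handler in handlers:
        job = Process(target=pointer, args=(handler,))
        job.start()
        jobs.append(job)
    while jobs:
        jobs = [job for job in jobs if job.is_alive()]
        time.sleep(2)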
--- pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipeline.py b/pipeline.py index fd5a28e..8c8a7d7 100644 --- a/pipeline.py +++ b/pipeline.py @@ -233,6 +233,7 @@ if __name__ == '__main__' : args = dict(args,**SYS_ARGS) args['max_rows'] = int(args['max_rows']) if 'max_rows' in args else 3 args['part_size']= int(args['part_size']) if 'part_size' in args else 3 + # # @TODO: @@ -265,6 +266,7 @@ if __name__ == '__main__' : elif 'listen' in args : # # This will start a worker just in case to listen to a queue + SYS_ARGS = dict(args) #-- things get lost in context if 'read' in SYS_ARGS : QUEUE_TYPE = 'queue.QueueReader' pointer = lambda qreader: qreader.read(1) From 832581303b623c4cc6cc3cf43f4716ac1427f773 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 14:08:10 -0600 Subject: [PATCH 037/250] bug fix: gpu assignement error --- pipeline.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 8c8a7d7..b53ba52 100644 --- a/pipeline.py +++ b/pipeline.py @@ -63,6 +63,7 @@ class Components : _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + _args['gpu'] = args['gpu'] if 'gpu' in args else 0 MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 PART_SIZE = args['part_size'] if 'part_size' in args else 0 @@ -85,6 +86,7 @@ class Components : # _args['logs'] = os.sep.join([log_folder,str(part_index)]) _args['partition'] = str(part_index) _args['logger'] = {'args':{'dbname':'aou','doc':args['context']},'type':'mongo.MongoWriter'} + # # We should post the the partitions to a queue server (at least the instructions on ): # - where to get the data @@ -207,8 +209,9 @@ class Components : logger.write({'module':'process','action':'read-partition','input':info['info']}) df = pd.DataFrame(info['data']) args = info['args'] + args['gpu'] = int(info['info']['partition']) if int(args['num_gpu']) > 1 and args['gpu'] > 0: - args['gpu'] = args['gpu'] + args['num_gpu'] + args['gpu'] = args['gpu'] + args['num_gpu'] if args['gpu'] + args['num_gpu'] < 8 else 0 args['reader'] = lambda: df # # @TODO: Fix From 2b7b1757f92840a65fa8a599ff475f06e7be00ca Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 14:17:28 -0600 Subject: [PATCH 038/250] bug fixes with callback (worker) --- pipeline.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index b53ba52..a39dbc7 100644 --- a/pipeline.py +++ b/pipeline.py @@ -211,7 +211,7 @@ class Components : args = info['args'] args['gpu'] = int(info['info']['partition']) if int(args['num_gpu']) > 1 and args['gpu'] > 0: - args['gpu'] = args['gpu'] + args['num_gpu'] if args['gpu'] + args['num_gpu'] < 8 else 0 + args['gpu'] = args['gpu'] + args['num_gpu'] if args['gpu'] + args['num_gpu'] < 8 else 0 #-- 8 max gpus args['reader'] = lambda: df # # @TODO: Fix diff --git a/setup.py b/setup.py index e6b988a..26e2e9d 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.1.6","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.1.7","author":"Vanderbilt University 
Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 142491cf5bcd15f61f5cc79023b1cc9dcfeed00f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 14:27:54 -0600 Subject: [PATCH 039/250] Bug Fix: GPU work load --- pipeline.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index a39dbc7..fcf9912 100644 --- a/pipeline.py +++ b/pipeline.py @@ -209,9 +209,10 @@ class Components : logger.write({'module':'process','action':'read-partition','input':info['info']}) df = pd.DataFrame(info['data']) args = info['args'] - args['gpu'] = int(info['info']['partition']) - if int(args['num_gpu']) > 1 and args['gpu'] > 0: - args['gpu'] = args['gpu'] + args['num_gpu'] if args['gpu'] + args['num_gpu'] < 8 else 0 #-- 8 max gpus + MAX_GPUS = 8 + args['gpu'] = int(info['info']['partition']) if info['info']['partition'] < MAX_GPUS else np.random.choice(np.arange(MAX_GPUS),1).astype(int).tolist()[0] + # if int(args['num_gpu']) > 1 and args['gpu'] > 0: + # args['gpu'] = args['gpu'] + args['num_gpu'] if args['gpu'] + args['num_gpu'] < 8 else args['gpu'] #-- 8 max gpus args['reader'] = lambda: df # # @TODO: Fix From 84b3e6c0484f7931a587ccfc41a3f4810ebbdffe Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 14:30:40 -0600 Subject: [PATCH 040/250] dataset ... (fix) --- pipeline.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index fcf9912..6bbec2f 100644 --- a/pipeline.py +++ b/pipeline.py @@ -12,7 +12,7 @@ from data.params import SYS_ARGS # # The configuration array is now loaded and we will execute the pipe line as follows -DATASET='combined20190510' +DATASET='combined20191004v2_deid' class Components : @@ -233,11 +233,12 @@ if __name__ == '__main__' : index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else 0 args = (PIPELINE[index]) - args['dataset'] = 'combined20190510' + args = dict(args,**SYS_ARGS) args['max_rows'] = int(args['max_rows']) if 'max_rows' in args else 3 args['part_size']= int(args['part_size']) if 'part_size' in args else 3 - + if 'dataset' not in args : + args['dataset'] = 'combined20191004v2_deid' # # @TODO: From f4295041f9e441922ceec7416b79a739f4eb036c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 16:38:44 -0600 Subject: [PATCH 041/250] bug fix: type --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 6bbec2f..cd527a5 100644 --- a/pipeline.py +++ b/pipeline.py @@ -92,7 +92,7 @@ class Components : # - where to get the data # - and athe arguments to use (partition #,columns,gpu,epochs) # - info = {"rows":_data.shape[0],"cols":_data.shape[1], "paritition":part_index,"logs":_args['logs']} + info = {"rows":_data.shape[0],"cols":_data.shape[1], "partition":part_index,"logs":_args['logs']} p = {"args":_args,"data":_data.to_dict(orient="records"),"info":info} qwriter.write(p) # From dd7fd5696bf682cd9b70786e2ccaa454aa62d841 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 5 Mar 2020 11:49:14 -0600 Subject: [PATCH 042/250] bug fix with partitions (generation may require it regardless) --- data/gan.py | 4 ++-- pipeline.py | 30 
++++++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/data/gan.py b/data/gan.py index b3b9cf8..a591f34 100644 --- a/data/gan.py +++ b/data/gan.py @@ -356,7 +356,7 @@ class Train (GNet): self.meta = self.log_meta() if(self.logger): - self.logger.write( self.meta ) + self.logger.write({"module":"gan-train","action":"start","input":self.meta} ) # self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta) def load_meta(self, column): @@ -514,7 +514,7 @@ class Train (GNet): # # if self.logger : - row = {"logs":logs} #,"model":pickle.dump(sess)} + row = {"module":"gan-train","action":"logs","input":logs} #,"model":pickle.dump(sess)} self.logger.write(row) # # @TODO: diff --git a/pipeline.py b/pipeline.py index cd527a5..58b5380 100644 --- a/pipeline.py +++ b/pipeline.py @@ -131,6 +131,7 @@ class Components : log_folder = args['logs'] if 'logs' in args else 'logs' partition = args['partition'] if 'partition' in args else '' log_folder = os.sep.join([log_folder,args['context'],partition]) + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 @@ -143,12 +144,31 @@ class Components : # _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() reader = args['reader'] df = reader() - if 'partition' in args : + bounds = Components.split(df,MAX_ROWS,PART_SIZE) + if partition != '' and os.path.exists(log_folder): bounds = Components.split(df,MAX_ROWS,PART_SIZE) # bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) lbound = int(bounds[int(partition)].left) ubound = int(bounds[int(partition)].right) df = df.iloc[lbound:ubound] + else: + # + # We have an implicit partition here + # bounds = Components.split(df,MAX_ROWS,PART_SIZE) + logger.write({"module":"generate","action":"virtual-parititions","input":{"rows":df.shape[0],"max_rows":MAX_ROWS,"part_size":PART_SIZE}}) + for item in bounds : + + lbound = int(item.left) + ubound = int(item.right) + args['reader'] = lambda: df[lbound:ubound] + args['partition'] = bounds.index(item) + + self.generate(args) + return ; + if not os.path.exists(log_folder) : + log_folder = log_folder.replace(partition,'') + _args['logs'] = log_folder + _args['data'] = df # _args['data'] = reader() #_args['data'] = _args['data'].astype(object) @@ -193,7 +213,7 @@ class Components : _fname = table.replace('_io','_full_io') data_comp.to_gbq(if_exists='replace',destination_table=_pname,credentials='credentials',chunk_size=50000) data_comp.to_csv(_pname,index=False) - INSERT_FLAG = 'replace' if 'partition' not in args else 'append' + INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' _args['data'].to_gbq(if_exists=INSERT_FLAG,destination_table=_fname,credentials='credentials',chunk_size=50000) info = {"full":{"path":_fname,"rows":_args['data'].shape[0]},"compare":{"name":_pname,"rows":data_comp.shape[0]} } @@ -235,8 +255,9 @@ if __name__ == '__main__' : args = (PIPELINE[index]) args = dict(args,**SYS_ARGS) - args['max_rows'] = int(args['max_rows']) if 'max_rows' in args else 3 - args['part_size']= int(args['part_size']) if 'part_size' in args else 3 + args['max_rows'] = int(args['max_rows']) if 'max_rows' in args else 3 + args['part_size'] = int(args['part_size']) if 'part_size' in args 
else 4 + args['logs'] = args['logs'] if 'logs' in args else 'logs' if 'dataset' not in args : args['dataset'] = 'combined20191004v2_deid' @@ -257,6 +278,7 @@ if __name__ == '__main__' : content = os.listdir( os.sep.join([args['logs'],args['context']])) generator = Components() + if ''.join(content).isnumeric() : # # we have partitions we are working with From c9115fe473fe3d4b2c2b7996abd968feacba90b7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 5 Mar 2020 11:50:04 -0600 Subject: [PATCH 043/250] version update --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 26e2e9d..02f49a2 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.1.7","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.1.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 27473989f9804564f081285b884c5921752d5a94 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 5 Mar 2020 12:03:04 -0600 Subject: [PATCH 044/250] bug fix ... --- pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 58b5380..5442935 100644 --- a/pipeline.py +++ b/pipeline.py @@ -130,8 +130,8 @@ class Components : logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) log_folder = args['logs'] if 'logs' in args else 'logs' partition = args['partition'] if 'partition' in args else '' - log_folder = os.sep.join([log_folder,args['context'],partition]) - + log_folder = os.sep.join([log_folder,args['context'],str(partition)]) + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 From c75bb54d2b8bd4df6a5a55c76db62a3e3b9caaee Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 5 Mar 2020 23:49:55 -0600 Subject: [PATCH 045/250] bug fix: mandatory partitioning while training --- pipeline.py | 81 ++++++++++++++++++++++------------------------------- setup.py | 2 +- 2 files changed, 35 insertions(+), 48 deletions(-) diff --git a/pipeline.py b/pipeline.py index 5442935..917026d 100644 --- a/pipeline.py +++ b/pipeline.py @@ -61,27 +61,23 @@ class Components : logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) log_folder = args['logs'] if 'logs' in args else 'logs' _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 _args['gpu'] = args['gpu'] if 'gpu' in args else 0 - MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 - PART_SIZE = args['part_size'] if 'part_size' 
in args else 0 + # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 + PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 - if df.shape[0] > MAX_ROWS and 'partition' not in args: + if 'partition' not in args: lbound = 0 - bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) + # bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) # bounds = Components.split(df,MAX_ROWS,PART_SIZE) - + columns = args['columns'] + df = np.array_split(df[columns].values,PART_SIZE) qwriter = factory.instance(type='queue.QueueWriter',args={'queue':'aou.io'}) - - for b in bounds : - part_index = bounds.index(b) - ubound = int(b.right) - - - _data = df.iloc[lbound:ubound][args['columns']] - lbound = ubound + part_index = 0 + for _df in df: # _args['logs'] = os.sep.join([log_folder,str(part_index)]) _args['partition'] = str(part_index) @@ -92,14 +88,20 @@ class Components : # - where to get the data # - and athe arguments to use (partition #,columns,gpu,epochs) # - info = {"rows":_data.shape[0],"cols":_data.shape[1], "partition":part_index,"logs":_args['logs']} - p = {"args":_args,"data":_data.to_dict(orient="records"),"info":info} + + _df = pd.DataFrame(_df,columns=columns) + # print (columns) + + info = {"rows":_df.shape[0],"cols":_df.shape[1], "partition":part_index,"logs":_args['logs'],"num_gpu":2,"part_size":PART_SIZE} + p = {"args":_args,"data":_df.to_dict(orient="records"),"info":info} + part_index += 1 qwriter.write(p) # # @TODO: # - Notify that information was just posted to the queue - info['max_rows'] = MAX_ROWS - info['part_size'] = PART_SIZE + # In case we want slow-mode, we can store the partitions in mongodb and process (Yes|No)? + # + logger.write({"module":"train","action":"setup-partition","input":info}) pass @@ -137,37 +139,18 @@ class Components : _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' _args['no_value']= args['no_value'] - MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 - PART_SIZE = args['part_size'] if 'part_size' in args else 0 + # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 + PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 # credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') # _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() reader = args['reader'] df = reader() - bounds = Components.split(df,MAX_ROWS,PART_SIZE) + # bounds = Components.split(df,MAX_ROWS,PART_SIZE) if partition != '' and os.path.exists(log_folder): - bounds = Components.split(df,MAX_ROWS,PART_SIZE) - # bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) - lbound = int(bounds[int(partition)].left) - ubound = int(bounds[int(partition)].right) - df = df.iloc[lbound:ubound] - else: - # - # We have an implicit partition here - # bounds = Components.split(df,MAX_ROWS,PART_SIZE) - logger.write({"module":"generate","action":"virtual-parititions","input":{"rows":df.shape[0],"max_rows":MAX_ROWS,"part_size":PART_SIZE}}) - for item in bounds : - - lbound = int(item.left) - ubound = int(item.right) - args['reader'] = lambda: df[lbound:ubound] - args['partition'] = bounds.index(item) - - self.generate(args) - return ; - if not os.path.exists(log_folder) : - log_folder = log_folder.replace(partition,'') - _args['logs'] = log_folder + columns = args['columns'] + df = np.array_split(df[columns].values,PART_SIZE) + df = 
pd.DataFrame(df[ int (partition) ],columns = columns) _args['data'] = df # _args['data'] = reader() @@ -189,7 +172,7 @@ class Components : _args['data'][name] = _dc[name] info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} if partition != '' : - info['partition'] = partition + info['partition'] = int(partition) logger.write(info) # filename = os.sep.join([log_folder,'output',name+'.csv']) # data_comp[[name]].to_csv(filename,index=False) @@ -218,7 +201,7 @@ class Components : info = {"full":{"path":_fname,"rows":_args['data'].shape[0]},"compare":{"name":_pname,"rows":data_comp.shape[0]} } if partition : - info ['partition'] = partition + info ['partition'] = int(partition) logger.write({"module":"generate","action":"write","info":info} ) @staticmethod def callback(channel,method,header,stream): @@ -229,8 +212,12 @@ class Components : logger.write({'module':'process','action':'read-partition','input':info['info']}) df = pd.DataFrame(info['data']) args = info['args'] - MAX_GPUS = 8 - args['gpu'] = int(info['info']['partition']) if info['info']['partition'] < MAX_GPUS else np.random.choice(np.arange(MAX_GPUS),1).astype(int).tolist()[0] + if args['num_gpu'] > 1 : + args['gpu'] = int(info['info']['partition']) if info['info']['partition'] == 0 else info['info']['partition'] + 2 + args['num_gpu'] = 2 + else: + args['gpu'] = 0 + args['num_gpu'] = 1 # if int(args['num_gpu']) > 1 and args['gpu'] > 0: # args['gpu'] = args['gpu'] + args['num_gpu'] if args['gpu'] + args['num_gpu'] < 8 else args['gpu'] #-- 8 max gpus args['reader'] = lambda: df @@ -296,7 +283,7 @@ if __name__ == '__main__' : SYS_ARGS = dict(args) #-- things get lost in context if 'read' in SYS_ARGS : QUEUE_TYPE = 'queue.QueueReader' - pointer = lambda qreader: qreader.read(1) + pointer = lambda qreader: qreader.read() else: QUEUE_TYPE = 'queue.QueueListener' pointer = lambda qlistener: qlistener.listen() diff --git a/setup.py b/setup.py index 02f49a2..bcacb62 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.1.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.1.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 32a5e19060e286a2b8a24f296b91e2768ccff45d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 11:17:29 -0600 Subject: [PATCH 046/250] bug fix with minor corrections --- pipeline.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pipeline.py b/pipeline.py index 917026d..c5a16d8 100644 --- a/pipeline.py +++ b/pipeline.py @@ -92,7 +92,7 @@ class Components : _df = pd.DataFrame(_df,columns=columns) # print (columns) - info = {"rows":_df.shape[0],"cols":_df.shape[1], "partition":part_index,"logs":_args['logs'],"num_gpu":2,"part_size":PART_SIZE} + info = {"rows":_df.shape[0],"cols":_df.shape[1], "partition":part_index,"logs":_args['logs'],"num_gpu":1,"part_size":PART_SIZE} p = 
{"args":_args,"data":_df.to_dict(orient="records"),"info":info} part_index += 1 qwriter.write(p) @@ -134,7 +134,7 @@ class Components : partition = args['partition'] if 'partition' in args else '' log_folder = os.sep.join([log_folder,args['context'],str(partition)]) - _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' @@ -147,15 +147,18 @@ class Components : reader = args['reader'] df = reader() # bounds = Components.split(df,MAX_ROWS,PART_SIZE) - if partition != '' and os.path.exists(log_folder): + if partition != '' : columns = args['columns'] df = np.array_split(df[columns].values,PART_SIZE) df = pd.DataFrame(df[ int (partition) ],columns = columns) + info = {"parition":int(partition),"rows":df.shape[0],"cols":df.shape[0],"part_size":PART_SIZE} + logger.write({"module":"generate","action":"partition","input":info}) _args['data'] = df # _args['data'] = reader() #_args['data'] = _args['data'].astype(object) - _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + _args['num_gpu'] = 1 + _args['gpu'] = partition _dc = data.maker.generate(**_args) # # We need to post the generate the data in order to : @@ -205,7 +208,9 @@ class Components : logger.write({"module":"generate","action":"write","info":info} ) @staticmethod def callback(channel,method,header,stream): - + if stream.decode('utf8') in ['QUIT','EXIT','END'] : + channel.close() + channel.connection.close() info = json.loads(stream) logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':SYS_ARGS['context']}) @@ -214,10 +219,10 @@ class Components : args = info['args'] if args['num_gpu'] > 1 : args['gpu'] = int(info['info']['partition']) if info['info']['partition'] == 0 else info['info']['partition'] + 2 - args['num_gpu'] = 2 + else: args['gpu'] = 0 - args['num_gpu'] = 1 + args['num_gpu'] = 1 # if int(args['num_gpu']) > 1 and args['gpu'] > 0: # args['gpu'] = args['gpu'] + args['num_gpu'] if args['gpu'] + args['num_gpu'] < 8 else args['gpu'] #-- 8 max gpus args['reader'] = lambda: df @@ -242,8 +247,7 @@ if __name__ == '__main__' : args = (PIPELINE[index]) args = dict(args,**SYS_ARGS) - args['max_rows'] = int(args['max_rows']) if 'max_rows' in args else 3 - args['part_size'] = int(args['part_size']) if 'part_size' in args else 4 + args['logs'] = args['logs'] if 'logs' in args else 'logs' if 'dataset' not in args : args['dataset'] = 'combined20191004v2_deid' From 8c5193cb6d4293682f838597bcca7e8287e37d6d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 11:40:47 -0600 Subject: [PATCH 047/250] bug fix ... 
(hopfully makes a difference) --- pipeline.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index c5a16d8..df92427 100644 --- a/pipeline.py +++ b/pipeline.py @@ -2,6 +2,7 @@ import json from transport import factory import numpy as np +import time import os from multiprocessing import Process import pandas as pd @@ -76,7 +77,12 @@ class Components : columns = args['columns'] df = np.array_split(df[columns].values,PART_SIZE) qwriter = factory.instance(type='queue.QueueWriter',args={'queue':'aou.io'}) - part_index = 0 + part_index = 0 + # + # let's start n processes to listen & train this mother ... + # + #-- hopefully they learn as daemons + for _df in df: # _args['logs'] = os.sep.join([log_folder,str(part_index)]) @@ -206,6 +212,7 @@ class Components : if partition : info ['partition'] = int(partition) logger.write({"module":"generate","action":"write","info":info} ) + @staticmethod def callback(channel,method,header,stream): if stream.decode('utf8') in ['QUIT','EXIT','END'] : @@ -306,6 +313,7 @@ if __name__ == '__main__' : while len(jobs) > 0 : jobs = [job for job in jobs if job.is_alive()] + time.sleep(2) # pointer(qhandler) From 57e32261c668260b64a088a165e35cdfbaad8294 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 12:11:22 -0600 Subject: [PATCH 048/250] creating processes for the generators --- pipeline.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index df92427..c042588 100644 --- a/pipeline.py +++ b/pipeline.py @@ -280,11 +280,21 @@ if __name__ == '__main__' : if ''.join(content).isnumeric() : # # we have partitions we are working with - + make = lambda args: (Components()).generate(args) + jobs = [] + print (["Started ",len(jobs),"generators"]) for id in ''.join(content) : args['partition'] = id + job = Process(target=make,args=(args,args)) + + job.start() + jobs.append(job) + + while (len(jobs)> 0) : + jobs = [jobs for job in jobs if job.is_alive()] + time.sleep(2) - generator.generate(args) + # generator.generate(args) else: generator.generate(args) # Components.generate(args) From 872744c682d26c751d2dfb377e05dc2afb64f95b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 13:00:32 -0600 Subject: [PATCH 049/250] bug fix with queue connection dropping out --- pipeline.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/pipeline.py b/pipeline.py index c042588..65eda3e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -99,7 +99,7 @@ class Components : # print (columns) info = {"rows":_df.shape[0],"cols":_df.shape[1], "partition":part_index,"logs":_args['logs'],"num_gpu":1,"part_size":PART_SIZE} - p = {"args":_args,"data":_df.to_dict(orient="records"),"info":info} + p = {"args":_args,"data":_df.to_dict(orient="records"),"input":info} part_index += 1 qwriter.write(p) # @@ -124,7 +124,8 @@ class Components : # @log : # Logging information about the training process for this partition (or not) # - info = {"rows":df.shape[0],"cols":df.shape[1], "partition":partition,"logs":_args['logs']} + info = {"rows":df.shape[0],"cols":df.shape[1], "partition":int(partition),"logs":_args['logs']} + logger.write({"module":"train","action":"train","input":info}) data.maker.train(**_args) @@ -211,7 +212,7 @@ class Components : info = {"full":{"path":_fname,"rows":_args['data'].shape[0]},"compare":{"name":_pname,"rows":data_comp.shape[0]} } if partition : info ['partition'] = int(partition) - 
logger.write({"module":"generate","action":"write","info":info} ) + logger.write({"module":"generate","action":"write","input":info} ) @staticmethod def callback(channel,method,header,stream): @@ -221,11 +222,11 @@ class Components : info = json.loads(stream) logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':SYS_ARGS['context']}) - logger.write({'module':'process','action':'read-partition','input':info['info']}) + logger.write({'module':'process','action':'read-partition','input':info['input']}) df = pd.DataFrame(info['data']) args = info['args'] if args['num_gpu'] > 1 : - args['gpu'] = int(info['info']['partition']) if info['info']['partition'] == 0 else info['info']['partition'] + 2 + args['gpu'] = int(info['info']['partition']) if info['input']['partition'] == 0 else info['input']['partition'] + 2 else: args['gpu'] = 0 @@ -237,11 +238,12 @@ class Components : # @TODO: Fix # There is an inconsistency in column/columns ... fix this shit! # - args['columns'] = args['column'] - (Components()).train(**args) - logger.write({"module":"process","action":"exit","info":info["info"]}) channel.close() channel.connection.close() + args['columns'] = args['column'] + (Components()).train(**args) + logger.write({"module":"process","action":"exit","input":info["input"]}) + pass if __name__ == '__main__' : @@ -280,18 +282,19 @@ if __name__ == '__main__' : if ''.join(content).isnumeric() : # # we have partitions we are working with - make = lambda args: (Components()).generate(args) + make = lambda _args: (Components()).generate(_args) jobs = [] - print (["Started ",len(jobs),"generators"]) + for id in ''.join(content) : args['partition'] = id - job = Process(target=make,args=(args,args)) - + job = Process(target=make,args=(args,)) + job.name = 'generator # '+str(id) job.start() jobs.append(job) - - while (len(jobs)> 0) : - jobs = [jobs for job in jobs if job.is_alive()] + + print (["Started ",len(jobs),"generator"+"s" if len(jobs)>1 else "" ]) + while len(jobs)> 0 : + jobs = [job for job in jobs if job.is_alive()] time.sleep(2) # generator.generate(args) From 8e5b475a01f1a246ff01f7cf414e3d403d4e9a19 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 13:27:03 -0600 Subject: [PATCH 050/250] inefficient data load (urgh) --- pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 65eda3e..15d562a 100644 --- a/pipeline.py +++ b/pipeline.py @@ -269,7 +269,8 @@ if __name__ == '__main__' : if 'file' in args : reader = lambda: pd.read_csv(args['file']) ; else: - reader = lambda: Components().get(args) + _df = Components().get(args) + reader = lambda: _df args['reader'] = reader if 'generate' in SYS_ARGS : From 7dfd4032863932283ddbe29797ff72e9197195dc Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 14:20:06 -0600 Subject: [PATCH 051/250] bug fix, gpu configuration memeory error --- pipeline.py | 51 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/pipeline.py b/pipeline.py index 15d562a..59745a9 100644 --- a/pipeline.py +++ b/pipeline.py @@ -143,29 +143,36 @@ class Components : _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) - _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 - os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' 
in args else '0' + # _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + + if args['num_gpu'] > 1 : + _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)[0] + else: + _args['gpu'] = 0 + _args['num_gpu'] = 1 + _args['no_value']= args['no_value'] # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 # credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') # _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() - reader = args['reader'] - df = reader() + # reader = args['reader'] + # df = reader() + df = args['reader']() if 'reader' in args else args['data'] # bounds = Components.split(df,MAX_ROWS,PART_SIZE) - if partition != '' : - columns = args['columns'] - df = np.array_split(df[columns].values,PART_SIZE) - df = pd.DataFrame(df[ int (partition) ],columns = columns) - info = {"parition":int(partition),"rows":df.shape[0],"cols":df.shape[0],"part_size":PART_SIZE} - logger.write({"module":"generate","action":"partition","input":info}) - + # if partition != '' : + # columns = args['columns'] + # df = np.array_split(df[columns].values,PART_SIZE) + # df = pd.DataFrame(df[ int (partition) ],columns = columns) + info = {"parition":int(partition),"rows":df.shape[0],"cols":df.shape[0],"part_size":PART_SIZE} + logger.write({"module":"generate","action":"partition","input":info}) + _args['data'] = df # _args['data'] = reader() #_args['data'] = _args['data'].astype(object) - _args['num_gpu'] = 1 - _args['gpu'] = partition + # _args['num_gpu'] = 1 + _dc = data.maker.generate(**_args) # # We need to post the generate the data in order to : @@ -226,7 +233,7 @@ class Components : df = pd.DataFrame(info['data']) args = info['args'] if args['num_gpu'] > 1 : - args['gpu'] = int(info['info']['partition']) if info['input']['partition'] == 0 else info['input']['partition'] + 2 + args['gpu'] = int(info['input']['partition']) if info['input']['partition'] < 8 else np.random.choice(np.arange(8),1).astype(int)[0] else: args['gpu'] = 0 @@ -269,8 +276,8 @@ if __name__ == '__main__' : if 'file' in args : reader = lambda: pd.read_csv(args['file']) ; else: - _df = Components().get(args) - reader = lambda: _df + DATA = Components().get(args) + reader = lambda: DATA args['reader'] = reader if 'generate' in SYS_ARGS : @@ -279,15 +286,23 @@ if __name__ == '__main__' : content = os.listdir( os.sep.join([args['logs'],args['context']])) generator = Components() - + DATA = reader() if ''.join(content).isnumeric() : # # we have partitions we are working with make = lambda _args: (Components()).generate(_args) jobs = [] - + del args['reader'] + columns = DATA.columns.tolist() + DATA = np.array_split(DATA[args['columns']],len(content)) for id in ''.join(content) : args['partition'] = id + args['data'] = pd.DataFrame(DATA[(int(id))],columns=args['columns']) + if args['num_gpu'] > 0 : + args['gpu'] = id + else: + args['gpu']=0 + args['num_gpu']=1 job = Process(target=make,args=(args,)) job.name = 'generator # '+str(id) job.start() From 411aa170ebc00b00032ba6055494a8c22c0013e2 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 14:33:41 -0600 Subject: [PATCH 052/250] gpu fix --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 59745a9..e6283fd 100644 --- a/pipeline.py +++ b/pipeline.py @@ -165,7 +165,7 @@ class Components : # 
columns = args['columns'] # df = np.array_split(df[columns].values,PART_SIZE) # df = pd.DataFrame(df[ int (partition) ],columns = columns) - info = {"parition":int(partition),"rows":df.shape[0],"cols":df.shape[0],"part_size":PART_SIZE} + info = {"parition":int(partition),"gpu":_args["gpu"],"rows":df.shape[0],"cols":df.shape[1],"part_size":PART_SIZE} logger.write({"module":"generate","action":"partition","input":info}) _args['data'] = df From ff6ae5a622b05b5f45915168ef8135f0dc9ed713 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 14:42:40 -0600 Subject: [PATCH 053/250] ... --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index e6283fd..dfa0331 100644 --- a/pipeline.py +++ b/pipeline.py @@ -150,7 +150,7 @@ class Components : else: _args['gpu'] = 0 _args['num_gpu'] = 1 - + os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) _args['no_value']= args['no_value'] # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 From 6f51eedca80dde48264647a8fefb03d742bd2506 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 14:45:22 -0600 Subject: [PATCH 054/250] xx --- pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index dfa0331..2ce90a9 100644 --- a/pipeline.py +++ b/pipeline.py @@ -298,11 +298,11 @@ if __name__ == '__main__' : for id in ''.join(content) : args['partition'] = id args['data'] = pd.DataFrame(DATA[(int(id))],columns=args['columns']) - if args['num_gpu'] > 0 : + if args['num_gpu'] > 1 : args['gpu'] = id else: args['gpu']=0 - args['num_gpu']=1 + job = Process(target=make,args=(args,)) job.name = 'generator # '+str(id) job.start() From 49177957b8e5f7f96621ec2e261f9a59bb2b815a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 14:56:28 -0600 Subject: [PATCH 055/250] ... --- pipeline.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 2ce90a9..9d9c097 100644 --- a/pipeline.py +++ b/pipeline.py @@ -78,6 +78,7 @@ class Components : df = np.array_split(df[columns].values,PART_SIZE) qwriter = factory.instance(type='queue.QueueWriter',args={'queue':'aou.io'}) part_index = 0 + # # let's start n processes to listen & train this mother ... 
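The GPU fixes in the preceding patches converge on one rule: derive the visible device from the partition index before TensorFlow initializes, and fall back to a random slot when the index exceeds the available devices. A condensed sketch of that rule (the 8-GPU ceiling is the same hard-coded value used above):

import os
import numpy as np

def assign_gpu(partition, max_gpus=8):
    # use the partition index as the GPU slot when possible, otherwise pick one at random,
    # then restrict the process to that device
    gpu = int(partition) if int(partition) < max_gpus else np.random.choice(np.arange(max_gpus), 1).astype(int)[0]
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
    return gpu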
# @@ -145,7 +146,7 @@ class Components : _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) # _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 - if args['num_gpu'] > 1 : + if int(args['num_gpu']) > 1 : _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)[0] else: _args['gpu'] = 0 @@ -295,10 +296,17 @@ if __name__ == '__main__' : del args['reader'] columns = DATA.columns.tolist() DATA = np.array_split(DATA[args['columns']],len(content)) + for id in ''.join(content) : + if 'focus' in args and int(args['focus']) != int(id) : + # + # This handles failures/recoveries for whatever reason + # If we are only interested in generating data for a given partition + continue + args['partition'] = id args['data'] = pd.DataFrame(DATA[(int(id))],columns=args['columns']) - if args['num_gpu'] > 1 : + if int(args['num_gpu']) > 1 : args['gpu'] = id else: args['gpu']=0 From e02a4a60abd8a936d77b5720beb0e27a34718307 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 15:26:18 -0600 Subject: [PATCH 056/250] acceptance criteria fix --- data/gan.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/data/gan.py b/data/gan.py index a591f34..80c3f8e 100644 --- a/data/gan.py +++ b/data/gan.py @@ -584,7 +584,7 @@ class Predict(GNet): p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values - if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size: + if x.max() == 1 and np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size and x.size == self.values.size: ratio.append(np.divide( np.sum(x), x.size)) found.append(df) if i == CANDIDATE_COUNT: @@ -606,7 +606,9 @@ class Predict(GNet): # r = np.zeros((self.ROW_COUNT,len(columns))) # r = np.zeros(self.ROW_COUNT) - + if self.logger : + info = {"found":len(found),"selected":INDEX, "ratio": ratio[INDEX],"rows":df.shape[0],"cols":df.shape[1]} + self.logger.write({"module":"gan-generate","action":"generate","input":info}) df.columns = self.values if len(found): # print (len(found),NTH_VALID_CANDIDATE) From 78718b6c42793d8df227f4d08f800f5dc4b89cfb Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 15:39:13 -0600 Subject: [PATCH 057/250] ... --- data/gan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index 80c3f8e..7952f23 100644 --- a/data/gan.py +++ b/data/gan.py @@ -584,7 +584,7 @@ class Predict(GNet): p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values - if x.max() == 1 and np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size and x.size == self.values.size: + if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size and x.size == self.values.size: ratio.append(np.divide( np.sum(x), x.size)) found.append(df) if i == CANDIDATE_COUNT: From 97669f3b6b5100be7397de81967e6785c5d915da Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 15:42:13 -0600 Subject: [PATCH 058/250] setup ... 
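The gpu fixes in patches 052 through 055 above keep adjusting one mapping: when several GPUs are assumed, a worker uses its partition index as its GPU id, falls back to a random device once the index exceeds the eight-device pool the code assumes, and otherwise pins everything to device 0, exporting CUDA_VISIBLE_DEVICES either way. A minimal standalone sketch of that selection logic, with an illustrative helper name rather than anything from the patches, is:

    import os
    import numpy as np

    MAX_DEVICES = 8  # the patches assume at most 8 GPUs per host

    def pick_gpu(partition, num_gpu):
        """Map a partition index to a GPU id (illustrative helper, not project code)."""
        if int(num_gpu) > 1:
            # use the partition index directly when it fits the device pool,
            # otherwise fall back to a random device
            gpu = int(partition) if int(partition) < MAX_DEVICES else int(np.random.choice(np.arange(MAX_DEVICES)))
        else:
            gpu = 0
        # pin the process to a single visible device so TensorFlow does not grab them all
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
        return gpu

    if __name__ == '__main__':
        print(pick_gpu(partition=3, num_gpu=4))   # -> 3
        print(pick_gpu(partition=11, num_gpu=4))  # -> some device in 0..7
        print(pick_gpu(partition=5, num_gpu=1))   # -> 0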
--- data/gan.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/data/gan.py b/data/gan.py index 7952f23..bf27b3b 100644 --- a/data/gan.py +++ b/data/gan.py @@ -607,7 +607,7 @@ class Predict(GNet): # r = np.zeros((self.ROW_COUNT,len(columns))) # r = np.zeros(self.ROW_COUNT) if self.logger : - info = {"found":len(found),"selected":INDEX, "ratio": ratio[INDEX],"rows":df.shape[0],"cols":df.shape[1]} + info = {"found":len(found),"selected":INDEX, "ratio": ratio[INDEX],"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)} self.logger.write({"module":"gan-generate","action":"generate","input":info}) df.columns = self.values if len(found): diff --git a/setup.py b/setup.py index bcacb62..bf63cb0 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.1.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From d72fb6b4e34a1a88323fd37fb13211b3aea2bda1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 16:22:37 -0600 Subject: [PATCH 059/250] bug fix ... --- data/gan.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/data/gan.py b/data/gan.py index bf27b3b..1df26a3 100644 --- a/data/gan.py +++ b/data/gan.py @@ -578,18 +578,20 @@ class Predict(GNet): # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes # The code below will insure we have some acceptable cardinal relationships between id and synthetic values # - df = pd.DataFrame(np.round(f).astype(np.int32)) + + df = pd.DataFrame(np.round(f)).astype(np.int32) p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values - if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size and x.size == self.values.size: + if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size : ratio.append(np.divide( np.sum(x), x.size)) found.append(df) if i == CANDIDATE_COUNT: break else: + continue # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms @@ -597,9 +599,13 @@ class Predict(GNet): # # In case we are dealing with actual values like diagnosis codes we can perform # - - INDEX = np.random.choice(np.arange(len(found)),1)[0] - INDEX = ratio.index(np.max(ratio)) + _index = [found.index(item) for item in found if item.shape[1] == len(self.values)] + if not _index : + INDEX = np.random.choice(np.arange(len(found)),1)[0] + INDEX = ratio.index(np.max(ratio)) + else: + INDEX = _index[0] + df = found[INDEX] columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] From 718e57840159558cb0bc2c6773a41919369652d0 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 7 Mar 2020 09:16:17 -0600 Subject: [PATCH 060/250] bug fix, trainer --- data/gan.py | 4 +-- data/maker/__init__.py | 8 ++++-- pipeline.py | 60 +++++++++++++++++++++++++++++++++--------- 3 files changed, 55 insertions(+), 
17 deletions(-) diff --git a/data/gan.py b/data/gan.py index 1df26a3..898d4ea 100644 --- a/data/gan.py +++ b/data/gan.py @@ -581,7 +581,6 @@ class Predict(GNet): df = pd.DataFrame(np.round(f)).astype(np.int32) - p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values @@ -599,7 +598,8 @@ class Predict(GNet): # # In case we are dealing with actual values like diagnosis codes we can perform # - _index = [found.index(item) for item in found if item.shape[1] == len(self.values)] + N = len(found) + _index = [i for i in range(0,N) if found[i].shape[1] == len(self.values)] if not _index : INDEX = np.random.choice(np.arange(len(found)),1)[0] INDEX = ratio.index(np.max(ratio)) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 080939c..f4bce16 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -37,11 +37,14 @@ class ContinuousToDiscrete : index = BOUNDS.index(row) x_[index] = 1 break - + # + # for items in BOUNDS : + # index = BOUNDS.index(items) return _matrix @staticmethod def bounds(x,n): + # return np.array_split(x,n) return list(pd.cut(np.array( np.round(x,ContinuousToDiscrete.ROUND_UP) ),n).categories) @@ -175,7 +178,8 @@ def generate(**args): handler.load_meta(col) r = handler.apply() BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - _df[col] = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if 'float' in df[col].dtypes.name or col in CONTINUOUS else r[col] + _df[col] = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col] + # _df[col] = r[col] # # @TODO: log basic stats about the synthetic attribute # diff --git a/pipeline.py b/pipeline.py index 9d9c097..6234c26 100644 --- a/pipeline.py +++ b/pipeline.py @@ -50,11 +50,12 @@ class Components : """ # # @TODO: we need to log something here about the parameters being passed - pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) - df = pointer() - if df.shape[0] == 0 : - print ("CAN NOT TRAIN EMPTY DATASET ") - return + # pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) + df = args['reader']() + + # if df.shape[0] == 0 : + # print ("CAN NOT TRAIN EMPTY DATASET ") + # return # # Now we can parse the arguments and submit the entire thing to training # @@ -113,18 +114,29 @@ class Components : pass else: + print ('.....') partition = args['partition'] if 'partition' in args else '' - log_folder = os.sep.join([log_folder,args['context'],partition]) + log_folder = os.sep.join([log_folder,args['context'],str(partition)]) _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) - _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 - os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' + + # + # We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel + # + if int(args['num_gpu']) > 1 : + _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)[0] + else: + _args['gpu'] = 0 + _args['num_gpu'] = 1 + os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) + _args['data'] = df # # @log : # Logging information about the training process for this partition (or not) # + info = {"rows":df.shape[0],"cols":df.shape[1], "partition":int(partition),"logs":_args['logs']} 
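Patches 059 and 060 above also rework how Predict.apply() screens its generated candidates: among the candidate one-hot frames that were kept, prefer one whose column count matches the expected value list, and only fall back to the best row-coverage ratio when no candidate is full width. A small self-contained sketch of that selection rule, with toy candidates and an illustrative function name, is:

    import numpy as np
    import pandas as pd

    def pick_candidate(found, ratio, values):
        """Pick one candidate one-hot frame (illustrative version of the selection rule)."""
        # candidates that expose a column for every expected value
        full = [i for i in range(len(found)) if found[i].shape[1] == len(values)]
        if full:
            return found[full[0]]
        # otherwise fall back to the candidate with the best row-coverage ratio
        return found[ratio.index(max(ratio))]

    if __name__ == '__main__':
        values = ['a', 'b', 'c']
        c1 = pd.DataFrame(np.eye(3, 2, dtype=np.int32))  # only 2 of the 3 values represented
        c2 = pd.DataFrame(np.eye(3, 3, dtype=np.int32))  # all 3 values represented
        found, ratio = [c1, c2], [0.95, 0.90]
        print(pick_candidate(found, ratio, values).shape)  # (3, 3)

The full-width candidate wins even when a narrower one has a slightly better coverage ratio, which is the behavior the two patches converge on.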
logger.write({"module":"train","action":"train","input":info}) @@ -291,7 +303,7 @@ if __name__ == '__main__' : if ''.join(content).isnumeric() : # # we have partitions we are working with - make = lambda _args: (Components()).generate(_args) + jobs = [] del args['reader'] columns = DATA.columns.tolist() @@ -310,13 +322,13 @@ if __name__ == '__main__' : args['gpu'] = id else: args['gpu']=0 - + make = lambda _args: (Components()).generate(_args) job = Process(target=make,args=(args,)) job.name = 'generator # '+str(id) job.start() jobs.append(job) - print (["Started ",len(jobs),"generator"+"s" if len(jobs)>1 else "" ]) + print (["Started ",len(jobs),"generators" if len(jobs)>1 else "generator" ]) while len(jobs)> 0 : jobs = [job for job in jobs if job.is_alive()] time.sleep(2) @@ -358,9 +370,31 @@ if __name__ == '__main__' : # qreader.read(1) pass else: + PART_SIZE = int(args['jobs']) if 'jobs' in args else 8 + DATA = reader() + DATA = np.array_split(DATA[args['columns']],PART_SIZE) + jobs = [] + for index in range(0,int(args['jobs'])) : + if 'focus' in args and int(args['focus']) != index : + continue + args['partition'] = index + _df = pd.DataFrame(DATA[index],columns=args['columns']) + args['reader'] = lambda: _df + make = lambda _args: (Components()).train(**_args) + job = Process(target=make,args=(args,)) + job.name = 'Trainer # ' + str(index) + job.start() + jobs.append(job) + # args['gpu'] + print (["Started ",len(jobs),"trainers" if len(jobs)>1 else "trainer" ]) + while len(jobs)> 0 : + jobs = [job for job in jobs if job.is_alive()] + time.sleep(2) + + # trainer = Components() + # trainer.train(**args) + - trainer = Components() - trainer.train(**args) # Components.train(**args) #for args in PIPELINE : #args['dataset'] = 'combined20190510' From 330d6b6ae681dcc50f647d17a777354980fa3f58 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 8 Mar 2020 08:48:38 -0500 Subject: [PATCH 061/250] bug fix with partition & data -access --- data/gan.py | 43 +++++++--- data/maker/__init__.py | 27 ++++-- pipeline.py | 184 ++++++++++++++++++----------------------- 3 files changed, 131 insertions(+), 123 deletions(-) diff --git a/data/gan.py b/data/gan.py index 898d4ea..a6d35e1 100644 --- a/data/gan.py +++ b/data/gan.py @@ -532,10 +532,13 @@ class Predict(GNet): self.generator = Generator(**args) self.values = args['values'] self.ROW_COUNT = args['row_count'] + self.oROW_COUNT = self.ROW_COUNT + self.MISSING_VALUES = args['no_value'] def load_meta(self, column): super().load_meta(column) self.generator.load_meta(column) + self.ROW_COUNT = self.oROW_COUNT def apply(self,**args): # print (self.train_dir) # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] @@ -544,6 +547,7 @@ class Predict(GNet): demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] tf.compat.v1.reset_default_graph() z = tf.random.normal(shape=[self.ROW_COUNT, self.Z_DIM]) + y = tf.compat.v1.placeholder(shape=[self.ROW_COUNT, self.NUM_LABELS], dtype=tf.int32) if self._LABEL is not None : ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] @@ -569,6 +573,8 @@ class Predict(GNet): found = [] ratio = [] + __x__ = None + __ratio=0 for i in np.arange(CANDIDATE_COUNT) : if labels : f = sess.run(fake,feed_dict={y:labels}) @@ -590,7 +596,8 @@ class Predict(GNet): if i == CANDIDATE_COUNT: break else: - + __x__ = df if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __x__ + 
__ratio = np.divide( np.sum(x), x.size) if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __ratio continue # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms @@ -600,23 +607,33 @@ class Predict(GNet): # N = len(found) _index = [i for i in range(0,N) if found[i].shape[1] == len(self.values)] - if not _index : - INDEX = np.random.choice(np.arange(len(found)),1)[0] - INDEX = ratio.index(np.max(ratio)) - else: - INDEX = _index[0] + if not _index and not found : + df = __x__ + INDEX = -1 + else : + if not _index : + INDEX = np.random.choice(np.arange(len(found)),1)[0] + INDEX = ratio.index(np.max(ratio)) + else: + INDEX = _index[0] - df = found[INDEX] + df = found[INDEX] columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] # r = np.zeros((self.ROW_COUNT,len(columns))) # r = np.zeros(self.ROW_COUNT) if self.logger : - info = {"found":len(found),"selected":INDEX, "ratio": ratio[INDEX],"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)} + info = {"found":len(found),"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)} + if INDEX > 0 : + info =dict(info ,**{"selected":INDEX, "ratio": ratio[INDEX] }) + else : + + info['selected'] = -1 + info['ratio'] = __ratio self.logger.write({"module":"gan-generate","action":"generate","input":info}) df.columns = self.values - if len(found): + if len(found) or df.columns.size == len(self.values): # print (len(found),NTH_VALID_CANDIDATE) # x = df * self.values # @@ -639,10 +656,14 @@ class Predict(GNet): df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) df.columns = columns df = df[columns[0]].append(pd.Series(missing)) - + if self.logger : + + info= {"missing": i.size,"rows":df.shape[0],"cols":1} + self.logger.write({"module":"gan-generate","action":"compile.io","input":info}) + - + # print(df.head()) tf.compat.v1.reset_default_graph() df = pd.DataFrame(df) df.columns = columns diff --git a/data/maker/__init__.py b/data/maker/__init__.py index f4bce16..4be97b8 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -107,23 +107,33 @@ def train (**args) : # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values # if 'float' not in df[col].dtypes.name : # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values - if 'float' in df[col].dtypes.name and col in CONTINUOUS: + if col in CONTINUOUS: BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32) else: - args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values + df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False) + # print (df[col].dtypes) + # print (df[col].dropna/(axis=1).unique()) + args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values + + - args['column'] = col - args['context'] = col context = args['context'] if 'store' in args : args['store']['args']['doc'] = context logger = factory.instance(**args['store']) args['logger'] = logger + info = {"rows":args['real'].shape[0],"cols":args['real'].shape[1],"name":col} + logger.write({"module":"gan-train","action":"data-prep","input":info}) else: logger = None + args['column'] = col + args['context'] = col + + # + # If the s trainer = gan.Train(**args) trainer.apply() def post(**args): @@ -149,6 +159,7 @@ def generate(**args): """ # df = args['data'] df = 
args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) + CONTINUOUS = args['continous'] if 'continuous' in args else [] column = args['column'] if (isinstance(args['column'],list)) else [args['column']] # column_id = args['id'] @@ -168,7 +179,8 @@ def generate(**args): # values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE) # # values = np.unique(values).tolist() # else: - values = df[col].unique().tolist() + values = df[col].dropna().unique().tolist() + args['values'] = values args['row_count'] = df.shape[0] @@ -178,8 +190,9 @@ def generate(**args): handler.load_meta(col) r = handler.apply() BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - _df[col] = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col] - # _df[col] = r[col] + + # _df[col] = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col] + _df[col] = r[col] # # @TODO: log basic stats about the synthetic attribute # diff --git a/pipeline.py b/pipeline.py index 6234c26..0f2c258 100644 --- a/pipeline.py +++ b/pipeline.py @@ -30,11 +30,13 @@ class Components : condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')']) SQL = " ".join([SQL,'WHERE',condition]) - SQL = SQL.replace(':dataset',args['dataset']) #+ " LIMIT 1000 " + SQL = SQL.replace(':dataset',args['dataset']) #+ " LI " + if 'limit' in args : - SQL = SQL + 'LIMIT ' + args['limit'] + SQL = SQL + ' LIMIT ' + args['limit'] + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - df = pd.read_gbq(SQL,credentials=credentials,dialect='standard') + df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').astype(object) return df # return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna() @@ -51,7 +53,8 @@ class Components : # # @TODO: we need to log something here about the parameters being passed # pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) - df = args['reader']() + df = args['data'] + # if df.shape[0] == 0 : # print ("CAN NOT TRAIN EMPTY DATASET ") @@ -62,85 +65,43 @@ class Components : logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) log_folder = args['logs'] if 'logs' in args else 'logs' - _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + # _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} - _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) - _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 - _args['gpu'] = args['gpu'] if 'gpu' in args else 0 + # _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) + # _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + # _args['gpu'] = args['gpu'] if 'gpu' in args else 0 - # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 - PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 - - if 'partition' not in args: - lbound = 0 - # bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) - # bounds = Components.split(df,MAX_ROWS,PART_SIZE) - columns = args['columns'] - df = np.array_split(df[columns].values,PART_SIZE) - qwriter = 
factory.instance(type='queue.QueueWriter',args={'queue':'aou.io'}) - part_index = 0 - - # - # let's start n processes to listen & train this mother ... - # - #-- hopefully they learn as daemons + # # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 + PART_SIZE = int(args['part_size']) - for _df in df: - - # _args['logs'] = os.sep.join([log_folder,str(part_index)]) - _args['partition'] = str(part_index) - _args['logger'] = {'args':{'dbname':'aou','doc':args['context']},'type':'mongo.MongoWriter'} - - # - # We should post the the partitions to a queue server (at least the instructions on ): - # - where to get the data - # - and athe arguments to use (partition #,columns,gpu,epochs) - # + partition = args['partition'] + log_folder = os.sep.join([log_folder,args['context'],str(partition)]) + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) - _df = pd.DataFrame(_df,columns=columns) - # print (columns) - - info = {"rows":_df.shape[0],"cols":_df.shape[1], "partition":part_index,"logs":_args['logs'],"num_gpu":1,"part_size":PART_SIZE} - p = {"args":_args,"data":_df.to_dict(orient="records"),"input":info} - part_index += 1 - qwriter.write(p) - # - # @TODO: - # - Notify that information was just posted to the queue - # In case we want slow-mode, we can store the partitions in mongodb and process (Yes|No)? - # - - logger.write({"module":"train","action":"setup-partition","input":info}) - - pass + # + # We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel + # + if int(args['num_gpu']) > 1 : + _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)[0] else: - print ('.....') - partition = args['partition'] if 'partition' in args else '' - log_folder = os.sep.join([log_folder,args['context'],str(partition)]) - _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} - _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) - - # - # We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel - # - if int(args['num_gpu']) > 1 : - _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)[0] - else: - _args['gpu'] = 0 - _args['num_gpu'] = 1 - os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) - - - _args['data'] = df - # - # @log : - # Logging information about the training process for this partition (or not) - # - - info = {"rows":df.shape[0],"cols":df.shape[1], "partition":int(partition),"logs":_args['logs']} - - logger.write({"module":"train","action":"train","input":info}) - data.maker.train(**_args) + _args['gpu'] = 0 + _args['num_gpu'] = 1 + os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) + + _args['store'] = {'type':'mongo.MongoWriter','args':{'dbname':'aou','doc':args['context']}} + _args['data'] = args['data'] + + # print (['partition ',partition,df.value_source_concept_id.unique()]) + # + # @log : + # Logging information about the training process for this partition (or not) + # + + info = {"rows":df.shape[0],"cols":df.shape[1], "partition":int(partition),"logs":_args['logs']} + + logger.write({"module":"train","action":"train","input":info}) + data.maker.train(**_args) pass @@ -210,6 +171,7 
@@ class Components : # #-- Let us store all of this into bigquery prefix = args['notify']+'.'+_args['context'] + partition = str(partition) table = '_'.join([prefix,partition,'io']).replace('__','_') folder = os.sep.join([args['logs'],args['context'],partition,'output']) if 'file' in args : @@ -219,17 +181,19 @@ class Components : data_comp.to_csv( _pname,index=False) _args['data'].to_csv(_fname,index=False) - + _id = 'path' else: credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') _pname = os.sep.join([folder,table+'.csv']) _fname = table.replace('_io','_full_io') - data_comp.to_gbq(if_exists='replace',destination_table=_pname,credentials='credentials',chunk_size=50000) + partial = '.'.join(['io',args['context']+'_partial_io']) + complete= '.'.join(['io',args['context']+'_full_io']) + data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=50000) data_comp.to_csv(_pname,index=False) INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' - _args['data'].to_gbq(if_exists=INSERT_FLAG,destination_table=_fname,credentials='credentials',chunk_size=50000) - - info = {"full":{"path":_fname,"rows":_args['data'].shape[0]},"compare":{"name":_pname,"rows":data_comp.shape[0]} } + _args['data'].to_gbq(if_exists=INSERT_FLAG,destination_table=complete,credentials=credentials,chunksize=50000) + _id = 'dataset' + info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } if partition : info ['partition'] = int(partition) logger.write({"module":"generate","action":"write","input":info} ) @@ -280,18 +244,18 @@ if __name__ == '__main__' : args['logs'] = args['logs'] if 'logs' in args else 'logs' if 'dataset' not in args : args['dataset'] = 'combined20191004v2_deid' - + PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 # # @TODO: # Log what was initiated so we have context of this processing ... 
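The generate() rewrite above produces two artifacts per partition: a side-by-side comparison frame (each real column joined to its synthetic counterpart under an _io suffix) that is appended to a *_partial_io table, and the original rows with the synthetic columns swapped in, which go to *_full_io. A toy sketch of that assembly step, with made-up frame contents and a hypothetical context name, is:

    import pandas as pd

    # toy stand-ins for the real partition and the synthesized columns
    real = pd.DataFrame({'person_id': [1, 2, 3], 'gender': ['M', 'F', 'F']})
    synthetic = pd.DataFrame({'gender': ['F', 'F', 'M']})
    columns = ['gender']
    context = 'demo'  # hypothetical context name

    # side-by-side comparison frame: real value next to its synthetic counterpart
    data_comp = real[columns].join(synthetic[columns], rsuffix='_io')

    # full dataset with the synthetic columns swapped in
    full = real.copy()
    for name in columns:
        full[name] = synthetic[name]

    # destination tables mirror the naming convention used above (illustrative only)
    partial_table = '.'.join(['io', context + '_partial_io'])
    complete_table = '.'.join(['io', context + '_full_io'])
    print(partial_table, complete_table)
    print(data_comp)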
# if 'listen' not in SYS_ARGS : if 'file' in args : - reader = lambda: pd.read_csv(args['file']) ; + DATA = pd.read_csv(args['file']) ; else: DATA = Components().get(args) - reader = lambda: DATA - args['reader'] = reader + COLUMNS = DATA.columns + DATA = np.array_split(DATA,PART_SIZE) if 'generate' in SYS_ARGS : # @@ -299,32 +263,34 @@ if __name__ == '__main__' : content = os.listdir( os.sep.join([args['logs'],args['context']])) generator = Components() - DATA = reader() + if ''.join(content).isnumeric() : # # we have partitions we are working with jobs = [] - del args['reader'] - columns = DATA.columns.tolist() - DATA = np.array_split(DATA[args['columns']],len(content)) + + # columns = DATA.columns.tolist() + + # DATA = np.array_split(DATA,PART_SIZE) - for id in ''.join(content) : - if 'focus' in args and int(args['focus']) != int(id) : + for index in range(0,PART_SIZE) : + if 'focus' in args and int(args['focus']) != index : # # This handles failures/recoveries for whatever reason # If we are only interested in generating data for a given partition continue - - args['partition'] = id - args['data'] = pd.DataFrame(DATA[(int(id))],columns=args['columns']) + # index = id.index(id) + + args['partition'] = index + args['data'] = DATA[index] if int(args['num_gpu']) > 1 : - args['gpu'] = id + args['gpu'] = index else: args['gpu']=0 make = lambda _args: (Components()).generate(_args) job = Process(target=make,args=(args,)) - job.name = 'generator # '+str(id) + job.name = 'generator # '+str(index) job.start() jobs.append(job) @@ -370,18 +336,26 @@ if __name__ == '__main__' : # qreader.read(1) pass else: - PART_SIZE = int(args['jobs']) if 'jobs' in args else 8 - DATA = reader() - DATA = np.array_split(DATA[args['columns']],PART_SIZE) + + # DATA = np.array_split(DATA,PART_SIZE) + jobs = [] - for index in range(0,int(args['jobs'])) : + for index in range(0,PART_SIZE) : if 'focus' in args and int(args['focus']) != index : continue + args['part_size'] = PART_SIZE args['partition'] = index - _df = pd.DataFrame(DATA[index],columns=args['columns']) - args['reader'] = lambda: _df + # _df = pd.DataFrame(DATA[index],columns=args['columns']) + args['data'] = DATA[index] + args['data'].to_csv('aou-'+str(index)+'csv',index=False) + # args['reader'] = lambda: _df + if int(args['num_gpu']) > 1 : + args['gpu'] = index + else: + args['gpu']=0 + make = lambda _args: (Components()).train(**_args) - job = Process(target=make,args=(args,)) + job = Process(target=make,args=( dict(args),)) job.name = 'Trainer # ' + str(index) job.start() jobs.append(job) From 266bdc8bd282ca5b1588434a18f8dcbc3067fb1b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 8 Mar 2020 15:00:26 -0500 Subject: [PATCH 062/250] bug fix with batch_size (GPU load) --- pipeline.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 0f2c258..418ccbf 100644 --- a/pipeline.py +++ b/pipeline.py @@ -78,7 +78,8 @@ class Components : log_folder = os.sep.join([log_folder,args['context'],str(partition)]) _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) - + if 'batch_size' in args : + _args['batch_size'] = int(args['batch_size']) # # We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel # @@ -118,6 +119,8 @@ class Components : _args = 
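The driver loop above fans the split partitions out to one process each, pins a GPU per index, and then polls until every child exits. A compressed, runnable sketch of that dispatch pattern follows; the worker body is a placeholder for the real generate/train call:

    import time
    import numpy as np
    import pandas as pd
    from multiprocessing import Process

    def worker(args):
        # placeholder for Components().generate(args) or Components().train(**args)
        print('partition', args['partition'], 'gpu', args['gpu'], 'rows', args['data'].shape[0])

    if __name__ == '__main__':
        PART_SIZE = 4
        DATA = pd.DataFrame({'x': range(100)})
        parts = np.array_split(DATA, PART_SIZE)

        jobs = []
        for index in range(PART_SIZE):
            args = {'partition': index, 'data': parts[index], 'gpu': index}
            job = Process(target=worker, args=(args,))
            job.name = 'worker # ' + str(index)
            job.start()
            jobs.append(job)

        # poll until every child process has finished
        while jobs:
            jobs = [job for job in jobs if job.is_alive()]
            time.sleep(2)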
{"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) # _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + if 'batch_size' in args : + _args['batch_size'] = int(args['batch_size']) if int(args['num_gpu']) > 1 : _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)[0] From e07c3553884fc9726cc464e9523f28a1a7f55794 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 8 Mar 2020 19:33:08 -0500 Subject: [PATCH 063/250] bug fix, with logs and partitioning --- data/gan.py | 11 +++++++---- data/maker/__init__.py | 4 ++-- pipeline.py | 8 +++++--- setup.py | 2 +- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/data/gan.py b/data/gan.py index a6d35e1..3c41f59 100644 --- a/data/gan.py +++ b/data/gan.py @@ -59,6 +59,7 @@ class GNet : self.logs = {} self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] + self.PARTITION = args['partition'] # if self.NUM_GPUS > 1 : # os.environ['CUDA_VISIBLE_DEVICES'] = "4" @@ -356,7 +357,7 @@ class Train (GNet): self.meta = self.log_meta() if(self.logger): - self.logger.write({"module":"gan-train","action":"start","input":self.meta} ) + self.logger.write({"module":"gan-train","action":"start","input":{"partition":self.PARTITION,"meta":self.meta} } ) # self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta) def load_meta(self, column): @@ -408,7 +409,7 @@ class Train (GNet): # losses = tf.compat.v1.get_collection(flag, scope) total_loss = tf.add_n(losses, name='total_loss') - + print (total_loss) return total_loss, w def input_fn(self): """ @@ -514,7 +515,7 @@ class Train (GNet): # # if self.logger : - row = {"module":"gan-train","action":"logs","input":logs} #,"model":pickle.dump(sess)} + row = {"module":"gan-train","action":"logs","input":{"partition":self.PARTITION,"logs":logs}} #,"model":pickle.dump(sess)} self.logger.write(row) # # @TODO: @@ -623,6 +624,7 @@ class Predict(GNet): # r = np.zeros((self.ROW_COUNT,len(columns))) # r = np.zeros(self.ROW_COUNT) + if self.logger : info = {"found":len(found),"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)} if INDEX > 0 : @@ -631,6 +633,7 @@ class Predict(GNet): info['selected'] = -1 info['ratio'] = __ratio + info['partition'] = self.PARTITION self.logger.write({"module":"gan-generate","action":"generate","input":info}) df.columns = self.values if len(found) or df.columns.size == len(self.values): @@ -658,7 +661,7 @@ class Predict(GNet): df = df[columns[0]].append(pd.Series(missing)) if self.logger : - info= {"missing": i.size,"rows":df.shape[0],"cols":1} + info= {"missing": i.size,"rows":df.shape[0],"cols":1,'partition':self.PARTITION} self.logger.write({"module":"gan-generate","action":"compile.io","input":info}) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 4be97b8..729654f 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -111,7 +111,7 @@ def train (**args) : BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32) else: - df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False) + # df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False) # print (df[col].dtypes) # print (df[col].dropna/(axis=1).unique()) args['real'] = 
pd.get_dummies(df[col].dropna()).astype(np.float32).values @@ -124,7 +124,7 @@ def train (**args) : args['store']['args']['doc'] = context logger = factory.instance(**args['store']) args['logger'] = logger - info = {"rows":args['real'].shape[0],"cols":args['real'].shape[1],"name":col} + info = {"rows":args['real'].shape[0],"cols":args['real'].shape[1],"name":col,"partition":args['partition']} logger.write({"module":"gan-train","action":"data-prep","input":info}) else: diff --git a/pipeline.py b/pipeline.py index 418ccbf..89ba16f 100644 --- a/pipeline.py +++ b/pipeline.py @@ -89,7 +89,8 @@ class Components : _args['gpu'] = 0 _args['num_gpu'] = 1 os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) - + _args['partition'] = int(partition) + _args['continuous']= args['continuous'] if 'continuous' in args else [] _args['store'] = {'type':'mongo.MongoWriter','args':{'dbname':'aou','doc':args['context']}} _args['data'] = args['data'] @@ -144,7 +145,8 @@ class Components : # df = pd.DataFrame(df[ int (partition) ],columns = columns) info = {"parition":int(partition),"gpu":_args["gpu"],"rows":df.shape[0],"cols":df.shape[1],"part_size":PART_SIZE} logger.write({"module":"generate","action":"partition","input":info}) - + _args['partition'] = int(partition) + _args['continuous']= args['continuous'] if 'continuous' in args else [] _args['data'] = df # _args['data'] = reader() #_args['data'] = _args['data'].astype(object) @@ -194,7 +196,7 @@ class Components : data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=50000) data_comp.to_csv(_pname,index=False) INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' - _args['data'].to_gbq(if_exists=INSERT_FLAG,destination_table=complete,credentials=credentials,chunksize=50000) + _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=50000) _id = 'dataset' info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } if partition : diff --git a/setup.py b/setup.py index bf63cb0..5a8f7b6 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 8455cd7554acfac8927bab1d8a21015209ed14a3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 8 Mar 2020 20:27:27 -0500 Subject: [PATCH 064/250] bug fix: typo --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 729654f..354b78f 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -160,7 +160,7 @@ def generate(**args): # df = args['data'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) - CONTINUOUS = args['continous'] if 'continuous' in args else [] + CONTINUOUS = args['continuous'] if 
'continuous' in args else [] column = args['column'] if (isinstance(args['column'],list)) else [args['column']] # column_id = args['id'] # From bbd03c4a63aeb109dd878d4e50cd9b8568bf8b45 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 9 Mar 2020 13:10:26 -0500 Subject: [PATCH 065/250] bug fix with GPU allocation --- pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index 89ba16f..7a2cf3a 100644 --- a/pipeline.py +++ b/pipeline.py @@ -84,7 +84,7 @@ class Components : # We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel # if int(args['num_gpu']) > 1 : - _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)[0] + _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int) else: _args['gpu'] = 0 _args['num_gpu'] = 1 @@ -124,7 +124,7 @@ class Components : _args['batch_size'] = int(args['batch_size']) if int(args['num_gpu']) > 1 : - _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)[0] + _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int) else: _args['gpu'] = 0 _args['num_gpu'] = 1 @@ -215,7 +215,7 @@ class Components : df = pd.DataFrame(info['data']) args = info['args'] if args['num_gpu'] > 1 : - args['gpu'] = int(info['input']['partition']) if info['input']['partition'] < 8 else np.random.choice(np.arange(8),1).astype(int)[0] + args['gpu'] = int(info['input']['partition']) if info['input']['partition'] < 8 else np.random.choice(np.arange(8)).astype(int) else: args['gpu'] = 0 From fc08a8f643d0f38d12a728a4d3045f4f7be8f9bd Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 12 Mar 2020 09:41:54 -0500 Subject: [PATCH 066/250] bug fix: continuous variable handling --- data/gan.py | 2 +- data/maker/__init__.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/data/gan.py b/data/gan.py index 3c41f59..4f34634 100644 --- a/data/gan.py +++ b/data/gan.py @@ -409,7 +409,7 @@ class Train (GNet): # losses = tf.compat.v1.get_collection(flag, scope) total_loss = tf.add_n(losses, name='total_loss') - print (total_loss) + # print (total_loss) return total_loss, w def input_fn(self): """ diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 354b78f..97cc3dd 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -22,9 +22,10 @@ class ContinuousToDiscrete : This function will convert a continous stream of information into a variety a bit stream of bins """ # BOUNDS = np.repeat(np.divide(X.max(),n),n).cumsum().tolist() - - BOUNDS = ContinuousToDiscrete.bounds(np.round(X,ContinuousToDiscrete.ROUND_UP),n) - + # print ( X.values.astype(np.float32)) + # print ("___________________________") + values = X.values.astype(np.float32) + BOUNDS = ContinuousToDiscrete.bounds(values,n) # _map = [{"index":BOUNDS.index(i),"ubound":i} for i in BOUNDS] _matrix = [] m = [] @@ -40,12 +41,13 @@ class ContinuousToDiscrete : # # for items in BOUNDS : # index = BOUNDS.index(items) - return _matrix + return np.array(_matrix) @staticmethod def bounds(x,n): # return np.array_split(x,n) - return list(pd.cut(np.array( np.round(x,ContinuousToDiscrete.ROUND_UP) ),n).categories) + values = np.round(x,ContinuousToDiscrete.ROUND_UP) + return list(pd.cut(values,n).categories) From 60cbf2dd3fd32ae8f5712d22dcceb367945a24a1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 12 
Mar 2020 09:55:29 -0500 Subject: [PATCH 067/250] bug fix: continuous values --- data/maker/__init__.py | 1 + pipeline.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 97cc3dd..2b51670 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -112,6 +112,7 @@ def train (**args) : if col in CONTINUOUS: BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32) + # print ( pd.DataFrame(args['real']).head() ) else: # df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False) # print (df[col].dtypes) diff --git a/pipeline.py b/pipeline.py index 7a2cf3a..9eee8c5 100644 --- a/pipeline.py +++ b/pipeline.py @@ -143,7 +143,7 @@ class Components : # columns = args['columns'] # df = np.array_split(df[columns].values,PART_SIZE) # df = pd.DataFrame(df[ int (partition) ],columns = columns) - info = {"parition":int(partition),"gpu":_args["gpu"],"rows":df.shape[0],"cols":df.shape[1],"part_size":PART_SIZE} + info = {"parition":int(partition),"gpu":_args["gpu"],"rows":str(df.shape[0]),"cols":str(df.shape[1]),"part_size":int(PART_SIZE)} logger.write({"module":"generate","action":"partition","input":info}) _args['partition'] = int(partition) _args['continuous']= args['continuous'] if 'continuous' in args else [] @@ -352,7 +352,7 @@ if __name__ == '__main__' : args['partition'] = index # _df = pd.DataFrame(DATA[index],columns=args['columns']) args['data'] = DATA[index] - args['data'].to_csv('aou-'+str(index)+'csv',index=False) + # args['data'].to_csv('aou-'+str(index)+'csv',index=False) # args['reader'] = lambda: _df if int(args['num_gpu']) > 1 : args['gpu'] = index From d30d2233c865e2dc03a807f70b554734774150a3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 12 Mar 2020 09:56:08 -0500 Subject: [PATCH 068/250] versioning ... --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5a8f7b6..78c52ea 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.2","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 772d841ee80d9279c0348356ea268787d54ef44b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 12 Mar 2020 14:37:01 -0500 Subject: [PATCH 069/250] bug fix ... 
--- data/maker/__init__.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 2b51670..5b4cb7e 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -24,24 +24,25 @@ class ContinuousToDiscrete : # BOUNDS = np.repeat(np.divide(X.max(),n),n).cumsum().tolist() # print ( X.values.astype(np.float32)) # print ("___________________________") - values = X.values.astype(np.float32) + values = np.array(X).astype(np.float32) BOUNDS = ContinuousToDiscrete.bounds(values,n) # _map = [{"index":BOUNDS.index(i),"ubound":i} for i in BOUNDS] _matrix = [] m = [] for value in X : x_ = np.zeros(n) - _matrix.append(x_) + for row in BOUNDS : if value>= row.left and value <= row.right : index = BOUNDS.index(row) x_[index] = 1 break + _matrix += x_.tolist() # # for items in BOUNDS : # index = BOUNDS.index(items) - return np.array(_matrix) + return np.array(_matrix).reshape(len(X),n) @staticmethod def bounds(x,n): @@ -92,7 +93,7 @@ def train (**args) : :context label of what we are synthesizing """ column = args['column'] if (isinstance(args['column'],list)) else [args['column']] - CONTINUOUS = args['continuous'] if 'continuous' in args else [] + # CONTINUOUS = args['continuous'] if 'continuous' in args else [] # column_id = args['id'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) df.columns = [name.lower() for name in df.columns] @@ -109,15 +110,16 @@ def train (**args) : # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values # if 'float' not in df[col].dtypes.name : # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values - if col in CONTINUOUS: - BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32) - # print ( pd.DataFrame(args['real']).head() ) - else: + # if col in CONTINUOUS: + # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) + # args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32) + # # args['real'] = args['real'].reshape(df.shape[0],BIN_SIZE) + + # else: # df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False) # print (df[col].dtypes) # print (df[col].dropna/(axis=1).unique()) - args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values + args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values @@ -170,6 +172,7 @@ def generate(**args): #@TODO: # If the identifier is not present, we should fine a way to determine or make one # + BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) _df = df.copy() for col in column : args['context'] = col @@ -181,10 +184,15 @@ def generate(**args): # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) # values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE) # # values = np.unique(values).tolist() + # else: + # if col in CONTINUOUS : + # values = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32).T + # else: values = df[col].dropna().unique().tolist() + args['values'] = values args['row_count'] = df.shape[0] # @@ -192,10 +200,9 @@ def generate(**args): handler = gan.Predict (**args) handler.load_meta(col) r = handler.apply() - BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - # _df[col] = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col] - _df[col] = r[col] + _df[col] = 
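Patch 069, whose diff follows, keeps reshaping the ContinuousToDiscrete.binary helper introduced a few patches earlier: cut the continuous column into n interval bins with pd.cut and emit one one-hot row per value marking the bin it lands in. A standalone sketch of that encoding, under the same pd.cut assumption and with an illustrative rounding parameter, is:

    import numpy as np
    import pandas as pd

    def binary(X, n=4, round_up=2):
        """One-hot encode a continuous column into n pd.cut bins (illustrative sketch)."""
        values = np.round(np.array(X).astype(np.float32), round_up)
        bounds = list(pd.cut(values, n).categories)  # n interval bins over the observed range
        rows = []
        for value in values:
            row = np.zeros(n)
            for index, interval in enumerate(bounds):
                if interval.left <= value <= interval.right:  # mark the bin this value falls into
                    row[index] = 1
                    break
            rows.append(row)
        return np.array(rows)

    if __name__ == '__main__':
        x = [0.1, 0.4, 2.5, 9.7, 5.0]
        print(binary(x, n=4))  # shape (5, 4), one active bin per row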
ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col] + # _df[col] = r[col] # # @TODO: log basic stats about the synthetic attribute # From e81e50c94f8fdf051cbf76d9479cc68a40b1ef5d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 14 Mar 2020 11:12:13 -0500 Subject: [PATCH 070/250] Bug fix with the number of candidates generated --- data/gan.py | 10 +- data/maker/__init__.py | 1 + drive/pipeline.py | 303 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 311 insertions(+), 3 deletions(-) create mode 100644 drive/pipeline.py diff --git a/data/gan.py b/data/gan.py index 4f34634..28d5ea3 100644 --- a/data/gan.py +++ b/data/gan.py @@ -424,6 +424,7 @@ class Train (GNet): dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) # labels_placeholder = None dataset = dataset.repeat(10000) + print ([' ******* ',self.BATCHSIZE_PER_GPU]) dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) dataset = dataset.prefetch(1) # iterator = dataset.make_initializable_iterator() @@ -560,7 +561,7 @@ class Predict(GNet): init = tf.compat.v1.global_variables_initializer() saver = tf.compat.v1.train.Saver() df = pd.DataFrame() - CANDIDATE_COUNT = 1000 + CANDIDATE_COUNT = 10 #0 if self.ROW_COUNT < 1000 else 100 NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0] with tf.compat.v1.Session() as sess: @@ -594,13 +595,16 @@ class Predict(GNet): if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size : ratio.append(np.divide( np.sum(x), x.size)) found.append(df) - if i == CANDIDATE_COUNT: + + # break + if len(found) == CANDIDATE_COUNT: + break else: __x__ = df if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __x__ __ratio = np.divide( np.sum(x), x.size) if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __ratio continue - + # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms # df = (i * df).sum(axis=1) # diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 5b4cb7e..3a016cf 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -208,4 +208,5 @@ def generate(**args): # # print (r)s # break + return _df \ No newline at end of file diff --git a/drive/pipeline.py b/drive/pipeline.py new file mode 100644 index 0000000..04658da --- /dev/null +++ b/drive/pipeline.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python3 +import json +from transport import factory +import numpy as np +import os +from multiprocessing import Process +import pandas as pd +from google.oauth2 import service_account +import data.maker + +from data.params import SYS_ARGS + +# +# The configuration array is now loaded and we will execute the pipe line as follows +DATASET='combined20190510' + +class Components : + + @staticmethod + def get(args): + """ + This function returns a data-frame provided a bigquery sql statement with conditions (and limits for testing purposes) + The function must be wrapped around a lambda this makes testing easier and changing data stores transparent to the rest of the code. 
(Vital when testing) + :sql basic sql statement + :condition optional condition and filters + """ + SQL = args['sql'] + if 'condition' in args : + condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')']) + SQL = " ".join([SQL,'WHERE',condition]) + + SQL = SQL.replace(':dataset',args['dataset']) #+ " LIMIT 1000 " + if 'limit' in args : + SQL = SQL + 'LIMIT ' + args['limit'] + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() + return df + + # return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna() + @staticmethod + def split(X,MAX_ROWS=3,PART_SIZE=3): + + return list(pd.cut( np.arange(X.shape[0]+1),PART_SIZE).categories) + + def train(self,**args): + """ + This function will perform training on the basis of a given pointer that reads data + + """ + # + # @TODO: we need to log something here about the parameters being passed + pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) + df = pointer() + + # + # Now we can parse the arguments and submit the entire thing to training + # + + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + log_folder = args['logs'] if 'logs' in args else 'logs' + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + + MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 + PART_SIZE = args['part_size'] if 'part_size' in args else 0 + + if df.shape[0] > MAX_ROWS and 'partition' not in args: + lbound = 0 + bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) + # bounds = Components.split(df,MAX_ROWS,PART_SIZE) + + qwriter = factory.instance(type='queue.QueueWriter',args={'queue':'aou.io'}) + + for b in bounds : + part_index = bounds.index(b) + ubound = int(b.right) + + + _data = df.iloc[lbound:ubound][args['columns']] + lbound = ubound + + # _args['logs'] = os.sep.join([log_folder,str(part_index)]) + _args['partition'] = str(part_index) + _args['logger'] = {'args':{'dbname':'aou','doc':args['context']},'type':'mongo.MongoWriter'} + # + # We should post the the partitions to a queue server (at least the instructions on ): + # - where to get the data + # - and athe arguments to use (partition #,columns,gpu,epochs) + # + info = {"rows":_data.shape[0],"cols":_data.shape[1], "paritition":part_index,"logs":_args['logs']} + p = {"args":_args,"data":_data.to_dict(orient="records"),"info":info} + qwriter.write(p) + # + # @TODO: + # - Notify that information was just posted to the queue + info['max_rows'] = MAX_ROWS + info['part_size'] = PART_SIZE + logger.write({"module":"train","action":"setup-partition","input":info}) + + pass + else: + partition = args['partition'] if 'partition' in args else '' + log_folder = os.sep.join([log_folder,args['context'],partition]) + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + 
os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' + + _args['data'] = df + # + # @log : + # Logging information about the training process for this partition (or not) + # + info = {"rows":df.shape[0],"cols":df.shape[1], "partition":partition,"logs":_args['logs']} + logger.write({"module":"train","action":"train","input":info}) + data.maker.train(**_args) + + pass + + # @staticmethod + def generate(self,args): + """ + This function will generate data and store it to a given, + """ + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + log_folder = args['logs'] if 'logs' in args else 'logs' + partition = args['partition'] if 'partition' in args else '' + log_folder = os.sep.join([log_folder,args['context'],partition]) + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' + _args['no_value']= args['no_value'] + MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 + PART_SIZE = args['part_size'] if 'part_size' in args else 0 + + # credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + # _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() + reader = args['reader'] + df = reader() + if 'partition' in args : + bounds = Components.split(df,MAX_ROWS,PART_SIZE) + # bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) + lbound = int(bounds[int(partition)].left) + ubound = int(bounds[int(partition)].right) + df = df.iloc[lbound:ubound] + _args['data'] = df + # _args['data'] = reader() + #_args['data'] = _args['data'].astype(object) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + _dc = data.maker.generate(**_args) + # + # We need to post the generate the data in order to : + # 1. compare immediately + # 2. 
synthetic copy + # + + cols = _dc.columns.tolist() + + data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) + base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) + + for name in cols : + _args['data'][name] = _dc[name] + info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} + if partition != '' : + info['partition'] = partition + logger.write(info) + # filename = os.sep.join([log_folder,'output',name+'.csv']) + # data_comp[[name]].to_csv(filename,index=False) + + # + #-- Let us store all of this into bigquery + prefix = args['notify']+'.'+_args['context'] + table = '_'.join([prefix,partition,'io']).replace('__','_') + folder = os.sep.join([args['logs'],args['context'],partition,'output']) + if 'file' in args : + + _fname = os.sep.join([folder,table.replace('_io','_full_io.csv')]) + _pname = os.sep.join([folder,table])+'.csv' + data_comp.to_csv( _pname,index=False) + _args['data'].to_csv(_fname,index=False) + + + else: + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + _pname = os.sep.join([folder,table+'.csv']) + _fname = table.replace('_io','_full_io') + data_comp.to_gbq(if_exists='replace',destination_table=_pname,credentials='credentials',chunk_size=50000) + data_comp.to_csv(_pname,index=False) + INSERT_FLAG = 'replace' if 'partition' not in args else 'append' + _args['data'].to_gbq(if_exists=INSERT_FLAG,destination_table=_fname,credentials='credentials',chunk_size=50000) + + info = {"full":{"path":_fname,"rows":_args['data'].shape[0]},"compare":{"name":_pname,"rows":data_comp.shape[0]} } + if partition : + info ['partition'] = partition + logger.write({"module":"generate","action":"write","info":info} ) + @staticmethod + def callback(channel,method,header,stream): + + info = json.loads(stream) + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':SYS_ARGS['context']}) + + logger.write({'module':'process','action':'read-partition','input':info['info']}) + df = pd.DataFrame(info['data']) + args = info['args'] + if int(args['num_gpu']) > 1 and args['gpu'] > 0: + args['gpu'] = args['gpu'] + args['num_gpu'] + args['reader'] = lambda: df + # + # @TODO: Fix + # There is an inconsistency in column/columns ... fix this shit! + # + args['columns'] = args['column'] + (Components()).train(**args) + logger.write({"module":"process","action":"exit","info":info["info"]}) + channel.close() + channel.connection.close() + pass + +if __name__ == '__main__' : + filename = SYS_ARGS['config'] if 'config' in SYS_ARGS else 'config.json' + f = open (filename) + PIPELINE = json.loads(f.read()) + f.close() + index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else 0 + + args = (PIPELINE[index]) + args['dataset'] = 'combined20190510' + args = dict(args,**SYS_ARGS) + args['max_rows'] = int(args['max_rows']) if 'max_rows' in args else 3 + args['part_size']= int(args['part_size']) if 'part_size' in args else 3 + + # + # @TODO: + # Log what was initiated so we have context of this processing ... 
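The drive/pipeline.py snapshot above keeps the older queue-based hand-off: the driver serializes each partition (its records plus the training arguments) into a message on the aou.io queue, and the listener callback rebuilds the DataFrame and hands it to the trainer. The sketch below shows only that payload round-trip with plain JSON and pandas, leaving out the data-transport factory and the trainer themselves:

    import json
    import pandas as pd

    def build_payload(df, args, partition):
        """What the driver would post to the queue for one partition (illustrative)."""
        info = {'rows': df.shape[0], 'cols': df.shape[1], 'partition': partition}
        return json.dumps({'args': args, 'data': df.to_dict(orient='records'), 'info': info})

    def callback(stream):
        """What the listener would do with one message (training call omitted)."""
        message = json.loads(stream)
        df = pd.DataFrame(message['data'])   # rebuild the partition
        args = message['args']
        args['columns'] = args['column']     # the file reconciles column vs columns the same way
        # here the real code would pass df and args on to the trainer
        return df, args

    if __name__ == '__main__':
        df = pd.DataFrame({'person_id': [1, 2], 'gender': ['M', 'F']})
        payload = build_payload(df, {'column': ['gender'], 'context': 'demo'}, partition=0)
        rebuilt, args = callback(payload)
        print(rebuilt.equals(df), args['columns'])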
+ # + if 'listen' not in SYS_ARGS : + if 'file' in args : + reader = lambda: pd.read_csv(args['file']) ; + else: + reader = lambda: Components().get(args) + args['reader'] = reader + + if 'generate' in SYS_ARGS : + # + # Let us see if we have partitions given the log folder + + content = os.listdir( os.sep.join([args['logs'],args['context']])) + generator = Components() + if ''.join(content).isnumeric() : + # + # we have partitions we are working with + + for id in ''.join(content) : + args['partition'] = id + + generator.generate(args) + else: + generator.generate(args) + # Components.generate(args) + elif 'listen' in args : + # + # This will start a worker just in case to listen to a queue + if 'read' in SYS_ARGS : + QUEUE_TYPE = 'queue.QueueReader' + pointer = lambda qreader: qreader.read(1) + else: + QUEUE_TYPE = 'queue.QueueListener' + pointer = lambda qlistener: qlistener.listen() + N = int(SYS_ARGS['jobs']) if 'jobs' in SYS_ARGS else 1 + + qhandlers = [factory.instance(type=QUEUE_TYPE,args={'queue':'aou.io'}) for i in np.arange(N)] + jobs = [] + for qhandler in qhandlers : + qhandler.callback = Components.callback + job = Process(target=pointer,args=(qhandler,)) + job.start() + jobs.append(job) + # + # let us wait for the jobs + print (["Started ",len(jobs)," trainers"]) + while len(jobs) > 0 : + + jobs = [job for job in jobs if job.is_alive()] + + # pointer(qhandler) + + + # qreader.read(1) + pass + else: + + trainer = Components() + trainer.train(**args) + # Components.train(**args) +#for args in PIPELINE : + #args['dataset'] = 'combined20190510' + #process = Process(target=Components.train,args=(args,)) + #process.name = args['context'] + #process.start() +# Components.train(args) From f9496ed8061cf0f1c452f75ffb3a421af119446d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 15 Mar 2020 10:25:19 -0500 Subject: [PATCH 071/250] bug fix with program dying --- pipeline.py | 20 +++++++++++--------- setup.py | 2 +- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pipeline.py b/pipeline.py index 9eee8c5..bfdd72e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -76,10 +76,11 @@ class Components : partition = args['partition'] log_folder = os.sep.join([log_folder,args['context'],str(partition)]) - _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) if 'batch_size' in args : _args['batch_size'] = int(args['batch_size']) + # # We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel # @@ -143,7 +144,7 @@ class Components : # columns = args['columns'] # df = np.array_split(df[columns].values,PART_SIZE) # df = pd.DataFrame(df[ int (partition) ],columns = columns) - info = {"parition":int(partition),"gpu":_args["gpu"],"rows":str(df.shape[0]),"cols":str(df.shape[1]),"part_size":int(PART_SIZE)} + info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"part_size":int(PART_SIZE)} logger.write({"module":"generate","action":"partition","input":info}) _args['partition'] = int(partition) _args['continuous']= args['continuous'] if 'continuous' in args else [] @@ -163,7 +164,6 @@ class Components : data_comp = 
_args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) - for name in cols : _args['data'][name] = _dc[name] info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} @@ -193,10 +193,14 @@ class Components : _fname = table.replace('_io','_full_io') partial = '.'.join(['io',args['context']+'_partial_io']) complete= '.'.join(['io',args['context']+'_full_io']) - data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=50000) data_comp.to_csv(_pname,index=False) - INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' - _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=50000) + if 'dump' in args : + print (_args['data'].head()) + else: + data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=50000) + + INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' + _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=50000) _id = 'dataset' info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } if partition : @@ -247,6 +251,7 @@ if __name__ == '__main__' : args = dict(args,**SYS_ARGS) args['logs'] = args['logs'] if 'logs' in args else 'logs' + args['batch_size'] = 2000 if 'batch_size' not in args else int(args['batch_size']) if 'dataset' not in args : args['dataset'] = 'combined20191004v2_deid' PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 @@ -350,10 +355,7 @@ if __name__ == '__main__' : continue args['part_size'] = PART_SIZE args['partition'] = index - # _df = pd.DataFrame(DATA[index],columns=args['columns']) args['data'] = DATA[index] - # args['data'].to_csv('aou-'+str(index)+'csv',index=False) - # args['reader'] = lambda: _df if int(args['num_gpu']) > 1 : args['gpu'] = index else: diff --git a/setup.py b/setup.py index 78c52ea..4a4e87b 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.2","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.3","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From af6ab356d832014d9608ff70812d47e07b24aa53 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 16 Mar 2020 16:22:34 -0500 Subject: [PATCH 072/250] bug fix: index number or context --- data/gan.py | 2 +- pipeline.py | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/data/gan.py b/data/gan.py index 28d5ea3..c85776a 100644 --- a/data/gan.py +++ b/data/gan.py @@ -424,7 +424,7 @@ class Train (GNet): dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) # labels_placeholder = None dataset = dataset.repeat(10000) - print ([' ******* 
',self.BATCHSIZE_PER_GPU]) + dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) dataset = dataset.prefetch(1) # iterator = dataset.make_initializable_iterator() diff --git a/pipeline.py b/pipeline.py index bfdd72e..b838043 100644 --- a/pipeline.py +++ b/pipeline.py @@ -244,8 +244,19 @@ if __name__ == '__main__' : f = open (filename) PIPELINE = json.loads(f.read()) f.close() - index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else 0 - + index = SYS_ARGS['index'] + if index.isnumeric() : + index = int(SYS_ARGS['index']) + else: + # + # The index provided is a key to a pipeline entry mainly the context + # + N = len(PIPELINE) + f = [i for i in range(0,N) if PIPELINE[i]['context'] == index] + index = f[0] if f else 0 + # + # print + print ("..::: ",PIPELINE[index]['context']) args = (PIPELINE[index]) args = dict(args,**SYS_ARGS) From 2f6f43c9c694383d02563b6e3fa4abe9471c4f95 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 18 Mar 2020 23:16:36 -0500 Subject: [PATCH 073/250] bug fix: statistics for quick assessment --- pipeline.py | 20 ++++++++++++++++++-- setup.py | 2 +- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index b838043..76496bd 100644 --- a/pipeline.py +++ b/pipeline.py @@ -163,6 +163,21 @@ class Components : cols = _dc.columns.tolist() data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) + # + # performing basic analytics on the synthetic data generated (easy to quickly asses) + # + info = {"module":"generate","action":"io-stats","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} + logs = [] + for name in data_comp.columns.tolist() : + g = pd.DataFrame(data_comp.groupby([name]).size()) + g.columns = ['counts'] + g[name] = g.index.tolist() + g.index = np.arange(g.shape[0]) + logs.append({"name":name,"counts": g.to_dict(orient='records')}) + info['input']['logs'] = logs + logger.write(info) + + base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) for name in cols : _args['data'][name] = _dc[name] @@ -170,6 +185,7 @@ class Components : if partition != '' : info['partition'] = int(partition) logger.write(info) + # filename = os.sep.join([log_folder,'output',name+'.csv']) # data_comp[[name]].to_csv(filename,index=False) @@ -197,10 +213,10 @@ class Components : if 'dump' in args : print (_args['data'].head()) else: - data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=50000) + data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' - _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=50000) + _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) _id = 'dataset' info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } if partition : diff --git a/setup.py b/setup.py index 4a4e87b..0f38464 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.3","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = 
{"name":"data-maker","version":"1.2.4","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 915601236cd0f06a99f2e7fbdbaa5153da7f25f6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 25 Mar 2020 17:43:23 -0500 Subject: [PATCH 074/250] bug fix with ICD and some minor improvements --- data/gan.py | 13 +++-- data/maker/__init__.py | 53 +++++++++++++----- pipeline.py | 124 ++++++++++++++++------------------------- setup.py | 2 +- 4 files changed, 97 insertions(+), 95 deletions(-) diff --git a/data/gan.py b/data/gan.py index c85776a..a6dece6 100644 --- a/data/gan.py +++ b/data/gan.py @@ -172,7 +172,7 @@ class GNet : root = [] for loc in path.split(os.sep) : root.append(loc) - if not os.path.exists(os.sep.join(root)) : + if not os.path.exists(os.sep.join(root)) : os.mkdir(os.sep.join(root)) elif not os.path.exists(path): @@ -535,8 +535,12 @@ class Predict(GNet): self.values = args['values'] self.ROW_COUNT = args['row_count'] self.oROW_COUNT = self.ROW_COUNT - - self.MISSING_VALUES = args['no_value'] + if args['no_value'] in ['na','','NA'] : + self.MISSING_VALUES = np.nan + else : + self.MISSING_VALUES = args['no_value'] + # self.MISSING_VALUES = args['no_value'] + # self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value'] def load_meta(self, column): super().load_meta(column) self.generator.load_meta(column) @@ -652,7 +656,8 @@ class Predict(GNet): if ii.shape[0] > 0 : # #@TODO Have this be a configurable variable - missing = np.repeat(0, np.where(ii==1)[0].size) + + missing = np.repeat(self.MISSING_VALUES, np.where(ii==1)[0].size) else: missing = [] # diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 3a016cf..e252de5 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -62,21 +62,28 @@ class ContinuousToDiscrete : BOUNDS = ContinuousToDiscrete.bounds(X,BIN_SIZE) values = [] - _BINARY= ContinuousToDiscrete.binary(X,BIN_SIZE) - # # print (BOUNDS) - - # values = [] - for row in _BINARY : - # ubound = BOUNDS[row.index(1)] - index = np.where(row == 1)[0][0] + # _BINARY= ContinuousToDiscrete.binary(X,BIN_SIZE) + # # # print (BOUNDS) + l = {} + for value in X : + values += [ np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP) for item in BOUNDS if value >= item.left and value <= item.right ] + - ubound = BOUNDS[ index ].right - lbound = BOUNDS[ index ].left + + # # values = [] + # for row in _BINARY : + # # ubound = BOUNDS[row.index(1)] + # index = np.where(row == 1)[0][0] + + # ubound = BOUNDS[ index ].right + # lbound = BOUNDS[ index ].left - x_ = np.round(np.random.uniform(lbound,ubound),ContinuousToDiscrete.ROUND_UP).astype(float) - values.append(x_) + # x_ = np.round(np.random.uniform(lbound,ubound),ContinuousToDiscrete.ROUND_UP).astype(float) + # values.append(x_) - lbound = ubound + # lbound = ubound + + # values = [np.random.uniform() for item in BOUNDS] return values @@ -173,6 +180,8 @@ def generate(**args): # If the identifier is not present, we should fine a way to determine or make one # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) + NO_VALUE = 
dict(args['no_value']) if type(args['no_value']) == dict else args['no_value'] + _df = df.copy() for col in column : args['context'] = col @@ -195,13 +204,29 @@ def generate(**args): args['values'] = values args['row_count'] = df.shape[0] + if col in NO_VALUE : + args['no_value'] = NO_VALUE[col] + else: + args['no_value'] = NO_VALUE + # # we can determine the cardinalities here so we know what to allow or disallow handler = gan.Predict (**args) handler.load_meta(col) r = handler.apply() - - _df[col] = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col] + if col in CONTINUOUS : + r[col] = np.array(r[col]) + MISSING= np.nan if args['no_value'] in ['na','','NA'] else args['no_value'] + + if np.isnan(MISSING): + i = np.isnan(r[col]) + i = np.where (i == False)[0] + else: + i = np.where( r[col] != None)[0] + _approx = ContinuousToDiscrete.continuous(r[col][i],BIN_SIZE) + r[col][i] = _approx + + _df[col] = r[col] #ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col] # _df[col] = r[col] # # @TODO: log basic stats about the synthetic attribute diff --git a/pipeline.py b/pipeline.py index 76496bd..0d19e60 100644 --- a/pipeline.py +++ b/pipeline.py @@ -16,7 +16,12 @@ from data.params import SYS_ARGS DATASET='combined20191004v2_deid' class Components : - + class KEYS : + PIPELINE_KEY = 'pipeline' + SQL_FILTER = 'filter' + @staticmethod + def get_logger(**args) : + return factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) @staticmethod def get(args): """ @@ -26,15 +31,19 @@ class Components : :condition optional condition and filters """ SQL = args['sql'] - if 'condition' in args : - condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')']) + if Components.KEYS.SQL_FILTER in args : + SQL_FILTER = Components.KEYS.SQL_FILTER + condition = ' '.join([args[SQL_FILTER]['field'],args[SQL_FILTER]['qualifier'],'(',args[SQL_FILTER]['value'],')']) SQL = " ".join([SQL,'WHERE',condition]) SQL = SQL.replace(':dataset',args['dataset']) #+ " LI " if 'limit' in args : SQL = SQL + ' LIMIT ' + args['limit'] - + # + # let's log the sql query that has been performed here + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + logger.write({"module":"bigquery","action":"read","input":{"sql":SQL}}) credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').astype(object) return df @@ -131,6 +140,7 @@ class Components : _args['num_gpu'] = 1 os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) _args['no_value']= args['no_value'] + # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 @@ -166,19 +176,27 @@ class Components : # # performing basic analytics on the synthetic data generated (easy to quickly asses) # - info = {"module":"generate","action":"io-stats","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} - logs = [] - for name in data_comp.columns.tolist() : - g = pd.DataFrame(data_comp.groupby([name]).size()) - g.columns = ['counts'] - g[name] = g.index.tolist() - g.index = np.arange(g.shape[0]) - logs.append({"name":name,"counts": g.to_dict(orient='records')}) - info['input']['logs'] = logs + info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} + x 
= {} + for name in args['columns'] : + ident = data_comp.apply(lambda row: 1*(row[name]==row[name+'_io']),axis=1).sum() + count = data_comp[name].unique().size + _ident= data_comp.shape[1] - ident + _count= data_comp[name+'_io'].unique().size + + info['input']['logs'] += [{"name":name,"identical":int(ident),"no_identical":int(_ident),"original_count":count,"synthetic_count":_count}] + # for name in data_comp.columns.tolist() : + # g = pd.DataFrame(data_comp.groupby([name]).size()) + # g.columns = ['counts'] + # g[name] = g.index.tolist() + # g.index = np.arange(g.shape[0]) + # logs.append({"name":name,"counts": g.to_dict(orient='records')}) + # info['input']['logs'] = logs logger.write(info) base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) + cols = _dc.columns.tolist() for name in cols : _args['data'][name] = _dc[name] info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} @@ -223,43 +241,14 @@ class Components : info ['partition'] = int(partition) logger.write({"module":"generate","action":"write","input":info} ) - @staticmethod - def callback(channel,method,header,stream): - if stream.decode('utf8') in ['QUIT','EXIT','END'] : - channel.close() - channel.connection.close() - info = json.loads(stream) - logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':SYS_ARGS['context']}) - - logger.write({'module':'process','action':'read-partition','input':info['input']}) - df = pd.DataFrame(info['data']) - args = info['args'] - if args['num_gpu'] > 1 : - args['gpu'] = int(info['input']['partition']) if info['input']['partition'] < 8 else np.random.choice(np.arange(8)).astype(int) - - else: - args['gpu'] = 0 - args['num_gpu'] = 1 - # if int(args['num_gpu']) > 1 and args['gpu'] > 0: - # args['gpu'] = args['gpu'] + args['num_gpu'] if args['gpu'] + args['num_gpu'] < 8 else args['gpu'] #-- 8 max gpus - args['reader'] = lambda: df - # - # @TODO: Fix - # There is an inconsistency in column/columns ... fix this shit! 
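
# A standalone sketch of the per-column "io.metrics" computation introduced in
# this patch: for each synthesized column it counts rows where the synthetic
# value ("<name>_io") matches the original, plus unique-value counts on both
# sides. data_comp is assumed to be the comparison join built above. Note the
# patch computes no_identical from data_comp.shape[1]; shape[0] (row count),
# as used here, looks like the intended quantity.
def io_metrics(data_comp, columns):
    logs = []
    for name in columns:
        identical = int((data_comp[name] == data_comp[name + '_io']).sum())
        logs.append({
            "name": name,
            "identical": identical,
            "no_identical": int(data_comp.shape[0] - identical),
            "original_count": int(data_comp[name].unique().size),
            "synthetic_count": int(data_comp[name + '_io'].unique().size)
        })
    return logs
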
- # - channel.close() - channel.connection.close() - args['columns'] = args['column'] - (Components()).train(**args) - logger.write({"module":"process","action":"exit","input":info["input"]}) - - pass + if __name__ == '__main__' : filename = SYS_ARGS['config'] if 'config' in SYS_ARGS else 'config.json' f = open (filename) - PIPELINE = json.loads(f.read()) + _config = json.loads(f.read()) f.close() + PIPELINE = _config['pipeline'] index = SYS_ARGS['index'] if index.isnumeric() : index = int(SYS_ARGS['index']) @@ -274,10 +263,17 @@ if __name__ == '__main__' : # print print ("..::: ",PIPELINE[index]['context']) args = (PIPELINE[index]) - + for key in _config : + if key == 'pipeline' or key in args: + # + # skip in case of pipeline or if key exists in the selected pipeline (provided by index) + # + continue + + args[key] = _config[key] args = dict(args,**SYS_ARGS) - args['logs'] = args['logs'] if 'logs' in args else 'logs' + args['batch_size'] = 2000 if 'batch_size' not in args else int(args['batch_size']) if 'dataset' not in args : args['dataset'] = 'combined20191004v2_deid' @@ -340,38 +336,14 @@ if __name__ == '__main__' : else: generator.generate(args) # Components.generate(args) - elif 'listen' in args : + elif 'finalize' in args : # - # This will start a worker just in case to listen to a queue - SYS_ARGS = dict(args) #-- things get lost in context - if 'read' in SYS_ARGS : - QUEUE_TYPE = 'queue.QueueReader' - pointer = lambda qreader: qreader.read() - else: - QUEUE_TYPE = 'queue.QueueListener' - pointer = lambda qlistener: qlistener.listen() - N = int(SYS_ARGS['jobs']) if 'jobs' in SYS_ARGS else 1 - - qhandlers = [factory.instance(type=QUEUE_TYPE,args={'queue':'aou.io'}) for i in np.arange(N)] - jobs = [] - for qhandler in qhandlers : - qhandler.callback = Components.callback - job = Process(target=pointer,args=(qhandler,)) - job.start() - jobs.append(job) + # This will finalize a given set of synthetic operations into a table # - # let us wait for the jobs - print (["Started ",len(jobs)," trainers"]) - while len(jobs) > 0 : - - jobs = [job for job in jobs if job.is_alive()] - time.sleep(2) + idataset = args['input'] if 'input' in args else 'io' #-- input dataset + odataset = args['output'] #-- output dataset + labels = [name.strip() for name in args['labels'].split(',') ] - # pointer(qhandler) - - - # qreader.read(1) - pass else: # DATA = np.array_split(DATA,PART_SIZE) diff --git a/setup.py b/setup.py index 0f38464..c441e36 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.4","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.5","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From a1ac97fbca76c3ad3ec7299145ceb781b5a94296 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 25 Mar 2020 22:22:08 -0500 Subject: [PATCH 075/250] bug fix, multiple epochs --- data/gan.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index 
a6dece6..41daa3d 100644 --- a/data/gan.py +++ b/data/gan.py @@ -507,7 +507,8 @@ class Train (GNet): logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) - if epoch % self.MAX_EPOCHS == 0: + # if epoch % self.MAX_EPOCHS == 0: + if epoch in [5,10,50, self.MAX_EPOCHS] : # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] suffix = self.get.suffix() _name = os.sep.join([self.train_dir,suffix]) From 6e0f89cd3c3fd008f32018c85021f62c6f46696c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 26 Mar 2020 23:39:59 -0500 Subject: [PATCH 076/250] bug fix: epochs, process control (generator) --- data/gan.py | 2 +- pipeline.py | 27 ++++++++++++++------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/data/gan.py b/data/gan.py index 41daa3d..c54f5bd 100644 --- a/data/gan.py +++ b/data/gan.py @@ -508,7 +508,7 @@ class Train (GNet): logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) # if epoch % self.MAX_EPOCHS == 0: - if epoch in [5,10,50, self.MAX_EPOCHS] : + if epoch in [5,10,20,50,75, self.MAX_EPOCHS] : # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] suffix = self.get.suffix() _name = os.sep.join([self.train_dir,suffix]) diff --git a/pipeline.py b/pipeline.py index 0d19e60..884609f 100644 --- a/pipeline.py +++ b/pipeline.py @@ -178,13 +178,14 @@ class Components : # info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} x = {} - for name in args['columns'] : - ident = data_comp.apply(lambda row: 1*(row[name]==row[name+'_io']),axis=1).sum() - count = data_comp[name].unique().size - _ident= data_comp.shape[1] - ident - _count= data_comp[name+'_io'].unique().size + # for name in args['columns'] : + # ident = data_comp.apply(lambda row: 1*(row[name]==row[name+'_io']),axis=1).sum() + # count = data_comp[name].unique().size + # _ident= data_comp.shape[1] - ident + # _count= data_comp[name+'_io'].unique().size + # _count= len(set(data_comp[name+'_io'].values.tolist())) - info['input']['logs'] += [{"name":name,"identical":int(ident),"no_identical":int(_ident),"original_count":count,"synthetic_count":_count}] + # info['input']['logs'] += [{"name":name,"identical":int(ident),"no_identical":int(_ident),"original_count":count,"synthetic_count":_count}] # for name in data_comp.columns.tolist() : # g = pd.DataFrame(data_comp.groupby([name]).size()) # g.columns = ['counts'] @@ -192,17 +193,17 @@ class Components : # g.index = np.arange(g.shape[0]) # logs.append({"name":name,"counts": g.to_dict(orient='records')}) # info['input']['logs'] = logs - logger.write(info) + # logger.write(info) base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) cols = _dc.columns.tolist() - for name in cols : - _args['data'][name] = _dc[name] - info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} - if partition != '' : - info['partition'] = int(partition) - logger.write(info) + # for name in cols : + # _args['data'][name] = _dc[name] + # info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} + # if partition != '' : + # info['partition'] = int(partition) + # logger.write(info) # filename = os.sep.join([log_folder,'output',name+'.csv']) # data_comp[[name]].to_csv(filename,index=False) From 
205adf8fa65b73e6b1070da59ed6345ed68f1ae1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 26 Mar 2020 23:40:09 -0500 Subject: [PATCH 077/250] bug fix: epochs, process control (generator) --- pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipeline.py b/pipeline.py index 884609f..7620afd 100644 --- a/pipeline.py +++ b/pipeline.py @@ -327,6 +327,8 @@ if __name__ == '__main__' : job.name = 'generator # '+str(index) job.start() jobs.append(job) + if len(jobs) == 1 : + job.join() print (["Started ",len(jobs),"generators" if len(jobs)>1 else "generator" ]) while len(jobs)> 0 : From e8906d1646720294e08a8524587def89c36ce375 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 27 Mar 2020 00:34:05 -0500 Subject: [PATCH 078/250] bug fix: process causing error when writing to bigquery --- pipeline.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index 7620afd..6e847fb 100644 --- a/pipeline.py +++ b/pipeline.py @@ -4,7 +4,7 @@ from transport import factory import numpy as np import time import os -from multiprocessing import Process +from multiprocessing import Process, Lock import pandas as pd from google.oauth2 import service_account import data.maker @@ -16,9 +16,11 @@ from data.params import SYS_ARGS DATASET='combined20191004v2_deid' class Components : + lock = Lock() class KEYS : PIPELINE_KEY = 'pipeline' SQL_FILTER = 'filter' + @staticmethod def get_logger(**args) : return factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) @@ -232,10 +234,12 @@ class Components : if 'dump' in args : print (_args['data'].head()) else: + Components.lock.acquire() data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) + Components.lock.release() _id = 'dataset' info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } if partition : @@ -327,8 +331,8 @@ if __name__ == '__main__' : job.name = 'generator # '+str(index) job.start() jobs.append(job) - if len(jobs) == 1 : - job.join() + # if len(jobs) == 1 : + # job.join() print (["Started ",len(jobs),"generators" if len(jobs)>1 else "generator" ]) while len(jobs)> 0 : From 459afa2890291dd1c0bc2a8cc63c75e6b7bdd0ae Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 27 Mar 2020 01:52:16 -0500 Subject: [PATCH 079/250] bug fix:generated data has JSON object --- pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 6e847fb..066a418 100644 --- a/pipeline.py +++ b/pipeline.py @@ -200,8 +200,8 @@ class Components : base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) cols = _dc.columns.tolist() - # for name in cols : - # _args['data'][name] = _dc[name] + for name in cols : + _args['data'][name] = _dc[name] # info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} # if partition != '' : # info['partition'] = int(partition) From 4c297679dc197ace2aec0fe87ca5ea847c70e890 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 1 Apr 2020 00:21:51 -0500 Subject: [PATCH 080/250] bug fixes and optimizations --- data/maker/__init__.py | 46 +++++++++++++++++++++++++----------------- pipeline.py | 2 +- setup.py | 2 +- 3 files 
changed, 29 insertions(+), 21 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index e252de5..378c226 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -27,22 +27,25 @@ class ContinuousToDiscrete : values = np.array(X).astype(np.float32) BOUNDS = ContinuousToDiscrete.bounds(values,n) # _map = [{"index":BOUNDS.index(i),"ubound":i} for i in BOUNDS] - _matrix = [] - m = [] - for value in X : - x_ = np.zeros(n) + # _matrix = [] + # m = [] + # for value in X : + # x_ = np.zeros(n) - for row in BOUNDS : + # for row in BOUNDS : - if value>= row.left and value <= row.right : - index = BOUNDS.index(row) - x_[index] = 1 - break - _matrix += x_.tolist() - # - # for items in BOUNDS : - # index = BOUNDS.index(items) - return np.array(_matrix).reshape(len(X),n) + # if value>= row.left and value <= row.right : + # index = BOUNDS.index(row) + # x_[index] = 1 + # break + # _matrix += x_.tolist() + # # + # # for items in BOUNDS : + # # index = BOUNDS.index(items) + + # return np.array(_matrix).reshape(len(X),n) + matrix = np.repeat(np.zeros(n),len(X)).reshape(len(X),n) + @staticmethod def bounds(x,n): @@ -65,9 +68,15 @@ class ContinuousToDiscrete : # _BINARY= ContinuousToDiscrete.binary(X,BIN_SIZE) # # # print (BOUNDS) l = {} - for value in X : - values += [ np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP) for item in BOUNDS if value >= item.left and value <= item.right ] + for i in np.arange(len(X)): #value in X : + + value = X[i] + for item in BOUNDS : + if value >= item.left and value <= item.right : + values += [np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP)] + break + # values += [ np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP) for item in BOUNDS if value >= item.left and value <= item.right ] # # values = [] @@ -223,11 +232,10 @@ def generate(**args): i = np.where (i == False)[0] else: i = np.where( r[col] != None)[0] - _approx = ContinuousToDiscrete.continuous(r[col][i],BIN_SIZE) + _approx = ContinuousToDiscrete.continuous(r[col][i],BIN_SIZE) #-- approximating based on arbitrary bins r[col][i] = _approx - _df[col] = r[col] #ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col] - # _df[col] = r[col] + _df[col] = r[col] # # @TODO: log basic stats about the synthetic attribute # diff --git a/pipeline.py b/pipeline.py index 066a418..5af9550 100644 --- a/pipeline.py +++ b/pipeline.py @@ -47,7 +47,7 @@ class Components : logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) logger.write({"module":"bigquery","action":"read","input":{"sql":SQL}}) credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').astype(object) + df = pd.read_gbq(SQL,credentials=credentials,dialect='standard') return df # return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna() diff --git a/setup.py b/setup.py index c441e36..1c8aef0 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.5","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.6","author":"Vanderbilt University Medical 
Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 89ed5d5d46ae0d109cdd1d7b5415c1778f30bce6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 1 Apr 2020 00:22:21 -0500 Subject: [PATCH 081/250] simplify the CLI interface to leverage existing configuration --- finalize.py | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 finalize.py diff --git a/finalize.py b/finalize.py new file mode 100644 index 0000000..a375b37 --- /dev/null +++ b/finalize.py @@ -0,0 +1,159 @@ +""" +This file will perform basic tasks to finalize the GAN process by performing the following : + - basic stats & analytics + - rebuild io to another dataset +""" +import pandas as pd +import numpy as np +from google.oauth2 import service_account +from google.cloud import bigquery as bq +from data.params import SYS_ARGS +import json +class Analytics : + """ + This class will compile basic analytics about a given dataset i.e compare original/synthetic + """ + @staticmethod + def distribution(**args): + context = args['context'] + df = args['data'] + # + #-- This data frame counts unique values for each feature (space) + df_counts = pd.DataFrame(df.apply(lambda col: col.unique().size),columns=['counts']).T # unique counts + # + #-- Get the distributions for common values + # + names = [name for name in df_counts.columns.tolist() if name.endswith('_io') == False] + ddf = df.apply(lambda col: pd.DataFrame(col.values,columns=[col.name]).groupby([col.name]).size() ).fillna(0) + ddf[context] = ddf.index + + pass + def distance(**args): + """ + This function will measure the distance between + """ + df = args['data'] + names = [name for name in df_counts.columns.tolist() if name.endswith('_io') == False] +class Utils : + class get : + @staticmethod + def config(**args) : + contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts'] + pipeline = args['pipeline'] + return [ item for item in pipeline if item['context'] in contexts] + @staticmethod + def sql(**args) : + """ + This function is intended to build SQL query for the remainder of the table that was not synthesized + :config configuration entries + :from source of the table name + :dataset name of the source dataset + + """ + SQL = ["SELECT * FROM :from "] + SQL_FILTER = [] + NO_FILTERS_FOUND = True + pipeline = Utils.get.config(**args) + REVERSE_QUALIFIER = {'IN':'NOT IN','NOT IN':'IN','=':'<>','<>':'='} + for item in pipeline : + + + if 'filter' in item : + if NO_FILTERS_FOUND : + NO_FILTERS_FOUND = False + SQL += ['WHERE'] + # + # Let us load the filter in the SQL Query + FILTER = item['filter'] + QUALIFIER = REVERSE_QUALIFIER[FILTER['qualifier'].upper()] + SQL_FILTER += [" ".join([FILTER['field'], QUALIFIER,'(',FILTER['value'],')'])] + src = ".".join([args['dataset'],args['from']]) + SQL += [" AND ".join(SQL_FILTER)] + # + # let's pull the field schemas out of the table definition + # + + return " ".join(SQL).replace(":from",src) + + +def mk(**args) : + dataset = args['dataset'] + client = args['client'] if 'client' in args else bq.Client.from_service_account_file(args['private_key']) + # + # let us see if we have a dataset handy here + # + datasets 
= list(client.list_datasets()) + found = [item for item in datasets if item.dataset_id == dataset] + + if not found : + + return client.create_dataset(dataset) + return found[0] + +def move (**args): + """ + This function will move a table from the synthetic dataset into a designated location + This is the simplest case for finalizing a synthetic data set + :private_key + """ + private_key = args['private_key'] + client = bq.Client.from_service_account_json(private_key) + config = Utils.get.config(**args) + dataset = args['dataset'] + SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in config] + SQL += [Utils.get.sql(**args)] + SQL = ('\n UNION ALL \n'.join(SQL).replace(':dataset','io')) + + + # + # At this point we have gathered all the tables in the io folder and we should now see if we need to merge with the remainder from the original table + # + + + + odataset = mk(dataset=dataset+'_io',client=client) + # SQL = "SELECT * FROM io.:context_full_io".replace(':context',context) + config = bq.QueryJobConfig() + config.destination = client.dataset(odataset.dataset_id).table(args['from']) + config.use_query_cache = True + config.allow_large_results = True + config.priority = 'INTERACTIVE' + # + # + + schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema + fields = [" ".join(["CAST (",item.name,"AS",item.field_type.replace("INTEGER","INT64").replace("FLOAT","FLOAT64"),") ",item.name]) for item in schema] + SQL = SQL.replace("*"," , ".join(fields)) + # print (SQL) + out = client.query(SQL,location='US',job_config=config) + print (dir (out)) + + + + +import pandas as pd +import numpy as np +from google.oauth2 import service_account +import json + +# path = '../curation-prod.json' +# credentials = service_account.Credentials.from_service_account_file(path) +# df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard') +f = open('config.json') +config = json.loads(f.read()) +args = config['pipeline'] +f.close() + + +if __name__ == '__main__' : + """ + Usage : + finalize -- --contexts --from + """ + if 'move' in SYS_ARGS : + table = SYS_ARGS['from'] + contexts = [item['context'] for item in config['pipeline'] if item['from'] == args['from']] + args = dict(config,**{"private_key":"../curation-prod.json"}) + args = dict(args,**SYS_ARGS) + args['contexts'] = contexts + move(**args) \ No newline at end of file From 5c9fda4018f9fa37a965e42966d8e6d3b5ba1e49 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 1 Apr 2020 00:25:20 -0500 Subject: [PATCH 082/250] bug fix: CLI parameter handling (wrong reference) --- finalize.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/finalize.py b/finalize.py index a375b37..fd50d1d 100644 --- a/finalize.py +++ b/finalize.py @@ -139,7 +139,8 @@ import json # path = '../curation-prod.json' # credentials = service_account.Credentials.from_service_account_file(path) # df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard') -f = open('config.json') +filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config'] +f = open(filename) config = json.loads(f.read()) args = config['pipeline'] f.close() @@ -152,8 +153,10 @@ if __name__ == '__main__' : """ if 'move' in SYS_ARGS : table = SYS_ARGS['from'] - contexts = [item['context'] for item in config['pipeline'] if item['from'] == args['from']] + contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']] args = 
dict(config,**{"private_key":"../curation-prod.json"}) args = dict(args,**SYS_ARGS) args['contexts'] = contexts - move(**args) \ No newline at end of file + move(**args) + else: + print ("NOT YET READY !") \ No newline at end of file From debbd48627ff0dfb001d0bc083ec340853815a0b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 1 Apr 2020 00:53:56 -0500 Subject: [PATCH 083/250] bug fix: batch size per GPU --- pipeline.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index 5af9550..9a6b8aa 100644 --- a/pipeline.py +++ b/pipeline.py @@ -277,9 +277,8 @@ if __name__ == '__main__' : args[key] = _config[key] args = dict(args,**SYS_ARGS) - - - args['batch_size'] = 2000 if 'batch_size' not in args else int(args['batch_size']) + if 'batch_size' not in args : + args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size']) if 'dataset' not in args : args['dataset'] = 'combined20191004v2_deid' PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 From 87d54c508dd51a4929a593ef76dd723ee8396b60 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 1 Apr 2020 02:36:22 -0500 Subject: [PATCH 084/250] bug fix: with filter dataset --- finalize.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/finalize.py b/finalize.py index fd50d1d..5310e6d 100644 --- a/finalize.py +++ b/finalize.py @@ -66,7 +66,7 @@ class Utils : # Let us load the filter in the SQL Query FILTER = item['filter'] QUALIFIER = REVERSE_QUALIFIER[FILTER['qualifier'].upper()] - SQL_FILTER += [" ".join([FILTER['field'], QUALIFIER,'(',FILTER['value'],')'])] + SQL_FILTER += [" ".join([FILTER['field'], QUALIFIER,'(',FILTER['value'],')']).replace(":dataset",args['dataset'])] src = ".".join([args['dataset'],args['from']]) SQL += [" AND ".join(SQL_FILTER)] # @@ -126,7 +126,8 @@ def move (**args): SQL = SQL.replace("*"," , ".join(fields)) # print (SQL) out = client.query(SQL,location='US',job_config=config) - print (dir (out)) + print () + print (out.job_id) From 6d84b25d956dc90f0c8e6fdfec1f253e3adb98b9 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 2 Apr 2020 00:04:05 -0500 Subject: [PATCH 085/250] bug fix: multiple conditions on statement --- data/gan.py | 6 ++-- pipeline.py | 84 ++++++++++++++++++++++------------------------------- 2 files changed, 38 insertions(+), 52 deletions(-) diff --git a/data/gan.py b/data/gan.py index c54f5bd..a46740a 100644 --- a/data/gan.py +++ b/data/gan.py @@ -536,10 +536,10 @@ class Predict(GNet): self.values = args['values'] self.ROW_COUNT = args['row_count'] self.oROW_COUNT = self.ROW_COUNT - if args['no_value'] in ['na','','NA'] : - self.MISSING_VALUES = np.nan - else : + self.MISSING_VALUES = np.nan + if 'no_value' in args and args['no_value'] not in ['na','','NA'] : self.MISSING_VALUES = args['no_value'] + # self.MISSING_VALUES = args['no_value'] # self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value'] def load_meta(self, column): diff --git a/pipeline.py b/pipeline.py index 9a6b8aa..acb4f6c 100644 --- a/pipeline.py +++ b/pipeline.py @@ -20,7 +20,12 @@ class Components : class KEYS : PIPELINE_KEY = 'pipeline' SQL_FILTER = 'filter' - + @staticmethod + def get_filter (**args): + if args['qualifier'] == 'IN' : + return ' '.join([args['field'],args['qualifier'],'(',args['value'],')']) + else: + return ' '.join([args['field'],args['qualifier'],args['value']]) @staticmethod def get_logger(**args) : return 
factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) @@ -34,8 +39,11 @@ class Components : """ SQL = args['sql'] if Components.KEYS.SQL_FILTER in args : - SQL_FILTER = Components.KEYS.SQL_FILTER - condition = ' '.join([args[SQL_FILTER]['field'],args[SQL_FILTER]['qualifier'],'(',args[SQL_FILTER]['value'],')']) + FILTER_KEY = Components.KEYS.SQL_FILTER + SQL_FILTER = args[FILTER_KEY] if type(args[FILTER_KEY]) == list else [args[FILTER_KEY]] + # condition = ' '.join([args[FILTER_KEY]['field'],args[FILTER_KEY]['qualifier'],'(',args[FILTER_KEY]['value'],')']) + + condition = ' AND '.join([Components.get_filter(**item) for item in SQL_FILTER]) SQL = " ".join([SQL,'WHERE',condition]) SQL = SQL.replace(':dataset',args['dataset']) #+ " LI " @@ -76,13 +84,6 @@ class Components : logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) log_folder = args['logs'] if 'logs' in args else 'logs' - # _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} - - # _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) - # _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 - # _args['gpu'] = args['gpu'] if 'gpu' in args else 0 - - # # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 PART_SIZE = int(args['part_size']) partition = args['partition'] @@ -156,16 +157,22 @@ class Components : # columns = args['columns'] # df = np.array_split(df[columns].values,PART_SIZE) # df = pd.DataFrame(df[ int (partition) ],columns = columns) - info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"part_size":int(PART_SIZE)} + # max_rows = int(args['partition_max_rows']) if 'partition_max_rows' in args else 1000000 + # N = np.divide(df.shape[0],max_rows).astype(int) + 1 + info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"part_size":int(PART_SIZE),"partition-info":{"count":int(N),"max_rows":max_rows}} logger.write({"module":"generate","action":"partition","input":info}) _args['partition'] = int(partition) _args['continuous']= args['continuous'] if 'continuous' in args else [] - _args['data'] = df - # _args['data'] = reader() - #_args['data'] = _args['data'].astype(object) - # _args['num_gpu'] = 1 + # + # How many rows sub-partition must we divide this into ? + # -- Let us tray assessing - _dc = data.maker.generate(**_args) + + df = np.array_split(df,N) + _dc = pd.DataFrame() + # for mdf in df : + _args['data'] = df + _dc = _dc.append(data.maker.generate(**_args)) # # We need to post the generate the data in order to : # 1. 
compare immediately @@ -180,35 +187,13 @@ class Components : # info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} x = {} - # for name in args['columns'] : - # ident = data_comp.apply(lambda row: 1*(row[name]==row[name+'_io']),axis=1).sum() - # count = data_comp[name].unique().size - # _ident= data_comp.shape[1] - ident - # _count= data_comp[name+'_io'].unique().size - # _count= len(set(data_comp[name+'_io'].values.tolist())) - - # info['input']['logs'] += [{"name":name,"identical":int(ident),"no_identical":int(_ident),"original_count":count,"synthetic_count":_count}] - # for name in data_comp.columns.tolist() : - # g = pd.DataFrame(data_comp.groupby([name]).size()) - # g.columns = ['counts'] - # g[name] = g.index.tolist() - # g.index = np.arange(g.shape[0]) - # logs.append({"name":name,"counts": g.to_dict(orient='records')}) - # info['input']['logs'] = logs - # logger.write(info) - + # + # @TODO: Send data over to a process for analytics base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) cols = _dc.columns.tolist() for name in cols : _args['data'][name] = _dc[name] - # info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} - # if partition != '' : - # info['partition'] = int(partition) - # logger.write(info) - - # filename = os.sep.join([log_folder,'output',name+'.csv']) - # data_comp[[name]].to_csv(filename,index=False) # #-- Let us store all of this into bigquery @@ -265,7 +250,7 @@ if __name__ == '__main__' : f = [i for i in range(0,N) if PIPELINE[i]['context'] == index] index = f[0] if f else 0 # - # print + print ("..::: ",PIPELINE[index]['context']) args = (PIPELINE[index]) for key in _config : @@ -274,8 +259,8 @@ if __name__ == '__main__' : # skip in case of pipeline or if key exists in the selected pipeline (provided by index) # continue - args[key] = _config[key] + args = dict(args,**SYS_ARGS) if 'batch_size' not in args : args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size']) @@ -286,13 +271,13 @@ if __name__ == '__main__' : # @TODO: # Log what was initiated so we have context of this processing ... 
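
# For orientation, an illustrative (hypothetical) shape of the config.json file
# this __main__ block expects: global keys merged into the selected pipeline
# entry, and a "pipeline" list addressed by --index (numeric position or context
# name). The values below are made up; only the key names come from the code in
# this patch series.
EXAMPLE_CONFIG = {
    "dataset": "combined20191004v2_deid",
    "logs": "logs",
    "batch_size": 2000,
    "part_size": 8,
    "notify": "io",
    "pipeline": [
        {
            "context": "icd10",
            "sql": "SELECT * FROM :dataset.observation",
            "from": "observation",
            "columns": ["value_as_string"],
            "filter": {"field": "observation_source_concept_id", "qualifier": "IN", "value": "1585250"},
            "max_epochs": 150
        }
    ]
}
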
# - if 'listen' not in SYS_ARGS : - if 'file' in args : - DATA = pd.read_csv(args['file']) ; - else: - DATA = Components().get(args) - COLUMNS = DATA.columns - DATA = np.array_split(DATA,PART_SIZE) + # if 'listen' not in SYS_ARGS : + if 'file' in args : + DATA = pd.read_csv(args['file']) ; + else: + DATA = Components().get(args) + COLUMNS = DATA.columns + DATA = np.array_split(DATA,PART_SIZE) if 'generate' in SYS_ARGS : # @@ -325,6 +310,7 @@ if __name__ == '__main__' : args['gpu'] = index else: args['gpu']=0 + make = lambda _args: (Components()).generate(_args) job = Process(target=make,args=(args,)) job.name = 'generator # '+str(index) From 8418208da013fc9cd6b204fd636f3d60b82e70b8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 2 Apr 2020 01:36:57 -0500 Subject: [PATCH 086/250] bug fix: max_rows missing --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index acb4f6c..78ffefe 100644 --- a/pipeline.py +++ b/pipeline.py @@ -159,7 +159,7 @@ class Components : # df = pd.DataFrame(df[ int (partition) ],columns = columns) # max_rows = int(args['partition_max_rows']) if 'partition_max_rows' in args else 1000000 # N = np.divide(df.shape[0],max_rows).astype(int) + 1 - info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"part_size":int(PART_SIZE),"partition-info":{"count":int(N),"max_rows":max_rows}} + info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"part_size":int(PART_SIZE)} logger.write({"module":"generate","action":"partition","input":info}) _args['partition'] = int(partition) _args['continuous']= args['continuous'] if 'continuous' in args else [] From 00e640df21cc780f3155e1d2a8064a4cf49ec953 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 2 Apr 2020 07:52:09 -0500 Subject: [PATCH 087/250] bug fix: @TODO: split partition data so we always process a decent sized data --- pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 78ffefe..4985156 100644 --- a/pipeline.py +++ b/pipeline.py @@ -152,6 +152,7 @@ class Components : # reader = args['reader'] # df = reader() df = args['reader']() if 'reader' in args else args['data'] + # bounds = Components.split(df,MAX_ROWS,PART_SIZE) # if partition != '' : # columns = args['columns'] @@ -168,7 +169,7 @@ class Components : # -- Let us tray assessing - df = np.array_split(df,N) + _dc = pd.DataFrame() # for mdf in df : _args['data'] = df From c758e840046f085a3ac5a53a0793f292c9bdd2b6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 9 Apr 2020 10:51:11 -0500 Subject: [PATCH 088/250] setup finalize --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 1c8aef0..939c241 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.6","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.7","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 
'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' @@ -12,5 +12,5 @@ args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' if sys.version_info[0] == 2 : args['use_2to3'] = False args['use_2to3_exclude_fixers'] = ['lib2to3.fixes.fix_import'] -args['scripts']=['pipeline.py'] +args['scripts']=['pipeline.py','finalize.py'] setup(**args) From e78d72af2107d462d42d01b1d44e97011cfb52ad Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 9 Apr 2020 10:52:48 -0500 Subject: [PATCH 089/250] setup finalize --- finalize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/finalize.py b/finalize.py index 5310e6d..b163b0d 100644 --- a/finalize.py +++ b/finalize.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """ This file will perform basic tasks to finalize the GAN process by performing the following : - basic stats & analytics From ba1f38770d317c6eb38a870debd543e431ad82b9 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 9 Apr 2020 12:42:29 -0500 Subject: [PATCH 090/250] bug fix ... --- pipeline.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 4985156..2b5f028 100644 --- a/pipeline.py +++ b/pipeline.py @@ -166,8 +166,11 @@ class Components : _args['continuous']= args['continuous'] if 'continuous' in args else [] # # How many rows sub-partition must we divide this into ? - # -- Let us tray assessing - + # let us fix the data types here every _id field will be an np.int64... + # + for name in df.columns.tolist(): + if name.endwith('_id') : + df[name] = df[name].astype(np.int64) _dc = pd.DataFrame() From 944a3edbf6b718ed1d256c59fd777be46e56d1f3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 9 Apr 2020 12:57:25 -0500 Subject: [PATCH 091/250] bug fix ... with data-typing in data-frame --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 2b5f028..4cac273 100644 --- a/pipeline.py +++ b/pipeline.py @@ -169,7 +169,7 @@ class Components : # let us fix the data types here every _id field will be an np.int64... # for name in df.columns.tolist(): - if name.endwith('_id') : + if name.endswith('_id') : df[name] = df[name].astype(np.int64) From e4b164a34b142212379692172ae18dcf9e802af3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 9 Apr 2020 19:21:25 -0500 Subject: [PATCH 092/250] bug fix with typing --- pipeline.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 4cac273..3950942 100644 --- a/pipeline.py +++ b/pipeline.py @@ -168,9 +168,14 @@ class Components : # How many rows sub-partition must we divide this into ? # let us fix the data types here every _id field will be an np.int64... 
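
# A minimal sketch of the sub-partitioning idea raised in the comment above (and
# in the earlier "@TODO: split partition data" note): cap each call to
# data.maker.generate at max_rows rows and stitch the generated pieces back
# together. max_rows and the surrounding _args dictionary are assumptions for
# illustration, not part of the patch itself.
import numpy as np
import pandas as pd
import data.maker

def generate_in_chunks(df, _args, max_rows=1000000):
    n_chunks = int(np.ceil(df.shape[0] / max_rows))
    pieces = []
    for chunk in np.array_split(df, max(n_chunks, 1)):
        _args['data'] = chunk
        pieces.append(data.maker.generate(**_args))
    return pd.concat(pieces, ignore_index=True)
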
# + for name in df.columns.tolist(): + if name.endswith('_id') : - df[name] = df[name].astype(np.int64) + if df[name].isnull().sum() > 0 : + df[name].fillna(0,inplace=True) + else: + df[name] = df[name].astype(np.int64) _dc = pd.DataFrame() From 0016aec576004c562b9901cfd8478f850360bec2 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 11 Apr 2020 13:10:01 -0500 Subject: [PATCH 093/250] adding autopilot/drone mode for training and automatic data generation --- finalize.py | 38 ++++++++++++++++++++++++++++---------- pipeline.py | 3 +++ setup.py | 2 +- 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/finalize.py b/finalize.py index b163b0d..079830d 100644 --- a/finalize.py +++ b/finalize.py @@ -101,10 +101,15 @@ def move (**args): client = bq.Client.from_service_account_json(private_key) config = Utils.get.config(**args) dataset = args['dataset'] - SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in config] - SQL += [Utils.get.sql(**args)] - SQL = ('\n UNION ALL \n'.join(SQL).replace(':dataset','io')) - + if 'contexts' in args : + SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in config] + SQL += [Utils.get.sql(**args)] + SQL = ('\n UNION ALL \n'.join(SQL).replace(':dataset','io')) + else: + # + # moving a table to a designated location + tablename = args['from'] + SQL = "SELECT * FROM :dataset.:table".replace(":dataset",dataset).replace(":table",tablename) # # At this point we have gathered all the tables in the io folder and we should now see if we need to merge with the remainder from the original table @@ -128,7 +133,7 @@ def move (**args): # print (SQL) out = client.query(SQL,location='US',job_config=config) print () - print (out.job_id) + return (out.job_id) @@ -154,11 +159,24 @@ if __name__ == '__main__' : finalize -- --contexts --from
""" if 'move' in SYS_ARGS : - table = SYS_ARGS['from'] + # table = SYS_ARGS['from'] + # args = dict(config,**{"private_key":"../curation-prod.json"}) + args = dict(args,**SYS_ARGS) contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']] - args = dict(config,**{"private_key":"../curation-prod.json"}) - args = dict(args,**SYS_ARGS) - args['contexts'] = contexts - move(**args) + log = [] + if contexts : + args['contexts'] = contexts + log = move(**args) + + else: + tables = args['from'].split(',') + for name in tables : + name = name.strip() + args['from'] = name + log += [move(**args)] + print ("\n".join(log)) + + + else: print ("NOT YET READY !") \ No newline at end of file diff --git a/pipeline.py b/pipeline.py index 3950942..e54e746 100644 --- a/pipeline.py +++ b/pipeline.py @@ -117,6 +117,9 @@ class Components : logger.write({"module":"train","action":"train","input":info}) data.maker.train(**_args) + if set(['drone','autopilot']) in set( list(args.keys())) : + print (['drone mode enabled ....']) + data.maker.generate(**args) pass diff --git a/setup.py b/setup.py index 939c241..71e14e0 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.7","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 0d0ebee9c0f9e340e047984340994f414931ecad Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 12 Apr 2020 04:50:54 -0500 Subject: [PATCH 094/250] bug fix with autopilot/drone mode --- pipeline.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index e54e746..22c637d 100644 --- a/pipeline.py +++ b/pipeline.py @@ -74,6 +74,13 @@ class Components : # pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) df = args['data'] + if 'slice' in args and 'max_rows' in args['slice']: + max_rows = args['slice']['max_rows'] + if df.shape[0] > max_rows : + print (".. 
slicing ") + i = np.random.choice(df.shape[0],max_rows,replace=False) + df = df.iloc[i] + # if df.shape[0] == 0 : # print ("CAN NOT TRAIN EMPTY DATASET ") @@ -117,9 +124,10 @@ class Components : logger.write({"module":"train","action":"train","input":info}) data.maker.train(**_args) + if set(['drone','autopilot']) in set( list(args.keys())) : print (['drone mode enabled ....']) - data.maker.generate(**args) + self.generate(**args) pass @@ -155,6 +163,7 @@ class Components : # reader = args['reader'] # df = reader() df = args['reader']() if 'reader' in args else args['data'] + # bounds = Components.split(df,MAX_ROWS,PART_SIZE) # if partition != '' : From 65a1fadfcaeefc67df9b84ab5053103a72f218c6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 12 Apr 2020 20:07:15 -0500 Subject: [PATCH 095/250] bug fix data type and pipeline --- data/gan.py | 3 ++- pipeline.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/data/gan.py b/data/gan.py index a46740a..5975255 100644 --- a/data/gan.py +++ b/data/gan.py @@ -592,7 +592,8 @@ class Predict(GNet): # The code below will insure we have some acceptable cardinal relationships between id and synthetic values # - df = pd.DataFrame(np.round(f)).astype(np.int32) + # df = pd.DataFrame(np.round(f)).astype(np.int32) + df = pd.DataFrame(np.round(f),dtype=np.int32) p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values diff --git a/pipeline.py b/pipeline.py index 22c637d..c243ec3 100644 --- a/pipeline.py +++ b/pipeline.py @@ -125,9 +125,9 @@ class Components : logger.write({"module":"train","action":"train","input":info}) data.maker.train(**_args) - if set(['drone','autopilot']) in set( list(args.keys())) : + if 'autopilot' in ( list(args.keys())) : print (['drone mode enabled ....']) - self.generate(**args) + self.generate(args) pass From 310d599d06ab74bccc745b1e66993b9efbae2ead Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 13 Apr 2020 01:30:59 -0500 Subject: [PATCH 096/250] bug fix: volume of data --- data/gan.py | 2 +- pipeline.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index 5975255..4a0fa48 100644 --- a/data/gan.py +++ b/data/gan.py @@ -593,7 +593,7 @@ class Predict(GNet): # # df = pd.DataFrame(np.round(f)).astype(np.int32) - df = pd.DataFrame(np.round(f),dtype=np.int32) + df = pd.DataFrame(np.round(f),dtype=int) p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values diff --git a/pipeline.py b/pipeline.py index c243ec3..7017592 100644 --- a/pipeline.py +++ b/pipeline.py @@ -163,6 +163,13 @@ class Components : # reader = args['reader'] # df = reader() df = args['reader']() if 'reader' in args else args['data'] + + if 'slice' in args and 'max_rows' in args['slice']: + max_rows = args['slice']['max_rows'] + if df.shape[0] > max_rows : + print (".. 
slicing ") + i = np.random.choice(df.shape[0],max_rows,replace=False) + df = df.iloc[i] # bounds = Components.split(df,MAX_ROWS,PART_SIZE) From e27624b697cbccb742c4adcb8f84d60e0e944594 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 13 Apr 2020 10:22:32 -0500 Subject: [PATCH 097/250] bug fix: volume of data --- data/gan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index 4a0fa48..ff51aa8 100644 --- a/data/gan.py +++ b/data/gan.py @@ -593,7 +593,7 @@ class Predict(GNet): # # df = pd.DataFrame(np.round(f)).astype(np.int32) - df = pd.DataFrame(np.round(f),dtype=int) + df = pd.DataFrame(np.round(f),dtype=np.uint8) p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values From 52e91ec0631a1cf31a35472cb0b2a294c18bfc1e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 00:18:23 -0500 Subject: [PATCH 098/250] bug fix ... --- data/gan.py | 4 +++- pipeline.py | 22 ++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/data/gan.py b/data/gan.py index ff51aa8..5559e4d 100644 --- a/data/gan.py +++ b/data/gan.py @@ -593,7 +593,7 @@ class Predict(GNet): # # df = pd.DataFrame(np.round(f)).astype(np.int32) - df = pd.DataFrame(np.round(f),dtype=np.uint8) + df = pd.DataFrame(np.round(f),dtype=int) p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values @@ -637,6 +637,8 @@ class Predict(GNet): if self.logger : info = {"found":len(found),"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)} + if df.shape[1] > len(self.values) : + df = df.iloc[:len(self.values)] if INDEX > 0 : info =dict(info ,**{"selected":INDEX, "ratio": ratio[INDEX] }) else : diff --git a/pipeline.py b/pipeline.py index 7017592..12746fa 100644 --- a/pipeline.py +++ b/pipeline.py @@ -82,6 +82,9 @@ class Components : df = df.iloc[i] + # + # Certain columns need to be removed too large of a matrix + # # if df.shape[0] == 0 : # print ("CAN NOT TRAIN EMPTY DATASET ") # return @@ -130,7 +133,7 @@ class Components : self.generate(args) pass - + # @staticmethod def generate(self,args): """ @@ -171,7 +174,7 @@ class Components : i = np.random.choice(df.shape[0],max_rows,replace=False) df = df.iloc[i] - + # bounds = Components.split(df,MAX_ROWS,PART_SIZE) # if partition != '' : # columns = args['columns'] @@ -194,13 +197,15 @@ class Components : if df[name].isnull().sum() > 0 : df[name].fillna(0,inplace=True) else: - df[name] = df[name].astype(np.int64) + df[name] = df[name].astype(int) _dc = pd.DataFrame() # for mdf in df : - _args['data'] = df + _args['data'] = df + _dc = _dc.append(data.maker.generate(**_args)) + # # We need to post the generate the data in order to : # 1. 
compare immediately @@ -356,14 +361,7 @@ if __name__ == '__main__' : else: generator.generate(args) # Components.generate(args) - elif 'finalize' in args : - # - # This will finalize a given set of synthetic operations into a table - # - idataset = args['input'] if 'input' in args else 'io' #-- input dataset - odataset = args['output'] #-- output dataset - labels = [name.strip() for name in args['labels'].split(',') ] - + else: # DATA = np.array_split(DATA,PART_SIZE) From 50da9098679984fd28314037c6c75c9da95f0430 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 00:31:54 -0500 Subject: [PATCH 099/250] bug fix: no value data-type np.nan_to_num --- data/gan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index 5559e4d..5fc7032 100644 --- a/data/gan.py +++ b/data/gan.py @@ -536,7 +536,7 @@ class Predict(GNet): self.values = args['values'] self.ROW_COUNT = args['row_count'] self.oROW_COUNT = self.ROW_COUNT - self.MISSING_VALUES = np.nan + self.MISSING_VALUES = np.nan_to_num(np.nan) if 'no_value' in args and args['no_value'] not in ['na','','NA'] : self.MISSING_VALUES = args['no_value'] From 821cec8dd77be3843503fdb788883fd9ee38a614 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 01:54:11 -0500 Subject: [PATCH 100/250] fixed issue around data-types/casting misbehavior with pandas and missing values --- data/gan.py | 11 ++++------- data/maker/__init__.py | 7 ++++++- pipeline.py | 5 +++-- setup.py | 2 +- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/data/gan.py b/data/gan.py index 5fc7032..8a0c7a7 100644 --- a/data/gan.py +++ b/data/gan.py @@ -647,13 +647,8 @@ class Predict(GNet): info['ratio'] = __ratio info['partition'] = self.PARTITION self.logger.write({"module":"gan-generate","action":"generate","input":info}) - df.columns = self.values - if len(found) or df.columns.size == len(self.values): - # print (len(found),NTH_VALID_CANDIDATE) - # x = df * self.values - # - # let's get the missing rows (if any) ... 
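The hunk being reworked here decodes the generator's one-hot candidate matrix back into values and back-fills the rows that came out empty. A toy version of that step, with an invented value space and a fake candidate matrix:

    import numpy as np
    import pandas as pd

    candidate = pd.DataFrame([[0, 1, 0], [0, 0, 0], [1, 0, 0]])   # fake generator output
    values = np.array(["female", "male", "unknown"])              # assumed value space

    blank = candidate.apply(lambda row: np.sum(row) == 0, axis=1)
    decoded = candidate[~blank].apply(
        lambda row: values[np.random.choice(np.where(row != 0)[0], 1)[0]], axis=1)
    # rows the generator left empty are filled by sampling from the same value space
    decoded = pd.concat([decoded, pd.Series(np.random.choice(values, blank.sum()))],
                        ignore_index=True)
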
- # + # df.columns = self.values + if len(found) or df.columns.size <= len(self.values): ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1) # print ([' **** ',ii.sum()]) @@ -669,6 +664,8 @@ class Predict(GNet): # Log the findings here in terms of ratio, missing, candidate count # print ([np.max(ratio),len(missing),len(found),i]) i = np.where(ii == 0)[0] + + df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) df.columns = columns df = df[columns[0]].append(pd.Series(missing)) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 378c226..25392f9 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -190,7 +190,7 @@ def generate(**args): # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value'] - + _df = df.copy() for col in column : args['context'] = col @@ -237,6 +237,11 @@ def generate(**args): _df[col] = r[col] # + # Let's cast the type to the original type (it makes the data more usable) + # + otype = df[col].dtype + _df[col] = _df[col].astype(otype) + # # @TODO: log basic stats about the synthetic attribute # # print (r)s diff --git a/pipeline.py b/pipeline.py index 12746fa..c678a89 100644 --- a/pipeline.py +++ b/pipeline.py @@ -195,8 +195,7 @@ class Components : if name.endswith('_id') : if df[name].isnull().sum() > 0 : - df[name].fillna(0,inplace=True) - else: + df[name].fillna(np.nan_to_num(np.nan),inplace=True) df[name] = df[name].astype(int) @@ -253,9 +252,11 @@ class Components : print (_args['data'].head()) else: Components.lock.acquire() + data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' + print (_args['data'].dtypes) _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) Components.lock.release() _id = 'dataset' diff --git a/setup.py b/setup.py index 71e14e0..207cb6f 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 3dde3bf4ef6eb14d8f094ec6561256d5dcb0001b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 07:26:24 -0500 Subject: [PATCH 101/250] fixed issue around data-types/casting misbehavior with pandas and missing values --- pipeline.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index c678a89..80fed9e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -133,7 +133,7 @@ class Components : self.generate(args) pass - + # @staticmethod def generate(self,args): """ @@ -168,11 +168,13 @@ class Components : df = args['reader']() if 'reader' in args else args['data'] if 'slice' in args and 'max_rows' in 
args['slice']: + max_rows = args['slice']['max_rows'] if df.shape[0] > max_rows : print (".. slicing ") i = np.random.choice(df.shape[0],max_rows,replace=False) df = df.iloc[i] + # bounds = Components.split(df,MAX_ROWS,PART_SIZE) @@ -182,7 +184,7 @@ class Components : # df = pd.DataFrame(df[ int (partition) ],columns = columns) # max_rows = int(args['partition_max_rows']) if 'partition_max_rows' in args else 1000000 # N = np.divide(df.shape[0],max_rows).astype(int) + 1 - info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"part_size":int(PART_SIZE)} + info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"space":df[args['columns'][0]].unique().size, "part_size":int(PART_SIZE)} logger.write({"module":"generate","action":"partition","input":info}) _args['partition'] = int(partition) _args['continuous']= args['continuous'] if 'continuous' in args else [] @@ -256,7 +258,7 @@ class Components : data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' - print (_args['data'].dtypes) + _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) Components.lock.release() _id = 'dataset' From f1076f441b712e860feb1b7a5ce0e16489c9b02d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 15:14:38 -0500 Subject: [PATCH 102/250] limitations on the matrix shape (feature space limitation) per partition --- data/bridge.py | 109 ++++++++++++++++++++++++++--------------- data/maker/__init__.py | 11 +++-- 2 files changed, 77 insertions(+), 43 deletions(-) diff --git a/data/bridge.py b/data/bridge.py index 019f065..41c0429 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -153,7 +153,7 @@ class Binary : """ This is a utility class to import and export a data to/from a binary matrix """ - def __stream(self,column) : + def __stream(self,column,size=-1) : """ This function will convert a column into a binary matrix with the value-space representing each column of the resulting matrix :column a column vector i.e every item is a row @@ -162,12 +162,19 @@ class Binary : values = column.dropna().unique() values.sort() + column = column.values # # Let's treat the case of missing values i.e nulls # row_count,col_count = column.size,values.size + if row_count * col_count > size and row_count < size: + N = np.divide(size,row_count).astype(int) + i = np.random.choice(col_count,N) + values = values[-i] + col_count = N + - matrix = [ np.zeros(col_count) for i in np.arange(row_count)] + matrix = [ np.zeros(col_count,dtype=np.float32) for i in np.arange(row_count)] # # let's create a binary matrix of the feature that was passed in # The indices of the matrix are inspired by classical x,y axis @@ -176,14 +183,31 @@ class Binary : for yi in np.arange(row_count) : value = column[yi] - if value not in values : - continue - xi = np.where(values == value) - xi = xi[0][0] #-- column index - matrix[yi][xi] = 1 + # if value not in values : + # continue + xi = np.where(values == value) + if xi and xi[0].size > 0: + xi = xi[0][0] #-- column index + matrix[yi][xi] = 1 + + return pd.DataFrame(matrix,columns=values) + def apply(self,column,size): + return self.__stream(column,size) + def get_column_values(self,column,size=-1): + values = column.dropna().unique() + values.sort() - return matrix - def Export(self,df) : + # + # Let's treat the case of 
missing values i.e nulls + # + row_count,col_count = column.size,values.size + if row_count * col_count > size and row_count < size: + N = np.divide(size,row_count).astype(int) + i = np.random.choice(col_count,N) + values = values[-i] + return values + + def _Export(self,df) : """ This function will convert a data-frame to a binary matrix :return _map,matrix @@ -192,8 +216,9 @@ class Binary : # This will give us a map of how each column was mapped to a bitstream # _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0) - _map = df.fillna('').apply(lambda column: self.__stream(column),axis=0) + # _map = df.fillna(np.nan).apply(lambda column: column,axis=0) + print (df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0)) # # We will merge this to have a healthy matrix _matrix = _map.apply(lambda row: list(list(itertools.chain(*row.values.tolist()))),axis=1) @@ -239,37 +264,41 @@ if __name__ == '__main__' : --pseudo will create pseudonyms for a given --export will export data to a specified location """ - has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys() - has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys() - if has_basic and has_action : - builder = Builder() - if 'export' in SYS_ARGS : - print () - print ("exporting ....") - if not os.path.exists(SYS_ARGS['export']) : - os.mkdir(SYS_ARGS['export']) - SQL = builder.encode(**SYS_ARGS) - # - # Assuming the user wants to filter the records returned : - # + df = pd.read_csv('sample.csv') + print ( pd.get_dummies(df.race)) + print ( (Binary()).apply(df.race, 30)) + + # has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys() + # has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys() + # if has_basic and has_action : + # builder = Builder() + # if 'export' in SYS_ARGS : + # print () + # print ("exporting ....") + # if not os.path.exists(SYS_ARGS['export']) : + # os.mkdir(SYS_ARGS['export']) + # SQL = builder.encode(**SYS_ARGS) + # # + # # Assuming the user wants to filter the records returned : + # # - credentials = service_account.Credentials.from_service_account_file(SYS_ARGS['key']) - df = pd.read_gbq(SQL,credentials =credentials,dialect='standard') - FILENAME = os.sep.join([SYS_ARGS['export'],SYS_ARGS['table']+'.csv']) - # - # This would allow us to export it to wherever we see fit - print (FILENAME) - df.to_csv(FILENAME,index=False) - f = open(FILENAME.replace('.csv','.sql'),'w+') - f.write(SQL) - f.close() - elif 'pseudo' in SYS_ARGS : - builder.process(**SYS_ARGS) - else: - print ("") - print (SYS_ARGS.keys()) - print ("has basic ",has_basic) - print ("has action ",has_action) + # credentials = service_account.Credentials.from_service_account_file(SYS_ARGS['key']) + # df = pd.read_gbq(SQL,credentials =credentials,dialect='standard') + # FILENAME = os.sep.join([SYS_ARGS['export'],SYS_ARGS['table']+'.csv']) + # # + # # This would allow us to export it to wherever we see fit + # print (FILENAME) + # df.to_csv(FILENAME,index=False) + # f = open(FILENAME.replace('.csv','.sql'),'w+') + # f.write(SQL) + # f.close() + # elif 'pseudo' in SYS_ARGS : + # builder.process(**SYS_ARGS) + # else: + # print ("") + # print (SYS_ARGS.keys()) + # print ("has basic ",has_basic) + # print ("has action ",has_action) # pseudonym.apply(table='person',dataset='wgan_original',key='./curation-test-2.json') # args = {"dataset":"wgan_original","table":"observation","key":"./curation-test-2.json"} # builder = 
Builder() diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 25392f9..072b2f2 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -135,7 +135,9 @@ def train (**args) : # df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False) # print (df[col].dtypes) # print (df[col].dropna/(axis=1).unique()) - args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values + # args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values + msize = args['matrix_size'] if 'matrix_size' in args else -1 + args['real'] = (Binary()).apply(df[col],msize) @@ -190,7 +192,7 @@ def generate(**args): # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value'] - + bhandler = Binary() _df = df.copy() for col in column : args['context'] = col @@ -207,7 +209,10 @@ def generate(**args): # values = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32).T # else: - values = df[col].dropna().unique().tolist() + # values = df[col].dropna().unique().tolist() + msize = args['matrix_size'] if 'matrix_size' in args else -1 + values = bhandler.get_column_values(df[col]) + From f91a58e534417f1826dce701d6fa2ae30d43f4ca Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 15:52:55 -0500 Subject: [PATCH 103/250] limitations on the matrix shape (feature space limitation) per partition --- data/bridge.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/data/bridge.py b/data/bridge.py index 41c0429..ac79272 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -167,9 +167,11 @@ class Binary : # Let's treat the case of missing values i.e nulls # row_count,col_count = column.size,values.size - if row_count * col_count > size and row_count < size: - N = np.divide(size,row_count).astype(int) - i = np.random.choice(col_count,N) + # if row_count * col_count > size and row_count < size: + if col_count > size : + # N = np.divide(size,row_count).astype(int) + # N = + i = np.random.choice(col_count,size) values = values[-i] col_count = N From ed86ff0add1e177b0b2f54139eef932cd3da1d7b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 15:54:11 -0500 Subject: [PATCH 104/250] limitations on the matrix shape (feature space limitation) per partition --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 207cb6f..44a59b1 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.3.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 1cf9c6e47ab608b2d067980e33663a622f8e1f6e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 16:01:23 -0500 Subject: [PATCH 105/250] bug fix ... 
forgot to update a redundancy --- data/bridge.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/data/bridge.py b/data/bridge.py index ac79272..a86deef 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -174,6 +174,7 @@ class Binary : i = np.random.choice(col_count,size) values = values[-i] col_count = N + matrix = [ np.zeros(col_count,dtype=np.float32) for i in np.arange(row_count)] @@ -203,10 +204,12 @@ class Binary : # Let's treat the case of missing values i.e nulls # row_count,col_count = column.size,values.size - if row_count * col_count > size and row_count < size: - N = np.divide(size,row_count).astype(int) - i = np.random.choice(col_count,N) + if col_count > size : + # N = np.divide(size,row_count).astype(int) + # N = + i = np.random.choice(col_count,size) values = values[-i] + col_count = N return values def _Export(self,df) : From 8f390931f33bc462f6b57603de65c9d604b6ed54 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 16:24:02 -0500 Subject: [PATCH 106/250] bug fix: matrix space restriction --- data/bridge.py | 6 +++--- data/maker/__init__.py | 4 ++-- pipeline.py | 24 +++++++----------------- 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/data/bridge.py b/data/bridge.py index a86deef..2e38431 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -173,7 +173,7 @@ class Binary : # N = i = np.random.choice(col_count,size) values = values[-i] - col_count = N + col_count = size @@ -209,7 +209,7 @@ class Binary : # N = i = np.random.choice(col_count,size) values = values[-i] - col_count = N + col_count = size return values def _Export(self,df) : @@ -271,7 +271,7 @@ if __name__ == '__main__' : """ df = pd.read_csv('sample.csv') print ( pd.get_dummies(df.race)) - print ( (Binary()).apply(df.race, 30)) + print ( (Binary()).apply(df.race, 2)) # has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys() # has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys() diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 072b2f2..78bc08d 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -136,7 +136,7 @@ def train (**args) : # print (df[col].dtypes) # print (df[col].dropna/(axis=1).unique()) # args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values - msize = args['matrix_size'] if 'matrix_size' in args else -1 + msize = args['matrix_size'] if 'matrix_size' in args else 128 args['real'] = (Binary()).apply(df[col],msize) @@ -210,7 +210,7 @@ def generate(**args): # else: # values = df[col].dropna().unique().tolist() - msize = args['matrix_size'] if 'matrix_size' in args else -1 + msize = args['matrix_size'] if 'matrix_size' in args else 128 values = bhandler.get_column_values(df[col]) diff --git a/pipeline.py b/pipeline.py index 80fed9e..54e12c4 100644 --- a/pipeline.py +++ b/pipeline.py @@ -73,21 +73,7 @@ class Components : # @TODO: we need to log something here about the parameters being passed # pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) df = args['data'] - - if 'slice' in args and 'max_rows' in args['slice']: - max_rows = args['slice']['max_rows'] - if df.shape[0] > max_rows : - print (".. 
slicing ") - i = np.random.choice(df.shape[0],max_rows,replace=False) - df = df.iloc[i] - - - # - # Certain columns need to be removed too large of a matrix - # - # if df.shape[0] == 0 : - # print ("CAN NOT TRAIN EMPTY DATASET ") - # return + # # Now we can parse the arguments and submit the entire thing to training # @@ -102,8 +88,8 @@ class Components : _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) if 'batch_size' in args : _args['batch_size'] = int(args['batch_size']) - - # + + _args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128 # # We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel # if int(args['num_gpu']) > 1 : @@ -157,6 +143,8 @@ class Components : _args['num_gpu'] = 1 os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) _args['no_value']= args['no_value'] + _args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128 + # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 @@ -298,6 +286,8 @@ if __name__ == '__main__' : args[key] = _config[key] args = dict(args,**SYS_ARGS) + if 'matrix_size' in args : + args['matrix_size'] = int(args['matrix_size']) if 'batch_size' not in args : args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size']) if 'dataset' not in args : From bddba3d908ba5b5680ec6a2c2d7c4101ceeb2807 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 16:32:11 -0500 Subject: [PATCH 107/250] bug fix ... --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 78bc08d..527d245 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -211,7 +211,7 @@ def generate(**args): # else: # values = df[col].dropna().unique().tolist() msize = args['matrix_size'] if 'matrix_size' in args else 128 - values = bhandler.get_column_values(df[col]) + values = bhandler.get_column_values(df[col],msize) From b8f59f85d50b4e82fd61cb7e7691c2f18632422e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 15 Apr 2020 09:18:06 -0500 Subject: [PATCH 108/250] bug fix: with column count --- data/bridge.py | 31 +++++++++++++++++++------------ data/maker/__init__.py | 6 +++--- setup.py | 2 +- 3 files changed, 23 insertions(+), 16 deletions(-) diff --git a/data/bridge.py b/data/bridge.py index 2e38431..137a504 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -160,20 +160,17 @@ class Binary : """ # values = np.unique(column) - values = column.dropna().unique() - values.sort() + # values = column.dropna().unique() + + # values.sort() + # column = column.values + values = self.get_column(column,size) column = column.values # # Let's treat the case of missing values i.e nulls # row_count,col_count = column.size,values.size # if row_count * col_count > size and row_count < size: - if col_count > size : - # N = np.divide(size,row_count).astype(int) - # N = - i = np.random.choice(col_count,size) - values = values[-i] - col_count = size @@ -196,7 +193,17 @@ class Binary : return pd.DataFrame(matrix,columns=values) def apply(self,column,size): return self.__stream(column,size) - def get_column_values(self,column,size=-1): + def get_column(self,column,size=-1): + """ + This function will return the columns that are available for processing ... 
+ """ + values = column.dropna().value_counts().index + if size > 0 : + values = values[:size] + values.sort_values() + return values + + def _get_column_values(self,column,size=-1): values = column.dropna().unique() values.sort() @@ -204,7 +211,7 @@ class Binary : # Let's treat the case of missing values i.e nulls # row_count,col_count = column.size,values.size - if col_count > size : + if col_count > size and size > 0: # N = np.divide(size,row_count).astype(int) # N = i = np.random.choice(col_count,size) @@ -270,8 +277,8 @@ if __name__ == '__main__' : --export will export data to a specified location """ df = pd.read_csv('sample.csv') - print ( pd.get_dummies(df.race)) - print ( (Binary()).apply(df.race, 2)) + print ( df.race.value_counts()) + print ( (Binary()).apply(df['race'], 3)) # has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys() # has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys() diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 527d245..26cc4de 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -136,7 +136,7 @@ def train (**args) : # print (df[col].dtypes) # print (df[col].dropna/(axis=1).unique()) # args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values - msize = args['matrix_size'] if 'matrix_size' in args else 128 + msize = args['matrix_size'] if 'matrix_size' in args else -1 args['real'] = (Binary()).apply(df[col],msize) @@ -210,8 +210,8 @@ def generate(**args): # else: # values = df[col].dropna().unique().tolist() - msize = args['matrix_size'] if 'matrix_size' in args else 128 - values = bhandler.get_column_values(df[col],msize) + msize = args['matrix_size'] if 'matrix_size' in args else -1 + values = bhandler.get_column(df[col],msize) diff --git a/setup.py b/setup.py index 44a59b1..0370cdc 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.3.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.3.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 02d8588f5b2bac248aa482a32f7d0fbe8ad312d2 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 15 Apr 2020 09:19:49 -0500 Subject: [PATCH 109/250] bug fix: with column count --- pipeline.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pipeline.py b/pipeline.py index 54e12c4..d218216 100644 --- a/pipeline.py +++ b/pipeline.py @@ -155,13 +155,13 @@ class Components : # df = reader() df = args['reader']() if 'reader' in args else args['data'] - if 'slice' in args and 'max_rows' in args['slice']: + # if 'slice' in args and 'max_rows' in args['slice']: - max_rows = args['slice']['max_rows'] - if df.shape[0] > max_rows : - print (".. slicing ") - i = np.random.choice(df.shape[0],max_rows,replace=False) - df = df.iloc[i] + # max_rows = args['slice']['max_rows'] + # if df.shape[0] > max_rows : + # print (".. 
slicing ") + # i = np.random.choice(df.shape[0],max_rows,replace=False) + # df = df.iloc[i] From 9fff0d123e9a1a4d89dd996f9c2a10db5fc78be7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 15 Apr 2020 10:23:14 -0500 Subject: [PATCH 110/250] bug fix urgh --- data/bridge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/bridge.py b/data/bridge.py index 137a504..902c6d3 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -198,9 +198,9 @@ class Binary : This function will return the columns that are available for processing ... """ values = column.dropna().value_counts().index - if size > 0 : + if size > 0 and column.size > size: values = values[:size] - values.sort_values() + values.sort_values() return values def _get_column_values(self,column,size=-1): From f9da0f1ce7fa27b53c607c21c3030bb7dd8762f5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 15 Apr 2020 15:22:43 -0500 Subject: [PATCH 111/250] fix: table schema (urgh) --- pipeline.py | 63 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 9 deletions(-) diff --git a/pipeline.py b/pipeline.py index d218216..d636c2f 100644 --- a/pipeline.py +++ b/pipeline.py @@ -7,6 +7,7 @@ import os from multiprocessing import Process, Lock import pandas as pd from google.oauth2 import service_account +from google.cloud import bigquery as bq import data.maker from data.params import SYS_ARGS @@ -115,11 +116,45 @@ class Components : data.maker.train(**_args) if 'autopilot' in ( list(args.keys())) : - print (['drone mode enabled ....']) + print (['autopilot mode enabled ....']) self.generate(args) pass + def shuffle(self,args): + """ + """ + df = args['reader']() if 'reader' in args else args['data'] + + + col = args['columns'][0] + distrib = df[col].value_counts() + values = np.array(distrib.index) + counts = np.array(distrib.values) + np.random.shuffle(values) + np.random.shuffle(counts) + N = len (values) + theta = np.random.sample() + pad = 0 + # print (values) + iovalues = np.zeros(df.shape[0],dtype=df[col].dtype) + for i in range(N) : + # n = int(counts[i] - counts[i]*theta) + n = counts[i] + print ([counts[i],theta,n]) + index = np.where(iovalues == 0)[0] + if index.size > 0 and index.size > n: + index = index[:n] + iovalues[index] = values[i] + + + np.random.shuffle(iovalues) + df[col] = iovalues + return df + def post(self,args): + pass + + # @staticmethod def generate(self,args): """ @@ -181,12 +216,12 @@ class Components : # let us fix the data types here every _id field will be an np.int64... 
# - for name in df.columns.tolist(): + # for name in df.columns.tolist(): - if name.endswith('_id') : - if df[name].isnull().sum() > 0 : - df[name].fillna(np.nan_to_num(np.nan),inplace=True) - df[name] = df[name].astype(int) + # if name.endswith('_id') : + # if df[name].isnull().sum() > 0 and name not in ['unique_device_id']: + # df[name].fillna(np.nan_to_num(np.nan),inplace=True) + # df[name] = df[name].astype(int) _dc = pd.DataFrame() @@ -232,6 +267,11 @@ class Components : _id = 'path' else: + client = bq.Client.from_service_account_json(args["private_key"]) + full_schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema + full_schema = [{'name':item.name,'type':item.field_type,'description':item.description} for item in full_schema] + io_schema = [{'name':item['name'],'type':item['type'],'description':item['description']} for item in full_schema if item['name'] in args['columns']] + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') _pname = os.sep.join([folder,table+'.csv']) _fname = table.replace('_io','_full_io') @@ -243,11 +283,11 @@ class Components : else: Components.lock.acquire() - data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) + data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000,table_schema=io_schema) INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' - _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) + _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000,table_schema=full_schema) Components.lock.release() _id = 'dataset' info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } @@ -354,7 +394,12 @@ if __name__ == '__main__' : else: generator.generate(args) # Components.generate(args) - + elif 'shuffle' in SYS_ARGS: + args['data'] = DATA[0] + _df = (Components()).shuffle(args) + print (DATA[0][args['columns']]) + print () + print (_df[args['columns']]) else: # DATA = np.array_split(DATA,PART_SIZE) From f920ba0eda1844fab4d71d68a835ab5a5ca54782 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 15 Apr 2020 15:51:53 -0500 Subject: [PATCH 112/250] bug fix --- pipeline.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/pipeline.py b/pipeline.py index d636c2f..5ef3013 100644 --- a/pipeline.py +++ b/pipeline.py @@ -216,6 +216,15 @@ class Components : # let us fix the data types here every _id field will be an np.int64... 
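Taken together with the hunk that follows, the schema handling amounts to: read the destination table's schema once, coerce the frame's dtypes to it, then load with to_gbq. A condensed sketch mirroring the calls used in these patches; the key file, dataset and table names are placeholders:

    import numpy as np
    import pandas as pd
    from google.cloud import bigquery as bq

    client = bq.Client.from_service_account_json("curation-prod.json")   # placeholder key
    schema = client.get_table(client.dataset("raw").table("person")).schema

    df = pd.DataFrame({"person_id": ["1", "2"], "race": ["a", "b"]})     # toy frame
    for item in schema:
        if item.name not in df.columns:
            continue
        if item.field_type == 'INTEGER' and df[item.name].dtype != np.int64:
            df[item.name] = df[item.name].astype(np.int64)
        elif item.field_type == 'STRING' and df[item.name].dtype != object:
            df[item.name] = df[item.name].astype(object)
    # the coerced frame is then appended with df.to_gbq(...), as in the diff below
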
# + schema = args['schema'] + for item in schema : + if item.field_type == 'INTEGER' and df[item.name].dtype != np.int64: + df[item.name] = np.array(df[item.name].values,dtype=np.int64) + elif item.field_type == 'STRING' and df[item.name].dtype != object : + df[item.name] = np.array(df[item.name],dtype=object) + + + # for name in df.columns.tolist(): # if name.endswith('_id') : @@ -243,7 +252,7 @@ class Components : # performing basic analytics on the synthetic data generated (easy to quickly asses) # info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} - x = {} + # # @TODO: Send data over to a process for analytics @@ -267,10 +276,6 @@ class Components : _id = 'path' else: - client = bq.Client.from_service_account_json(args["private_key"]) - full_schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema - full_schema = [{'name':item.name,'type':item.field_type,'description':item.description} for item in full_schema] - io_schema = [{'name':item['name'],'type':item['type'],'description':item['description']} for item in full_schema if item['name'] in args['columns']] credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') _pname = os.sep.join([folder,table+'.csv']) @@ -282,12 +287,8 @@ class Components : print (_args['data'].head()) else: Components.lock.acquire() - - data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000,table_schema=io_schema) - - INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' - - _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000,table_schema=full_schema) + data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) + _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) Components.lock.release() _id = 'dataset' info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } @@ -340,11 +341,15 @@ if __name__ == '__main__' : # if 'listen' not in SYS_ARGS : if 'file' in args : DATA = pd.read_csv(args['file']) ; + schema = [] else: DATA = Components().get(args) + client = bq.Client.from_service_account_json(args["private_key"]) + schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema + COLUMNS = DATA.columns DATA = np.array_split(DATA,PART_SIZE) - + args['schema'] = schema if 'generate' in SYS_ARGS : # # Let us see if we have partitions given the log folder From 71097103da4d3b4618ee83e3ec50d5a96ccbc8ef Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 29 Apr 2020 01:27:25 -0500 Subject: [PATCH 113/250] fix: handling outliers and missing values --- data/bridge.py | 15 ++++-- data/gan.py | 61 +++++++-------------- data/maker/__init__.py | 81 ++++++---------------------- data/params.py | 4 +- finalize.py | 120 ++++++++++++++++++++++++++++++----------- pipeline.py | 47 +++------------- setup.py | 4 +- 7 files changed, 149 insertions(+), 183 deletions(-) diff --git a/data/bridge.py b/data/bridge.py index 902c6d3..3116a4b 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -197,12 +197,21 @@ class Binary : """ This function will return the columns that are available for processing ... 
""" - values = column.dropna().value_counts().index + values = column.dropna().value_counts().index.values + if size > 0 and column.size > size: values = values[:size] - values.sort_values() + values.sort() return values - + def get_missing(self,column,size=-1): + values = column.dropna().value_counts().index.values + if size > 0 and column.size > size : + values = values[size:] + else: + values = np.array([]) + values.sort() + return values.tolist(); + def _get_column_values(self,column,size=-1): values = column.dropna().unique() values.sort() diff --git a/data/gan.py b/data/gan.py index 8a0c7a7..1418a04 100644 --- a/data/gan.py +++ b/data/gan.py @@ -536,9 +536,10 @@ class Predict(GNet): self.values = args['values'] self.ROW_COUNT = args['row_count'] self.oROW_COUNT = self.ROW_COUNT - self.MISSING_VALUES = np.nan_to_num(np.nan) - if 'no_value' in args and args['no_value'] not in ['na','','NA'] : - self.MISSING_VALUES = args['no_value'] + # self.MISSING_VALUES = np.nan_to_num(np.nan) + # if 'no_value' in args and args['no_value'] not in ['na','','NA'] : + # self.MISSING_VALUES = args['no_value'] + self.MISSING_VALUES = args['missing'] # self.MISSING_VALUES = args['no_value'] # self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value'] @@ -650,15 +651,18 @@ class Predict(GNet): # df.columns = self.values if len(found) or df.columns.size <= len(self.values): ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1) - # print ([' **** ',ii.sum()]) - - if ii.shape[0] > 0 : + missing = [] + if ii.sum() > 0 : # - #@TODO Have this be a configurable variable - - missing = np.repeat(self.MISSING_VALUES, np.where(ii==1)[0].size) - else: - missing = [] + # If the generator had a reductive effect we should be able to get random values from either : + # - The space of outliers + # - existing values for smaller spaces that have suffered over training + # + + N = ii.sum() + missing_values = self.MISSING_VALUES if self.MISSING_VALUES else self.values + missing = np.random.choice(missing_values,N) + # missing = [] # # @TODO: # Log the findings here in terms of ratio, missing, candidate count @@ -669,6 +673,8 @@ class Predict(GNet): df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) df.columns = columns df = df[columns[0]].append(pd.Series(missing)) + + if self.logger : info= {"missing": i.size,"rows":df.shape[0],"cols":1,'partition':self.PARTITION} @@ -680,40 +686,9 @@ class Predict(GNet): tf.compat.v1.reset_default_graph() df = pd.DataFrame(df) df.columns = columns + np.random.shuffle(df[columns[0]].values) return df.to_dict(orient='list') - # return df.to_dict(orient='list') - # count = str(len(os.listdir(self.out_dir))) - # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv']) - # df.to_csv(_name,index=False) - - # output.extend(np.round(f)) - - # for m in range(2): - # for n in range(2, self.NUM_LABELS): - # idx1 = (demo[:, m] == 1) - # idx2 = (demo[:, n] == 1) - # idx = [idx1[j] and idx2[j] for j in range(len(idx1))] - # num = np.sum(idx) - # print ("___________________list__") - # print (idx1) - # print (idx2) - # print (idx) - # print (num) - # print ("_____________________") - # nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU)) - # label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS)) - # label_input[:, n] = 1 - # label_input[:, m] = 1 - # output = [] - # for i in range(nbatch): - # f = sess.run(fake,feed_dict={y: 
label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]}) - # output.extend(np.round(f)) - # output = np.array(output)[:num] - # print ([m,n,output]) - - # np.save(self.out_dir + str(m) + str(n), output) - if __name__ == '__main__' : # diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 26cc4de..3e2c9aa 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -21,29 +21,8 @@ class ContinuousToDiscrete : """ This function will convert a continous stream of information into a variety a bit stream of bins """ - # BOUNDS = np.repeat(np.divide(X.max(),n),n).cumsum().tolist() - # print ( X.values.astype(np.float32)) - # print ("___________________________") values = np.array(X).astype(np.float32) BOUNDS = ContinuousToDiscrete.bounds(values,n) - # _map = [{"index":BOUNDS.index(i),"ubound":i} for i in BOUNDS] - # _matrix = [] - # m = [] - # for value in X : - # x_ = np.zeros(n) - - # for row in BOUNDS : - - # if value>= row.left and value <= row.right : - # index = BOUNDS.index(row) - # x_[index] = 1 - # break - # _matrix += x_.tolist() - # # - # # for items in BOUNDS : - # # index = BOUNDS.index(items) - - # return np.array(_matrix).reshape(len(X),n) matrix = np.repeat(np.zeros(n),len(X)).reshape(len(X),n) @@ -123,25 +102,9 @@ def train (**args) : # @TODO : Consider performing this task on several threads/GPUs simulataneously # for col in column : - # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values - # if 'float' not in df[col].dtypes.name : - # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values - # if col in CONTINUOUS: - # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - # args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32) - # # args['real'] = args['real'].reshape(df.shape[0],BIN_SIZE) - - # else: - # df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False) - # print (df[col].dtypes) - # print (df[col].dropna/(axis=1).unique()) - # args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values msize = args['matrix_size'] if 'matrix_size' in args else -1 args['real'] = (Binary()).apply(df[col],msize) - - - context = args['context'] if 'store' in args : args['store']['args']['doc'] = context @@ -191,61 +154,49 @@ def generate(**args): # If the identifier is not present, we should fine a way to determine or make one # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value'] + # NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value'] bhandler = Binary() _df = df.copy() for col in column : args['context'] = col args['column'] = col - # if 'float' in df[col].dtypes.name or col in CONTINUOUS : - # # - # # We should create the bins for the values we are observing here - # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - # values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE) - # # values = np.unique(values).tolist() - # else: - # if col in CONTINUOUS : - # values = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32).T - - # else: - # values = df[col].dropna().unique().tolist() msize = args['matrix_size'] if 'matrix_size' in args else -1 values = bhandler.get_column(df[col],msize) - + MISSING= bhandler.get_missing(df[col],msize) args['values'] = values args['row_count'] = df.shape[0] - if col in NO_VALUE : - args['no_value'] = NO_VALUE[col] - else: - 
args['no_value'] = NO_VALUE - + # if col in NO_VALUE : + # args['no_value'] = NO_VALUE[col] + # else: + # args['no_value'] = NO_VALUE + # novalue = NO_VALUE[col] if NO_VALUE[col] in ['na',''] else NO_VALUE[col] + # MISSING += [NO_VALUE[col]] + args['missing'] = MISSING # # we can determine the cardinalities here so we know what to allow or disallow handler = gan.Predict (**args) handler.load_meta(col) r = handler.apply() if col in CONTINUOUS : - r[col] = np.array(r[col]) - MISSING= np.nan if args['no_value'] in ['na','','NA'] else args['no_value'] + r[col] = np.array(r[col]) + _approx = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) #-- approximating based on arbitrary bins + r[col] = _approx + - if np.isnan(MISSING): - i = np.isnan(r[col]) - i = np.where (i == False)[0] - else: - i = np.where( r[col] != None)[0] - _approx = ContinuousToDiscrete.continuous(r[col][i],BIN_SIZE) #-- approximating based on arbitrary bins - r[col][i] = _approx _df[col] = r[col] # # Let's cast the type to the original type (it makes the data more usable) # + # print (values) + # print ([col,df[col].dtype,_df[col].tolist()]) otype = df[col].dtype _df[col] = _df[col].astype(otype) + # # @TODO: log basic stats about the synthetic attribute # diff --git a/data/params.py b/data/params.py index c667063..f2c3536 100644 --- a/data/params.py +++ b/data/params.py @@ -9,8 +9,10 @@ if len(sys.argv) > 1: if sys.argv[i].startswith('--'): key = sys.argv[i][2:] #.replace('-','') SYS_ARGS[key] = 1 - if i + 1 < N: + if i + 1 < N and not sys.argv[i + 1].startswith('--'): value = sys.argv[i + 1] = sys.argv[i+1].strip() + else: + value = None if key and value: SYS_ARGS[key] = value diff --git a/finalize.py b/finalize.py index 079830d..d420d7d 100644 --- a/finalize.py +++ b/finalize.py @@ -6,10 +6,13 @@ This file will perform basic tasks to finalize the GAN process by performing the """ import pandas as pd import numpy as np +from multiprocessing import Process, Lock from google.oauth2 import service_account from google.cloud import bigquery as bq +import transport from data.params import SYS_ARGS import json + class Analytics : """ This class will compile basic analytics about a given dataset i.e compare original/synthetic @@ -33,15 +36,23 @@ class Analytics : """ This function will measure the distance between """ - df = args['data'] - names = [name for name in df_counts.columns.tolist() if name.endswith('_io') == False] + pass class Utils : + @staticmethod + def log(**args): + logger = transport.factory.instance(type="mongo.MongoWriter",args={"dbname":"aou","doc":"logs"}) + logger.write(args) + logger.close() class get : @staticmethod - def config(**args) : - contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts'] - pipeline = args['pipeline'] - return [ item for item in pipeline if item['context'] in contexts] + def pipeline(table,path) : + # contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts'] + config = json.loads((open(path)).read()) + pipeline = config['pipeline'] + # return [ item for item in pipeline if item['context'] in contexts] + pipeline = [item for item in pipeline if 'from' in item and item['from'].strip() == table] + Utils.log(module=table,action='init',input={"pipeline":pipeline}) + return pipeline @staticmethod def sql(**args) : """ @@ -54,7 +65,8 @@ class Utils : SQL = ["SELECT * FROM :from "] SQL_FILTER = [] NO_FILTERS_FOUND = True - pipeline = Utils.get.config(**args) + # pipeline = Utils.get.config(**args) + pipeline = 
args['pipeline'] REVERSE_QUALIFIER = {'IN':'NOT IN','NOT IN':'IN','=':'<>','<>':'='} for item in pipeline : @@ -73,7 +85,7 @@ class Utils : # # let's pull the field schemas out of the table definition # - + Utils.log(module=args['from'],action='sql',input={"sql":" ".join(SQL) }) return " ".join(SQL).replace(":from",src) @@ -91,26 +103,36 @@ def mk(**args) : return client.create_dataset(dataset) return found[0] -def move (**args): +def move (args): """ This function will move a table from the synthetic dataset into a designated location This is the simplest case for finalizing a synthetic data set :private_key """ - private_key = args['private_key'] - client = bq.Client.from_service_account_json(private_key) - config = Utils.get.config(**args) + pipeline = Utils.get.pipeline(args['from'],args['config']) + _args = json.loads((open(args['config'])).read()) + _args['pipeline'] = pipeline + # del _args['pipeline'] + args = dict(args,**_args) + # del args['pipeline'] + # private_key = args['private_key'] + client = bq.Client.from_service_account_json(args['private_key']) + dataset = args['dataset'] - if 'contexts' in args : - SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in config] + if pipeline : + SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in pipeline] SQL += [Utils.get.sql(**args)] SQL = ('\n UNION ALL \n'.join(SQL).replace(':dataset','io')) else: # # moving a table to a designated location tablename = args['from'] - SQL = "SELECT * FROM :dataset.:table".replace(":dataset",dataset).replace(":table",tablename) - + if 'sql' not in args : + SQL = "SELECT * FROM :dataset.:table" + else: + SQL = args['sql'] + SQL = SQL.replace(":dataset",dataset).replace(":table",tablename) + Utils.log(module=args['from'],action='sql',input={'sql':SQL}) # # At this point we have gathered all the tables in the io folder and we should now see if we need to merge with the remainder from the original table # @@ -132,7 +154,7 @@ def move (**args): SQL = SQL.replace("*"," , ".join(fields)) # print (SQL) out = client.query(SQL,location='US',job_config=config) - print () + Utils.log(module=args['from'],action='move',input={'job':out.job_id}) return (out.job_id) @@ -158,23 +180,59 @@ if __name__ == '__main__' : Usage : finalize -- --contexts --from
""" + if 'move' in SYS_ARGS : - # table = SYS_ARGS['from'] - # args = dict(config,**{"private_key":"../curation-prod.json"}) - args = dict(args,**SYS_ARGS) - contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']] - log = [] - if contexts : - args['contexts'] = contexts - log = move(**args) + + if 'init' in SYS_ARGS : + dep = config['dep'] if 'dep' in config else {} + info = [] + if 'queries' in dep : + info += dep['queries'] + print ('________') + if 'tables' in dep : + info += dep['tables'] + args = {} + jobs = [] + for item in info : + args = {} + if type(item) == str : + args['from'] = item + name = item + else: + args = item + name = item['from'] + args['config'] = SYS_ARGS['config'] + # args['pipeline'] = [] + job = Process(target=move,args=(args,)) + job.name = name + jobs.append(job) + job.start() + + + # while len(jobs) > 0 : + # jobs = [job for job in jobs if job.is_alive()] + # time.sleep(1) + + else: - tables = args['from'].split(',') - for name in tables : - name = name.strip() - args['from'] = name - log += [move(**args)] - print ("\n".join(log)) + move(SYS_ARGS) + # # table = SYS_ARGS['from'] + # # args = dict(config,**{"private_key":"../curation-prod.json"}) + # args = dict(args,**SYS_ARGS) + # contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']] + # log = [] + # if contexts : + # args['contexts'] = contexts + # log = move(**args) + + # else: + # tables = args['from'].split(',') + # for name in tables : + # name = name.strip() + # args['from'] = name + # log += [move(**args)] + # print ("\n".join(log)) diff --git a/pipeline.py b/pipeline.py index 5ef3013..00f558d 100644 --- a/pipeline.py +++ b/pipeline.py @@ -14,7 +14,6 @@ from data.params import SYS_ARGS # # The configuration array is now loaded and we will execute the pipe line as follows -DATASET='combined20191004v2_deid' class Components : lock = Lock() @@ -120,37 +119,7 @@ class Components : self.generate(args) pass - def shuffle(self,args): - """ - """ - df = args['reader']() if 'reader' in args else args['data'] - - - col = args['columns'][0] - distrib = df[col].value_counts() - values = np.array(distrib.index) - counts = np.array(distrib.values) - np.random.shuffle(values) - np.random.shuffle(counts) - N = len (values) - theta = np.random.sample() - pad = 0 - # print (values) - iovalues = np.zeros(df.shape[0],dtype=df[col].dtype) - for i in range(N) : - # n = int(counts[i] - counts[i]*theta) - n = counts[i] - print ([counts[i],theta,n]) - index = np.where(iovalues == 0)[0] - if index.size > 0 and index.size > n: - index = index[:n] - iovalues[index] = values[i] - - - np.random.shuffle(iovalues) - df[col] = iovalues - - return df + def post(self,args): pass @@ -177,7 +146,7 @@ class Components : _args['gpu'] = 0 _args['num_gpu'] = 1 os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) - _args['no_value']= args['no_value'] + # _args['no_value']= args['no_value'] _args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128 @@ -207,7 +176,7 @@ class Components : # df = pd.DataFrame(df[ int (partition) ],columns = columns) # max_rows = int(args['partition_max_rows']) if 'partition_max_rows' in args else 1000000 # N = np.divide(df.shape[0],max_rows).astype(int) + 1 - info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"space":df[args['columns'][0]].unique().size, "part_size":int(PART_SIZE)} + info = 
{"name":args['columns'],"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"space":df[args['columns'][0]].unique().size, "part_size":int(PART_SIZE)} logger.write({"module":"generate","action":"partition","input":info}) _args['partition'] = int(partition) _args['continuous']= args['continuous'] if 'continuous' in args else [] @@ -400,11 +369,11 @@ if __name__ == '__main__' : generator.generate(args) # Components.generate(args) elif 'shuffle' in SYS_ARGS: - args['data'] = DATA[0] - _df = (Components()).shuffle(args) - print (DATA[0][args['columns']]) - print () - print (_df[args['columns']]) + + + for data in DATA : + args['data'] = data + _df = (Components()).shuffle(args) else: # DATA = np.array_split(DATA,PART_SIZE) diff --git a/setup.py b/setup.py index 0370cdc..40e8d11 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.3.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.3.2","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' @@ -14,3 +14,5 @@ if sys.version_info[0] == 2 : args['use_2to3_exclude_fixers'] = ['lib2to3.fixes.fix_import'] args['scripts']=['pipeline.py','finalize.py'] setup(**args) + + From 97bae5ef92a9dbf9c53ec2dbfe854e099612d67e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 29 Mar 2021 11:10:57 -0500 Subject: [PATCH 114/250] bug fixes: design improvements --- data/__init__.py | 15 ++ data/gan.py | 225 ++++++++++++-------- data/maker/__init__.py | 110 +++++++++- data/maker/__main__.py | 32 --- pipeline.py | 471 +++++++++++++++++++++-------------------- 5 files changed, 500 insertions(+), 353 deletions(-) delete mode 100644 data/maker/__main__.py diff --git a/data/__init__.py b/data/__init__.py index 98124f1..0ca216d 100644 --- a/data/__init__.py +++ b/data/__init__.py @@ -1,2 +1,17 @@ import data.params as params +from data.params import SYS_ARGS +import transport +from multiprocessing import Process, Queue +from data.maker import prepare +class Trainer (Process) : + pass +class Maker(Process): + pass + +if __name__ == '__main__' : + + logger = transport.factory.instance(SYS_ARGS['store']['logger']) + + + \ No newline at end of file diff --git a/data/gan.py b/data/gan.py index 1418a04..e7ab6cf 100644 --- a/data/gan.py +++ b/data/gan.py @@ -111,15 +111,15 @@ class GNet : self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) - if self.logger : - # - # We will clear the logs from the data-store - # - column = self.ATTRIBUTES['synthetic'] - db = self.logger.db - if db[column].count() > 0 : - db.backup.insert({'name':column,'logs':list(db[column].find()) }) - db[column].drop() + # if self.logger : + + # We will clear the logs from the data-store + + # column = self.ATTRIBUTES['synthetic'] + # db = self.logger.db + # if db[column].count() > 0 : + # db.backup.insert({'name':column,'logs':list(db[column].find()) }) + # db[column].drop() def 
load_meta(self,column): """ @@ -127,7 +127,7 @@ class GNet : Because prediction and training can happen independently """ # suffix = "-".join(column) if isinstance(column,list)else column - suffix = self.get.suffix() + suffix = self.CONTEXT #self.get.suffix() _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json']) if os.path.exists(_name) : attr = json.loads((open(_name)).read()) @@ -159,7 +159,7 @@ class GNet : value= args['value'] object[key] = value # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column - suffix = self.get.suffix() + suffix = self.CONTEXT #self.get.suffix() _name = os.sep.join([self.out_dir,'meta-'+suffix]) f = open(_name+'.json','w') @@ -351,7 +351,7 @@ class Train (GNet): self.discriminator = Discriminator(**args) self._REAL = args['real'] self._LABEL= args['label'] if 'label' in args else None - self.column = args['column'] + # self.column = args['column'] # print ([" *** ",self.BATCHSIZE_PER_GPU]) self.meta = self.log_meta() @@ -438,6 +438,11 @@ class Train (GNet): per_gpu_w = [] iterator, features_placeholder, labels_placeholder = self.input_fn() with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()): + # + # @TODO: Find a way to handle this across multiple CPU in case the GPU are not available + # - abstract hardware specification + # - determine if the GPU/CPU are busy + # for i in range(self.NUM_GPUS): with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: @@ -510,7 +515,7 @@ class Train (GNet): # if epoch % self.MAX_EPOCHS == 0: if epoch in [5,10,20,50,75, self.MAX_EPOCHS] : # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - suffix = self.get.suffix() + suffix = self.CONTEXT #self.get.suffix() _name = os.sep.join([self.train_dir,suffix]) # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) saver.save(sess, _name, write_meta_graph=False, global_step=epoch) @@ -539,7 +544,8 @@ class Predict(GNet): # self.MISSING_VALUES = np.nan_to_num(np.nan) # if 'no_value' in args and args['no_value'] not in ['na','','NA'] : # self.MISSING_VALUES = args['no_value'] - self.MISSING_VALUES = args['missing'] + self.MISSING_VALUES = args['missing'] if 'missing' in args else [] + # self.MISSING_VALUES = args['no_value'] # self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value'] @@ -548,9 +554,56 @@ class Predict(GNet): self.generator.load_meta(column) self.ROW_COUNT = self.oROW_COUNT def apply(self,**args): + suffix = self.CONTEXT #self.get.suffix() + model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) + demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] + # + # setup computational graph + tf.compat.v1.reset_default_graph() + z = tf.random.normal(shape=[self.ROW_COUNT, self.Z_DIM]) + + y = tf.compat.v1.placeholder(shape=[self.ROW_COUNT, self.NUM_LABELS], dtype=tf.int32) + if self._LABEL is not None : + ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] + label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) + else: + label = None + + fake = self.generator.network(inputs=z, label=label) + init = tf.compat.v1.global_variables_initializer() + saver = tf.compat.v1.train.Saver() + df = pd.DataFrame() + CANDIDATE_COUNT = args['candidates'] if 'candidates' in args else 1 #0 if self.ROW_COUNT < 
1000 else 100 + candidates = [] + + with tf.compat.v1.Session() as sess: + saver.restore(sess, model_dir) + if self._LABEL is not None : + # labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) + labels= demo + else: + labels = None + + for i in np.arange(CANDIDATE_COUNT) : + if labels : + _matrix = sess.run(fake,feed_dict={y:labels}) + else: + _matrix = sess.run(fake) + # + # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes + # The code below will insure we have some acceptable cardinal relationships between id and synthetic values + # + + # df = pd.DataFrame(np.round(f)).astype(np.int32) + candidates.append (np.round(_matrix).astype(np.int64)) + # return candidates[0] if len(candidates) == 1 else candidates + + return candidates + + def _apply(self,**args): # print (self.train_dir) # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - suffix = self.get.suffix() + suffix = self.CONTEXT #self.get.suffix() model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] tf.compat.v1.reset_default_graph() @@ -567,11 +620,12 @@ class Predict(GNet): init = tf.compat.v1.global_variables_initializer() saver = tf.compat.v1.train.Saver() df = pd.DataFrame() - CANDIDATE_COUNT = 10 #0 if self.ROW_COUNT < 1000 else 100 + CANDIDATE_COUNT = 5 #0 if self.ROW_COUNT < 1000 else 100 NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0] with tf.compat.v1.Session() as sess: # sess.run(init) + saver.restore(sess, model_dir) if self._LABEL is not None : labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) @@ -585,109 +639,110 @@ class Predict(GNet): __ratio=0 for i in np.arange(CANDIDATE_COUNT) : if labels : - f = sess.run(fake,feed_dict={y:labels}) + _matrix = sess.run(fake,feed_dict={y:labels}) else: - f = sess.run(fake) + _matrix = sess.run(fake) # # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes # The code below will insure we have some acceptable cardinal relationships between id and synthetic values # # df = pd.DataFrame(np.round(f)).astype(np.int32) - df = pd.DataFrame(np.round(f),dtype=int) - + found.append (np.round(_matrix).astype(np.int64)) + # df = pd.DataFrame(np.round(_matrix),dtype=int) p = 0 not in df.sum(axis=1).values - x = df.sum(axis=1).values + # x = df.sum(axis=1).values - if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size : - ratio.append(np.divide( np.sum(x), x.size)) - found.append(df) + # if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size : + # ratio.append(np.divide( np.sum(x), x.size)) + # found.append(df) - # break - if len(found) == CANDIDATE_COUNT: + # # break + # if len(found) == CANDIDATE_COUNT: - break - else: - __x__ = df if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __x__ - __ratio = np.divide( np.sum(x), x.size) if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __ratio - continue + # break + # else: + # __x__ = df if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __x__ + # __ratio = np.divide( np.sum(x), x.size) if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __ratio + # continue # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms # df = (i * df).sum(axis=1) # # In case we are dealing 
with actual values like diagnosis codes we can perform # - N = len(found) - _index = [i for i in range(0,N) if found[i].shape[1] == len(self.values)] - if not _index and not found : - df = __x__ - INDEX = -1 - else : - if not _index : - INDEX = np.random.choice(np.arange(len(found)),1)[0] - INDEX = ratio.index(np.max(ratio)) - else: - INDEX = _index[0] + # N = len(found) + # _index = [i for i in range(0,N) if found[i].shape[1] == len(self.values)] + # if not _index and not found : + # df = __x__ + # INDEX = -1 + # else : + # if not _index : + # INDEX = np.random.choice(np.arange(len(found)),1)[0] + # INDEX = ratio.index(np.max(ratio)) + # else: + # INDEX = _index[0] - df = found[INDEX] - columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] + # df = found[INDEX] + # columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] # r = np.zeros((self.ROW_COUNT,len(columns))) # r = np.zeros(self.ROW_COUNT) - if self.logger : - info = {"found":len(found),"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)} - if df.shape[1] > len(self.values) : - df = df.iloc[:len(self.values)] - if INDEX > 0 : - info =dict(info ,**{"selected":INDEX, "ratio": ratio[INDEX] }) - else : + # if self.logger : + # info = {"found":len(found),"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)} + # if df.shape[1] > len(self.values) : + # df = df.iloc[:len(self.values)] + # if INDEX > 0 : + # info =dict(info ,**{"selected":INDEX, "ratio": ratio[INDEX] }) + # else : - info['selected'] = -1 - info['ratio'] = __ratio - info['partition'] = self.PARTITION - self.logger.write({"module":"gan-generate","action":"generate","input":info}) - # df.columns = self.values - if len(found) or df.columns.size <= len(self.values): - ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1) - missing = [] - if ii.sum() > 0 : - # - # If the generator had a reductive effect we should be able to get random values from either : - # - The space of outliers - # - existing values for smaller spaces that have suffered over training - # - - N = ii.sum() - missing_values = self.MISSING_VALUES if self.MISSING_VALUES else self.values - missing = np.random.choice(missing_values,N) - # missing = [] - # - # @TODO: - # Log the findings here in terms of ratio, missing, candidate count - # print ([np.max(ratio),len(missing),len(found),i]) - i = np.where(ii == 0)[0] + # info['selected'] = -1 + # info['ratio'] = __ratio + # info['partition'] = self.PARTITION + # self.logger.write({"module":"gan-generate","action":"generate","input":info}) + # # df.columns = self.values + # if len(found) or df.columns.size <= len(self.values): + # ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1) + # missing = [] + # if ii.sum() > 0 : + # # + # # If the generator had a reductive effect we should be able to get random values from either : + # # - The space of outliers + # # - existing values for smaller spaces that have suffered over training + # # + + # N = ii.sum() + # missing_values = self.MISSING_VALUES if self.MISSING_VALUES else self.values + # missing = np.random.choice(missing_values,N) + # # missing = [] + # # + # # @TODO: + # # Log the findings here in terms of ratio, missing, candidate count + # # print ([np.max(ratio),len(missing),len(found),i]) + # i = np.where(ii == 0)[0] - df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) - df.columns = columns - 
df = df[columns[0]].append(pd.Series(missing)) + # df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) + # df.columns = columns + # df = df[columns[0]].append(pd.Series(missing)) - if self.logger : + # if self.logger : - info= {"missing": i.size,"rows":df.shape[0],"cols":1,'partition':self.PARTITION} - self.logger.write({"module":"gan-generate","action":"compile.io","input":info}) + # info= {"missing": i.size,"rows":df.shape[0],"cols":1,'partition':self.PARTITION} + # self.logger.write({"module":"gan-generate","action":"compile.io","input":info}) # print(df.head()) tf.compat.v1.reset_default_graph() - df = pd.DataFrame(df) - df.columns = columns - np.random.shuffle(df[columns[0]].values) - return df.to_dict(orient='list') + # df = pd.DataFrame(df) + # df.columns = columns + # np.random.shuffle(df[columns[0]].values) + # return df.to_dict(orient='list') + return _matrix if __name__ == '__main__' : diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 3e2c9aa..086df3f 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -14,6 +14,11 @@ import data.gan as gan from transport import factory from data.bridge import Binary import threading as thread +from data.maker import prepare +import copy +import os +import json + class ContinuousToDiscrete : ROUND_UP = 2 @staticmethod @@ -77,8 +82,62 @@ class ContinuousToDiscrete : +def train (**_args): + """ + :params sql + :params store + """ + # + # Let us prepare the data by calling the utility function + # + if 'file' in _args : + # + # We are reading data from a file + _args['data'] = pd.read_csv(_args['file']) + else: + # + # data will be read from elsewhere (a data-store)... + pass + # if 'ignore' in _args and 'columns' in _args['ignore']: + + _inputhandler = prepare.Input(**_args) + values,_matrix = _inputhandler.convert() + args = {"real":_matrix,"context":_args['context']} + _map = {} + if 'store' in _args : + # + # This + args['store'] = copy.deepcopy(_args['store']['logs']) + args['store']['args']['doc'] = _args['context'] + logger = factory.instance(**args['store']) + args['logger'] = logger + + for key in _inputhandler._map : + beg = _inputhandler._map[key]['beg'] + end = _inputhandler._map[key]['end'] + values = _inputhandler._map[key]['values'].tolist() + _map[key] = {"beg":beg,"end":end,"values":np.array(values).astype(str).tolist()} + info = {"rows":_matrix.shape[0],"cols":_matrix.shape[1],"map":_map} + logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":info}) + + args['logs'] = _args['logs'] if 'logs' in _args else 'logs' + args ['max_epochs'] = _args['max_epochs'] + args['matrix_size'] = _matrix.shape[0] + args['batch_size'] = 2000 + args['partition'] = 0 if 'partition' not in _args else _args['partition'] + os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' -def train (**args) : + trainer = gan.Train(**args) + # + # @TODO: Write the map.json in the output directory for the logs + # + f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']),'w') + f.write(json.dumps(_map)) + f.close() + + trainer.apply() + pass +def _train (**args) : """ This function is intended to train the GAN in order to learn about the distribution of the features :column columns that need to be synthesized (discrete) @@ -122,18 +181,53 @@ def train (**args) : # If the s trainer = gan.Train(**args) trainer.apply() -def post(**args): - """ - This uploads the tensorflow checkpoint to a 
data-store (mongodb, biguqery, s3) - - """ - pass def get(**args): """ This function will restore a checkpoint from a persistant storage on to disk """ pass -def generate(**args): +def generate(**_args): + """ + This function will generate a set of records, before we must load the parameters needed + :param data + :param context + :param logs + """ + f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json'])) + _map = json.loads(f.read()) + f.close() + if 'file' in _args : + df = pd.read_csv(_args['file']) + else: + df = _args['data'] if not isinstance(_args['data'],str) else pd.read_csv(_args['data']) + args = {"context":_args['context'],"max_epochs":_args['max_epochs'],"candidates":_args['candidates']} + args['logs'] = _args['logs'] if 'logs' in _args else 'logs' + args ['max_epochs'] = _args['max_epochs'] + # args['matrix_size'] = _matrix.shape[0] + args['batch_size'] = 2000 + args['partition'] = 0 if 'partition' not in _args else _args['partition'] + args['row_count'] = df.shape[0] + # + # @TODO: perhaps get the space of values here ... (not sure it's a good idea) + # + _args['map'] = _map + _inputhandler = prepare.Input(**_args) + values,_matrix = _inputhandler.convert() + args['values'] = np.array(values) + if 'gpu' in _args : + os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu']) + handler = gan.Predict (**args) + handler.load_meta(None) + # + # Let us now format the matrices as we expect them to be + # + + candidates = handler.apply(candidates=args['candidates']) + return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates] + + + +def _generate(**args): """ This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset @return pandas.DataFrame diff --git a/data/maker/__main__.py b/data/maker/__main__.py deleted file mode 100644 index d71d400..0000000 --- a/data/maker/__main__.py +++ /dev/null @@ -1,32 +0,0 @@ -import pandas as pd -import data.maker -from data.params import SYS_ARGS -import json -from scipy.stats import wasserstein_distance as wd -import risk -import numpy as np -if 'config' in SYS_ARGS : - ARGS = json.loads(open(SYS_ARGS['config']).read()) - if 'generate' not in SYS_ARGS : - data.maker.train(**ARGS) - else: - # - # - ARGS['no_value'] = '' - _df = data.maker.generate(**ARGS) - odf = pd.read_csv (ARGS['data']) - odf.columns = [name.lower() for name in odf.columns] - column = ARGS['column'] if isinstance(ARGS['column'],list) else [ARGS['column']] - # print (odf.head()) - # print (_df.head()) - print(odf.join(_df[column],rsuffix='_io')) - # print (_df[column].risk.evaluate(flag='synth')) - # print (odf[column].risk.evaluate(flag='original')) - # _x = pd.get_dummies(_df[column]).values - # y = pd.get_dummies(odf[column]).values - # N = _df.shape[0] - # print (np.mean([ wd(_x[i],y[i])for i in range(0,N)])) - # print (wd(_x[0],y[0]) ) - - # column = SYS_ARGS['column'] - # odf = open(SYS_ARGS['data']) \ No newline at end of file diff --git a/pipeline.py b/pipeline.py index 00f558d..4a86d94 100644 --- a/pipeline.py +++ b/pipeline.py @@ -9,7 +9,7 @@ import pandas as pd from google.oauth2 import service_account from google.cloud import bigquery as bq import data.maker - +import copy from data.params import SYS_ARGS # @@ -69,53 +69,45 @@ class Components : This function will perform training on the basis of a given pointer that reads data """ - # - # @TODO: we need to log something here about the parameters being passed - # pointer = args['reader'] if 'reader' in args else lambda: 
Components.get(**args) - df = args['data'] - - # - # Now we can parse the arguments and submit the entire thing to training - # - - logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) - log_folder = args['logs'] if 'logs' in args else 'logs' - PART_SIZE = int(args['part_size']) - - partition = args['partition'] - log_folder = os.sep.join([log_folder,args['context'],str(partition)]) - _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} - _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) - if 'batch_size' in args : - _args['batch_size'] = int(args['batch_size']) - - _args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128 # - # We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel - # - if int(args['num_gpu']) > 1 : - _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int) + schema = None + if 'file' in args : + + df = pd.read_csv(args['file']) + del args['file'] + elif 'data' not in args : + reader = factory.instance(**args['store']['source']) + if 'row_limit' in args : + df = reader.read(sql=args['sql'],limit=args['row_limit']) + else: + df = reader.read(sql=args['sql']) + schema = reader.meta(table=args['from']) if hasattr(reader,'meta') and 'from' in args else None else: - _args['gpu'] = 0 - _args['num_gpu'] = 1 - os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) - _args['partition'] = int(partition) - _args['continuous']= args['continuous'] if 'continuous' in args else [] - _args['store'] = {'type':'mongo.MongoWriter','args':{'dbname':'aou','doc':args['context']}} - _args['data'] = args['data'] - - # print (['partition ',partition,df.value_source_concept_id.unique()]) - # - # @log : - # Logging information about the training process for this partition (or not) - # - - info = {"rows":df.shape[0],"cols":df.shape[1], "partition":int(partition),"logs":_args['logs']} + df = args['data'] + + + # df = df.fillna('') + if schema : + _schema = {} + for _item in schema : + _type = int + _value = 0 + if _item.field_type == 'FLOAT' : + _type =float + elif _item.field_type != 'INTEGER' : + _type = str + _value = '' + _schema[_item.name] = _type + df[_item.name] = df[_item.name].fillna(_value).astype(_type) + args['schema'] = _schema + # df[_item.name] = df[_item.name].astype(_type) + _args = copy.deepcopy(args) + # _args['store'] = args['store']['source'] + _args['data'] = df - logger.write({"module":"train","action":"train","input":info}) data.maker.train(**_args) if 'autopilot' in ( list(args.keys())) : - print (['autopilot mode enabled ....']) + print (['autopilot mode enabled ....',args['context']]) self.generate(args) pass @@ -129,141 +121,167 @@ class Components : """ This function will generate data and store it to a given, """ - logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) - log_folder = args['logs'] if 'logs' in args else 'logs' - partition = args['partition'] if 'partition' in args else '' - log_folder = os.sep.join([log_folder,args['context'],str(partition)]) - - _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} - _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) + store = args['store']['logs'] + store['doc'] = 
args['context'] + logger = factory.instance(**store) #type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + + ostore = args['store']['target'] + writer = factory.instance(**ostore) + # log_folder = args['logs'] if 'logs' in args else 'logs' + # partition = args['partition'] if 'partition' in args else '' + # log_folder = os.sep.join([log_folder,args['context'],str(partition)]) + + # _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + # _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) # _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 - if 'batch_size' in args : - _args['batch_size'] = int(args['batch_size']) - - if int(args['num_gpu']) > 1 : - _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int) - else: - _args['gpu'] = 0 - _args['num_gpu'] = 1 - os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) - # _args['no_value']= args['no_value'] - _args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128 + # if 'batch_size' in args : + # _args['batch_size'] = int(args['batch_size']) + + # if int(args['num_gpu']) > 1 : + # _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int) + # else: + # _args['gpu'] = 0 + # _args['num_gpu'] = 1 + # os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) + # # _args['no_value']= args['no_value'] + # _args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128 - # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 - PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 + # # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 + # PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 # credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') # _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() # reader = args['reader'] # df = reader() - df = args['reader']() if 'reader' in args else args['data'] - - # if 'slice' in args and 'max_rows' in args['slice']: - - # max_rows = args['slice']['max_rows'] - # if df.shape[0] > max_rows : - # print (".. 
slicing ") - # i = np.random.choice(df.shape[0],max_rows,replace=False) - # df = df.iloc[i] + schema = args['schema'] if 'schema' in args else None + if 'file' in args : + df = pd.read_csv(args['file']) + else: + if 'data' not in args : + reader = factory.instance(**args['store']['source']) + if 'row_limit' in args : + df = reader.read(sql=args['sql'],limit=args['row_limit']) + else: + df = reader.read(sql=args['sql']) + if 'schema' not in args and hasattr(reader,'meta'): + schema = reader.meta(table=args['from']) - - # bounds = Components.split(df,MAX_ROWS,PART_SIZE) - # if partition != '' : - # columns = args['columns'] - # df = np.array_split(df[columns].values,PART_SIZE) - # df = pd.DataFrame(df[ int (partition) ],columns = columns) - # max_rows = int(args['partition_max_rows']) if 'partition_max_rows' in args else 1000000 - # N = np.divide(df.shape[0],max_rows).astype(int) + 1 - info = {"name":args['columns'],"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"space":df[args['columns'][0]].unique().size, "part_size":int(PART_SIZE)} - logger.write({"module":"generate","action":"partition","input":info}) - _args['partition'] = int(partition) - _args['continuous']= args['continuous'] if 'continuous' in args else [] - # - # How many rows sub-partition must we divide this into ? - # let us fix the data types here every _id field will be an np.int64... - # - - schema = args['schema'] - for item in schema : - if item.field_type == 'INTEGER' and df[item.name].dtype != np.int64: - df[item.name] = np.array(df[item.name].values,dtype=np.int64) - elif item.field_type == 'STRING' and df[item.name].dtype != object : - df[item.name] = np.array(df[item.name],dtype=object) - - - - # for name in df.columns.tolist(): - - # if name.endswith('_id') : - # if df[name].isnull().sum() > 0 and name not in ['unique_device_id']: - # df[name].fillna(np.nan_to_num(np.nan),inplace=True) - # df[name] = df[name].astype(int) - + + else: + # + # This will account for autopilot mode ... 
+ df = args['data'] + + _info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[0]}} + _dc = pd.DataFrame() # for mdf in df : - _args['data'] = df - - _dc = _dc.append(data.maker.generate(**_args)) + args['data'] = df + args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) + + candidates = (data.maker.generate(**args)) + if 'sql.BQWriter' in ostore['type'] : + #table = ".".join([ostore['['dataset'],args['context']]) + # writer = factory.instance(**ostore) + _columns = None + skip_columns = [] + _schema = [{"name":field.name,"type":field.field_type,"description":field.description} for field in schema] if schema else [] + for _df in candidates : + # + # we need to format the fields here to make sure we have something cohesive + # + + if not skip_columns : + # _columns = set(df.columns) - set(_df.columns) + if 'ignore' in args and 'columns' in args['ignore'] : + + for name in args['ignore']['columns'] : + for _name in _df.columns: + if _name in name: + skip_columns.append(_name) + # + # We perform a series of set operations to insure that the following conditions are met: + # - the synthetic dataset only has fields that need to be synthesized + # - The original dataset has all the fields except those that need to be synthesized + # + + _df = _df[list(set(_df.columns) - set(skip_columns))] + + if set(df.columns) & set(_df.columns) : + _columns = set(df.columns) - set(_df.columns) + df = df[_columns] + + # + # Let us merge the dataset here and and have a comprehensive dataset + + _df = pd.DataFrame.join(df,_df) + + writer.write(_df,schema=_schema,table=args['from']) + # writer.write(df,table=table) + pass + else: + pass + - # - # We need to post the generate the data in order to : - # 1. compare immediately - # 2. synthetic copy - # + # # + # # We need to post the generate the data in order to : + # # 1. compare immediately + # # 2. 
synthetic copy + # # - cols = _dc.columns.tolist() + # cols = _dc.columns.tolist() - data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) - # - # performing basic analytics on the synthetic data generated (easy to quickly asses) - # - info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} + # data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) + # # + # # performing basic analytics on the synthetic data generated (easy to quickly asses) + # # + # info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} - # - # @TODO: Send data over to a process for analytics + # # + # # @TODO: Send data over to a process for analytics - base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) - cols = _dc.columns.tolist() - for name in cols : - _args['data'][name] = _dc[name] + # base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) + # cols = _dc.columns.tolist() + # for name in cols : + # _args['data'][name] = _dc[name] - # - #-- Let us store all of this into bigquery - prefix = args['notify']+'.'+_args['context'] - partition = str(partition) - table = '_'.join([prefix,partition,'io']).replace('__','_') - folder = os.sep.join([args['logs'],args['context'],partition,'output']) - if 'file' in args : + # # + # #-- Let us store all of this into bigquery + # prefix = args['notify']+'.'+_args['context'] + # partition = str(partition) + # table = '_'.join([prefix,partition,'io']).replace('__','_') + # folder = os.sep.join([args['logs'],args['context'],partition,'output']) + # if 'file' in args : - _fname = os.sep.join([folder,table.replace('_io','_full_io.csv')]) - _pname = os.sep.join([folder,table])+'.csv' - data_comp.to_csv( _pname,index=False) - _args['data'].to_csv(_fname,index=False) + # _fname = os.sep.join([folder,table.replace('_io','_full_io.csv')]) + # _pname = os.sep.join([folder,table])+'.csv' + # data_comp.to_csv( _pname,index=False) + # _args['data'].to_csv(_fname,index=False) - _id = 'path' - else: + # _id = 'path' + # else: - credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - _pname = os.sep.join([folder,table+'.csv']) - _fname = table.replace('_io','_full_io') - partial = '.'.join(['io',args['context']+'_partial_io']) - complete= '.'.join(['io',args['context']+'_full_io']) - data_comp.to_csv(_pname,index=False) - if 'dump' in args : - print (_args['data'].head()) - else: - Components.lock.acquire() - data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) - _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) - Components.lock.release() - _id = 'dataset' - info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } - if partition : - info ['partition'] = int(partition) - logger.write({"module":"generate","action":"write","input":info} ) + # credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + # _pname = os.sep.join([folder,table+'.csv']) + # _fname = table.replace('_io','_full_io') + # partial = 
'.'.join(['io',args['context']+'_partial_io']) + # complete= '.'.join(['io',args['context']+'_full_io']) + # data_comp.to_csv(_pname,index=False) + # if 'dump' in args : + # print (_args['data'].head()) + # else: + # Components.lock.acquire() + # data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) + # _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) + # Components.lock.release() + # _id = 'dataset' + # info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } + # if partition : + # info ['partition'] = int(partition) + # logger.write({"module":"generate","action":"write","input":info} ) @@ -308,98 +326,95 @@ if __name__ == '__main__' : # Log what was initiated so we have context of this processing ... # # if 'listen' not in SYS_ARGS : - if 'file' in args : - DATA = pd.read_csv(args['file']) ; - schema = [] - else: - DATA = Components().get(args) - client = bq.Client.from_service_account_json(args["private_key"]) - schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema + # if 'file' in args : + # DATA = pd.read_csv(args['file']) ; + # schema = [] + # else: + # DATA = Components().get(args) + # client = bq.Client.from_service_account_json(args["private_key"]) + # schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema - COLUMNS = DATA.columns - DATA = np.array_split(DATA,PART_SIZE) - args['schema'] = schema + # COLUMNS = DATA.columns + # DATA = np.array_split(DATA,PART_SIZE) + # args['schema'] = schema if 'generate' in SYS_ARGS : # # Let us see if we have partitions given the log folder - content = os.listdir( os.sep.join([args['logs'],args['context']])) + content = os.listdir( os.sep.join([args['logs'],'train',args['context']])) generator = Components() - if ''.join(content).isnumeric() : - # - # we have partitions we are working with + # if ''.join(content).isnumeric() : + # # + # # we have partitions we are working with - jobs = [] + # jobs = [] - # columns = DATA.columns.tolist() + # # columns = DATA.columns.tolist() - # DATA = np.array_split(DATA,PART_SIZE) + # # DATA = np.array_split(DATA,PART_SIZE) - for index in range(0,PART_SIZE) : - if 'focus' in args and int(args['focus']) != index : - # - # This handles failures/recoveries for whatever reason - # If we are only interested in generating data for a given partition - continue - # index = id.index(id) + # for index in range(0,PART_SIZE) : + # if 'focus' in args and int(args['focus']) != index : + # # + # # This handles failures/recoveries for whatever reason + # # If we are only interested in generating data for a given partition + # continue + # # index = id.index(id) - args['partition'] = index - args['data'] = DATA[index] - if int(args['num_gpu']) > 1 : - args['gpu'] = index - else: - args['gpu']=0 + # args['partition'] = index + # args['data'] = DATA[index] + # if int(args['num_gpu']) > 1 : + # args['gpu'] = index + # else: + # args['gpu']=0 - make = lambda _args: (Components()).generate(_args) - job = Process(target=make,args=(args,)) - job.name = 'generator # '+str(index) - job.start() - jobs.append(job) - # if len(jobs) == 1 : - # job.join() + # make = lambda _args: (Components()).generate(_args) + # job = Process(target=make,args=(args,)) + # job.name = 'generator # '+str(index) + # job.start() + # jobs.append(job) + # # if len(jobs) == 1 : + # # job.join() - print (["Started ",len(jobs),"generators" if 
len(jobs)>1 else "generator" ]) - while len(jobs)> 0 : - jobs = [job for job in jobs if job.is_alive()] - time.sleep(2) + # print (["Started ",len(jobs),"generators" if len(jobs)>1 else "generator" ]) + # while len(jobs)> 0 : + # jobs = [job for job in jobs if job.is_alive()] + # time.sleep(2) - # generator.generate(args) - else: - generator.generate(args) + # # generator.generate(args) + # else: + # generator.generate(args) # Components.generate(args) - elif 'shuffle' in SYS_ARGS: - - - for data in DATA : - args['data'] = data - _df = (Components()).shuffle(args) + generator.generate(args) + else: # DATA = np.array_split(DATA,PART_SIZE) - - jobs = [] - for index in range(0,PART_SIZE) : - if 'focus' in args and int(args['focus']) != index : - continue - args['part_size'] = PART_SIZE - args['partition'] = index - args['data'] = DATA[index] - if int(args['num_gpu']) > 1 : - args['gpu'] = index - else: - args['gpu']=0 + agent = Components() + agent.train(**args) + # jobs = [] + # for index in range(0,PART_SIZE) : + # if 'focus' in args and int(args['focus']) != index : + # continue + # args['part_size'] = PART_SIZE + # args['partition'] = index + # args['data'] = DATA[index] + # if int(args['num_gpu']) > 1 : + # args['gpu'] = index + # else: + # args['gpu']=0 - make = lambda _args: (Components()).train(**_args) - job = Process(target=make,args=( dict(args),)) - job.name = 'Trainer # ' + str(index) - job.start() - jobs.append(job) - # args['gpu'] - print (["Started ",len(jobs),"trainers" if len(jobs)>1 else "trainer" ]) - while len(jobs)> 0 : - jobs = [job for job in jobs if job.is_alive()] - time.sleep(2) + # make = lambda _args: (Components()).train(**_args) + # job = Process(target=make,args=( dict(args),)) + # job.name = 'Trainer # ' + str(index) + # job.start() + # jobs.append(job) + # # args['gpu'] + # print (["Started ",len(jobs),"trainers" if len(jobs)>1 else "trainer" ]) + # while len(jobs)> 0 : + # jobs = [job for job in jobs if job.is_alive()] + # time.sleep(2) # trainer = Components() # trainer.train(**args) From 4725b6eff9176f18601bd1f381398d54a507ebe4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 29 Mar 2021 18:53:57 -0500 Subject: [PATCH 115/250] new features, bug fixes --- bin/data-maker | 1 + data/maker/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) create mode 120000 bin/data-maker diff --git a/bin/data-maker b/bin/data-maker new file mode 120000 index 0000000..f63f773 --- /dev/null +++ b/bin/data-maker @@ -0,0 +1 @@ +pipeline.py \ No newline at end of file diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 086df3f..cfdd8e2 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -219,7 +219,7 @@ def generate(**_args): handler = gan.Predict (**args) handler.load_meta(None) # - # Let us now format the matrices as we expect them to be + # Let us now format the matrices by reverting them to a data-frame with values # candidates = handler.apply(candidates=args['candidates']) diff --git a/setup.py b/setup.py index 40e8d11..7970f14 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.3.2","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.4.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", 
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 46f2fd7be406f0bcdda0525655b48e3d64fca398 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 29 Mar 2021 22:59:31 -0500 Subject: [PATCH 116/250] data preparation script (preconditions) --- data/maker/prepare/__init__.py | 252 +++++++++++++++++++++++++++++++++ data/maker/prepare/__main__.py | 1 + 2 files changed, 253 insertions(+) create mode 100644 data/maker/prepare/__init__.py create mode 120000 data/maker/prepare/__main__.py diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py new file mode 100644 index 0000000..2c773de --- /dev/null +++ b/data/maker/prepare/__init__.py @@ -0,0 +1,252 @@ +""" +(c) 2018 - 2021, Vanderbilt University Medical Center +Steve L. Nyemba, steve.l.nyemba@vumc.org + +This file is designed to handle preconditions for a generative adversarial network: + - The file will read/get data from a source specified by transport (or data-frame) + - The class will convert the data to a binary vector + - The class will also help rebuild the data from a binary matrix. +Usage : + +""" +import transport +import json +import pandas as pd +import numpy as np +import cupy as cp +import sys +import os +# from multiprocessing import Process, Queue + +# if 'GPU' in os.environ : +# import cupy as np +# else: +# import numpy as np +class void: + pass +class Hardware : + """ + This class is intended to allow the use of hardware i.e GPU, index or CPU + """ + pass + +class Input : + """ + This class is designed to read data from a source and and perform a variet of operations : + - provide a feature space, and rows (matrix profile) + - a data index map + """ + # def learn(self,**_args): + # """ + # This function is designed to learn about, the data and persist + # :param table + # :param store + # """ + # table = _args['table'] + # reader = transport.factory.instance(**_args['store']) + # df = reader.read(table=table,limit=1) + # self.columns = df.columns.tolist() + + # self._metadf = pd.DataFrame(self.df[self._columns].dtypes.values.astype(str)).T #,self._columns] + # self._metadf.columns = self._columns + + # sql = "SELECT :fields from :table".replace(":table",table) + + + def __init__(self,**_args): + """ + :param table + :param store data-store parameters/configuration + :param sql sql query that pulls a representative sample of the data + """ + self._schema = _args['schema'] if 'schema' in _args else {} + self.df = _args['data'] + if 'sql' not in _args : + # self._initdata(**_args) + # + pass + else: + self._initsql(**_args) + self._map = {} if 'map' not in _args else _args['map'] + # self._metadf = pd.DataFrame(self.df[self._columns].dtypes.values.astype(str)).T #,self._columns] + # self._metadf.columns = self._columns + if 'gpu' in _args and 'GPU' in os.environ: + + np = cp + index = int(_args['gpu']) + np.cuda.Device(index).use() + print(['..:: GPU ',index]) + + def _initsql(self,**_args): + """ + This function will initialize the class on the basis of a data-store and optionally pre-defined columns to be used to be synthesized + :param store data-store configuration + :param sql sql query to be applied to the transported data + :param columns list of columns to be + """ + # _store_args = _args['store'] + # reader = 
transport.factory.instance(**_store_args) + # sql = _args['sql'] + + # self.df = reader.read(sql=_args['sql']) + + + if 'columns' not in _args : + self._initcols(data=self.df) + else: + self._initcols(data=self.df,columns=_args['columns']) + + pass + def _initcols (self,**_args) : + """ + This function will initialize the columns to be synthesized and/or determine which ones can be synthesized + :param data data-frame that holds the data (matrix) + :param columns optional columns to be synthesized + """ + # df = _args['data'].copy() + row_count = self.df.shape[0] + cols = None if 'columns' not in _args else _args['columns'] + self.columns = self.df.columns.tolist() + if 'columns' in _args : + self._columns = _args['columns'] + else: + # + # We will look into the count and make a judgment call + _df = pd.DataFrame(self.df.apply(lambda col: col.dropna().unique().size )).T + MIN_SPACE_SIZE = 2 + self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() + def _initdata(self,**_args): + """ + This function will initialize the class with a data-frame and columns of interest (if any) + :param data data-frame that holds the data + :param columns columns that need to be synthesized if any + """ + # + # setting class-level variables to be reused across the class + # self.df = _args['data'] + row_count = self.df.shape[0] + # self.columns = self.df.columns + # self._metadf = self.df.apply(lambda col: col.unique().size) + # _df = pd.DataFrame(self.df.apply(lambda col: col.unique().size )).T + # cols = None if 'columns' not in _args else _args['columns'] + self._initcols(**_args) + + def convert(self,**_args): + """ + This function will convert a data-frame into a binary matrix and provide a map to be able to map the values back to the matrix + :param columns in case we specify the columns to account for (just in case the original assumptions don't hold) + """ + if 'columns' in _args or 'column' in _args : + columns = _args['columns'] if 'columns' in _args else [_args['column']] + else: + columns = self._columns + _df = self.df if 'data' not in _args else _args['data'] + # + # At this point we have the list of features we want to use + i = 0 + + _m = np.array([]) + _values = [] + for name in columns : + # + # In case we have dataset with incomplete value space, we should still be able to generate something meaningful + # + values = None if name not in self._map else list(self._map[name]['values']) + _type = self._schema[name] if name in self._schema else _df[name].dtype + cols, _matrix = self.tobinary(_df[name],values) + _beg,_end = i,i+len(cols) + if name not in self._map : + self._map[name] = {"beg":_beg,"end":_end ,"values":cols} + i += len(cols) + if not _m.shape[0]: + _m = _matrix ; + else: + _m = np.concatenate((_m,_matrix),axis=1) + if values : + _values += list(values) + # + # @NOTE: + # The map should allow us to be able to convert or reconvert the binary matrix to whatever we want ... 
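+            # As a rough illustration (hypothetical values, not taken from a real dataset), the map
+            # built here could look like :
+            #   self._map = {"race":{"beg":0,"end":3,"values":["asian","black","white"]},
+            #                "age" :{"beg":3,"end":10,"values":[20,30,40,50,60,70,80]}}
+            # i.e. every synthesized column owns a contiguous block of columns [beg,end) in the
+            # one-hot matrix, which is what revert() relies on when it slices matrix[:,beg:end].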
+ # + # self._matrix = _m + + return _values,_m + + def revert(self,**_args) : + """ + This function will take in a binary matrix and based on the map of values it will repopulate it with values + :param _matrix binary matrix + :param column|columns column name or columns if the column is specified + """ + _column = _args['column'] if 'column' in _args else None + + + matrix = _args['matrix'] + row_count = matrix.shape[0] + r = {} + for key in self._map : + if _column and key != _column : + continue + _item = self._map[key] + _beg = _item['beg'] + _end = _item['end'] + columns = np.array(_item['values']) + # + # @NOTE: We are accessing matrices in terms of [row,col], + # The beg,end variables are for the columns in the matrix (mini matrix) + # + # if not _column : + # _matrix = matrix[:,_beg:_end] #-- The understanding is that _end is not included + # else: + # _matrix = matrix + _matrix = matrix[:,_beg:_end] + # + # vectorize the matrix to replace the bits by their actual values (accounting for the data-types) + # @TODO: Find ways to do this on a GPU (for big data) or across threads + # + row_count = _matrix.shape[0] + # r[key] = [columns[np.where(row == 1) [0][0] ] for row in _matrix[:,_beg:_end]] + + r[key] = [columns[np.where(row==1)[0][0]] if np.where(row==1)[0].size > 0 else '' for row in _matrix] + + + return pd.DataFrame(r) + + def tobinary(self,rows,cols=None) : + """ + This function will compile a binary matrix from a row of values this allows hopefully this can be done in parallel, this function can be vectorized and processed + :param rows np.array or list of vector of values + :param cols a space of values if it were to be different fromt he current sample. + """ + + if not cols: + # + # In the advent the sample rows do NOT have the values of the + cols = rows.unique() + cols = np.array(cols) + row_count = len(rows) + # if 'GPU' not in os.environ : + _matrix = np.zeros([row_count,cols.size]) + + [np.put(_matrix[i], np.where(cols == rows[i]) ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0] + # else: + # _matrix = cp.zeros([row_count,cols.size]) + # [cp.put(_matrix[i], cp.where(cols == rows[i]),1)for i in cp.arange(row_count) ] + # _matrix = _matrix.asnumpy() + + + return cols,_matrix + +if __name__ == '__main__' : + df = pd.read_csv('../../sample.csv') + _input = Input(data=df,columns=['age','race']) + _m = _input.convert(column='age') + print (_m.shape) + print (_input.revert(matrix=_m,column='age')) + print (_input._metadf) + +# _args = {"store":{"type":"sql.BQReader","args":{"service_key":"/home/steve/dev/aou/accounts/curation-prod.json"}}} +# _args['table'] = 'io.observation' +# _i = Input(**_args) +# df = pd.read_csv('../../sample.csv') +# print (Input.ToBinary(df.age)) \ No newline at end of file diff --git a/data/maker/prepare/__main__.py b/data/maker/prepare/__main__.py new file mode 120000 index 0000000..93f5256 --- /dev/null +++ b/data/maker/prepare/__main__.py @@ -0,0 +1 @@ +__init__.py \ No newline at end of file From 43873697a0fb46e8ef961388c4bd5b4c9110cf93 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 04:56:01 -0500 Subject: [PATCH 117/250] bug fixes --- data/gan.py | 12 ++++++++++-- data/maker/__init__.py | 19 ++++++++++--------- data/maker/prepare/__init__.py | 18 +++++++++++------- setup.py | 2 +- 4 files changed, 32 insertions(+), 19 deletions(-) diff --git a/data/gan.py b/data/gan.py index e7ab6cf..c61d1b1 100644 --- a/data/gan.py +++ b/data/gan.py @@ -58,7 +58,14 @@ class GNet : self.layers.normalize = 
self.normalize self.logs = {} - self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] + # self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] + self.GPU_CHIPS = None if 'gpu' not in args else args['gpu'] + if self.GPU_CHIPS is None: + self.GPU_CHIPS = [0] + if 'CUDA_VISIBLE_DEVICES' in os.environ : + os.environ.pop('CUDA_VISIBLE_DEVICES') + self.NUM_GPUS = len(self.GPU_CHIPS) + self.PARTITION = args['partition'] # if self.NUM_GPUS > 1 : # os.environ['CUDA_VISIBLE_DEVICES'] = "4" @@ -150,6 +157,7 @@ class GNet : "D_STRUCTURE":self.D_STRUCTURE, "G_STRUCTURE":self.G_STRUCTURE, "NUM_GPUS":self.NUM_GPUS, + "GPU_CHIPS":self.GPU_CHIPS, "NUM_LABELS":self.NUM_LABELS, "MAX_EPOCHS":self.MAX_EPOCHS, "ROW_COUNT":self.ROW_COUNT @@ -443,7 +451,7 @@ class Train (GNet): # - abstract hardware specification # - determine if the GPU/CPU are busy # - for i in range(self.NUM_GPUS): + for i in self.GPU_CHIPS : #range(self.NUM_GPUS): with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: if self._LABEL is not None : diff --git a/data/maker/__init__.py b/data/maker/__init__.py index cfdd8e2..fbdf208 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -90,16 +90,16 @@ def train (**_args): # # Let us prepare the data by calling the utility function # - if 'file' in _args : - # - # We are reading data from a file - _args['data'] = pd.read_csv(_args['file']) - else: - # - # data will be read from elsewhere (a data-store)... - pass + # if 'file' in _args : + # # + # # We are reading data from a file + # _args['data'] = pd.read_csv(_args['file']) + # else: + # # + # # data will be read from elsewhere (a data-store)... + # pass # if 'ignore' in _args and 'columns' in _args['ignore']: - + _inputhandler = prepare.Input(**_args) values,_matrix = _inputhandler.convert() args = {"real":_matrix,"context":_args['context']} @@ -107,6 +107,7 @@ def train (**_args): if 'store' in _args : # # This + args['store'] = copy.deepcopy(_args['store']['logs']) args['store']['args']['doc'] = _args['context'] logger = factory.instance(**args['store']) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 2c773de..381dfc0 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -13,7 +13,7 @@ import transport import json import pandas as pd import numpy as np -import cupy as cp +# import cupy as cp import sys import os # from multiprocessing import Process, Queue @@ -62,7 +62,7 @@ class Input : self._schema = _args['schema'] if 'schema' in _args else {} self.df = _args['data'] if 'sql' not in _args : - # self._initdata(**_args) + self._initdata(**_args) # pass else: @@ -70,12 +70,12 @@ class Input : self._map = {} if 'map' not in _args else _args['map'] # self._metadf = pd.DataFrame(self.df[self._columns].dtypes.values.astype(str)).T #,self._columns] # self._metadf.columns = self._columns - if 'gpu' in _args and 'GPU' in os.environ: + # if 'gpu' in _args and 'GPU' in os.environ: - np = cp - index = int(_args['gpu']) - np.cuda.Device(index).use() - print(['..:: GPU ',index]) + # np = cp + # index = int(_args['gpu']) + # np.cuda.Device(index).use() + # print(['..:: GPU ',index]) def _initsql(self,**_args): """ @@ -107,6 +107,8 @@ class Input : row_count = self.df.shape[0] cols = None if 'columns' not in _args else _args['columns'] self.columns = self.df.columns.tolist() + self._io = [] + if 'columns' in _args : self._columns = _args['columns'] else: @@ -115,6 +117,8 @@ class Input : _df = pd.DataFrame(self.df.apply(lambda 
col: col.dropna().unique().size )).T MIN_SPACE_SIZE = 2 self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() + self._io = _df.to_dict(orient='records') + def _initdata(self,**_args): """ This function will initialize the class with a data-frame and columns of interest (if any) diff --git a/setup.py b/setup.py index 7970f14..4b96d08 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.4.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.4.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 13053febb773143b90c5ff5d1122ef355c21a979 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 05:18:28 -0500 Subject: [PATCH 118/250] ... --- pipeline.py | 101 ++++++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 51 deletions(-) diff --git a/pipeline.py b/pipeline.py index 4a86d94..e643278 100644 --- a/pipeline.py +++ b/pipeline.py @@ -63,6 +63,24 @@ class Components : def split(X,MAX_ROWS=3,PART_SIZE=3): return list(pd.cut( np.arange(X.shape[0]+1),PART_SIZE).categories) + def format_schema(self,schema): + _schema = {} + for _item in schema : + _type = int + _value = 0 + if _item.field_type == 'FLOAT' : + _type =float + elif _item.field_type != 'INTEGER' : + _type = str + _value = '' + _schema[_item.name] = _type + return _schema + def get_ignore(self,**_args) : + if 'columns' in _args and 'data' in _args : + _df = _args['data'] + terms = _args['columns'] + return [name for name in _df.columns if name in terms] + return [] def train(self,**args): """ @@ -83,11 +101,15 @@ class Components : schema = reader.meta(table=args['from']) if hasattr(reader,'meta') and 'from' in args else None else: df = args['data'] - - + + # + # + if 'ignore' in args and 'columns' in args['ignore'] : + _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) + df = df[ list(set(df.columns)- set(_cols))] # df = df.fillna('') if schema : - _schema = {} + _schema = [] for _item in schema : _type = int _value = 0 @@ -96,7 +118,7 @@ class Components : elif _item.field_type != 'INTEGER' : _type = str _value = '' - _schema[_item.name] = _type + _schema += [{"name":_item.name,"type":_item.field_type}] df[_item.name] = df[_item.name].fillna(_value).astype(_type) args['schema'] = _schema # df[_item.name] = df[_item.name].astype(_type) @@ -107,6 +129,8 @@ class Components : data.maker.train(**_args) if 'autopilot' in ( list(args.keys())) : + + args['data'] = df print (['autopilot mode enabled ....',args['context']]) self.generate(args) @@ -127,52 +151,27 @@ class Components : ostore = args['store']['target'] writer = factory.instance(**ostore) - # log_folder = args['logs'] if 'logs' in args else 'logs' - # partition = args['partition'] if 'partition' in args else '' - # log_folder = os.sep.join([log_folder,args['context'],str(partition)]) - - # _args = 
{"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} - # _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) - # _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 - # if 'batch_size' in args : - # _args['batch_size'] = int(args['batch_size']) - - # if int(args['num_gpu']) > 1 : - # _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int) - # else: - # _args['gpu'] = 0 - # _args['num_gpu'] = 1 - # os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) - # # _args['no_value']= args['no_value'] - # _args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128 - - - # # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 - # PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 - - # credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - # _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() - # reader = args['reader'] - # df = reader() + schema = args['schema'] if 'schema' in args else None - if 'file' in args : + if 'data' in args : - df = pd.read_csv(args['file']) + df = args['data'] else: - if 'data' not in args : - reader = factory.instance(**args['store']['source']) - if 'row_limit' in args : - df = reader.read(sql=args['sql'],limit=args['row_limit']) - else: - df = reader.read(sql=args['sql']) - if 'schema' not in args and hasattr(reader,'meta'): - schema = reader.meta(table=args['from']) - + reader = factory.instance(**args['store']['source']) + if 'row_limit' in args : + df = reader.read(sql=args['sql'],limit=args['row_limit']) else: - # - # This will account for autopilot mode ... - df = args['data'] + df = reader.read(sql=args['sql']) + if 'schema' not in args and hasattr(reader,'meta'): + schema = reader.meta(table=args['from']) + schema = [{"name":_item.name,"type":_item.field_type} for _item in schema] + + + # else: + # # + # # This will account for autopilot mode ... 
+ # df = args['data'] _info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[0]}} @@ -188,7 +187,7 @@ class Components : # writer = factory.instance(**ostore) _columns = None skip_columns = [] - _schema = [{"name":field.name,"type":field.field_type,"description":field.description} for field in schema] if schema else [] + _schema = schema for _df in candidates : # # we need to format the fields here to make sure we have something cohesive @@ -197,11 +196,11 @@ class Components : if not skip_columns : # _columns = set(df.columns) - set(_df.columns) if 'ignore' in args and 'columns' in args['ignore'] : - - for name in args['ignore']['columns'] : - for _name in _df.columns: - if _name in name: - skip_columns.append(_name) + skip_columns = self.get_ignore(data=_df,columns=args['ignore']['columns']) + # for name in args['ignore']['columns'] : + # for _name in _df.columns: + # if _name in name: + # skip_columns.append(_name) # # We perform a series of set operations to insure that the following conditions are met: # - the synthetic dataset only has fields that need to be synthesized From ac8968c3e33cfe0408a67074bb7bbc0695981d21 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 05:23:55 -0500 Subject: [PATCH 119/250] bug fix: fields skipped (training) --- pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index e643278..f4db40c 100644 --- a/pipeline.py +++ b/pipeline.py @@ -104,9 +104,6 @@ class Components : # # - if 'ignore' in args and 'columns' in args['ignore'] : - _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) - df = df[ list(set(df.columns)- set(_cols))] # df = df.fillna('') if schema : _schema = [] @@ -125,6 +122,9 @@ class Components : _args = copy.deepcopy(args) # _args['store'] = args['store']['source'] _args['data'] = df + if 'ignore' in args and 'columns' in args['ignore'] : + _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) + _args['data'] = df[ list(set(df.columns)- set(_cols))] data.maker.train(**_args) From 0ef149f76b4d17d0b4bd78847b8c9403d895800a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 05:35:29 -0500 Subject: [PATCH 120/250] bug fix: fields skipped (training) --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index f4db40c..6cb1be9 100644 --- a/pipeline.py +++ b/pipeline.py @@ -79,7 +79,7 @@ class Components : if 'columns' in _args and 'data' in _args : _df = _args['data'] terms = _args['columns'] - return [name for name in _df.columns if name in terms] + return [name for name in _df.columns if np.sum( [int(field in name )for field in terms ]) > 0 ] return [] def train(self,**args): From 9bbe9b7ff908e8baa3ba9b38fc84a38c4b1d0d09 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 09:00:57 -0500 Subject: [PATCH 121/250] optimization (minor) --- data/maker/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index fbdf208..611b13d 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -197,17 +197,17 @@ def generate(**_args): f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json'])) _map = json.loads(f.read()) f.close() - if 'file' in _args : - df = pd.read_csv(_args['file']) - else: - df = _args['data'] if not isinstance(_args['data'],str) else pd.read_csv(_args['data']) + # if 'file' in _args : + # df = 
pd.read_csv(_args['file']) + # else: + # df = _args['data'] if not isinstance(_args['data'],str) else pd.read_csv(_args['data']) args = {"context":_args['context'],"max_epochs":_args['max_epochs'],"candidates":_args['candidates']} args['logs'] = _args['logs'] if 'logs' in _args else 'logs' args ['max_epochs'] = _args['max_epochs'] # args['matrix_size'] = _matrix.shape[0] args['batch_size'] = 2000 args['partition'] = 0 if 'partition' not in _args else _args['partition'] - args['row_count'] = df.shape[0] + args['row_count'] = _args['data'].shape[0] # # @TODO: perhaps get the space of values here ... (not sure it's a good idea) # From b283f72dc963a433152ab97b5b934b36d65471a8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 10:48:25 -0500 Subject: [PATCH 122/250] bug fix --- pipeline.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pipeline.py b/pipeline.py index 6cb1be9..d00ddb7 100644 --- a/pipeline.py +++ b/pipeline.py @@ -179,6 +179,10 @@ class Components : _dc = pd.DataFrame() # for mdf in df : args['data'] = df + if 'ignore' in args and 'columns' in args['ignore'] : + _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) + args['data'] = df[ list(set(df.columns)- set(_cols))] + args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) candidates = (data.maker.generate(**args)) From 341b9ffec165a280a00ddd71ad97a468f920e3ac Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 16:14:48 -0500 Subject: [PATCH 123/250] bug fix: log information about space --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 611b13d..59a7ff0 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -119,7 +119,7 @@ def train (**_args): values = _inputhandler._map[key]['values'].tolist() _map[key] = {"beg":beg,"end":end,"values":np.array(values).astype(str).tolist()} info = {"rows":_matrix.shape[0],"cols":_matrix.shape[1],"map":_map} - logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":info}) + logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":_inputhandler._io}) args['logs'] = _args['logs'] if 'logs' in _args else 'logs' args ['max_epochs'] = _args['max_epochs'] From bdd752550ef5f3b9bd63868d621471653e3848a7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 17:01:32 -0500 Subject: [PATCH 124/250] bug fix attempt: large matrix conversion fails --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4b96d08..a2e6744 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,9 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.4.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker", + "version":"1.4.2", + "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From f65b082fb129cbb37091a35d8823218c766882e7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 
2021 17:01:59 -0500 Subject: [PATCH 125/250] bug fix attempt: large matrix conversion fails --- data/gan.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index c61d1b1..767a24b 100644 --- a/data/gan.py +++ b/data/gan.py @@ -603,7 +603,8 @@ class Predict(GNet): # # df = pd.DataFrame(np.round(f)).astype(np.int32) - candidates.append (np.round(_matrix).astype(np.int64)) + # candidates.append (np.round(_matrix).astype(np.int64)) + candidates.append( [np.round(row).astype(int) for row in _matrix]) # return candidates[0] if len(candidates) == 1 else candidates return candidates From 62a665464dca608439e2328447c9ea0880cd4ed8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 17:17:10 -0500 Subject: [PATCH 126/250] .. --- data/gan.py | 2 +- data/maker/prepare/__init__.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/data/gan.py b/data/gan.py index 767a24b..985e706 100644 --- a/data/gan.py +++ b/data/gan.py @@ -604,7 +604,7 @@ class Predict(GNet): # df = pd.DataFrame(np.round(f)).astype(np.int32) # candidates.append (np.round(_matrix).astype(np.int64)) - candidates.append( [np.round(row).astype(int) for row in _matrix]) + candidates.append(np.array([np.round(row).astype(int) for row in _matrix])) # return candidates[0] if len(candidates) == 1 else candidates return candidates diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 381dfc0..9fb0fa7 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -111,13 +111,13 @@ class Input : if 'columns' in _args : self._columns = _args['columns'] - else: - # - # We will look into the count and make a judgment call - _df = pd.DataFrame(self.df.apply(lambda col: col.dropna().unique().size )).T - MIN_SPACE_SIZE = 2 - self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() - self._io = _df.to_dict(orient='records') + # else: + # + # We will look into the count and make a judgment call + _df = pd.DataFrame(self.df.apply(lambda col: col.dropna().unique().size )).T + MIN_SPACE_SIZE = 2 + self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() + self._io = _df.to_dict(orient='records') def _initdata(self,**_args): """ From 846fa99743eae03d87acd12a3503064398db0e8d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 17:50:12 -0500 Subject: [PATCH 127/250] bug fix: data type and schema fields (order) --- pipeline.py | 3 ++- setup.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index d00ddb7..47f8547 100644 --- a/pipeline.py +++ b/pipeline.py @@ -192,6 +192,7 @@ class Components : _columns = None skip_columns = [] _schema = schema + cols = [_item['name'] for _item in _schema] for _df in candidates : # # we need to format the fields here to make sure we have something cohesive @@ -222,7 +223,7 @@ class Components : _df = pd.DataFrame.join(df,_df) - writer.write(_df,schema=_schema,table=args['from']) + writer.write(_df[cols],schema=_schema,table=args['from']) # writer.write(df,table=table) pass else: diff --git a/setup.py b/setup.py index a2e6744..450d0d9 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.2", + "version":"1.4.3", 
"author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From 20ee62178a60df87f161183bf13f89bb70f95c60 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 22:00:01 -0500 Subject: [PATCH 128/250] bug fixes with data-types --- pipeline.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 47f8547..49b2039 100644 --- a/pipeline.py +++ b/pipeline.py @@ -222,7 +222,9 @@ class Components : # Let us merge the dataset here and and have a comprehensive dataset _df = pd.DataFrame.join(df,_df) - + for _item in _schema : + if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : + _df[_item['name']] = _df[_item['name']].astype(str) writer.write(_df[cols],schema=_schema,table=args['from']) # writer.write(df,table=table) pass From e0601edea547a28d06c6b82fe313a4f4e5930542 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 12:14:51 -0500 Subject: [PATCH 129/250] bug fix: zeros matrix and continuous variables --- data/maker/prepare/__init__.py | 10 ++++- pipeline.py | 70 ++++++++++++++++++++++++++++++---- setup.py | 2 +- 3 files changed, 71 insertions(+), 11 deletions(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 9fb0fa7..e15c63b 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -16,6 +16,9 @@ import numpy as np # import cupy as cp import sys import os +# +# The following is to address the issue over creating a large matrix ... +# # from multiprocessing import Process, Queue # if 'GPU' in os.environ : @@ -230,8 +233,11 @@ class Input : cols = np.array(cols) row_count = len(rows) # if 'GPU' not in os.environ : - _matrix = np.zeros([row_count,cols.size]) - + # _matrix = np.zeros([row_count,cols.size],dtype=int) + # + # @NOTE: For some reason, there is an out of memory error created here, this seems to fix it (go figure) + # + _matrix = np.array([np.zeros(cols.size) for i in np.arange(row_count)]) [np.put(_matrix[i], np.where(cols == rows[i]) ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0] # else: # _matrix = cp.zeros([row_count,cols.size]) diff --git a/pipeline.py b/pipeline.py index 49b2039..a38029d 100644 --- a/pipeline.py +++ b/pipeline.py @@ -122,10 +122,20 @@ class Components : _args = copy.deepcopy(args) # _args['store'] = args['store']['source'] _args['data'] = df + # + # The columns that are continuous should also be skipped because they don't need to be synthesied (like-that) + if 'continuous' in args : + x_cols = args['continuous'] + else: + x_cols = [] + if 'ignore' in args and 'columns' in args['ignore'] : _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) _args['data'] = df[ list(set(df.columns)- set(_cols))] - + # + # We need to make sure that continuous columns are removed + if x_cols : + _args['data'] = df[list(set(df.columns) - set(x_cols))] data.maker.train(**_args) if 'autopilot' in ( list(args.keys())) : @@ -136,7 +146,26 @@ class Components : pass - def post(self,args): + def approximate(self,values): + """ + :param values array of values to be approximated + """ + if values.dtype in [int,float] : + r = np.random.dirichlet(values) + x = [] + _type = values.dtype + for index in np.arange(values.size) : + + if 
np.random.choice([0,1],1)[0] : + value = values[index] + (values[index] * r[index]) + else : + value = values[index] - (values[index] * r[index]) + value = int(value) if _type == int else np.round(value,2) + x.append( value) + np.random.shuffle(x) + return np.array(x) + else: + return values pass @@ -179,10 +208,23 @@ class Components : _dc = pd.DataFrame() # for mdf in df : args['data'] = df + # + # The columns that are continuous should also be skipped because they don't need to be synthesied (like-that) + if 'continuous' in args : + x_cols = args['continuous'] + else: + x_cols = [] + if 'ignore' in args and 'columns' in args['ignore'] : _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) args['data'] = df[ list(set(df.columns)- set(_cols))] - + # + # We need to remove the continuous columns from the data-frame + # @TODO: Abstract this !! + # + if x_cols : + args['data'] = df[list(set(df.columns) - set(x_cols))] + args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) candidates = (data.maker.generate(**args)) @@ -192,7 +234,10 @@ class Components : _columns = None skip_columns = [] _schema = schema - cols = [_item['name'] for _item in _schema] + if schema : + cols = [_item['name'] for _item in _schema] + else: + cols = df.columns for _df in candidates : # # we need to format the fields here to make sure we have something cohesive @@ -206,6 +251,9 @@ class Components : # for _name in _df.columns: # if _name in name: # skip_columns.append(_name) + if x_cols : + for _col in x_cols : + _df[_col] = self.approximate(df[_col]) # # We perform a series of set operations to insure that the following conditions are met: # - the synthetic dataset only has fields that need to be synthesized @@ -222,10 +270,16 @@ class Components : # Let us merge the dataset here and and have a comprehensive dataset _df = pd.DataFrame.join(df,_df) - for _item in _schema : - if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : - _df[_item['name']] = _df[_item['name']].astype(str) - writer.write(_df[cols],schema=_schema,table=args['from']) + if _schema : + for _item in _schema : + if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : + _df[_item['name']] = _df[_item['name']].astype(str) + + pass + if _schema : + writer.write(_df[cols],schema=_schema,table=args['from']) + else: + writer.write(_df[cols],table=args['from']) # writer.write(df,table=table) pass else: diff --git a/setup.py b/setup.py index 450d0d9..544f4b3 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.3", + "version":"1.4.4", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From 3fb82acd32885856b4fd7abe5b1041e50c7e53c8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 12:21:57 -0500 Subject: [PATCH 130/250] ... 
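For context: the approximate() helper introduced just above replaces GAN synthesis for continuous columns with a Dirichlet-weighted perturbation of the observed values. A minimal standalone sketch of that idea (simplified, not the exact pipeline code; the +.001 offset is there because dirichlet() rejects zero-valued concentration parameters):

    import numpy as np

    def approximate(values):
        # perturb each non-negative measurement by a random fraction of itself,
        # with the fractions drawn from a Dirichlet distribution over the values
        values = np.asarray(values, dtype=float)
        r = np.random.dirichlet(values + .001)
        signs = np.random.choice([-1, 1], size=values.size)
        x = values + signs * values * r
        np.random.shuffle(x)
        return np.round(x, 2)

    print(approximate([10, 12, 0, 7]))   # values close to, but not equal to, the originals

The shuffle decouples the perturbed values from their source rows while keeping the column-level distribution close to the original.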
--- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index a38029d..c1c5719 100644 --- a/pipeline.py +++ b/pipeline.py @@ -253,7 +253,7 @@ class Components : # skip_columns.append(_name) if x_cols : for _col in x_cols : - _df[_col] = self.approximate(df[_col]) + _df[_col] = self.approximate(df[_col].fillna(-1)) # # We perform a series of set operations to insure that the following conditions are met: # - the synthetic dataset only has fields that need to be synthesized From cf478016b06a023c6058012465bb16779534aac0 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 12:28:09 -0500 Subject: [PATCH 131/250] ... --- pipeline.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index c1c5719..6f28eac 100644 --- a/pipeline.py +++ b/pipeline.py @@ -253,7 +253,10 @@ class Components : # skip_columns.append(_name) if x_cols : for _col in x_cols : - _df[_col] = self.approximate(df[_col].fillna(-1)) + if df[_col].unique().size > 0 : + _df[_col] = self.approximate(df[_col].fillna(-1)) + else: + _df[_col] = -1 # # We perform a series of set operations to insure that the following conditions are met: # - the synthetic dataset only has fields that need to be synthesized From 732ccb42e5cd886da632f5a0f523db13bbc07494 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 12:43:09 -0500 Subject: [PATCH 132/250] ... --- pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipeline.py b/pipeline.py index 6f28eac..8b1dd9e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -136,6 +136,8 @@ class Components : # We need to make sure that continuous columns are removed if x_cols : _args['data'] = df[list(set(df.columns) - set(x_cols))] + if 'gpu' in args : + _args['gpu'] = args['gpu'] data.maker.train(**_args) if 'autopilot' in ( list(args.keys())) : From 5a16e325ac10348eacc4e93eac6a283069ed9722 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 13:09:06 -0500 Subject: [PATCH 133/250] gpu indexing --- data/maker/__init__.py | 79 ++---------------------------------------- pipeline.py | 11 ++++-- 2 files changed, 11 insertions(+), 79 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 59a7ff0..3e42419 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -81,7 +81,6 @@ class ContinuousToDiscrete : return values - def train (**_args): """ :params sql @@ -126,7 +125,7 @@ def train (**_args): args['matrix_size'] = _matrix.shape[0] args['batch_size'] = 2000 args['partition'] = 0 if 'partition' not in _args else _args['partition'] - os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' + # os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' trainer = gan.Train(**args) # @@ -215,8 +214,7 @@ def generate(**_args): _inputhandler = prepare.Input(**_args) values,_matrix = _inputhandler.convert() args['values'] = np.array(values) - if 'gpu' in _args : - os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu']) + handler = gan.Predict (**args) handler.load_meta(None) # @@ -226,76 +224,3 @@ def generate(**_args): candidates = handler.apply(candidates=args['candidates']) return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates] - - -def _generate(**args): - """ - This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset - @return pandas.DataFrame - - :data data-frame to be synthesized - :column columns that need to be synthesized 
(discrete) - :id column identifying an entity - :logs location on disk where the learnt knowledge of the dataset is - """ - # df = args['data'] - df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) - - CONTINUOUS = args['continuous'] if 'continuous' in args else [] - column = args['column'] if (isinstance(args['column'],list)) else [args['column']] - # column_id = args['id'] - # - #@TODO: - # If the identifier is not present, we should fine a way to determine or make one - # - BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - # NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value'] - bhandler = Binary() - _df = df.copy() - for col in column : - args['context'] = col - args['column'] = col - - msize = args['matrix_size'] if 'matrix_size' in args else -1 - values = bhandler.get_column(df[col],msize) - MISSING= bhandler.get_missing(df[col],msize) - - - - args['values'] = values - args['row_count'] = df.shape[0] - # if col in NO_VALUE : - # args['no_value'] = NO_VALUE[col] - # else: - # args['no_value'] = NO_VALUE - # novalue = NO_VALUE[col] if NO_VALUE[col] in ['na',''] else NO_VALUE[col] - # MISSING += [NO_VALUE[col]] - args['missing'] = MISSING - # - # we can determine the cardinalities here so we know what to allow or disallow - handler = gan.Predict (**args) - handler.load_meta(col) - r = handler.apply() - if col in CONTINUOUS : - r[col] = np.array(r[col]) - _approx = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) #-- approximating based on arbitrary bins - r[col] = _approx - - - - _df[col] = r[col] - # - # Let's cast the type to the original type (it makes the data more usable) - # - # print (values) - # print ([col,df[col].dtype,_df[col].tolist()]) - otype = df[col].dtype - _df[col] = _df[col].astype(otype) - - # - # @TODO: log basic stats about the synthetic attribute - # - # print (r)s - # break - - return _df \ No newline at end of file diff --git a/pipeline.py b/pipeline.py index 8b1dd9e..6f39b55 100644 --- a/pipeline.py +++ b/pipeline.py @@ -81,7 +81,12 @@ class Components : terms = _args['columns'] return [name for name in _df.columns if np.sum( [int(field in name )for field in terms ]) > 0 ] return [] - + def set_gpu(self,**_args) : + if 'gpu' in _args : + gpu = _args['gpu'] if type(_args['gpu']) != str else [_args['gpu']] + _index = str(gpu[0]) + os.environ['CUDA_VISIBLE_DEVICES'] = _index + return gpu def train(self,**args): """ This function will perform training on the basis of a given pointer that reads data @@ -137,7 +142,7 @@ class Components : if x_cols : _args['data'] = df[list(set(df.columns) - set(x_cols))] if 'gpu' in args : - _args['gpu'] = args['gpu'] + _args['gpu'] = self.set_gpu(gpu=args['gpu']) data.maker.train(**_args) if 'autopilot' in ( list(args.keys())) : @@ -228,6 +233,8 @@ class Components : args['data'] = df[list(set(df.columns) - set(x_cols))] args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) + if 'gpu' in args : + args['gpu'] = self.set_gpu(gpu=args['gpu']) candidates = (data.maker.generate(**args)) if 'sql.BQWriter' in ostore['type'] : From a73e186f77d174056063760af65fbd25ad29ee44 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 13:20:35 -0500 Subject: [PATCH 134/250] gpu indexing --- data/gan.py | 4 ++- data/maker/__init__.py | 59 +++--------------------------------------- pipeline.py | 2 ++ 3 files changed, 9 insertions(+), 56 deletions(-) diff --git a/data/gan.py b/data/gan.py index 985e706..dd8ea6a 100644 --- 
a/data/gan.py +++ b/data/gan.py @@ -64,7 +64,9 @@ class GNet : self.GPU_CHIPS = [0] if 'CUDA_VISIBLE_DEVICES' in os.environ : os.environ.pop('CUDA_VISIBLE_DEVICES') - self.NUM_GPUS = len(self.GPU_CHIPS) + self.NUM_GPUS = 0 + else: + self.NUM_GPUS = len(self.GPU_CHIPS) self.PARTITION = args['partition'] # if self.NUM_GPUS > 1 : diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 3e42419..bfd6a5f 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -86,18 +86,6 @@ def train (**_args): :params sql :params store """ - # - # Let us prepare the data by calling the utility function - # - # if 'file' in _args : - # # - # # We are reading data from a file - # _args['data'] = pd.read_csv(_args['file']) - # else: - # # - # # data will be read from elsewhere (a data-store)... - # pass - # if 'ignore' in _args and 'columns' in _args['ignore']: _inputhandler = prepare.Input(**_args) values,_matrix = _inputhandler.convert() @@ -125,6 +113,8 @@ def train (**_args): args['matrix_size'] = _matrix.shape[0] args['batch_size'] = 2000 args['partition'] = 0 if 'partition' not in _args else _args['partition'] + if 'gpu' in _args : + args['gpu'] = _args['gpu'] # os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' trainer = gan.Train(**args) @@ -137,50 +127,7 @@ def train (**_args): trainer.apply() pass -def _train (**args) : - """ - This function is intended to train the GAN in order to learn about the distribution of the features - :column columns that need to be synthesized (discrete) - :logs where the output of the (location on disk) - :id identifier of the dataset - :data data-frame to be synthesized - :context label of what we are synthesizing - """ - column = args['column'] if (isinstance(args['column'],list)) else [args['column']] - # CONTINUOUS = args['continuous'] if 'continuous' in args else [] - # column_id = args['id'] - df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) - df.columns = [name.lower() for name in df.columns] - # - # @TODO: - # Consider sequential training of sub population for extremely large datasets - # - - # - # If we have several columns we will proceed one at a time (it could be done in separate threads) - # @TODO : Consider performing this task on several threads/GPUs simulataneously - # - for col in column : - msize = args['matrix_size'] if 'matrix_size' in args else -1 - args['real'] = (Binary()).apply(df[col],msize) - context = args['context'] - if 'store' in args : - args['store']['args']['doc'] = context - logger = factory.instance(**args['store']) - args['logger'] = logger - info = {"rows":args['real'].shape[0],"cols":args['real'].shape[1],"name":col,"partition":args['partition']} - logger.write({"module":"gan-train","action":"data-prep","input":info}) - - else: - logger = None - args['column'] = col - args['context'] = col - - # - # If the s - trainer = gan.Train(**args) - trainer.apply() def get(**args): """ This function will restore a checkpoint from a persistant storage on to disk @@ -214,6 +161,8 @@ def generate(**_args): _inputhandler = prepare.Input(**_args) values,_matrix = _inputhandler.convert() args['values'] = np.array(values) + if 'gpu' in _args : + args['gpu'] = _args['gpu'] handler = gan.Predict (**args) handler.load_meta(None) diff --git a/pipeline.py b/pipeline.py index 6f39b55..e2bbbec 100644 --- a/pipeline.py +++ b/pipeline.py @@ -87,6 +87,8 @@ class Components : _index = str(gpu[0]) os.environ['CUDA_VISIBLE_DEVICES'] = _index return gpu + else : + return None def 
train(self,**args): """ This function will perform training on the basis of a given pointer that reads data From 5b2aeb0e3e8781a35a590b621e714e5d9278514d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 13:38:28 -0500 Subject: [PATCH 135/250] continuous functions --- pipeline.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pipeline.py b/pipeline.py index e2bbbec..0583116 100644 --- a/pipeline.py +++ b/pipeline.py @@ -142,10 +142,15 @@ class Components : # # We need to make sure that continuous columns are removed if x_cols : - _args['data'] = df[list(set(df.columns) - set(x_cols))] + _args['data'] = _args['data'][list(set(df.columns) - set(x_cols))] if 'gpu' in args : _args['gpu'] = self.set_gpu(gpu=args['gpu']) - data.maker.train(**_args) + if df.shape[0] and df.shape[0] : + # + # We have a full blown matrix to be processed + data.maker.train(**_args) + else: + print ("... skipping training !!") if 'autopilot' in ( list(args.keys())) : @@ -216,7 +221,7 @@ class Components : _dc = pd.DataFrame() # for mdf in df : - args['data'] = df + args['data'] = df.copy() # # The columns that are continuous should also be skipped because they don't need to be synthesied (like-that) if 'continuous' in args : @@ -232,7 +237,7 @@ class Components : # @TODO: Abstract this !! # if x_cols : - args['data'] = df[list(set(df.columns) - set(x_cols))] + args['data'] = args['data'][list(set(df.columns) - set(x_cols))] args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) if 'gpu' in args : From db496f998341565d2895f1682ce1c0c4a995cabd Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 13:43:07 -0500 Subject: [PATCH 136/250] continuous functions skipping fields --- pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 0583116..f32a45e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -142,7 +142,7 @@ class Components : # # We need to make sure that continuous columns are removed if x_cols : - _args['data'] = _args['data'][list(set(df.columns) - set(x_cols))] + _args['data'] = _args['data'][list(set(_args['data'].columns) - set(x_cols))] if 'gpu' in args : _args['gpu'] = self.set_gpu(gpu=args['gpu']) if df.shape[0] and df.shape[0] : @@ -237,7 +237,7 @@ class Components : # @TODO: Abstract this !! # if x_cols : - args['data'] = args['data'][list(set(df.columns) - set(x_cols))] + args['data'] = args['data'][list(set(args['data'].columns) - set(x_cols))] args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) if 'gpu' in args : From e56254000e1ef89681dcbcd287a0efa770fb6071 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 13:53:29 -0500 Subject: [PATCH 137/250] .. --- pipeline.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pipeline.py b/pipeline.py index f32a45e..a09fbde 100644 --- a/pipeline.py +++ b/pipeline.py @@ -231,7 +231,7 @@ class Components : if 'ignore' in args and 'columns' in args['ignore'] : _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) - args['data'] = df[ list(set(df.columns)- set(_cols))] + args['data'] = args['data'][ list(set(df.columns)- set(_cols))] # # We need to remove the continuous columns from the data-frame # @TODO: Abstract this !! 
@@ -267,12 +267,6 @@ class Components : # for _name in _df.columns: # if _name in name: # skip_columns.append(_name) - if x_cols : - for _col in x_cols : - if df[_col].unique().size > 0 : - _df[_col] = self.approximate(df[_col].fillna(-1)) - else: - _df[_col] = -1 # # We perform a series of set operations to insure that the following conditions are met: # - the synthetic dataset only has fields that need to be synthesized @@ -284,6 +278,12 @@ class Components : if set(df.columns) & set(_df.columns) : _columns = set(df.columns) - set(_df.columns) df = df[_columns] + if x_cols : + for _col in x_cols : + if df[_col].unique().size > 0 : + _df[_col] = self.approximate(df[_col].fillna(-1)) + else: + _df[_col] = -1 # # Let us merge the dataset here and and have a comprehensive dataset From cd7841be92ab3c894bcf1a9082ce6f9c20c58d68 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 13:56:23 -0500 Subject: [PATCH 138/250] .. --- pipeline.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipeline.py b/pipeline.py index a09fbde..9f57d59 100644 --- a/pipeline.py +++ b/pipeline.py @@ -274,16 +274,16 @@ class Components : # _df = _df[list(set(_df.columns) - set(skip_columns))] - - if set(df.columns) & set(_df.columns) : - _columns = set(df.columns) - set(_df.columns) - df = df[_columns] if x_cols : for _col in x_cols : if df[_col].unique().size > 0 : _df[_col] = self.approximate(df[_col].fillna(-1)) else: _df[_col] = -1 + + if set(df.columns) & set(_df.columns) : + _columns = set(df.columns) - set(_df.columns) + df = df[_columns] # # Let us merge the dataset here and and have a comprehensive dataset From 7ccf9848b2d001b2249deb89fe84cacc12c58558 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 14:04:55 -0500 Subject: [PATCH 139/250] .. --- pipeline.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 9f57d59..72dea06 100644 --- a/pipeline.py +++ b/pipeline.py @@ -236,8 +236,10 @@ class Components : # We need to remove the continuous columns from the data-frame # @TODO: Abstract this !! # + real_df = pd.DataFrame() if x_cols : args['data'] = args['data'][list(set(args['data'].columns) - set(x_cols))] + real_df = args[x_cols].copy() args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) if 'gpu' in args : @@ -276,7 +278,7 @@ class Components : _df = _df[list(set(_df.columns) - set(skip_columns))] if x_cols : for _col in x_cols : - if df[_col].unique().size > 0 : + if real_df[_col].unique().size > 0 : _df[_col] = self.approximate(df[_col].fillna(-1)) else: _df[_col] = -1 @@ -289,6 +291,7 @@ class Components : # Let us merge the dataset here and and have a comprehensive dataset _df = pd.DataFrame.join(df,_df) + if _schema : for _item in _schema : if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : From 452014ec1783f81bbbc6abc7a19a9d5e051b8cc5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 14:07:21 -0500 Subject: [PATCH 140/250] ... 
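The reordering in the last few patches is about when the continuous columns get approximated relative to the set operations that assemble the output. The intended invariant, roughly (a sketch with made-up frames, not the pipeline code): the candidate frame keeps only the synthesized fields, the original frame keeps everything else, and the two are joined back into one comprehensive dataset.

    import pandas as pd

    df  = pd.DataFrame({'person_id': [1, 2], 'gender': ['M', 'F'], 'age': [30, 40]})
    _df = pd.DataFrame({'gender': ['F', 'M'], 'age': [31, 39]})      # synthetic candidate
    skip_columns = ['age']                                           # e.g. continuous fields handled separately

    _df = _df[list(set(_df.columns) - set(skip_columns))]            # synthesized fields only
    df  = df[list(set(df.columns) - set(_df.columns))]               # untouched originals only
    print(pd.DataFrame.join(df, _df))                                # all three columns, each exactly once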
--- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 72dea06..7082b71 100644 --- a/pipeline.py +++ b/pipeline.py @@ -239,7 +239,7 @@ class Components : real_df = pd.DataFrame() if x_cols : args['data'] = args['data'][list(set(args['data'].columns) - set(x_cols))] - real_df = args[x_cols].copy() + real_df = args['data'][x_cols].copy() args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) if 'gpu' in args : From 1178cb7343644b1c9b0c2158aad3c53452e0f3b8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 14:09:38 -0500 Subject: [PATCH 141/250] ... --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 7082b71..29d15a7 100644 --- a/pipeline.py +++ b/pipeline.py @@ -239,7 +239,7 @@ class Components : real_df = pd.DataFrame() if x_cols : args['data'] = args['data'][list(set(args['data'].columns) - set(x_cols))] - real_df = args['data'][x_cols].copy() + real_df = df['data'][x_cols].copy() args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) if 'gpu' in args : From 6bbe7677146b7914ff8ffa37dd44bfbce96837de Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 14:13:00 -0500 Subject: [PATCH 142/250] ... --- pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 29d15a7..6476221 100644 --- a/pipeline.py +++ b/pipeline.py @@ -239,7 +239,7 @@ class Components : real_df = pd.DataFrame() if x_cols : args['data'] = args['data'][list(set(args['data'].columns) - set(x_cols))] - real_df = df['data'][x_cols].copy() + real_df = df[x_cols].copy() args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) if 'gpu' in args : @@ -279,7 +279,7 @@ class Components : if x_cols : for _col in x_cols : if real_df[_col].unique().size > 0 : - _df[_col] = self.approximate(df[_col].fillna(-1)) + _df[_col] = self.approximate(real_df[_col].fillna(-1)) else: _df[_col] = -1 From fa8915a990873086040e29908edc8f9beeb5f220 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 10:54:44 -0500 Subject: [PATCH 143/250] bug fix: matrix allocation error --- data/maker/prepare/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index e15c63b..ecb47bd 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -237,7 +237,7 @@ class Input : # # @NOTE: For some reason, there is an out of memory error created here, this seems to fix it (go figure) # - _matrix = np.array([np.zeros(cols.size) for i in np.arange(row_count)]) + _matrix = np.array([np.repeat(0,cols.size) for i in range(row_count)]) [np.put(_matrix[i], np.where(cols == rows[i]) ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0] # else: # _matrix = cp.zeros([row_count,cols.size]) From a43247ac65101128a8a2a07f6cce45511e5864ee Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 11:17:34 -0500 Subject: [PATCH 144/250] logging generator .... 
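For context on the matrix-allocation fix a couple of patches back: the one-hot matrix in data/maker/prepare is built row by row with np.repeat as a workaround for an out-of-memory error seen with np.zeros on large inputs. The surrounding logic amounts to this sketch (sample values assumed):

    import numpy as np

    rows = np.array(['a', 'c', 'a', 'b'])     # observed values of one column
    cols = np.unique(rows)                    # the value space of that column
    row_count = rows.size

    # allocate row by row instead of np.zeros([row_count, cols.size])
    _matrix = np.array([np.repeat(0, cols.size) for i in range(0, row_count)])
    [np.put(_matrix[i], np.where(cols == rows[i]), 1)
        for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0]
    print(_matrix)                            # each row is the one-hot encoding of rows[i]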
--- pipeline.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 6476221..0a9c549 100644 --- a/pipeline.py +++ b/pipeline.py @@ -217,6 +217,7 @@ class Components : # df = args['data'] _info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[0]}} + logger.write(_info) _dc = pd.DataFrame() @@ -244,7 +245,8 @@ class Components : args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) if 'gpu' in args : args['gpu'] = self.set_gpu(gpu=args['gpu']) - + _info = {"module":"gan-prep","action":"prune","shape":{"rows":args['data'].shape[0],"columns":args['data'].shape[0]}} + logger.write(_info) candidates = (data.maker.generate(**args)) if 'sql.BQWriter' in ostore['type'] : #table = ".".join([ostore['['dataset'],args['context']]) From efd2fd6a9a8419049e59473a62f2e99762d399c7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 11:26:24 -0500 Subject: [PATCH 145/250] logging generator .... --- pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index 0a9c549..bbc54bc 100644 --- a/pipeline.py +++ b/pipeline.py @@ -189,7 +189,7 @@ class Components : This function will generate data and store it to a given, """ store = args['store']['logs'] - store['doc'] = args['context'] + store['args']['doc'] = args['context'] logger = factory.instance(**store) #type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) ostore = args['store']['target'] @@ -216,7 +216,7 @@ class Components : # # This will account for autopilot mode ... # df = args['data'] - _info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[0]}} + _info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[1]}} logger.write(_info) @@ -245,7 +245,7 @@ class Components : args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) if 'gpu' in args : args['gpu'] = self.set_gpu(gpu=args['gpu']) - _info = {"module":"gan-prep","action":"prune","shape":{"rows":args['data'].shape[0],"columns":args['data'].shape[0]}} + _info = {"module":"gan-prep","action":"prune","shape":{"rows":args['data'].shape[0],"columns":args['data'].shape[1]}} logger.write(_info) candidates = (data.maker.generate(**args)) if 'sql.BQWriter' in ostore['type'] : From 88d602de1cb5db449c997314465ce9f93fc5cb7a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 11:45:20 -0500 Subject: [PATCH 146/250] ... 
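One of the changes in this patch touches get_ignore(), which matches ignore terms against column names by substring, so a single term such as 'date' skips every column whose name contains it. A plain-python equivalent of that matching rule (illustrative only; column names assumed):

    import pandas as pd

    def get_ignore(_df, terms):
        # a column is skipped as soon as any ignore term occurs in its name
        return [name for name in _df.columns if any(field in name for field in terms)]

    _df = pd.DataFrame(columns=['person_id', 'birth_date', 'visit_start_date', 'gender'])
    print(get_ignore(_df, ['date']))          # ['birth_date', 'visit_start_date']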
--- pipeline.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index bbc54bc..87d2af6 100644 --- a/pipeline.py +++ b/pipeline.py @@ -79,7 +79,8 @@ class Components : if 'columns' in _args and 'data' in _args : _df = _args['data'] terms = _args['columns'] - return [name for name in _df.columns if np.sum( [int(field in name )for field in terms ]) > 0 ] + return [name for name in _df.columns if np.sum( [int(field in name )for field in terms ]) ] + return [] def set_gpu(self,**_args) : if 'gpu' in _args : @@ -247,7 +248,10 @@ class Components : args['gpu'] = self.set_gpu(gpu=args['gpu']) _info = {"module":"gan-prep","action":"prune","shape":{"rows":args['data'].shape[0],"columns":args['data'].shape[1]}} logger.write(_info) - candidates = (data.maker.generate(**args)) + if args['data'].shape[0] > 0 and args['data'].shape[1] > 0 : + candidates = (data.maker.generate(**args)) + else: + candidate = [df] if 'sql.BQWriter' in ostore['type'] : #table = ".".join([ostore['['dataset'],args['context']]) # writer = factory.instance(**ostore) From 0a346d7abc72434f57c477d4a2816d67bb388f03 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 11:46:36 -0500 Subject: [PATCH 147/250] ... --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 87d2af6..ece3030 100644 --- a/pipeline.py +++ b/pipeline.py @@ -251,7 +251,7 @@ class Components : if args['data'].shape[0] > 0 and args['data'].shape[1] > 0 : candidates = (data.maker.generate(**args)) else: - candidate = [df] + candidates = [df] if 'sql.BQWriter' in ostore['type'] : #table = ".".join([ostore['['dataset'],args['context']]) # writer = factory.instance(**ostore) From 73115724fe2d91d4ffbc5b21dec50c24d6963480 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 12:05:23 -0500 Subject: [PATCH 148/250] ... --- pipeline.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index ece3030..da7b27e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -216,8 +216,22 @@ class Components : # # # # This will account for autopilot mode ... # df = args['data'] - - _info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[1]}} + _cast = {} + if schema : + dtype = str + name = schema['name'] + novalue = -1 + if schema['type'] == 'INTEGER' : + dtype = np.int64 + + elif schema['type'] == 'FLOAT' : + dtype = np.float64 + else: + novalue = '' + _cast[schema['name']] = dtype + df[name] = df[name].fillna(novalue).astype(dtype) + + _info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[1]},"schema":schema} logger.write(_info) From 856d1e4bd7650b74f07f047f193777766dd1b947 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 12:09:34 -0500 Subject: [PATCH 149/250] ... 
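The loop being reshaped in this patch casts each column of the source frame according to its BigQuery schema type so downstream steps see consistent dtypes. A condensed sketch of the schema-to-dtype mapping (the function name is ours, the pipeline does this inline; schema entries assumed to be dictionaries with 'name' and 'type' keys, as produced from the reader's meta()):

    import numpy as np
    import pandas as pd

    def cast_from_schema(df, schema):
        for _item in schema:
            name, ftype = _item['name'], _item['type']
            if ftype in ('INTEGER', 'NUMERIC'):
                dtype, novalue = np.int64, -1
            elif ftype == 'FLOAT':
                dtype, novalue = np.float64, -1
            else:
                dtype, novalue = str, ''
            df[name] = df[name].fillna(novalue).astype(dtype)
        return df

    df = pd.DataFrame({'age': [30.0, None], 'gender': ['M', None]})
    schema = [{'name': 'age', 'type': 'INTEGER'}, {'name': 'gender', 'type': 'STRING'}]
    print(cast_from_schema(df, schema).dtypes)   # age: int64, gender: object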
--- pipeline.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/pipeline.py b/pipeline.py index da7b27e..1ca19e5 100644 --- a/pipeline.py +++ b/pipeline.py @@ -218,18 +218,19 @@ class Components : # df = args['data'] _cast = {} if schema : - dtype = str - name = schema['name'] - novalue = -1 - if schema['type'] == 'INTEGER' : - dtype = np.int64 - - elif schema['type'] == 'FLOAT' : - dtype = np.float64 - else: - novalue = '' - _cast[schema['name']] = dtype - df[name] = df[name].fillna(novalue).astype(dtype) + for _item in schem : + dtype = str + name = _item['name'] + novalue = -1 + if _item['type'] == 'INTEGER' : + dtype = np.int64 + + elif _item['type'] == 'FLOAT' : + dtype = np.float64 + else: + novalue = '' + # _cast[schema['name']] = dtype + df[name] = df[name].fillna(novalue).astype(dtype) _info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[1]},"schema":schema} logger.write(_info) From 0f82f002dfb7512f358e786635f492251e714d20 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 12:11:40 -0500 Subject: [PATCH 150/250] ... --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 1ca19e5..8d35cd8 100644 --- a/pipeline.py +++ b/pipeline.py @@ -218,7 +218,7 @@ class Components : # df = args['data'] _cast = {} if schema : - for _item in schem : + for _item in schema : dtype = str name = _item['name'] novalue = -1 From dbbe0d94ced4649eefc4e245d50b4304c7a79bca Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 12:51:27 -0500 Subject: [PATCH 151/250] bg fix : approximation --- pipeline.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pipeline.py b/pipeline.py index 8d35cd8..00bb80c 100644 --- a/pipeline.py +++ b/pipeline.py @@ -166,7 +166,9 @@ class Components : :param values array of values to be approximated """ if values.dtype in [int,float] : - r = np.random.dirichlet(values) + # + # @TODO: create bins? 
+ r = np.random.dirichlet(values+.001) #-- dirichlet doesn't work on values with zeros x = [] _type = values.dtype for index in np.arange(values.size) : @@ -222,7 +224,7 @@ class Components : dtype = str name = _item['name'] novalue = -1 - if _item['type'] == 'INTEGER' : + if _item['type'] in ['INTEGER','NUMERIC']: dtype = np.int64 elif _item['type'] == 'FLOAT' : @@ -296,11 +298,11 @@ class Components : # - The original dataset has all the fields except those that need to be synthesized # - _df = _df[list(set(_df.columns) - set(skip_columns))] + _df = _df[list(set(_df.columns) - set(skip_columns))].copy() if x_cols : for _col in x_cols : if real_df[_col].unique().size > 0 : - _df[_col] = self.approximate(real_df[_col].fillna(-1)) + _df[_col] = self.approximate(real_df[_col]) else: _df[_col] = -1 From 8997a5ca10dc0a2f9dd58ea3e7ee13a1415298ae Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 13:29:57 -0500 Subject: [PATCH 152/250] bg fix : approximation --- pipeline.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 00bb80c..1bb0707 100644 --- a/pipeline.py +++ b/pipeline.py @@ -300,12 +300,19 @@ class Components : _df = _df[list(set(_df.columns) - set(skip_columns))].copy() if x_cols : + _approx = {} for _col in x_cols : if real_df[_col].unique().size > 0 : + + _df[_col] = self.approximate(real_df[_col]) + _approx[_col] = { + "io":{"min":_df[_col].min(),"max":_df[_col].max(),"mean":_df[_col].mean(),"sd":_df[_col].values.std(),"missing": _df[_col].where(_df[_col] == -1).dropna().count(),"zeros":_df[_col].where(_df[_col] == 0).dropna().count()}, + "real":{"min":real_df[_col].min(),"max":real_df[_col].max(),"mean":real_df[_col].mean(),"sd":real_df[_col].values.std(),"missing": real_df[_col].where(_df[_col] == -1).dropna().count(),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count()} + } else: _df[_col] = -1 - + logger.write({"module":"gan-generate","action":"approximate","status":_approx}) if set(df.columns) & set(_df.columns) : _columns = set(df.columns) - set(_df.columns) df = df[_columns] From fc7b694d0272b9c310b9d501d932e660fc44ab97 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 13:36:47 -0500 Subject: [PATCH 153/250] bg fix : approximation --- pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 1bb0707..2ed5cdc 100644 --- a/pipeline.py +++ b/pipeline.py @@ -307,8 +307,8 @@ class Components : _df[_col] = self.approximate(real_df[_col]) _approx[_col] = { - "io":{"min":_df[_col].min(),"max":_df[_col].max(),"mean":_df[_col].mean(),"sd":_df[_col].values.std(),"missing": _df[_col].where(_df[_col] == -1).dropna().count(),"zeros":_df[_col].where(_df[_col] == 0).dropna().count()}, - "real":{"min":real_df[_col].min(),"max":real_df[_col].max(),"mean":real_df[_col].mean(),"sd":real_df[_col].values.std(),"missing": real_df[_col].where(_df[_col] == -1).dropna().count(),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count()} + "io":{"min":_df[_col].min().astype(float),"max":_df[_col].max().astype(float),"mean":_df[_col].mean().astype(float),"sd":_df[_col].values.std().astype(float),"missing": _df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":_df[_col].where(_df[_col] == 0).dropna().count().astype(float)}, + "real":{"min":real_df[_col].min().astype(float),"max":real_df[_col].max().astype(float),"mean":real_df[_col].mean().astype(float),"sd":real_df[_col].values.std().astype(float),"missing": 
real_df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count().astype(float)} } else: _df[_col] = -1 From 12d7573ba8915f5ee43e81f46d5a556db899fbcb Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 13:52:15 -0500 Subject: [PATCH 154/250] bg fix : approximation --- pipeline.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 2ed5cdc..d73f1fc 100644 --- a/pipeline.py +++ b/pipeline.py @@ -169,14 +169,24 @@ class Components : # # @TODO: create bins? r = np.random.dirichlet(values+.001) #-- dirichlet doesn't work on values with zeros + _sd = values[values > 0].std() + _me = values[values > 0].mean() x = [] _type = values.dtype for index in np.arange(values.size) : if np.random.choice([0,1],1)[0] : value = values[index] + (values[index] * r[index]) + else : value = values[index] - (values[index] * r[index]) + # + # randomly shifting the measurements + if np.random.choice([0,1],1)[0] and _me > _sd: + if np.random.choice([0,1],1)[0] : + value = value * np.divide(_me,_sd) + else: + value = value + (np.divide(_me,_sd)) value = int(value) if _type == int else np.round(value,2) x.append( value) np.random.shuffle(x) @@ -305,7 +315,7 @@ class Components : if real_df[_col].unique().size > 0 : - _df[_col] = self.approximate(real_df[_col]) + _df[_col] = self.approximate(real_df[_col].values) _approx[_col] = { "io":{"min":_df[_col].min().astype(float),"max":_df[_col].max().astype(float),"mean":_df[_col].mean().astype(float),"sd":_df[_col].values.std().astype(float),"missing": _df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":_df[_col].where(_df[_col] == 0).dropna().count().astype(float)}, "real":{"min":real_df[_col].min().astype(float),"max":real_df[_col].max().astype(float),"mean":real_df[_col].mean().astype(float),"sd":real_df[_col].values.std().astype(float),"missing": real_df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count().astype(float)} From f26795387ef60690512ee33b3dcfaee2ca7c5c10 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 7 Apr 2021 15:30:59 -0500 Subject: [PATCH 155/250] feature: bootstrap-like with candidates --- data/gan.py | 26 +++++++++---- data/maker/__init__.py | 14 +++++-- data/maker/prepare/__init__.py | 2 +- pipeline.py | 71 +++++++++++++++++++++++++++++++--- 4 files changed, 94 insertions(+), 19 deletions(-) diff --git a/data/gan.py b/data/gan.py index dd8ea6a..643e838 100644 --- a/data/gan.py +++ b/data/gan.py @@ -67,8 +67,9 @@ class GNet : self.NUM_GPUS = 0 else: self.NUM_GPUS = len(self.GPU_CHIPS) + # os.environ['CUDA_VISIBLE_DEVICES'] = str(self.GPU_CHIPS[0]) - self.PARTITION = args['partition'] + self.PARTITION = args['partition'] if 'partition' in args else None # if self.NUM_GPUS > 1 : # os.environ['CUDA_VISIBLE_DEVICES'] = "4" @@ -117,9 +118,14 @@ class GNet : for key in ['train','output'] : self.mkdir(os.sep.join([self.log_dir,key])) self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT])) + if 'partition' in args : + self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT,str(args['partition'])])) self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + if 'partition' in args : + self.train_dir = os.sep.join([self.train_dir,str(args['partition'])]) + self.out_dir = os.sep.join([self.out_dir,str(args['partition'])]) # if self.logger : # We will clear the logs 
from the data-store @@ -130,7 +136,7 @@ class GNet : # db.backup.insert({'name':column,'logs':list(db[column].find()) }) # db[column].drop() - def load_meta(self,column): + def load_meta(self,**args): """ This function is designed to accomodate the uses of the sub-classes outside of a strict dependency model. Because prediction and training can happen independently @@ -145,6 +151,9 @@ class GNet : setattr(self,key,value) self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + if 'partition' in args : + self.train_dir = os.sep.join([self.train_dir,str(args['partition'])]) + self.out_dir = os.sep.join([self.out_dir,str(args['partition'])]) def log_meta(self,**args) : @@ -265,9 +274,9 @@ class Generator (GNet): #tf.add_to_collection('glosses', loss) tf.compat.v1.add_to_collection('glosses', loss) return loss, loss - def load_meta(self, column): - super().load_meta(column) - self.discriminator.load_meta(column) + def load_meta(self, **args): + super().load_meta(**args) + self.discriminator.load_meta(**args) def network(self,**args) : """ This function will build the network that will generate the synthetic candidates @@ -454,6 +463,7 @@ class Train (GNet): # - determine if the GPU/CPU are busy # for i in self.GPU_CHIPS : #range(self.NUM_GPUS): + with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: if self._LABEL is not None : @@ -559,9 +569,9 @@ class Predict(GNet): # self.MISSING_VALUES = args['no_value'] # self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value'] - def load_meta(self, column): - super().load_meta(column) - self.generator.load_meta(column) + def load_meta(self, **args): + super().load_meta(**args) + self.generator.load_meta(**args) self.ROW_COUNT = self.oROW_COUNT def apply(self,**args): suffix = self.CONTEXT #self.get.suffix() diff --git a/data/maker/__init__.py b/data/maker/__init__.py index bfd6a5f..803590a 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -112,7 +112,8 @@ def train (**_args): args ['max_epochs'] = _args['max_epochs'] args['matrix_size'] = _matrix.shape[0] args['batch_size'] = 2000 - args['partition'] = 0 if 'partition' not in _args else _args['partition'] + if 'partition' in _args : + args['partition'] = _args['partition'] if 'gpu' in _args : args['gpu'] = _args['gpu'] # os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' @@ -121,7 +122,8 @@ def train (**_args): # # @TODO: Write the map.json in the output directory for the logs # - f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']),'w') + # f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']),'w') + f = open(os.sep.join([trainer.out_dir,'map.json']),'w') f.write(json.dumps(_map)) f.close() @@ -140,7 +142,11 @@ def generate(**_args): :param context :param logs """ - f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json'])) + partition = _args['partition'] if 'partition' in _args else None + if not partition : + f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json'])) + else: + f = open(os.sep.join([_args['logs'],'output',_args['context'],str(partition),'map.json'])) _map = json.loads(f.read()) f.close() # if 'file' in _args : @@ -165,7 +171,7 @@ def generate(**_args): args['gpu'] = _args['gpu'] handler = gan.Predict (**args) - handler.load_meta(None) + handler.load_meta(column=None) 
# # Let us now format the matrices by reverting them to a data-frame with values # diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index ecb47bd..5ace56a 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -237,7 +237,7 @@ class Input : # # @NOTE: For some reason, there is an out of memory error created here, this seems to fix it (go figure) # - _matrix = np.array([np.repeat(0,cols.size) for i in range(row_count)]) + _matrix = np.array([np.repeat(0,cols.size) for i in range(0,row_count)]) [np.put(_matrix[i], np.where(cols == rows[i]) ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0] # else: # _matrix = cp.zeros([row_count,cols.size]) diff --git a/pipeline.py b/pipeline.py index d73f1fc..3f8358b 100644 --- a/pipeline.py +++ b/pipeline.py @@ -146,6 +146,8 @@ class Components : _args['data'] = _args['data'][list(set(_args['data'].columns) - set(x_cols))] if 'gpu' in args : _args['gpu'] = self.set_gpu(gpu=args['gpu']) + if 'partition' in args : + _args['partition'] = args['partition'] if df.shape[0] and df.shape[0] : # # We have a full blown matrix to be processed @@ -154,7 +156,7 @@ class Components : print ("... skipping training !!") if 'autopilot' in ( list(args.keys())) : - + args['data'] = df print (['autopilot mode enabled ....',args['context']]) self.generate(args) @@ -171,6 +173,7 @@ class Components : r = np.random.dirichlet(values+.001) #-- dirichlet doesn't work on values with zeros _sd = values[values > 0].std() _me = values[values > 0].mean() + _mi = values.min() x = [] _type = values.dtype for index in np.arange(values.size) : @@ -182,7 +185,7 @@ class Components : value = values[index] - (values[index] * r[index]) # # randomly shifting the measurements - if np.random.choice([0,1],1)[0] and _me > _sd: + if np.random.choice([0,1],1)[0] and _me > _sd : if np.random.choice([0,1],1)[0] : value = value * np.divide(_me,_sd) else: @@ -273,6 +276,9 @@ class Components : args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) if 'gpu' in args : args['gpu'] = self.set_gpu(gpu=args['gpu']) + # if 'partition' in args : + # args['logs'] = os.sep.join([args['logs'],str(args['partition'])]) + _info = {"module":"gan-prep","action":"prune","shape":{"rows":args['data'].shape[0],"columns":args['data'].shape[1]}} logger.write(_info) if args['data'].shape[0] > 0 and args['data'].shape[1] > 0 : @@ -459,12 +465,18 @@ if __name__ == '__main__' : # COLUMNS = DATA.columns # DATA = np.array_split(DATA,PART_SIZE) # args['schema'] = schema + GPU_CHIPS = SYS_ARGS['gpu'] if 'gpu' in SYS_ARGS else None + if GPU_CHIPS and type(GPU_CHIPS) != list : + GPU_CHIPS = [int(_id.strip()) for _id in GPU_CHIPS.split(',')] if type(GPU_CHIPS) == str else [GPU_CHIPS] + if 'gpu' in SYS_ARGS : + args['gpu'] = GPU_CHIPS + jobs = [] if 'generate' in SYS_ARGS : # # Let us see if we have partitions given the log folder content = os.listdir( os.sep.join([args['logs'],'train',args['context']])) - generator = Components() + # if ''.join(content).isnumeric() : # # @@ -508,13 +520,60 @@ if __name__ == '__main__' : # else: # generator.generate(args) # Components.generate(args) - generator.generate(args) + if '--all-chips' in SYS_ARGS and GPU_CHIPS: + index = 0 + jobs = [] + for _id in GPU_CHIPS : + _args = copy.deepcopy(args) + _args['gpu'] = [int(_gpu)] + _args['partition'] = index + index += 1 + make = lambda _params: (Components()).generate(_params) + job = Process(target=make,args=( dict(_args),)) + job.name = 'Trainer # ' + 
str(index) + job.start() + jobs.append(job) + pass + else: + generator = Components() + generator.generate(args) else: # DATA = np.array_split(DATA,PART_SIZE) - agent = Components() - agent.train(**args) + # + # Let us create n-jobs across n-gpus, The assumption here is the data that is produced will be a partition + # @TODO: Find better name for partition + # + if GPU_CHIPS and '--all-chips' in SYS_ARGS: + index = 0 + + for _gpu in GPU_CHIPS : + _args = copy.deepcopy(args) + _args['gpu'] = [int(_gpu)] + _args['partition'] = index + index += 1 + make = lambda _params: (Components()).train(**_params) + job = Process(target=make,args=( dict(_args),)) + job.name = 'Trainer # ' + str(index) + job.start() + jobs.append(job) + + + + + else: + # + # The choice of the chip will be made internally + agent = Components() + agent.train(**args) + # + # If we have any obs we should wait till they finish + # + while len(jobs)> 0 : + jobs = [job for job in jobs if job.is_alive()] + time.sleep(2) + # jobs = [] # for index in range(0,PART_SIZE) : # if 'focus' in args and int(args['focus']) != index : From 6a6352169c50beb4a12c39107fc3cbd32fdbc6c7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 12 Apr 2021 12:55:01 -0500 Subject: [PATCH 156/250] adding shuffle feature to be used for very large spaces --- pipeline.py | 88 +++++++++++++++++++++++++++++++++++++++++++++-------- setup.py | 2 +- 2 files changed, 77 insertions(+), 13 deletions(-) diff --git a/pipeline.py b/pipeline.py index 3f8358b..9d33873 100644 --- a/pipeline.py +++ b/pipeline.py @@ -198,6 +198,52 @@ class Components : return values pass + def shuffle(self,_args): + if 'data' in args : + df = data['data'] + else: + reader = factory.instance(**args['store']['source']) + if 'file' in args : + df = pd.read_csv(args['file']) + else: + if 'row_limit' in args and 'sql' in args: + df = reader.read(sql=args['sql'],limit=args['row_limit']) + else: + df = reader.read(sql=args['sql']) + schema = None + if 'schema' not in args and hasattr(reader,'meta') and 'file' not in args: + schema = reader.meta(table=args['from']) + schema = [{"name":_item.name,"type":_item.field_type} for _item in schema] + # + # We are shufling designated colmns and will be approximating the others + # + x_cols = [] #-- coumns tobe approximated. 
+ _cols = [] #-- columns to be ignored + if 'continuous' in args : + x_cols = args['continuous'] + if 'ignore' in args and 'columns' in args['ignore'] : + _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) + + + for name in list (set(df.columns) - set(_cols)) : + i = np.arange(df.shape[0]) + np.random.shuffle(i) + if name in x_cols : + df[name] = self.approximate(df[name].values) + df[name] = df.iloc[i][name] + self.post(data=df,schema=schema,store=args['store']['target']) + def post(self,**_args) : + _schema = _args['schema'] if 'schema' in _args else None + writer = factory.instance(**_args['store']) + _df = _args['data'] + if _schema : + + for _item in _schema : + if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : + _df[_item['name']] = _df[_item['name']].astype(str) + writer.write(_df,schema=_schema,table=args['from']) + else: + writer.write(_df,table=args['from']) # @staticmethod def generate(self,args): @@ -338,20 +384,25 @@ class Components : _df = pd.DataFrame.join(df,_df) - if _schema : - for _item in _schema : - if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : - _df[_item['name']] = _df[_item['name']].astype(str) + # if _schema : + # for _item in _schema : + # if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : + # _df[_item['name']] = _df[_item['name']].astype(str) - pass + # pass + _params = {'data':_df,'store' : ostore} if _schema : - writer.write(_df[cols],schema=_schema,table=args['from']) - else: - writer.write(_df[cols],table=args['from']) - # writer.write(df,table=table) - pass - else: + _params ['schema'] = _schema + self.post(**_params) + # if _schema : + # writer.write(_df[cols],schema=_schema,table=args['from']) + # self.post(data=_df,schema=) + # else: + # writer.write(_df[cols],table=args['from']) + pass + # else: + # pass # # @@ -537,7 +588,20 @@ if __name__ == '__main__' : else: generator = Components() generator.generate(args) - + elif 'shuffle' in SYS_ARGS : + index = 0 + if GPU_CHIPS and '--all-chips': + + for index in GPU_CHIPS : + publisher = lambda _params: ( Components() ).shuffle(_params) + job = Process (target = publisher,args=( dict(args))) + job.name = 'Shuffler #' + str(index) + job.start() + jobs.append(job) + else: + shuffler = Components() + shuffler.shuffle(args) + pass else: # DATA = np.array_split(DATA,PART_SIZE) diff --git a/setup.py b/setup.py index 544f4b3..4eb869f 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.4", + "version":"1.4.5", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From abed87db22ad47c1d8e9c717967692078248fb36 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 12 Apr 2021 15:11:41 -0500 Subject: [PATCH 157/250] bug fix: column specification for shuffle --- pipeline.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index 9d33873..dcf649c 100644 --- a/pipeline.py +++ b/pipeline.py @@ -224,12 +224,13 @@ class Components : if 'ignore' in args and 'columns' in args['ignore'] : _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) - - for name in list (set(df.columns) - set(_cols)) : + columns = 
args['columns'] if 'columns' in args else df.columns + columns = list(set(columns) - set(_cols)) + for name in columns : i = np.arange(df.shape[0]) np.random.shuffle(i) if name in x_cols : - df[name] = self.approximate(df[name].values) + df[name] = self.approximate(df.iloc[i][name].values) df[name] = df.iloc[i][name] self.post(data=df,schema=schema,store=args['store']['target']) def post(self,**_args) : From 677a99425ae6c20c05f53869ad2c5bf628befb41 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 13 Apr 2021 10:24:36 -0500 Subject: [PATCH 158/250] bug fix: date formatting --- pipeline.py | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/pipeline.py b/pipeline.py index dcf649c..0aba799 100644 --- a/pipeline.py +++ b/pipeline.py @@ -11,7 +11,7 @@ from google.cloud import bigquery as bq import data.maker import copy from data.params import SYS_ARGS - + # # The configuration array is now loaded and we will execute the pipe line as follows @@ -205,6 +205,8 @@ class Components : reader = factory.instance(**args['store']['source']) if 'file' in args : df = pd.read_csv(args['file']) + elif 'data' in _args : + df = _args['data'] else: if 'row_limit' in args and 'sql' in args: df = reader.read(sql=args['sql'],limit=args['row_limit']) @@ -226,25 +228,45 @@ class Components : columns = args['columns'] if 'columns' in args else df.columns columns = list(set(columns) - set(_cols)) - for name in columns : - i = np.arange(df.shape[0]) - np.random.shuffle(i) - if name in x_cols : - df[name] = self.approximate(df.iloc[i][name].values) - df[name] = df.iloc[i][name] + # for name in columns: + # i = np.arange(df.shape[0]) + # np.random.shuffle(i) + # if name in x_cols : + # if df[name].unique().size > 0 : + # df[name] = self.approximate(df.iloc[i][name].fillna(0).values) + # df[name] = df[name].copy().astype(str) + # pass + + df.index = np.arange(df.shape[0]) self.post(data=df,schema=schema,store=args['store']['target']) def post(self,**_args) : _schema = _args['schema'] if 'schema' in _args else None writer = factory.instance(**_args['store']) _df = _args['data'] if _schema : - + columns = [] for _item in _schema : - if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : - _df[_item['name']] = _df[_item['name']].astype(str) + name = _item['name'] + _type = str + _value = 0 + if _item['type'] in ['DATE','TIMESTAMP','DATETIMESTAMP','DATETIME'] : + if _item['type'] == 'DATE' : + _df[name] = _df[name].dt.date + + + + else: + if _item['type'] == 'INTEGER' : + _type = np.int64 + elif _item['type'] in ['FLOAT','NUMERIC']: + _type = np.float64 + else: + _value = '' + _df[name] = _df[name].fillna(_value).astype(_type) + columns.append(name) writer.write(_df,schema=_schema,table=args['from']) else: - writer.write(_df,table=args['from']) + writer.write(_df[columns],table=args['from']) # @staticmethod def generate(self,args): From be55b14e2b5723f39c387d1ebd97e7daf333463d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 13 Apr 2021 17:41:30 -0500 Subject: [PATCH 159/250] bug fix --- pipeline.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pipeline.py b/pipeline.py index 0aba799..2a3919c 100644 --- a/pipeline.py +++ b/pipeline.py @@ -228,13 +228,13 @@ class Components : columns = args['columns'] if 'columns' in args else df.columns columns = list(set(columns) - set(_cols)) - # for name in columns: - # i = np.arange(df.shape[0]) - # np.random.shuffle(i) - # if name in x_cols : - # if df[name].unique().size 
> 0 : - # df[name] = self.approximate(df.iloc[i][name].fillna(0).values) - # df[name] = df[name].copy().astype(str) + for name in columns: + i = np.arange(df.shape[0]) + np.random.shuffle(i) + if name in x_cols : + if df[name].unique().size > 0 : + df[name] = self.approximate(df.iloc[i][name].fillna(0).values) + df[name] = df[name].astype(str) # pass df.index = np.arange(df.shape[0]) @@ -539,7 +539,7 @@ if __name__ == '__main__' : # COLUMNS = DATA.columns # DATA = np.array_split(DATA,PART_SIZE) # args['schema'] = schema - GPU_CHIPS = SYS_ARGS['gpu'] if 'gpu' in SYS_ARGS else None + GPU_CHIPS = args['gpu'] if 'gpu' in args else None if GPU_CHIPS and type(GPU_CHIPS) != list : GPU_CHIPS = [int(_id.strip()) for _id in GPU_CHIPS.split(',')] if type(GPU_CHIPS) == str else [GPU_CHIPS] if 'gpu' in SYS_ARGS : @@ -594,7 +594,7 @@ if __name__ == '__main__' : # else: # generator.generate(args) # Components.generate(args) - if '--all-chips' in SYS_ARGS and GPU_CHIPS: + if 'all-chips' in SYS_ARGS and GPU_CHIPS: index = 0 jobs = [] for _id in GPU_CHIPS : @@ -613,7 +613,7 @@ if __name__ == '__main__' : generator.generate(args) elif 'shuffle' in SYS_ARGS : index = 0 - if GPU_CHIPS and '--all-chips': + if GPU_CHIPS and 'all-chips' in SYS_ARGS: for index in GPU_CHIPS : publisher = lambda _params: ( Components() ).shuffle(_params) @@ -632,7 +632,7 @@ if __name__ == '__main__' : # Let us create n-jobs across n-gpus, The assumption here is the data that is produced will be a partition # @TODO: Find better name for partition # - if GPU_CHIPS and '--all-chips' in SYS_ARGS: + if GPU_CHIPS and 'all-chips' in SYS_ARGS: index = 0 for _gpu in GPU_CHIPS : From 567671c43ec783c97e65186c53536b2fc47b4fbd Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 13 Apr 2021 17:43:43 -0500 Subject: [PATCH 160/250] bug fix --- pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 2a3919c..ae6c2b8 100644 --- a/pipeline.py +++ b/pipeline.py @@ -634,14 +634,14 @@ if __name__ == '__main__' : # if GPU_CHIPS and 'all-chips' in SYS_ARGS: index = 0 - + print (['... 
launching ',len(GPU_CHIPS),' jobs',args['context']]) for _gpu in GPU_CHIPS : _args = copy.deepcopy(args) _args['gpu'] = [int(_gpu)] _args['partition'] = index index += 1 make = lambda _params: (Components()).train(**_params) - job = Process(target=make,args=( dict(_args),)) + job = Process(target=make,args=( _args,)) job.name = 'Trainer # ' + str(index) job.start() jobs.append(job) From e44fae01a6307ff4310505c2d5dddf6db69fd715 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 13 Apr 2021 17:46:24 -0500 Subject: [PATCH 161/250] bug fix --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index ae6c2b8..56b742f 100644 --- a/pipeline.py +++ b/pipeline.py @@ -617,7 +617,7 @@ if __name__ == '__main__' : for index in GPU_CHIPS : publisher = lambda _params: ( Components() ).shuffle(_params) - job = Process (target = publisher,args=( dict(args))) + job = Process (target = publisher,args=( args,)) job.name = 'Shuffler #' + str(index) job.start() jobs.append(job) From 3eb28dd798f53e551d0b178b9459ee935dd98e11 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 13 Apr 2021 17:53:15 -0500 Subject: [PATCH 162/250] bug fix: data-typing --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 56b742f..c9d01d0 100644 --- a/pipeline.py +++ b/pipeline.py @@ -234,7 +234,7 @@ class Components : if name in x_cols : if df[name].unique().size > 0 : df[name] = self.approximate(df.iloc[i][name].fillna(0).values) - df[name] = df[name].astype(str) + # df[name] = df[name].astype(str) # pass df.index = np.arange(df.shape[0]) From 94798fd9a2572245942b255cb850e620dc35b877 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 28 Apr 2021 16:47:38 -0500 Subject: [PATCH 163/250] bug fix: finalize to remove duplicate keys --- pipeline.py | 123 ++++++++++++++++++++++++++-------------------------- setup.py | 2 +- 2 files changed, 63 insertions(+), 62 deletions(-) diff --git a/pipeline.py b/pipeline.py index c9d01d0..78559cb 100644 --- a/pipeline.py +++ b/pipeline.py @@ -268,7 +268,48 @@ class Components : else: writer.write(_df[columns],table=args['from']) - # @staticmethod + def finalize(self,args): + """ + This function performs post-processing opertions on a synthetic table i.e : + - remove duplicate keys + - remove orphaned keys i.e + """ + reader = factory.instance(**args['store']['source']) + logger = factory.instance(**args['store']['logs']) + target = args['store']['target']['args']['dataset'] + source = args['store']['source']['args']['dataset'] + table = args['from'] + schema = reader.meta(table=args['from']) + # + # keys : + unique_field = "_".join([args['from'],'id']) if 'unique_fields' not in args else args['unique_fields'] + fields = [ item.name if item.name != unique_field else "y."+item.name for item in schema] + SQL = [ + "SELECT :fields FROM ", + "(SELECT ROW_NUMBER() OVER() AS row_number,* FROM :target.:table) x","INNER JOIN", + "(SELECT ROW_NUMBER() OVER() AS row_number, :unique_field FROM :source.:table) y", + "ON y.row_number = x.row_number" + ] + SQL = " ".join(SQL).replace(":fields",",".join(fields)).replace(":table",table).replace(":source",source).replace(":target",target) + SQL = SQL.replace(":unique_field",unique_field) + # + # Use a native job to get this done ... 
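# For illustration only: assuming a hypothetical table 'observation' with schema
# (observation_id, person_id, value_as_number), a source dataset 'original_ds' and a
# target dataset 'synthetic_ds', the template above would render roughly as
#
#   SELECT y.observation_id,person_id,value_as_number FROM
#   (SELECT ROW_NUMBER() OVER() AS row_number,* FROM synthetic_ds.observation) x INNER JOIN
#   (SELECT ROW_NUMBER() OVER() AS row_number, observation_id FROM original_ds.observation) y
#   ON y.row_number = x.row_number
#
# i.e. each synthetic row keeps its generated attributes but is re-keyed with an
# observation_id drawn from the original table, which is what removes duplicate keys.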
+ # + client = bq.Client.from_service_account_json(args['store']['source']['args']["private_key"]) + job = bq.QueryJobConfig() + job.destination = client.dataset(target).table(table) + job.use_query_cache = True + job.allow_large_results = True + # job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY) + job.write_disposition = "WRITE_TRUNCATE" + job.priority = 'BATCH' + r = client.query(SQL,location='US',job_config=job) + logger.write({"job":r.job_id,"action":"finalize", "args":{"sql":SQL,"source":"".join([source,table]),"destimation":".".join([target,table])}}) + # + # Keep a log of what just happened... + # + otable = ".".join([args['store']['source']['args']['dataset'],args['from']]) + dtable = ".".join([args['store']['target']['args']['dataset'],args['from']]) def generate(self,args): """ This function will generate data and store it to a given, @@ -527,18 +568,7 @@ if __name__ == '__main__' : # @TODO: # Log what was initiated so we have context of this processing ... # - # if 'listen' not in SYS_ARGS : - # if 'file' in args : - # DATA = pd.read_csv(args['file']) ; - # schema = [] - # else: - # DATA = Components().get(args) - # client = bq.Client.from_service_account_json(args["private_key"]) - # schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema - # COLUMNS = DATA.columns - # DATA = np.array_split(DATA,PART_SIZE) - # args['schema'] = schema GPU_CHIPS = args['gpu'] if 'gpu' in args else None if GPU_CHIPS and type(GPU_CHIPS) != list : GPU_CHIPS = [int(_id.strip()) for _id in GPU_CHIPS.split(',')] if type(GPU_CHIPS) == str else [GPU_CHIPS] @@ -550,50 +580,6 @@ if __name__ == '__main__' : # Let us see if we have partitions given the log folder content = os.listdir( os.sep.join([args['logs'],'train',args['context']])) - - - # if ''.join(content).isnumeric() : - # # - # # we have partitions we are working with - - # jobs = [] - - # # columns = DATA.columns.tolist() - - # # DATA = np.array_split(DATA,PART_SIZE) - - # for index in range(0,PART_SIZE) : - # if 'focus' in args and int(args['focus']) != index : - # # - # # This handles failures/recoveries for whatever reason - # # If we are only interested in generating data for a given partition - # continue - # # index = id.index(id) - - # args['partition'] = index - # args['data'] = DATA[index] - # if int(args['num_gpu']) > 1 : - # args['gpu'] = index - # else: - # args['gpu']=0 - - # make = lambda _args: (Components()).generate(_args) - # job = Process(target=make,args=(args,)) - # job.name = 'generator # '+str(index) - # job.start() - # jobs.append(job) - # # if len(jobs) == 1 : - # # job.join() - - # print (["Started ",len(jobs),"generators" if len(jobs)>1 else "generator" ]) - # while len(jobs)> 0 : - # jobs = [job for job in jobs if job.is_alive()] - # time.sleep(2) - - # # generator.generate(args) - # else: - # generator.generate(args) - # Components.generate(args) if 'all-chips' in SYS_ARGS and GPU_CHIPS: index = 0 jobs = [] @@ -625,7 +611,7 @@ if __name__ == '__main__' : shuffler = Components() shuffler.shuffle(args) pass - else: + elif 'train' in SYS_ARGS: # DATA = np.array_split(DATA,PART_SIZE) # @@ -657,10 +643,25 @@ if __name__ == '__main__' : # # If we have any obs we should wait till they finish # - while len(jobs)> 0 : - jobs = [job for job in jobs if job.is_alive()] - time.sleep(2) - + DIRTY = 0 + while len(jobs)> 0 : + DIRTY =1 + jobs = [job for job in jobs if job.is_alive()] + time.sleep(2) + if DIRTY: + print (["..:: jobs finished "]) + # + # We need to 
harmonize the keys if any at all in this case we do this for shuffle or generate operations + # + print (['finalize' in SYS_ARGS, ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) ]) + if 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) : + # + # We should pull all the primary keys and regenerate them in order to insure some form of consistency + # + + (Components()).finalize(args) + # finalize(args) + pass # jobs = [] # for index in range(0,PART_SIZE) : # if 'focus' in args and int(args['focus']) != index : diff --git a/setup.py b/setup.py index 4eb869f..d75f1d3 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.5", + "version":"1.4.6", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From 089c1d1d76b36ef6b054969cfc94c3141db7e3d9 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 28 Apr 2021 18:16:55 -0500 Subject: [PATCH 164/250] bug fix ... --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 78559cb..5759696 100644 --- a/pipeline.py +++ b/pipeline.py @@ -287,7 +287,7 @@ class Components : SQL = [ "SELECT :fields FROM ", "(SELECT ROW_NUMBER() OVER() AS row_number,* FROM :target.:table) x","INNER JOIN", - "(SELECT ROW_NUMBER() OVER() AS row_number, :unique_field FROM :source.:table) y", + "(SELECT ROW_NUMBER() OVER() AS row_number, :unique_field FROM :source.:table ORDER BY RAND()) y", "ON y.row_number = x.row_number" ] SQL = " ".join(SQL).replace(":fields",",".join(fields)).replace(":table",table).replace(":source",source).replace(":target",target) From 79c5f3ff259993e89de370f4847759b191e024e2 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 10 May 2021 14:10:31 -0500 Subject: [PATCH 165/250] bug fix ... 
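For orientation, a rough sketch of what Components.post() does with the schema it is
handed before writing (the two-column schema below is made up, not from the pipeline):

    _schema = [{"name":"person_id","type":"INTEGER"},
               {"name":"measurement_date","type":"DATE"}]
    # DATE-like columns are reduced to their date component, INTEGER columns are filled
    # with 0 and cast to np.int64, FLOAT/NUMERIC to np.float64, and the remaining
    # columns are filled with '' and cast to str before the data-transport writer
    # receives the frame together with _schema.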
--- pipeline.py | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index 5759696..2311007 100644 --- a/pipeline.py +++ b/pipeline.py @@ -244,7 +244,7 @@ class Components : writer = factory.instance(**_args['store']) _df = _args['data'] if _schema : - columns = [] + columns = _df.columns.tolist for _item in _schema : name = _item['name'] _type = str @@ -266,7 +266,7 @@ class Components : columns.append(name) writer.write(_df,schema=_schema,table=args['from']) else: - writer.write(_df[columns],table=args['from']) + writer.write(_df,table=args['from']) def finalize(self,args): """ diff --git a/setup.py b/setup.py index d75f1d3..1efc05e 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.6", + "version":"1.4.7", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From b10296246da4d944a723587cf8ed6183239c9cfd Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 10 May 2021 14:33:18 -0500 Subject: [PATCH 166/250] bug fix ... --- data/maker/__init__.py | 9 ++++++--- pipeline.py | 6 +++--- setup.py | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 803590a..4867bf6 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -144,9 +144,12 @@ def generate(**_args): """ partition = _args['partition'] if 'partition' in _args else None if not partition : - f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json'])) + LOG_DIR = os.sep.join([_args['logs'],'output',_args['context']]) + # f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json'])) else: - f = open(os.sep.join([_args['logs'],'output',_args['context'],str(partition),'map.json'])) + LOG_DIR = os.sep.join([_args['logs'],'output',_args['context'],str(partition)]) + # f = open(os.sep.join([_args['logs'],'output',_args['context'],str(partition),'map.json'])) + f = open(os.sep.join([LOG_DIR,'map.json'])) _map = json.loads(f.read()) f.close() # if 'file' in _args : @@ -154,7 +157,7 @@ def generate(**_args): # else: # df = _args['data'] if not isinstance(_args['data'],str) else pd.read_csv(_args['data']) args = {"context":_args['context'],"max_epochs":_args['max_epochs'],"candidates":_args['candidates']} - args['logs'] = _args['logs'] if 'logs' in _args else 'logs' + args['logs'] = LOG_DIR if 'logs' in _args else 'logs' args ['max_epochs'] = _args['max_epochs'] # args['matrix_size'] = _matrix.shape[0] args['batch_size'] = 2000 diff --git a/pipeline.py b/pipeline.py index 2311007..a958bb8 100644 --- a/pipeline.py +++ b/pipeline.py @@ -583,10 +583,10 @@ if __name__ == '__main__' : if 'all-chips' in SYS_ARGS and GPU_CHIPS: index = 0 jobs = [] - for _id in GPU_CHIPS : + for _gpu in GPU_CHIPS : _args = copy.deepcopy(args) _args['gpu'] = [int(_gpu)] - _args['partition'] = index + _args['partition'] = int(_gpu) #index index += 1 make = lambda _params: (Components()).generate(_params) job = Process(target=make,args=( dict(_args),)) @@ -624,7 +624,7 @@ if __name__ == '__main__' : for _gpu in GPU_CHIPS : _args = copy.deepcopy(args) _args['gpu'] = 
[int(_gpu)] - _args['partition'] = index + _args['partition'] = int(_gpu) #index index += 1 make = lambda _params: (Components()).train(**_params) job = Process(target=make,args=( _args,)) diff --git a/setup.py b/setup.py index 1efc05e..1c126f5 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.7", + "version":"1.4.7.1", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From 7de89a576ae0595ccc5eca2264bbc85d8a34afc1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 10 May 2021 14:43:29 -0500 Subject: [PATCH 167/250] bug fix --- data/maker/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 4867bf6..3a4caf6 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -174,7 +174,11 @@ def generate(**_args): args['gpu'] = _args['gpu'] handler = gan.Predict (**args) - handler.load_meta(column=None) + lparams = {'columns':None} + if partition : + lparams['partition'] = partition + + handler.load_meta(lparams) # # Let us now format the matrices by reverting them to a data-frame with values # From 28d919cade137a9c6498a18432de0f3d37f4e8e4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 10 May 2021 14:43:44 -0500 Subject: [PATCH 168/250] bug fix --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1c126f5..9f091c8 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.7.1", + "version":"1.4.7.2", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From 6e45704252be28c5c50aeada6ddeb14bfd9b39ff Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 10 May 2021 14:49:08 -0500 Subject: [PATCH 169/250] bug fixes .... 
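The one-line fix below unpacks the keyword dict instead of passing it positionally; a
minimal illustration of the difference (the partition value is only an example):

    lparams = {'columns': None, 'partition': 0}
    handler.load_meta(lparams)    # single positional dict -> TypeError, load_meta() only accepts **args
    handler.load_meta(**lparams)  # equivalent to load_meta(columns=None, partition=0)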
--- data/maker/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 3a4caf6..8180903 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -178,7 +178,7 @@ def generate(**_args): if partition : lparams['partition'] = partition - handler.load_meta(lparams) + handler.load_meta(**lparams) # # Let us now format the matrices by reverting them to a data-frame with values # diff --git a/setup.py b/setup.py index 9f091c8..d09d66d 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.7.2", + "version":"1.4.7.3", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From d54758aac30467b8534250a7bc58aaafdc3afb9f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 10 May 2021 15:02:55 -0500 Subject: [PATCH 170/250] bug fix ... --- data/maker/__init__.py | 11 ++++++----- pipeline.py | 2 +- setup.py | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 8180903..7439e45 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -144,20 +144,22 @@ def generate(**_args): """ partition = _args['partition'] if 'partition' in _args else None if not partition : - LOG_DIR = os.sep.join([_args['logs'],'output',_args['context']]) + MAP_FLDER = os.sep.join([_args['logs'],'output',_args['context']]) # f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json'])) else: - LOG_DIR = os.sep.join([_args['logs'],'output',_args['context'],str(partition)]) + MAP_FOLDER = os.sep.join([_args['logs'],'output',_args['context'],str(partition)]) # f = open(os.sep.join([_args['logs'],'output',_args['context'],str(partition),'map.json'])) - f = open(os.sep.join([LOG_DIR,'map.json'])) + f = open(os.sep.join([MAP_FOLDER,'map.json'])) _map = json.loads(f.read()) f.close() + # + # # if 'file' in _args : # df = pd.read_csv(_args['file']) # else: # df = _args['data'] if not isinstance(_args['data'],str) else pd.read_csv(_args['data']) args = {"context":_args['context'],"max_epochs":_args['max_epochs'],"candidates":_args['candidates']} - args['logs'] = LOG_DIR if 'logs' in _args else 'logs' + args['logs'] = _args['logs'] if 'logs' in _args else 'logs' args ['max_epochs'] = _args['max_epochs'] # args['matrix_size'] = _matrix.shape[0] args['batch_size'] = 2000 @@ -177,7 +179,6 @@ def generate(**_args): lparams = {'columns':None} if partition : lparams['partition'] = partition - handler.load_meta(**lparams) # # Let us now format the matrices by reverting them to a data-frame with values diff --git a/pipeline.py b/pipeline.py index a958bb8..27f23e6 100644 --- a/pipeline.py +++ b/pipeline.py @@ -244,7 +244,7 @@ class Components : writer = factory.instance(**_args['store']) _df = _args['data'] if _schema : - columns = _df.columns.tolist + columns = [] for _item in _schema : name = _item['name'] _type = str diff --git a/setup.py b/setup.py index d09d66d..7e014c7 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = 
{"name":"data-maker", - "version":"1.4.7.3", + "version":"1.4.7.4", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From 776a1103f294a9941f36dc6ba19191ad49b00f3f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 May 2021 09:33:57 -0500 Subject: [PATCH 171/250] bug fix with dates --- pipeline.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipeline.py b/pipeline.py index 27f23e6..252a850 100644 --- a/pipeline.py +++ b/pipeline.py @@ -252,6 +252,9 @@ class Components : if _item['type'] in ['DATE','TIMESTAMP','DATETIMESTAMP','DATETIME'] : if _item['type'] == 'DATE' : _df[name] = _df[name].dt.date + _df[name] = pd.to_datetime(_df[name],errors='coerce') + + From 79b83c71d5043427c37bd81f3beebc4637fac9eb Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 May 2021 10:14:53 -0500 Subject: [PATCH 172/250] bug fix: date, hack put in place --- pipeline.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pipeline.py b/pipeline.py index 252a850..b6e808f 100644 --- a/pipeline.py +++ b/pipeline.py @@ -251,13 +251,16 @@ class Components : _value = 0 if _item['type'] in ['DATE','TIMESTAMP','DATETIMESTAMP','DATETIME'] : if _item['type'] == 'DATE' : - _df[name] = _df[name].dt.date - _df[name] = pd.to_datetime(_df[name],errors='coerce') - - - - - + # + # There is an issue with missing dates that needs to be resolved. + # for some reason a missing date/time here will cause the types to turn into timestamp (problem) + # The following is a hack to address the issue (alas) assuming 10 digit dates and 'NaT' replaces missing date values (pandas specifications) + # + _df[name] = _df[name].apply(lambda value: '' if str(value) == 'NaT' else str(value)[:10]) + #_df[name] = _df[name].dt.date + # _df[name] = pd.to_datetime(_df[name].fillna(''),errors='coerce') + else: + print ([' ** ',name,_item['type']]) else: if _item['type'] == 'INTEGER' : _type = np.int64 From 14933b877f742fa6628e852fe3ef951d20ab6a2d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 May 2021 10:28:33 -0500 Subject: [PATCH 173/250] bug fix with dates --- pipeline.py | 4 +--- setup.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pipeline.py b/pipeline.py index b6e808f..3644a7e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -259,8 +259,6 @@ class Components : _df[name] = _df[name].apply(lambda value: '' if str(value) == 'NaT' else str(value)[:10]) #_df[name] = _df[name].dt.date # _df[name] = pd.to_datetime(_df[name].fillna(''),errors='coerce') - else: - print ([' ** ',name,_item['type']]) else: if _item['type'] == 'INTEGER' : _type = np.int64 @@ -660,7 +658,7 @@ if __name__ == '__main__' : # We need to harmonize the keys if any at all in this case we do this for shuffle or generate operations # print (['finalize' in SYS_ARGS, ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) ]) - if 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) : + if 'autopilot' in SYS_ARGS or 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) : # # We should pull all the primary keys and regenerate them in order to insure some form of consistency # diff --git a/setup.py b/setup.py index 7e014c7..eb8ea4d 100644 --- a/setup.py +++ b/setup.py @@ -5,7 
+5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.7.4", + "version":"1.4.7.5", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From 4ed0e31aa5c94ea11cf2d6e96e459e7a941cce44 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 May 2021 10:37:06 -0500 Subject: [PATCH 174/250] bug fix ... --- pipeline.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipeline.py b/pipeline.py index 3644a7e..9aad2de 100644 --- a/pipeline.py +++ b/pipeline.py @@ -652,17 +652,17 @@ if __name__ == '__main__' : DIRTY =1 jobs = [job for job in jobs if job.is_alive()] time.sleep(2) - if DIRTY: - print (["..:: jobs finished "]) + # if DIRTY: + # print (["..:: jobs finished "]) # # We need to harmonize the keys if any at all in this case we do this for shuffle or generate operations # - print (['finalize' in SYS_ARGS, ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) ]) + if 'autopilot' in SYS_ARGS or 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) : # # We should pull all the primary keys and regenerate them in order to insure some form of consistency # - + print (["..:: jobs finished "]) (Components()).finalize(args) # finalize(args) pass From 157df9334cff645116c176c95a7063832b690de1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 May 2021 10:37:40 -0500 Subject: [PATCH 175/250] bug fix ... --- pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index 9aad2de..56e522e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -652,8 +652,8 @@ if __name__ == '__main__' : DIRTY =1 jobs = [job for job in jobs if job.is_alive()] time.sleep(2) - # if DIRTY: - # print (["..:: jobs finished "]) + if DIRTY: + print (["..:: jobs finished "]) # # We need to harmonize the keys if any at all in this case we do this for shuffle or generate operations # @@ -662,7 +662,7 @@ if __name__ == '__main__' : # # We should pull all the primary keys and regenerate them in order to insure some form of consistency # - print (["..:: jobs finished "]) + print (["..:: Finalizing process"]) (Components()).finalize(args) # finalize(args) pass From f99af3655d0c8792c34f11a246e437c7d00ae46c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 13 Jan 2022 15:05:00 -0600 Subject: [PATCH 176/250] bug fix: misc. 
improvements --- data/gan.py | 21 ++++++++++++--------- data/maker/__init__.py | 5 ++++- pipeline.py | 18 ++++++++++++++---- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/data/gan.py b/data/gan.py index 643e838..0008489 100644 --- a/data/gan.py +++ b/data/gan.py @@ -20,7 +20,9 @@ EMBEDDED IN CODE : """ import tensorflow as tf -from tensorflow.contrib.layers import l2_regularizer +# from tensorflow.contrib.layers import l2_regularizer +from tensorflow.keras import layers +from tensorflow.keras.regularizers import L2 as l2_regularizer import numpy as np import pandas as pd import time @@ -34,7 +36,7 @@ import pickle os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ['CUDA_VISIBLE_DEVICES'] = "0" os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' - +tf.compat.v1.disable_eager_execution() # STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256 # NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu']) # BATCHSIZE_PER_GPU = 2000 @@ -211,13 +213,14 @@ class GNet : labels = None if 'labels' not in args else args['labels'] n_labels= None if 'n_labels' not in args else args['n_labels'] shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing - mean, var = tf.nn.moments(inputs, shift, keep_dims=True) - shape = inputs.shape[1].value + # mean, var = tf.nn.moments(inputs, shift, keep_dims=True) + mean, var = tf.nn.moments(inputs, shift,keepdims=True) + # shape = inputs.shape[1].value + shape = inputs.shape[1] + if labels is not None: - offset_m = self.get.variables(shape=[1,shape], name='offset'+name, - initializer=tf.zeros_initializer) - scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, - initializer=tf.ones_initializer) + offset_m = self.get.variables(shape=[1,shape], name='offset'+name,initializer=tf.zeros_initializer) + scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name,initializer=tf.ones_initializer) offset = tf.nn.embedding_lookup(offset_m, labels) scale = tf.nn.embedding_lookup(scale_m, labels) @@ -595,7 +598,7 @@ class Predict(GNet): df = pd.DataFrame() CANDIDATE_COUNT = args['candidates'] if 'candidates' in args else 1 #0 if self.ROW_COUNT < 1000 else 100 candidates = [] - + with tf.compat.v1.Session() as sess: saver.restore(sess, model_dir) if self._LABEL is not None : diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 7439e45..9db2b8d 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -106,6 +106,8 @@ def train (**_args): values = _inputhandler._map[key]['values'].tolist() _map[key] = {"beg":beg,"end":end,"values":np.array(values).astype(str).tolist()} info = {"rows":_matrix.shape[0],"cols":_matrix.shape[1],"map":_map} + print() + # print ([_args['context'],_inputhandler._io]) logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":_inputhandler._io}) args['logs'] = _args['logs'] if 'logs' in _args else 'logs' @@ -142,9 +144,10 @@ def generate(**_args): :param context :param logs """ + _args['logs'] = _args['logs'] if 'logs' in _args else 'logs' partition = _args['partition'] if 'partition' in _args else None if not partition : - MAP_FLDER = os.sep.join([_args['logs'],'output',_args['context']]) + MAP_FOLDER = os.sep.join([_args['logs'],'output',_args['context']]) # f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json'])) else: MAP_FOLDER = os.sep.join([_args['logs'],'output',_args['context'],str(partition)]) diff --git a/pipeline.py b/pipeline.py index 56e522e..296d4d5 
100644 --- a/pipeline.py +++ b/pipeline.py @@ -151,6 +151,7 @@ class Components : if df.shape[0] and df.shape[0] : # # We have a full blown matrix to be processed + print ('-- Training --') data.maker.train(**_args) else: print ("... skipping training !!") @@ -259,16 +260,23 @@ class Components : _df[name] = _df[name].apply(lambda value: '' if str(value) == 'NaT' else str(value)[:10]) #_df[name] = _df[name].dt.date # _df[name] = pd.to_datetime(_df[name].fillna(''),errors='coerce') + else: + pass + _df[name] = pd.to_datetime(_df[name]) else: + value = 0 if _item['type'] == 'INTEGER' : _type = np.int64 elif _item['type'] in ['FLOAT','NUMERIC']: _type = np.float64 else: + _value = '' - _df[name] = _df[name].fillna(_value).astype(_type) + _df[name] = _df[name].fillna(_value) #.astype(_type) columns.append(name) - writer.write(_df,schema=_schema,table=args['from']) + print () + print (_df) + writer.write(_df.astype(object),schema=_schema,table=args['from']) else: writer.write(_df,table=args['from']) @@ -350,7 +358,7 @@ class Components : for _item in schema : dtype = str name = _item['name'] - novalue = -1 + novalue = 0 if _item['type'] in ['INTEGER','NUMERIC']: dtype = np.int64 @@ -550,7 +558,7 @@ if __name__ == '__main__' : index = f[0] if f else 0 # - print ("..::: ",PIPELINE[index]['context']) + print ("..::: ",PIPELINE[index]['context'],':::..') args = (PIPELINE[index]) for key in _config : if key == 'pipeline' or key in args: @@ -567,6 +575,7 @@ if __name__ == '__main__' : args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size']) if 'dataset' not in args : args['dataset'] = 'combined20191004v2_deid' + args['logs'] = args['logs'] if 'logs' in args else 'logs' PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 # # @TODO: @@ -599,6 +608,7 @@ if __name__ == '__main__' : jobs.append(job) pass else: + generator = Components() generator.generate(args) elif 'shuffle' in SYS_ARGS : From cad54d7b45d08b8d4749a736ed9fe6ef6762949e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 13 Jan 2022 17:36:53 -0600 Subject: [PATCH 177/250] version upgrade --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index eb8ea4d..c43bd15 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.7.5", + "version":"1.4.7.6", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From ee0165de0188faba09c55e518fca6c2e5761f287 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 24 Mar 2022 11:38:52 -0500 Subject: [PATCH 178/250] bug fixes: enhancements --- binder.py | 377 +++++++++++++++++++++++++++++++++ data/gan.py | 54 +---- data/maker/__init__.py | 6 +- data/maker/prepare/__init__.py | 58 +---- pipeline.py | 330 ++++++++++++++--------------- 5 files changed, 543 insertions(+), 282 deletions(-) create mode 100644 binder.py diff --git a/binder.py b/binder.py new file mode 100644 index 0000000..5379d62 --- /dev/null +++ b/binder.py @@ -0,0 +1,377 @@ +#!/usr/bin/env python3 +""" +This file will perform basic tasks to finalize the GAN process by performing the following : + - basic stats & analytics + - 
rebuild io to another dataset +""" +import pandas as pd +import numpy as np +from multiprocessing import Process, Lock +from google.oauth2 import service_account +from google.cloud import bigquery as bq +import transport +from data.params import SYS_ARGS +import json + +import pandas as pd +import numpy as np +from google.oauth2 import service_account +import json + +# path = '../curation-prod.json' +# credentials = service_account.Credentials.from_service_account_file(path) +# df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard') +filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config'] +f = open(filename) +config = json.loads(f.read()) +args = config['pipeline'] +f.close() + +def _formatSQL(**_args): + """ + This function will build the _map for a given segment + """ + sql = """ + select DISTINCT x.person_id synthetic,y.person_id original + FROM :synthetic.:table x + INNER JOIN :original.:table y on x.person_id in (:ids) + AND x.person_id <> y.person_id AND x.gender_source_value = y.gender_source_value + AND x.year_of_birth = y.year_of_birth + ORDER BY 1 + """ + table= _args['table'] + original,synthetic = _args['schema']['original'],_args['schema']['synthetic'] + _ids = np.array(_args['ids']).astype(str) + return sql.replace(":ids",",".join(_ids)).replace(":synthetic",synthetic).replace(":original",original).replace(":table",table) +def _addCounts(**_args) : + store = _args['store'] + sql = _args['sql'] + reader = transport.factory.instance(**store['source']) + _df = reader.read(sql=sql) + _ids = _df.synthetic.unique() + _counts = [ np.sum(_df.synthetic == value) for value in _ids] + original = [_df[_df.synthetic == value].iloc[np.random.choice(np.arange(_counts[_ids.tolist().index(value)]),1),:].original.values[0] for value in _ids] + _df = pd.DataFrame({"synthetic":_ids,"original":original,"counts":_counts}) + + # + # We can post this to the backend ... + # + table = '_map' #-- Yes this is hard-coded + writer = transport.factory.instance(**dict(store['target'],**{"parallel":True,"table":table})) + # if writer.has(table=table) is False: + # writer.write(_df) + # else: + _schema = [{"name":name,"type":"INTEGER"} for name in _df.columns] + writer.write(_df,schema=_schema) + + + + + +def Init(**_args) : + """ + This function will build a map of the synthetic to real individuals. + The assumption is that the synthesized data is stored in the same data-store as the original the parameters provided are : + :param store object from the configuration file with source,target entries + :param table name of the original/synthetic tables (they should be the same) + :param feat. featuress/attributes ... 
demographics to account for + """ + store = _args['store'] + reader = transport.factory.instance(**store['source']) + original,synthetic = _args['schema']['original'],_args['schema']['synthetic'] + table = _args['table'] + sql = _args['sql'].replace(':synthetic',synthetic).replace(':original',original).replace(':table',table) + + _map = reader.read(sql=sql) + + + + k = _args['k'] if 'k' in _args else 2 + # _iodf = reader.read(table=table) + # _ids = _iodf['person_id'].unique().tolist() + # x_ = np.array_split(_ids,1000) + jobs = [] + # for _items in x_ : + # _p = {"ids":_items,"schema":_args['schema'],'store':store,'table':table} + # sql = _formatSQL(**_p) + # _p['sql'] = sql + # _apply = lambda params: _addCounts(**params) + # thread = Process(target=_apply,args=(_p,)) + # thread.start() + # jobs.append(thread) + + # return jobs + # + # We have performed a m:m (many-to-many) relationship with original participants and synthetic participants + # The goal is to obtain a singular map against which records will be migrated + # + print (['... computing counts (k)']) + _ids = _map.synthetic.unique() + _counts = [ np.sum(_map.synthetic == value) for value in _ids] + original = [_map[_map.synthetic == value].iloc[np.random.choice(np.arange(_counts[_ids.tolist().index(value)]),1),:].original.values[0] for value in _ids] + print (['Building k-classes/groups']) + _mdf = pd.DataFrame({"synthetic":_ids,"original":original,"counts":_counts}) + i = _mdf.apply(lambda row: row.counts >= k,axis=1) + _mdf = _mdf[i] + # + # Log what just happened here so we know about the equivalence classes, + # {"module":"binder","action":"map-generation","input":{"k":k,"rows":{"synthetic":_mdf.shape[0],"original":len(_counts)}}} + + return _mdf + # + # now we are posting this to target storage ... 
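# Toy illustration (made-up person ids) of the k filter above, with k=2:
#
#   synthetic 1001 matched originals {17, 42, 63} -> counts=3, kept; one of the three
#                                                    is drawn at random as its 'original'
#   synthetic 1002 matched only original {85}     -> counts=1, dropped from _mdf
#
# Every synthetic person that survives therefore maps to one original drawn from an
# equivalence class of at least k candidates (matched on the demographics in the
# configured SQL, e.g. gender and year of birth as in _formatSQL above).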
+ # +def ApplyOn (**_args): + """ + This function will rewrite SQL that applies the synthetic identifier to the entries of the pipeline + We assume that the _map has two attributes (synthetic and original) + :param store + :param _config + """ + store_args = _args['store'] + _config = _args['config'] + + table = _config['from'] + reader = transport.factory.instance(**dict(store_args['source'],**{"table":table})) + attr = reader.read(limit=1).columns.tolist() + original_key = _args['original_key'] #-- assuming referential integrity + + # synthetic_key= columns['synthetic'] + # mapped_original=columns['orginal'] + fields = list(set(attr) - set([original_key])) + sql = "select _map.synthetic as :original_key,:fields from :original_schema.:table inner join :synthetic_schema._map on _map.original = :table.:original_key" + sql = sql.replace(":table",table).replace(":fields",",".join(fields)) + sql = sql.replace(":original_key",original_key) + _schema = _args['schema'] + sql = sql.replace(":original_schema",_schema['original']).replace(":synthetic_schema",_schema['synthetic']) + + return reader.read (sql=sql) + +if __name__ == '__main__' : + pass + +# class Analytics : +# """ +# This class will compile basic analytics about a given dataset i.e compare original/synthetic +# """ +# @staticmethod +# def distribution(**args): +# context = args['context'] +# df = args['data'] +# # +# #-- This data frame counts unique values for each feature (space) +# df_counts = pd.DataFrame(df.apply(lambda col: col.unique().size),columns=['counts']).T # unique counts +# # +# #-- Get the distributions for common values +# # +# names = [name for name in df_counts.columns.tolist() if name.endswith('_io') == False] +# ddf = df.apply(lambda col: pd.DataFrame(col.values,columns=[col.name]).groupby([col.name]).size() ).fillna(0) +# ddf[context] = ddf.index + +# pass +# def distance(**args): +# """ +# This function will measure the distance between +# """ +# pass +# class Utils : +# @staticmethod +# def log(**args): +# logger = transport.factory.instance(type="mongo.MongoWriter",args={"dbname":"aou","doc":"logs"}) +# logger.write(args) +# logger.close() +# class get : +# @staticmethod +# def pipeline(table,path) : +# # contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts'] +# config = json.loads((open(path)).read()) +# pipeline = config['pipeline'] +# # return [ item for item in pipeline if item['context'] in contexts] +# pipeline = [item for item in pipeline if 'from' in item and item['from'].strip() == table] +# Utils.log(module=table,action='init',input={"pipeline":pipeline}) +# return pipeline +# @staticmethod +# def sql(**args) : +# """ +# This function is intended to build SQL query for the remainder of the table that was not synthesized +# :config configuration entries +# :from source of the table name +# :dataset name of the source dataset + +# """ +# SQL = ["SELECT * FROM :from "] +# SQL_FILTER = [] +# NO_FILTERS_FOUND = True +# # pipeline = Utils.get.config(**args) +# pipeline = args['pipeline'] +# REVERSE_QUALIFIER = {'IN':'NOT IN','NOT IN':'IN','=':'<>','<>':'='} +# for item in pipeline : + + +# if 'filter' in item : +# if NO_FILTERS_FOUND : +# NO_FILTERS_FOUND = False +# SQL += ['WHERE'] +# # +# # Let us load the filter in the SQL Query +# FILTER = item['filter'] +# QUALIFIER = REVERSE_QUALIFIER[FILTER['qualifier'].upper()] +# SQL_FILTER += [" ".join([FILTER['field'], QUALIFIER,'(',FILTER['value'],')']).replace(":dataset",args['dataset'])] +# src = 
".".join([args['dataset'],args['from']]) +# SQL += [" AND ".join(SQL_FILTER)] +# # +# # let's pull the field schemas out of the table definition +# # +# Utils.log(module=args['from'],action='sql',input={"sql":" ".join(SQL) }) +# return " ".join(SQL).replace(":from",src) + + +# def mk(**args) : +# dataset = args['dataset'] +# client = args['client'] if 'client' in args else bq.Client.from_service_account_file(args['private_key']) +# # +# # let us see if we have a dataset handy here +# # +# datasets = list(client.list_datasets()) +# found = [item for item in datasets if item.dataset_id == dataset] + +# if not found : + +# return client.create_dataset(dataset) +# return found[0] + +# def move (args): +# """ +# This function will move a table from the synthetic dataset into a designated location +# This is the simplest case for finalizing a synthetic data set +# :private_key +# """ +# pipeline = Utils.get.pipeline(args['from'],args['config']) +# _args = json.loads((open(args['config'])).read()) +# _args['pipeline'] = pipeline +# # del _args['pipeline'] +# args = dict(args,**_args) +# # del args['pipeline'] +# # private_key = args['private_key'] +# client = bq.Client.from_service_account_json(args['private_key']) + +# dataset = args['dataset'] +# if pipeline : +# SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in pipeline] +# SQL += [Utils.get.sql(**args)] +# SQL = ('\n UNION ALL \n'.join(SQL).replace(':dataset','io')) +# else: +# # +# # moving a table to a designated location +# tablename = args['from'] +# if 'sql' not in args : +# SQL = "SELECT * FROM :dataset.:table" +# else: +# SQL = args['sql'] +# SQL = SQL.replace(":dataset",dataset).replace(":table",tablename) +# Utils.log(module=args['from'],action='sql',input={'sql':SQL}) +# # +# # At this point we have gathered all the tables in the io folder and we should now see if we need to merge with the remainder from the original table +# # + + + +# odataset = mk(dataset=dataset+'_io',client=client) +# # SQL = "SELECT * FROM io.:context_full_io".replace(':context',context) +# config = bq.QueryJobConfig() +# config.destination = client.dataset(odataset.dataset_id).table(args['from']) +# config.use_query_cache = True +# config.allow_large_results = True +# config.priority = 'INTERACTIVE' +# # +# # + +# schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema +# fields = [" ".join(["CAST (",item.name,"AS",item.field_type.replace("INTEGER","INT64").replace("FLOAT","FLOAT64"),") ",item.name]) for item in schema] +# SQL = SQL.replace("*"," , ".join(fields)) +# # print (SQL) +# out = client.query(SQL,location='US',job_config=config) +# Utils.log(module=args['from'],action='move',input={'job':out.job_id}) +# return (out.job_id) + + + + +# import pandas as pd +# import numpy as np +# from google.oauth2 import service_account +# import json + +# # path = '../curation-prod.json' +# # credentials = service_account.Credentials.from_service_account_file(path) +# # df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard') +# filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config'] +# f = open(filename) +# config = json.loads(f.read()) +# args = config['pipeline'] +# f.close() + + +# if __name__ == '__main__' : +# """ +# Usage : +# finalize -- --contexts --from
+# """ + +# if 'move' in SYS_ARGS : + +# if 'init' in SYS_ARGS : +# dep = config['dep'] if 'dep' in config else {} +# info = [] + +# if 'queries' in dep : +# info += dep['queries'] +# print ('________') +# if 'tables' in dep : +# info += dep['tables'] +# args = {} +# jobs = [] +# for item in info : +# args = {} +# if type(item) == str : +# args['from'] = item +# name = item +# else: +# args = item +# name = item['from'] +# args['config'] = SYS_ARGS['config'] +# # args['pipeline'] = [] +# job = Process(target=move,args=(args,)) +# job.name = name +# jobs.append(job) +# job.start() + + +# # while len(jobs) > 0 : +# # jobs = [job for job in jobs if job.is_alive()] +# # time.sleep(1) + + +# else: +# move(SYS_ARGS) +# # # table = SYS_ARGS['from'] +# # # args = dict(config,**{"private_key":"../curation-prod.json"}) +# # args = dict(args,**SYS_ARGS) +# # contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']] +# # log = [] +# # if contexts : +# # args['contexts'] = contexts +# # log = move(**args) + +# # else: +# # tables = args['from'].split(',') +# # for name in tables : +# # name = name.strip() +# # args['from'] = name +# # log += [move(**args)] +# # print ("\n".join(log)) + + + +# else: +# print ("NOT YET READY !") \ No newline at end of file diff --git a/data/gan.py b/data/gan.py index 0008489..f5705ea 100644 --- a/data/gan.py +++ b/data/gan.py @@ -622,7 +622,7 @@ class Predict(GNet): candidates.append(np.array([np.round(row).astype(int) for row in _matrix])) # return candidates[0] if len(candidates) == 1 else candidates - return candidates + return [candidates [0]] def _apply(self,**args): # print (self.train_dir) @@ -768,55 +768,3 @@ class Predict(GNet): # return df.to_dict(orient='list') return _matrix - -if __name__ == '__main__' : - # - # Now we get things done ... 
- column = SYS_ARGS['column'] - column_id = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id' - column_id = column_id.split(',') if ',' in column_id else column_id - df = pd.read_csv(SYS_ARGS['raw-data']) - LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - - context = SYS_ARGS['raw-data'].split(os.sep)[-1:][0][:-4] - if set(['train','learn']) & set(SYS_ARGS.keys()): - - df = pd.read_csv(SYS_ARGS['raw-data']) - - # cols = SYS_ARGS['column'] - # _map,_df = (Binary()).Export(df) - # i = np.arange(_map[column]['start'],_map[column]['end']) - max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10 - # REAL = _df[:,i] - REAL = pd.get_dummies(df[column]).astype(np.float32).values - LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - trainer = Train(context=context,max_epochs=max_epochs,real=REAL,label=LABEL,column=column,column_id=column_id) - trainer.apply() - - - - - # - # We should train upon this data - # - # -- we need to convert the data-frame to binary matrix, given a column - # - pass - elif 'generate' in SYS_ARGS: - values = df[column].unique().tolist() - values.sort() - - p = Predict(context=context,label=LABEL,values=values,column=column) - p.load_meta(column) - r = p.apply() - # print (df) - # print () - df[column] = r[column] - # print (df) - - - else: - print (SYS_ARGS.keys()) - print (__doc__) - pass - diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 9db2b8d..a7d8d69 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -96,7 +96,11 @@ def train (**_args): # This args['store'] = copy.deepcopy(_args['store']['logs']) - args['store']['args']['doc'] = _args['context'] + if 'args' in _args['store']: + args['store']['args']['doc'] = _args['context'] + else: + + args['store']['doc'] = _args['context'] logger = factory.instance(**args['store']) args['logger'] = logger diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 5ace56a..6e67cb2 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -39,26 +39,10 @@ class Input : - provide a feature space, and rows (matrix profile) - a data index map """ - # def learn(self,**_args): - # """ - # This function is designed to learn about, the data and persist - # :param table - # :param store - # """ - # table = _args['table'] - # reader = transport.factory.instance(**_args['store']) - # df = reader.read(table=table,limit=1) - # self.columns = df.columns.tolist() - - # self._metadf = pd.DataFrame(self.df[self._columns].dtypes.values.astype(str)).T #,self._columns] - # self._metadf.columns = self._columns - - # sql = "SELECT :fields from :table".replace(":table",table) - def __init__(self,**_args): """ - :param table + :param data :param store data-store parameters/configuration :param sql sql query that pulls a representative sample of the data """ @@ -70,29 +54,18 @@ class Input : pass else: self._initsql(**_args) + # + # We need to have a means to map of values,columns and vector positions in order + # to perform convert and revert to and from binary + # self._map = {} if 'map' not in _args else _args['map'] - # self._metadf = pd.DataFrame(self.df[self._columns].dtypes.values.astype(str)).T #,self._columns] - # self._metadf.columns = self._columns - # if 'gpu' in _args and 'GPU' in os.environ: - - # np = cp - # index = int(_args['gpu']) - # np.cuda.Device(index).use() - # print(['..:: GPU ',index]) def _initsql(self,**_args): """ This function will initialize the class on the basis of a data-store and 
optionally pre-defined columns to be used to be synthesized :param store data-store configuration - :param sql sql query to be applied to the transported data :param columns list of columns to be """ - # _store_args = _args['store'] - # reader = transport.factory.instance(**_store_args) - # sql = _args['sql'] - - # self.df = reader.read(sql=_args['sql']) - if 'columns' not in _args : self._initcols(data=self.df) @@ -128,14 +101,6 @@ class Input : :param data data-frame that holds the data :param columns columns that need to be synthesized if any """ - # - # setting class-level variables to be reused across the class - # self.df = _args['data'] - row_count = self.df.shape[0] - # self.columns = self.df.columns - # self._metadf = self.df.apply(lambda col: col.unique().size) - # _df = pd.DataFrame(self.df.apply(lambda col: col.unique().size )).T - # cols = None if 'columns' not in _args else _args['columns'] self._initcols(**_args) def convert(self,**_args): @@ -247,16 +212,3 @@ class Input : return cols,_matrix -if __name__ == '__main__' : - df = pd.read_csv('../../sample.csv') - _input = Input(data=df,columns=['age','race']) - _m = _input.convert(column='age') - print (_m.shape) - print (_input.revert(matrix=_m,column='age')) - print (_input._metadf) - -# _args = {"store":{"type":"sql.BQReader","args":{"service_key":"/home/steve/dev/aou/accounts/curation-prod.json"}}} -# _args['table'] = 'io.observation' -# _i = Input(**_args) -# df = pd.read_csv('../../sample.csv') -# print (Input.ToBinary(df.age)) \ No newline at end of file diff --git a/pipeline.py b/pipeline.py index 296d4d5..5fb62fe 100644 --- a/pipeline.py +++ b/pipeline.py @@ -101,11 +101,14 @@ class Components : df = pd.read_csv(args['file']) del args['file'] elif 'data' not in args : + reader = factory.instance(**args['store']['source']) + + if 'row_limit' in args : df = reader.read(sql=args['sql'],limit=args['row_limit']) else: - df = reader.read(sql=args['sql']) + df = reader.read(sql=args['sql']) schema = reader.meta(table=args['from']) if hasattr(reader,'meta') and 'from' in args else None else: df = args['data'] @@ -241,6 +244,7 @@ class Components : df.index = np.arange(df.shape[0]) self.post(data=df,schema=schema,store=args['store']['target']) def post(self,**_args) : + table = _args['from'] if 'from' in _args else _args['store']['table'] _schema = _args['schema'] if 'schema' in _args else None writer = factory.instance(**_args['store']) _df = _args['data'] @@ -251,13 +255,13 @@ class Components : _type = str _value = 0 if _item['type'] in ['DATE','TIMESTAMP','DATETIMESTAMP','DATETIME'] : - if _item['type'] == 'DATE' : + if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : # # There is an issue with missing dates that needs to be resolved. 
# for some reason a missing date/time here will cause the types to turn into timestamp (problem) # The following is a hack to address the issue (alas) assuming 10 digit dates and 'NaT' replaces missing date values (pandas specifications) # - _df[name] = _df[name].apply(lambda value: '' if str(value) == 'NaT' else str(value)[:10]) + _df[name] = _df[name].apply(lambda value: None if str(value) == 'NaT' else (str(value)[:10]) if _item['type'] in ['DATE','DATETIME'] else str(value)) #_df[name] = _df[name].dt.date # _df[name] = pd.to_datetime(_df[name].fillna(''),errors='coerce') else: @@ -274,11 +278,33 @@ class Components : _value = '' _df[name] = _df[name].fillna(_value) #.astype(_type) columns.append(name) - print () - print (_df) - writer.write(_df.astype(object),schema=_schema,table=args['from']) + + fields = _df.columns.tolist() + if not writer.has(table=table) and _args['store']['provider'] != 'bigquery': + + _map = {'STRING':'VARCHAR(256)','INTEGER':'BIGINT'} if 'provider' in _args['store'] and _args['store']['provider'] != 'bigquery' else {} + _params = {'map':_map,'table':args['from']} + if _schema : + _params['schema'] = _schema + + else: + _params['fields'] = fields + + writer.make(**_params) + + fields = _df.columns.tolist() + _df = _df[fields] + # writer.fields = fields + if _args['store']['provider'] == 'bigquery' : + print (['_______ POSTING ______________ ',table]) + print (['_______________ ',_df.shape[0],' ___________________']) + writer.write(_df.astype(object),schema=_schema,table=table) else: - writer.write(_df,table=args['from']) + writer.table = table + writer.write(_df) + # else: + # writer.write(_df,table=args['from']) + def finalize(self,args): """ @@ -288,8 +314,9 @@ class Components : """ reader = factory.instance(**args['store']['source']) logger = factory.instance(**args['store']['logs']) - target = args['store']['target']['args']['dataset'] - source = args['store']['source']['args']['dataset'] + + target = args['store']['target']['args']['dataset'] + source = args['store']['source']['args']['dataset'] table = args['from'] schema = reader.meta(table=args['from']) # @@ -327,7 +354,10 @@ class Components : This function will generate data and store it to a given, """ store = args['store']['logs'] - store['args']['doc'] = args['context'] + if 'args' in store : + store['args']['doc'] = args['context'] + else: + store['doc'] = args['context'] logger = factory.instance(**store) #type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) ostore = args['store']['target'] @@ -348,13 +378,13 @@ class Components : schema = reader.meta(table=args['from']) schema = [{"name":_item.name,"type":_item.field_type} for _item in schema] - # else: # # # # This will account for autopilot mode ... 
# df = args['data'] _cast = {} if schema : + for _item in schema : dtype = str name = _item['name'] @@ -405,139 +435,72 @@ class Components : logger.write(_info) if args['data'].shape[0] > 0 and args['data'].shape[1] > 0 : candidates = (data.maker.generate(**args)) + else: candidates = [df] - if 'sql.BQWriter' in ostore['type'] : - #table = ".".join([ostore['['dataset'],args['context']]) - # writer = factory.instance(**ostore) - _columns = None - skip_columns = [] - _schema = schema - if schema : - cols = [_item['name'] for _item in _schema] - else: - cols = df.columns - for _df in candidates : - # - # we need to format the fields here to make sure we have something cohesive - # + + # if 'sql.BQWriter' in ostore['type'] : + _columns = None + skip_columns = [] + _schema = schema + if schema : + cols = [_item['name'] for _item in _schema] + else: + cols = df.columns.tolist() + _info = {"module":"gan-prep","action":"selection","input":{"candidates":len(candidates),"features":cols}} + logger.write(_info) + for _df in candidates : + # + # we need to format the fields here to make sure we have something cohesive + # - if not skip_columns : - # _columns = set(df.columns) - set(_df.columns) - if 'ignore' in args and 'columns' in args['ignore'] : - skip_columns = self.get_ignore(data=_df,columns=args['ignore']['columns']) - # for name in args['ignore']['columns'] : - # for _name in _df.columns: - # if _name in name: - # skip_columns.append(_name) - # - # We perform a series of set operations to insure that the following conditions are met: - # - the synthetic dataset only has fields that need to be synthesized - # - The original dataset has all the fields except those that need to be synthesized - # - - _df = _df[list(set(_df.columns) - set(skip_columns))].copy() - if x_cols : - _approx = {} - for _col in x_cols : - if real_df[_col].unique().size > 0 : - + if not skip_columns : + if 'ignore' in args and 'columns' in args['ignore'] : + skip_columns = self.get_ignore(data=_df,columns=args['ignore']['columns']) + # + # We perform a series of set operations to insure that the following conditions are met: + # - the synthetic dataset only has fields that need to be synthesized + # - The original dataset has all the fields except those that need to be synthesized + # + + _df = _df[list(set(_df.columns) - set(skip_columns))].copy() + if x_cols : + _approx = {} + for _col in x_cols : + if real_df[_col].unique().size > 0 : + - _df[_col] = self.approximate(real_df[_col].values) - _approx[_col] = { - "io":{"min":_df[_col].min().astype(float),"max":_df[_col].max().astype(float),"mean":_df[_col].mean().astype(float),"sd":_df[_col].values.std().astype(float),"missing": _df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":_df[_col].where(_df[_col] == 0).dropna().count().astype(float)}, - "real":{"min":real_df[_col].min().astype(float),"max":real_df[_col].max().astype(float),"mean":real_df[_col].mean().astype(float),"sd":real_df[_col].values.std().astype(float),"missing": real_df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count().astype(float)} - } - else: - _df[_col] = -1 - logger.write({"module":"gan-generate","action":"approximate","status":_approx}) - if set(df.columns) & set(_df.columns) : - _columns = set(df.columns) - set(_df.columns) - df = df[_columns] + _df[_col] = self.approximate(real_df[_col].values) + _approx[_col] = { + 
"io":{"min":_df[_col].min().astype(float),"max":_df[_col].max().astype(float),"mean":_df[_col].mean().astype(float),"sd":_df[_col].values.std().astype(float),"missing": _df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":_df[_col].where(_df[_col] == 0).dropna().count().astype(float)}, + "real":{"min":real_df[_col].min().astype(float),"max":real_df[_col].max().astype(float),"mean":real_df[_col].mean().astype(float),"sd":real_df[_col].values.std().astype(float),"missing": real_df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count().astype(float)} + } + else: + _df[_col] = -1 + logger.write({"module":"gan-generate","action":"approximate","status":_approx}) + if set(df.columns) & set(_df.columns) : + _columns = list(set(df.columns) - set(_df.columns)) + df = df[_columns] - # - # Let us merge the dataset here and and have a comprehensive dataset + # + # Let us merge the dataset here and and have a comprehensive dataset - _df = pd.DataFrame.join(df,_df) - - # if _schema : - # for _item in _schema : - # if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : - # _df[_item['name']] = _df[_item['name']].astype(str) - - # pass - _params = {'data':_df,'store' : ostore} - if _schema : - _params ['schema'] = _schema - self.post(**_params) - # if _schema : - # writer.write(_df[cols],schema=_schema,table=args['from']) - # self.post(data=_df,schema=) - # else: - # writer.write(_df[cols],table=args['from']) + _df = pd.DataFrame.join(df,_df) + _params = {'data':_df,'store' : ostore} + if _schema : + _params ['schema'] = _schema + _info = {"module":"gan-prep","action":"write","input":{"rows":_df.shape[0],"cols":_df.shape[1]}} + logger.write(_info) + self.post(**_params) + # print (['_______ posting _________________',_df.shape]) + break + pass # else: # pass - - - # # - # # We need to post the generate the data in order to : - # # 1. compare immediately - # # 2. 
synthetic copy - # # - - # cols = _dc.columns.tolist() - - # data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) - # # - # # performing basic analytics on the synthetic data generated (easy to quickly asses) - # # - # info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} - - # # - # # @TODO: Send data over to a process for analytics - - # base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) - # cols = _dc.columns.tolist() - # for name in cols : - # _args['data'][name] = _dc[name] - - # # - # #-- Let us store all of this into bigquery - # prefix = args['notify']+'.'+_args['context'] - # partition = str(partition) - # table = '_'.join([prefix,partition,'io']).replace('__','_') - # folder = os.sep.join([args['logs'],args['context'],partition,'output']) - # if 'file' in args : - - # _fname = os.sep.join([folder,table.replace('_io','_full_io.csv')]) - # _pname = os.sep.join([folder,table])+'.csv' - # data_comp.to_csv( _pname,index=False) - # _args['data'].to_csv(_fname,index=False) - - # _id = 'path' - # else: - - # credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - # _pname = os.sep.join([folder,table+'.csv']) - # _fname = table.replace('_io','_full_io') - # partial = '.'.join(['io',args['context']+'_partial_io']) - # complete= '.'.join(['io',args['context']+'_full_io']) - # data_comp.to_csv(_pname,index=False) - # if 'dump' in args : - # print (_args['data'].head()) - # else: - # Components.lock.acquire() - # data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) - # _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) - # Components.lock.release() - # _id = 'dataset' - # info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } - # if partition : - # info ['partition'] = int(partition) - # logger.write({"module":"generate","action":"write","input":info} ) - + def bind(self,**_args): + print (_args) if __name__ == '__main__' : @@ -611,6 +574,50 @@ if __name__ == '__main__' : generator = Components() generator.generate(args) + elif 'bind' in SYS_ARGS : + import binder + _args = _config['_map'] + _args['store'] = copy.deepcopy(_config['store']) + if 'init' in SYS_ARGS : + # + # Creating and persisting the map ... + print (['.... 
Binding Initialization']) + # jobs = binder.Init(**_args) + _mapped = binder.Init(**_args) + + + _schema = [{"name":_name,"type":"INTEGER"} for _name in _mapped.columns.tolist()] + publisher = lambda _params: (Components()).post(**_params) + _args = {'data':_mapped,'store':_config['store']['target']} + _args['store']['table'] = '_map' + if _args['store']['provider'] =='bigquery' : + _args['schema'] = _schema + + job = Process (target = publisher,args=(_args,)) + job.start() + jobs = [job] + else: + # + # Applying the map of k on a particular dataset + # + index = int(SYS_ARGS['index']) + _args['config'] = _config['pipeline'][index] + _args['original_key'] = 'person_id' if 'original_key' in _config else 'person_id' + table = _config['pipeline'][index]['from'] + _df = binder.ApplyOn(**_args) + _df = np.array_split(_df,PART_SIZE) + jobs = [] + print (['Publishing ',PART_SIZE,' PARTITION']) + for data in _df : + publisher = lambda _params: ( Components() ).post(**_params) + _args = {'data':data,'store':_config['store']['target']} + _args['store']['table'] = table + print (_args['store']) + job = Process(target = publisher,args=(_args,)) + job.name = "Publisher "+str(len(jobs)+1) + job.start() + jobs.append(job) + elif 'shuffle' in SYS_ARGS : index = 0 if GPU_CHIPS and 'all-chips' in SYS_ARGS: @@ -632,6 +639,7 @@ if __name__ == '__main__' : # Let us create n-jobs across n-gpus, The assumption here is the data that is produced will be a partition # @TODO: Find better name for partition # + if GPU_CHIPS and 'all-chips' in SYS_ARGS: index = 0 print (['... launching ',len(GPU_CHIPS),' jobs',args['context']]) @@ -652,12 +660,15 @@ if __name__ == '__main__' : else: # # The choice of the chip will be made internally + agent = Components() agent.train(**args) # # If we have any obs we should wait till they finish # DIRTY = 0 + if (len(jobs)) : + print (['.... 
waiting on ',len(jobs),' jobs']) while len(jobs)> 0 : DIRTY =1 jobs = [job for job in jobs if job.is_alive()] @@ -666,47 +677,16 @@ if __name__ == '__main__' : print (["..:: jobs finished "]) # # We need to harmonize the keys if any at all in this case we do this for shuffle or generate operations - # - - if 'autopilot' in SYS_ARGS or 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) : - # - # We should pull all the primary keys and regenerate them in order to insure some form of consistency - # - print (["..:: Finalizing process"]) - (Components()).finalize(args) - # finalize(args) - pass - # jobs = [] - # for index in range(0,PART_SIZE) : - # if 'focus' in args and int(args['focus']) != index : - # continue - # args['part_size'] = PART_SIZE - # args['partition'] = index - # args['data'] = DATA[index] - # if int(args['num_gpu']) > 1 : - # args['gpu'] = index - # else: - # args['gpu']=0 + # This holds true for bigquery - bigquery only + IS_BIGQUERY = _config['store']['source']['provider'] == _config['store']['target']['provider'] and _config['store']['source']['provider'] == 'bigquery' - # make = lambda _args: (Components()).train(**_args) - # job = Process(target=make,args=( dict(args),)) - # job.name = 'Trainer # ' + str(index) - # job.start() - # jobs.append(job) - # # args['gpu'] - # print (["Started ",len(jobs),"trainers" if len(jobs)>1 else "trainer" ]) - # while len(jobs)> 0 : - # jobs = [job for job in jobs if job.is_alive()] - # time.sleep(2) + # if 'bind' not in SYS_ARGS and IS_BIGQUERY and ('autopilot' in SYS_ARGS or 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS)) : + # # + # # We should pull all the primary keys and regenerate them in order to insure some form of consistency + # # - # trainer = Components() - # trainer.train(**args) - + # # + # # - # Components.train(**args) -#for args in PIPELINE : - #args['dataset'] = 'combined20190510' - #process = Process(target=Components.train,args=(args,)) - #process.name = args['context'] - #process.start() -# Components.train(args) + # print (["..:: Finalizing process"]) + # (Components()).finalize(args) From 964ddb06abec16de023a05d754837b1410bb80f1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 24 Mar 2022 11:47:02 -0500 Subject: [PATCH 179/250] version increment --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index c43bd15..d3f0d4b 100644 --- a/setup.py +++ b/setup.py @@ -5,10 +5,10 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.7.6", + "version":"1.4.7.8", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} -args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] +args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' if sys.version_info[0] == 2 : From 0384a2e96f40d98bb28aae0b723e2cec865fe9cd Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 11 Apr 2022 18:33:07 -0500 Subject: [PATCH 180/250] bug fixes and simplified interface --- data/maker/__init__.py | 177 +++++++++++++++++++++++- data/maker/prepare/__init__.py | 4 
+- finalize.py | 240 --------------------------------- pipeline.py | 2 +- 4 files changed, 177 insertions(+), 246 deletions(-) delete mode 100644 finalize.py diff --git a/data/maker/__init__.py b/data/maker/__init__.py index a7d8d69..bf388a6 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -11,13 +11,15 @@ This package is designed to generate synthetic data from a dataset from an origi import pandas as pd import numpy as np import data.gan as gan -from transport import factory +import transport from data.bridge import Binary import threading as thread from data.maker import prepare import copy import os import json +from multiprocessing import Process, RLock + class ContinuousToDiscrete : ROUND_UP = 2 @@ -101,7 +103,7 @@ def train (**_args): else: args['store']['doc'] = _args['context'] - logger = factory.instance(**args['store']) + logger = transport.factory.instance(**args['store']) args['logger'] = logger for key in _inputhandler._map : @@ -193,4 +195,173 @@ def generate(**_args): candidates = handler.apply(candidates=args['candidates']) return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates] - + +class Learner(Process): + def __init__(self,**_args): + + + super(Learner, self).__init__() + if 'gpu' in _args : + print (_args['gpu']) + os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu']) + self.gpu = int(_args['gpu']) + else: + self.gpu = None + self.info = _args['info'] + self.columns = self.info['columns'] if 'columns' in self.info else None + self.store = _args['store'] + if 'network_args' not in _args : + self.network_args ={ + 'context':_args['context'] if 'context' in _args else 'GENERAL', + 'logs':_args['logpath'] if 'logpath' in _args else 'logs', + 'max_epochs':int(_args['epochs']) if 'epochs' in _args else 2, + 'batch_size':int (_args['batch']) if 'batch' in _args else 2000 + } + else: + self.network_args = _args['network_args'] + self._encoder = None + self._map = None + self._df = _args['data'] if 'data' in _args else None + # + # @TODO: allow for verbose mode so we have a sens of what is going on within the newtork + # + + # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' + # sel.max_epoc + def get_schema(self): + return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] + def initalize(self): + reader = transport.factory.instance(**self.store['source']) + _read_args= self.info + if self._df is None : + self._df = reader.read(**_read_args) + columns = self.columns if self.columns else self._df.columns + # + # convert the data to binary here ... 
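+        # (prepare.Input builds the binary matrix the network consumes, reusing the
+        #  value map when one was supplied so generation can apply the same encoding)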
+ + _args = {"schema":self.get_schema(),"data":self._df,"columns":columns} + if self._map : + _args['map'] = self._map + self._encoder = prepare.Input(**_args) +class Trainer(Learner): + """ + This will perform training using a GAN + """ + def __init__(self,**_args): + super().__init__(**_args) + # self.info = _args['info'] + self.limit = int(_args['limit']) if 'limit' in _args else None + self.name = _args['name'] + self.autopilot = _args['autopilot'] if 'autopilot' in _args else False + self.generate = None + self.candidates = int(_args['candidates']) if 'candidates' in _args else 1 + def run(self): + self.initalize() + _space,_matrix = self._encoder.convert() + + _args = self.network_args + if self.gpu : + _args['gpu'] = self.gpu + _args['real'] = _matrix + _args['candidates'] = self.candidates + # + # At this point we have the binary matrix, we can initiate training + # + + gTrain = gan.Train(**_args) + gTrain.apply() + + writer = transport.factory.instance(provider='file',context='write',path=os.sep.join([gTrain.out_dir,'map.json'])) + writer.write(self._encoder._map,overwrite=True) + writer.close() + + # + # @TODO: At this point we need to generate another some other objects + # + _args = {"network_args":self.network_args,"store":self.store,"info":self.info,"candidates":self.candidates,"data":self._df} + if self.gpu : + _args['gpu'] = self.gpu + g = Generator(**_args) + # g.run() + self.generate = g + if self.autopilot : + self.generate.run() + def generate (self): + if self.autopilot : + print( "Autopilot is set ... No need to call this function") + else: + raise Exception( "Autopilot has not been, Wait till training is finished. Use is_alive function on process object") + +class Generator (Learner): + def __init__(self,**_args): + super().__init__(**_args) + # + # We need to load the mapping information for the space we are working with ... 
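+        # The map is written by Trainer.run at training time and records, for each
+        # column, the bit positions (beg/end) and the values they encode, so that
+        # generated matrices can be reverted to raw values.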
+ # + self.network_args['candidates'] = int(_args['candidates']) if 'candidates' in _args else 1 + filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'map.json']) + file = open(filename) + self._map = json.loads(file.read()) + file.close() + def run(self): + self.initalize() + # + # The values will be returned because we have provided _map information from the constructor + # + values,_matrix = self._encoder.convert() + _args = self.network_args + _args['map'] = self._map + _args['values'] = np.array(values) + _args['row_count'] = self._df.shape[0] + + gHandler = gan.Predict(**_args) + gHandler.load_meta(columns=None) + _iomatrix = gHandler.apply() + _candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix] + self.post(_candidates) + def appriximate(self,_df): + _columns = self.info['approximate'] + _schema = {} + for _info in self.get_schema() : + _schema[_info['name']] = _info['type'] + + + for name in _columns : + batches = np.array_split(_df[name].values,10) + x = [] + for values in batches : + _values = np.random.dirichlet(values) + x += list(values + _values )if np.random.randint(0,2) else list(values - _values) + _df[name] = np.int64(x) if 'int' in _schema[name] else np.float64(x) + return _df + def format(self,_df): + pass + def post(self,_candidates): + + _store = self.store['target'] if 'target' in self.store else {'provider':'console'} + _store['lock'] = True + writer = transport.factory.instance(**_store) + + for _iodf in _candidates : + _df = self._df.copy() + _df[self.columns] = _iodf[self.columns] + if 'approximate' in self.info : + + _df = self.appriximate(_df) + writer.write(_df,schema=self.get_schema()) + pass +class factory : + _infocache = {} + @staticmethod + def instance(**_args): + """ + An instance of an object that trains and generates candidate datasets + :param gpu (optional) index of the gpu to be used if using one + :param store {source,target} if no target is provided console will be output + :param epochs (default 2) number of epochs to train + :param candidates(default 1) number of candidates to generate + :param info {columns,sql,from} + :param autopilot will generate output automatically + :param batch (default 2k) size of the batch + """ + return Trainer(**_args) \ No newline at end of file diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 6e67cb2..478d435 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -128,7 +128,7 @@ class Input : cols, _matrix = self.tobinary(_df[name],values) _beg,_end = i,i+len(cols) if name not in self._map : - self._map[name] = {"beg":_beg,"end":_end ,"values":cols} + self._map[name] = {"beg":_beg,"end":_end ,"values":cols.tolist()} i += len(cols) if not _m.shape[0]: _m = _matrix ; @@ -196,7 +196,7 @@ class Input : # In the advent the sample rows do NOT have the values of the cols = rows.unique() cols = np.array(cols) - row_count = len(rows) + row_count = np.int64(len(rows)) # if 'GPU' not in os.environ : # _matrix = np.zeros([row_count,cols.size],dtype=int) # diff --git a/finalize.py b/finalize.py deleted file mode 100644 index d420d7d..0000000 --- a/finalize.py +++ /dev/null @@ -1,240 +0,0 @@ -#!/usr/bin/env python3 -""" -This file will perform basic tasks to finalize the GAN process by performing the following : - - basic stats & analytics - - rebuild io to another dataset -""" -import pandas as pd -import numpy as np -from multiprocessing import Process, Lock -from google.oauth2 import service_account -from 
google.cloud import bigquery as bq -import transport -from data.params import SYS_ARGS -import json - -class Analytics : - """ - This class will compile basic analytics about a given dataset i.e compare original/synthetic - """ - @staticmethod - def distribution(**args): - context = args['context'] - df = args['data'] - # - #-- This data frame counts unique values for each feature (space) - df_counts = pd.DataFrame(df.apply(lambda col: col.unique().size),columns=['counts']).T # unique counts - # - #-- Get the distributions for common values - # - names = [name for name in df_counts.columns.tolist() if name.endswith('_io') == False] - ddf = df.apply(lambda col: pd.DataFrame(col.values,columns=[col.name]).groupby([col.name]).size() ).fillna(0) - ddf[context] = ddf.index - - pass - def distance(**args): - """ - This function will measure the distance between - """ - pass -class Utils : - @staticmethod - def log(**args): - logger = transport.factory.instance(type="mongo.MongoWriter",args={"dbname":"aou","doc":"logs"}) - logger.write(args) - logger.close() - class get : - @staticmethod - def pipeline(table,path) : - # contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts'] - config = json.loads((open(path)).read()) - pipeline = config['pipeline'] - # return [ item for item in pipeline if item['context'] in contexts] - pipeline = [item for item in pipeline if 'from' in item and item['from'].strip() == table] - Utils.log(module=table,action='init',input={"pipeline":pipeline}) - return pipeline - @staticmethod - def sql(**args) : - """ - This function is intended to build SQL query for the remainder of the table that was not synthesized - :config configuration entries - :from source of the table name - :dataset name of the source dataset - - """ - SQL = ["SELECT * FROM :from "] - SQL_FILTER = [] - NO_FILTERS_FOUND = True - # pipeline = Utils.get.config(**args) - pipeline = args['pipeline'] - REVERSE_QUALIFIER = {'IN':'NOT IN','NOT IN':'IN','=':'<>','<>':'='} - for item in pipeline : - - - if 'filter' in item : - if NO_FILTERS_FOUND : - NO_FILTERS_FOUND = False - SQL += ['WHERE'] - # - # Let us load the filter in the SQL Query - FILTER = item['filter'] - QUALIFIER = REVERSE_QUALIFIER[FILTER['qualifier'].upper()] - SQL_FILTER += [" ".join([FILTER['field'], QUALIFIER,'(',FILTER['value'],')']).replace(":dataset",args['dataset'])] - src = ".".join([args['dataset'],args['from']]) - SQL += [" AND ".join(SQL_FILTER)] - # - # let's pull the field schemas out of the table definition - # - Utils.log(module=args['from'],action='sql',input={"sql":" ".join(SQL) }) - return " ".join(SQL).replace(":from",src) - - -def mk(**args) : - dataset = args['dataset'] - client = args['client'] if 'client' in args else bq.Client.from_service_account_file(args['private_key']) - # - # let us see if we have a dataset handy here - # - datasets = list(client.list_datasets()) - found = [item for item in datasets if item.dataset_id == dataset] - - if not found : - - return client.create_dataset(dataset) - return found[0] - -def move (args): - """ - This function will move a table from the synthetic dataset into a designated location - This is the simplest case for finalizing a synthetic data set - :private_key - """ - pipeline = Utils.get.pipeline(args['from'],args['config']) - _args = json.loads((open(args['config'])).read()) - _args['pipeline'] = pipeline - # del _args['pipeline'] - args = dict(args,**_args) - # del args['pipeline'] - # private_key = args['private_key'] - client = 
bq.Client.from_service_account_json(args['private_key']) - - dataset = args['dataset'] - if pipeline : - SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in pipeline] - SQL += [Utils.get.sql(**args)] - SQL = ('\n UNION ALL \n'.join(SQL).replace(':dataset','io')) - else: - # - # moving a table to a designated location - tablename = args['from'] - if 'sql' not in args : - SQL = "SELECT * FROM :dataset.:table" - else: - SQL = args['sql'] - SQL = SQL.replace(":dataset",dataset).replace(":table",tablename) - Utils.log(module=args['from'],action='sql',input={'sql':SQL}) - # - # At this point we have gathered all the tables in the io folder and we should now see if we need to merge with the remainder from the original table - # - - - - odataset = mk(dataset=dataset+'_io',client=client) - # SQL = "SELECT * FROM io.:context_full_io".replace(':context',context) - config = bq.QueryJobConfig() - config.destination = client.dataset(odataset.dataset_id).table(args['from']) - config.use_query_cache = True - config.allow_large_results = True - config.priority = 'INTERACTIVE' - # - # - - schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema - fields = [" ".join(["CAST (",item.name,"AS",item.field_type.replace("INTEGER","INT64").replace("FLOAT","FLOAT64"),") ",item.name]) for item in schema] - SQL = SQL.replace("*"," , ".join(fields)) - # print (SQL) - out = client.query(SQL,location='US',job_config=config) - Utils.log(module=args['from'],action='move',input={'job':out.job_id}) - return (out.job_id) - - - - -import pandas as pd -import numpy as np -from google.oauth2 import service_account -import json - -# path = '../curation-prod.json' -# credentials = service_account.Credentials.from_service_account_file(path) -# df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard') -filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config'] -f = open(filename) -config = json.loads(f.read()) -args = config['pipeline'] -f.close() - - -if __name__ == '__main__' : - """ - Usage : - finalize -- --contexts --from
- """ - - if 'move' in SYS_ARGS : - - if 'init' in SYS_ARGS : - dep = config['dep'] if 'dep' in config else {} - info = [] - - if 'queries' in dep : - info += dep['queries'] - print ('________') - if 'tables' in dep : - info += dep['tables'] - args = {} - jobs = [] - for item in info : - args = {} - if type(item) == str : - args['from'] = item - name = item - else: - args = item - name = item['from'] - args['config'] = SYS_ARGS['config'] - # args['pipeline'] = [] - job = Process(target=move,args=(args,)) - job.name = name - jobs.append(job) - job.start() - - - # while len(jobs) > 0 : - # jobs = [job for job in jobs if job.is_alive()] - # time.sleep(1) - - - else: - move(SYS_ARGS) - # # table = SYS_ARGS['from'] - # # args = dict(config,**{"private_key":"../curation-prod.json"}) - # args = dict(args,**SYS_ARGS) - # contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']] - # log = [] - # if contexts : - # args['contexts'] = contexts - # log = move(**args) - - # else: - # tables = args['from'].split(',') - # for name in tables : - # name = name.strip() - # args['from'] = name - # log += [move(**args)] - # print ("\n".join(log)) - - - - else: - print ("NOT YET READY !") \ No newline at end of file diff --git a/pipeline.py b/pipeline.py index 5fb62fe..9d095d9 100644 --- a/pipeline.py +++ b/pipeline.py @@ -486,7 +486,7 @@ class Components : # Let us merge the dataset here and and have a comprehensive dataset _df = pd.DataFrame.join(df,_df) - _params = {'data':_df,'store' : ostore} + _params = {'data':_df,'store' : ostore,'from':args['from']} if _schema : _params ['schema'] = _schema _info = {"module":"gan-prep","action":"write","input":{"rows":_df.shape[0],"cols":_df.shape[1]}} From ee518316c07d26ac546c0d7870de9584239c7b47 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 11 Apr 2022 18:52:46 -0500 Subject: [PATCH 181/250] verion update --- setup.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index d3f0d4b..3822df5 100644 --- a/setup.py +++ b/setup.py @@ -4,17 +4,12 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker", - "version":"1.4.7.8", - "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.5.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} -args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow','pandas','pandas-gbq','pymongo'] -args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' +args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] +args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' if sys.version_info[0] == 2 : args['use_2to3'] = False args['use_2to3_exclude_fixers'] = ['lib2to3.fixes.fix_import'] -args['scripts']=['pipeline.py','finalize.py'] setup(**args) - - From 0797e3dba18580d275a668e9732534e5414c6eb3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 11 Apr 2022 23:27:25 -0500 Subject: [PATCH 182/250] post processing features with dates --- data/maker/__init__.py | 78 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 15 deletions(-) diff --git a/data/maker/__init__.py 
b/data/maker/__init__.py index bf388a6..9d3bdb5 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -12,14 +12,14 @@ import pandas as pd import numpy as np import data.gan as gan import transport -from data.bridge import Binary +# from data.bridge import Binary import threading as thread from data.maker import prepare import copy import os import json from multiprocessing import Process, RLock - +from datetime import datetime, timedelta class ContinuousToDiscrete : ROUND_UP = 2 @@ -229,7 +229,11 @@ class Learner(Process): # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' # sel.max_epoc def get_schema(self): - return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] + if self.store['source']['provider'] != 'bigquery' : + return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] + else: + reader = transport.factory.instance(**self.store['source']) + return reader.meta(table=self.info['from']) def initalize(self): reader = transport.factory.instance(**self.store['source']) _read_args= self.info @@ -319,21 +323,56 @@ class Generator (Learner): _iomatrix = gHandler.apply() _candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix] self.post(_candidates) - def appriximate(self,_df): + def approximate(self,_df): _columns = self.info['approximate'] - _schema = {} - for _info in self.get_schema() : - _schema[_info['name']] = _info['type'] + # _schema = {} + # for _info in self.get_schema() : + # _schema[_info['name']] = _info['type'] for name in _columns : - batches = np.array_split(_df[name].values,10) + batches = np.array_split(_df[name].fillna(np.nan).values,2) + _type = np.int64 if 'int' in self.info['approximate'][name]else np.float64 x = [] for values in batches : - _values = np.random.dirichlet(values) - x += list(values + _values )if np.random.randint(0,2) else list(values - _values) - _df[name] = np.int64(x) if 'int' in _schema[name] else np.float64(x) + + index = np.where(values != '') + _values = np.random.dirichlet(values[index].astype(_type)) + values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values) + values[index] = values[index].astype(_type) + x += values.tolist() + if x : + _df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64) return _df + def make_date(self,**_args) : + """ + :param year initial value + """ + if _args['year'] in ['',None,np.nan] : + return None + year = int(_args['year']) + offset = _args['offset'] if 'offset' in _args else 0 + month = np.random.randint(1,13) + if month == 2: + _end = 28 if year % 4 != 0 else 29 + else: + _end = 31 if month in [1,3,5,7,8,10,12] else 30 + day = np.random.randint(1,_end) + + #-- synthetic date + _date = datetime(year=year,month=month,day=day) + FORMAT = _args['format'] if 'format' in _args else '%Y-%m-%d' + r = [] + if offset : + r = [_date.strftime(FORMAT)] + for _delta in offset : + _date = _date + timedelta(_delta) + r.append(_date.strftime(FORMAT)) + return r + else: + return _date.strftime(FORMAT) + + pass def format(self,_df): pass def post(self,_candidates): @@ -345,10 +384,19 @@ class Generator (Learner): for _iodf in _candidates : _df = self._df.copy() _df[self.columns] = _iodf[self.columns] - if 'approximate' in self.info : - - _df = self.appriximate(_df) - writer.write(_df,schema=self.get_schema()) + if 'approximate' in 
self.info : + _df = self.approximate(_df) + if 'make_date' in self.info : + for name in self.info['make_date'] : + # iname = self.info['make_date']['init_field'] + iname = self.info['make_date'][name] + + years = _df[iname] + _dates = [self.make_date(year=year) for year in years] + if _dates : + _df[name] = _dates + + writer.write(_df[['birth_datetime']+self.columns],schema=self.get_schema()) pass class factory : _infocache = {} From a35c0ed6a28dbd66e274ca04be0bd70bc34e408f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 11 Apr 2022 23:40:23 -0500 Subject: [PATCH 183/250] bug fix ... --- data/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/__init__.py b/data/__init__.py index 0ca216d..0f84ec8 100644 --- a/data/__init__.py +++ b/data/__init__.py @@ -1,4 +1,4 @@ -import data.params as params +# import data.params as params from data.params import SYS_ARGS import transport from multiprocessing import Process, Queue From 260f1021863a00c3bf36e41a59b179ad8c04883c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 12 Apr 2022 13:16:48 -0500 Subject: [PATCH 184/250] bug fixes, added logger (not yet using though) --- data/__init__.py | 12 ------------ data/maker/__init__.py | 6 ++++++ 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/data/__init__.py b/data/__init__.py index 0f84ec8..2b4a6aa 100644 --- a/data/__init__.py +++ b/data/__init__.py @@ -3,15 +3,3 @@ from data.params import SYS_ARGS import transport from multiprocessing import Process, Queue from data.maker import prepare - -class Trainer (Process) : - pass -class Maker(Process): - pass - -if __name__ == '__main__' : - - logger = transport.factory.instance(SYS_ARGS['store']['logger']) - - - \ No newline at end of file diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 9d3bdb5..d91c89e 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -207,9 +207,11 @@ class Learner(Process): self.gpu = int(_args['gpu']) else: self.gpu = None + self.info = _args['info'] self.columns = self.info['columns'] if 'columns' in self.info else None self.store = _args['store'] + self.logger = transport.factory.instance(_args['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) if 'network_args' not in _args : self.network_args ={ 'context':_args['context'] if 'context' in _args else 'GENERAL', @@ -379,11 +381,15 @@ class Generator (Learner): _store = self.store['target'] if 'target' in self.store else {'provider':'console'} _store['lock'] = True + _store['context'] = 'write' #-- Just in case writer = transport.factory.instance(**_store) for _iodf in _candidates : _df = self._df.copy() _df[self.columns] = _iodf[self.columns] + # + #@TODO: + # Improve formatting with better post-processing pipeline if 'approximate' in self.info : _df = self.approximate(_df) if 'make_date' in self.info : From d6fd7bceba5f9d1634432d2404f65f1a4a656d16 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 12 Apr 2022 14:00:03 -0500 Subject: [PATCH 185/250] bug fix --- data/maker/__init__.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index d91c89e..e2a072a 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -214,7 +214,7 @@ class Learner(Process): self.logger = transport.factory.instance(_args['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) if 'network_args' not 
in _args : self.network_args ={ - 'context':_args['context'] if 'context' in _args else 'GENERAL', + 'context':self.info['context'] , 'logs':_args['logpath'] if 'logpath' in _args else 'logs', 'max_epochs':int(_args['epochs']) if 'epochs' in _args else 2, 'batch_size':int (_args['batch']) if 'batch' in _args else 2000 @@ -363,7 +363,13 @@ class Generator (Learner): #-- synthetic date _date = datetime(year=year,month=month,day=day) - FORMAT = _args['format'] if 'format' in _args else '%Y-%m-%d' + FORMAT = '%Y-%m-%d' + if 'format' in self.info and 'field' in _args and _args['field'] in self.info['format']: + _name = _args['field'] + FORMAT = self.info['format'][_name] + + + r = [] if offset : r = [_date.strftime(FORMAT)] @@ -382,6 +388,8 @@ class Generator (Learner): _store = self.store['target'] if 'target' in self.store else {'provider':'console'} _store['lock'] = True _store['context'] = 'write' #-- Just in case + if 'table' not in _store : + _store['table'] = self.info['from'] writer = transport.factory.instance(**_store) for _iodf in _candidates : @@ -398,11 +406,12 @@ class Generator (Learner): iname = self.info['make_date'][name] years = _df[iname] - _dates = [self.make_date(year=year) for year in years] + _dates = [self.make_date(year=year,field=name) for year in years] if _dates : _df[name] = _dates - - writer.write(_df[['birth_datetime']+self.columns],schema=self.get_schema()) + _schema = self.get_schema() + _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] + writer.write(_df[['birth_datetime']+self.columns],schema=_schema) pass class factory : _infocache = {} From 838c7978de6f85ce7fe1affadbb9b6b60b4a633c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 12 Apr 2022 14:32:39 -0500 Subject: [PATCH 186/250] bug fix: gpu visibility --- data/gan.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/data/gan.py b/data/gan.py index f5705ea..e0f97b1 100644 --- a/data/gan.py +++ b/data/gan.py @@ -61,16 +61,19 @@ class GNet : self.logs = {} # self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] - self.GPU_CHIPS = None if 'gpu' not in args else args['gpu'] - if self.GPU_CHIPS is None: - self.GPU_CHIPS = [0] - if 'CUDA_VISIBLE_DEVICES' in os.environ : - os.environ.pop('CUDA_VISIBLE_DEVICES') - self.NUM_GPUS = 0 - else: - self.NUM_GPUS = len(self.GPU_CHIPS) + # self.GPU_CHIPS = None if 'gpu' not in args else args['gpu'] + # if self.GPU_CHIPS is None: + # self.GPU_CHIPS = [0] + # if 'CUDA_VISIBLE_DEVICES' in os.environ : + # os.environ.pop('CUDA_VISIBLE_DEVICES') + # self.NUM_GPUS = 0 + # else: + # self.NUM_GPUS = len(self.GPU_CHIPS) # os.environ['CUDA_VISIBLE_DEVICES'] = str(self.GPU_CHIPS[0]) - + self.NUM_GPUS = 0 if 'gpu' not in args else args['gpu'] + self.GPU_CHIPS = None if self.NUM_GPUS == 0 else [args['gpu']] + if self.GPU_CHIPS : + os.environ['CUDA_VISIBLE_DEVICES'] = str(self.GPU_CHIPS[0]) self.PARTITION = args['partition'] if 'partition' in args else None # if self.NUM_GPUS > 1 : # os.environ['CUDA_VISIBLE_DEVICES'] = "4" From 4aaefedce02a063abc2f512031d9f9cf40e51ff5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 12 Apr 2022 14:35:58 -0500 Subject: [PATCH 187/250] bug fix --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index e2a072a..b7608d7 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -411,7 +411,7 @@ class Generator (Learner): _df[name] = _dates _schema = self.get_schema() _schema 
= [{'name':_item.name,'type':_item.field_type} for _item in _schema] - writer.write(_df[['birth_datetime']+self.columns],schema=_schema) + writer.write(_df[self.columns],schema=_schema) pass class factory : _infocache = {} From bbbeb5172a274d1ed15718a6a55878bd5a45eba0 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 12 Apr 2022 14:50:19 -0500 Subject: [PATCH 188/250] bug fix --- data/gan.py | 21 +++++++++------------ data/maker/__init__.py | 3 ++- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/data/gan.py b/data/gan.py index e0f97b1..26f19a2 100644 --- a/data/gan.py +++ b/data/gan.py @@ -61,19 +61,16 @@ class GNet : self.logs = {} # self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] - # self.GPU_CHIPS = None if 'gpu' not in args else args['gpu'] - # if self.GPU_CHIPS is None: - # self.GPU_CHIPS = [0] - # if 'CUDA_VISIBLE_DEVICES' in os.environ : - # os.environ.pop('CUDA_VISIBLE_DEVICES') - # self.NUM_GPUS = 0 - # else: - # self.NUM_GPUS = len(self.GPU_CHIPS) + self.GPU_CHIPS = None if 'gpu' not in args else [args['gpu']] + if self.GPU_CHIPS is None: + self.GPU_CHIPS = [0] + if 'CUDA_VISIBLE_DEVICES' in os.environ : + os.environ.pop('CUDA_VISIBLE_DEVICES') + self.NUM_GPUS = 0 + else: + self.NUM_GPUS = len(self.GPU_CHIPS) # os.environ['CUDA_VISIBLE_DEVICES'] = str(self.GPU_CHIPS[0]) - self.NUM_GPUS = 0 if 'gpu' not in args else args['gpu'] - self.GPU_CHIPS = None if self.NUM_GPUS == 0 else [args['gpu']] - if self.GPU_CHIPS : - os.environ['CUDA_VISIBLE_DEVICES'] = str(self.GPU_CHIPS[0]) + self.PARTITION = args['partition'] if 'partition' in args else None # if self.NUM_GPUS > 1 : # os.environ['CUDA_VISIBLE_DEVICES'] = "4" diff --git a/data/maker/__init__.py b/data/maker/__init__.py index b7608d7..4c175e9 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -319,7 +319,8 @@ class Generator (Learner): _args['map'] = self._map _args['values'] = np.array(values) _args['row_count'] = self._df.shape[0] - + if self.gpu : + _args['gpu'] = self.gpu gHandler = gan.Predict(**_args) gHandler.load_meta(columns=None) _iomatrix = gHandler.apply() From 9b3031af1c8b17a8b5a6c2d12d9cbcdc25e79ecf Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 12 Apr 2022 14:59:46 -0500 Subject: [PATCH 189/250] bug fix: preconditions --- data/maker/__init__.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 4c175e9..fba1361 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -248,7 +248,7 @@ class Learner(Process): _args = {"schema":self.get_schema(),"data":self._df,"columns":columns} if self._map : _args['map'] = self._map - self._encoder = prepare.Input(**_args) + self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None class Trainer(Learner): """ This will perform training using a GAN @@ -263,6 +263,10 @@ class Trainer(Learner): self.candidates = int(_args['candidates']) if 'candidates' in _args else 1 def run(self): self.initalize() + if self._encoder is None : + # + # @TODO Log that the dataset was empty or not statistically relevant + return _space,_matrix = self._encoder.convert() _args = self.network_args @@ -311,9 +315,15 @@ class Generator (Learner): file.close() def run(self): self.initalize() + if self._encoder is None : + # + # @TODO Log that the dataset was empty or not statistically relevant + return + # # The values will be returned because we have provided _map information from the constructor # + values,_matrix = self._encoder.convert() 
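        # `values` is the raw value space of the synthesized columns and `_matrix` its
        # binary encoding; both, together with the saved map and row count, are handed
        # to gan.Predict below, and each generated matrix is reverted by the encoder.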
_args = self.network_args _args['map'] = self._map From becc30ff4279e2b547123e1f3b6819a0e87b0af5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 13 Apr 2022 09:36:21 -0500 Subject: [PATCH 190/250] bug fix added logger and approximation fix --- data/maker/__init__.py | 56 ++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index fba1361..382c209 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -202,7 +202,7 @@ class Learner(Process): super(Learner, self).__init__() if 'gpu' in _args : - print (_args['gpu']) + os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu']) self.gpu = int(_args['gpu']) else: @@ -224,9 +224,13 @@ class Learner(Process): self._encoder = None self._map = None self._df = _args['data'] if 'data' in _args else None + self.name = self.__class__.__name__+'::'+self.info['context']+'::'+self.info['from'] # # @TODO: allow for verbose mode so we have a sens of what is going on within the newtork # + if self.logger : + _args = {'module':self.name,'action':'init','context':self.info['context'],'gpu':(self.gpu if self.gpu is not None else -1)} + self.logger.write(_args) # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' # sel.max_epoc @@ -249,6 +253,9 @@ class Learner(Process): if self._map : _args['map'] = self._map self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None + if self.logger : + _args = {'module':self.name,'action':'data-prep','input':{'rows':self._df.shape[0],'cols':self._df.shape[1]} } + self.logger.write(_args) class Trainer(Learner): """ This will perform training using a GAN @@ -257,10 +264,11 @@ class Trainer(Learner): super().__init__(**_args) # self.info = _args['info'] self.limit = int(_args['limit']) if 'limit' in _args else None - self.name = _args['name'] + self.autopilot = _args['autopilot'] if 'autopilot' in _args else False self.generate = None self.candidates = int(_args['candidates']) if 'candidates' in _args else 1 + def run(self): self.initalize() if self._encoder is None : @@ -277,7 +285,7 @@ class Trainer(Learner): # # At this point we have the binary matrix, we can initiate training # - + beg = datetime.now().strftime('%Y-%m-%d %H:%M:%S') gTrain = gan.Train(**_args) gTrain.apply() @@ -293,6 +301,10 @@ class Trainer(Learner): _args['gpu'] = self.gpu g = Generator(**_args) # g.run() + if self.logger : + end = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + logs = {'module':self.name,'action':'train','input':{'start':beg,'end':end}} + self.logger.write(logs) self.generate = g if self.autopilot : self.generate.run() @@ -333,29 +345,38 @@ class Generator (Learner): _args['gpu'] = self.gpu gHandler = gan.Predict(**_args) gHandler.load_meta(columns=None) - _iomatrix = gHandler.apply() + _iomatrix = gHandler.apply() _candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix] + if self.logger : + _size = np.sum([len(_item) for _item in _iomatrix]) + _log = {'module':self.name,'action':'io-data','input':{'candidates':len(_candidates),'rows':_size}} + self.logger.write(_log) self.post(_candidates) def approximate(self,_df): _columns = self.info['approximate'] - # _schema = {} - # for _info in self.get_schema() : - # _schema[_info['name']] = _info['type'] - - + for name in _columns : - batches = np.array_split(_df[name].fillna(np.nan).values,2) + if _df[name].size > 100 : + BATCH_SIZE = 10 + + else: + BATCH_SIZE = 1 + batches = np.array_split(_df[name].fillna(np.nan).values,BATCH_SIZE) 
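+            # a Dirichlet sample (one component per non-missing entry) is added to or
+            # subtracted from each batch at random; missing entries are left untouched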
_type = np.int64 if 'int' in self.info['approximate'][name]else np.float64 x = [] + _log = {'module':self.name,'action':'approximate','input':{'batch':BATCH_SIZE,'col':name}} for values in batches : - - index = np.where(values != '') + + index = [ _x not in ['',None,np.nan] for _x in values] _values = np.random.dirichlet(values[index].astype(_type)) values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values) values[index] = values[index].astype(_type) x += values.tolist() - if x : + if x : + _log['input']['diff'] = 1 - np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size) _df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64) + if self.logger : + self.logger.write(_log) return _df def make_date(self,**_args) : """ @@ -402,10 +423,11 @@ class Generator (Learner): if 'table' not in _store : _store['table'] = self.info['from'] writer = transport.factory.instance(**_store) - + N = 0 for _iodf in _candidates : _df = self._df.copy() _df[self.columns] = _iodf[self.columns] + N += _df.shape[0] # #@TODO: # Improve formatting with better post-processing pipeline @@ -422,8 +444,10 @@ class Generator (Learner): _df[name] = _dates _schema = self.get_schema() _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] - writer.write(_df[self.columns],schema=_schema) - pass + + writer.write(_df,schema=_schema) + if self.logger : + self.logger.write({'module':self.name,'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) class factory : _infocache = {} @staticmethod From 2fdc7c8f5c92c1159dc8d716f10d23a352d61892 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 13 Apr 2022 10:07:27 -0500 Subject: [PATCH 191/250] bug fix --- data/maker/__init__.py | 221 +++++------------------------------------ 1 file changed, 26 insertions(+), 195 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 382c209..3acddc1 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -21,181 +21,6 @@ import json from multiprocessing import Process, RLock from datetime import datetime, timedelta -class ContinuousToDiscrete : - ROUND_UP = 2 - @staticmethod - def binary(X,n=4) : - """ - This function will convert a continous stream of information into a variety a bit stream of bins - """ - values = np.array(X).astype(np.float32) - BOUNDS = ContinuousToDiscrete.bounds(values,n) - matrix = np.repeat(np.zeros(n),len(X)).reshape(len(X),n) - - - @staticmethod - def bounds(x,n): - # return np.array_split(x,n) - values = np.round(x,ContinuousToDiscrete.ROUND_UP) - return list(pd.cut(values,n).categories) - - - - @staticmethod - def continuous(X,BIN_SIZE=4) : - """ - This function will approximate a binary vector given boundary information - :X binary matrix - :BIN_SIZE - """ - BOUNDS = ContinuousToDiscrete.bounds(X,BIN_SIZE) - - values = [] - # _BINARY= ContinuousToDiscrete.binary(X,BIN_SIZE) - # # # print (BOUNDS) - l = {} - for i in np.arange(len(X)): #value in X : - - value = X[i] - - for item in BOUNDS : - if value >= item.left and value <= item.right : - values += [np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP)] - break - # values += [ np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP) for item in BOUNDS if value >= item.left and value <= item.right ] - - - # # values = [] - # for row in _BINARY : - # # ubound = BOUNDS[row.index(1)] - # index = np.where(row == 1)[0][0] - - # ubound = BOUNDS[ index ].right - 
# lbound = BOUNDS[ index ].left - - # x_ = np.round(np.random.uniform(lbound,ubound),ContinuousToDiscrete.ROUND_UP).astype(float) - # values.append(x_) - - # lbound = ubound - - # values = [np.random.uniform() for item in BOUNDS] - - return values - - -def train (**_args): - """ - :params sql - :params store - """ - - _inputhandler = prepare.Input(**_args) - values,_matrix = _inputhandler.convert() - args = {"real":_matrix,"context":_args['context']} - _map = {} - if 'store' in _args : - # - # This - - args['store'] = copy.deepcopy(_args['store']['logs']) - if 'args' in _args['store']: - args['store']['args']['doc'] = _args['context'] - else: - - args['store']['doc'] = _args['context'] - logger = transport.factory.instance(**args['store']) - args['logger'] = logger - - for key in _inputhandler._map : - beg = _inputhandler._map[key]['beg'] - end = _inputhandler._map[key]['end'] - values = _inputhandler._map[key]['values'].tolist() - _map[key] = {"beg":beg,"end":end,"values":np.array(values).astype(str).tolist()} - info = {"rows":_matrix.shape[0],"cols":_matrix.shape[1],"map":_map} - print() - # print ([_args['context'],_inputhandler._io]) - logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":_inputhandler._io}) - - args['logs'] = _args['logs'] if 'logs' in _args else 'logs' - args ['max_epochs'] = _args['max_epochs'] - args['matrix_size'] = _matrix.shape[0] - args['batch_size'] = 2000 - if 'partition' in _args : - args['partition'] = _args['partition'] - if 'gpu' in _args : - args['gpu'] = _args['gpu'] - # os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' - - trainer = gan.Train(**args) - # - # @TODO: Write the map.json in the output directory for the logs - # - # f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']),'w') - f = open(os.sep.join([trainer.out_dir,'map.json']),'w') - f.write(json.dumps(_map)) - f.close() - - trainer.apply() - pass - -def get(**args): - """ - This function will restore a checkpoint from a persistant storage on to disk - """ - pass -def generate(**_args): - """ - This function will generate a set of records, before we must load the parameters needed - :param data - :param context - :param logs - """ - _args['logs'] = _args['logs'] if 'logs' in _args else 'logs' - partition = _args['partition'] if 'partition' in _args else None - if not partition : - MAP_FOLDER = os.sep.join([_args['logs'],'output',_args['context']]) - # f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json'])) - else: - MAP_FOLDER = os.sep.join([_args['logs'],'output',_args['context'],str(partition)]) - # f = open(os.sep.join([_args['logs'],'output',_args['context'],str(partition),'map.json'])) - f = open(os.sep.join([MAP_FOLDER,'map.json'])) - _map = json.loads(f.read()) - f.close() - # - # - # if 'file' in _args : - # df = pd.read_csv(_args['file']) - # else: - # df = _args['data'] if not isinstance(_args['data'],str) else pd.read_csv(_args['data']) - args = {"context":_args['context'],"max_epochs":_args['max_epochs'],"candidates":_args['candidates']} - args['logs'] = _args['logs'] if 'logs' in _args else 'logs' - args ['max_epochs'] = _args['max_epochs'] - # args['matrix_size'] = _matrix.shape[0] - args['batch_size'] = 2000 - args['partition'] = 0 if 'partition' not in _args else _args['partition'] - args['row_count'] = _args['data'].shape[0] - # - # @TODO: perhaps get the space of values here ... 
(not sure it's a good idea) - # - _args['map'] = _map - _inputhandler = prepare.Input(**_args) - values,_matrix = _inputhandler.convert() - args['values'] = np.array(values) - if 'gpu' in _args : - args['gpu'] = _args['gpu'] - - handler = gan.Predict (**args) - lparams = {'columns':None} - if partition : - lparams['partition'] = partition - handler.load_meta(**lparams) - # - # Let us now format the matrices by reverting them to a data-frame with values - # - - candidates = handler.apply(candidates=args['candidates']) - return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates] - class Learner(Process): def __init__(self,**_args): @@ -211,7 +36,7 @@ class Learner(Process): self.info = _args['info'] self.columns = self.info['columns'] if 'columns' in self.info else None self.store = _args['store'] - self.logger = transport.factory.instance(_args['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) + if 'network_args' not in _args : self.network_args ={ 'context':self.info['context'] , @@ -228,12 +53,18 @@ class Learner(Process): # # @TODO: allow for verbose mode so we have a sens of what is going on within the newtork # - if self.logger : - _args = {'module':self.name,'action':'init','context':self.info['context'],'gpu':(self.gpu if self.gpu is not None else -1)} - self.logger.write(_args) + + _log = {'module':self.name,'action':'init','context':self.info['context'],'gpu':(self.gpu if self.gpu is not None else -1)} + self.log(**_log) # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' # sel.max_epoc + def log(self,**_args): + logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) + logger.write(_args) + if hasattr(logger,'close') : + logger.close() + def get_schema(self): if self.store['source']['provider'] != 'bigquery' : return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] @@ -253,9 +84,9 @@ class Learner(Process): if self._map : _args['map'] = self._map self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None - if self.logger : - _args = {'module':self.name,'action':'data-prep','input':{'rows':self._df.shape[0],'cols':self._df.shape[1]} } - self.logger.write(_args) + + _log = {'module':self.name,'action':'data-prep','input':{'rows':self._df.shape[0],'cols':self._df.shape[1]} } + self.log(**_log) class Trainer(Learner): """ This will perform training using a GAN @@ -301,10 +132,10 @@ class Trainer(Learner): _args['gpu'] = self.gpu g = Generator(**_args) # g.run() - if self.logger : - end = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - logs = {'module':self.name,'action':'train','input':{'start':beg,'end':end}} - self.logger.write(logs) + + end = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + _logs = {'module':self.name,'action':'train','input':{'start':beg,'end':end}} + self.log(**_logs) self.generate = g if self.autopilot : self.generate.run() @@ -347,10 +178,10 @@ class Generator (Learner): gHandler.load_meta(columns=None) _iomatrix = gHandler.apply() _candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix] - if self.logger : - _size = np.sum([len(_item) for _item in _iomatrix]) - _log = {'module':self.name,'action':'io-data','input':{'candidates':len(_candidates),'rows':_size}} - self.logger.write(_log) + + _size = np.sum([len(_item) for _item in _iomatrix]) + _log = 
{'module':self.name,'action':'io-data','input':{'candidates':len(_candidates),'rows':int(_size)}} + self.log(**_log) self.post(_candidates) def approximate(self,_df): _columns = self.info['approximate'] @@ -373,10 +204,10 @@ class Generator (Learner): values[index] = values[index].astype(_type) x += values.tolist() if x : - _log['input']['diff'] = 1 - np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size) + _log['input']['diff_pct'] = 100 * (1 - np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size)) _df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64) - if self.logger : - self.logger.write(_log) + + self.log(**_log) return _df def make_date(self,**_args) : """ @@ -446,8 +277,8 @@ class Generator (Learner): _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] writer.write(_df,schema=_schema) - if self.logger : - self.logger.write({'module':self.name,'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) + + self.log(**{'module':self.name,'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) class factory : _infocache = {} @staticmethod From 1bdf6cc8b3adbb200e3f2e318553ddbef5b55e2f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 13 Apr 2022 10:55:55 -0500 Subject: [PATCH 192/250] bug fix --- data/maker/__init__.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 3acddc1..ff93104 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -244,7 +244,14 @@ class Generator (Learner): return _date.strftime(FORMAT) pass - def format(self,_df): + def format(self,_df,_schema): + for _item in _schema : + name = _item['name'] + if _item['type'].upper() in ['DATETIME','TIMESTAMP'] : + + _df[name] = pd.to_datetime(_df[name], format='%Y-%m-%d %H:%M:%S').astype('datetime64[ns]') + return _df + pass def post(self,_candidates): @@ -272,10 +279,10 @@ class Generator (Learner): years = _df[iname] _dates = [self.make_date(year=year,field=name) for year in years] if _dates : - _df[name] = _dates + _df[name] = _dates _schema = self.get_schema() _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] - + _df = self.format(_df,_schema) writer.write(_df,schema=_schema) self.log(**{'module':self.name,'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) From 1bffb8d7be70e4b1af868977d07964c700f0acff Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 13 Apr 2022 11:11:23 -0500 Subject: [PATCH 193/250] bug fix (exception handling) --- data/maker/__init__.py | 19 ++++++++++++------- data/maker/prepare/__init__.py | 13 ++++++++----- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index ff93104..807bd84 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -22,10 +22,12 @@ from multiprocessing import Process, RLock from datetime import datetime, timedelta class Learner(Process): + def __init__(self,**_args): super(Learner, self).__init__() + self.ndx = 0 if 'gpu' in _args : os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu']) @@ -49,19 +51,22 @@ class Learner(Process): self._encoder = None self._map = None self._df = _args['data'] if 'data' in _args else None - self.name = self.__class__.__name__+'::'+self.info['context']+'::'+self.info['from'] + self.name = self.__class__.__name__+'::'+self.info['from'] + self.name = self.name.replace('?','') # # @TODO: allow for verbose mode so we have 
a sens of what is going on within the newtork # - _log = {'module':self.name,'action':'init','context':self.info['context'],'gpu':(self.gpu if self.gpu is not None else -1)} + _log = {'action':'init','context':self.info['context'],'gpu':(self.gpu if self.gpu is not None else -1)} self.log(**_log) # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' # sel.max_epoc def log(self,**_args): logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) + _args = dict({'ndx':self.ndx,'module':self.name,'info':self.info['context'],**_args}) logger.write(_args) + self.ndx += 1 if hasattr(logger,'close') : logger.close() @@ -85,7 +90,7 @@ class Learner(Process): _args['map'] = self._map self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None - _log = {'module':self.name,'action':'data-prep','input':{'rows':self._df.shape[0],'cols':self._df.shape[1]} } + _log = {'action':'data-prep','input':{'rows':self._df.shape[0],'cols':self._df.shape[1]} } self.log(**_log) class Trainer(Learner): """ @@ -134,7 +139,7 @@ class Trainer(Learner): # g.run() end = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - _logs = {'module':self.name,'action':'train','input':{'start':beg,'end':end}} + _logs = {'action':'train','input':{'start':beg,'end':end,"unique_counts":self._encoder._io[0]}} self.log(**_logs) self.generate = g if self.autopilot : @@ -180,7 +185,7 @@ class Generator (Learner): _candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix] _size = np.sum([len(_item) for _item in _iomatrix]) - _log = {'module':self.name,'action':'io-data','input':{'candidates':len(_candidates),'rows':int(_size)}} + _log = {'action':'io-data','input':{'candidates':len(_candidates),'rows':int(_size)}} self.log(**_log) self.post(_candidates) def approximate(self,_df): @@ -195,7 +200,7 @@ class Generator (Learner): batches = np.array_split(_df[name].fillna(np.nan).values,BATCH_SIZE) _type = np.int64 if 'int' in self.info['approximate'][name]else np.float64 x = [] - _log = {'module':self.name,'action':'approximate','input':{'batch':BATCH_SIZE,'col':name}} + _log = {'action':'approximate','input':{'batch':BATCH_SIZE,'col':name}} for values in batches : index = [ _x not in ['',None,np.nan] for _x in values] @@ -285,7 +290,7 @@ class Generator (Learner): _df = self.format(_df,_schema) writer.write(_df,schema=_schema) - self.log(**{'module':self.name,'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) + self.log(**{'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) class factory : _infocache = {} @staticmethod diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 478d435..bc316e9 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -90,11 +90,14 @@ class Input : # else: # # We will look into the count and make a judgment call - _df = pd.DataFrame(self.df.apply(lambda col: col.dropna().unique().size )).T - MIN_SPACE_SIZE = 2 - self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() - self._io = _df.to_dict(orient='records') - + try: + _df = pd.DataFrame(self.df.apply(lambda col: col.dropna().unique().size )).T + MIN_SPACE_SIZE = 2 + self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() + self._io = _df.to_dict(orient='records') + 
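            # The intent is for self._io to capture, per column, the share of distinct values
            # relative to the row count; the trainer later surfaces it in its "unique_counts" log entry.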
except Exception as e: + print (e) + self._io = [] def _initdata(self,**_args): """ This function will initialize the class with a data-frame and columns of interest (if any) From 15e53cb6569eec301a3eb75637f7233a9c8a6ee4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 13 Apr 2022 11:19:36 -0500 Subject: [PATCH 194/250] bug fix (exception handling) --- data/maker/prepare/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index bc316e9..f7ae3f7 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -91,9 +91,11 @@ class Input : # # We will look into the count and make a judgment call try: - _df = pd.DataFrame(self.df.apply(lambda col: col.dropna().unique().size )).T - MIN_SPACE_SIZE = 2 - self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() + # _df = pd.DataFrame(self.df.apply(lambda col: col.dropna().unique().size )).T + # MIN_SPACE_SIZE = 2 + # self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() + # self._io = _df.to_dict(orient='records') + _df = self.df.nunique().T / self.df.shape[0] self._io = _df.to_dict(orient='records') except Exception as e: print (e) From 167e4b873d550860d772d3c608a8352f3d78f0db Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 13 Apr 2022 11:39:54 -0500 Subject: [PATCH 195/250] bug fix --- data/maker/prepare/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index f7ae3f7..3ef494e 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -96,7 +96,7 @@ class Input : # self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() # self._io = _df.to_dict(orient='records') _df = self.df.nunique().T / self.df.shape[0] - self._io = _df.to_dict(orient='records') + self._io = pd.DataFrame(_df).to_dict(orient='records') except Exception as e: print (e) self._io = [] From 289f2e7b895885fd95cf8f0991aeaf7dbfcae198 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 13 Apr 2022 11:45:39 -0500 Subject: [PATCH 196/250] bugfix --- data/maker/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 807bd84..21e38c5 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -51,8 +51,8 @@ class Learner(Process): self._encoder = None self._map = None self._df = _args['data'] if 'data' in _args else None - self.name = self.__class__.__name__+'::'+self.info['from'] - self.name = self.name.replace('?','') + self.name = self.__class__.__name__ + # # @TODO: allow for verbose mode so we have a sens of what is going on within the newtork # @@ -64,7 +64,7 @@ class Learner(Process): # sel.max_epoc def log(self,**_args): logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) - _args = dict({'ndx':self.ndx,'module':self.name,'info':self.info['context'],**_args}) + _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'info':self.info['context'],**_args}) logger.write(_args) self.ndx += 1 if hasattr(logger,'close') : From e93fe7fea8f45db056642f913d82732279c77149 Mon 
Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 10:06:27 -0500 Subject: [PATCH 197/250] bug fixes --- data/maker/__init__.py | 14 +++++++++++--- data/maker/prepare/__init__.py | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 21e38c5..77effb3 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -121,7 +121,7 @@ class Trainer(Learner): # # At this point we have the binary matrix, we can initiate training # - beg = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + beg = datetime.now() #.strftime('%Y-%m-%d %H:%M:%S') gTrain = gan.Train(**_args) gTrain.apply() @@ -138,8 +138,9 @@ class Trainer(Learner): g = Generator(**_args) # g.run() - end = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - _logs = {'action':'train','input':{'start':beg,'end':end,"unique_counts":self._encoder._io[0]}} + end = datetime.now() #.strftime('%Y-%m-%d %H:%M:%S') + _min = float(timedelta(end,beg).seconds/ 60) + _logs = {'action':'train','input':{'start':beg.strftime('%Y-%m-%d %H:%M:%S'),'minutes':_min,"unique_counts":self._encoder._io[0]}} self.log(**_logs) self.generate = g if self.autopilot : @@ -158,6 +159,7 @@ class Generator (Learner): # self.network_args['candidates'] = int(_args['candidates']) if 'candidates' in _args else 1 filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'map.json']) + self.log(**{'action':'init-map','input':{'filename':filename,'exists':os.path.exists(filename)}}) file = open(filename) self._map = json.loads(file.read()) file.close() @@ -291,6 +293,12 @@ class Generator (Learner): writer.write(_df,schema=_schema) self.log(**{'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) +class Shuffle(Trainer): + """ + This is a method that will yield data with low utility + """ + def __init__(self,**_args): + super().__init__(self) class factory : _infocache = {} @staticmethod diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 3ef494e..1bf4872 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -96,7 +96,7 @@ class Input : # self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() # self._io = _df.to_dict(orient='records') _df = self.df.nunique().T / self.df.shape[0] - self._io = pd.DataFrame(_df).to_dict(orient='records') + self._io = pd.DataFrame(_df).astype(float).to_dict(orient='records') except Exception as e: print (e) self._io = [] From 4345146f3a7f6bd6ffea63c0d416844b261a6103 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 11:06:28 -0500 Subject: [PATCH 198/250] bug fix: logger and io space --- data/maker/__init__.py | 54 ++++++++++++++++++++++++++-------- data/maker/prepare/__init__.py | 5 ++-- setup.py | 3 +- 3 files changed, 46 insertions(+), 16 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 77effb3..bce8d65 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -28,6 +28,7 @@ class Learner(Process): super(Learner, self).__init__() self.ndx = 0 + self.lock = RLock() if 'gpu' in _args : os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu']) @@ -63,13 +64,21 @@ class Learner(Process): # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' # sel.max_epoc def log(self,**_args): - logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else 
transport.factory.instance(provider='console',context='write',lock=True) - _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'info':self.info['context'],**_args}) - logger.write(_args) - self.ndx += 1 - if hasattr(logger,'close') : - logger.close() - + self.lock.acquire() + try: + logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) + _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'info':self.info['context'],**_args}) + logger.write(_args) + self.ndx += 1 + if hasattr(logger,'close') : + logger.close() + except Exception as e: + print () + print (_args) + print (e) + pass + finally: + self.lock.release() def get_schema(self): if self.store['source']['provider'] != 'bigquery' : return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] @@ -88,9 +97,8 @@ class Learner(Process): _args = {"schema":self.get_schema(),"data":self._df,"columns":columns} if self._map : _args['map'] = self._map - self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None - - _log = {'action':'data-prep','input':{'rows':self._df.shape[0],'cols':self._df.shape[1]} } + self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None + _log = {'action':'data-prep','input':{'rows':int(self._df.shape[0]),'cols':int(self._df.shape[1]) } } self.log(**_log) class Trainer(Learner): """ @@ -139,7 +147,7 @@ class Trainer(Learner): # g.run() end = datetime.now() #.strftime('%Y-%m-%d %H:%M:%S') - _min = float(timedelta(end,beg).seconds/ 60) + _min = float((end-beg).seconds/ 60) _logs = {'action':'train','input':{'start':beg.strftime('%Y-%m-%d %H:%M:%S'),'minutes':_min,"unique_counts":self._encoder._io[0]}} self.log(**_logs) self.generate = g @@ -293,12 +301,27 @@ class Generator (Learner): writer.write(_df,schema=_schema) self.log(**{'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) -class Shuffle(Trainer): +class Shuffle(Generator): """ This is a method that will yield data with low utility """ def __init__(self,**_args): super().__init__(self) + def run(self): + + + self.initalize() + _index = np.arange(self._df.shape[0]) + np.random.shuffle(_index) + _iocolumns = self.info['columns'] + _ocolumns = list(set(self._df.columns) - set(_iocolumns) ) + _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(self._df.shape[0])) + self._df = self._df[_ocolumns].join(_iodf) + + + _log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}} + self.log(**_log) + self.post([self._df]) class factory : _infocache = {} @staticmethod @@ -313,4 +336,9 @@ class factory : :param autopilot will generate output automatically :param batch (default 2k) size of the batch """ - return Trainer(**_args) \ No newline at end of file + if 'apply' not in _args : + return Trainer(**_args) + elif _args['apply'] == 'shuffe' : + return Shuffle(**_args) + elif _args['apply'] == 'generate' : + return Generator(**_args) \ No newline at end of file diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 1bf4872..50fcfdf 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -95,8 +95,9 @@ class Input : # MIN_SPACE_SIZE = 2 # self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() # self._io = 
_df.to_dict(orient='records') - _df = self.df.nunique().T / self.df.shape[0] - self._io = pd.DataFrame(_df).astype(float).to_dict(orient='records') + _df = pd.DataFrame(self.df.nunique().T / self.df.shape[0]).T + self._io = (_df.to_dict(orient='records')) + except Exception as e: print (e) self._io = [] diff --git a/setup.py b/setup.py index 3822df5..c96877b 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,8 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.5.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.5.1", + "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' From 7a22314a46ca5c806428f63cb3f90419e99b1af1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 11:27:55 -0500 Subject: [PATCH 199/250] bugfix --- data/maker/__init__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index bce8d65..eb4c02d 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -336,9 +336,10 @@ class factory : :param autopilot will generate output automatically :param batch (default 2k) size of the batch """ - if 'apply' not in _args : - return Trainer(**_args) - elif _args['apply'] == 'shuffe' : + + if _args['apply'] == 'shuffe' : return Shuffle(**_args) elif _args['apply'] == 'generate' : - return Generator(**_args) \ No newline at end of file + return Generator(**_args) + else: + return Trainer(**_args) \ No newline at end of file From 528e6db0b8fa09ee21c1e4598fed32d2fba6c3db Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 11:41:30 -0500 Subject: [PATCH 200/250] bug fix --- data/maker/__init__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index eb4c02d..49227a6 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -306,7 +306,7 @@ class Shuffle(Generator): This is a method that will yield data with low utility """ def __init__(self,**_args): - super().__init__(self) + super().__init__(**_args) def run(self): @@ -315,7 +315,8 @@ class Shuffle(Generator): np.random.shuffle(_index) _iocolumns = self.info['columns'] _ocolumns = list(set(self._df.columns) - set(_iocolumns) ) - _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(self._df.shape[0])) + # _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(_index.size)) + _iodf = pd.DataFrame(self._df[_iocolumns],index = np.arange(_index.size)) self._df = self._df[_ocolumns].join(_iodf) @@ -336,8 +337,8 @@ class factory : :param autopilot will generate output automatically :param batch (default 2k) size of the batch """ - - if _args['apply'] == 'shuffe' : + + if _args['apply'] == 'shuffle' : return Shuffle(**_args) elif _args['apply'] == 'generate' : return Generator(**_args) From 9f198f3b1556f411afda04b76359ce4ec0f47334 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 12:03:59 -0500 Subject: [PATCH 201/250] bug fix: generator iherited by shuffle --- 
data/maker/__init__.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 49227a6..3f437d2 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -168,9 +168,12 @@ class Generator (Learner): self.network_args['candidates'] = int(_args['candidates']) if 'candidates' in _args else 1 filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'map.json']) self.log(**{'action':'init-map','input':{'filename':filename,'exists':os.path.exists(filename)}}) - file = open(filename) - self._map = json.loads(file.read()) - file.close() + if os.path.exists(filename): + file = open(filename) + self._map = json.loads(file.read()) + file.close() + else: + self._map = {} def run(self): self.initalize() if self._encoder is None : From 1ff4145eeaaff2c7e901bb73cc3c1c650298f2a3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 12:36:20 -0500 Subject: [PATCH 202/250] bugfix: formatter --- data/maker/__init__.py | 13 +++++++++++-- setup.py | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 3f437d2..2b53def 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -265,9 +265,17 @@ class Generator (Learner): def format(self,_df,_schema): for _item in _schema : name = _item['name'] - if _item['type'].upper() in ['DATETIME','TIMESTAMP'] : + + if _item['type'].upper() in ['DATE','DATETIME','TIMESTAMP'] : + FORMAT = '%Y-%d-%m' + if 'format' in self.info and name in self.info['format'] : + FORMAT = self.info['format'][name] + else: + if _item['type'] == ['DATETIME','TIMESTAMP'] : + FORMAT = '%Y-%d-%m %H:%M:%S' + self.log(**{'action':'format','input':{'name':name,'format':FORMAT}}) - _df[name] = pd.to_datetime(_df[name], format='%Y-%m-%d %H:%M:%S').astype('datetime64[ns]') + _df[name] = pd.to_datetime(_df[name], format=FORMAT).astype(str) #.astype('datetime64[ns]') return _df pass @@ -298,6 +306,7 @@ class Generator (Learner): _dates = [self.make_date(year=year,field=name) for year in years] if _dates : _df[name] = _dates + _schema = self.get_schema() _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] _df = self.format(_df,_schema) diff --git a/setup.py b/setup.py index c96877b..1991bde 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.5.1", +args = {"name":"data-maker","version":"1.5.2", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] From febcaa588395f5ee84c4cc7bec08683ba368d765 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 12:42:11 -0500 Subject: [PATCH 203/250] bugfix: logs for formatting dates --- data/maker/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 2b53def..2921b46 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -263,6 +263,7 @@ class Generator (Learner): pass def format(self,_df,_schema): + r = {} for _item in _schema : name = _item['name'] @@ -273,9 +274,12 @@ class Generator (Learner): else: if _item['type'] == ['DATETIME','TIMESTAMP'] : FORMAT = 
'%Y-%d-%m %H:%M:%S' - self.log(**{'action':'format','input':{'name':name,'format':FORMAT}}) + r[name] = FORMAT + _df[name] = pd.to_datetime(_df[name], format=FORMAT).astype(str) #.astype('datetime64[ns]') + if r : + self.log(**{'action':'format','input':r}) return _df pass From 0e4148d4e79e23267be6a71285d5001104ec401d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 13:37:47 -0500 Subject: [PATCH 204/250] bugfix: date/timestamp conversions --- data/maker/__init__.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 2921b46..184bca4 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -64,7 +64,7 @@ class Learner(Process): # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' # sel.max_epoc def log(self,**_args): - self.lock.acquire() + # self.lock.acquire() try: logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'info':self.info['context'],**_args}) @@ -78,7 +78,8 @@ class Learner(Process): print (e) pass finally: - self.lock.release() + # self.lock.release() + pass def get_schema(self): if self.store['source']['provider'] != 'bigquery' : return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] @@ -222,7 +223,7 @@ class Generator (Learner): values[index] = values[index].astype(_type) x += values.tolist() if x : - _log['input']['diff_pct'] = 100 * (1 - np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size)) + _log['input']['identical_percentage'] = 100 * (1 - np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size)) _df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64) self.log(**_log) @@ -243,14 +244,15 @@ class Generator (Learner): day = np.random.randint(1,_end) #-- synthetic date - _date = datetime(year=year,month=month,day=day) - FORMAT = '%Y-%m-%d' - if 'format' in self.info and 'field' in _args and _args['field'] in self.info['format']: + _date = datetime(year=year,month=month,day=day) #,minute=0,hour=0,second=0) + FORMAT = '%Y-%d-%m' + _name = _args['field'] if 'field' in _args else None + if 'format' in self.info and _name in self.info['format']: _name = _args['field'] FORMAT = self.info['format'][_name] - + # print ([_name,FORMAT, _date.strftime(FORMAT)]) r = [] if offset : r = [_date.strftime(FORMAT)] @@ -277,7 +279,7 @@ class Generator (Learner): r[name] = FORMAT - _df[name] = pd.to_datetime(_df[name], format=FORMAT).astype(str) #.astype('datetime64[ns]') + _df[name] = pd.to_datetime(_df[name], format=FORMAT).astype('datetime64[ns]') if r : self.log(**{'action':'format','input':r}) return _df @@ -308,12 +310,13 @@ class Generator (Learner): years = _df[iname] _dates = [self.make_date(year=year,field=name) for year in years] - if _dates : - _df[name] = _dates + if _dates : + _df[name] = _dates _schema = self.get_schema() _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] _df = self.format(_df,_schema) + writer.write(_df,schema=_schema) self.log(**{'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) From 93ebe8ee1b2b20f29c80281799b48eec65bf90eb Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 18:07:17 -0500 Subject: [PATCH 205/250] bugfix: date type casting 
bug --- data/maker/__init__.py | 43 ++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 184bca4..6c2a463 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -235,6 +235,7 @@ class Generator (Learner): if _args['year'] in ['',None,np.nan] : return None year = int(_args['year']) + offset = _args['offset'] if 'offset' in _args else 0 month = np.random.randint(1,13) if month == 2: @@ -244,13 +245,13 @@ class Generator (Learner): day = np.random.randint(1,_end) #-- synthetic date - _date = datetime(year=year,month=month,day=day) #,minute=0,hour=0,second=0) - FORMAT = '%Y-%d-%m' + _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0) + FORMAT = '%Y-%m-%d' _name = _args['field'] if 'field' in _args else None if 'format' in self.info and _name in self.info['format']: - _name = _args['field'] + # _name = _args['field'] FORMAT = self.info['format'][_name] - + # print ([_name,FORMAT, _date.strftime(FORMAT)]) r = [] @@ -258,7 +259,7 @@ class Generator (Learner): r = [_date.strftime(FORMAT)] for _delta in offset : _date = _date + timedelta(_delta) - r.append(_date.strftime(FORMAT)) + r.append(_date.strptime(FORMAT)) return r else: return _date.strftime(FORMAT) @@ -270,16 +271,19 @@ class Generator (Learner): name = _item['name'] if _item['type'].upper() in ['DATE','DATETIME','TIMESTAMP'] : - FORMAT = '%Y-%d-%m' + FORMAT = '%Y-%m-%d' + if 'format' in self.info and name in self.info['format'] : FORMAT = self.info['format'][name] - else: - if _item['type'] == ['DATETIME','TIMESTAMP'] : - FORMAT = '%Y-%d-%m %H:%M:%S' - r[name] = FORMAT + elif _item['type'] in ['DATETIME','TIMESTAMP'] : + FORMAT = '%Y-%m-%d %H:%M:%S' - - _df[name] = pd.to_datetime(_df[name], format=FORMAT).astype('datetime64[ns]') + r[name] = FORMAT + _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') + if _item['type'] in ['DATETIME','TIMESTAMP']: + _df[name] = _df[name].astype('datetime64[ns]') + else: + _df[name] = _df[name].astype(str) if r : self.log(**{'action':'format','input':r}) return _df @@ -309,10 +313,12 @@ class Generator (Learner): iname = self.info['make_date'][name] years = _df[iname] - _dates = [self.make_date(year=year,field=name) for year in years] - if _dates : + _dates = [self.make_date(year=_year,field=name) for _year in years] + if _dates : _df[name] = _dates - + + + _schema = self.get_schema() _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] _df = self.format(_df,_schema) @@ -341,7 +347,12 @@ class Shuffle(Generator): _log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}} self.log(**_log) - self.post([self._df]) + try: + self.post([self._df]) + self.log(**{'action':'completed','input':{'candidates':1,'rows':int(self._df.shape[0])}}) + except Exception as e : + # print (e) + self.log(**{'action':'failed','input':{'msg':e,'info':self.info}}) class factory : _infocache = {} @staticmethod From 01ca780c99d2e3dc7a42b1e4642d756f0bd74f15 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 18:23:43 -0500 Subject: [PATCH 206/250] bugfix: date type casting bug --- data/maker/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 6c2a463..c65cbcf 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -284,6 +284,7 @@ class Generator (Learner): _df[name] = _df[name].astype('datetime64[ns]') else: _df[name] = 
_df[name].astype(str) + _df[name] = _df[name].fillna('') if r : self.log(**{'action':'format','input':r}) return _df From 133b0120db26643f374004e3f4eb0e9a622861d3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 18:29:28 -0500 Subject: [PATCH 207/250] bugfix: date type casting bug --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index c65cbcf..cde3928 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -284,7 +284,7 @@ class Generator (Learner): _df[name] = _df[name].astype('datetime64[ns]') else: _df[name] = _df[name].astype(str) - _df[name] = _df[name].fillna('') + _df[name] = _df[name].replace('NaT','') if r : self.log(**{'action':'format','input':r}) return _df From 5d4c534faeac12c19ca39a1564c0ccd19b9a22cd Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 21 Apr 2022 10:14:00 -0500 Subject: [PATCH 208/250] bug fix: approximation null values --- data/maker/__init__.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index cde3928..723991f 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -218,7 +218,13 @@ class Generator (Learner): for values in batches : index = [ _x not in ['',None,np.nan] for _x in values] - _values = np.random.dirichlet(values[index].astype(_type)) + + if len(index) == len(values): + # + # Sometimes messy data has unpleasant surprises + continue + _values = np.random.dirichlet(values[index].astype(_type)) + values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values) values[index] = values[index].astype(_type) x += values.tolist() @@ -284,7 +290,7 @@ class Generator (Learner): _df[name] = _df[name].astype('datetime64[ns]') else: _df[name] = _df[name].astype(str) - _df[name] = _df[name].replace('NaT','') + _df = _df.replace('NaT','') if r : self.log(**{'action':'format','input':r}) return _df From 4aacb74f29967eb483082258d56285a5f7bda094 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 21 Apr 2022 10:53:19 -0500 Subject: [PATCH 209/250] bug fix with shuffler --- data/maker/__init__.py | 13 +++++++++---- setup.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 723991f..630aa41 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -341,16 +341,21 @@ class Shuffle(Generator): super().__init__(**_args) def run(self): - + np.random.seed(1) self.initalize() _index = np.arange(self._df.shape[0]) np.random.shuffle(_index) + np.random.shuffle(_index) _iocolumns = self.info['columns'] _ocolumns = list(set(self._df.columns) - set(_iocolumns) ) # _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(_index.size)) - _iodf = pd.DataFrame(self._df[_iocolumns],index = np.arange(_index.size)) - self._df = self._df[_ocolumns].join(_iodf) - + _iodf = pd.DataFrame(self._df[_iocolumns].copy(),index = np.arange(_index.size)) + # self._df = self._df.loc[_index][_ocolumns].join(_iodf) + self._df = self._df.loc[_index][_ocolumns] + self._df.index = np.arange(self._df.shape[0]) + self._df = self._df.join(_iodf) + + _log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}} self.log(**_log) diff --git a/setup.py b/setup.py index 1991bde..801dc48 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return 
open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.5.2", +args = {"name":"data-maker","version":"1.5.3", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] From 587248c63b84b010b6c481ac8e64692e950dfaf3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 21 Apr 2022 11:07:56 -0500 Subject: [PATCH 210/250] bug fix --- data/maker/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 630aa41..35a8967 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -219,7 +219,7 @@ class Generator (Learner): index = [ _x not in ['',None,np.nan] for _x in values] - if len(index) == len(values): + if np.sum(index) == 0: # # Sometimes messy data has unpleasant surprises continue @@ -228,6 +228,7 @@ class Generator (Learner): values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values) values[index] = values[index].astype(_type) x += values.tolist() + print (batches) if x : _log['input']['identical_percentage'] = 100 * (1 - np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size)) _df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64) From aa41d371f4a02bf97bbb66d6bcdd8060635778a9 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 21 Apr 2022 11:12:09 -0500 Subject: [PATCH 211/250] bug fix --- data/maker/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 35a8967..7f1c896 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -355,7 +355,10 @@ class Shuffle(Generator): self._df = self._df.loc[_index][_ocolumns] self._df.index = np.arange(self._df.shape[0]) self._df = self._df.join(_iodf) - + # + # The following is a full shuffle + self._df = self._df.loc[_index] + self._df.index = np.arange(self._df.shape[0]) _log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}} From 88b4fdd8610d62c82b72aef36436d402c8b81673 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 21 Apr 2022 11:40:41 -0500 Subject: [PATCH 212/250] bug fix --- data/maker/__init__.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 7f1c896..60141d0 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -223,16 +223,22 @@ class Generator (Learner): # # Sometimes messy data has unpleasant surprises continue - _values = np.random.dirichlet(values[index].astype(_type)) + + _values = np.random.rand( len(values[index])) + _values += np.std(values[index]) / 4 values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values) values[index] = values[index].astype(_type) x += values.tolist() - print (batches) + if x : - _log['input']['identical_percentage'] = 100 * (1 - np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size)) - _df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64) + _log['input']['identical_percentage'] = 100 * (np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size)) + print (_df[name] == x) + print (_log) + + + _df[name] = 
x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64) self.log(**_log) return _df def make_date(self,**_args) : From 4b4647d200a41a075b6be61fcb5256a238e9d6bb Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 21 Apr 2022 12:17:32 -0500 Subject: [PATCH 213/250] bug fix --- data/maker/__init__.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 60141d0..3c4d45f 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -233,11 +233,7 @@ class Generator (Learner): if x : _log['input']['identical_percentage'] = 100 * (np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size)) - print (_df[name] == x) - - print (_log) - - + _df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64) self.log(**_log) return _df From 42ccca5f8dd1c707ba56567cb58d9863d348a9e8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 16 May 2022 11:11:33 -0500 Subject: [PATCH 214/250] bug fixes can now be used as a library --- data/maker/__init__.py | 42 ++++++++++++++++++++++++++++-------------- setup.py | 2 +- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 3c4d45f..50abfd2 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -82,7 +82,7 @@ class Learner(Process): pass def get_schema(self): if self.store['source']['provider'] != 'bigquery' : - return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] + return [] #{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] else: reader = transport.factory.instance(**self.store['source']) return reader.meta(table=self.info['from']) @@ -276,24 +276,35 @@ class Generator (Learner): pass def format(self,_df,_schema): r = {} + for _item in _schema : name = _item['name'] if _item['type'].upper() in ['DATE','DATETIME','TIMESTAMP'] : FORMAT = '%Y-%m-%d' - - if 'format' in self.info and name in self.info['format'] : - FORMAT = self.info['format'][name] - elif _item['type'] in ['DATETIME','TIMESTAMP'] : - FORMAT = '%Y-%m-%d %H:%M:%S' - - r[name] = FORMAT - _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') - if _item['type'] in ['DATETIME','TIMESTAMP']: - _df[name] = _df[name].astype('datetime64[ns]') - else: - _df[name] = _df[name].astype(str) + try: + # + #-- Sometimes data isn't all it's meant to be + if 'format' in self.info and name in self.info['format'] : + FORMAT = self.info['format'][name] + elif _item['type'] in ['DATETIME','TIMESTAMP'] : + FORMAT = '%Y-%m-%d %H:%M:%S' + + r[name] = FORMAT + _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') + if _item['type'] in ['DATETIME','TIMESTAMP']: + _df[name] = _df[name].fillna('').astype('datetime64[ns]') + else: + _df[name] = _df[name].astype(str) + except Exception as e: + pass + finally: + pass + else: + # print (_item) + pass _df = _df.replace('NaT','') + if r : self.log(**{'action':'format','input':r}) return _df @@ -391,4 +402,7 @@ class factory : elif _args['apply'] == 'generate' : return Generator(**_args) else: - return Trainer(**_args) \ No newline at end of file + pthread= Trainer(**_args) + if 'start' in _args and _args['start'] == True : + pthread.start() + return pthread \ No newline at end of file diff --git a/setup.py b/setup.py index 801dc48..b5d3733 100644 --- 
a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.5.3", +args = {"name":"data-maker","version":"1.5.4", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] From 1d0bbce74819bd83763ca833a29c87ee842e0fa7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 16 May 2022 13:59:58 -0500 Subject: [PATCH 215/250] bug fixes data format --- data/maker/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 50abfd2..42af8f9 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -287,13 +287,15 @@ class Generator (Learner): #-- Sometimes data isn't all it's meant to be if 'format' in self.info and name in self.info['format'] : FORMAT = self.info['format'][name] + SIZE = 10 elif _item['type'] in ['DATETIME','TIMESTAMP'] : FORMAT = '%Y-%m-%d %H:%M:%S' + SIZE = 19 r[name] = FORMAT _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') if _item['type'] in ['DATETIME','TIMESTAMP']: - _df[name] = _df[name].fillna('').astype('datetime64[ns]') + pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]') else: _df[name] = _df[name].astype(str) except Exception as e: From 1e3e0eac45b689602639397213488f9b43e5e84e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 16 May 2022 14:02:40 -0500 Subject: [PATCH 216/250] bug fixes data format --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 42af8f9..1eea945 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -305,7 +305,7 @@ class Generator (Learner): else: # print (_item) pass - _df = _df.replace('NaT','') + _df = _df.replace('NaT','').replace('NA','') if r : self.log(**{'action':'format','input':r}) From 2b228f60750b4521e864b25f0bf36f63262cf2a5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 17 May 2022 03:05:44 -0500 Subject: [PATCH 217/250] bug fix with type inference --- data/maker/__init__.py | 32 +++++++++++++++++++++++++------- data/maker/prepare/__init__.py | 1 + 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 1eea945..24fabe8 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -282,9 +282,11 @@ class Generator (Learner): if _item['type'].upper() in ['DATE','DATETIME','TIMESTAMP'] : FORMAT = '%Y-%m-%d' + try: # #-- Sometimes data isn't all it's meant to be + SIZE = -1 if 'format' in self.info and name in self.info['format'] : FORMAT = self.info['format'][name] SIZE = 10 @@ -292,20 +294,34 @@ class Generator (Learner): FORMAT = '%Y-%m-%d %H:%M:%S' SIZE = 19 + if SIZE > 0 : + + values = pd.to_datetime(_df[name], format=FORMAT).astype(str) + _df[name] = [_date[:SIZE] for _date in values] + + r[name] = FORMAT - _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') + # _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') if _item['type'] in ['DATETIME','TIMESTAMP']: pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]') - else: - _df[name] = _df[name].astype(str) + except Exception as e: pass 
finally: pass else: - # print (_item) - pass - _df = _df.replace('NaT','').replace('NA','') + + # + # Because types are inferred on the basis of the sample being processed they can sometimes be wrong + # To help disambiguate we add the schema information + _type = None + if 'int' in _df[name].dtypes.name or 'int' in _item['type'].lower(): + _type = np.int + elif 'float' in _df[name].dtypes.name or 'float' in _item['type'].lower(): + _type = np.float + if _type : + _df[name] = _df[name].fillna(0).replace('',0).astype(_type) + # _df = _df.replace('NaT','').replace('NA','') if r : self.log(**{'action':'format','input':r}) @@ -319,7 +335,7 @@ class Generator (Learner): _store['context'] = 'write' #-- Just in case if 'table' not in _store : _store['table'] = self.info['from'] - writer = transport.factory.instance(**_store) + N = 0 for _iodf in _candidates : _df = self._df.copy() @@ -346,7 +362,9 @@ class Generator (Learner): _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] _df = self.format(_df,_schema) + writer = transport.factory.instance(**_store) writer.write(_df,schema=_schema) + # _df.to_csv('foo.csv') self.log(**{'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) class Shuffle(Generator): diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 50fcfdf..17da778 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -209,6 +209,7 @@ class Input : # @NOTE: For some reason, there is an out of memory error created here, this seems to fix it (go figure) # _matrix = np.array([np.repeat(0,cols.size) for i in range(0,row_count)]) + [np.put(_matrix[i], np.where(cols == rows[i]) ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0] # else: # _matrix = cp.zeros([row_count,cols.size]) From 6841ccbd5e4abb8322df6da8b55904f99bcae89c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 17 May 2022 13:24:24 -0500 Subject: [PATCH 218/250] bug fix: missing data, adding an additional type: pandas._lib.missing.NAType in addition to np.nan, np.na --- data/maker/__init__.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 24fabe8..b9b48e4 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -52,6 +52,7 @@ class Learner(Process): self._encoder = None self._map = None self._df = _args['data'] if 'data' in _args else None + self.name = self.__class__.__name__ # @@ -92,10 +93,22 @@ class Learner(Process): if self._df is None : self._df = reader.read(**_read_args) columns = self.columns if self.columns else self._df.columns + # + # Below is a source of inefficiency, unfortunately python's type inference doesn't work well in certain cases + # - The code below tries to address the issue (Perhaps better suited for the reading components) + for name in columns : + _index = np.random.choice(np.arange(self._df[name].size),5,False) + no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]] + print ([name,np.sum(no_value)]) + no_value = 0 if np.sum(no_value) > 0 else '' + + self._df[name] = self._df[name].fillna(no_value) + + # # convert the data to binary here ... 
- - _args = {"schema":self.get_schema(),"data":self._df,"columns":columns} + _schema = self.get_schema() + _args = {"schema":_schema,"data":self._df,"columns":columns} if self._map : _args['map'] = self._map self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None From 1dae4ffba8cba71f1cf8daf792cd0aa8b795431c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 17 May 2022 13:27:13 -0500 Subject: [PATCH 219/250] bug fix: missing data, adding an additional type: pandas._lib.missing.NAType in addition to np.nan, np.na --- data/maker/__init__.py | 3 +-- data/maker/prepare/__init__.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index b9b48e4..c8dc02a 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -98,8 +98,7 @@ class Learner(Process): # - The code below tries to address the issue (Perhaps better suited for the reading components) for name in columns : _index = np.random.choice(np.arange(self._df[name].size),5,False) - no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]] - print ([name,np.sum(no_value)]) + no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]] no_value = 0 if np.sum(no_value) > 0 else '' self._df[name] = self._df[name].fillna(no_value) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 17da778..45fc61c 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -196,7 +196,6 @@ class Input : :param rows np.array or list of vector of values :param cols a space of values if it were to be different fromt he current sample. """ - if not cols: # # In the advent the sample rows do NOT have the values of the From 377e84daea23ad126ea787102449b4ffa09b1fd3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 17 May 2022 18:04:05 -0500 Subject: [PATCH 220/250] bug fix: uploading data --- data/maker/__init__.py | 72 ++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 27 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index c8dc02a..d05509d 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -96,14 +96,17 @@ class Learner(Process): # # Below is a source of inefficiency, unfortunately python's type inference doesn't work well in certain cases # - The code below tries to address the issue (Perhaps better suited for the reading components) + _log = {} for name in columns : _index = np.random.choice(np.arange(self._df[name].size),5,False) no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]] no_value = 0 if np.sum(no_value) > 0 else '' self._df[name] = self._df[name].fillna(no_value) - - + + _log[name] = self._df[name].dtypes.name + _log = {'action':'structure','input':_log} + self.log(**_log) # # convert the data to binary here ... 
_schema = self.get_schema() @@ -293,46 +296,52 @@ class Generator (Learner): name = _item['name'] if _item['type'].upper() in ['DATE','DATETIME','TIMESTAMP'] : - FORMAT = '%Y-%m-%d' + FORMAT = '%m-%d-%Y' - try: - # - #-- Sometimes data isn't all it's meant to be - SIZE = -1 - if 'format' in self.info and name in self.info['format'] : - FORMAT = self.info['format'][name] - SIZE = 10 - elif _item['type'] in ['DATETIME','TIMESTAMP'] : - FORMAT = '%Y-%m-%d %H:%M:%S' - SIZE = 19 + # try: + # # + # #-- Sometimes data isn't all it's meant to be + # SIZE = -1 + # if 'format' in self.info and name in self.info['format'] : + # FORMAT = self.info['format'][name] + # SIZE = 10 + # elif _item['type'] in ['DATETIME','TIMESTAMP'] : + # FORMAT = '%m-%d-%Y %H:%M:%S' + # SIZE = 19 - if SIZE > 0 : + # if SIZE > 0 : + + # values = pd.to_datetime(_df[name], format=FORMAT).astype(str) + # _df[name] = [_date[:SIZE].strip() for _date in values] - values = pd.to_datetime(_df[name], format=FORMAT).astype(str) - _df[name] = [_date[:SIZE] for _date in values] + # # _df[name] = _df[name].astype(str) + # r[name] = FORMAT + # # _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') + # if _item['type'] in ['DATETIME','TIMESTAMP']: + # pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]') - r[name] = FORMAT - # _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') - if _item['type'] in ['DATETIME','TIMESTAMP']: - pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]') - - except Exception as e: - pass - finally: - pass + # except Exception as e: + # pass + # finally: + # pass else: # # Because types are inferred on the basis of the sample being processed they can sometimes be wrong # To help disambiguate we add the schema information _type = None + if 'int' in _df[name].dtypes.name or 'int' in _item['type'].lower(): _type = np.int + elif 'float' in _df[name].dtypes.name or 'float' in _item['type'].lower(): _type = np.float if _type : - _df[name] = _df[name].fillna(0).replace('',0).astype(_type) + + _df[name] = _df[name].fillna(0).replace('',0).replace('NA',0).replace('nan',0).astype(_type) + # else: + # _df[name] = _df[name].astype(str) # _df = _df.replace('NaT','').replace('NA','') if r : @@ -373,10 +382,19 @@ class Generator (Learner): _schema = self.get_schema() _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] _df = self.format(_df,_schema) + _log = [{"name":_schema[i]['name'],"dataframe":_df[_df.columns[i]].dtypes.name,"schema":_schema[i]['type']} for i in np.arange(len(_schema)) ] + self.log(**{"action":"consolidate","input":_log}) + + # w = transport.factory.instance(doc='observation',provider='mongodb',context='write',db='IOV01_LOGS',auth_file='/home/steve/dev/transport/mongo.json') + # w.write(_df) + # print (_df[cols]) writer = transport.factory.instance(**_store) writer.write(_df,schema=_schema) - # _df.to_csv('foo.csv') + + + + self.log(**{'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) class Shuffle(Generator): From f3598efa0d3399516d4d4178671abe55605419de Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 17 May 2022 19:10:33 -0500 Subject: [PATCH 221/250] bug fix: date conversions --- data/maker/__init__.py | 49 +++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index d05509d..403255c 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -296,35 +296,35 @@ class Generator 
(Learner): name = _item['name'] if _item['type'].upper() in ['DATE','DATETIME','TIMESTAMP'] : - FORMAT = '%m-%d-%Y' + FORMAT = '%Y-%m-%d' - # try: - # # - # #-- Sometimes data isn't all it's meant to be - # SIZE = -1 - # if 'format' in self.info and name in self.info['format'] : - # FORMAT = self.info['format'][name] - # SIZE = 10 - # elif _item['type'] in ['DATETIME','TIMESTAMP'] : - # FORMAT = '%m-%d-%Y %H:%M:%S' - # SIZE = 19 + try: + # + #-- Sometimes data isn't all it's meant to be + SIZE = -1 + if 'format' in self.info and name in self.info['format'] : + FORMAT = self.info['format'][name] + SIZE = 10 + elif _item['type'] in ['DATETIME','TIMESTAMP'] : + FORMAT = '%Y-%m-%-d %H:%M:%S' + SIZE = 19 - # if SIZE > 0 : + if SIZE > 0 : - # values = pd.to_datetime(_df[name], format=FORMAT).astype(str) - # _df[name] = [_date[:SIZE].strip() for _date in values] + values = pd.to_datetime(_df[name], format=FORMAT).astype(np.datetime64) + # _df[name] = [_date[:SIZE].strip() for _date in values] - # # _df[name] = _df[name].astype(str) - # r[name] = FORMAT - # # _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') - # if _item['type'] in ['DATETIME','TIMESTAMP']: - # pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]') + # _df[name] = _df[name].astype(str) + r[name] = FORMAT + # _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') + if _item['type'] in ['DATETIME','TIMESTAMP']: + pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]') - # except Exception as e: - # pass - # finally: - # pass + except Exception as e: + pass + finally: + pass else: # @@ -387,10 +387,11 @@ class Generator (Learner): # w = transport.factory.instance(doc='observation',provider='mongodb',context='write',db='IOV01_LOGS',auth_file='/home/steve/dev/transport/mongo.json') # w.write(_df) + cols = [name for name in _df.columns if name.endswith('datetime')] # print (_df[cols]) writer = transport.factory.instance(**_store) - writer.write(_df,schema=_schema) + writer.write(_df[cols],schema=[_item for _item in _schema if _item['name'] in cols]) From e8edf886adfc0bd4e05b9ff40b137cc9667beb77 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 10 Jun 2022 13:00:28 -0500 Subject: [PATCH 222/250] bug fix: write data --- data/maker/__init__.py | 7 ++++--- setup.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 403255c..1666a42 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -380,18 +380,19 @@ class Generator (Learner): _schema = self.get_schema() - _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] + _schema = [{'name':_item.name,'type':(_item.field_type if has_attr(_item,'field_type') else 'VARCHAR(256)')} for _item in _schema] _df = self.format(_df,_schema) _log = [{"name":_schema[i]['name'],"dataframe":_df[_df.columns[i]].dtypes.name,"schema":_schema[i]['type']} for i in np.arange(len(_schema)) ] self.log(**{"action":"consolidate","input":_log}) # w = transport.factory.instance(doc='observation',provider='mongodb',context='write',db='IOV01_LOGS',auth_file='/home/steve/dev/transport/mongo.json') # w.write(_df) - cols = [name for name in _df.columns if name.endswith('datetime')] + # cols = [name for name in _df.columns if name.endswith('datetime')] + # print (_df[cols]) writer = transport.factory.instance(**_store) - writer.write(_df[cols],schema=[_item for _item in _schema if _item['name'] in cols]) + writer.write(_df,schema=[_item for 
_item in _schema if _item['name'] in cols]) diff --git a/setup.py b/setup.py index b5d3733..8da19f3 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.5.4", +args = {"name":"data-maker","version":"1.5.5", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] From 44d621941d12c768025bf3f1394f6ec06b6ef411 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 10 Jun 2022 13:16:11 -0500 Subject: [PATCH 223/250] bug fix --- data/maker/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 1666a42..e8e5363 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -380,7 +380,7 @@ class Generator (Learner): _schema = self.get_schema() - _schema = [{'name':_item.name,'type':(_item.field_type if has_attr(_item,'field_type') else 'VARCHAR(256)')} for _item in _schema] + _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] _df = self.format(_df,_schema) _log = [{"name":_schema[i]['name'],"dataframe":_df[_df.columns[i]].dtypes.name,"schema":_schema[i]['type']} for i in np.arange(len(_schema)) ] self.log(**{"action":"consolidate","input":_log}) @@ -388,11 +388,10 @@ class Generator (Learner): # w = transport.factory.instance(doc='observation',provider='mongodb',context='write',db='IOV01_LOGS',auth_file='/home/steve/dev/transport/mongo.json') # w.write(_df) # cols = [name for name in _df.columns if name.endswith('datetime')] - # print (_df[cols]) writer = transport.factory.instance(**_store) - writer.write(_df,schema=[_item for _item in _schema if _item['name'] in cols]) + writer.write(_df[:],schema=[_item for _item in _schema if _item['name'] in cols]) From 3087e98bc06a2099f6c271f2dd896783930ccf3a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 10 Jun 2022 13:33:51 -0500 Subject: [PATCH 224/250] bug fix --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index e8e5363..056cbbc 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -391,7 +391,7 @@ class Generator (Learner): # print (_df[cols]) writer = transport.factory.instance(**_store) - writer.write(_df[:],schema=[_item for _item in _schema if _item['name'] in cols]) + writer.write(_df[:],schema=_schema) From 7bf0b8e5839f95502f94280e5ad93d75e979a26f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 10 Jun 2022 14:52:55 -0500 Subject: [PATCH 225/250] bug fix --- data/maker/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 056cbbc..71d9c7b 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -98,6 +98,10 @@ class Learner(Process): # - The code below tries to address the issue (Perhaps better suited for the reading components) _log = {} for name in columns : + # + # randomly sampling 5 elements to make sense of data-types + if self._df[name].size < 5 : + continue _index = np.random.choice(np.arange(self._df[name].size),5,False) no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]] no_value 
= 0 if np.sum(no_value) > 0 else '' From 7e92571d0acf99d042a2c43b3621c50eb831cafe Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Jun 2022 12:24:56 -0500 Subject: [PATCH 226/250] bug fix: errors occasionally --- data/maker/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 71d9c7b..60fc418 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -105,8 +105,10 @@ class Learner(Process): _index = np.random.choice(np.arange(self._df[name].size),5,False) no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]] no_value = 0 if np.sum(no_value) > 0 else '' - - self._df[name] = self._df[name].fillna(no_value) + try: + self._df[name] = self._df[name].fillna(no_value) + finally: + pass _log[name] = self._df[name].dtypes.name _log = {'action':'structure','input':_log} From d89daf76d6758424a6c316b1690fce364f32ef4f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 16 Jun 2022 23:56:16 -0500 Subject: [PATCH 227/250] bug fixes --- data/maker/__init__.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 60fc418..541db37 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -59,7 +59,7 @@ class Learner(Process): # @TODO: allow for verbose mode so we have a sens of what is going on within the newtork # - _log = {'action':'init','context':self.info['context'],'gpu':(self.gpu if self.gpu is not None else -1)} + _log = {'action':'init','gpu':(self.gpu if self.gpu is not None else -1)} self.log(**_log) # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' @@ -67,8 +67,10 @@ class Learner(Process): def log(self,**_args): # self.lock.acquire() try: + _context = self.info['context'] + _label = self.info['info'] if 'info' in self.info else _context logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) - _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'info':self.info['context'],**_args}) + _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_info,**_args}) logger.write(_args) self.ndx += 1 if hasattr(logger,'close') : @@ -345,7 +347,7 @@ class Generator (Learner): _type = np.float if _type : - _df[name] = _df[name].fillna(0).replace('',0).replace('NA',0).replace('nan',0).astype(_type) + _df[name] = _df[name].fillna(0).replace(' ',0).replace('',0).replace('NA',0).replace('nan',0).astype(_type) # else: # _df[name] = _df[name].astype(str) # _df = _df.replace('NaT','').replace('NA','') @@ -397,7 +399,10 @@ class Generator (Learner): # print (_df[cols]) writer = transport.factory.instance(**_store) - writer.write(_df[:],schema=_schema) + if _store['provider'] == 'bigquery': + writer.write(_df,schema=[],table=self.info['from']) + else: + writer.write(_df,table=self.info['from']) From 899db5c0368e5d4e3c04ddfb618c4e38ee1ae5da Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 17 Jun 2022 00:17:00 -0500 Subject: [PATCH 228/250] bug fixes --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 541db37..2d1e1f8 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -70,7 +70,7 @@ class Learner(Process): _context = self.info['context'] _label = 
self.info['info'] if 'info' in self.info else _context logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) - _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_info,**_args}) + _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_label,**_args}) logger.write(_args) self.ndx += 1 if hasattr(logger,'close') : From 322b21aaacccaf2458caff72fd0a090c48c7d371 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 9 Aug 2022 12:22:07 -0500 Subject: [PATCH 229/250] bug fix: encoding/decoding to improve correlations between attributes --- data/maker/__init__.py | 123 ++++++++++++++++++++++----------- data/maker/prepare/__init__.py | 57 ++++++++++++++- 2 files changed, 136 insertions(+), 44 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 2d1e1f8..0d8bf33 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -13,13 +13,17 @@ import numpy as np import data.gan as gan import transport # from data.bridge import Binary -import threading as thread +import threading from data.maker import prepare import copy import os -import json +import nujson as json from multiprocessing import Process, RLock from datetime import datetime, timedelta +from multiprocessing import Queue + +import time + class Learner(Process): @@ -28,6 +32,7 @@ class Learner(Process): super(Learner, self).__init__() self.ndx = 0 + self._queue = Queue() self.lock = RLock() if 'gpu' in _args : @@ -61,34 +66,38 @@ class Learner(Process): _log = {'action':'init','gpu':(self.gpu if self.gpu is not None else -1)} self.log(**_log) - + self.cache = [] # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' # sel.max_epoc def log(self,**_args): - # self.lock.acquire() + try: - _context = self.info['context'] - _label = self.info['info'] if 'info' in self.info else _context - logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) - _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_label,**_args}) - logger.write(_args) - self.ndx += 1 - if hasattr(logger,'close') : - logger.close() + # _context = self.info['context'] + # _label = self.info['info'] if 'info' in self.info else _context + # logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider=transport.providers.CONSOLE,context='write',lock=True) + # _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_label,**_args}) + # logger.write(_args) + # self.ndx += 1 + # if hasattr(logger,'close') : + # logger.close() + pass except Exception as e: print () print (_args) print (e) pass finally: - # self.lock.release() + pass def get_schema(self): - if self.store['source']['provider'] != 'bigquery' : - return [] #{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] - else: - reader = transport.factory.instance(**self.store['source']) - return reader.meta(table=self.info['from']) + # if self.store['source']['provider'] != 'bigquery' : + # return [] #{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] + # else: + # reader = 
transport.factory.instance(**self.store['source']) + # return reader.meta(table=self.info['from']) + reader = transport.factory.instance(**self.store['source']) + return reader.meta(table=self.info['from']) + def initalize(self): reader = transport.factory.instance(**self.store['source']) _read_args= self.info @@ -124,6 +133,25 @@ class Learner(Process): self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None _log = {'action':'data-prep','input':{'rows':int(self._df.shape[0]),'cols':int(self._df.shape[1]) } } self.log(**_log) + def get(self): + + if self.cache : + return self.cache if len(self.cache) > 0 else(self.cache if not self.cache else self.cache[0]) + else: + return self._queue.get() if self._queue.qsize() > 0 else [] + + def listen(self): + while True : + _info = self._queue.get() + self.cache.append(_info) + self._queue.task_done() + def publish(self,caller): + if hasattr(caller,'_queue') : + _queue = caller._queue + _queue.put(self.cache) + + # _queue.join() + pass class Trainer(Learner): """ This will perform training using a GAN @@ -157,7 +185,8 @@ class Trainer(Learner): gTrain = gan.Train(**_args) gTrain.apply() - writer = transport.factory.instance(provider='file',context='write',path=os.sep.join([gTrain.out_dir,'map.json'])) + writer = transport.factory.instance(provider=transport.providers.FILE,context='write',path=os.sep.join([gTrain.out_dir,'map.json'])) + writer.write(self._encoder._map,overwrite=True) writer.close() @@ -174,9 +203,14 @@ class Trainer(Learner): _min = float((end-beg).seconds/ 60) _logs = {'action':'train','input':{'start':beg.strftime('%Y-%m-%d %H:%M:%S'),'minutes':_min,"unique_counts":self._encoder._io[0]}} self.log(**_logs) - self.generate = g - if self.autopilot : - self.generate.run() + self._g = g + if self.autopilot : + self._g.run() + # + #@TODO Find a way to have the data in the object .... + + + def generate (self): if self.autopilot : print( "Autopilot is set ... 
No need to call this function") @@ -224,6 +258,7 @@ class Generator (Learner): _size = np.sum([len(_item) for _item in _iomatrix]) _log = {'action':'io-data','input':{'candidates':len(_candidates),'rows':int(_size)}} self.log(**_log) + # self.cache = _candidates self.post(_candidates) def approximate(self,_df): _columns = self.info['approximate'] @@ -359,12 +394,14 @@ class Generator (Learner): pass def post(self,_candidates): - _store = self.store['target'] if 'target' in self.store else {'provider':'console'} - _store['lock'] = True - _store['context'] = 'write' #-- Just in case - if 'table' not in _store : - _store['table'] = self.info['from'] - + if 'target' in self.store : + _store = self.store['target'] if 'target' in self.store else {'provider':'console'} + _store['lock'] = True + _store['context'] = 'write' #-- Just in case + if 'table' not in _store : + _store['table'] = self.info['from'] + else: + _store = None N = 0 for _iodf in _candidates : _df = self._df.copy() @@ -397,13 +434,15 @@ class Generator (Learner): # w.write(_df) # cols = [name for name in _df.columns if name.endswith('datetime')] # print (_df[cols]) - - writer = transport.factory.instance(**_store) - if _store['provider'] == 'bigquery': - writer.write(_df,schema=[],table=self.info['from']) + if _store : + writer = transport.factory.instance(**_store) + if _store['provider'] == 'bigquery': + writer.write(_df,schema=[],table=self.info['from']) + else: + writer.write(_df,table=self.info['from']) else: - writer.write(_df,table=self.info['from']) - + self.cache.append(_df) + @@ -444,6 +483,8 @@ class Shuffle(Generator): except Exception as e : # print (e) self.log(**{'action':'failed','input':{'msg':e,'info':self.info}}) +class apply : + TRAIN,GENERATE,RANDOM = 'train','generate','random' class factory : _infocache = {} @staticmethod @@ -459,12 +500,12 @@ class factory : :param batch (default 2k) size of the batch """ - if _args['apply'] == 'shuffle' : - return Shuffle(**_args) - elif _args['apply'] == 'generate' : - return Generator(**_args) + if _args['apply'] in [apply.RANDOM] : + pthread = Shuffle(**_args) + elif _args['apply'] == apply.GENERATE : + pthread = Generator(**_args) else: pthread= Trainer(**_args) - if 'start' in _args and _args['start'] == True : - pthread.start() - return pthread \ No newline at end of file + if 'start' in _args and _args['start'] == True : + pthread.start() + return pthread \ No newline at end of file diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 45fc61c..d589c17 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -47,6 +47,15 @@ class Input : :param sql sql query that pulls a representative sample of the data """ self._schema = _args['schema'] if 'schema' in _args else {} + # + # schema data should be in a hash map for these purposes + # + if self._schema : + r = {} + for _item in self._schema : + r[_item['name']] = r[_item['type']] + self._schema = r + self.df = _args['data'] if 'sql' not in _args : self._initdata(**_args) @@ -60,6 +69,7 @@ class Input : # self._map = {} if 'map' not in _args else _args['map'] + def _initsql(self,**_args): """ This function will initialize the class on the basis of a data-store and optionally pre-defined columns to be used to be synthesized @@ -73,6 +83,10 @@ class Input : self._initcols(data=self.df,columns=_args['columns']) pass + def _init_map(self,values): + self._map = dict(zip(np.arange(len(values)),values)) + for key in self._map : + self._map[key] = self._map[key].tolist() 
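# --- [illustrative sketch added by the editor; not part of PATCH 229/250] ---
# _init_map (added above) keeps the distinct value-tuples of the synthesized columns,
# keyed by their index, and converts the numpy rows to plain lists so the map can be
# serialized later (e.g. into the map.json written after training). Hypothetical values:
import numpy as np

values = np.array([['F', 'full-time'], ['M', 'part-time'], ['F', 'part-time']])
_map = dict(zip(np.arange(len(values)), values))
_map = {key: _map[key].tolist() for key in _map}
# each integer key now maps to a plain list, e.g. _map[0] == ['F', 'full-time']
# --- [end of editor's sketch] ---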
def _initcols (self,**_args) : """ This function will initialize the columns to be synthesized and/or determine which ones can be synthesized @@ -109,7 +123,7 @@ class Input : """ self._initcols(**_args) - def convert(self,**_args): + def _convert(self,**_args): """ This function will convert a data-frame into a binary matrix and provide a map to be able to map the values back to the matrix :param columns in case we specify the columns to account for (just in case the original assumptions don't hold) @@ -150,7 +164,7 @@ class Input : return _values,_m - def revert(self,**_args) : + def _revert(self,**_args) : """ This function will take in a binary matrix and based on the map of values it will repopulate it with values :param _matrix binary matrix @@ -186,7 +200,9 @@ class Input : # r[key] = [columns[np.where(row == 1) [0][0] ] for row in _matrix[:,_beg:_end]] r[key] = [columns[np.where(row==1)[0][0]] if np.where(row==1)[0].size > 0 else '' for row in _matrix] - + # + # we should consider decoding the matrix if possible + # return pd.DataFrame(r) @@ -217,4 +233,39 @@ class Input : return cols,_matrix + def convert(self,**_args): + if 'columns' in _args or 'column' in _args : + columns = _args['columns'] if 'columns' in _args else [_args['column']] + else: + columns = self._columns + _df = self.df if 'data' not in _args else _args['data'] + _values,_matrix = self.encode(_df,columns) + _, _matrix = self.tobinary(_matrix) + self._init_map(_values) + return _values,_matrix #-- matrix has been updated ! + def revert(self,**_args): + # _columns = _args['column'] if 'column' in _args else None + _matrix = _args['matrix'] + # print (_matrix) + return self.decode(_matrix,columns=self._columns) + pass + def encode(self,df,columns) : + _df = df[columns].drop_duplicates() + _values = _df.values.tolist() + _encoded = df[columns].apply(lambda row: _values.index( list(row)) ,axis=1) + return np.array(_values),_encoded + def decode (self,_matrix,**_args): + # + # _matrix binary matrix + # _values value space given the columns + # columns name of the columns ... 
+ # + + columns = _args['columns'] + _values = np.array( list(self._map.values())) + _matrix = pd.DataFrame(_matrix) #if type(_matrix) != pd.DataFrame else _matrix + x = _matrix.apply(lambda row: _values[row.values == 1 ].tolist()[0] if row.values.sum() > 0 else np.repeat(None,row.size), axis=1).tolist() + return pd.DataFrame(x,columns=columns) + + From 44d3f4989a7b0a6b401dba47d66725733932b467 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 09:33:12 -0500 Subject: [PATCH 230/250] version update --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8da19f3..0e70341 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.5.5", +args = {"name":"data-maker","version":"1.5.6", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] From 3aee3e2caea14465ee878cc577098659c90a9303 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 10:56:40 -0500 Subject: [PATCH 231/250] bug fix: schema --- data/maker/prepare/__init__.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index d589c17..8da73c3 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -46,15 +46,15 @@ class Input : :param store data-store parameters/configuration :param sql sql query that pulls a representative sample of the data """ - self._schema = _args['schema'] if 'schema' in _args else {} - # - # schema data should be in a hash map for these purposes - # - if self._schema : - r = {} - for _item in self._schema : - r[_item['name']] = r[_item['type']] - self._schema = r + # self._schema = _args['schema'] if 'schema' in _args else {} + # # + # # schema data should be in a hash map for these purposes + # # + # if self._schema : + # r = {} + # for _item in self._schema : + # r[_item['name']] = r[_item['type']] + # self._schema = r self.df = _args['data'] if 'sql' not in _args : From 4013fb8fd5b5ab3ed06f3ca3b28e4de922e8848f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 11:29:21 -0500 Subject: [PATCH 232/250] minor bug fix, got fixed in data-transport returning properly formatted meta data --- data/maker/__init__.py | 2 +- data/maker/prepare/__init__.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 0d8bf33..7ea2c74 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -425,7 +425,7 @@ class Generator (Learner): _schema = self.get_schema() - _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] + # _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] _df = self.format(_df,_schema) _log = [{"name":_schema[i]['name'],"dataframe":_df[_df.columns[i]].dtypes.name,"schema":_schema[i]['type']} for i in np.arange(len(_schema)) ] self.log(**{"action":"consolidate","input":_log}) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 8da73c3..4b0bfd3 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -46,15 +46,15 @@ class Input : :param 
store data-store parameters/configuration :param sql sql query that pulls a representative sample of the data """ - # self._schema = _args['schema'] if 'schema' in _args else {} - # # - # # schema data should be in a hash map for these purposes - # # + self._schema = _args['schema'] if 'schema' in _args else {} + # + # schema data should be in a hash map for these purposes + # # if self._schema : # r = {} # for _item in self._schema : # r[_item['name']] = r[_item['type']] - # self._schema = r + # self._schema = r self.df = _args['data'] if 'sql' not in _args : From 23b3c52230b7109cf8696877059262aafad90ca3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 12:42:11 -0500 Subject: [PATCH 233/250] bug fix: decoding matrix --- data/maker/prepare/__init__.py | 3 ++- setup.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 4b0bfd3..c91c773 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -264,7 +264,8 @@ class Input : columns = _args['columns'] _values = np.array( list(self._map.values())) _matrix = pd.DataFrame(_matrix) #if type(_matrix) != pd.DataFrame else _matrix - x = _matrix.apply(lambda row: _values[row.values == 1 ].tolist()[0] if row.values.sum() > 0 else np.repeat(None,row.size), axis=1).tolist() + # x = _matrix.apply(lambda row: _values[row.values == 1 ].tolist()[0] if row.values.sum() > 0 else None, axis=1).tolist() + x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else None ,axis=1).tolist() return pd.DataFrame(x,columns=columns) diff --git a/setup.py b/setup.py index 0e70341..ba52b61 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.5.6", +args = {"name":"data-maker","version":"1.5.8", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] From 7ad00166178a50db734ead47ddfebcb5b2324448 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 14:44:29 -0500 Subject: [PATCH 234/250] bug fix: empty row handling --- data/maker/prepare/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index c91c773..f025294 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -265,7 +265,7 @@ class Input : _values = np.array( list(self._map.values())) _matrix = pd.DataFrame(_matrix) #if type(_matrix) != pd.DataFrame else _matrix # x = _matrix.apply(lambda row: _values[row.values == 1 ].tolist()[0] if row.values.sum() > 0 else None, axis=1).tolist() - x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else None ,axis=1).tolist() + x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.repeat(None,len(self._columns)) ,axis=1).tolist() return pd.DataFrame(x,columns=columns) From e47ffb3fae96adc99f52b601a891e3d65fd4ae31 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 17:06:33 -0500 Subject: [PATCH 235/250] bug fix: random empty values --- data/maker/prepare/__init__.py | 6 +++++- 1 file 
changed, 5 insertions(+), 1 deletion(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index f025294..1fae46c 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -265,7 +265,11 @@ class Input : _values = np.array( list(self._map.values())) _matrix = pd.DataFrame(_matrix) #if type(_matrix) != pd.DataFrame else _matrix # x = _matrix.apply(lambda row: _values[row.values == 1 ].tolist()[0] if row.values.sum() > 0 else None, axis=1).tolist() - x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.repeat(None,len(self._columns)) ,axis=1).tolist() + #@TODO: Provide random values for things that are missing + + # x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.repeat(None,len(self._columns)) ,axis=1).tolist() + + x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.random.choice(np.matrix.flatten(_values,1)).tolist() ,axis=1).tolist() return pd.DataFrame(x,columns=columns) From afad88411811c67000514f75c3cc4b79c6a38455 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 17:23:42 -0500 Subject: [PATCH 236/250] bug fix: random empty values --- data/maker/prepare/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 1fae46c..a19fd31 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -269,7 +269,7 @@ class Input : # x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.repeat(None,len(self._columns)) ,axis=1).tolist() - x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.random.choice(np.matrix.flatten(_values,1)).tolist() ,axis=1).tolist() + x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else _values[np.random.choice(_values.size,1 )].tolist() ,axis=1).tolist() return pd.DataFrame(x,columns=columns) From d8aad070eeea60d7ed3f59340386287e8ec01c1e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 17:30:28 -0500 Subject: [PATCH 237/250] bug fix: random empty values --- data/maker/prepare/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index a19fd31..15cbe99 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -268,8 +268,9 @@ class Input : #@TODO: Provide random values for things that are missing # x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.repeat(None,len(self._columns)) ,axis=1).tolist() - - x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else _values[np.random.choice(_values.size,1 )].tolist() ,axis=1).tolist() + novalues = _values[np.random.choice(_values.size,1)[0]].tolist() + # novalues = np.repeat(None,len(self._columns)) + x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else novalues ,axis=1).tolist() return pd.DataFrame(x,columns=columns) From 96ac4cd9cbc5b3e1609c9381c017d6a2b7645951 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 17:33:48 -0500 Subject: [PATCH 238/250] bug fix: random empty values --- data/maker/prepare/__init__.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 15cbe99..1adc44d 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -268,7 +268,7 @@ class Input : #@TODO: Provide random values for things that are missing # x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.repeat(None,len(self._columns)) ,axis=1).tolist() - novalues = _values[np.random.choice(_values.size,1)[0]].tolist() + novalues = _values[np.random.choice( len(_values),1)[0]].tolist() # novalues = np.repeat(None,len(self._columns)) x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else novalues ,axis=1).tolist() return pd.DataFrame(x,columns=columns) From d42d601be7adeb6573a3824d607f300bcf271fda Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 31 Aug 2022 12:51:48 -0500 Subject: [PATCH 239/250] bug fix & enhancements --- data/gan.py | 3 ++- data/maker/__init__.py | 18 +++++++++++++----- data/maker/prepare/__init__.py | 15 +++++++++++---- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/data/gan.py b/data/gan.py index 26f19a2..812426a 100644 --- a/data/gan.py +++ b/data/gan.py @@ -533,7 +533,7 @@ class Train (GNet): print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) # print (dir (w_distance)) - logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) + logs.append({"epoch": int(epoch),"distance":float(-w_sum/(self.STEPS_PER_EPOCH*2)) }) # if epoch % self.MAX_EPOCHS == 0: if epoch in [5,10,20,50,75, self.MAX_EPOCHS] : @@ -547,6 +547,7 @@ class Train (GNet): if self.logger : row = {"module":"gan-train","action":"logs","input":{"partition":self.PARTITION,"logs":logs}} #,"model":pickle.dump(sess)} self.logger.write(row) + # # @TODO: # We should upload the files in the checkpoint diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 7ea2c74..50ac8c1 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -69,15 +69,19 @@ class Learner(Process): self.cache = [] # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' # sel.max_epoc + self.logger = None + if 'logger' in self.store : + self.logger = transport.factory.instance(**self.store['logger']) def log(self,**_args): try: - # _context = self.info['context'] - # _label = self.info['info'] if 'info' in self.info else _context + _context = self.info['context'] + _label = self.info['info'] if 'info' in self.info else _context # logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider=transport.providers.CONSOLE,context='write',lock=True) - # _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_label,**_args}) - # logger.write(_args) - # self.ndx += 1 + _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_label,**_args}) + if self.logger: + self.logger.write(_args) + self.ndx += 1 # if hasattr(logger,'close') : # logger.close() pass @@ -178,6 +182,8 @@ class Trainer(Learner): _args['gpu'] = self.gpu _args['real'] = _matrix _args['candidates'] = self.candidates + if self.logger : + _args['logger'] = transport.factory.instance(**self.store['logger']) # # At this point we have the binary matrix, we can initiate training # @@ -250,6 +256,8 @@ class Generator (Learner): _args['row_count'] = self._df.shape[0] if self.gpu : _args['gpu'] = self.gpu + if 
self.logger : + _args['logger'] = transport.factory.instance(**self.store['logger']) gHandler = gan.Predict(**_args) gHandler.load_meta(columns=None) _iomatrix = gHandler.apply() diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 1adc44d..c8331bd 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -34,6 +34,8 @@ class Hardware : pass class Input : + class NOVALUES : + RANDOM,IGNORE,ALWAYS = ['random','ignore','always'] """ This class is designed to read data from a source and and perform a variet of operations : - provide a feature space, and rows (matrix profile) @@ -257,8 +259,6 @@ class Input : def decode (self,_matrix,**_args): # # _matrix binary matrix - # _values value space given the columns - # columns name of the columns ... # columns = _args['columns'] @@ -268,8 +268,15 @@ class Input : #@TODO: Provide random values for things that are missing # x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.repeat(None,len(self._columns)) ,axis=1).tolist() - novalues = _values[np.random.choice( len(_values),1)[0]].tolist() - # novalues = np.repeat(None,len(self._columns)) + # + # @TODO: Provide a parameter to either: + # - missing = {outlier,random,none} + # - outlier: select an outlier, random: randomly select a value, none: do nothing ... + # + if np.random.choice([0,1],1)[0] : + novalues = _values[np.random.choice( len(_values),1)[0]].tolist() + else: + novalues = np.repeat(None,len(self._columns)) x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else novalues ,axis=1).tolist() return pd.DataFrame(x,columns=columns) From 7af3c3db6ac20465df98430e0429964b8f164b75 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 31 Aug 2022 17:26:45 -0500 Subject: [PATCH 240/250] bug fix --- data/maker/__init__.py | 12 ++++++------ setup.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 50ac8c1..bc5d9cc 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -79,16 +79,16 @@ class Learner(Process): _label = self.info['info'] if 'info' in self.info else _context # logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider=transport.providers.CONSOLE,context='write',lock=True) _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_label,**_args}) - if self.logger: + if hasattr(self,'logger') : self.logger.write(_args) self.ndx += 1 # if hasattr(logger,'close') : # logger.close() pass except Exception as e: - print () - print (_args) - print (e) + # print () + # print (_args) + # print (e) pass finally: @@ -182,7 +182,7 @@ class Trainer(Learner): _args['gpu'] = self.gpu _args['real'] = _matrix _args['candidates'] = self.candidates - if self.logger : + if 'logger' in self.store : _args['logger'] = transport.factory.instance(**self.store['logger']) # # At this point we have the binary matrix, we can initiate training @@ -256,7 +256,7 @@ class Generator (Learner): _args['row_count'] = self._df.shape[0] if self.gpu : _args['gpu'] = self.gpu - if self.logger : + if 'logger' in self.store : _args['logger'] = transport.factory.instance(**self.store['logger']) gHandler = gan.Predict(**_args) gHandler.load_meta(columns=None) diff --git a/setup.py b/setup.py index ba52b61..7b06af8 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def 
read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.5.8", +args = {"name":"data-maker","version":"1.5.9", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] From 4398212cafec8c8454b25ea85a4d06c3d2f154b9 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 31 Aug 2022 17:34:43 -0500 Subject: [PATCH 241/250] bug fix --- data/maker/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index bc5d9cc..cdc48e2 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -77,10 +77,11 @@ class Learner(Process): try: _context = self.info['context'] _label = self.info['info'] if 'info' in self.info else _context - # logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider=transport.providers.CONSOLE,context='write',lock=True) + # logger = _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_label,**_args}) - if hasattr(self,'logger') : - self.logger.write(_args) + if 'logger' in self.store : + logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider=transport.providers.CONSOLE,context='write',lock=True) + logger.write(_args) self.ndx += 1 # if hasattr(logger,'close') : # logger.close() From 0efd4b13bc01fd7ebc0a000997f67c4e9defce38 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 16 Sep 2022 18:18:15 -0500 Subject: [PATCH 242/250] bug fix: crash with dataset & epochs --- README.md | 12 ++-- data/__init__.py | 1 + data/gan.py | 91 ++++++++++++++++++++++------ data/maker/__init__.py | 105 ++++++++++++++++++++++++++------- data/maker/prepare/__init__.py | 2 +- 5 files changed, 167 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index f3c92ed..32224c2 100644 --- a/README.md +++ b/README.md @@ -13,17 +13,19 @@ This package is designed to generate synthetic data from a dataset from an origi After installing the easiest way to get started is as follows (using pandas). The process is as follows: +Read about [data-transport on github](https://github.com/lnyemba/data-transport) or on [healthcareio.the-phi.com/git/code/transport](https://healthcareio.the-phi.com/git/code/transport.git) + **Train the GAN on the original/raw dataset** +1. We define the data sources + +The sources will consists in source, target and logger20. import pandas as pd import data.maker + import transport + from transport import providers - df = pd.read_csv('sample.csv') - column = 'gender' - id = 'id' - context = 'demo' - data.maker.train(context=context,data=df,column=column,id=id,logs='logs') The trainer will store the data on disk (for now) in a structured folder that will hold training models that will be used to generate the synthetic data. 
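[Editor's sketch, not part of the patch: the README fragment above introduces the source/target/logger definition, but the diff only shows the imports. Based solely on the fields these patches reference (store.source, store.target, store.logger, info.context, info.from, epochs, apply), a configuration could look roughly like the following; every provider, database and table name is a hypothetical placeholder, and the authoritative keys are those documented by data-maker and data-transport.]

    # hypothetical configuration handed to the data-maker factory/Trainer
    _args = {
        "apply": "train",                           # 'train', 'generate' or 'random' (see the apply class)
        "store": {
            "source": {"provider": "bigquery"},     # plus provider-specific connection settings
            "target": {"provider": "bigquery", "table": "people_io"},
            "logger": {"provider": "mongodb", "context": "write", "db": "logs", "doc": "maker"}
        },
        "info": {"context": "demo", "from": "people"},
        "epochs": 2
    }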
diff --git a/data/__init__.py b/data/__init__.py index 2b4a6aa..91b566d 100644 --- a/data/__init__.py +++ b/data/__init__.py @@ -3,3 +3,4 @@ from data.params import SYS_ARGS import transport from multiprocessing import Process, Queue from data.maker import prepare +from data.maker import state diff --git a/data/gan.py b/data/gan.py index 812426a..3727edb 100644 --- a/data/gan.py +++ b/data/gan.py @@ -100,6 +100,13 @@ class GNet : self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) + CHECKPOINT_SKIPS = 10 + if self.MAX_EPOCHS < 2*CHECKPOINT_SKIPS : + CHECKPOINT_SKIPS = 2 + self.CHECKPOINTS = np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist() + + + self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100 self.CONTEXT = args['context'] self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} @@ -120,14 +127,18 @@ class GNet : for key in ['train','output'] : self.mkdir(os.sep.join([self.log_dir,key])) self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT])) - if 'partition' in args : - self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT,str(args['partition'])])) - + # if 'partition' in args : + # self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT,str(args['partition'])])) self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) if 'partition' in args : self.train_dir = os.sep.join([self.train_dir,str(args['partition'])]) self.out_dir = os.sep.join([self.out_dir,str(args['partition'])]) + + for checkpoint in self.CHECKPOINTS : + self.mkdir (os.sep.join([self.train_dir,str(checkpoint)])) + self.mkdir (os.sep.join([self.out_dir,str(checkpoint)])) + # if self.logger : # We will clear the logs from the data-store @@ -150,12 +161,13 @@ class GNet : attr = json.loads((open(_name)).read()) for key in attr : value = attr[key] - setattr(self,key,value) + if not hasattr(self,key): + setattr(self,key,value) self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) - if 'partition' in args : - self.train_dir = os.sep.join([self.train_dir,str(args['partition'])]) - self.out_dir = os.sep.join([self.out_dir,str(args['partition'])]) + # if 'partition' in args : + # self.train_dir = os.sep.join([self.train_dir,str(args['partition'])]) + # self.out_dir = os.sep.join([self.out_dir,str(args['partition'])]) def log_meta(self,**args) : @@ -183,15 +195,24 @@ class GNet : suffix = self.CONTEXT #self.get.suffix() _name = os.sep.join([self.out_dir,'meta-'+suffix]) - f = open(_name+'.json','w') - f.write(json.dumps(_object)) + # f = open(_name+'.json','w') + # f.write(json.dumps(_object)) + # f.close() + + for _info in [{"name":os.sep.join([self.out_dir,'meta-'+suffix+'.json']),"data":_object},{"name":os.sep.join([self.out_dir,'epochs.json']),"data":self.logs['epochs'] if 'epochs' in self.logs else []}] : + f = open(_info['name'],'w') + f.write(json.dumps(_info['data'])) + f.close() return _object def mkdir (self,path): if not os.path.exists(path) : if os.sep in path : pass root = [] - for loc in path.split(os.sep) : + + for loc in path.strip().split(os.sep) : + if loc == '' : + root.append(os.sep) root.append(loc) if not os.path.exists(os.sep.join(root)) : 
os.mkdir(os.sep.join(root)) @@ -278,8 +299,10 @@ class Generator (GNet): tf.compat.v1.add_to_collection('glosses', loss) return loss, loss def load_meta(self, **args): - super().load_meta(**args) + # super().load_meta(**args) self.discriminator.load_meta(**args) + + def network(self,**args) : """ This function will build the network that will generate the synthetic candidates @@ -381,6 +404,7 @@ class Train (GNet): self.logger.write({"module":"gan-train","action":"start","input":{"partition":self.PARTITION,"meta":self.meta} } ) + # self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta) def load_meta(self, column): """ @@ -445,7 +469,7 @@ class Train (GNet): else : dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) # labels_placeholder = None - dataset = dataset.repeat(10000) + dataset = dataset.repeat(20000) dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) dataset = dataset.prefetch(1) @@ -472,9 +496,11 @@ class Train (GNet): if self._LABEL is not None : (real, label) = iterator.get_next() else: + real = iterator.get_next() label= None loss, w = self.loss(scope=scope, stage=stage, real=real, label=label) + #tf.get_variable_scope().reuse_variables() tf.compat.v1.get_variable_scope().reuse_variables() #vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage) @@ -507,6 +533,7 @@ class Train (GNet): # init = tf.global_variables_initializer() init = tf.compat.v1.global_variables_initializer() logs = [] + self.logs['epochs'] = [] #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: @@ -536,25 +563,41 @@ class Train (GNet): logs.append({"epoch": int(epoch),"distance":float(-w_sum/(self.STEPS_PER_EPOCH*2)) }) # if epoch % self.MAX_EPOCHS == 0: - if epoch in [5,10,20,50,75, self.MAX_EPOCHS] : + # if epoch in [5,10,20,50,75, self.MAX_EPOCHS] : + if epoch in self.CHECKPOINTS or int(epoch) == 1: # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] suffix = self.CONTEXT #self.get.suffix() - _name = os.sep.join([self.train_dir,suffix]) + _name = os.sep.join([self.train_dir,str(epoch),suffix]) # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) saver.save(sess, _name, write_meta_graph=False, global_step=epoch) + # # + + logs = [{"path":_name,"epochs":int(epoch),"loss":float(-w_sum/(self.STEPS_PER_EPOCH*2))}] if self.logger : - row = {"module":"gan-train","action":"logs","input":{"partition":self.PARTITION,"logs":logs}} #,"model":pickle.dump(sess)} - self.logger.write(row) - + # row = {"module":"gan-train","action":"epochs","input":{"logs":logs}} #,"model":pickle.dump(sess)} + # self.logger.write(row) + self.logs['epochs'] += logs # # @TODO: # We should upload the files in the checkpoint # This would allow the learnt model to be portable to another system # tf.compat.v1.reset_default_graph() - + # + # let's sort the epochs we've logged thus far (if any) + # + self.logs['epochs'].sort(key=lambda _item: _item['loss']) + if self.logger : + _log = {'module':'gan-train','action':'epochs','input':self.logs['epochs']} + self.logger.write(_log) + + # + # @TODO: + # Make another copy of this on disk to be able to load it should we not have a logger setup + # + self.log_meta() class Predict(GNet): """ This class uses synthetic data given a learned model @@ 
-565,6 +608,7 @@ class Predict(GNet): self.values = args['values'] self.ROW_COUNT = args['row_count'] self.oROW_COUNT = self.ROW_COUNT + # self.MISSING_VALUES = np.nan_to_num(np.nan) # if 'no_value' in args and args['no_value'] not in ['na','','NA'] : # self.MISSING_VALUES = args['no_value'] @@ -577,9 +621,20 @@ class Predict(GNet): super().load_meta(**args) self.generator.load_meta(**args) self.ROW_COUNT = self.oROW_COUNT + # + # updating the input/output for the generator, so it points properly + # + + for object in [self,self.generator] : + _train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT,str(self.MAX_EPOCHS)]) + _out_dir= os.sep.join([self.log_dir,'output',self.CONTEXT,str(self.MAX_EPOCHS)]) + setattr(object,'train_dir',_train_dir) + setattr(object,'out_dir',_out_dir) def apply(self,**args): suffix = self.CONTEXT #self.get.suffix() model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) + # model_dir = os.sep.join([self.train_dir,str(self.MAX_EPOCHS)]) + demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] # # setup computational graph diff --git a/data/maker/__init__.py b/data/maker/__init__.py index cdc48e2..dea44eb 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -15,6 +15,7 @@ import transport # from data.bridge import Binary import threading from data.maker import prepare +from data.maker.state import State import copy import os import nujson as json @@ -25,6 +26,7 @@ from multiprocessing import Queue import time + class Learner(Process): def __init__(self,**_args): @@ -48,7 +50,7 @@ class Learner(Process): if 'network_args' not in _args : self.network_args ={ 'context':self.info['context'] , - 'logs':_args['logpath'] if 'logpath' in _args else 'logs', + 'logs':_args['logs'] if 'logs' in _args else 'logs', 'max_epochs':int(_args['epochs']) if 'epochs' in _args else 2, 'batch_size':int (_args['batch']) if 'batch' in _args else 2000 } @@ -72,6 +74,36 @@ class Learner(Process): self.logger = None if 'logger' in self.store : self.logger = transport.factory.instance(**self.store['logger']) + self.autopilot = False #-- to be set by caller + self._initStateSpace() + def _initStateSpace(self): + """ + Initializing state-space for the data-maker, The state-space functions are used as pre-post processing functions applied to the data accordingly i.e + - Trainer -> pre-processing + - Generation -> post processing + The specifications of a state space in the configuration file is as such + state:{pre:{path,pipeline:[]}, post:{path,pipeline:[]}} + """ + self._states = None + + if 'state' in self.info : + try: + _config = self.info ['state'] + self._states = State.instance(_config) + except Exception as e: + print (e) + pass + finally: + # __info = (pd.DataFrame(self._states)[['name','path','args']]).to_dict(orient='records') + if self._states : + __info = {} + + for key in self._states : + __info[key] = [{"name":_item['name'],"args":_item['args'],"path":_item['path']} for _item in self._states[key]] + self.log(object='state-space',action='load',input=__info) + + + def log(self,**_args): try: @@ -108,11 +140,36 @@ class Learner(Process): _read_args= self.info if self._df is None : self._df = reader.read(**_read_args) + # + # NOTE : PRE + # At this point we apply pre-processing of the data if there were ever a need for it + # + _log = {} + HAS_STATES = self._states is not None and 'pre' in self._states + NOT_GENERATING = self.name in ['Trainer','Shuffle'] + IS_AUTOPILOT = 
self.autopilot + # + # allow calling pre-conditions if either of the conditions is true + # 1. states and not generating + # 2. IS_GENERATING and states and not autopilot + _ALLOW_PRE_CALL = (HAS_STATES and NOT_GENERATING) or (NOT_GENERATING is False and HAS_STATES and IS_AUTOPILOT is False) + if _ALLOW_PRE_CALL : + # if HAS_STATES and NOT_GENERATING or (HAS_STATES and IS_AUTOPILOT is False and NOT_GENERATING is False): + _logs = {'action':'status','input':{'pre':self._states['pre']}} + _beg = list(self._df.shape) + self._df = State.apply(self._df,self._states['pre']) + _end = list(self._df.shape) + _logs['input']['size'] = _beg,_end + self.log(**_log) + + # + # + columns = self.columns if self.columns else self._df.columns # # Below is a source of inefficiency, unfortunately python's type inference doesn't work well in certain cases # - The code below tries to address the issue (Perhaps better suited for the reading components) - _log = {} + for name in columns : # # randomly sampling 5 elements to make sense of data-types @@ -201,8 +258,14 @@ class Trainer(Learner): # @TODO: At this point we need to generate another some other objects # _args = {"network_args":self.network_args,"store":self.store,"info":self.info,"candidates":self.candidates,"data":self._df} + _args['logs'] = self.network_args['logs'] + _args['autopilot'] = self.autopilot if self.gpu : _args['gpu'] = self.gpu + + # + # Let us find the smallest, the item is sorted by loss ... + _args['epochs'] = gTrain.logs['epochs'][0]['epochs'] g = Generator(**_args) # g.run() @@ -239,6 +302,7 @@ class Generator (Learner): file.close() else: self._map = {} + self.autopilot = False if 'autopilot' not in _args else _args['autopilot'] def run(self): self.initalize() if self._encoder is None : @@ -416,33 +480,32 @@ class Generator (Learner): _df = self._df.copy() _df[self.columns] = _iodf[self.columns] N += _df.shape[0] - # - #@TODO: - # Improve formatting with better post-processing pipeline - if 'approximate' in self.info : - _df = self.approximate(_df) - if 'make_date' in self.info : - for name in self.info['make_date'] : - # iname = self.info['make_date']['init_field'] - iname = self.info['make_date'][name] + if self._states : + _df = State.apply(_df,self._states['post']) + # # + # #@TODO: + # # Improve formatting with better post-processing pipeline + # if 'approximate' in self.info : + # _df = self.approximate(_df) + # if 'make_date' in self.info : + # for name in self.info['make_date'] : + # # iname = self.info['make_date']['init_field'] + # iname = self.info['make_date'][name] - years = _df[iname] - _dates = [self.make_date(year=_year,field=name) for _year in years] - if _dates : - _df[name] = _dates + # years = _df[iname] + # _dates = [self.make_date(year=_year,field=name) for _year in years] + # if _dates : + # _df[name] = _dates _schema = self.get_schema() - # _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] + _df = self.format(_df,_schema) _log = [{"name":_schema[i]['name'],"dataframe":_df[_df.columns[i]].dtypes.name,"schema":_schema[i]['type']} for i in np.arange(len(_schema)) ] self.log(**{"action":"consolidate","input":_log}) - # w = transport.factory.instance(doc='observation',provider='mongodb',context='write',db='IOV01_LOGS',auth_file='/home/steve/dev/transport/mongo.json') - # w.write(_df) - # cols = [name for name in _df.columns if name.endswith('datetime')] - # print (_df[cols]) + if _store : writer = transport.factory.instance(**_store) if _store['provider'] == 'bigquery': @@ -507,8 +570,10 
@@ class factory : :param info {columns,sql,from} :param autopilot will generate output automatically :param batch (default 2k) size of the batch + """ + if _args['apply'] in [apply.RANDOM] : pthread = Shuffle(**_args) elif _args['apply'] == apply.GENERATE : diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index c8331bd..b11be57 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -276,7 +276,7 @@ class Input : if np.random.choice([0,1],1)[0] : novalues = _values[np.random.choice( len(_values),1)[0]].tolist() else: - novalues = np.repeat(None,len(self._columns)) + novalues = np.repeat(None,len(self._columns)) x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else novalues ,axis=1).tolist() return pd.DataFrame(x,columns=columns) From 936bd3ee0be7e01352e364a5dd91337e09cc797c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 16 Sep 2022 19:10:49 -0500 Subject: [PATCH 243/250] bug fix with model saving, and pre/post processing --- data/gan.py | 6 +- data/maker/__init__.py | 2 +- data/maker/apply.py | 76 +++++++++++++++++++++++ data/maker/state/__init__.py | 105 +++++++++++++++++++++++++++++++ data/maker/state/default.py | 116 +++++++++++++++++++++++++++++++++++ 5 files changed, 301 insertions(+), 4 deletions(-) create mode 100644 data/maker/apply.py create mode 100644 data/maker/state/__init__.py create mode 100644 data/maker/state/default.py diff --git a/data/gan.py b/data/gan.py index 3727edb..f864dbf 100644 --- a/data/gan.py +++ b/data/gan.py @@ -469,7 +469,7 @@ class Train (GNet): else : dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) # labels_placeholder = None - dataset = dataset.repeat(20000) + dataset = dataset.repeat(80000) dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) dataset = dataset.prefetch(1) @@ -564,12 +564,12 @@ class Train (GNet): # if epoch % self.MAX_EPOCHS == 0: # if epoch in [5,10,20,50,75, self.MAX_EPOCHS] : - if epoch in self.CHECKPOINTS or int(epoch) == 1: + if epoch in self.CHECKPOINTS : # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] suffix = self.CONTEXT #self.get.suffix() _name = os.sep.join([self.train_dir,str(epoch),suffix]) # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) - saver.save(sess, _name, write_meta_graph=False, global_step=epoch) + saver.save(sess, _name, write_meta_graph=False, global_step=np.int64(epoch)) # # diff --git a/data/maker/__init__.py b/data/maker/__init__.py index dea44eb..21b3017 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -265,7 +265,7 @@ class Trainer(Learner): # # Let us find the smallest, the item is sorted by loss ... - _args['epochs'] = gTrain.logs['epochs'][0]['epochs'] + _args['network_args']['max_epochs'] = gTrain.logs['epochs'][0]['epochs'] g = Generator(**_args) # g.run() diff --git a/data/maker/apply.py b/data/maker/apply.py new file mode 100644 index 0000000..bb6a085 --- /dev/null +++ b/data/maker/apply.py @@ -0,0 +1,76 @@ +""" +This file is designed to specify the appliction of pre/post-processing code. 
+ The pre-processing code gets applied after the data has been loaded + The post-processing code get applied after the data has been generated for instance: + -approximation code/logic; date shifting; suppression; adding noise + - +""" +import numpy as np +from datetime import datetime, timedelta +import time + +class Phase: + def __init__(self,**_args): + self._df = _args['data'] + self.callback = _args['callback'] + def apply(self,**_args): + """ + :param data data-frame + :param _info arguments needed to be applied + :param callback callback function once done + """ + raise Exception ("Function needs to be Implemented") +class Pre(Phase): + pass +class Post(Phase): + def __init__(self,**_args): + super().__init__(**_args) + pass + +class Date(Post): + def __init__(self,**_args): + super().__init__(**_args) + def make(self,**_args): + """ + This function generates a random date given a year and optionally a set of days from the randomly generated date + :param year initial value of a year + :param offset list of days between initial date + """ + if _args['year'] in ['',None,np.nan] : + return None + year = int(_args['year']) + + offset = _args['offset'] if 'offset' in _args else 0 + month = np.random.randint(1,13) + if month == 2: + _end = 28 if year % 4 != 0 else 29 + else: + _end = 31 if month in [1,3,5,7,8,10,12] else 30 + day = np.random.randint(1,_end) + + #-- synthetic date + _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0) + FORMAT = '%Y-%m-%d' if 'format' not in _args else _args['format'] + + + + # print ([_name,FORMAT, _date.strftime(FORMAT)]) + r = [] + if offset : + r = [_date.strftime(FORMAT)] + for _delta in offset : + _date = _date + timedelta(_delta) + r.append(_date.strptime(FORMAT)) + return r + else: + return _date.strftime(FORMAT) + + def apply(self,**_args): + """ + + """ + pass +class Approximate(Post): + def apply(**_args): + pass + def applyWithRange(**_args): diff --git a/data/maker/state/__init__.py b/data/maker/state/__init__.py new file mode 100644 index 0000000..adf9837 --- /dev/null +++ b/data/maker/state/__init__.py @@ -0,0 +1,105 @@ +""" +This file handles state-space of the data training/generation process i.e Upon specification of the pre/post conditiions +""" +""" +This file handles state-space of the data training/generation process i.e Upon specification of the pre/post conditions, +The specifications for this are as follows (within an entry of the configuration) + { + "state":{ + "pre":[{"approximate":{"field":"int"}},{"newdate":{"field":"format"}}],"post":[{"limit":10}] + } + } +""" +import importlib +import importlib.util +import sys +from datetime import datetime +from data.maker.state.default import * +import os + + +class State : + @staticmethod + def apply(_data,lpointers): + """ + This function applies a pipeline against a given data-frame, the calling code must decide whether it is a pre/post + :_data data-frame + :_lpointers functions modules returned by instance (module,_args) + """ + for _item in lpointers : + if _item is None : + continue + + pointer = _item['module'] + _args = _item['args'] + + _data = pointer(_data,_args) + return _data + @staticmethod + def instance(_args): + pre = [] + post=[] + + out = {} + for key in _args : + # + # If the item has a path property is should be ignored + path = _args[key]['path'] if 'path' in _args[key] else '' + out[key] = [ State._build(dict(_item,**{'path':path})) if 'path' not in _item else State._build(_item) for _item in _args[key]['pipeline']] + + return out + # if 'pre' 
in _args: + # path = _args['pre']['path'] if 'path' in _args['pre'] else '' + + # pre = [ State._build(dict(_item,**{'path':path})) for _item in _args['pre']['pipeline']] + # else: + # path = _args['post']['path'] if 'path' in _args['post'] else '' + + # post = [ State._build(dict(_item,**{'path':path})) for _item in _args['post']['pipeline']] + # return {'pre':pre,'post':post} + + @staticmethod + def _extract(_entry): + + _name = list(set(_entry.keys()) - set(['path']) ) + _name = _name[0] + path = _entry['path'] if 'path' in _entry and os.path.exists(_entry['path']) else '' + return {"module": _name,"args": _entry[_name],'name':_name,'path':path} + pass + @staticmethod + def _build(_args): + + _info = State._extract(_args) + # _info = dict(_args,**_info) + + _info['module'] = State._instance(_info) + return _info if _info['module'] is not None else None + + @staticmethod + def _instance(_args): + """ + :path optional path of the file on disk + :module name of the function + """ + + _name = _args['module'] + + if 'path' in _args and os.path.exists(_args['path']): + path= _args['path'] + + spec = importlib.util.spec_from_file_location(_name, path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + else: + # + # Probably calling a built-in module (should be in this file) + + module = sys.modules['data.maker.state.default'] + + return getattr(module,_name) if hasattr(module,_name) else None + +# +# Adding a few custom functions that should be able to help .... +# These functions can be called without specifying a path +# + diff --git a/data/maker/state/default.py b/data/maker/state/default.py new file mode 100644 index 0000000..75c2c4b --- /dev/null +++ b/data/maker/state/default.py @@ -0,0 +1,116 @@ +""" +This file contains default functions applied to a data-frame/dataset as pre/post processing jobs. 
+The functions are organized in a pipeline i.e the data will be applied to each function + +Custom functions : + functions must tak 2 arguments (_data,_args) : where _data is a data frame and _arg is a object describing the input parameters +""" +import pandas as pd +import numpy as np +from datetime import datetime, timedelta + + +def limit(_data,size): + """ + ...,{limit:size} + """ + + # size = int(_args['limit']) + return _data.iloc[:size] +def format(_data,_schema): + """ + This function enforces a schema against a data-frame, this may or may not work depending on the persistence storage + :_data data-frame containing all data + :_args schema to enforce the data, we are expecting the format as a list of {name,type,description} + """ + return _data + +def approximate(_data,_args): + """ + :_args Object of {field:type} + This function will approximate n-fields in the data given it's distribution + """ + _m = {'int':int,'float':float,'integer':int,'double':float} + columns = list(_args.keys()) + for _name in columns : + if _name not in _data : + continue + otype = _args[_name] + otype = str if otype not in _m else _m[otype] + _data.loc[:,_name] = np.random.uniform(_data[_name].values).astype(otype) + + return _data +def split_date(_data,_args): + """ + This function takes a field and applies the format from other fields + :_data data-frame + :_config configuration entry {column:{format,column:format,type}} + """ + _columns = list(_args.keys()) + _m = {'int':int,'float':float,'integer':int,'double':float} + for _name in _columns : + _iname = _args[_name]['column'] + _iformat = _args[_name]['format']['in'] + _oformat = _args[_name]['format']['out'] + _otype = str if 'type' not in _args[_name] else _args[_name]['type'] + _data.loc[:,_name] = _data[_iname].apply(lambda _date: datetime.strftime(datetime.strptime(str(_date),_iformat),_oformat)).astype(_otype) + return _data +def newdate(_data,_args): + """ + This function creates a new data on a given column from another + :_data data frame + :_args configuration column:{format,column} + """ + _columns = list(_args.keys()) + for _name in _columns : + + format = _args[_name]['format'] + ROW_COUNT = _data[_name].size + if 'column' in _args[_name] : + srcName = _args[_name]['column'] + years = _data[srcName].values + else: + years = np.random.choice(np.arange(datetime.now().year- 90,datetime.now().year),ROW_COUNT) + _data.loc[:,_name] = [ _makedate(year = years[_index],format = format) for _index in np.arange(ROW_COUNT)] + + return _data +def _makedate(**_args): + """ + This function creates a new date and applies it to a column + :_data data-frame with columns + :_args arguments for col1:format + """ + _columns = list(_args.keys()) + + # if _args['year'] in ['',None,np.nan] : + # year = np.random.choice(np.arange(1920,222),1) + # else: + # year = int(_args['year']) + year = int(_args['year']) + offset = _args['offset'] if 'offset' in _args else 0 + month = np.random.randint(1,13) + if month == 2: + _end = 28 if year % 4 != 0 else 29 + else: + _end = 31 if month in [1,3,5,7,8,10,12] else 30 + day = np.random.randint(1,_end) + + #-- synthetic date + _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0) + FORMAT = '%Y-%m-%d' + + if 'format' in _args: + FORMAT = _args['format'] + + + # print ([_name,FORMAT, _date.strftime(FORMAT)]) + r = [] + if offset : + r = [_date.strftime(FORMAT)] + for _delta in offset : + _date = _date + timedelta(_delta) + r.append(_date.strptime(FORMAT)) + return r + else: + return _date.strftime(FORMAT) + 
From 4be340ec082509d645ef8e05a8ae18848eafd589 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 16 Sep 2022 19:13:22 -0500 Subject: [PATCH 244/250] version update --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7b06af8..c28f366 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.5.9", +args = {"name":"data-maker","version":"1.6.0", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] From 209a7b8ee5c04f094efa8ef33841e8464fd3f52c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 16 Sep 2022 22:39:25 -0500 Subject: [PATCH 245/250] bug fix: checkpoints --- data/gan.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/data/gan.py b/data/gan.py index f864dbf..dae6ea0 100644 --- a/data/gan.py +++ b/data/gan.py @@ -103,7 +103,7 @@ class GNet : CHECKPOINT_SKIPS = 10 if self.MAX_EPOCHS < 2*CHECKPOINT_SKIPS : CHECKPOINT_SKIPS = 2 - self.CHECKPOINTS = np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist() + self.CHECKPOINTS = [1,self.MAX_EPOCHS] + np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist() @@ -529,7 +529,7 @@ class Train (GNet): train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d) train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g) # saver = tf.train.Saver() - saver = tf.compat.v1.train.Saver() + saver = tf.compat.v1.train.Saver(max_to_keep=len(self.CHECKPOINTS)) # init = tf.global_variables_initializer() init = tf.compat.v1.global_variables_initializer() logs = [] @@ -564,7 +564,7 @@ class Train (GNet): # if epoch % self.MAX_EPOCHS == 0: # if epoch in [5,10,20,50,75, self.MAX_EPOCHS] : - if epoch in self.CHECKPOINTS : + if epoch in self.CHECKPOINTS : # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] suffix = self.CONTEXT #self.get.suffix() _name = os.sep.join([self.train_dir,str(epoch),suffix]) @@ -587,7 +587,9 @@ class Train (GNet): tf.compat.v1.reset_default_graph() # # let's sort the epochs we've logged thus far (if any) + # Take on the last five checkpoints https://stackoverflow.com/questions/41018454/tensorflow-checkpoint-models-getting-deleted # + # self.logs['epochs'] = self.logs['epochs'][-5:] self.logs['epochs'].sort(key=lambda _item: _item['loss']) if self.logger : _log = {'module':'gan-train','action':'epochs','input':self.logs['epochs']} From 3b0903bd4af7073d37094b5db4f63ad6e60a9073 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 19 Sep 2022 13:10:28 -0500 Subject: [PATCH 246/250] minor bug fix --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 21b3017..7f9c0f6 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -480,7 +480,7 @@ class Generator (Learner): _df = self._df.copy() _df[self.columns] = _iodf[self.columns] N += _df.shape[0] - if self._states : + if self._states and 'post' in 
self._states: _df = State.apply(_df,self._states['post']) # # # #@TODO: From ce594634e848a1956a5ff3dbd2c08a34028592de Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 11 Oct 2022 18:18:59 -0500 Subject: [PATCH 247/250] checkpoint enhancement --- data/gan.py | 49 ++++++++++++++++++++++-------------------- data/maker/__init__.py | 13 ++++++++--- setup.py | 4 ++-- 3 files changed, 38 insertions(+), 28 deletions(-) diff --git a/data/gan.py b/data/gan.py index dae6ea0..eaf5124 100644 --- a/data/gan.py +++ b/data/gan.py @@ -100,13 +100,12 @@ class GNet : self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) - CHECKPOINT_SKIPS = 10 - if self.MAX_EPOCHS < 2*CHECKPOINT_SKIPS : - CHECKPOINT_SKIPS = 2 - self.CHECKPOINTS = [1,self.MAX_EPOCHS] + np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist() - - - + CHECKPOINT_SKIPS = int(args['checkpoint_skips']) if 'checkpoint_skips' in args else int(self.MAX_EPOCHS/10) + # if self.MAX_EPOCHS < 2*CHECKPOINT_SKIPS : + # CHECKPOINT_SKIPS = 2 + # self.CHECKPOINTS = [1,self.MAX_EPOCHS] + np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist() + self.CHECKPOINTS = np.repeat(CHECKPOINT_SKIPS, self.MAX_EPOCHS/ CHECKPOINT_SKIPS).cumsum().astype(int).tolist() + self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100 self.CONTEXT = args['context'] self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} @@ -469,7 +468,7 @@ class Train (GNet): else : dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) # labels_placeholder = None - dataset = dataset.repeat(80000) + dataset = dataset.repeat(800000) dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) dataset = dataset.prefetch(1) @@ -560,39 +559,43 @@ class Train (GNet): print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) # print (dir (w_distance)) - logs.append({"epoch": int(epoch),"distance":float(-w_sum/(self.STEPS_PER_EPOCH*2)) }) - + # logs.append({"epoch": int(epoch),"distance":float(-w_sum/(self.STEPS_PER_EPOCH*2)) }) + + suffix = str(self.CONTEXT) + _name = os.sep.join([self.train_dir,str(epoch),suffix]) if epoch in self.CHECKPOINTS else '' + _logentry = {"path":_name,"epochs":int(epoch),"loss":float(-w_sum/(self.STEPS_PER_EPOCH*2))} # if epoch % self.MAX_EPOCHS == 0: # if epoch in [5,10,20,50,75, self.MAX_EPOCHS] : if epoch in self.CHECKPOINTS : # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - suffix = self.CONTEXT #self.get.suffix() - _name = os.sep.join([self.train_dir,str(epoch),suffix]) + # suffix = self.CONTEXT #self.get.suffix() + # _name = os.sep.join([self.train_dir,str(epoch),suffix]) # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) saver.save(sess, _name, write_meta_graph=False, global_step=np.int64(epoch)) # # - logs = [{"path":_name,"epochs":int(epoch),"loss":float(-w_sum/(self.STEPS_PER_EPOCH*2))}] - if self.logger : - # row = {"module":"gan-train","action":"epochs","input":{"logs":logs}} #,"model":pickle.dump(sess)} - # self.logger.write(row) - self.logs['epochs'] += logs - # - # @TODO: - # We should upload the files in the checkpoint - # This would allow the learnt model to be portable to another system 
+ # logs = [] + # if self.logger : + # # row = {"module":"gan-train","action":"epochs","input":{"logs":logs}} #,"model":pickle.dump(sess)} + # # self.logger.write(row) + # self.logs['epochs'] += logs + # # + # # @TODO: + # # We should upload the files in the checkpoint + # # This would allow the learnt model to be portable to another system # + self.logs['epochs'].append(_logentry) tf.compat.v1.reset_default_graph() # # let's sort the epochs we've logged thus far (if any) # Take on the last five checkpoints https://stackoverflow.com/questions/41018454/tensorflow-checkpoint-models-getting-deleted # # self.logs['epochs'] = self.logs['epochs'][-5:] - self.logs['epochs'].sort(key=lambda _item: _item['loss']) + if self.logger : - _log = {'module':'gan-train','action':'epochs','input':self.logs['epochs']} + _log = {'module':'gan-train','context':self.CONTEXT,'action':'epochs','input':self.logs['epochs']} self.logger.write(_log) # diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 7f9c0f6..fdf2305 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -226,7 +226,7 @@ class Trainer(Learner): self.autopilot = _args['autopilot'] if 'autopilot' in _args else False self.generate = None self.candidates = int(_args['candidates']) if 'candidates' in _args else 1 - + self.checkpoint_skips = _args['checkpoint_skips'] if 'checkpoint_skips' in _args else None def run(self): self.initalize() if self._encoder is None : @@ -242,6 +242,8 @@ class Trainer(Learner): _args['candidates'] = self.candidates if 'logger' in self.store : _args['logger'] = transport.factory.instance(**self.store['logger']) + if self.checkpoint_skips : + _args['checkpoint_skips'] = self.checkpoint_skips # # At this point we have the binary matrix, we can initiate training # @@ -264,8 +266,13 @@ class Trainer(Learner): _args['gpu'] = self.gpu # - # Let us find the smallest, the item is sorted by loss ... 
- _args['network_args']['max_epochs'] = gTrain.logs['epochs'][0]['epochs'] + # Let us find the smallest, the item is sorted by loss on disk + # + _epochs = [_e for _e in gTrain.logs['epochs'] if _e['path'] != ''] + _epochs.sort(key=lambda _item: _item['loss'],reverse=False) + + _args['network_args']['max_epochs'] = _epochs[0]['epochs'] + self.log(action='autopilot',input={'epoch':_epochs[0]}) g = Generator(**_args) # g.run() diff --git a/setup.py b/setup.py index c28f366..3a2aaba 100644 --- a/setup.py +++ b/setup.py @@ -4,10 +4,10 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.6.0", +args = {"name":"data-maker","version":"1.6.2", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} -args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] +args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' if sys.version_info[0] == 2 : From d469a4904fb5aaa090948ead3172c2d0eeb326f4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 9 Nov 2022 14:28:34 -0600 Subject: [PATCH 248/250] fixes with new features --- data/gan.py | 2 ++ setup.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index eaf5124..d2cc3ea 100644 --- a/data/gan.py +++ b/data/gan.py @@ -101,6 +101,8 @@ class GNet : self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) CHECKPOINT_SKIPS = int(args['checkpoint_skips']) if 'checkpoint_skips' in args else int(self.MAX_EPOCHS/10) + + CHECKPOINT_SKIPS = 1 if CHECKPOINT_SKIPS < 1 else CHECKPOINT_SKIPS # if self.MAX_EPOCHS < 2*CHECKPOINT_SKIPS : # CHECKPOINT_SKIPS = 2 # self.CHECKPOINTS = [1,self.MAX_EPOCHS] + np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist() diff --git a/setup.py b/setup.py index 3a2aaba..6327b10 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.6.2", +args = {"name":"data-maker","version":"1.6.3", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow'] From e196991c54d4207ab9c30507171748331d96c622 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 15 Nov 2022 11:01:11 -0600 Subject: [PATCH 249/250] plugin handling ... 
--- data/maker/__init__.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index fdf2305..7b3a347 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -589,4 +589,14 @@ class factory : pthread= Trainer(**_args) if 'start' in _args and _args['start'] == True : pthread.start() - return pthread \ No newline at end of file + return pthread + +class plugins: + @staticmethod + def load(_config): + """ + This function attempts to load the plugins to insure they are valid + _config configuration for plugin specifications {pre:{pipeline,path},post:{pipeline,path}} + """ + + From b2cf5ead53b51bfbe6dd331cf7f2c271605a0d0c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 15 Nov 2022 11:01:33 -0600 Subject: [PATCH 250/250] version # --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6327b10..8ad1b09 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.6.3", +args = {"name":"data-maker","version":"1.6.4", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow']
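
The plugins.load helper added in the plugin-handling patch above is left as an empty stub. What follows is a minimal sketch of one way it could validate the documented {pre:{pipeline,path},post:{pipeline,path}} specification by reusing State.instance from data/maker/state; it is an assumption about intent, not the author's implementation.

    # sketch only: not part of the patch series; assumes the State helpers
    # from data/maker/state/__init__.py are importable
    from data.maker.state import State

    def load(_config):
        """
        Resolve every pipeline entry to a callable, raising if any entry cannot be mapped
        """
        _resolved = State.instance(_config)            # {'pre':[...], 'post':[...]}
        for _key in _resolved :
            if any([_item is None for _item in _resolved[_key]]) :
                raise ValueError("unresolvable entry in the '"+_key+"' pipeline")
        return _resolved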