From dcc55eb1fbab75f32f8953d9b150dfe8fd567448 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 10 Jan 2020 13:12:58 -0600 Subject: [PATCH 001/250] bug fixes --- data/gan.py | 25 ++++++++++++++++++++----- data/maker/__init__.py | 39 +++++++++++++++++++++++---------------- data/maker/__main__.py | 33 +++++++++++++++++++++++++-------- 3 files changed, 68 insertions(+), 29 deletions(-) diff --git a/data/gan.py b/data/gan.py index 43d15ae..46ecb18 100644 --- a/data/gan.py +++ b/data/gan.py @@ -1,8 +1,23 @@ """ -usage : - optional : - --num_gpu number of gpus to use will default to 1 - --epoch steps per epoch default to 256 +This code was originally writen by Ziqi Zhang in order to generate synthetic data. +The code is an implementation of a Generative Adversarial Network that uses the Wasserstein Distance (WGAN). +It is intended to be used in 2 modes (embedded in code or using CLI) + +USAGE : + +The following parameters should be provided in a configuration file (JSON format) +python data/maker --config + +CONFIGURATION FILE STRUCTURE : + + context what it is you are loading (stroke, hypertension, ...) + data path of the file to be loaded + logs folder to store training model and meta data about learning + max_epochs number of iterations in learning + num_gpu number of gpus to be used (will still run if the GPUs are not available) + +EMBEDDED IN CODE : + """ import tensorflow as tf from tensorflow.contrib.layers import l2_regularizer @@ -426,7 +441,7 @@ class Train (GNet): print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) # print (dir (w_distance)) - logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) + logs.append({"epoch":epoch,"distance":-w_sum }) if epoch % self.MAX_EPOCHS == 0: # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] diff --git a/data/maker/__init__.py b/data/maker/__init__.py index f97e5f3..e0ca55d 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -24,21 +24,25 @@ def train (**args) : column = args['column'] column_id = args['id'] - df = args['data'] - logs = args['logs'] - real = pd.get_dummies(df[column]).astype(np.float32).values - labels = pd.get_dummies(df[column_id]).astype(np.float32).values - num_gpu = 1 if 'num_gpu' not in args else args['num_gpu'] - max_epochs = 10 if 'max_epochs' not in args else args['max_epochs'] + df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) + # logs = args['logs'] + # real = pd.get_dummies(df[column]).astype(np.float32).values + # labels = pd.get_dummies(df[column_id]).astype(np.float32).values + args['real'] = pd.get_dummies(df[column]).astype(np.float32).values + args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values + # num_gpu = 1 if 'num_gpu' not in args else args['num_gpu'] + # max_epochs = 10 if 'max_epochs' not in args else args['max_epochs'] context = args['context'] + if 'store' in args : args['store']['args']['doc'] = context logger = factory.instance(**args['store']) + args['logger'] = logger else: logger = None - - trainer = gan.Train(context=context,max_epochs=max_epochs,num_gpu=num_gpu,real=real,label=labels,column=column,column_id=column_id,logger = logger,logs=logs) + trainer = gan.Train(**args) + # trainer = gan.Train(context=context,max_epochs=max_epochs,num_gpu=num_gpu,real=real,label=labels,column=column,column_id=column_id,logger = logger,logs=logs) return trainer.apply() def generate(**args): @@ -51,14 +55,14 @@ def 
generate(**args): :id column identifying an entity :logs location on disk where the learnt knowledge of the dataset is """ - df = args['data'] - + # df = args['data'] + df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) column = args['column'] column_id = args['id'] - logs = args['logs'] - context = args['context'] - num_gpu = 1 if 'num_gpu' not in args else args['num_gpu'] - max_epochs = 10 if 'max_epochs' not in args else args['max_epochs'] + # logs = args['logs'] + # context = args['context'] + # num_gpu = 1 if 'num_gpu' not in args else args['num_gpu'] + # max_epochs = 10 if 'max_epochs' not in args else args['max_epochs'] # #@TODO: @@ -69,8 +73,11 @@ def generate(**args): values = df[column].unique().tolist() values.sort() - labels = pd.get_dummies(df[column_id]).astype(np.float32).values - handler = gan.Predict (context=context,label=labels,max_epochs=max_epochs,num_gpu=num_gpu,values=values,column=column,logs=logs) + # labels = pd.get_dummies(df[column_id]).astype(np.float32).values + args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values + args['values'] = values + # handler = gan.Predict (context=context,label=labels,max_epochs=max_epochs,num_gpu=num_gpu,values=values,column=column,logs=logs) + handler = gan.Predict (**args) handler.load_meta(column) r = handler.apply() _df = df.copy() diff --git a/data/maker/__main__.py b/data/maker/__main__.py index e77bf0a..56defec 100644 --- a/data/maker/__main__.py +++ b/data/maker/__main__.py @@ -1,10 +1,27 @@ import pandas as pd import data.maker - -df = pd.read_csv('sample.csv') -column = 'gender' -id = 'id' -context = 'demo' -store = {"type":"mongo.MongoWriter","args":{"host":"localhost:27017","dbname":"GAN"}} -max_epochs = 11 -data.maker.train(store=store,max_epochs=max_epochs,context=context,data=df,column=column,id=id,logs='foo') \ No newline at end of file +from data.params import SYS_ARGS +import json +from scipy.stats import wasserstein_distance as wd +import risk +import numpy as np +if 'config' in SYS_ARGS : + ARGS = json.loads(open(SYS_ARGS['config']).read()) + if 'generate' not in SYS_ARGS : + data.maker.train(**ARGS) + else: + # + # + _df = data.maker.generate(**ARGS) + odf = pd.read_csv (ARGS['data']) + odf.columns = [name.lower() for name in odf.columns] + column = [ARGS['column'] ] #+ ARGS['id'] + print (column) + print (_df[column].risk.evaluate()) + print (odf[column].risk.evaluate()) + _x = pd.get_dummies(_df[column]).values + y = pd.get_dummies(odf[column]).values + N = _df.shape[0] + print (np.mean([ wd(_x[i],y[i])for i in range(0,N)])) + # column = SYS_ARGS['column'] + # odf = open(SYS_ARGS['data']) \ No newline at end of file From 63a7f1a968293ad5a4e70d71b8be62ba1f97c9ea Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 10 Jan 2020 13:16:11 -0600 Subject: [PATCH 002/250] version # update --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index db4029b..5f800d9 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.0.5","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.0.6","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} 
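A minimal sketch of the JSON configuration consumed by the CLI mode (python data/maker --config <path>), assuming the keys documented at the top of data/gan.py plus the 'column' and 'id' fields that data/maker/__main__.py forwards to train()/generate(); every value below is illustrative only:

import json

# 'context', 'data', 'logs', 'max_epochs' and 'num_gpu' are the keys documented in data/gan.py;
# 'column' and 'id' name the attribute to synthesize and the entity identifier.
config = {
    "context": "demo",        # label of what is being synthesized (stroke, hypertension, ...)
    "data": "sample.csv",     # path of the CSV file to be loaded
    "logs": "logs",           # folder for training checkpoints and meta data
    "max_epochs": 10,         # number of learning iterations
    "num_gpu": 1,             # number of GPUs to use (training still runs if none are available)
    "column": "gender",       # attribute to synthesize
    "id": "id"                # column identifying an entity
}
with open("config.json", "w") as f:
    json.dump(config, f, indent=2)
# then: python data/maker --config config.json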
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' From 31ca5886f0f6c53b77c4a6e001aee8a995cd7f78 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 11 Feb 2020 12:00:16 -0600 Subject: [PATCH 003/250] not sure about the changes (oops) --- data/gan.py | 99 ++++++++++++++++++++++++++++++++---------- data/maker/__init__.py | 89 ++++++++++++++++++++----------------- data/maker/__main__.py | 18 ++++---- 3 files changed, 137 insertions(+), 69 deletions(-) diff --git a/data/gan.py b/data/gan.py index 46ecb18..3f22740 100644 --- a/data/gan.py +++ b/data/gan.py @@ -43,6 +43,10 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' class void : pass class GNet : + def log(self,**args): + self.logs = dict(args,**self.logs) + + """ This is the base class of a generative network functions, the details will be implemented in the subclasses. An instance of this class is accessed as follows @@ -52,7 +56,7 @@ class GNet : def __init__(self,**args): self.layers = void() self.layers.normalize = self.normalize - + self.logs = {} self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] @@ -95,6 +99,15 @@ class GNet : self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + if self.logger : + # + # We will clear the logs from the data-store + # + column = self.ATTRIBUTES['synthetic'] + db = self.logger.db + if db[column].count() > 0 : + db.backup.insert({'name':column,'logs':list(db[column].find()) }) + db[column].drop() def load_meta(self,column): """ @@ -114,7 +127,9 @@ class GNet : def log_meta(self,**args) : + _object = { + '_id':'meta', 'CONTEXT':self.CONTEXT, 'ATTRIBUTES':self.ATTRIBUTES, 'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU, @@ -314,6 +329,11 @@ class Train (GNet): # print ([" *** ",self.BATCHSIZE_PER_GPU]) self.meta = self.log_meta() + if(self.logger): + + self.logger.write( row=self.meta ) + + self.log (real_shape=list(self._REAL.shape),label_shape = list(self._LABEL.shape),meta_data=self.meta) def load_meta(self, column): """ This function will delegate the calls to load meta data to it's dependents @@ -350,11 +370,14 @@ class Train (GNet): if stage == 'D': w, loss = self.discriminator.loss(real=real, fake=fake, label=label) #losses = tf.get_collection('dlosses', scope) + flag = 'dlosses' losses = tf.compat.v1.get_collection('dlosses', scope) else: w, loss = self.generator.loss(fake=fake, label=label) #losses = tf.get_collection('glosses', scope) + flag = 'glosses' losses = tf.compat.v1.get_collection('glosses', scope) + # losses = tf.compat.v1.get_collection(flag, scope) total_loss = tf.add_n(losses, name='total_loss') @@ -369,7 +392,8 @@ class Train (GNet): dataset = dataset.repeat(10000) dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) dataset = dataset.prefetch(1) - iterator = dataset.make_initializable_iterator() + # iterator = dataset.make_initializable_iterator() + iterator = tf.compat.v1.data.make_initializable_iterator(dataset) # next_element = iterator.get_next() # init_op = iterator.initializer return iterator, features_placeholder, labels_placeholder @@ -405,7 +429,10 @@ class Train (GNet): def apply(self,**args): # max_epochs = args['max_epochs'] if 'max_epochs' in args else 10 REAL = self._REAL - LABEL= self._LABEL + LABEL= self._LABEL + if (self.logger): + pass + with tf.device('/cpu:0'): opt_d = 
tf.compat.v1.train.AdamOptimizer(1e-4) opt_g = tf.compat.v1.train.AdamOptimizer(1e-4) @@ -441,7 +468,7 @@ class Train (GNet): print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) # print (dir (w_distance)) - logs.append({"epoch":epoch,"distance":-w_sum }) + logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) if epoch % self.MAX_EPOCHS == 0: # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] @@ -452,9 +479,14 @@ class Train (GNet): # # if self.logger : - row = {"logs":logs} #,"model":pickle.dump(sess)} - + row = {"logs":logs} #,"model":pickle.dump(sess)} self.logger.write(row=row) + # + # @TODO: + # We should upload the files in the checkpoint + # This would allow the learnt model to be portable to another system + # + tf.compat.v1.reset_default_graph() class Predict(GNet): """ @@ -479,38 +511,61 @@ class Predict(GNet): ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) - fake = self.generator.network(inputs=z, label=label) + fake = self.generator.network(inputs=z, label=label) init = tf.compat.v1.global_variables_initializer() - saver = tf.compat.v1.train.Saver() + saver = tf.compat.v1.train.Saver() + df = pd.DataFrame() + CANDIDATE_COUNT = 1000 + NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0] with tf.compat.v1.Session() as sess: # sess.run(init) saver.restore(sess, model_dir) labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) + found = [] labels= demo - f = sess.run(fake,feed_dict={y:labels}) - # - # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes - # - - df = ( pd.DataFrame(np.round(f).astype(np.int32))) + for i in np.arange(CANDIDATE_COUNT) : + + f = sess.run(fake,feed_dict={y:labels}) + # + # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes + # The code below will insure we have some acceptable cardinal relationships between id and synthetic values + # + df = ( pd.DataFrame(np.round(f).astype(np.int32))) + p = 0 not in df.sum(axis=1).values + + if p: + found.append(df) + if len(found) == NTH_VALID_CANDIDATE or i == CANDIDATE_COUNT: + break + else: + continue + # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms # df = (i * df).sum(axis=1) # # In case we are dealing with actual values like diagnosis codes we can perform # + df = found[np.random.choice(np.arange(len(found)),1)[0]] columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] - r = np.zeros((self.ROW_COUNT,len(columns))) - for col in df : - i = np.where(df[col])[0] - r[i] = col - - df = pd.DataFrame(r,columns=columns) + # r = np.zeros((self.ROW_COUNT,len(columns))) + r = np.zeros(self.ROW_COUNT) + df.columns = self.values + if len(found): + print (len(found),NTH_VALID_CANDIDATE) + # x = df * self.values + + df = pd.DataFrame( df.apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) + df.columns = columns + + - df[df.columns] = (df.apply(lambda value: self.values[ int(value)],axis=1)) - return df.to_dict(orient='lists') + + tf.compat.v1.reset_default_graph() + + return df.to_dict(orient='list') # return df.to_dict(orient='list') # count = str(len(os.listdir(self.out_dir))) # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv']) diff --git a/data/maker/__init__.py 
b/data/maker/__init__.py index e0ca55d..f1a9537 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -12,6 +12,7 @@ import pandas as pd import numpy as np import data.gan as gan from transport import factory +import threading as thread def train (**args) : """ This function is intended to train the GAN in order to learn about the distribution of the features @@ -21,30 +22,42 @@ def train (**args) : :data data-frame to be synthesized :context label of what we are synthesizing """ - column = args['column'] + column = args['column'] if (isinstance(args['column'],list)) else [args['column']] column_id = args['id'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) - # logs = args['logs'] - # real = pd.get_dummies(df[column]).astype(np.float32).values - # labels = pd.get_dummies(df[column_id]).astype(np.float32).values - args['real'] = pd.get_dummies(df[column]).astype(np.float32).values - args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values - # num_gpu = 1 if 'num_gpu' not in args else args['num_gpu'] - # max_epochs = 10 if 'max_epochs' not in args else args['max_epochs'] - context = args['context'] - - if 'store' in args : - args['store']['args']['doc'] = context - logger = factory.instance(**args['store']) - args['logger'] = logger - - else: - logger = None - trainer = gan.Train(**args) - # trainer = gan.Train(context=context,max_epochs=max_epochs,num_gpu=num_gpu,real=real,label=labels,column=column,column_id=column_id,logger = logger,logs=logs) - return trainer.apply() + df.columns = [name.lower() for name in df.columns] + # + # If we have several columns we will proceed one at a time (it could be done in separate threads) + # @TODO : Consider performing this task on several threads/GPUs simulataneously + # + args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values + for col in column : + args['real'] = pd.get_dummies(df[col]).astype(np.float32).values + args['column'] = col + args['context'] = col + context = args['context'] + if 'store' in args : + args['store']['args']['doc'] = context + logger = factory.instance(**args['store']) + args['logger'] = logger + + else: + logger = None + trainer = gan.Train(**args) + trainer.apply() +def post(**args): + """ + This uploads the tensorflow checkpoint to a data-store (mongodb, biguqery, s3) + + """ + pass +def get(**): + """ + This function will restore a checkpoint from a persistant storage on to disk + """ + pass def generate(**args): """ This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset @@ -57,29 +70,27 @@ def generate(**args): """ # df = args['data'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) - column = args['column'] + + column = args['column'] if (isinstance(args['column'],list)) else [args['column']] column_id = args['id'] - # logs = args['logs'] - # context = args['context'] - # num_gpu = 1 if 'num_gpu' not in args else args['num_gpu'] - # max_epochs = 10 if 'max_epochs' not in args else args['max_epochs'] - # #@TODO: # If the identifier is not present, we should fine a way to determine or make one # - #ocolumns= list(set(df.columns.tolist())- set(columns)) - - values = df[column].unique().tolist() - values.sort() - - # labels = pd.get_dummies(df[column_id]).astype(np.float32).values args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values - args['values'] = values - # handler = gan.Predict 
(context=context,label=labels,max_epochs=max_epochs,num_gpu=num_gpu,values=values,column=column,logs=logs) - handler = gan.Predict (**args) - handler.load_meta(column) - r = handler.apply() - _df = df.copy() - _df[column] = r[column] + _df = df.copy() + for col in column : + args['context'] = col + args['column'] = col + values = df[col].unique().tolist() + # values.sort() + args['values'] = values + # + # we can determine the cardinalities here so we know what to allow or disallow + handler = gan.Predict (**args) + handler.load_meta(col) + r = handler.apply() + # print (r) + _df[col] = r[col] + # break return _df \ No newline at end of file diff --git a/data/maker/__main__.py b/data/maker/__main__.py index 56defec..63b464b 100644 --- a/data/maker/__main__.py +++ b/data/maker/__main__.py @@ -15,13 +15,15 @@ if 'config' in SYS_ARGS : _df = data.maker.generate(**ARGS) odf = pd.read_csv (ARGS['data']) odf.columns = [name.lower() for name in odf.columns] - column = [ARGS['column'] ] #+ ARGS['id'] - print (column) - print (_df[column].risk.evaluate()) - print (odf[column].risk.evaluate()) - _x = pd.get_dummies(_df[column]).values - y = pd.get_dummies(odf[column]).values - N = _df.shape[0] - print (np.mean([ wd(_x[i],y[i])for i in range(0,N)])) + column = ARGS['column'] if isinstance(ARGS['column'],list) else [ARGS['column']] + print(pd.merge(odf,_df, on='id')) + # print (_df[column].risk.evaluate(flag='synth')) + # print (odf[column].risk.evaluate(flag='original')) + # _x = pd.get_dummies(_df[column]).values + # y = pd.get_dummies(odf[column]).values + # N = _df.shape[0] + # print (np.mean([ wd(_x[i],y[i])for i in range(0,N)])) + # print (wd(_x[0],y[0]) ) + # column = SYS_ARGS['column'] # odf = open(SYS_ARGS['data']) \ No newline at end of file From 37ba836a7ea50e6e5334c8fe6a5f52eeb7ca27f9 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 Feb 2020 12:41:01 -0600 Subject: [PATCH 004/250] bug fix ... need to design porting/loading models on the fly --- data/maker/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index f1a9537..2becbe2 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -53,7 +53,7 @@ def post(**args): """ pass -def get(**): +def get(**,args): """ This function will restore a checkpoint from a persistant storage on to disk """ diff --git a/setup.py b/setup.py index 5f800d9..2fea026 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.0.6","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.0.7","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' From 6c12cf0b2a561f88539128fe1e2f1ec5500c52b1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 Feb 2020 12:43:30 -0600 Subject: [PATCH 005/250] bug fix ... 
need to design porting/loading models on the fly --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 2becbe2..12abc8d 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -53,7 +53,7 @@ def post(**args): """ pass -def get(**,args): +def get(**args): """ This function will restore a checkpoint from a persistant storage on to disk """ From 725e32b160ff1788b447a92131b91a0a263e70fd Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 Feb 2020 13:46:20 -0600 Subject: [PATCH 006/250] bug fix ... need to design porting/loading models on the fly --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2fea026..8034249 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.0.7","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.0.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' From 383d7b7e64989d1b900a2b5a90931313f1942e87 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 Feb 2020 13:49:05 -0600 Subject: [PATCH 007/250] bug fix ... need to design porting/loading models on the fly --- data/gan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/gan.py b/data/gan.py index 3f22740..439b52a 100644 --- a/data/gan.py +++ b/data/gan.py @@ -331,7 +331,7 @@ class Train (GNet): self.meta = self.log_meta() if(self.logger): - self.logger.write( row=self.meta ) + self.logger.write( self.meta ) self.log (real_shape=list(self._REAL.shape),label_shape = list(self._LABEL.shape),meta_data=self.meta) def load_meta(self, column): @@ -480,7 +480,7 @@ class Train (GNet): # if self.logger : row = {"logs":logs} #,"model":pickle.dump(sess)} - self.logger.write(row=row) + self.logger.write(row) # # @TODO: # We should upload the files in the checkpoint From 4024e508a82cce6473849dd2ca7c44722560fd7f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 Feb 2020 13:57:28 -0600 Subject: [PATCH 008/250] bug fix ... 
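A sketch of the "EMBEDDED IN CODE" usage mentioned at the top of data/gan.py, assuming the values from the pre-patch data/maker/__main__.py and the 'store' handling in data/maker/__init__.py; the 'store' document is what transport.factory.instance() turns into the logger whose writes these commits adjust, and all values are illustrative:

import pandas as pd
import data.maker

df = pd.read_csv('sample.csv')
# Optional data-store; factory.instance(**store) becomes the logger that receives
# the training meta data and per-epoch logs. Omit 'store' to train without logging.
store = {"type": "mongo.MongoWriter",
         "args": {"host": "localhost:27017", "dbname": "GAN"}}
data.maker.train(store=store, max_epochs=10, context='demo',
                 data=df, column='gender', id='id', logs='logs')
# Generation reuses the checkpoint written under logs/train/<context>.
_df = data.maker.generate(data=df, column='gender', id='id',
                          context='demo', max_epochs=10, logs='logs')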
--- data/gan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index 439b52a..e54daa8 100644 --- a/data/gan.py +++ b/data/gan.py @@ -129,7 +129,7 @@ class GNet : def log_meta(self,**args) : _object = { - '_id':'meta', + # '_id':'meta', 'CONTEXT':self.CONTEXT, 'ATTRIBUTES':self.ATTRIBUTES, 'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU, From ce55848cc8d8fa06aad95ce8f75274ae968e657d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 13 Feb 2020 17:30:56 -0600 Subject: [PATCH 009/250] bug fix with dimensions @TODO: GPU workload --- data/gan.py | 1185 ++++++++++++++++++++++++++------------------------- setup.py | 2 +- 2 files changed, 598 insertions(+), 589 deletions(-) diff --git a/data/gan.py b/data/gan.py index e54daa8..367d63c 100644 --- a/data/gan.py +++ b/data/gan.py @@ -10,11 +10,11 @@ python data/maker --config CONFIGURATION FILE STRUCTURE : - context what it is you are loading (stroke, hypertension, ...) - data path of the file to be loaded - logs folder to store training model and meta data about learning - max_epochs number of iterations in learning - num_gpu number of gpus to be used (will still run if the GPUs are not available) + context what it is you are loading (stroke, hypertension, ...) + data path of the file to be loaded + logs folder to store training model and meta data about learning + max_epochs number of iterations in learning + num_gpu number of gpus to be used (will still run if the GPUs are not available) EMBEDDED IN CODE : @@ -35,619 +35,628 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ['CUDA_VISIBLE_DEVICES'] = "0" os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' -# STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256 -# NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu']) +# STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256 +# NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu']) # BATCHSIZE_PER_GPU = 2000 -# TOTAL_BATCHSIZE = BATCHSIZE_PER_GPU * NUM_GPUS +# TOTAL_BATCHSIZE = BATCHSIZE_PER_GPU * NUM_GPUS class void : - pass + pass class GNet : - def log(self,**args): - self.logs = dict(args,**self.logs) - - - """ - This is the base class of a generative network functions, the details will be implemented in the subclasses. 
- An instance of this class is accessed as follows - object.layers.normalize applies batch normalization or otherwise - obect.get.variables instanciate variables on cpu and return a reference (tensor) - """ - def __init__(self,**args): - self.layers = void() - self.layers.normalize = self.normalize - self.logs = {} - - self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] - - - self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854 - self.G_STRUCTURE = [128,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE] - self.D_STRUCTURE = [self.X_SPACE_SIZE,256,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE*2, self.X_SPACE_SIZE] #-- change 854 to number of diagnosis - # self.NUM_LABELS = 8 if 'label' not in args elif len(args['label'].shape) args['label'].shape[1] - if 'label' in args and len(args['label'].shape) == 2 : - self.NUM_LABELS = args['label'].shape[1] - elif 'label' in args and len(args['label']) == 1 : - self.NUM_LABELS = args['label'].shape[0] - else: - self.NUM_LABELS = 8 - self.Z_DIM = 128 #self.X_SPACE_SIZE - self.BATCHSIZE_PER_GPU = args['real'].shape[0] if 'real' in args else 256 - self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS - self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) - self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) - self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100 - self.CONTEXT = args['context'] - self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} - self._REAL = args['real'] if 'real' in args else None - self._LABEL = args['label'] if 'label' in args else None - - self.get = void() - self.get.variables = self._variable_on_cpu - self.get.suffix = lambda : "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - self.logger = args['logger'] if 'logger' in args and args['logger'] else None - self.init_logs(**args) - - def init_logs(self,**args): - self.log_dir = args['logs'] if 'logs' in args else 'logs' - self.mkdir(self.log_dir) - # - # - for key in ['train','output'] : - self.mkdir(os.sep.join([self.log_dir,key])) - self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT])) - - self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) - self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) - if self.logger : - # - # We will clear the logs from the data-store - # - column = self.ATTRIBUTES['synthetic'] - db = self.logger.db - if db[column].count() > 0 : - db.backup.insert({'name':column,'logs':list(db[column].find()) }) - db[column].drop() - - def load_meta(self,column): + def log(self,**args): + self.logs = dict(args,**self.logs) + + """ - This function is designed to accomodate the uses of the sub-classes outside of a strict dependency model. - Because prediction and training can happen independently + This is the base class of a generative network functions, the details will be implemented in the subclasses. 
+ An instance of this class is accessed as follows + object.layers.normalize applies batch normalization or otherwise + obect.get.variables instanciate variables on cpu and return a reference (tensor) """ - # suffix = "-".join(column) if isinstance(column,list)else column - suffix = self.get.suffix() - _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json']) - if os.path.exists(_name) : - attr = json.loads((open(_name)).read()) - for key in attr : - value = attr[key] - setattr(self,key,value) - self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) - self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + def __init__(self,**args): + self.layers = void() + self.layers.normalize = self.normalize + self.logs = {} + + self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] + if self.NUM_GPUS > 1 : + os.environ['CUDA_VISIBLE_DEVICES'] = "4" + + self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854 + self.G_STRUCTURE = [128,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE] + self.D_STRUCTURE = [self.X_SPACE_SIZE,256,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE*2, self.X_SPACE_SIZE] #-- change 854 to number of diagnosis + # self.NUM_LABELS = 8 if 'label' not in args elif len(args['label'].shape) args['label'].shape[1] + + if 'label' in args and len(args['label'].shape) == 2 : + self.NUM_LABELS = args['label'].shape[1] + elif 'label' in args and len(args['label']) == 1 : + self.NUM_LABELS = args['label'].shape[0] + else: + self.NUM_LABELS = 8 + # self.Z_DIM = 128 #self.X_SPACE_SIZE + self.Z_DIM = 128 #-- used as rows down stream + self.G_STRUCTURE = [self.Z_DIM,self.Z_DIM] + if 'real' in args : + self.D_STRUCTURE = [args['real'].shape[1],256,self.Z_DIM] + + self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) if 'real' in args else 256 + self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS + self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) + self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) + self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100 + self.CONTEXT = args['context'] + self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} + self._REAL = args['real'] if 'real' in args else None + self._LABEL = args['label'] if 'label' in args else None + + self.get = void() + self.get.variables = self._variable_on_cpu + self.get.suffix = lambda : "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + self.logger = args['logger'] if 'logger' in args and args['logger'] else None + self.init_logs(**args) + + def init_logs(self,**args): + self.log_dir = args['logs'] if 'logs' in args else 'logs' + self.mkdir(self.log_dir) + # + # + for key in ['train','output'] : + self.mkdir(os.sep.join([self.log_dir,key])) + self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT])) + + self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) + self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + if self.logger : + # + # We will clear the logs from the data-store + # + column = self.ATTRIBUTES['synthetic'] + db = self.logger.db + if db[column].count() > 0 : + db.backup.insert({'name':column,'logs':list(db[column].find()) }) + db[column].drop() + + def load_meta(self,column): + """ + This function is designed to accomodate the uses of the sub-classes outside of a strict dependency model. 
+ Because prediction and training can happen independently + """ + # suffix = "-".join(column) if isinstance(column,list)else column + suffix = self.get.suffix() + _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json']) + if os.path.exists(_name) : + attr = json.loads((open(_name)).read()) + for key in attr : + value = attr[key] + setattr(self,key,value) + self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) + self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + + + def log_meta(self,**args) : + + _object = { + # '_id':'meta', + 'CONTEXT':self.CONTEXT, + 'ATTRIBUTES':self.ATTRIBUTES, + 'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU, + 'Z_DIM':self.Z_DIM, + "X_SPACE_SIZE":self.X_SPACE_SIZE, + "D_STRUCTURE":self.D_STRUCTURE, + "G_STRUCTURE":self.G_STRUCTURE, + "NUM_GPUS":self.NUM_GPUS, + "NUM_LABELS":self.NUM_LABELS, + "MAX_EPOCHS":self.MAX_EPOCHS, + "ROW_COUNT":self.ROW_COUNT + } + if args and 'key' in args and 'value' in args : + key = args['key'] + value= args['value'] + object[key] = value + # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column + suffix = self.get.suffix() + _name = os.sep.join([self.out_dir,'meta-'+suffix]) + + f = open(_name+'.json','w') + f.write(json.dumps(_object)) + return _object + def mkdir (self,path): + if not os.path.exists(path) : + os.mkdir(path) - - def log_meta(self,**args) : - - _object = { - # '_id':'meta', - 'CONTEXT':self.CONTEXT, - 'ATTRIBUTES':self.ATTRIBUTES, - 'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU, - 'Z_DIM':self.Z_DIM, - "X_SPACE_SIZE":self.X_SPACE_SIZE, - "D_STRUCTURE":self.D_STRUCTURE, - "G_STRUCTURE":self.G_STRUCTURE, - "NUM_GPUS":self.NUM_GPUS, - "NUM_LABELS":self.NUM_LABELS, - "MAX_EPOCHS":self.MAX_EPOCHS, - "ROW_COUNT":self.ROW_COUNT - } - if args and 'key' in args and 'value' in args : - key = args['key'] - value= args['value'] - object[key] = value - # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column - suffix = self.get.suffix() - _name = os.sep.join([self.out_dir,'meta-'+suffix]) - - f = open(_name+'.json','w') - f.write(json.dumps(_object)) - return _object - def mkdir (self,path): - if not os.path.exists(path) : - os.mkdir(path) - - - def normalize(self,**args): - """ - This function will perform a batch normalization on an network layer - inputs input layer of the neural network - name name of the scope the - labels labels (attributes not synthesized) by default None - n_labels number of labels default None - """ - inputs = args['inputs'] - name = args['name'] - labels = None if 'labels' not in args else args['labels'] - n_labels= None if 'n_labels' not in args else args['n_labels'] - shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing - mean, var = tf.nn.moments(inputs, shift, keep_dims=True) - shape = inputs.shape[1].value - offset_m = self.get.variables(shape=[n_labels,shape], name='offset'+name, - initializer=tf.zeros_initializer) - scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, - initializer=tf.ones_initializer) - - offset = tf.nn.embedding_lookup(offset_m, labels) - scale = tf.nn.embedding_lookup(scale_m, labels) - result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-8) - return result - - def _variable_on_cpu(self,**args): - """ - This function makes sure variables/tensors are not created on the GPU but rather on the CPU - """ - name = args['name'] - shape = args['shape'] - initializer=None if 'initializer' not in args else args['initializer'] - 
with tf.device('/cpu:0') : - cpu_var = tf.compat.v1.get_variable(name,shape,initializer= initializer) - return cpu_var - def average_gradients(self,tower_grads): - average_grads = [] - for grad_and_vars in zip(*tower_grads): - grads = [] - for g, _ in grad_and_vars: - expanded_g = tf.expand_dims(g, 0) - grads.append(expanded_g) - - grad = tf.concat(axis=0, values=grads) - grad = tf.reduce_mean(grad, 0) - - v = grad_and_vars[0][1] - grad_and_var = (grad, v) - average_grads.append(grad_and_var) - return average_grads + def normalize(self,**args): + """ + This function will perform a batch normalization on an network layer + inputs input layer of the neural network + name name of the scope the + labels labels (attributes not synthesized) by default None + n_labels number of labels default None + """ + inputs = args['inputs'] + name = args['name'] + labels = None if 'labels' not in args else args['labels'] + n_labels= None if 'n_labels' not in args else args['n_labels'] + shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing + mean, var = tf.nn.moments(inputs, shift, keep_dims=True) + shape = inputs.shape[1].value + offset_m = self.get.variables(shape=[n_labels,shape], name='offset'+name, + initializer=tf.zeros_initializer) + scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, + initializer=tf.ones_initializer) + + offset = tf.nn.embedding_lookup(offset_m, labels) + scale = tf.nn.embedding_lookup(scale_m, labels) + result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-8) + return result + + def _variable_on_cpu(self,**args): + """ + This function makes sure variables/tensors are not created on the GPU but rather on the CPU + """ + + name = args['name'] + shape = args['shape'] + initializer=None if 'initializer' not in args else args['initializer'] + with tf.device('/cpu:0') : + cpu_var = tf.compat.v1.get_variable(name,shape,initializer= initializer) + return cpu_var + def average_gradients(self,tower_grads): + average_grads = [] + for grad_and_vars in zip(*tower_grads): + grads = [] + for g, _ in grad_and_vars: + expanded_g = tf.expand_dims(g, 0) + grads.append(expanded_g) + + grad = tf.concat(axis=0, values=grads) + grad = tf.reduce_mean(grad, 0) + + v = grad_and_vars[0][1] + grad_and_var = (grad, v) + average_grads.append(grad_and_var) + return average_grads class Generator (GNet): - """ - This class is designed to handle generation of candidate datasets for this it will aggregate a discriminator, this allows the generator not to be random - - """ - def __init__(self,**args): - GNet.__init__(self,**args) - self.discriminator = Discriminator(**args) - def loss(self,**args): - fake = args['fake'] - label = args['label'] - y_hat_fake = self.discriminator.network(inputs=fake, label=label) - #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - loss = -tf.reduce_mean(y_hat_fake) + sum(all_regs) - #tf.add_to_collection('glosses', loss) - tf.compat.v1.add_to_collection('glosses', loss) - return loss, loss - def load_meta(self, column): - super().load_meta(column) - self.discriminator.load_meta(column) - def network(self,**args) : - """ - This function will build the network that will generate the synthetic candidates - :inputs matrix of data that we need - :dim dimensions of ... 
""" - x = args['inputs'] - tmp_dim = self.Z_DIM if 'dim' not in args else args['dim'] - label = args['label'] + This class is designed to handle generation of candidate datasets for this it will aggregate a discriminator, this allows the generator not to be random - with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): - for i, dim in enumerate(self.G_STRUCTURE[:-1]): - kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim]) - h1 = self.normalize(inputs=tf.matmul(x, kernel),shift=0, name='cbn' + str(i), labels=label, n_labels=self.NUM_LABELS) - h2 = tf.nn.relu(h1) - x = x + h2 - tmp_dim = dim - i = len(self.G_STRUCTURE) - 1 - # - # This seems to be an extra hidden layer: - # It's goal is to map continuous values to discrete values (pre-trained to do this) - kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, self.G_STRUCTURE[-1]]) - h1 = self.normalize(inputs=tf.matmul(x, kernel), name='cbn' + str(i), - labels=label, n_labels=self.NUM_LABELS) - h2 = tf.nn.tanh(h1) - x = x + h2 - # This seems to be the output layer - # - kernel = self.get.variables(name='W_' + str(i+1), shape=[self.Z_DIM, self.X_SPACE_SIZE]) - bias = self.get.variables(name='b_' + str(i+1), shape=[self.X_SPACE_SIZE]) - x = tf.nn.sigmoid(tf.add(tf.matmul(x, kernel), bias)) - return x + """ + def __init__(self,**args): + GNet.__init__(self,**args) + self.discriminator = Discriminator(**args) + def loss(self,**args): + fake = args['fake'] + label = args['label'] + y_hat_fake = self.discriminator.network(inputs=fake, label=label) + #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) + loss = -tf.reduce_mean(y_hat_fake) + sum(all_regs) + #tf.add_to_collection('glosses', loss) + tf.compat.v1.add_to_collection('glosses', loss) + return loss, loss + def load_meta(self, column): + super().load_meta(column) + self.discriminator.load_meta(column) + def network(self,**args) : + """ + This function will build the network that will generate the synthetic candidates + :inputs matrix of data that we need + :dim dimensions of ... 
+ """ + x = args['inputs'] + tmp_dim = self.Z_DIM if 'dim' not in args else args['dim'] + label = args['label'] + + with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): + for i, dim in enumerate(self.G_STRUCTURE[:-1]): + kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim]) + h1 = self.normalize(inputs=tf.matmul(x, kernel),shift=0, name='cbn' + str(i), labels=label, n_labels=self.NUM_LABELS) + h2 = tf.nn.relu(h1) + x = x + h2 + tmp_dim = dim + i = len(self.G_STRUCTURE) - 1 + # + # This seems to be an extra hidden layer: + # It's goal is to map continuous values to discrete values (pre-trained to do this) + kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, self.G_STRUCTURE[-1]]) + h1 = self.normalize(inputs=tf.matmul(x, kernel), name='cbn' + str(i), + labels=label, n_labels=self.NUM_LABELS) + h2 = tf.nn.tanh(h1) + x = x + h2 + # This seems to be the output layer + # + kernel = self.get.variables(name='W_' + str(i+1), shape=[self.Z_DIM, self.X_SPACE_SIZE]) + bias = self.get.variables(name='b_' + str(i+1), shape=[self.X_SPACE_SIZE]) + x = tf.nn.sigmoid(tf.add(tf.matmul(x, kernel), bias)) + return x class Discriminator(GNet): - def __init__(self,**args): - GNet.__init__(self,**args) - def network(self,**args): - """ - This function will apply a computational graph on a dataset passed in with the associated labels and the last layer must have a single output (neuron) - :inputs - :label - """ - x = args['inputs'] - label = args['label'] - with tf.compat.v1.variable_scope('D', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): - for i, dim in enumerate(self.D_STRUCTURE[1:]): - kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[i], dim]) - bias = self.get.variables(name='b_' + str(i), shape=[dim]) - # print (["\t",bias,kernel]) - x = tf.nn.relu(tf.add(tf.matmul(x, kernel), bias)) - x = self.normalize(inputs=x, name='cln' + str(i), shift=1,labels=label, n_labels=self.NUM_LABELS) - i = len(self.D_STRUCTURE) - kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[-1], 1]) - bias = self.get.variables(name='b_' + str(i), shape=[1]) - y = tf.add(tf.matmul(x, kernel), bias) - return y - - def loss(self,**args) : - """ - This function compute the loss of - :real - :fake - :label - """ - real = args['real'] - fake = args['fake'] - label = args['label'] - epsilon = tf.random.uniform(shape=[self.BATCHSIZE_PER_GPU,1],minval=0,maxval=1) + def __init__(self,**args): + GNet.__init__(self,**args) + def network(self,**args): + """ + This function will apply a computational graph on a dataset passed in with the associated labels and the last layer must have a single output (neuron) + :inputs + :label + """ + x = args['inputs'] + label = args['label'] + with tf.compat.v1.variable_scope('D', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): + for i, dim in enumerate(self.D_STRUCTURE[1:]): + kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[i], dim]) + bias = self.get.variables(name='b_' + str(i), shape=[dim]) + # print (["\t",bias,kernel]) + x = tf.nn.relu(tf.add(tf.matmul(x, kernel), bias)) + x = self.normalize(inputs=x, name='cln' + str(i), shift=1,labels=label, n_labels=self.NUM_LABELS) + i = len(self.D_STRUCTURE) + kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[-1], 1]) + bias = self.get.variables(name='b_' + str(i), shape=[1]) + y = tf.add(tf.matmul(x, kernel), bias) + return y - x_hat = real + epsilon * 
(fake - real) - y_hat_fake = self.network(inputs=fake, label=label) - - y_hat_real = self.network(inputs=real, label=label) - y_hat = self.network(inputs=x_hat, label=label) - - grad = tf.gradients(y_hat, [x_hat])[0] - slopes = tf.sqrt(tf.reduce_sum(tf.square(grad), 1)) - gradient_penalty = tf.reduce_mean((slopes - 1.) ** 2) - #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - w_distance = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake) - loss = w_distance + 10 * gradient_penalty + sum(all_regs) - #tf.add_to_collection('dlosses', loss) - tf.compat.v1.add_to_collection('dlosses', loss) - - return w_distance, loss + def loss(self,**args) : + """ + This function compute the loss of + :real + :fake + :label + """ + real = args['real'] + fake = args['fake'] + label = args['label'] + epsilon = tf.random.uniform(shape=[self.BATCHSIZE_PER_GPU,1],minval=0,maxval=1) + + x_hat = real + epsilon * (fake - real) + y_hat_fake = self.network(inputs=fake, label=label) + + y_hat_real = self.network(inputs=real, label=label) + y_hat = self.network(inputs=x_hat, label=label) + + grad = tf.gradients(y_hat, [x_hat])[0] + slopes = tf.sqrt(tf.reduce_sum(tf.square(grad), 1)) + gradient_penalty = tf.reduce_mean((slopes - 1.) ** 2) + #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) + w_distance = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake) + loss = w_distance + 10 * gradient_penalty + sum(all_regs) + #tf.add_to_collection('dlosses', loss) + tf.compat.v1.add_to_collection('dlosses', loss) + + return w_distance, loss class Train (GNet): - def __init__(self,**args): - GNet.__init__(self,**args) - self.generator = Generator(**args) - self.discriminator = Discriminator(**args) - self._REAL = args['real'] - self._LABEL= args['label'] - self.column = args['column'] - # print ([" *** ",self.BATCHSIZE_PER_GPU]) - - self.meta = self.log_meta() - if(self.logger): - - self.logger.write( self.meta ) - - self.log (real_shape=list(self._REAL.shape),label_shape = list(self._LABEL.shape),meta_data=self.meta) - def load_meta(self, column): - """ - This function will delegate the calls to load meta data to it's dependents - column name - """ - super().load_meta(column) - self.generator.load_meta(column) - self.discriminator.load_meta(column) - def loss(self,**args): - """ - This function will compute a "tower" loss of the generated candidate against real data - Training will consist in having both generator and discriminators - :scope - :stage - :real - :label - """ - - scope = args['scope'] - stage = args['stage'] - real = args['real'] - label = args['label'] - label = tf.cast(label, tf.int32) - # - # @TODO: Ziqi needs to explain what's going on here - m = [[i] for i in np.arange(self._LABEL.shape[1]-2)] - label = label[:, 1] * len(m) + tf.squeeze( - tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32)) - ) - # label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] ) - z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) + def __init__(self,**args): + GNet.__init__(self,**args) + self.generator = Generator(**args) + self.discriminator = Discriminator(**args) + self._REAL = args['real'] + self._LABEL= args['label'] + self.column = args['column'] + # print ([" *** ",self.BATCHSIZE_PER_GPU]) + + self.meta = self.log_meta() + if(self.logger): + + self.logger.write( self.meta ) + + 
self.log (real_shape=list(self._REAL.shape),label_shape = list(self._LABEL.shape),meta_data=self.meta) + def load_meta(self, column): + """ + This function will delegate the calls to load meta data to it's dependents + column name + """ + super().load_meta(column) + self.generator.load_meta(column) + self.discriminator.load_meta(column) + def loss(self,**args): + """ + This function will compute a "tower" loss of the generated candidate against real data + Training will consist in having both generator and discriminators + :scope + :stage + :real + :label + """ + + scope = args['scope'] + stage = args['stage'] + real = args['real'] + label = args['label'] + label = tf.cast(label, tf.int32) + # + # @TODO: Ziqi needs to explain what's going on here + m = [[i] for i in np.arange(self._LABEL.shape[1]-2)] + label = label[:, 1] * len(m) + tf.squeeze( + tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32)) + ) + # label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] ) + z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) + + fake = self.generator.network(inputs=z, label=label) + if stage == 'D': + w, loss = self.discriminator.loss(real=real, fake=fake, label=label) + #losses = tf.get_collection('dlosses', scope) + flag = 'dlosses' + losses = tf.compat.v1.get_collection('dlosses', scope) + else: + w, loss = self.generator.loss(fake=fake, label=label) + #losses = tf.get_collection('glosses', scope) + flag = 'glosses' + losses = tf.compat.v1.get_collection('glosses', scope) + # losses = tf.compat.v1.get_collection(flag, scope) + + total_loss = tf.add_n(losses, name='total_loss') + + return total_loss, w + def input_fn(self): + """ + This function seems to produce + """ + features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32) + labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32) + dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) + dataset = dataset.repeat(10000) + dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) + dataset = dataset.prefetch(1) + # iterator = dataset.make_initializable_iterator() + iterator = tf.compat.v1.data.make_initializable_iterator(dataset) + # next_element = iterator.get_next() + # init_op = iterator.initializer + return iterator, features_placeholder, labels_placeholder - fake = self.generator.network(inputs=z, label=label) - if stage == 'D': - w, loss = self.discriminator.loss(real=real, fake=fake, label=label) - #losses = tf.get_collection('dlosses', scope) - flag = 'dlosses' - losses = tf.compat.v1.get_collection('dlosses', scope) - else: - w, loss = self.generator.loss(fake=fake, label=label) - #losses = tf.get_collection('glosses', scope) - flag = 'glosses' - losses = tf.compat.v1.get_collection('glosses', scope) - # losses = tf.compat.v1.get_collection(flag, scope) - - total_loss = tf.add_n(losses, name='total_loss') + def network(self,**args): + # def graph(stage, opt): + # global_step = tf.get_variable(stage+'_step', [], initializer=tf.constant_initializer(0), trainable=False) + stage = args['stage'] + opt = args['opt'] + tower_grads = [] + per_gpu_w = [] + iterator, features_placeholder, labels_placeholder = self.input_fn() + with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()): + for i in range(self.NUM_GPUS): + with tf.device('/gpu:%d' % i): + with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: + (real, label) = iterator.get_next() + loss, w = self.loss(scope=scope, stage=stage, real=self._REAL, 
label=self._LABEL) + #tf.get_variable_scope().reuse_variables() + tf.compat.v1.get_variable_scope().reuse_variables() + #vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage) + vars_ = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=stage) + grads = opt.compute_gradients(loss, vars_) + tower_grads.append(grads) + per_gpu_w.append(w) + + grads = self.average_gradients(tower_grads) + apply_gradient_op = opt.apply_gradients(grads) + + mean_w = tf.reduce_mean(per_gpu_w) + train_op = apply_gradient_op + return train_op, mean_w, iterator, features_placeholder, labels_placeholder + def apply(self,**args): + # max_epochs = args['max_epochs'] if 'max_epochs' in args else 10 + REAL = self._REAL + LABEL= self._LABEL + if (self.logger): + pass + + with tf.device('/cpu:0'): + opt_d = tf.compat.v1.train.AdamOptimizer(1e-4) + opt_g = tf.compat.v1.train.AdamOptimizer(1e-4) + + train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d) + train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g) + # saver = tf.train.Saver() + saver = tf.compat.v1.train.Saver() + # init = tf.global_variables_initializer() + init = tf.compat.v1.global_variables_initializer() + logs = [] + #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: + with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: + sess.run(init) + sess.run(iterator_d.initializer, + feed_dict={features_placeholder_d: REAL, labels_placeholder_d: LABEL}) + sess.run(iterator_g.initializer, + feed_dict={features_placeholder_g: REAL, labels_placeholder_g: LABEL}) + + for epoch in range(1, self.MAX_EPOCHS + 1): + start_time = time.time() + w_sum = 0 + for i in range(self.STEPS_PER_EPOCH): + for _ in range(2): + _, w = sess.run([train_d, w_distance]) + w_sum += w + sess.run(train_g) + duration = time.time() - start_time + + assert not np.isnan(w_sum), 'Model diverged with loss = NaN' + + format_str = 'epoch: %d, w_distance = %f (%.1f)' + print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) + # print (dir (w_distance)) + + logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) + + if epoch % self.MAX_EPOCHS == 0: + # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + suffix = self.get.suffix() + _name = os.sep.join([self.train_dir,suffix]) + # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) + saver.save(sess, _name, write_meta_graph=False, global_step=epoch) + # + # + if self.logger : + row = {"logs":logs} #,"model":pickle.dump(sess)} + self.logger.write(row) + # + # @TODO: + # We should upload the files in the checkpoint + # This would allow the learnt model to be portable to another system + # + tf.compat.v1.reset_default_graph() - return total_loss, w - def input_fn(self): +class Predict(GNet): """ - This function seems to produce + This class uses synthetic data given a learned model """ - features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32) - labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32) - dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) - dataset = dataset.repeat(10000) - dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) - 
dataset = dataset.prefetch(1) - # iterator = dataset.make_initializable_iterator() - iterator = tf.compat.v1.data.make_initializable_iterator(dataset) - # next_element = iterator.get_next() - # init_op = iterator.initializer - return iterator, features_placeholder, labels_placeholder - - def network(self,**args): - # def graph(stage, opt): - # global_step = tf.get_variable(stage+'_step', [], initializer=tf.constant_initializer(0), trainable=False) - stage = args['stage'] - opt = args['opt'] - tower_grads = [] - per_gpu_w = [] - iterator, features_placeholder, labels_placeholder = self.input_fn() - with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()): - for i in range(self.NUM_GPUS): - with tf.device('/gpu:%d' % i): - with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: - (real, label) = iterator.get_next() - loss, w = self.loss(scope=scope, stage=stage, real=self._REAL, label=self._LABEL) - #tf.get_variable_scope().reuse_variables() - tf.compat.v1.get_variable_scope().reuse_variables() - #vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage) - vars_ = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=stage) - grads = opt.compute_gradients(loss, vars_) - tower_grads.append(grads) - per_gpu_w.append(w) - - grads = self.average_gradients(tower_grads) - apply_gradient_op = opt.apply_gradients(grads) - - mean_w = tf.reduce_mean(per_gpu_w) - train_op = apply_gradient_op - return train_op, mean_w, iterator, features_placeholder, labels_placeholder - def apply(self,**args): - # max_epochs = args['max_epochs'] if 'max_epochs' in args else 10 - REAL = self._REAL - LABEL= self._LABEL - if (self.logger): - pass - - with tf.device('/cpu:0'): - opt_d = tf.compat.v1.train.AdamOptimizer(1e-4) - opt_g = tf.compat.v1.train.AdamOptimizer(1e-4) - - train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d) - train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g) - # saver = tf.train.Saver() - saver = tf.compat.v1.train.Saver() - # init = tf.global_variables_initializer() - init = tf.compat.v1.global_variables_initializer() - logs = [] - #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: - with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: - sess.run(init) - sess.run(iterator_d.initializer, - feed_dict={features_placeholder_d: REAL, labels_placeholder_d: LABEL}) - sess.run(iterator_g.initializer, - feed_dict={features_placeholder_g: REAL, labels_placeholder_g: LABEL}) - - for epoch in range(1, self.MAX_EPOCHS + 1): - start_time = time.time() - w_sum = 0 - for i in range(self.STEPS_PER_EPOCH): - for _ in range(2): - _, w = sess.run([train_d, w_distance]) - w_sum += w - sess.run(train_g) - duration = time.time() - start_time - - assert not np.isnan(w_sum), 'Model diverged with loss = NaN' - - format_str = 'epoch: %d, w_distance = %f (%.1f)' - print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) - # print (dir (w_distance)) - - logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) - - if epoch % self.MAX_EPOCHS == 0: - # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - suffix = self.get.suffix() - _name = os.sep.join([self.train_dir,suffix]) - # saver.save(sess, self.train_dir, write_meta_graph=False, 
global_step=epoch) - saver.save(sess, _name, write_meta_graph=False, global_step=epoch) + def __init__(self,**args): + GNet.__init__(self,**args) + self.generator = Generator(**args) + self.values = args['values'] + def load_meta(self, column): + super().load_meta(column) + self.generator.load_meta(column) + def apply(self,**args): + # print (self.train_dir) + # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + suffix = self.get.suffix() + model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) + demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] + tf.compat.v1.reset_default_graph() + z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) + y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32) + ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] + label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) + + fake = self.generator.network(inputs=z, label=label) + init = tf.compat.v1.global_variables_initializer() + saver = tf.compat.v1.train.Saver() + df = pd.DataFrame() + CANDIDATE_COUNT = 1000 + NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0] + with tf.compat.v1.Session() as sess: + + # sess.run(init) + saver.restore(sess, model_dir) + labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) + + found = [] + labels= demo + for i in np.arange(CANDIDATE_COUNT) : + + f = sess.run(fake,feed_dict={y:labels}) + # + # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes + # The code below will insure we have some acceptable cardinal relationships between id and synthetic values + # + df = ( pd.DataFrame(np.round(f).astype(np.int32))) + print (df.head()) + print () + p = 0 not in df.sum(axis=1).values + + if p: + found.append(df) + if len(found) == NTH_VALID_CANDIDATE or i == CANDIDATE_COUNT: + break + else: + continue + + # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms + # df = (i * df).sum(axis=1) # + # In case we are dealing with actual values like diagnosis codes we can perform # - if self.logger : - row = {"logs":logs} #,"model":pickle.dump(sess)} - self.logger.write(row) - # - # @TODO: - # We should upload the files in the checkpoint - # This would allow the learnt model to be portable to another system - # - tf.compat.v1.reset_default_graph() + df = found[np.random.choice(np.arange(len(found)),1)[0]] + columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] + + # r = np.zeros((self.ROW_COUNT,len(columns))) + r = np.zeros(self.ROW_COUNT) + df.columns = self.values + if len(found): + print (len(found),NTH_VALID_CANDIDATE) + # x = df * self.values + + df = pd.DataFrame( df.apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) + df.columns = columns + + + + + tf.compat.v1.reset_default_graph() + + return df.to_dict(orient='list') + # return df.to_dict(orient='list') + # count = str(len(os.listdir(self.out_dir))) + # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv']) + # df.to_csv(_name,index=False) + + + # output.extend(np.round(f)) + + # for m in range(2): + # for n in range(2, self.NUM_LABELS): + # idx1 = (demo[:, m] == 1) + # idx2 = (demo[:, n] == 1) + # idx = [idx1[j] and idx2[j] for j in range(len(idx1))] + # num = np.sum(idx) + # print 
("___________________list__") + # print (idx1) + # print (idx2) + # print (idx) + # print (num) + # print ("_____________________") + # nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU)) + # label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS)) + # label_input[:, n] = 1 + # label_input[:, m] = 1 + # output = [] + # for i in range(nbatch): + # f = sess.run(fake,feed_dict={y: label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]}) + # output.extend(np.round(f)) + # output = np.array(output)[:num] + # print ([m,n,output]) + + # np.save(self.out_dir + str(m) + str(n), output) + -class Predict(GNet): - """ - This class uses synthetic data given a learned model - """ - def __init__(self,**args): - GNet.__init__(self,**args) - self.generator = Generator(**args) - self.values = args['values'] - def load_meta(self, column): - super().load_meta(column) - self.generator.load_meta(column) - def apply(self,**args): - # print (self.train_dir) - # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - suffix = self.get.suffix() - model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) - demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] - tf.compat.v1.reset_default_graph() - z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) - y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32) - ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] - label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) +if __name__ == '__main__' : + # + # Now we get things done ... + column = SYS_ARGS['column'] + column_id = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id' + column_id = column_id.split(',') if ',' in column_id else column_id + df = pd.read_csv(SYS_ARGS['raw-data']) + LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - fake = self.generator.network(inputs=z, label=label) - init = tf.compat.v1.global_variables_initializer() - saver = tf.compat.v1.train.Saver() - df = pd.DataFrame() - CANDIDATE_COUNT = 1000 - NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0] - with tf.compat.v1.Session() as sess: - - # sess.run(init) - saver.restore(sess, model_dir) - labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) - - found = [] - labels= demo - for i in np.arange(CANDIDATE_COUNT) : + context = SYS_ARGS['raw-data'].split(os.sep)[-1:][0][:-4] + if set(['train','learn']) & set(SYS_ARGS.keys()): + + df = pd.read_csv(SYS_ARGS['raw-data']) + + # cols = SYS_ARGS['column'] + # _map,_df = (Binary()).Export(df) + # i = np.arange(_map[column]['start'],_map[column]['end']) + max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10 + # REAL = _df[:,i] + REAL = pd.get_dummies(df[column]).astype(np.float32).values + LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values + trainer = Train(context=context,max_epochs=max_epochs,real=REAL,label=LABEL,column=column,column_id=column_id) + trainer.apply() - f = sess.run(fake,feed_dict={y:labels}) + + + + # + # We should train upon this data # - # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes - # The code below will insure we have some acceptable cardinal relationships between id and synthetic values + # -- we need to convert the data-frame to binary matrix, given a column # - df = ( 
pd.DataFrame(np.round(f).astype(np.int32))) - p = 0 not in df.sum(axis=1).values + pass + elif 'generate' in SYS_ARGS: + values = df[column].unique().tolist() + values.sort() - if p: - found.append(df) - if len(found) == NTH_VALID_CANDIDATE or i == CANDIDATE_COUNT: - break - else: - continue - - # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms - # df = (i * df).sum(axis=1) - # - # In case we are dealing with actual values like diagnosis codes we can perform - # - df = found[np.random.choice(np.arange(len(found)),1)[0]] - columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] - - # r = np.zeros((self.ROW_COUNT,len(columns))) - r = np.zeros(self.ROW_COUNT) - df.columns = self.values - if len(found): - print (len(found),NTH_VALID_CANDIDATE) - # x = df * self.values + p = Predict(context=context,label=LABEL,values=values,column=column) + p.load_meta(column) + r = p.apply() + print (df) + print () + df[column] = r[column] + print (df) - df = pd.DataFrame( df.apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) - df.columns = columns - - - - tf.compat.v1.reset_default_graph() - - return df.to_dict(orient='list') - # return df.to_dict(orient='list') - # count = str(len(os.listdir(self.out_dir))) - # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv']) - # df.to_csv(_name,index=False) - - - # output.extend(np.round(f)) - - # for m in range(2): - # for n in range(2, self.NUM_LABELS): - # idx1 = (demo[:, m] == 1) - # idx2 = (demo[:, n] == 1) - # idx = [idx1[j] and idx2[j] for j in range(len(idx1))] - # num = np.sum(idx) - # print ("___________________list__") - # print (idx1) - # print (idx2) - # print (idx) - # print (num) - # print ("_____________________") - # nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU)) - # label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS)) - # label_input[:, n] = 1 - # label_input[:, m] = 1 - # output = [] - # for i in range(nbatch): - # f = sess.run(fake,feed_dict={y: label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]}) - # output.extend(np.round(f)) - # output = np.array(output)[:num] - # print ([m,n,output]) - - # np.save(self.out_dir + str(m) + str(n), output) - - -if __name__ == '__main__' : - # - # Now we get things done ... 
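
For orientation, the command-line block being reworked in this hunk reduces to the flow below when the same classes are driven from Python directly. This is a minimal illustrative sketch, assuming the module is importable as data.gan; 'sample.csv', 'gender' and 'person_id' are placeholder names, not values taken from the patch.

    # Illustrative sketch only: programmatic equivalent of the train / generate
    # branches of the __main__ block shown in this hunk.
    import numpy as np
    import pandas as pd
    from data.gan import Train, Predict   # same classes, used outside this module

    df = pd.read_csv('sample.csv')                                      # stands in for --raw-data
    REAL = pd.get_dummies(df['gender']).astype(np.float32).values       # column to synthesize
    LABEL = pd.get_dummies(df['person_id']).astype(np.float32).values   # identifier column

    # 'train' / 'learn' branch
    Train(context='sample', max_epochs=10, real=REAL, label=LABEL,
          column='gender', column_id='person_id').apply()

    # 'generate' branch
    values = sorted(df['gender'].unique().tolist())
    p = Predict(context='sample', label=LABEL, values=values, column='gender')
    p.load_meta('gender')
    r = p.apply()
    df['gender'] = r['gender']   # replace the real column with synthesized values
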
- column = SYS_ARGS['column'] - column_id = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id' - column_id = column_id.split(',') if ',' in column_id else column_id - df = pd.read_csv(SYS_ARGS['raw-data']) - LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - - context = SYS_ARGS['raw-data'].split(os.sep)[-1:][0][:-4] - if set(['train','learn']) & set(SYS_ARGS.keys()): - - df = pd.read_csv(SYS_ARGS['raw-data']) - - # cols = SYS_ARGS['column'] - # _map,_df = (Binary()).Export(df) - # i = np.arange(_map[column]['start'],_map[column]['end']) - max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10 - # REAL = _df[:,i] - REAL = pd.get_dummies(df[column]).astype(np.float32).values - LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - trainer = Train(context=context,max_epochs=max_epochs,real=REAL,label=LABEL,column=column,column_id=column_id) - trainer.apply() - - - - - # - # We should train upon this data - # - # -- we need to convert the data-frame to binary matrix, given a column - # + else: + print (SYS_ARGS.keys()) + print (__doc__) pass - elif 'generate' in SYS_ARGS: - values = df[column].unique().tolist() - values.sort() - - p = Predict(context=context,label=LABEL,values=values,column=column) - p.load_meta(column) - r = p.apply() - print (df) - print () - df[column] = r[column] - print (df) - - - else: - print (SYS_ARGS.keys()) - print (__doc__) - pass diff --git a/setup.py b/setup.py index 8034249..a0b96c7 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.0.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.0.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' From 0f0c2642c2e8d1d3a2463c6945c18441a7392691 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 18 Feb 2020 02:59:39 -0600 Subject: [PATCH 010/250] bug fix with binary matrix generation --- data/bridge.py | 8 +++++--- data/gan.py | 8 +------- data/maker/__init__.py | 12 +++++++++--- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/data/bridge.py b/data/bridge.py index fa323af..019f065 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -191,12 +191,13 @@ class Binary : # # This will give us a map of how each column was mapped to a bitstream - _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0) + # _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0) + _map = df.fillna('').apply(lambda column: self.__stream(column),axis=0) # # We will merge this to have a healthy matrix _matrix = _map.apply(lambda row: list(list(itertools.chain(*row.values.tolist()))),axis=1) - _matrix = np.matrix([list(item) for item in _matrix]) + _matrix = np.matrix([list(item) for item in _matrix]).astype(np.float32) # # let's format the map so we don't have an unreasonable amount of data # @@ -210,7 +211,8 @@ class Binary : _m[name] = {"start":beg,"end":end} beg = end - return _m,_matrix.astype(np.float32) + # return 
_m,_matrix.astype(np.float32) + return _matrix def Import(self,df,values,_map): """ diff --git a/data/gan.py b/data/gan.py index 367d63c..3d600a3 100644 --- a/data/gan.py +++ b/data/gan.py @@ -397,17 +397,13 @@ class Train (GNet): labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32) dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) dataset = dataset.repeat(10000) - dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) + dataset = dataset.batch(batch_size=3000) dataset = dataset.prefetch(1) # iterator = dataset.make_initializable_iterator() iterator = tf.compat.v1.data.make_initializable_iterator(dataset) - # next_element = iterator.get_next() - # init_op = iterator.initializer return iterator, features_placeholder, labels_placeholder def network(self,**args): - # def graph(stage, opt): - # global_step = tf.get_variable(stage+'_step', [], initializer=tf.constant_initializer(0), trainable=False) stage = args['stage'] opt = args['opt'] tower_grads = [] @@ -540,8 +536,6 @@ class Predict(GNet): # The code below will insure we have some acceptable cardinal relationships between id and synthetic values # df = ( pd.DataFrame(np.round(f).astype(np.int32))) - print (df.head()) - print () p = 0 not in df.sum(axis=1).values if p: diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 12abc8d..74ae718 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -12,6 +12,7 @@ import pandas as pd import numpy as np import data.gan as gan from transport import factory +from data.bridge import Binary import threading as thread def train (**args) : """ @@ -32,9 +33,12 @@ def train (**args) : # If we have several columns we will proceed one at a time (it could be done in separate threads) # @TODO : Consider performing this task on several threads/GPUs simulataneously # - args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values + handler = Binary() + # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values + args['label'] = handler.Export(df[[column_id]]) for col in column : - args['real'] = pd.get_dummies(df[col]).astype(np.float32).values + # args['real'] = pd.get_dummies(df[col]).astype(np.float32).values + args['real'] = handler.Export(df[[col]]) args['column'] = col args['context'] = col context = args['context'] @@ -77,7 +81,9 @@ def generate(**args): #@TODO: # If the identifier is not present, we should fine a way to determine or make one # - args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values + # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values + bwrangler = Binary() + args['label'] = bwrangler.Export(df[[column_id]]) _df = df.copy() for col in column : args['context'] = col From dab3ab7bf732504f0205536402a1976c18ca3df0 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 18 Feb 2020 03:09:47 -0600 Subject: [PATCH 011/250] version stuff --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a0b96c7..fcc12c1 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.0.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.1.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", 
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' From 4a25af6b1345223d9acb20f6eef74b09dd083eeb Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 18 Feb 2020 12:25:47 -0600 Subject: [PATCH 012/250] removing conditions, it blows up computational space --- data/gan.py | 89 +++++++++++++++++++++++++++--------------- data/maker/__init__.py | 7 ++-- setup.py | 2 +- 3 files changed, 62 insertions(+), 36 deletions(-) diff --git a/data/gan.py b/data/gan.py index 3d600a3..77fcf3d 100644 --- a/data/gan.py +++ b/data/gan.py @@ -72,7 +72,7 @@ class GNet : elif 'label' in args and len(args['label']) == 1 : self.NUM_LABELS = args['label'].shape[0] else: - self.NUM_LABELS = 8 + self.NUM_LABELS = None # self.Z_DIM = 128 #self.X_SPACE_SIZE self.Z_DIM = 128 #-- used as rows down stream self.G_STRUCTURE = [self.Z_DIM,self.Z_DIM] @@ -180,14 +180,19 @@ class GNet : shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing mean, var = tf.nn.moments(inputs, shift, keep_dims=True) shape = inputs.shape[1].value - offset_m = self.get.variables(shape=[n_labels,shape], name='offset'+name, - initializer=tf.zeros_initializer) - scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, - initializer=tf.ones_initializer) - - offset = tf.nn.embedding_lookup(offset_m, labels) - scale = tf.nn.embedding_lookup(scale_m, labels) - result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-8) + if labels is not None: + offset_m = self.get.variables(shape=[1,shape], name='offset'+name, + initializer=tf.zeros_initializer) + scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, + initializer=tf.ones_initializer) + offset = tf.nn.embedding_lookup(offset_m, labels) + scale = tf.nn.embedding_lookup(scale_m, labels) + + else: + offset = None + scale = None + + result = tf.nn.batch_normalization(inputs, mean, var,offset,scale, 1e-8) return result def _variable_on_cpu(self,**args): @@ -248,7 +253,7 @@ class Generator (GNet): x = args['inputs'] tmp_dim = self.Z_DIM if 'dim' not in args else args['dim'] label = args['label'] - + print (self.NUM_LABELS) with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): for i, dim in enumerate(self.G_STRUCTURE[:-1]): kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim]) @@ -331,7 +336,7 @@ class Train (GNet): self.generator = Generator(**args) self.discriminator = Discriminator(**args) self._REAL = args['real'] - self._LABEL= args['label'] + self._LABEL= args['label'] if 'label' in args else None self.column = args['column'] # print ([" *** ",self.BATCHSIZE_PER_GPU]) @@ -340,7 +345,7 @@ class Train (GNet): self.logger.write( self.meta ) - self.log (real_shape=list(self._REAL.shape),label_shape = list(self._LABEL.shape),meta_data=self.meta) + # self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta) def load_meta(self, column): """ This function will delegate the calls to load meta data to it's dependents @@ -363,13 +368,16 @@ class Train (GNet): stage = args['stage'] real = args['real'] label = args['label'] - label = tf.cast(label, tf.int32) - # - # @TODO: Ziqi needs to explain what's going on here - m = [[i] for i in 
np.arange(self._LABEL.shape[1]-2)] - label = label[:, 1] * len(m) + tf.squeeze( - tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32)) - ) + + + if label is not None : + label = tf.cast(label, tf.int32) + # + # @TODO: Ziqi needs to explain what's going on here + m = [[i] for i in np.arange(self._LABEL.shape[1]-2)] + label = label[:, 1] * len(m) + tf.squeeze( + tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32)) + ) # label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] ) z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) @@ -394,8 +402,13 @@ class Train (GNet): This function seems to produce """ features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32) - labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32) - dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) + LABEL_SHAPE = [None,None] if self._LABEL is None else self._LABEL.shape + labels_placeholder = tf.compat.v1.placeholder(shape=LABEL_SHAPE, dtype=tf.float32) + if self._LABEL is not None : + dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) + else : + dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) + # labels_placeholder = None dataset = dataset.repeat(10000) dataset = dataset.batch(batch_size=3000) dataset = dataset.prefetch(1) @@ -413,7 +426,10 @@ class Train (GNet): for i in range(self.NUM_GPUS): with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: - (real, label) = iterator.get_next() + if self._LABEL is not None : + (real, label) = iterator.get_next() + else: + real = iterator.get_next() loss, w = self.loss(scope=scope, stage=stage, real=self._REAL, label=self._LABEL) #tf.get_variable_scope().reuse_variables() tf.compat.v1.get_variable_scope().reuse_variables() @@ -450,11 +466,12 @@ class Train (GNet): #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: sess.run(init) + sess.run(iterator_d.initializer, - feed_dict={features_placeholder_d: REAL, labels_placeholder_d: LABEL}) + feed_dict={features_placeholder_d: REAL}) sess.run(iterator_g.initializer, - feed_dict={features_placeholder_g: REAL, labels_placeholder_g: LABEL}) - + feed_dict={features_placeholder_g: REAL}) + for epoch in range(1, self.MAX_EPOCHS + 1): start_time = time.time() w_sum = 0 @@ -511,9 +528,11 @@ class Predict(GNet): tf.compat.v1.reset_default_graph() z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32) - ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] - label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) - + if self._LABEL is not None : + ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] + label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) + else: + label = None fake = self.generator.network(inputs=z, label=label) init = tf.compat.v1.global_variables_initializer() saver = tf.compat.v1.train.Saver() @@ -524,13 +543,19 @@ class Predict(GNet): # sess.run(init) saver.restore(sess, model_dir) - labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) + if self._LABEL is not None : + labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) + labels= demo + else: + labels = None found = 
[] - labels= demo + for i in np.arange(CANDIDATE_COUNT) : - - f = sess.run(fake,feed_dict={y:labels}) + if labels : + f = sess.run(fake,feed_dict={y:labels}) + else: + f = sess.run(fake) # # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes # The code below will insure we have some acceptable cardinal relationships between id and synthetic values diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 74ae718..71fdc68 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -25,7 +25,7 @@ def train (**args) : """ column = args['column'] if (isinstance(args['column'],list)) else [args['column']] - column_id = args['id'] + # column_id = args['id'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) df.columns = [name.lower() for name in df.columns] @@ -35,7 +35,8 @@ def train (**args) : # handler = Binary() # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values - args['label'] = handler.Export(df[[column_id]]) + # args['label'] = handler.Export(df[[column_id]]) + # args['label'] = np.ones(df.shape[0]).reshape(df.shape[0],1) for col in column : # args['real'] = pd.get_dummies(df[col]).astype(np.float32).values args['real'] = handler.Export(df[[col]]) @@ -83,7 +84,7 @@ def generate(**args): # # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values bwrangler = Binary() - args['label'] = bwrangler.Export(df[[column_id]]) + # args['label'] = bwrangler.Export(df[[column_id]]) _df = df.copy() for col in column : args['context'] = col diff --git a/setup.py b/setup.py index fcc12c1..50155cc 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ def read(fname): args = {"name":"data-maker","version":"1.1.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] -args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' +args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' if sys.version_info[0] == 2 : args['use_2to3'] = False From cac2dd293def20f8343ef0e3647e2f83c9f6c461 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 18 Feb 2020 16:56:24 -0600 Subject: [PATCH 013/250] bug fix with dimensionalities and removing conditions --- data/gan.py | 9 ++++++--- setup.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/data/gan.py b/data/gan.py index 77fcf3d..c18277c 100644 --- a/data/gan.py +++ b/data/gan.py @@ -79,7 +79,8 @@ class GNet : if 'real' in args : self.D_STRUCTURE = [args['real'].shape[1],256,self.Z_DIM] - self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) if 'real' in args else 256 + # self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) if 'real' in args else 256 + self.BATCHSIZE_PER_GPU = 3000 if 'batch_size' not in args else int(args['batch_size']) self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) @@ -410,7 +411,7 @@ class Train (GNet): dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) # labels_placeholder = None dataset = dataset.repeat(10000) - dataset = dataset.batch(batch_size=3000) + dataset = 
dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) dataset = dataset.prefetch(1) # iterator = dataset.make_initializable_iterator() iterator = tf.compat.v1.data.make_initializable_iterator(dataset) @@ -430,7 +431,8 @@ class Train (GNet): (real, label) = iterator.get_next() else: real = iterator.get_next() - loss, w = self.loss(scope=scope, stage=stage, real=self._REAL, label=self._LABEL) + label= None + loss, w = self.loss(scope=scope, stage=stage, real=real, label=label) #tf.get_variable_scope().reuse_variables() tf.compat.v1.get_variable_scope().reuse_variables() #vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage) @@ -465,6 +467,7 @@ class Train (GNet): logs = [] #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: + sess.run(init) sess.run(iterator_d.initializer, diff --git a/setup.py b/setup.py index 50155cc..8d41539 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.1.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.1.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From f63ede2fc58c983635b4c5a89ef33031938232d9 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 18 Feb 2020 17:23:13 -0600 Subject: [PATCH 014/250] tweak with batch size/gpu (bug with small data) --- data/gan.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/data/gan.py b/data/gan.py index c18277c..ed8facd 100644 --- a/data/gan.py +++ b/data/gan.py @@ -59,8 +59,8 @@ class GNet : self.logs = {} self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] - if self.NUM_GPUS > 1 : - os.environ['CUDA_VISIBLE_DEVICES'] = "4" + # if self.NUM_GPUS > 1 : + # os.environ['CUDA_VISIBLE_DEVICES'] = "4" self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854 self.G_STRUCTURE = [128,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE] @@ -78,9 +78,12 @@ class GNet : self.G_STRUCTURE = [self.Z_DIM,self.Z_DIM] if 'real' in args : self.D_STRUCTURE = [args['real'].shape[1],256,self.Z_DIM] - - # self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) if 'real' in args else 256 - self.BATCHSIZE_PER_GPU = 3000 if 'batch_size' not in args else int(args['batch_size']) + PROPOSED_BATCH_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size']) + if args['real'].shape[0] < PROPOSED_BATCH_PER_GPU : + self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) + else: + self.BATCHSIZE_PER_GPU = PROPOSED_BATCH_PER_GPU + # self.BATCHSIZE_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size']) self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) @@ -254,7 +257,7 @@ class Generator (GNet): x = args['inputs'] tmp_dim = 
self.Z_DIM if 'dim' not in args else args['dim'] label = args['label'] - print (self.NUM_LABELS) + with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): for i, dim in enumerate(self.G_STRUCTURE[:-1]): kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim]) From 74c1f9d511a494bcf5e6d40c3d0e9690e088cea4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 20 Feb 2020 09:52:53 -0600 Subject: [PATCH 015/250] bug fix with class hierarchy --- data/gan.py | 12 ++++++------ data/maker/__init__.py | 4 ++-- setup.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/data/gan.py b/data/gan.py index ed8facd..fd30070 100644 --- a/data/gan.py +++ b/data/gan.py @@ -76,13 +76,13 @@ class GNet : # self.Z_DIM = 128 #self.X_SPACE_SIZE self.Z_DIM = 128 #-- used as rows down stream self.G_STRUCTURE = [self.Z_DIM,self.Z_DIM] - if 'real' in args : - self.D_STRUCTURE = [args['real'].shape[1],256,self.Z_DIM] PROPOSED_BATCH_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size']) - if args['real'].shape[0] < PROPOSED_BATCH_PER_GPU : - self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) - else: - self.BATCHSIZE_PER_GPU = PROPOSED_BATCH_PER_GPU + self.BATCHSIZE_PER_GPU = PROPOSED_BATCH_PER_GPU + if 'real' in args : + self.D_STRUCTURE = [args['real'].shape[1],256,self.Z_DIM] + + if args['real'].shape[0] < PROPOSED_BATCH_PER_GPU : + self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) # self.BATCHSIZE_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size']) self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 71fdc68..cbd1ea9 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -38,8 +38,8 @@ def train (**args) : # args['label'] = handler.Export(df[[column_id]]) # args['label'] = np.ones(df.shape[0]).reshape(df.shape[0],1) for col in column : - # args['real'] = pd.get_dummies(df[col]).astype(np.float32).values - args['real'] = handler.Export(df[[col]]) + args['real'] = pd.get_dummies(df[col]).astype(np.float32).values + # args['real'] = handler.Export(df[[col]]) args['column'] = col args['context'] = col context = args['context'] diff --git a/setup.py b/setup.py index 8d41539..a7e0642 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.1.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.1.2","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From cd88a9660a64f6115f27065a52551ca9fa8dd35e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 20 Feb 2020 22:37:25 -0600 Subject: [PATCH 016/250] bug fix , generator --- data/gan.py | 1 + data/maker/__init__.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index fd30070..2083f69 100644 --- a/data/gan.py +++ b/data/gan.py @@ -581,6 +581,7 @@ 
class Predict(GNet): # # In case we are dealing with actual values like diagnosis codes we can perform # + df = found[np.random.choice(np.arange(len(found)),1)[0]] columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] diff --git a/data/maker/__init__.py b/data/maker/__init__.py index cbd1ea9..3c04b57 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -38,7 +38,7 @@ def train (**args) : # args['label'] = handler.Export(df[[column_id]]) # args['label'] = np.ones(df.shape[0]).reshape(df.shape[0],1) for col in column : - args['real'] = pd.get_dummies(df[col]).astype(np.float32).values + args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values # args['real'] = handler.Export(df[[col]]) args['column'] = col args['context'] = col From c1a500fe4c3d18fa0606b2e68ad980515cd30f52 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 20 Feb 2020 23:08:35 -0600 Subject: [PATCH 017/250] bug fix , generator --- data/gan.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/data/gan.py b/data/gan.py index 2083f69..4c05566 100644 --- a/data/gan.py +++ b/data/gan.py @@ -568,8 +568,9 @@ class Predict(GNet): # df = ( pd.DataFrame(np.round(f).astype(np.int32))) p = 0 not in df.sum(axis=1).values - - if p: + x = df.sum(axis=1).values + print ( [np.sum(x),x.size]) + if np.divide( np.sum(x), x.size) : found.append(df) if len(found) == NTH_VALID_CANDIDATE or i == CANDIDATE_COUNT: break From 553ee75a0681a80ea95fd0bdcf7920d383510c8d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Feb 2020 11:41:40 -0600 Subject: [PATCH 018/250] bug fix around shape of candidate data to generate --- data/gan.py | 49 +++++++++++++++++++++++++++++++----------- data/maker/__init__.py | 8 +++---- 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/data/gan.py b/data/gan.py index 4c05566..6e6454e 100644 --- a/data/gan.py +++ b/data/gan.py @@ -166,7 +166,15 @@ class GNet : return _object def mkdir (self,path): if not os.path.exists(path) : - os.mkdir(path) + if os.sep in path : + pass + root = [] + for loc in path.split(os.sep) : + root.append(loc) + os.mkdir(os.sep.join(root)) + + else: + os.mkdir(path) def normalize(self,**args): @@ -520,8 +528,10 @@ class Predict(GNet): """ def __init__(self,**args): GNet.__init__(self,**args) - self.generator = Generator(**args) - self.values = args['values'] + self.generator = Generator(**args) + self.values = args['values'] + self.ROW_COUNT = args['row_count'] + self.MISSING_VALUES = args['no_value'] def load_meta(self, column): super().load_meta(column) self.generator.load_meta(column) @@ -532,8 +542,8 @@ class Predict(GNet): model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] tf.compat.v1.reset_default_graph() - z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) - y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32) + z = tf.random.normal(shape=[self.ROW_COUNT, self.Z_DIM]) + y = tf.compat.v1.placeholder(shape=[self.ROW_COUNT, self.NUM_LABELS], dtype=tf.int32) if self._LABEL is not None : ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) @@ -556,7 +566,7 @@ class Predict(GNet): labels = None found = [] - + ratio = [] for i in np.arange(CANDIDATE_COUNT) : if labels : f = 
sess.run(fake,feed_dict={y:labels}) @@ -569,10 +579,11 @@ class Predict(GNet): df = ( pd.DataFrame(np.round(f).astype(np.int32))) p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values - print ( [np.sum(x),x.size]) - if np.divide( np.sum(x), x.size) : + + if np.divide( np.sum(x), x.size) > .9 or p: + ratio.append(np.divide( np.sum(x), x.size)) found.append(df) - if len(found) == NTH_VALID_CANDIDATE or i == CANDIDATE_COUNT: + if i == CANDIDATE_COUNT: break else: continue @@ -582,8 +593,9 @@ class Predict(GNet): # # In case we are dealing with actual values like diagnosis codes we can perform # - - df = found[np.random.choice(np.arange(len(found)),1)[0]] + INDEX = np.random.choice(np.arange(len(found)),1)[0] + INDEX = ratio.index(np.max(ratio)) + df = found[INDEX] columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] # r = np.zeros((self.ROW_COUNT,len(columns))) @@ -592,9 +604,20 @@ class Predict(GNet): if len(found): print (len(found),NTH_VALID_CANDIDATE) # x = df * self.values - - df = pd.DataFrame( df.apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) + # + # let's get the missing rows (if any) ... + # + ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1) + if ii : + # + #@TODO Have this be a configurable variable + missing = np.repeat(0, np.where(ii==1)[0].size) + else: + missing = [] + i = np.where(ii == 0)[0] + df = pd.DataFrame( df.iloc.apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) df.columns = columns + df = df[columns[0]].append(pd.Series(missing)) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 3c04b57..6205b78 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -77,25 +77,23 @@ def generate(**args): df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) column = args['column'] if (isinstance(args['column'],list)) else [args['column']] - column_id = args['id'] + # column_id = args['id'] # #@TODO: # If the identifier is not present, we should fine a way to determine or make one # - # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values - bwrangler = Binary() - # args['label'] = bwrangler.Export(df[[column_id]]) _df = df.copy() for col in column : args['context'] = col args['column'] = col values = df[col].unique().tolist() - # values.sort() args['values'] = values + args['row_count'] = df.shape[0] # # we can determine the cardinalities here so we know what to allow or disallow handler = gan.Predict (**args) handler.load_meta(col) + # handler.ROW_COUNT = df[col].shape[0] r = handler.apply() # print (r) _df[col] = r[col] From 1db182a528ce89d9bf6215f285446499124e122f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Feb 2020 11:44:30 -0600 Subject: [PATCH 019/250] update version # --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a7e0642..aefd6d0 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.1.2","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.1.3","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} 
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 994e71160e54b039f87bcf456adfe6664452eb0e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Feb 2020 11:54:27 -0600 Subject: [PATCH 020/250] bug fix with directory --- data/gan.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/data/gan.py b/data/gan.py index 6e6454e..0d449d2 100644 --- a/data/gan.py +++ b/data/gan.py @@ -171,9 +171,10 @@ class GNet : root = [] for loc in path.split(os.sep) : root.append(loc) - os.mkdir(os.sep.join(root)) + if not os.path.exists(os.sep.join(root)) : + os.mkdir(os.sep.join(root)) - else: + elif not os.path.exists(path): os.mkdir(path) From 334915894a68e80981f45491ab0357cdcc26aa9e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Feb 2020 12:08:09 -0600 Subject: [PATCH 021/250] bug fix ... --- data/params.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/params.py b/data/params.py index 999b919..55b3109 100644 --- a/data/params.py +++ b/data/params.py @@ -1,6 +1,6 @@ import sys -SYS_ARGS = {'context':''} +# SYS_ARGS = {'context':''} if len(sys.argv) > 1: N = len(sys.argv) From b1a9a9fcb977867bf7c880be572f3b5ad6b27faa Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Feb 2020 12:09:04 -0600 Subject: [PATCH 022/250] - --- data/params.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/params.py b/data/params.py index 55b3109..c667063 100644 --- a/data/params.py +++ b/data/params.py @@ -1,6 +1,6 @@ import sys -# SYS_ARGS = {'context':''} +SYS_ARGS = {} if len(sys.argv) > 1: N = len(sys.argv) From 93176a2d09e3234b9937865bc3033214a40d0a23 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Feb 2020 12:23:40 -0600 Subject: [PATCH 023/250] bug fix: ambiguous thruth value of series --- data/gan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index 0d449d2..621cea9 100644 --- a/data/gan.py +++ b/data/gan.py @@ -609,7 +609,7 @@ class Predict(GNet): # let's get the missing rows (if any) ... 
# ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1) - if ii : + if ii.shape[0] == 0 : # #@TODO Have this be a configurable variable missing = np.repeat(0, np.where(ii==1)[0].size) From 94780281d076eb95a225353326d12515c036caee Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Feb 2020 12:25:03 -0600 Subject: [PATCH 024/250] bug fix: iloc index missing on generate samples --- data/gan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index 621cea9..382dc41 100644 --- a/data/gan.py +++ b/data/gan.py @@ -616,7 +616,7 @@ class Predict(GNet): else: missing = [] i = np.where(ii == 0)[0] - df = pd.DataFrame( df.iloc.apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) + df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) df.columns = columns df = df[columns[0]].append(pd.Series(missing)) From 0656474ca890c62f30cef940a0e513afa8006db3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Feb 2020 12:27:45 -0600 Subject: [PATCH 025/250] bug fix: data-frame should be returned (not series) --- data/gan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index 382dc41..f2f3cdb 100644 --- a/data/gan.py +++ b/data/gan.py @@ -624,7 +624,7 @@ class Predict(GNet): tf.compat.v1.reset_default_graph() - + df = pd.DataFrame(df) return df.to_dict(orient='list') # return df.to_dict(orient='list') # count = str(len(os.listdir(self.out_dir))) From 1cfd2059a472b59d6780e71d680c4ce7c4dbf0db Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Feb 2020 12:33:44 -0600 Subject: [PATCH 026/250] bug fix: generated sample structure --- data/gan.py | 1 + 1 file changed, 1 insertion(+) diff --git a/data/gan.py b/data/gan.py index f2f3cdb..d1b3123 100644 --- a/data/gan.py +++ b/data/gan.py @@ -625,6 +625,7 @@ class Predict(GNet): tf.compat.v1.reset_default_graph() df = pd.DataFrame(df) + df.columns = columns return df.to_dict(orient='list') # return df.to_dict(orient='list') # count = str(len(os.listdir(self.out_dir))) From a51be50a862ef3c93436f667dd13d133644671ac Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 26 Feb 2020 09:25:13 -0600 Subject: [PATCH 027/250] bug fix: missing values when generated --- data/gan.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index d1b3123..204f8af 100644 --- a/data/gan.py +++ b/data/gan.py @@ -594,6 +594,7 @@ class Predict(GNet): # # In case we are dealing with actual values like diagnosis codes we can perform # + INDEX = np.random.choice(np.arange(len(found)),1)[0] INDEX = ratio.index(np.max(ratio)) df = found[INDEX] @@ -609,7 +610,9 @@ class Predict(GNet): # let's get the missing rows (if any) ... 
# ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1) - if ii.shape[0] == 0 : + # print ([' **** ',ii.sum()]) + + if ii.shape[0] > 0 : # #@TODO Have this be a configurable variable missing = np.repeat(0, np.where(ii==1)[0].size) From 98a1062a3044b0fd662240540996808f80e73411 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 26 Feb 2020 09:32:29 -0600 Subject: [PATCH 028/250] bug fixes with missing values --- data/maker/__init__.py | 4 +- data/maker/__main__.py | 5 +- gan.py | 705 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 712 insertions(+), 2 deletions(-) create mode 100644 gan.py diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 6205b78..d5a4308 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -95,7 +95,9 @@ def generate(**args): handler.load_meta(col) # handler.ROW_COUNT = df[col].shape[0] r = handler.apply() - # print (r) + # print (r) + # + print ([_df.shape,len(r[col])]) _df[col] = r[col] # break return _df \ No newline at end of file diff --git a/data/maker/__main__.py b/data/maker/__main__.py index 63b464b..583be60 100644 --- a/data/maker/__main__.py +++ b/data/maker/__main__.py @@ -12,11 +12,14 @@ if 'config' in SYS_ARGS : else: # # + ARGS['no_value'] = '' _df = data.maker.generate(**ARGS) odf = pd.read_csv (ARGS['data']) odf.columns = [name.lower() for name in odf.columns] column = ARGS['column'] if isinstance(ARGS['column'],list) else [ARGS['column']] - print(pd.merge(odf,_df, on='id')) + print (odf.head()) + print (_df.head()) + # print(pd.merge(odf,_df,rsuffix='_io')) # print (_df[column].risk.evaluate(flag='synth')) # print (odf[column].risk.evaluate(flag='original')) # _x = pd.get_dummies(_df[column]).values diff --git a/gan.py b/gan.py new file mode 100644 index 0000000..2e4d503 --- /dev/null +++ b/gan.py @@ -0,0 +1,705 @@ +""" +This code was originally writen by Ziqi Zhang in order to generate synthetic data. +The code is an implementation of a Generative Adversarial Network that uses the Wasserstein Distance (WGAN). +It is intended to be used in 2 modes (embedded in code or using CLI) + +USAGE : + +The following parameters should be provided in a configuration file (JSON format) +python data/maker --config + +CONFIGURATION FILE STRUCTURE : + + context what it is you are loading (stroke, hypertension, ...) + data path of the file to be loaded + logs folder to store training model and meta data about learning + max_epochs number of iterations in learning + num_gpu number of gpus to be used (will still run if the GPUs are not available) + +EMBEDDED IN CODE : + +""" +import tensorflow as tf +from tensorflow.contrib.layers import l2_regularizer +import numpy as np +import pandas as pd +import time +import os +import sys +from data.params import SYS_ARGS +from data.bridge import Binary +import json +import pickle + +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ['CUDA_VISIBLE_DEVICES'] = "0" +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' + +# STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256 +# NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu']) +# BATCHSIZE_PER_GPU = 2000 +# TOTAL_BATCHSIZE = BATCHSIZE_PER_GPU * NUM_GPUS + +class void : + pass +class GNet : + def log(self,**args): + self.logs = dict(args,**self.logs) + + + """ + This is the base class of a generative network functions, the details will be implemented in the subclasses. 
+ An instance of this class is accessed as follows + object.layers.normalize applies batch normalization or otherwise + obect.get.variables instanciate variables on cpu and return a reference (tensor) + """ + def __init__(self,**args): + self.layers = void() + self.layers.normalize = self.normalize + self.logs = {} + + self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] + # if self.NUM_GPUS > 1 : + # os.environ['CUDA_VISIBLE_DEVICES'] = "4" + + self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854 + self.G_STRUCTURE = [128,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE] + self.D_STRUCTURE = [self.X_SPACE_SIZE,256,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE*2, self.X_SPACE_SIZE] #-- change 854 to number of diagnosis + # self.NUM_LABELS = 8 if 'label' not in args elif len(args['label'].shape) args['label'].shape[1] + + if 'label' in args and len(args['label'].shape) == 2 : + self.NUM_LABELS = args['label'].shape[1] + elif 'label' in args and len(args['label']) == 1 : + self.NUM_LABELS = args['label'].shape[0] + else: + self.NUM_LABELS = None + # self.Z_DIM = 128 #self.X_SPACE_SIZE + self.Z_DIM = 128 #-- used as rows down stream + self.G_STRUCTURE = [self.Z_DIM,self.Z_DIM] + PROPOSED_BATCH_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size']) + self.BATCHSIZE_PER_GPU = PROPOSED_BATCH_PER_GPU + if 'real' in args : + self.D_STRUCTURE = [args['real'].shape[1],256,self.Z_DIM] + + if args['real'].shape[0] < PROPOSED_BATCH_PER_GPU : + self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) + # self.BATCHSIZE_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size']) + self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS + self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) + self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) + self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100 + self.CONTEXT = args['context'] + self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} + self._REAL = args['real'] if 'real' in args else None + self._LABEL = args['label'] if 'label' in args else None + + self.get = void() + self.get.variables = self._variable_on_cpu + self.get.suffix = lambda : "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + self.logger = args['logger'] if 'logger' in args and args['logger'] else None + self.init_logs(**args) + + def init_logs(self,**args): + self.log_dir = args['logs'] if 'logs' in args else 'logs' + self.mkdir(self.log_dir) + # + # + for key in ['train','output'] : + self.mkdir(os.sep.join([self.log_dir,key])) + self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT])) + + self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) + self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + if self.logger : + # + # We will clear the logs from the data-store + # + column = self.ATTRIBUTES['synthetic'] + db = self.logger.db + if db[column].count() > 0 : + db.backup.insert({'name':column,'logs':list(db[column].find()) }) + db[column].drop() + + def load_meta(self,column): + """ + This function is designed to accomodate the uses of the sub-classes outside of a strict dependency model. 
+ Because prediction and training can happen independently + """ + # suffix = "-".join(column) if isinstance(column,list)else column + suffix = self.get.suffix() + _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json']) + if os.path.exists(_name) : + attr = json.loads((open(_name)).read()) + for key in attr : + value = attr[key] + setattr(self,key,value) + self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) + self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + + + def log_meta(self,**args) : + + _object = { + # '_id':'meta', + 'CONTEXT':self.CONTEXT, + 'ATTRIBUTES':self.ATTRIBUTES, + 'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU, + 'Z_DIM':self.Z_DIM, + "X_SPACE_SIZE":self.X_SPACE_SIZE, + "D_STRUCTURE":self.D_STRUCTURE, + "G_STRUCTURE":self.G_STRUCTURE, + "NUM_GPUS":self.NUM_GPUS, + "NUM_LABELS":self.NUM_LABELS, + "MAX_EPOCHS":self.MAX_EPOCHS, + "ROW_COUNT":self.ROW_COUNT + } + if args and 'key' in args and 'value' in args : + key = args['key'] + value= args['value'] + object[key] = value + # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column + suffix = self.get.suffix() + _name = os.sep.join([self.out_dir,'meta-'+suffix]) + + f = open(_name+'.json','w') + f.write(json.dumps(_object)) + return _object + def mkdir (self,path): + if not os.path.exists(path) : + os.mkdir(path) + + + def normalize(self,**args): + """ + This function will perform a batch normalization on an network layer + inputs input layer of the neural network + name name of the scope the + labels labels (attributes not synthesized) by default None + n_labels number of labels default None + """ + inputs = args['inputs'] + name = args['name'] + labels = None if 'labels' not in args else args['labels'] + n_labels= None if 'n_labels' not in args else args['n_labels'] + shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing + mean, var = tf.nn.moments(inputs, shift, keep_dims=True) + shape = inputs.shape[1].value + if labels is not None: + offset_m = self.get.variables(shape=[1,shape], name='offset'+name, + initializer=tf.zeros_initializer) + scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, + initializer=tf.ones_initializer) + offset = tf.nn.embedding_lookup(offset_m, labels) + scale = tf.nn.embedding_lookup(scale_m, labels) + + else: + offset = None + scale = None + + result = tf.nn.batch_normalization(inputs, mean, var,offset,scale, 1e-8) + return result + + def _variable_on_cpu(self,**args): + """ + This function makes sure variables/tensors are not created on the GPU but rather on the CPU + """ + + name = args['name'] + shape = args['shape'] + initializer=None if 'initializer' not in args else args['initializer'] + with tf.device('/cpu:0') : + cpu_var = tf.compat.v1.get_variable(name,shape,initializer= initializer) + return cpu_var + def average_gradients(self,tower_grads): + average_grads = [] + for grad_and_vars in zip(*tower_grads): + grads = [] + for g, _ in grad_and_vars: + expanded_g = tf.expand_dims(g, 0) + grads.append(expanded_g) + + grad = tf.concat(axis=0, values=grads) + grad = tf.reduce_mean(grad, 0) + + v = grad_and_vars[0][1] + grad_and_var = (grad, v) + average_grads.append(grad_and_var) + return average_grads + + +class Generator (GNet): + """ + This class is designed to handle generation of candidate datasets for this it will aggregate a discriminator, this allows the generator not to be random + + """ + def __init__(self,**args): + GNet.__init__(self,**args) + self.discriminator 
= Discriminator(**args) + def loss(self,**args): + fake = args['fake'] + label = args['label'] + y_hat_fake = self.discriminator.network(inputs=fake, label=label) + #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) + loss = -tf.reduce_mean(y_hat_fake) + sum(all_regs) + #tf.add_to_collection('glosses', loss) + tf.compat.v1.add_to_collection('glosses', loss) + return loss, loss + def load_meta(self, column): + super().load_meta(column) + self.discriminator.load_meta(column) + def network(self,**args) : + """ + This function will build the network that will generate the synthetic candidates + :inputs matrix of data that we need + :dim dimensions of ... + """ + x = args['inputs'] + tmp_dim = self.Z_DIM if 'dim' not in args else args['dim'] + label = args['label'] + + with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): + for i, dim in enumerate(self.G_STRUCTURE[:-1]): + kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim]) + h1 = self.normalize(inputs=tf.matmul(x, kernel),shift=0, name='cbn' + str(i), labels=label, n_labels=self.NUM_LABELS) + h2 = tf.nn.relu(h1) + x = x + h2 + tmp_dim = dim + i = len(self.G_STRUCTURE) - 1 + # + # This seems to be an extra hidden layer: + # It's goal is to map continuous values to discrete values (pre-trained to do this) + kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, self.G_STRUCTURE[-1]]) + h1 = self.normalize(inputs=tf.matmul(x, kernel), name='cbn' + str(i), + labels=label, n_labels=self.NUM_LABELS) + h2 = tf.nn.tanh(h1) + x = x + h2 + # This seems to be the output layer + # + kernel = self.get.variables(name='W_' + str(i+1), shape=[self.Z_DIM, self.X_SPACE_SIZE]) + bias = self.get.variables(name='b_' + str(i+1), shape=[self.X_SPACE_SIZE]) + x = tf.nn.sigmoid(tf.add(tf.matmul(x, kernel), bias)) + return x + +class Discriminator(GNet): + def __init__(self,**args): + GNet.__init__(self,**args) + def network(self,**args): + """ + This function will apply a computational graph on a dataset passed in with the associated labels and the last layer must have a single output (neuron) + :inputs + :label + """ + x = args['inputs'] + label = args['label'] + with tf.compat.v1.variable_scope('D', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): + for i, dim in enumerate(self.D_STRUCTURE[1:]): + kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[i], dim]) + bias = self.get.variables(name='b_' + str(i), shape=[dim]) + # print (["\t",bias,kernel]) + x = tf.nn.relu(tf.add(tf.matmul(x, kernel), bias)) + x = self.normalize(inputs=x, name='cln' + str(i), shift=1,labels=label, n_labels=self.NUM_LABELS) + i = len(self.D_STRUCTURE) + kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[-1], 1]) + bias = self.get.variables(name='b_' + str(i), shape=[1]) + y = tf.add(tf.matmul(x, kernel), bias) + return y + + def loss(self,**args) : + """ + This function compute the loss of + :real + :fake + :label + """ + real = args['real'] + fake = args['fake'] + label = args['label'] + epsilon = tf.random.uniform(shape=[self.BATCHSIZE_PER_GPU,1],minval=0,maxval=1) + + x_hat = real + epsilon * (fake - real) + y_hat_fake = self.network(inputs=fake, label=label) + + y_hat_real = self.network(inputs=real, label=label) + y_hat = self.network(inputs=x_hat, label=label) + + grad = tf.gradients(y_hat, [x_hat])[0] + slopes = 
tf.sqrt(tf.reduce_sum(tf.square(grad), 1)) + gradient_penalty = tf.reduce_mean((slopes - 1.) ** 2) + #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) + w_distance = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake) + loss = w_distance + 10 * gradient_penalty + sum(all_regs) + #tf.add_to_collection('dlosses', loss) + tf.compat.v1.add_to_collection('dlosses', loss) + + return w_distance, loss +class Train (GNet): + def __init__(self,**args): + GNet.__init__(self,**args) + self.generator = Generator(**args) + self.discriminator = Discriminator(**args) + self._REAL = args['real'] + self._LABEL= args['label'] if 'label' in args else None + self.column = args['column'] + # print ([" *** ",self.BATCHSIZE_PER_GPU]) + + self.meta = self.log_meta() + if(self.logger): + + self.logger.write( self.meta ) + + # self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta) + def load_meta(self, column): + """ + This function will delegate the calls to load meta data to it's dependents + column name + """ + super().load_meta(column) + self.generator.load_meta(column) + self.discriminator.load_meta(column) + def loss(self,**args): + """ + This function will compute a "tower" loss of the generated candidate against real data + Training will consist in having both generator and discriminators + :scope + :stage + :real + :label + """ + + scope = args['scope'] + stage = args['stage'] + real = args['real'] + label = args['label'] + + + if label is not None : + label = tf.cast(label, tf.int32) + # + # @TODO: Ziqi needs to explain what's going on here + m = [[i] for i in np.arange(self._LABEL.shape[1]-2)] + label = label[:, 1] * len(m) + tf.squeeze( + tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32)) + ) + # label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] ) + z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) + + fake = self.generator.network(inputs=z, label=label) + if stage == 'D': + w, loss = self.discriminator.loss(real=real, fake=fake, label=label) + #losses = tf.get_collection('dlosses', scope) + flag = 'dlosses' + losses = tf.compat.v1.get_collection('dlosses', scope) + else: + w, loss = self.generator.loss(fake=fake, label=label) + #losses = tf.get_collection('glosses', scope) + flag = 'glosses' + losses = tf.compat.v1.get_collection('glosses', scope) + # losses = tf.compat.v1.get_collection(flag, scope) + + total_loss = tf.add_n(losses, name='total_loss') + + return total_loss, w + def input_fn(self): + """ + This function seems to produce + """ + features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32) + LABEL_SHAPE = [None,None] if self._LABEL is None else self._LABEL.shape + labels_placeholder = tf.compat.v1.placeholder(shape=LABEL_SHAPE, dtype=tf.float32) + if self._LABEL is not None : + dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) + else : + dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) + # labels_placeholder = None + dataset = dataset.repeat(10000) + dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) + dataset = dataset.prefetch(1) + # iterator = dataset.make_initializable_iterator() + iterator = tf.compat.v1.data.make_initializable_iterator(dataset) + return iterator, features_placeholder, labels_placeholder + + def network(self,**args): + stage = args['stage'] + opt = args['opt'] + tower_grads = [] + per_gpu_w = 
[] + iterator, features_placeholder, labels_placeholder = self.input_fn() + with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()): + for i in range(self.NUM_GPUS): + with tf.device('/gpu:%d' % i): + with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: + if self._LABEL is not None : + (real, label) = iterator.get_next() + else: + real = iterator.get_next() + label= None + loss, w = self.loss(scope=scope, stage=stage, real=real, label=label) + #tf.get_variable_scope().reuse_variables() + tf.compat.v1.get_variable_scope().reuse_variables() + #vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage) + vars_ = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=stage) + grads = opt.compute_gradients(loss, vars_) + tower_grads.append(grads) + per_gpu_w.append(w) + + grads = self.average_gradients(tower_grads) + apply_gradient_op = opt.apply_gradients(grads) + + mean_w = tf.reduce_mean(per_gpu_w) + train_op = apply_gradient_op + return train_op, mean_w, iterator, features_placeholder, labels_placeholder + def apply(self,**args): + # max_epochs = args['max_epochs'] if 'max_epochs' in args else 10 + REAL = self._REAL + LABEL= self._LABEL + if (self.logger): + pass + + with tf.device('/cpu:0'): + opt_d = tf.compat.v1.train.AdamOptimizer(1e-4) + opt_g = tf.compat.v1.train.AdamOptimizer(1e-4) + + train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d) + train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g) + # saver = tf.train.Saver() + saver = tf.compat.v1.train.Saver() + # init = tf.global_variables_initializer() + init = tf.compat.v1.global_variables_initializer() + logs = [] + #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: + with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: + + sess.run(init) + + sess.run(iterator_d.initializer, + feed_dict={features_placeholder_d: REAL}) + sess.run(iterator_g.initializer, + feed_dict={features_placeholder_g: REAL}) + + for epoch in range(1, self.MAX_EPOCHS + 1): + start_time = time.time() + w_sum = 0 + for i in range(self.STEPS_PER_EPOCH): + for _ in range(2): + _, w = sess.run([train_d, w_distance]) + w_sum += w + sess.run(train_g) + duration = time.time() - start_time + + assert not np.isnan(w_sum), 'Model diverged with loss = NaN' + + format_str = 'epoch: %d, w_distance = %f (%.1f)' + print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) + # print (dir (w_distance)) + + logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) + + if epoch % self.MAX_EPOCHS == 0: + # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + suffix = self.get.suffix() + _name = os.sep.join([self.train_dir,suffix]) + # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) + saver.save(sess, _name, write_meta_graph=False, global_step=epoch) + # + # + if self.logger : + row = {"logs":logs} #,"model":pickle.dump(sess)} + self.logger.write(row) + # + # @TODO: + # We should upload the files in the checkpoint + # This would allow the learnt model to be portable to another system + # + tf.compat.v1.reset_default_graph() + +class Predict(GNet): + """ + This class uses synthetic data given a learned model + """ + def __init__(self,**args): + GNet.__init__(self,**args) + 
self.generator = Generator(**args) + self.values = args['values'] + def load_meta(self, column): + super().load_meta(column) + self.generator.load_meta(column) + def apply(self,**args): + # print (self.train_dir) + # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + suffix = self.get.suffix() + model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) + demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] + tf.compat.v1.reset_default_graph() + #z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) + z = tf.random.normal(shape=[self._REAL.shape[0], self.Z_DIM]) + y = tf.compat.v1.placeholder(shape=[self._REAL.shape[0], self.NUM_LABELS], dtype=tf.int32) + #y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32) + if self._LABEL is not None : + ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] + label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) + else: + label = None + fake = self.generator.network(inputs=z, label=label) + init = tf.compat.v1.global_variables_initializer() + saver = tf.compat.v1.train.Saver() + df = pd.DataFrame() + CANDIDATE_COUNT = 10000 + NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0] + with tf.compat.v1.Session() as sess: + + # sess.run(init) + saver.restore(sess, model_dir) + if self._LABEL is not None : + labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) + labels= demo + else: + labels = None + + found = [] + + for i in np.arange(CANDIDATE_COUNT) : + if labels : + f = sess.run(fake,feed_dict={y:labels}) + else: + f = sess.run(fake) + # + # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes + # The code below will insure we have some acceptable cardinal relationships between id and synthetic values + # + df = ( pd.DataFrame(np.round(f).astype(np.int32))) + p = 0 not in df.sum(axis=1).values + x = df.sum(axis=1).values + if np.divide( np.sum(x), x.size) > .9: + found.append(df) + if len(found) == NTH_VALID_CANDIDATE or i == CANDIDATE_COUNT: + break + else: + continue + + # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms + # df = (i * df).sum(axis=1) + # + # In case we are dealing with actual values like diagnosis codes we can perform + # + INDEX =np.random.choice(np.arange(len(found)),1)[0] + #df = found[np.random.choice(np.arange(len(found)),1)[0]] + df = found[INDEX] + columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] + + # r = np.zeros((self.ROW_COUNT,len(columns))) + r = np.zeros(self.ROW_COUNT) + df.columns = self.values + if len(found): + print (len(found),NTH_VALID_CANDIDATE) + # x = df * self.values + # + # let's get the rows with no values synthesized (for whatever reason) + # + ii = df.apply(lambda row: np.sum(row) == 0,axis=1) + if np.sum(ii) > 0 : + missing = np.repeat(np.nan, np.where(ii==1)[0].size) + else: + missing = [] + print (len (missing), df.shape) + i = np.where(ii == 0)[0] + df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row == 1)[0],1)[0]] ,axis=1)) + df.columns = columns + df = df[columns[0]].append(pd.Series(missing)) + + + + + + tf.compat.v1.reset_default_graph() + df = pd.DataFrame(df) + df.columns = columns + print (df.head()) + print (df.shape) + return df.to_dict(orient='list') + # return 
df.to_dict(orient='list') + # count = str(len(os.listdir(self.out_dir))) + # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv']) + # df.to_csv(_name,index=False) + + + # output.extend(np.round(f)) + + # for m in range(2): + # for n in range(2, self.NUM_LABELS): + # idx1 = (demo[:, m] == 1) + # idx2 = (demo[:, n] == 1) + # idx = [idx1[j] and idx2[j] for j in range(len(idx1))] + # num = np.sum(idx) + # print ("___________________list__") + # print (idx1) + # print (idx2) + # print (idx) + # print (num) + # print ("_____________________") + # nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU)) + # label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS)) + # label_input[:, n] = 1 + # label_input[:, m] = 1 + # output = [] + # for i in range(nbatch): + # f = sess.run(fake,feed_dict={y: label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]}) + # output.extend(np.round(f)) + # output = np.array(output)[:num] + # print ([m,n,output]) + + # np.save(self.out_dir + str(m) + str(n), output) + + +if __name__ == '__main__' : + # + # Now we get things done ... + column = SYS_ARGS['column'] + column_id = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id' + column_id = column_id.split(',') if ',' in column_id else column_id + df = pd.read_csv(SYS_ARGS['raw-data']) + LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values + + context = SYS_ARGS['raw-data'].split(os.sep)[-1:][0][:-4] + if set(['train','learn']) & set(SYS_ARGS.keys()): + + df = pd.read_csv(SYS_ARGS['raw-data']) + + # cols = SYS_ARGS['column'] + # _map,_df = (Binary()).Export(df) + # i = np.arange(_map[column]['start'],_map[column]['end']) + max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10 + # REAL = _df[:,i] + REAL = pd.get_dummies(df[column]).astype(np.float32).values + LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values + trainer = Train(context=context,max_epochs=max_epochs,real=REAL,label=LABEL,column=column,column_id=column_id) + trainer.apply() + + + + + # + # We should train upon this data + # + # -- we need to convert the data-frame to binary matrix, given a column + # + pass + elif 'generate' in SYS_ARGS: + values = df[column].unique().tolist() + values.sort() + + p = Predict(context=context,label=LABEL,values=values,column=column) + p.load_meta(column) + r = p.apply() + print (df) + print () + df[column] = r[column] + print (df) + + + else: + print (SYS_ARGS.keys()) + print (__doc__) + pass + From d5a343da8401f9a6873c2dbcd0ef0428f7bcc1b3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 26 Feb 2020 09:33:35 -0600 Subject: [PATCH 029/250] house keeping work --- gan.py | 705 --------------------------------------------------------- 1 file changed, 705 deletions(-) delete mode 100644 gan.py diff --git a/gan.py b/gan.py deleted file mode 100644 index 2e4d503..0000000 --- a/gan.py +++ /dev/null @@ -1,705 +0,0 @@ -""" -This code was originally writen by Ziqi Zhang in order to generate synthetic data. -The code is an implementation of a Generative Adversarial Network that uses the Wasserstein Distance (WGAN). -It is intended to be used in 2 modes (embedded in code or using CLI) - -USAGE : - -The following parameters should be provided in a configuration file (JSON format) -python data/maker --config - -CONFIGURATION FILE STRUCTURE : - - context what it is you are loading (stroke, hypertension, ...) 
- data path of the file to be loaded - logs folder to store training model and meta data about learning - max_epochs number of iterations in learning - num_gpu number of gpus to be used (will still run if the GPUs are not available) - -EMBEDDED IN CODE : - -""" -import tensorflow as tf -from tensorflow.contrib.layers import l2_regularizer -import numpy as np -import pandas as pd -import time -import os -import sys -from data.params import SYS_ARGS -from data.bridge import Binary -import json -import pickle - -os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" -os.environ['CUDA_VISIBLE_DEVICES'] = "0" -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' - -# STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256 -# NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu']) -# BATCHSIZE_PER_GPU = 2000 -# TOTAL_BATCHSIZE = BATCHSIZE_PER_GPU * NUM_GPUS - -class void : - pass -class GNet : - def log(self,**args): - self.logs = dict(args,**self.logs) - - - """ - This is the base class of a generative network functions, the details will be implemented in the subclasses. - An instance of this class is accessed as follows - object.layers.normalize applies batch normalization or otherwise - obect.get.variables instanciate variables on cpu and return a reference (tensor) - """ - def __init__(self,**args): - self.layers = void() - self.layers.normalize = self.normalize - self.logs = {} - - self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] - # if self.NUM_GPUS > 1 : - # os.environ['CUDA_VISIBLE_DEVICES'] = "4" - - self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854 - self.G_STRUCTURE = [128,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE] - self.D_STRUCTURE = [self.X_SPACE_SIZE,256,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE*2, self.X_SPACE_SIZE] #-- change 854 to number of diagnosis - # self.NUM_LABELS = 8 if 'label' not in args elif len(args['label'].shape) args['label'].shape[1] - - if 'label' in args and len(args['label'].shape) == 2 : - self.NUM_LABELS = args['label'].shape[1] - elif 'label' in args and len(args['label']) == 1 : - self.NUM_LABELS = args['label'].shape[0] - else: - self.NUM_LABELS = None - # self.Z_DIM = 128 #self.X_SPACE_SIZE - self.Z_DIM = 128 #-- used as rows down stream - self.G_STRUCTURE = [self.Z_DIM,self.Z_DIM] - PROPOSED_BATCH_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size']) - self.BATCHSIZE_PER_GPU = PROPOSED_BATCH_PER_GPU - if 'real' in args : - self.D_STRUCTURE = [args['real'].shape[1],256,self.Z_DIM] - - if args['real'].shape[0] < PROPOSED_BATCH_PER_GPU : - self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) - # self.BATCHSIZE_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size']) - self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS - self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) - self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) - self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100 - self.CONTEXT = args['context'] - self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} - self._REAL = args['real'] if 'real' in args else None - self._LABEL = args['label'] if 'label' in args else None - - self.get = void() - self.get.variables = self._variable_on_cpu - self.get.suffix = lambda : "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - self.logger = args['logger'] 
if 'logger' in args and args['logger'] else None - self.init_logs(**args) - - def init_logs(self,**args): - self.log_dir = args['logs'] if 'logs' in args else 'logs' - self.mkdir(self.log_dir) - # - # - for key in ['train','output'] : - self.mkdir(os.sep.join([self.log_dir,key])) - self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT])) - - self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) - self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) - if self.logger : - # - # We will clear the logs from the data-store - # - column = self.ATTRIBUTES['synthetic'] - db = self.logger.db - if db[column].count() > 0 : - db.backup.insert({'name':column,'logs':list(db[column].find()) }) - db[column].drop() - - def load_meta(self,column): - """ - This function is designed to accomodate the uses of the sub-classes outside of a strict dependency model. - Because prediction and training can happen independently - """ - # suffix = "-".join(column) if isinstance(column,list)else column - suffix = self.get.suffix() - _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json']) - if os.path.exists(_name) : - attr = json.loads((open(_name)).read()) - for key in attr : - value = attr[key] - setattr(self,key,value) - self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) - self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) - - - def log_meta(self,**args) : - - _object = { - # '_id':'meta', - 'CONTEXT':self.CONTEXT, - 'ATTRIBUTES':self.ATTRIBUTES, - 'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU, - 'Z_DIM':self.Z_DIM, - "X_SPACE_SIZE":self.X_SPACE_SIZE, - "D_STRUCTURE":self.D_STRUCTURE, - "G_STRUCTURE":self.G_STRUCTURE, - "NUM_GPUS":self.NUM_GPUS, - "NUM_LABELS":self.NUM_LABELS, - "MAX_EPOCHS":self.MAX_EPOCHS, - "ROW_COUNT":self.ROW_COUNT - } - if args and 'key' in args and 'value' in args : - key = args['key'] - value= args['value'] - object[key] = value - # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column - suffix = self.get.suffix() - _name = os.sep.join([self.out_dir,'meta-'+suffix]) - - f = open(_name+'.json','w') - f.write(json.dumps(_object)) - return _object - def mkdir (self,path): - if not os.path.exists(path) : - os.mkdir(path) - - - def normalize(self,**args): - """ - This function will perform a batch normalization on an network layer - inputs input layer of the neural network - name name of the scope the - labels labels (attributes not synthesized) by default None - n_labels number of labels default None - """ - inputs = args['inputs'] - name = args['name'] - labels = None if 'labels' not in args else args['labels'] - n_labels= None if 'n_labels' not in args else args['n_labels'] - shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing - mean, var = tf.nn.moments(inputs, shift, keep_dims=True) - shape = inputs.shape[1].value - if labels is not None: - offset_m = self.get.variables(shape=[1,shape], name='offset'+name, - initializer=tf.zeros_initializer) - scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, - initializer=tf.ones_initializer) - offset = tf.nn.embedding_lookup(offset_m, labels) - scale = tf.nn.embedding_lookup(scale_m, labels) - - else: - offset = None - scale = None - - result = tf.nn.batch_normalization(inputs, mean, var,offset,scale, 1e-8) - return result - - def _variable_on_cpu(self,**args): - """ - This function makes sure variables/tensors are not created on the GPU but rather on the CPU - """ - - name = args['name'] - shape = 
args['shape'] - initializer=None if 'initializer' not in args else args['initializer'] - with tf.device('/cpu:0') : - cpu_var = tf.compat.v1.get_variable(name,shape,initializer= initializer) - return cpu_var - def average_gradients(self,tower_grads): - average_grads = [] - for grad_and_vars in zip(*tower_grads): - grads = [] - for g, _ in grad_and_vars: - expanded_g = tf.expand_dims(g, 0) - grads.append(expanded_g) - - grad = tf.concat(axis=0, values=grads) - grad = tf.reduce_mean(grad, 0) - - v = grad_and_vars[0][1] - grad_and_var = (grad, v) - average_grads.append(grad_and_var) - return average_grads - - -class Generator (GNet): - """ - This class is designed to handle generation of candidate datasets for this it will aggregate a discriminator, this allows the generator not to be random - - """ - def __init__(self,**args): - GNet.__init__(self,**args) - self.discriminator = Discriminator(**args) - def loss(self,**args): - fake = args['fake'] - label = args['label'] - y_hat_fake = self.discriminator.network(inputs=fake, label=label) - #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - loss = -tf.reduce_mean(y_hat_fake) + sum(all_regs) - #tf.add_to_collection('glosses', loss) - tf.compat.v1.add_to_collection('glosses', loss) - return loss, loss - def load_meta(self, column): - super().load_meta(column) - self.discriminator.load_meta(column) - def network(self,**args) : - """ - This function will build the network that will generate the synthetic candidates - :inputs matrix of data that we need - :dim dimensions of ... - """ - x = args['inputs'] - tmp_dim = self.Z_DIM if 'dim' not in args else args['dim'] - label = args['label'] - - with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): - for i, dim in enumerate(self.G_STRUCTURE[:-1]): - kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim]) - h1 = self.normalize(inputs=tf.matmul(x, kernel),shift=0, name='cbn' + str(i), labels=label, n_labels=self.NUM_LABELS) - h2 = tf.nn.relu(h1) - x = x + h2 - tmp_dim = dim - i = len(self.G_STRUCTURE) - 1 - # - # This seems to be an extra hidden layer: - # It's goal is to map continuous values to discrete values (pre-trained to do this) - kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, self.G_STRUCTURE[-1]]) - h1 = self.normalize(inputs=tf.matmul(x, kernel), name='cbn' + str(i), - labels=label, n_labels=self.NUM_LABELS) - h2 = tf.nn.tanh(h1) - x = x + h2 - # This seems to be the output layer - # - kernel = self.get.variables(name='W_' + str(i+1), shape=[self.Z_DIM, self.X_SPACE_SIZE]) - bias = self.get.variables(name='b_' + str(i+1), shape=[self.X_SPACE_SIZE]) - x = tf.nn.sigmoid(tf.add(tf.matmul(x, kernel), bias)) - return x - -class Discriminator(GNet): - def __init__(self,**args): - GNet.__init__(self,**args) - def network(self,**args): - """ - This function will apply a computational graph on a dataset passed in with the associated labels and the last layer must have a single output (neuron) - :inputs - :label - """ - x = args['inputs'] - label = args['label'] - with tf.compat.v1.variable_scope('D', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): - for i, dim in enumerate(self.D_STRUCTURE[1:]): - kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[i], dim]) - bias = self.get.variables(name='b_' + str(i), shape=[dim]) - # print (["\t",bias,kernel]) - x = 
tf.nn.relu(tf.add(tf.matmul(x, kernel), bias)) - x = self.normalize(inputs=x, name='cln' + str(i), shift=1,labels=label, n_labels=self.NUM_LABELS) - i = len(self.D_STRUCTURE) - kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[-1], 1]) - bias = self.get.variables(name='b_' + str(i), shape=[1]) - y = tf.add(tf.matmul(x, kernel), bias) - return y - - def loss(self,**args) : - """ - This function compute the loss of - :real - :fake - :label - """ - real = args['real'] - fake = args['fake'] - label = args['label'] - epsilon = tf.random.uniform(shape=[self.BATCHSIZE_PER_GPU,1],minval=0,maxval=1) - - x_hat = real + epsilon * (fake - real) - y_hat_fake = self.network(inputs=fake, label=label) - - y_hat_real = self.network(inputs=real, label=label) - y_hat = self.network(inputs=x_hat, label=label) - - grad = tf.gradients(y_hat, [x_hat])[0] - slopes = tf.sqrt(tf.reduce_sum(tf.square(grad), 1)) - gradient_penalty = tf.reduce_mean((slopes - 1.) ** 2) - #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - w_distance = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake) - loss = w_distance + 10 * gradient_penalty + sum(all_regs) - #tf.add_to_collection('dlosses', loss) - tf.compat.v1.add_to_collection('dlosses', loss) - - return w_distance, loss -class Train (GNet): - def __init__(self,**args): - GNet.__init__(self,**args) - self.generator = Generator(**args) - self.discriminator = Discriminator(**args) - self._REAL = args['real'] - self._LABEL= args['label'] if 'label' in args else None - self.column = args['column'] - # print ([" *** ",self.BATCHSIZE_PER_GPU]) - - self.meta = self.log_meta() - if(self.logger): - - self.logger.write( self.meta ) - - # self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta) - def load_meta(self, column): - """ - This function will delegate the calls to load meta data to it's dependents - column name - """ - super().load_meta(column) - self.generator.load_meta(column) - self.discriminator.load_meta(column) - def loss(self,**args): - """ - This function will compute a "tower" loss of the generated candidate against real data - Training will consist in having both generator and discriminators - :scope - :stage - :real - :label - """ - - scope = args['scope'] - stage = args['stage'] - real = args['real'] - label = args['label'] - - - if label is not None : - label = tf.cast(label, tf.int32) - # - # @TODO: Ziqi needs to explain what's going on here - m = [[i] for i in np.arange(self._LABEL.shape[1]-2)] - label = label[:, 1] * len(m) + tf.squeeze( - tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32)) - ) - # label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] ) - z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) - - fake = self.generator.network(inputs=z, label=label) - if stage == 'D': - w, loss = self.discriminator.loss(real=real, fake=fake, label=label) - #losses = tf.get_collection('dlosses', scope) - flag = 'dlosses' - losses = tf.compat.v1.get_collection('dlosses', scope) - else: - w, loss = self.generator.loss(fake=fake, label=label) - #losses = tf.get_collection('glosses', scope) - flag = 'glosses' - losses = tf.compat.v1.get_collection('glosses', scope) - # losses = tf.compat.v1.get_collection(flag, scope) - - total_loss = tf.add_n(losses, name='total_loss') - - return total_loss, w - def input_fn(self): - """ - This function seems to produce - """ - 
features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32) - LABEL_SHAPE = [None,None] if self._LABEL is None else self._LABEL.shape - labels_placeholder = tf.compat.v1.placeholder(shape=LABEL_SHAPE, dtype=tf.float32) - if self._LABEL is not None : - dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) - else : - dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) - # labels_placeholder = None - dataset = dataset.repeat(10000) - dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) - dataset = dataset.prefetch(1) - # iterator = dataset.make_initializable_iterator() - iterator = tf.compat.v1.data.make_initializable_iterator(dataset) - return iterator, features_placeholder, labels_placeholder - - def network(self,**args): - stage = args['stage'] - opt = args['opt'] - tower_grads = [] - per_gpu_w = [] - iterator, features_placeholder, labels_placeholder = self.input_fn() - with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()): - for i in range(self.NUM_GPUS): - with tf.device('/gpu:%d' % i): - with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: - if self._LABEL is not None : - (real, label) = iterator.get_next() - else: - real = iterator.get_next() - label= None - loss, w = self.loss(scope=scope, stage=stage, real=real, label=label) - #tf.get_variable_scope().reuse_variables() - tf.compat.v1.get_variable_scope().reuse_variables() - #vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage) - vars_ = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=stage) - grads = opt.compute_gradients(loss, vars_) - tower_grads.append(grads) - per_gpu_w.append(w) - - grads = self.average_gradients(tower_grads) - apply_gradient_op = opt.apply_gradients(grads) - - mean_w = tf.reduce_mean(per_gpu_w) - train_op = apply_gradient_op - return train_op, mean_w, iterator, features_placeholder, labels_placeholder - def apply(self,**args): - # max_epochs = args['max_epochs'] if 'max_epochs' in args else 10 - REAL = self._REAL - LABEL= self._LABEL - if (self.logger): - pass - - with tf.device('/cpu:0'): - opt_d = tf.compat.v1.train.AdamOptimizer(1e-4) - opt_g = tf.compat.v1.train.AdamOptimizer(1e-4) - - train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d) - train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g) - # saver = tf.train.Saver() - saver = tf.compat.v1.train.Saver() - # init = tf.global_variables_initializer() - init = tf.compat.v1.global_variables_initializer() - logs = [] - #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: - with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: - - sess.run(init) - - sess.run(iterator_d.initializer, - feed_dict={features_placeholder_d: REAL}) - sess.run(iterator_g.initializer, - feed_dict={features_placeholder_g: REAL}) - - for epoch in range(1, self.MAX_EPOCHS + 1): - start_time = time.time() - w_sum = 0 - for i in range(self.STEPS_PER_EPOCH): - for _ in range(2): - _, w = sess.run([train_d, w_distance]) - w_sum += w - sess.run(train_g) - duration = time.time() - start_time - - assert not np.isnan(w_sum), 'Model diverged with loss = NaN' - - format_str = 'epoch: %d, w_distance = %f (%.1f)' - print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) - # print (dir (w_distance)) - - 
logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) - - if epoch % self.MAX_EPOCHS == 0: - # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - suffix = self.get.suffix() - _name = os.sep.join([self.train_dir,suffix]) - # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) - saver.save(sess, _name, write_meta_graph=False, global_step=epoch) - # - # - if self.logger : - row = {"logs":logs} #,"model":pickle.dump(sess)} - self.logger.write(row) - # - # @TODO: - # We should upload the files in the checkpoint - # This would allow the learnt model to be portable to another system - # - tf.compat.v1.reset_default_graph() - -class Predict(GNet): - """ - This class uses synthetic data given a learned model - """ - def __init__(self,**args): - GNet.__init__(self,**args) - self.generator = Generator(**args) - self.values = args['values'] - def load_meta(self, column): - super().load_meta(column) - self.generator.load_meta(column) - def apply(self,**args): - # print (self.train_dir) - # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - suffix = self.get.suffix() - model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) - demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] - tf.compat.v1.reset_default_graph() - #z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) - z = tf.random.normal(shape=[self._REAL.shape[0], self.Z_DIM]) - y = tf.compat.v1.placeholder(shape=[self._REAL.shape[0], self.NUM_LABELS], dtype=tf.int32) - #y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32) - if self._LABEL is not None : - ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] - label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) - else: - label = None - fake = self.generator.network(inputs=z, label=label) - init = tf.compat.v1.global_variables_initializer() - saver = tf.compat.v1.train.Saver() - df = pd.DataFrame() - CANDIDATE_COUNT = 10000 - NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0] - with tf.compat.v1.Session() as sess: - - # sess.run(init) - saver.restore(sess, model_dir) - if self._LABEL is not None : - labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) - labels= demo - else: - labels = None - - found = [] - - for i in np.arange(CANDIDATE_COUNT) : - if labels : - f = sess.run(fake,feed_dict={y:labels}) - else: - f = sess.run(fake) - # - # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes - # The code below will insure we have some acceptable cardinal relationships between id and synthetic values - # - df = ( pd.DataFrame(np.round(f).astype(np.int32))) - p = 0 not in df.sum(axis=1).values - x = df.sum(axis=1).values - if np.divide( np.sum(x), x.size) > .9: - found.append(df) - if len(found) == NTH_VALID_CANDIDATE or i == CANDIDATE_COUNT: - break - else: - continue - - # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms - # df = (i * df).sum(axis=1) - # - # In case we are dealing with actual values like diagnosis codes we can perform - # - INDEX =np.random.choice(np.arange(len(found)),1)[0] - #df = found[np.random.choice(np.arange(len(found)),1)[0]] - df = found[INDEX] - columns = self.ATTRIBUTES['synthetic'] if 
isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] - - # r = np.zeros((self.ROW_COUNT,len(columns))) - r = np.zeros(self.ROW_COUNT) - df.columns = self.values - if len(found): - print (len(found),NTH_VALID_CANDIDATE) - # x = df * self.values - # - # let's get the rows with no values synthesized (for whatever reason) - # - ii = df.apply(lambda row: np.sum(row) == 0,axis=1) - if np.sum(ii) > 0 : - missing = np.repeat(np.nan, np.where(ii==1)[0].size) - else: - missing = [] - print (len (missing), df.shape) - i = np.where(ii == 0)[0] - df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row == 1)[0],1)[0]] ,axis=1)) - df.columns = columns - df = df[columns[0]].append(pd.Series(missing)) - - - - - - tf.compat.v1.reset_default_graph() - df = pd.DataFrame(df) - df.columns = columns - print (df.head()) - print (df.shape) - return df.to_dict(orient='list') - # return df.to_dict(orient='list') - # count = str(len(os.listdir(self.out_dir))) - # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv']) - # df.to_csv(_name,index=False) - - - # output.extend(np.round(f)) - - # for m in range(2): - # for n in range(2, self.NUM_LABELS): - # idx1 = (demo[:, m] == 1) - # idx2 = (demo[:, n] == 1) - # idx = [idx1[j] and idx2[j] for j in range(len(idx1))] - # num = np.sum(idx) - # print ("___________________list__") - # print (idx1) - # print (idx2) - # print (idx) - # print (num) - # print ("_____________________") - # nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU)) - # label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS)) - # label_input[:, n] = 1 - # label_input[:, m] = 1 - # output = [] - # for i in range(nbatch): - # f = sess.run(fake,feed_dict={y: label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]}) - # output.extend(np.round(f)) - # output = np.array(output)[:num] - # print ([m,n,output]) - - # np.save(self.out_dir + str(m) + str(n), output) - - -if __name__ == '__main__' : - # - # Now we get things done ... 
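(The command-line block below is removed along with the rest of this standalone script; the embedded entry point in data/maker/__init__.py covers the same training flow. A hedged sketch of that usage follows; every file, column and context name in it is a placeholder, not a value taken from this repository.)

    # hedged sketch, not part of the patch: all names below are placeholders
    import data.maker

    data.maker.train(data='observations.csv',   # a CSV path or a pandas DataFrame
                     column='marital_status',   # attribute whose distribution is learned
                     context='marital_status',  # tags the checkpoint and meta files
                     logs='logs',               # folder holding the train/ and output/ artifacts
                     max_epochs=10)
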
- column = SYS_ARGS['column'] - column_id = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id' - column_id = column_id.split(',') if ',' in column_id else column_id - df = pd.read_csv(SYS_ARGS['raw-data']) - LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - - context = SYS_ARGS['raw-data'].split(os.sep)[-1:][0][:-4] - if set(['train','learn']) & set(SYS_ARGS.keys()): - - df = pd.read_csv(SYS_ARGS['raw-data']) - - # cols = SYS_ARGS['column'] - # _map,_df = (Binary()).Export(df) - # i = np.arange(_map[column]['start'],_map[column]['end']) - max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10 - # REAL = _df[:,i] - REAL = pd.get_dummies(df[column]).astype(np.float32).values - LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - trainer = Train(context=context,max_epochs=max_epochs,real=REAL,label=LABEL,column=column,column_id=column_id) - trainer.apply() - - - - - # - # We should train upon this data - # - # -- we need to convert the data-frame to binary matrix, given a column - # - pass - elif 'generate' in SYS_ARGS: - values = df[column].unique().tolist() - values.sort() - - p = Predict(context=context,label=LABEL,values=values,column=column) - p.load_meta(column) - r = p.apply() - print (df) - print () - df[column] = r[column] - print (df) - - - else: - print (SYS_ARGS.keys()) - print (__doc__) - pass - From bd6fb03f8d228028d2be53a33ad50cceb77fdb94 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 26 Feb 2020 09:34:01 -0600 Subject: [PATCH 030/250] updating version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index aefd6d0..477c48a 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.1.3","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.1.4","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 3fbd68309fb57b467063e9ee0b79eb06ff35c7d7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 28 Feb 2020 21:37:26 -0600 Subject: [PATCH 031/250] Handling of continous values --- data/gan.py | 8 +-- data/maker/__init__.py | 114 ++++++++++++++++++++++++++++++++++------- data/maker/__main__.py | 6 +-- 3 files changed, 103 insertions(+), 25 deletions(-) diff --git a/data/gan.py b/data/gan.py index 204f8af..c2aadb5 100644 --- a/data/gan.py +++ b/data/gan.py @@ -604,7 +604,7 @@ class Predict(GNet): r = np.zeros(self.ROW_COUNT) df.columns = self.values if len(found): - print (len(found),NTH_VALID_CANDIDATE) + # print (len(found),NTH_VALID_CANDIDATE) # x = df * self.values # # let's get the missing rows (if any) ... 
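
(The data/maker/__init__.py hunks later in this patch add a ContinuousToDiscrete helper: continuous columns are cut into interval bins, bin membership is one-hot encoded for the GAN, and synthetic rows are decoded by sampling uniformly inside the selected bin. A self-contained sketch of that round-trip, using made-up numbers and the 4-bin default, is shown here for reference only.)

    # illustration only, not part of the patch
    import numpy as np
    import pandas as pd

    x = pd.Series([23.0, 31.5, 47.2, 52.9, 64.1])   # made-up continuous values
    bins = pd.cut(x, 4)                              # one interval per value
    cats = bins.cat.categories                       # the 4 interval bounds
    one_hot = pd.get_dummies(bins).values            # what the GAN actually trains on
    decoded = [np.random.uniform(cats[row.argmax()].left,   # lower bound of the active bin
                                 cats[row.argmax()].right)  # upper bound of the active bin
               for row in one_hot]
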
@@ -704,10 +704,10 @@ if __name__ == '__main__' : p = Predict(context=context,label=LABEL,values=values,column=column) p.load_meta(column) r = p.apply() - print (df) - print () + # print (df) + # print () df[column] = r[column] - print (df) + # print (df) else: diff --git a/data/maker/__init__.py b/data/maker/__init__.py index d5a4308..6114ad2 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -14,6 +14,68 @@ import data.gan as gan from transport import factory from data.bridge import Binary import threading as thread +class ContinuousToDiscrete : + @staticmethod + def binary(X,n=4) : + """ + This function will convert a continous stream of information into a variety a bit stream of bins + """ + # BOUNDS = np.repeat(np.divide(X.max(),n),n).cumsum().tolist() + + BOUNDS = ContinuousToDiscrete.bounds(X,n) + + # _map = [{"index":BOUNDS.index(i),"ubound":i} for i in BOUNDS] + _matrix = [] + m = [] + for value in X : + x_ = np.zeros(n) + _matrix.append(x_) + for row in BOUNDS : + + if value>= row.left and value <= row.right : + index = BOUNDS.index(row) + x_[index] = 1 + break + + return _matrix + + @staticmethod + def bounds(x,n): + return list(pd.cut(np.array(x),n).categories) + + + + @staticmethod + def continuous(X,BIN_SIZE=4) : + """ + This function will approximate a binary vector given boundary information + :X binary matrix + :BIN_SIZE + """ + BOUNDS = ContinuousToDiscrete.bounds(X,BIN_SIZE) + + values = [] + _BINARY= ContinuousToDiscrete.binary(X,BIN_SIZE) + # # print (BOUNDS) + + # values = [] + for row in _BINARY : + # ubound = BOUNDS[row.index(1)] + index = np.where(row == 1)[0][0] + + ubound = BOUNDS[ index ].right + lbound = BOUNDS[ index ].left + + x_ = np.round(np.random.uniform(lbound,ubound),3).astype(float) + values.append(x_) + + lbound = ubound + + return values + + + + def train (**args) : """ This function is intended to train the GAN in order to learn about the distribution of the features @@ -24,22 +86,30 @@ def train (**args) : :context label of what we are synthesizing """ column = args['column'] if (isinstance(args['column'],list)) else [args['column']] - + CONTINUOUS = args['continuous'] if 'continuous' in args else [] # column_id = args['id'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) df.columns = [name.lower() for name in df.columns] - + # + # @TODO: + # Consider sequential training of sub population for extremely large datasets + # + # # If we have several columns we will proceed one at a time (it could be done in separate threads) # @TODO : Consider performing this task on several threads/GPUs simulataneously # - handler = Binary() - # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values - # args['label'] = handler.Export(df[[column_id]]) - # args['label'] = np.ones(df.shape[0]).reshape(df.shape[0],1) - for col in column : - args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values - # args['real'] = handler.Export(df[[col]]) + for col in column : + # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values + # if 'float' not in df[col].dtypes.name : + # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values + if 'float' in df[col].dtypes.name and col in CONTINUOUS: + BIN_SIZE = 10 if 'bin_size' not in args else int(args['bin_size']) + args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32) + else: + args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values + + args['column'] = col 
args['context'] = col context = args['context'] @@ -75,7 +145,7 @@ def generate(**args): """ # df = args['data'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) - + CONTINUOUS = args['continous'] if 'continuous' in args else [] column = args['column'] if (isinstance(args['column'],list)) else [args['column']] # column_id = args['id'] # @@ -86,18 +156,26 @@ def generate(**args): for col in column : args['context'] = col args['column'] = col - values = df[col].unique().tolist() - args['values'] = values - args['row_count'] = df.shape[0] + + if 'float' in df[col].dtypes.name or col in CONTINUOUS : + # + # We should create the bins for the values we are observing here + BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) + values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE) + else: + values = df[col].unique().tolist() + + args['values'] = values + args['row_count'] = df.shape[0] # # we can determine the cardinalities here so we know what to allow or disallow handler = gan.Predict (**args) handler.load_meta(col) - # handler.ROW_COUNT = df[col].shape[0] - r = handler.apply() - # print (r) - # - print ([_df.shape,len(r[col])]) + r = handler.apply() _df[col] = r[col] + # + # @TODO: log basic stats about the synthetic attribute + # + # break return _df \ No newline at end of file diff --git a/data/maker/__main__.py b/data/maker/__main__.py index 583be60..d71d400 100644 --- a/data/maker/__main__.py +++ b/data/maker/__main__.py @@ -17,9 +17,9 @@ if 'config' in SYS_ARGS : odf = pd.read_csv (ARGS['data']) odf.columns = [name.lower() for name in odf.columns] column = ARGS['column'] if isinstance(ARGS['column'],list) else [ARGS['column']] - print (odf.head()) - print (_df.head()) - # print(pd.merge(odf,_df,rsuffix='_io')) + # print (odf.head()) + # print (_df.head()) + print(odf.join(_df[column],rsuffix='_io')) # print (_df[column].risk.evaluate(flag='synth')) # print (odf[column].risk.evaluate(flag='original')) # _x = pd.get_dummies(_df[column]).values From a2988a59720101eeb4648434938e6540a212fcfa Mon Sep 17 00:00:00 2001 From: "Steve L. Nyemba" Date: Sun, 1 Mar 2020 12:07:02 -0600 Subject: [PATCH 032/250] pipeline --- pipeline.py | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 pipeline.py diff --git a/pipeline.py b/pipeline.py new file mode 100644 index 0000000..e6e1225 --- /dev/null +++ b/pipeline.py @@ -0,0 +1,126 @@ +import json +from transport import factory +import os +from multiprocessing import Process +import pandas as pd +from google.oauth2 import service_account +import data.maker + +from data.params import SYS_ARGS + +f = open ('config.json') +PIPELINE = json.loads(f.read()) +f.close() +# +# The configuration array is now loaded and we will execute the pipe line as follows +DATASET='combined20190510_deid' + +class Components : + @staticmethod + def get(args): + SQL = args['sql'] + if 'condition' in args : + condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')']) + SQL = " ".join([SQL,'WHERE',condition]) + + SQL = SQL.replace(':dataset',args['dataset']) #+ " LIMIT 1000 " + return SQL #+ " LIMIT 10000 " + + @staticmethod + def train(args): + """ + This function will instanciate a worker that will train given a message that is provided to it + This is/will be a separate process that will + """ + print (['starting .... 
',args['notify'],args['context']] ) + #SQL = args['sql'] + #if 'condition' in args : + # condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')']) + # SQL = " ".join([SQL,'WHERE',condition]) + print ( args['context']) + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + log_folder = os.sep.join(["logs",args['context']]) + _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":250,"num_gpus":2,"column":args['columns'],"id":"person_id","logger":logger} + os.environ['CUDA_VISIBLE_DEVICES'] = args['gpu'] + #SQL = SQL.replace(':dataset',args['dataset']) #+ " LIMIT 1000 " + SQL = Components.get(args) + if 'limit' in args : + SQL = ' '.join([SQL,'limit',args['limit'] ]) + _args['max_epochs'] = 250 if 'max_epochs' not in args else args['max_epochs'] + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard') + #_args['data'] = _args['data'].astype(object) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + data.maker.train(**_args) + @staticmethod + def generate(args): + """ + This function will generate data and store it to a given, + """ + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + log_folder = os.sep.join(["logs",args['context']]) + _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":250,"num_gpus":2,"column":args['columns'],"id":"person_id","logger":logger} + os.environ['CUDA_VISIBLE_DEVICES'] = args['gpu'] + SQL = Components.get(args) + if 'limit' in args : + SQL = " ".join([SQL ,'limit', args['limit'] ]) + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').fillna('') + #_args['data'] = _args['data'].astype(object) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + + _args['max_epochs'] = 250 if 'max_epochs' not in args else args['max_epochs'] + + _args['no_value'] = args['no_value'] if 'no_value' in args else '' + #credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + #_args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard') + #_args['data'] = _args['data'].astype(object) + _dc = data.maker.generate(**_args) + # + # We need to post the generate the data in order to : + # 1. compare immediately + # 2. 
synthetic copy + # + cols = _dc.columns.tolist() + print (args['columns']) + data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) + base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) + print (_args['data'].shape) + print (_args['data'].shape) + for name in cols : + _args['data'][name] = _dc[name] + # filename = os.sep.join([log_folder,'output',name+'.csv']) + # data_comp[[name]].to_csv(filename,index=False) + + # + #-- Let us store all of this into bigquery + prefix = args['notify']+'.'+_args['context'] + table = '_'.join([prefix,'compare','io']) + data_comp.to_gbq(if_exists='replace',destination_table=table,credentials=credentials,chunksize=50000) + _args['data'].to_gbq(if_exists='replace',destination_table=table.replace('compare','full'),credentials=credentials,chunksize=50000) + data_comp.to_csv(os.sep.join([log_folder,table+'.csv']),index=False) + + +if __name__ == '__main__' : + index = int(SYS_ARGS['index']) + + args = (PIPELINE[index]) + #if 'limit' in SYS_ARGS : + # args['limit'] = SYS_ARGS['limit'] + #args['dataset'] = 'combined20190510' + SYS_ARGS['dataset'] = 'combined20190510_deid' if 'dataset' not in SYS_ARGS else SYS_ARGS['dataset'] + #if 'max_epochs' in SYS_ARGS : + # args['max_epochs'] = SYS_ARGS['max_epochs'] + args = dict(args,**SYS_ARGS) + if 'generate' in SYS_ARGS : + Components.generate(args) + + else: + + Components.train(args) +#for args in PIPELINE : + #args['dataset'] = 'combined20190510' + #process = Process(target=Components.train,args=(args,)) + #process.name = args['context'] + #process.start() +# Components.train(args) From 8e722d5bf1e12b7694589ca9dc3b716c55841584 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 11:49:18 -0600 Subject: [PATCH 033/250] bug fix and upgrades to base functionalities --- data/gan.py | 19 +- data/maker/__init__.py | 35 ++-- pipeline.py | 384 ++++++++++++++++++++++++++++++----------- 3 files changed, 313 insertions(+), 125 deletions(-) diff --git a/data/gan.py b/data/gan.py index c2aadb5..b3b9cf8 100644 --- a/data/gan.py +++ b/data/gan.py @@ -431,9 +431,9 @@ class Train (GNet): def network(self,**args): stage = args['stage'] - opt = args['opt'] + opt = args['opt'] tower_grads = [] - per_gpu_w = [] + per_gpu_w = [] iterator, features_placeholder, labels_placeholder = self.input_fn() with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()): for i in range(self.NUM_GPUS): @@ -550,6 +550,7 @@ class Predict(GNet): label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) else: label = None + fake = self.generator.network(inputs=z, label=label) init = tf.compat.v1.global_variables_initializer() saver = tf.compat.v1.train.Saver() @@ -577,11 +578,13 @@ class Predict(GNet): # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes # The code below will insure we have some acceptable cardinal relationships between id and synthetic values # - df = ( pd.DataFrame(np.round(f).astype(np.int32))) + df = pd.DataFrame(np.round(f).astype(np.int32)) + + p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values - if np.divide( np.sum(x), x.size) > .9 or p: + if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size: ratio.append(np.divide( np.sum(x), x.size)) found.append(df) if i == CANDIDATE_COUNT: @@ -597,11 +600,13 @@ class Predict(GNet): INDEX = np.random.choice(np.arange(len(found)),1)[0] 
INDEX = ratio.index(np.max(ratio)) + df = found[INDEX] columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] # r = np.zeros((self.ROW_COUNT,len(columns))) - r = np.zeros(self.ROW_COUNT) + # r = np.zeros(self.ROW_COUNT) + df.columns = self.values if len(found): # print (len(found),NTH_VALID_CANDIDATE) @@ -618,6 +623,10 @@ class Predict(GNet): missing = np.repeat(0, np.where(ii==1)[0].size) else: missing = [] + # + # @TODO: + # Log the findings here in terms of ratio, missing, candidate count + # print ([np.max(ratio),len(missing),len(found),i]) i = np.where(ii == 0)[0] df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) df.columns = columns diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 6114ad2..080939c 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -15,6 +15,7 @@ from transport import factory from data.bridge import Binary import threading as thread class ContinuousToDiscrete : + ROUND_UP = 2 @staticmethod def binary(X,n=4) : """ @@ -22,7 +23,7 @@ class ContinuousToDiscrete : """ # BOUNDS = np.repeat(np.divide(X.max(),n),n).cumsum().tolist() - BOUNDS = ContinuousToDiscrete.bounds(X,n) + BOUNDS = ContinuousToDiscrete.bounds(np.round(X,ContinuousToDiscrete.ROUND_UP),n) # _map = [{"index":BOUNDS.index(i),"ubound":i} for i in BOUNDS] _matrix = [] @@ -41,7 +42,7 @@ class ContinuousToDiscrete : @staticmethod def bounds(x,n): - return list(pd.cut(np.array(x),n).categories) + return list(pd.cut(np.array( np.round(x,ContinuousToDiscrete.ROUND_UP) ),n).categories) @@ -66,7 +67,7 @@ class ContinuousToDiscrete : ubound = BOUNDS[ index ].right lbound = BOUNDS[ index ].left - x_ = np.round(np.random.uniform(lbound,ubound),3).astype(float) + x_ = np.round(np.random.uniform(lbound,ubound),ContinuousToDiscrete.ROUND_UP).astype(float) values.append(x_) lbound = ubound @@ -104,10 +105,10 @@ def train (**args) : # if 'float' not in df[col].dtypes.name : # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values if 'float' in df[col].dtypes.name and col in CONTINUOUS: - BIN_SIZE = 10 if 'bin_size' not in args else int(args['bin_size']) + BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32) else: - args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values + args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values args['column'] = col @@ -157,25 +158,27 @@ def generate(**args): args['context'] = col args['column'] = col - if 'float' in df[col].dtypes.name or col in CONTINUOUS : - # - # We should create the bins for the values we are observing here - BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE) - else: - values = df[col].unique().tolist() + # if 'float' in df[col].dtypes.name or col in CONTINUOUS : + # # + # # We should create the bins for the values we are observing here + # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) + # values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE) + # # values = np.unique(values).tolist() + # else: + values = df[col].unique().tolist() args['values'] = values args['row_count'] = df.shape[0] # # we can determine the cardinalities here so we know what to allow or disallow - handler = gan.Predict (**args) + handler = gan.Predict (**args) 
handler.load_meta(col) - r = handler.apply() - _df[col] = r[col] + r = handler.apply() + BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) + _df[col] = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if 'float' in df[col].dtypes.name or col in CONTINUOUS else r[col] # # @TODO: log basic stats about the synthetic attribute # - + # print (r)s # break return _df \ No newline at end of file diff --git a/pipeline.py b/pipeline.py index e6e1225..134ca8c 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,5 +1,6 @@ import json from transport import factory +import numpy as np import os from multiprocessing import Process import pandas as pd @@ -8,119 +9,294 @@ import data.maker from data.params import SYS_ARGS -f = open ('config.json') -PIPELINE = json.loads(f.read()) -f.close() # # The configuration array is now loaded and we will execute the pipe line as follows -DATASET='combined20190510_deid' +DATASET='combined20190510' class Components : - @staticmethod - def get(args): - SQL = args['sql'] - if 'condition' in args : - condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')']) - SQL = " ".join([SQL,'WHERE',condition]) - SQL = SQL.replace(':dataset',args['dataset']) #+ " LIMIT 1000 " - return SQL #+ " LIMIT 10000 " + @staticmethod + def get(args): + """ + This function returns a data-frame provided a bigquery sql statement with conditions (and limits for testing purposes) + The function must be wrapped around a lambda this makes testing easier and changing data stores transparent to the rest of the code. (Vital when testing) + :sql basic sql statement + :condition optional condition and filters + """ + SQL = args['sql'] + if 'condition' in args : + condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')']) + SQL = " ".join([SQL,'WHERE',condition]) - @staticmethod - def train(args): - """ - This function will instanciate a worker that will train given a message that is provided to it - This is/will be a separate process that will - """ - print (['starting .... 
',args['notify'],args['context']] ) - #SQL = args['sql'] - #if 'condition' in args : - # condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')']) - # SQL = " ".join([SQL,'WHERE',condition]) - print ( args['context']) - logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) - log_folder = os.sep.join(["logs",args['context']]) - _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":250,"num_gpus":2,"column":args['columns'],"id":"person_id","logger":logger} - os.environ['CUDA_VISIBLE_DEVICES'] = args['gpu'] - #SQL = SQL.replace(':dataset',args['dataset']) #+ " LIMIT 1000 " - SQL = Components.get(args) - if 'limit' in args : - SQL = ' '.join([SQL,'limit',args['limit'] ]) - _args['max_epochs'] = 250 if 'max_epochs' not in args else args['max_epochs'] - credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard') - #_args['data'] = _args['data'].astype(object) - _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 - data.maker.train(**_args) - @staticmethod - def generate(args): - """ - This function will generate data and store it to a given, - """ - logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) - log_folder = os.sep.join(["logs",args['context']]) - _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":250,"num_gpus":2,"column":args['columns'],"id":"person_id","logger":logger} - os.environ['CUDA_VISIBLE_DEVICES'] = args['gpu'] - SQL = Components.get(args) - if 'limit' in args : - SQL = " ".join([SQL ,'limit', args['limit'] ]) - credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').fillna('') - #_args['data'] = _args['data'].astype(object) - _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 - - _args['max_epochs'] = 250 if 'max_epochs' not in args else args['max_epochs'] + SQL = SQL.replace(':dataset',args['dataset']) #+ " LIMIT 1000 " + if 'limit' in args : + SQL = SQL + 'LIMIT ' + args['limit'] + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() + return df + + # return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna() + @staticmethod + def split(X,MAX_ROWS=3,PART_SIZE=3): + + return list(pd.cut( np.arange(X.shape[0]+1),PART_SIZE).categories) - _args['no_value'] = args['no_value'] if 'no_value' in args else '' - #credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - #_args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard') - #_args['data'] = _args['data'].astype(object) - _dc = data.maker.generate(**_args) - # - # We need to post the generate the data in order to : - # 1. compare immediately - # 2. 
synthetic copy - # - cols = _dc.columns.tolist() - print (args['columns']) - data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) - base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) - print (_args['data'].shape) - print (_args['data'].shape) - for name in cols : - _args['data'][name] = _dc[name] - # filename = os.sep.join([log_folder,'output',name+'.csv']) - # data_comp[[name]].to_csv(filename,index=False) + def train(self,**args): + """ + This function will perform training on the basis of a given pointer that reads data - # - #-- Let us store all of this into bigquery - prefix = args['notify']+'.'+_args['context'] - table = '_'.join([prefix,'compare','io']) - data_comp.to_gbq(if_exists='replace',destination_table=table,credentials=credentials,chunksize=50000) - _args['data'].to_gbq(if_exists='replace',destination_table=table.replace('compare','full'),credentials=credentials,chunksize=50000) - data_comp.to_csv(os.sep.join([log_folder,table+'.csv']),index=False) - + """ + # + # @TODO: we need to log something here about the parameters being passed + pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) + df = pointer() + # + # Now we can parse the arguments and submit the entire thing to training + # + + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + log_folder = args['logs'] if 'logs' in args else 'logs' + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + + MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 + PART_SIZE = args['part_size'] if 'part_size' in args else 0 + + if df.shape[0] > MAX_ROWS and 'partition' not in args: + lbound = 0 + bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) + # bounds = Components.split(df,MAX_ROWS,PART_SIZE) + + qwriter = factory.instance(type='queue.QueueWriter',args={'queue':'aou.io'}) + + for b in bounds : + part_index = bounds.index(b) + ubound = int(b.right) + + + _data = df.iloc[lbound:ubound][args['columns']] + lbound = ubound + + # _args['logs'] = os.sep.join([log_folder,str(part_index)]) + _args['partition'] = str(part_index) + _args['logger'] = {'args':{'dbname':'aou','doc':args['context']},'type':'mongo.MongoWriter'} + # + # We should post the the partitions to a queue server (at least the instructions on ): + # - where to get the data + # - and athe arguments to use (partition #,columns,gpu,epochs) + # + info = {"rows":_data.shape[0],"cols":_data.shape[1], "paritition":part_index,"logs":_args['logs']} + p = {"args":_args,"data":_data.to_dict(orient="records"),"info":info} + qwriter.write(p) + # + # @TODO: + # - Notify that information was just posted to the queue + info['max_rows'] = MAX_ROWS + info['part_size'] = PART_SIZE + logger.write({"module":"train","action":"setup-partition","input":info}) + + pass + else: + partition = args['partition'] if 'partition' in args else '' + log_folder = os.sep.join([log_folder,args['context'],partition]) + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args['max_epochs'] = 150 if 'max_epochs' not in args else 
int(args['max_epochs']) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' + + _args['data'] = df + # + # @log : + # Logging information about the training process for this partition (or not) + # + info = {"rows":df.shape[0],"cols":df.shape[1], "partition":partition,"logs":_args['logs']} + logger.write({"module":"train","action":"train","input":info}) + data.maker.train(**_args) + + pass + + # @staticmethod + def generate(self,args): + """ + This function will generate data and store it to a given, + """ + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + log_folder = args['logs'] if 'logs' in args else 'logs' + partition = args['partition'] if 'partition' in args else '' + log_folder = os.sep.join([log_folder,args['context'],partition]) + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' + _args['no_value']= args['no_value'] + MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 + PART_SIZE = args['part_size'] if 'part_size' in args else 0 + + # credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + # _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() + reader = args['reader'] + df = reader() + if 'partition' in args : + bounds = Components.split(df,MAX_ROWS,PART_SIZE) + # bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) + lbound = int(bounds[int(partition)].left) + ubound = int(bounds[int(partition)].right) + df = df.iloc[lbound:ubound] + _args['data'] = df + # _args['data'] = reader() + #_args['data'] = _args['data'].astype(object) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + _dc = data.maker.generate(**_args) + # + # We need to post the generate the data in order to : + # 1. compare immediately + # 2. 
synthetic copy + # + + cols = _dc.columns.tolist() + + data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) + base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) + + for name in cols : + _args['data'][name] = _dc[name] + info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} + if partition != '' : + info['partition'] = partition + logger.write(info) + # filename = os.sep.join([log_folder,'output',name+'.csv']) + # data_comp[[name]].to_csv(filename,index=False) + + # + #-- Let us store all of this into bigquery + prefix = args['notify']+'.'+_args['context'] + table = '_'.join([prefix,partition,'io']).replace('__','_') + folder = os.sep.join([args['logs'],args['context'],partition,'output']) + if 'file' in args : + + _fname = os.sep.join([folder,table.replace('_io','_full_io.csv')]) + _pname = os.sep.join([folder,table])+'.csv' + data_comp.to_csv( _pname,index=False) + _args['data'].to_csv(_fname,index=False) + + + else: + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + _pname = os.sep.join([folder,table+'.csv']) + _fname = table.replace('_io','_full_io') + data_comp.to_gbq(if_exists='replace',destination_table=_pname,credentials='credentials',chunk_size=50000) + data_comp.to_csv(_pname,index=False) + INSERT_FLAG = 'replace' if 'partition' not in args else 'append' + _args['data'].to_gbq(if_exists=INSERT_FLAG,destination_table=_fname,credentials='credentials',chunk_size=50000) + + info = {"full":{"path":_fname,"rows":_args['data'].shape[0]},"compare":{"name":_pname,"rows":data_comp.shape[0]} } + if partition : + info ['partition'] = partition + logger.write({"module":"generate","action":"write","info":info} ) + @staticmethod + def callback(channel,method,header,stream): + + info = json.loads(stream) + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':SYS_ARGS['context']}) + + logger.write({'module':'process','action':'read-partition','input':info['info']}) + df = pd.DataFrame(info['data']) + args = info['args'] + if int(args['num_gpu']) > 1 and args['gpu'] > 0: + args['gpu'] = args['gpu'] + args['num_gpu'] + args['reader'] = lambda: df + # + # @TODO: Fix + # There is an inconsistency in column/columns ... fix this shit! + # + args['columns'] = args['column'] + (Components()).train(**args) + logger.write({"module":"process","action":"exit","info":info["info"]}) + channel.close() + channel.connection.close() + pass + if __name__ == '__main__' : - index = int(SYS_ARGS['index']) + filename = SYS_ARGS['config'] if 'config' in SYS_ARGS else 'config.json' + f = open (filename) + PIPELINE = json.loads(f.read()) + f.close() + index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else 0 + + args = (PIPELINE[index]) + args['dataset'] = 'combined20190510' + args = dict(args,**SYS_ARGS) + args['max_rows'] = int(args['max_rows']) if 'max_rows' in args else 3 + args['part_size']= int(args['part_size']) if 'part_size' in args else 3 + + # + # @TODO: + # Log what was initiated so we have context of this processing ... 
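Since part_size controls how the frame is carved up, the partition logic used throughout this patch (Components.split plus the iloc slice) can be restated compactly; this is an illustrative simplification assuming only pandas and numpy:

import numpy as np
import pandas as pd

def take_partition(df, part_index, part_size=3):
    # cut the row index into part_size intervals, then return the rows inside the requested one
    bounds = list(pd.cut(np.arange(df.shape[0] + 1), part_size).categories)
    lbound = int(bounds[int(part_index)].left)
    ubound = int(bounds[int(part_index)].right)
    return df.iloc[lbound:ubound]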
+ # + if 'listen' not in SYS_ARGS : + if 'file' in args : + reader = lambda: pd.read_csv(args['file']) ; + else: + reader = lambda: Components().get(args) + args['reader'] = reader + + if 'generate' in SYS_ARGS : + # + # Let us see if we have partitions given the log folder + + content = os.listdir( os.sep.join([args['logs'],args['context']])) + generator = Components() + if ''.join(content).isnumeric() : + # + # we have partitions we are working with + + for id in ''.join(content) : + args['partition'] = id + + generator.generate(args) + else: + generator.generate(args) + # Components.generate(args) + elif 'listen' in args : + # + # This will start a worker just in case to listen to a queue + if 'read' in SYS_ARGS : + QUEUE_TYPE = 'queue.QueueReader' + pointer = lambda qreader: qreader.read(1) + else: + QUEUE_TYPE = 'queue.QueueListener' + pointer = lambda qlistener: qlistener.listen() + N = int(SYS_ARGS['jobs']) if 'jobs' in SYS_ARGS else 1 + + qhandlers = [factory.instance(type=QUEUE_TYPE,args={'queue':'aou.io'}) for i in np.arange(N)] + jobs = [] + for qhandler in qhandlers : + qhandler.callback = Components.callback + job = Process(target=pointer,args=(qhandler,)) + job.start() + jobs.append(job) + # + # let us wait for the jobs + print (["Started ",len(jobs)," trainers"]) + while len(jobs) > 0 : + + jobs = [job for job in jobs if job.is_alive()] + + # pointer(qhandler) + - args = (PIPELINE[index]) - #if 'limit' in SYS_ARGS : - # args['limit'] = SYS_ARGS['limit'] - #args['dataset'] = 'combined20190510' - SYS_ARGS['dataset'] = 'combined20190510_deid' if 'dataset' not in SYS_ARGS else SYS_ARGS['dataset'] - #if 'max_epochs' in SYS_ARGS : - # args['max_epochs'] = SYS_ARGS['max_epochs'] - args = dict(args,**SYS_ARGS) - if 'generate' in SYS_ARGS : - Components.generate(args) - - else: - - Components.train(args) + # qreader.read(1) + pass + else: + + trainer = Components() + trainer.train(**args) + # Components.train(**args) #for args in PIPELINE : - #args['dataset'] = 'combined20190510' - #process = Process(target=Components.train,args=(args,)) - #process.name = args['context'] - #process.start() -# Components.train(args) + #args['dataset'] = 'combined20190510' + #process = Process(target=Components.train,args=(args,)) + #process.name = args['context'] + #process.start() +# Components.train(args) From eb1abdf9892cabf6654f55855be0c0aba9a36772 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 12:16:50 -0600 Subject: [PATCH 034/250] bug fix with installer within branch --- pipeline.py | 1 + setup.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 134ca8c..04658da 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import json from transport import factory import numpy as np diff --git a/setup.py b/setup.py index 477c48a..e6b988a 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.1.4","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.1.6","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = 
['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' @@ -12,4 +12,5 @@ args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' if sys.version_info[0] == 2 : args['use_2to3'] = False args['use_2to3_exclude_fixers'] = ['lib2to3.fixes.fix_import'] +args['scripts']=['pipeline.py'] setup(**args) From 076db1ec2ccf391656f906e360eb8b2cd9ec8521 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 13:22:25 -0600 Subject: [PATCH 035/250] bug fix --- pipeline.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 04658da..fd5a28e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -33,7 +33,7 @@ class Components : if 'limit' in args : SQL = SQL + 'LIMIT ' + args['limit'] credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() + df = pd.read_gbq(SQL,credentials=credentials,dialect='standard') return df # return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna() @@ -51,7 +51,9 @@ class Components : # @TODO: we need to log something here about the parameters being passed pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) df = pointer() - + if df.shape[0] == 0 : + print ("CAN NOT TRAIN EMPTY DATASET ") + return # # Now we can parse the arguments and submit the entire thing to training # From 14cce1ef09ed96ebb16cf9b785494ea4a08df096 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 13:40:26 -0600 Subject: [PATCH 036/250] bug fix with worker ... 
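For context on the worker mode being adjusted in the next few patches, the listen branch shown earlier reduces to a fan-out-and-poll pattern. The sketch below keeps only the standard-library part; the queue handlers and the callback come from the transport factory and are treated as opaque placeholders here:

import time
from multiprocessing import Process

def run_jobs(handlers, pointer):
    # start one process per queue handler, then poll until they have all exited,
    # mirroring the job loop in pipeline.py's __main__
    jobs = []
    for handler in handlers:
        job = Process(target=pointer, args=(handler,))
        job.start()
        jobs.append(job)
    while jobs:
        jobs = [job for job in jobs if job.is_alive()]
        time.sleep(2)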
--- pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipeline.py b/pipeline.py index fd5a28e..8c8a7d7 100644 --- a/pipeline.py +++ b/pipeline.py @@ -233,6 +233,7 @@ if __name__ == '__main__' : args = dict(args,**SYS_ARGS) args['max_rows'] = int(args['max_rows']) if 'max_rows' in args else 3 args['part_size']= int(args['part_size']) if 'part_size' in args else 3 + # # @TODO: @@ -265,6 +266,7 @@ if __name__ == '__main__' : elif 'listen' in args : # # This will start a worker just in case to listen to a queue + SYS_ARGS = dict(args) #-- things get lost in context if 'read' in SYS_ARGS : QUEUE_TYPE = 'queue.QueueReader' pointer = lambda qreader: qreader.read(1) From 832581303b623c4cc6cc3cf43f4716ac1427f773 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 14:08:10 -0600 Subject: [PATCH 037/250] bug fix: gpu assignement error --- pipeline.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 8c8a7d7..b53ba52 100644 --- a/pipeline.py +++ b/pipeline.py @@ -63,6 +63,7 @@ class Components : _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + _args['gpu'] = args['gpu'] if 'gpu' in args else 0 MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 PART_SIZE = args['part_size'] if 'part_size' in args else 0 @@ -85,6 +86,7 @@ class Components : # _args['logs'] = os.sep.join([log_folder,str(part_index)]) _args['partition'] = str(part_index) _args['logger'] = {'args':{'dbname':'aou','doc':args['context']},'type':'mongo.MongoWriter'} + # # We should post the the partitions to a queue server (at least the instructions on ): # - where to get the data @@ -207,8 +209,9 @@ class Components : logger.write({'module':'process','action':'read-partition','input':info['info']}) df = pd.DataFrame(info['data']) args = info['args'] + args['gpu'] = int(info['info']['partition']) if int(args['num_gpu']) > 1 and args['gpu'] > 0: - args['gpu'] = args['gpu'] + args['num_gpu'] + args['gpu'] = args['gpu'] + args['num_gpu'] if args['gpu'] + args['num_gpu'] < 8 else 0 args['reader'] = lambda: df # # @TODO: Fix From 2b7b1757f92840a65fa8a599ff475f06e7be00ca Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 14:17:28 -0600 Subject: [PATCH 038/250] bug fixes with callback (worker) --- pipeline.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index b53ba52..a39dbc7 100644 --- a/pipeline.py +++ b/pipeline.py @@ -211,7 +211,7 @@ class Components : args = info['args'] args['gpu'] = int(info['info']['partition']) if int(args['num_gpu']) > 1 and args['gpu'] > 0: - args['gpu'] = args['gpu'] + args['num_gpu'] if args['gpu'] + args['num_gpu'] < 8 else 0 + args['gpu'] = args['gpu'] + args['num_gpu'] if args['gpu'] + args['num_gpu'] < 8 else 0 #-- 8 max gpus args['reader'] = lambda: df # # @TODO: Fix diff --git a/setup.py b/setup.py index e6b988a..26e2e9d 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.1.6","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.1.7","author":"Vanderbilt University 
Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 142491cf5bcd15f61f5cc79023b1cc9dcfeed00f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 14:27:54 -0600 Subject: [PATCH 039/250] Bug Fix: GPU work load --- pipeline.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index a39dbc7..fcf9912 100644 --- a/pipeline.py +++ b/pipeline.py @@ -209,9 +209,10 @@ class Components : logger.write({'module':'process','action':'read-partition','input':info['info']}) df = pd.DataFrame(info['data']) args = info['args'] - args['gpu'] = int(info['info']['partition']) - if int(args['num_gpu']) > 1 and args['gpu'] > 0: - args['gpu'] = args['gpu'] + args['num_gpu'] if args['gpu'] + args['num_gpu'] < 8 else 0 #-- 8 max gpus + MAX_GPUS = 8 + args['gpu'] = int(info['info']['partition']) if info['info']['partition'] < MAX_GPUS else np.random.choice(np.arange(MAX_GPUS),1).astype(int).tolist()[0] + # if int(args['num_gpu']) > 1 and args['gpu'] > 0: + # args['gpu'] = args['gpu'] + args['num_gpu'] if args['gpu'] + args['num_gpu'] < 8 else args['gpu'] #-- 8 max gpus args['reader'] = lambda: df # # @TODO: Fix From 84b3e6c0484f7931a587ccfc41a3f4810ebbdffe Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 14:30:40 -0600 Subject: [PATCH 040/250] dataset ... (fix) --- pipeline.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index fcf9912..6bbec2f 100644 --- a/pipeline.py +++ b/pipeline.py @@ -12,7 +12,7 @@ from data.params import SYS_ARGS # # The configuration array is now loaded and we will execute the pipe line as follows -DATASET='combined20190510' +DATASET='combined20191004v2_deid' class Components : @@ -233,11 +233,12 @@ if __name__ == '__main__' : index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else 0 args = (PIPELINE[index]) - args['dataset'] = 'combined20190510' + args = dict(args,**SYS_ARGS) args['max_rows'] = int(args['max_rows']) if 'max_rows' in args else 3 args['part_size']= int(args['part_size']) if 'part_size' in args else 3 - + if 'dataset' not in args : + args['dataset'] = 'combined20191004v2_deid' # # @TODO: From f4295041f9e441922ceec7416b79a739f4eb036c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 16:38:44 -0600 Subject: [PATCH 041/250] bug fix: type --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 6bbec2f..cd527a5 100644 --- a/pipeline.py +++ b/pipeline.py @@ -92,7 +92,7 @@ class Components : # - where to get the data # - and athe arguments to use (partition #,columns,gpu,epochs) # - info = {"rows":_data.shape[0],"cols":_data.shape[1], "paritition":part_index,"logs":_args['logs']} + info = {"rows":_data.shape[0],"cols":_data.shape[1], "partition":part_index,"logs":_args['logs']} p = {"args":_args,"data":_data.to_dict(orient="records"),"info":info} qwriter.write(p) # From dd7fd5696bf682cd9b70786e2ccaa454aa62d841 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 5 Mar 2020 11:49:14 -0600 Subject: [PATCH 042/250] bug fix with partitions (generation may require it regardless) --- data/gan.py | 4 ++-- pipeline.py | 30 
++++++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/data/gan.py b/data/gan.py index b3b9cf8..a591f34 100644 --- a/data/gan.py +++ b/data/gan.py @@ -356,7 +356,7 @@ class Train (GNet): self.meta = self.log_meta() if(self.logger): - self.logger.write( self.meta ) + self.logger.write({"module":"gan-train","action":"start","input":self.meta} ) # self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta) def load_meta(self, column): @@ -514,7 +514,7 @@ class Train (GNet): # # if self.logger : - row = {"logs":logs} #,"model":pickle.dump(sess)} + row = {"module":"gan-train","action":"logs","input":logs} #,"model":pickle.dump(sess)} self.logger.write(row) # # @TODO: diff --git a/pipeline.py b/pipeline.py index cd527a5..58b5380 100644 --- a/pipeline.py +++ b/pipeline.py @@ -131,6 +131,7 @@ class Components : log_folder = args['logs'] if 'logs' in args else 'logs' partition = args['partition'] if 'partition' in args else '' log_folder = os.sep.join([log_folder,args['context'],partition]) + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 @@ -143,12 +144,31 @@ class Components : # _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() reader = args['reader'] df = reader() - if 'partition' in args : + bounds = Components.split(df,MAX_ROWS,PART_SIZE) + if partition != '' and os.path.exists(log_folder): bounds = Components.split(df,MAX_ROWS,PART_SIZE) # bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) lbound = int(bounds[int(partition)].left) ubound = int(bounds[int(partition)].right) df = df.iloc[lbound:ubound] + else: + # + # We have an implicit partition here + # bounds = Components.split(df,MAX_ROWS,PART_SIZE) + logger.write({"module":"generate","action":"virtual-parititions","input":{"rows":df.shape[0],"max_rows":MAX_ROWS,"part_size":PART_SIZE}}) + for item in bounds : + + lbound = int(item.left) + ubound = int(item.right) + args['reader'] = lambda: df[lbound:ubound] + args['partition'] = bounds.index(item) + + self.generate(args) + return ; + if not os.path.exists(log_folder) : + log_folder = log_folder.replace(partition,'') + _args['logs'] = log_folder + _args['data'] = df # _args['data'] = reader() #_args['data'] = _args['data'].astype(object) @@ -193,7 +213,7 @@ class Components : _fname = table.replace('_io','_full_io') data_comp.to_gbq(if_exists='replace',destination_table=_pname,credentials='credentials',chunk_size=50000) data_comp.to_csv(_pname,index=False) - INSERT_FLAG = 'replace' if 'partition' not in args else 'append' + INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' _args['data'].to_gbq(if_exists=INSERT_FLAG,destination_table=_fname,credentials='credentials',chunk_size=50000) info = {"full":{"path":_fname,"rows":_args['data'].shape[0]},"compare":{"name":_pname,"rows":data_comp.shape[0]} } @@ -235,8 +255,9 @@ if __name__ == '__main__' : args = (PIPELINE[index]) args = dict(args,**SYS_ARGS) - args['max_rows'] = int(args['max_rows']) if 'max_rows' in args else 3 - args['part_size']= int(args['part_size']) if 'part_size' in args else 3 + args['max_rows'] = int(args['max_rows']) if 'max_rows' in args else 3 + args['part_size'] = int(args['part_size']) if 'part_size' in args 
else 4 + args['logs'] = args['logs'] if 'logs' in args else 'logs' if 'dataset' not in args : args['dataset'] = 'combined20191004v2_deid' @@ -257,6 +278,7 @@ if __name__ == '__main__' : content = os.listdir( os.sep.join([args['logs'],args['context']])) generator = Components() + if ''.join(content).isnumeric() : # # we have partitions we are working with From c9115fe473fe3d4b2c2b7996abd968feacba90b7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 5 Mar 2020 11:50:04 -0600 Subject: [PATCH 043/250] version update --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 26e2e9d..02f49a2 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.1.7","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.1.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 27473989f9804564f081285b884c5921752d5a94 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 5 Mar 2020 12:03:04 -0600 Subject: [PATCH 044/250] bug fix ... --- pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 58b5380..5442935 100644 --- a/pipeline.py +++ b/pipeline.py @@ -130,8 +130,8 @@ class Components : logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) log_folder = args['logs'] if 'logs' in args else 'logs' partition = args['partition'] if 'partition' in args else '' - log_folder = os.sep.join([log_folder,args['context'],partition]) - + log_folder = os.sep.join([log_folder,args['context'],str(partition)]) + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 From c75bb54d2b8bd4df6a5a55c76db62a3e3b9caaee Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 5 Mar 2020 23:49:55 -0600 Subject: [PATCH 045/250] bug fix: mandatory partitioning while training --- pipeline.py | 81 ++++++++++++++++++++++------------------------------- setup.py | 2 +- 2 files changed, 35 insertions(+), 48 deletions(-) diff --git a/pipeline.py b/pipeline.py index 5442935..917026d 100644 --- a/pipeline.py +++ b/pipeline.py @@ -61,27 +61,23 @@ class Components : logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) log_folder = args['logs'] if 'logs' in args else 'logs' _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 _args['gpu'] = args['gpu'] if 'gpu' in args else 0 - MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 - PART_SIZE = args['part_size'] if 'part_size' 
in args else 0 + # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 + PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 - if df.shape[0] > MAX_ROWS and 'partition' not in args: + if 'partition' not in args: lbound = 0 - bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) + # bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) # bounds = Components.split(df,MAX_ROWS,PART_SIZE) - + columns = args['columns'] + df = np.array_split(df[columns].values,PART_SIZE) qwriter = factory.instance(type='queue.QueueWriter',args={'queue':'aou.io'}) - - for b in bounds : - part_index = bounds.index(b) - ubound = int(b.right) - - - _data = df.iloc[lbound:ubound][args['columns']] - lbound = ubound + part_index = 0 + for _df in df: # _args['logs'] = os.sep.join([log_folder,str(part_index)]) _args['partition'] = str(part_index) @@ -92,14 +88,20 @@ class Components : # - where to get the data # - and athe arguments to use (partition #,columns,gpu,epochs) # - info = {"rows":_data.shape[0],"cols":_data.shape[1], "partition":part_index,"logs":_args['logs']} - p = {"args":_args,"data":_data.to_dict(orient="records"),"info":info} + + _df = pd.DataFrame(_df,columns=columns) + # print (columns) + + info = {"rows":_df.shape[0],"cols":_df.shape[1], "partition":part_index,"logs":_args['logs'],"num_gpu":2,"part_size":PART_SIZE} + p = {"args":_args,"data":_df.to_dict(orient="records"),"info":info} + part_index += 1 qwriter.write(p) # # @TODO: # - Notify that information was just posted to the queue - info['max_rows'] = MAX_ROWS - info['part_size'] = PART_SIZE + # In case we want slow-mode, we can store the partitions in mongodb and process (Yes|No)? + # + logger.write({"module":"train","action":"setup-partition","input":info}) pass @@ -137,37 +139,18 @@ class Components : _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' _args['no_value']= args['no_value'] - MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 - PART_SIZE = args['part_size'] if 'part_size' in args else 0 + # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 + PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 # credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') # _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() reader = args['reader'] df = reader() - bounds = Components.split(df,MAX_ROWS,PART_SIZE) + # bounds = Components.split(df,MAX_ROWS,PART_SIZE) if partition != '' and os.path.exists(log_folder): - bounds = Components.split(df,MAX_ROWS,PART_SIZE) - # bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) - lbound = int(bounds[int(partition)].left) - ubound = int(bounds[int(partition)].right) - df = df.iloc[lbound:ubound] - else: - # - # We have an implicit partition here - # bounds = Components.split(df,MAX_ROWS,PART_SIZE) - logger.write({"module":"generate","action":"virtual-parititions","input":{"rows":df.shape[0],"max_rows":MAX_ROWS,"part_size":PART_SIZE}}) - for item in bounds : - - lbound = int(item.left) - ubound = int(item.right) - args['reader'] = lambda: df[lbound:ubound] - args['partition'] = bounds.index(item) - - self.generate(args) - return ; - if not os.path.exists(log_folder) : - log_folder = log_folder.replace(partition,'') - _args['logs'] = log_folder + columns = args['columns'] + df = np.array_split(df[columns].values,PART_SIZE) + df = 
pd.DataFrame(df[ int (partition) ],columns = columns) _args['data'] = df # _args['data'] = reader() @@ -189,7 +172,7 @@ class Components : _args['data'][name] = _dc[name] info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} if partition != '' : - info['partition'] = partition + info['partition'] = int(partition) logger.write(info) # filename = os.sep.join([log_folder,'output',name+'.csv']) # data_comp[[name]].to_csv(filename,index=False) @@ -218,7 +201,7 @@ class Components : info = {"full":{"path":_fname,"rows":_args['data'].shape[0]},"compare":{"name":_pname,"rows":data_comp.shape[0]} } if partition : - info ['partition'] = partition + info ['partition'] = int(partition) logger.write({"module":"generate","action":"write","info":info} ) @staticmethod def callback(channel,method,header,stream): @@ -229,8 +212,12 @@ class Components : logger.write({'module':'process','action':'read-partition','input':info['info']}) df = pd.DataFrame(info['data']) args = info['args'] - MAX_GPUS = 8 - args['gpu'] = int(info['info']['partition']) if info['info']['partition'] < MAX_GPUS else np.random.choice(np.arange(MAX_GPUS),1).astype(int).tolist()[0] + if args['num_gpu'] > 1 : + args['gpu'] = int(info['info']['partition']) if info['info']['partition'] == 0 else info['info']['partition'] + 2 + args['num_gpu'] = 2 + else: + args['gpu'] = 0 + args['num_gpu'] = 1 # if int(args['num_gpu']) > 1 and args['gpu'] > 0: # args['gpu'] = args['gpu'] + args['num_gpu'] if args['gpu'] + args['num_gpu'] < 8 else args['gpu'] #-- 8 max gpus args['reader'] = lambda: df @@ -296,7 +283,7 @@ if __name__ == '__main__' : SYS_ARGS = dict(args) #-- things get lost in context if 'read' in SYS_ARGS : QUEUE_TYPE = 'queue.QueueReader' - pointer = lambda qreader: qreader.read(1) + pointer = lambda qreader: qreader.read() else: QUEUE_TYPE = 'queue.QueueListener' pointer = lambda qlistener: qlistener.listen() diff --git a/setup.py b/setup.py index 02f49a2..bcacb62 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.1.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.1.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 32a5e19060e286a2b8a24f296b91e2768ccff45d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 11:17:29 -0600 Subject: [PATCH 046/250] bug fix with minor corrections --- pipeline.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pipeline.py b/pipeline.py index 917026d..c5a16d8 100644 --- a/pipeline.py +++ b/pipeline.py @@ -92,7 +92,7 @@ class Components : _df = pd.DataFrame(_df,columns=columns) # print (columns) - info = {"rows":_df.shape[0],"cols":_df.shape[1], "partition":part_index,"logs":_args['logs'],"num_gpu":2,"part_size":PART_SIZE} + info = {"rows":_df.shape[0],"cols":_df.shape[1], "partition":part_index,"logs":_args['logs'],"num_gpu":1,"part_size":PART_SIZE} p = 
{"args":_args,"data":_df.to_dict(orient="records"),"info":info} part_index += 1 qwriter.write(p) @@ -134,7 +134,7 @@ class Components : partition = args['partition'] if 'partition' in args else '' log_folder = os.sep.join([log_folder,args['context'],str(partition)]) - _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' @@ -147,15 +147,18 @@ class Components : reader = args['reader'] df = reader() # bounds = Components.split(df,MAX_ROWS,PART_SIZE) - if partition != '' and os.path.exists(log_folder): + if partition != '' : columns = args['columns'] df = np.array_split(df[columns].values,PART_SIZE) df = pd.DataFrame(df[ int (partition) ],columns = columns) + info = {"parition":int(partition),"rows":df.shape[0],"cols":df.shape[0],"part_size":PART_SIZE} + logger.write({"module":"generate","action":"partition","input":info}) _args['data'] = df # _args['data'] = reader() #_args['data'] = _args['data'].astype(object) - _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + _args['num_gpu'] = 1 + _args['gpu'] = partition _dc = data.maker.generate(**_args) # # We need to post the generate the data in order to : @@ -205,7 +208,9 @@ class Components : logger.write({"module":"generate","action":"write","info":info} ) @staticmethod def callback(channel,method,header,stream): - + if stream.decode('utf8') in ['QUIT','EXIT','END'] : + channel.close() + channel.connection.close() info = json.loads(stream) logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':SYS_ARGS['context']}) @@ -214,10 +219,10 @@ class Components : args = info['args'] if args['num_gpu'] > 1 : args['gpu'] = int(info['info']['partition']) if info['info']['partition'] == 0 else info['info']['partition'] + 2 - args['num_gpu'] = 2 + else: args['gpu'] = 0 - args['num_gpu'] = 1 + args['num_gpu'] = 1 # if int(args['num_gpu']) > 1 and args['gpu'] > 0: # args['gpu'] = args['gpu'] + args['num_gpu'] if args['gpu'] + args['num_gpu'] < 8 else args['gpu'] #-- 8 max gpus args['reader'] = lambda: df @@ -242,8 +247,7 @@ if __name__ == '__main__' : args = (PIPELINE[index]) args = dict(args,**SYS_ARGS) - args['max_rows'] = int(args['max_rows']) if 'max_rows' in args else 3 - args['part_size'] = int(args['part_size']) if 'part_size' in args else 4 + args['logs'] = args['logs'] if 'logs' in args else 'logs' if 'dataset' not in args : args['dataset'] = 'combined20191004v2_deid' From 8c5193cb6d4293682f838597bcca7e8287e37d6d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 11:40:47 -0600 Subject: [PATCH 047/250] bug fix ... 
(hopfully makes a difference) --- pipeline.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index c5a16d8..df92427 100644 --- a/pipeline.py +++ b/pipeline.py @@ -2,6 +2,7 @@ import json from transport import factory import numpy as np +import time import os from multiprocessing import Process import pandas as pd @@ -76,7 +77,12 @@ class Components : columns = args['columns'] df = np.array_split(df[columns].values,PART_SIZE) qwriter = factory.instance(type='queue.QueueWriter',args={'queue':'aou.io'}) - part_index = 0 + part_index = 0 + # + # let's start n processes to listen & train this mother ... + # + #-- hopefully they learn as daemons + for _df in df: # _args['logs'] = os.sep.join([log_folder,str(part_index)]) @@ -206,6 +212,7 @@ class Components : if partition : info ['partition'] = int(partition) logger.write({"module":"generate","action":"write","info":info} ) + @staticmethod def callback(channel,method,header,stream): if stream.decode('utf8') in ['QUIT','EXIT','END'] : @@ -306,6 +313,7 @@ if __name__ == '__main__' : while len(jobs) > 0 : jobs = [job for job in jobs if job.is_alive()] + time.sleep(2) # pointer(qhandler) From 57e32261c668260b64a088a165e35cdfbaad8294 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 12:11:22 -0600 Subject: [PATCH 048/250] creating processes for the generators --- pipeline.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index df92427..c042588 100644 --- a/pipeline.py +++ b/pipeline.py @@ -280,11 +280,21 @@ if __name__ == '__main__' : if ''.join(content).isnumeric() : # # we have partitions we are working with - + make = lambda args: (Components()).generate(args) + jobs = [] + print (["Started ",len(jobs),"generators"]) for id in ''.join(content) : args['partition'] = id + job = Process(target=make,args=(args,args)) + + job.start() + jobs.append(job) + + while (len(jobs)> 0) : + jobs = [jobs for job in jobs if job.is_alive()] + time.sleep(2) - generator.generate(args) + # generator.generate(args) else: generator.generate(args) # Components.generate(args) From 872744c682d26c751d2dfb377e05dc2afb64f95b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 13:00:32 -0600 Subject: [PATCH 049/250] bug fix with queue connection dropping out --- pipeline.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/pipeline.py b/pipeline.py index c042588..65eda3e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -99,7 +99,7 @@ class Components : # print (columns) info = {"rows":_df.shape[0],"cols":_df.shape[1], "partition":part_index,"logs":_args['logs'],"num_gpu":1,"part_size":PART_SIZE} - p = {"args":_args,"data":_df.to_dict(orient="records"),"info":info} + p = {"args":_args,"data":_df.to_dict(orient="records"),"input":info} part_index += 1 qwriter.write(p) # @@ -124,7 +124,8 @@ class Components : # @log : # Logging information about the training process for this partition (or not) # - info = {"rows":df.shape[0],"cols":df.shape[1], "partition":partition,"logs":_args['logs']} + info = {"rows":df.shape[0],"cols":df.shape[1], "partition":int(partition),"logs":_args['logs']} + logger.write({"module":"train","action":"train","input":info}) data.maker.train(**_args) @@ -211,7 +212,7 @@ class Components : info = {"full":{"path":_fname,"rows":_args['data'].shape[0]},"compare":{"name":_pname,"rows":data_comp.shape[0]} } if partition : info ['partition'] = int(partition) - 
logger.write({"module":"generate","action":"write","info":info} ) + logger.write({"module":"generate","action":"write","input":info} ) @staticmethod def callback(channel,method,header,stream): @@ -221,11 +222,11 @@ class Components : info = json.loads(stream) logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':SYS_ARGS['context']}) - logger.write({'module':'process','action':'read-partition','input':info['info']}) + logger.write({'module':'process','action':'read-partition','input':info['input']}) df = pd.DataFrame(info['data']) args = info['args'] if args['num_gpu'] > 1 : - args['gpu'] = int(info['info']['partition']) if info['info']['partition'] == 0 else info['info']['partition'] + 2 + args['gpu'] = int(info['info']['partition']) if info['input']['partition'] == 0 else info['input']['partition'] + 2 else: args['gpu'] = 0 @@ -237,11 +238,12 @@ class Components : # @TODO: Fix # There is an inconsistency in column/columns ... fix this shit! # - args['columns'] = args['column'] - (Components()).train(**args) - logger.write({"module":"process","action":"exit","info":info["info"]}) channel.close() channel.connection.close() + args['columns'] = args['column'] + (Components()).train(**args) + logger.write({"module":"process","action":"exit","input":info["input"]}) + pass if __name__ == '__main__' : @@ -280,18 +282,19 @@ if __name__ == '__main__' : if ''.join(content).isnumeric() : # # we have partitions we are working with - make = lambda args: (Components()).generate(args) + make = lambda _args: (Components()).generate(_args) jobs = [] - print (["Started ",len(jobs),"generators"]) + for id in ''.join(content) : args['partition'] = id - job = Process(target=make,args=(args,args)) - + job = Process(target=make,args=(args,)) + job.name = 'generator # '+str(id) job.start() jobs.append(job) - - while (len(jobs)> 0) : - jobs = [jobs for job in jobs if job.is_alive()] + + print (["Started ",len(jobs),"generator"+"s" if len(jobs)>1 else "" ]) + while len(jobs)> 0 : + jobs = [job for job in jobs if job.is_alive()] time.sleep(2) # generator.generate(args) From 8e5b475a01f1a246ff01f7cf414e3d403d4e9a19 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 13:27:03 -0600 Subject: [PATCH 050/250] inefficient data load (urgh) --- pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 65eda3e..15d562a 100644 --- a/pipeline.py +++ b/pipeline.py @@ -269,7 +269,8 @@ if __name__ == '__main__' : if 'file' in args : reader = lambda: pd.read_csv(args['file']) ; else: - reader = lambda: Components().get(args) + _df = Components().get(args) + reader = lambda: _df args['reader'] = reader if 'generate' in SYS_ARGS : From 7dfd4032863932283ddbe29797ff72e9197195dc Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 14:20:06 -0600 Subject: [PATCH 051/250] bug fix, gpu configuration memeory error --- pipeline.py | 51 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/pipeline.py b/pipeline.py index 15d562a..59745a9 100644 --- a/pipeline.py +++ b/pipeline.py @@ -143,29 +143,36 @@ class Components : _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) - _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 - os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' 
in args else '0' + # _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + + if args['num_gpu'] > 1 : + _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)[0] + else: + _args['gpu'] = 0 + _args['num_gpu'] = 1 + _args['no_value']= args['no_value'] # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 # credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') # _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() - reader = args['reader'] - df = reader() + # reader = args['reader'] + # df = reader() + df = args['reader']() if 'reader' in args else args['data'] # bounds = Components.split(df,MAX_ROWS,PART_SIZE) - if partition != '' : - columns = args['columns'] - df = np.array_split(df[columns].values,PART_SIZE) - df = pd.DataFrame(df[ int (partition) ],columns = columns) - info = {"parition":int(partition),"rows":df.shape[0],"cols":df.shape[0],"part_size":PART_SIZE} - logger.write({"module":"generate","action":"partition","input":info}) - + # if partition != '' : + # columns = args['columns'] + # df = np.array_split(df[columns].values,PART_SIZE) + # df = pd.DataFrame(df[ int (partition) ],columns = columns) + info = {"parition":int(partition),"rows":df.shape[0],"cols":df.shape[0],"part_size":PART_SIZE} + logger.write({"module":"generate","action":"partition","input":info}) + _args['data'] = df # _args['data'] = reader() #_args['data'] = _args['data'].astype(object) - _args['num_gpu'] = 1 - _args['gpu'] = partition + # _args['num_gpu'] = 1 + _dc = data.maker.generate(**_args) # # We need to post the generate the data in order to : @@ -226,7 +233,7 @@ class Components : df = pd.DataFrame(info['data']) args = info['args'] if args['num_gpu'] > 1 : - args['gpu'] = int(info['info']['partition']) if info['input']['partition'] == 0 else info['input']['partition'] + 2 + args['gpu'] = int(info['input']['partition']) if info['input']['partition'] < 8 else np.random.choice(np.arange(8),1).astype(int)[0] else: args['gpu'] = 0 @@ -269,8 +276,8 @@ if __name__ == '__main__' : if 'file' in args : reader = lambda: pd.read_csv(args['file']) ; else: - _df = Components().get(args) - reader = lambda: _df + DATA = Components().get(args) + reader = lambda: DATA args['reader'] = reader if 'generate' in SYS_ARGS : @@ -279,15 +286,23 @@ if __name__ == '__main__' : content = os.listdir( os.sep.join([args['logs'],args['context']])) generator = Components() - + DATA = reader() if ''.join(content).isnumeric() : # # we have partitions we are working with make = lambda _args: (Components()).generate(_args) jobs = [] - + del args['reader'] + columns = DATA.columns.tolist() + DATA = np.array_split(DATA[args['columns']],len(content)) for id in ''.join(content) : args['partition'] = id + args['data'] = pd.DataFrame(DATA[(int(id))],columns=args['columns']) + if args['num_gpu'] > 0 : + args['gpu'] = id + else: + args['gpu']=0 + args['num_gpu']=1 job = Process(target=make,args=(args,)) job.name = 'generator # '+str(id) job.start() From 411aa170ebc00b00032ba6055494a8c22c0013e2 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 14:33:41 -0600 Subject: [PATCH 052/250] gpu fix --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 59745a9..e6283fd 100644 --- a/pipeline.py +++ b/pipeline.py @@ -165,7 +165,7 @@ class Components : # 
columns = args['columns'] # df = np.array_split(df[columns].values,PART_SIZE) # df = pd.DataFrame(df[ int (partition) ],columns = columns) - info = {"parition":int(partition),"rows":df.shape[0],"cols":df.shape[0],"part_size":PART_SIZE} + info = {"parition":int(partition),"gpu":_args["gpu"],"rows":df.shape[0],"cols":df.shape[1],"part_size":PART_SIZE} logger.write({"module":"generate","action":"partition","input":info}) _args['data'] = df From ff6ae5a622b05b5f45915168ef8135f0dc9ed713 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 14:42:40 -0600 Subject: [PATCH 053/250] ... --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index e6283fd..dfa0331 100644 --- a/pipeline.py +++ b/pipeline.py @@ -150,7 +150,7 @@ class Components : else: _args['gpu'] = 0 _args['num_gpu'] = 1 - + os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) _args['no_value']= args['no_value'] # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 From 6f51eedca80dde48264647a8fefb03d742bd2506 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 14:45:22 -0600 Subject: [PATCH 054/250] xx --- pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index dfa0331..2ce90a9 100644 --- a/pipeline.py +++ b/pipeline.py @@ -298,11 +298,11 @@ if __name__ == '__main__' : for id in ''.join(content) : args['partition'] = id args['data'] = pd.DataFrame(DATA[(int(id))],columns=args['columns']) - if args['num_gpu'] > 0 : + if args['num_gpu'] > 1 : args['gpu'] = id else: args['gpu']=0 - args['num_gpu']=1 + job = Process(target=make,args=(args,)) job.name = 'generator # '+str(id) job.start() From 49177957b8e5f7f96621ec2e261f9a59bb2b815a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 14:56:28 -0600 Subject: [PATCH 055/250] ... --- pipeline.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 2ce90a9..9d9c097 100644 --- a/pipeline.py +++ b/pipeline.py @@ -78,6 +78,7 @@ class Components : df = np.array_split(df[columns].values,PART_SIZE) qwriter = factory.instance(type='queue.QueueWriter',args={'queue':'aou.io'}) part_index = 0 + # # let's start n processes to listen & train this mother ... 
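The GPU fixes in the preceding patches converge on one rule: derive the visible device from the partition index before TensorFlow initializes, and fall back to a random slot when the index exceeds the available devices. A condensed sketch of that rule (the 8-GPU ceiling is the same hard-coded value used above):

import os
import numpy as np

def assign_gpu(partition, max_gpus=8):
    # use the partition index as the GPU slot when possible, otherwise pick one at random,
    # then restrict the process to that device
    gpu = int(partition) if int(partition) < max_gpus else np.random.choice(np.arange(max_gpus), 1).astype(int)[0]
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
    return gpu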
# @@ -145,7 +146,7 @@ class Components : _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) # _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 - if args['num_gpu'] > 1 : + if int(args['num_gpu']) > 1 : _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)[0] else: _args['gpu'] = 0 @@ -295,10 +296,17 @@ if __name__ == '__main__' : del args['reader'] columns = DATA.columns.tolist() DATA = np.array_split(DATA[args['columns']],len(content)) + for id in ''.join(content) : + if 'focus' in args and int(args['focus']) != int(id) : + # + # This handles failures/recoveries for whatever reason + # If we are only interested in generating data for a given partition + continue + args['partition'] = id args['data'] = pd.DataFrame(DATA[(int(id))],columns=args['columns']) - if args['num_gpu'] > 1 : + if int(args['num_gpu']) > 1 : args['gpu'] = id else: args['gpu']=0 From e02a4a60abd8a936d77b5720beb0e27a34718307 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 15:26:18 -0600 Subject: [PATCH 056/250] acceptance criteria fix --- data/gan.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/data/gan.py b/data/gan.py index a591f34..80c3f8e 100644 --- a/data/gan.py +++ b/data/gan.py @@ -584,7 +584,7 @@ class Predict(GNet): p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values - if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size: + if x.max() == 1 and np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size and x.size == self.values.size: ratio.append(np.divide( np.sum(x), x.size)) found.append(df) if i == CANDIDATE_COUNT: @@ -606,7 +606,9 @@ class Predict(GNet): # r = np.zeros((self.ROW_COUNT,len(columns))) # r = np.zeros(self.ROW_COUNT) - + if self.logger : + info = {"found":len(found),"selected":INDEX, "ratio": ratio[INDEX],"rows":df.shape[0],"cols":df.shape[1]} + self.logger.write({"module":"gan-generate","action":"generate","input":info}) df.columns = self.values if len(found): # print (len(found),NTH_VALID_CANDIDATE) From 78718b6c42793d8df227f4d08f800f5dc4b89cfb Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 15:39:13 -0600 Subject: [PATCH 057/250] ... --- data/gan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index 80c3f8e..7952f23 100644 --- a/data/gan.py +++ b/data/gan.py @@ -584,7 +584,7 @@ class Predict(GNet): p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values - if x.max() == 1 and np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size and x.size == self.values.size: + if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size and x.size == self.values.size: ratio.append(np.divide( np.sum(x), x.size)) found.append(df) if i == CANDIDATE_COUNT: From 97669f3b6b5100be7397de81967e6785c5d915da Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 15:42:13 -0600 Subject: [PATCH 058/250] setup ... 
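The gpu fixes in patches 052 through 055 above keep adjusting one mapping: when several GPUs are assumed, a worker uses its partition index as its GPU id, falls back to a random device once the index exceeds the eight-device pool the code assumes, and otherwise pins everything to device 0, exporting CUDA_VISIBLE_DEVICES either way. A minimal standalone sketch of that selection logic, with an illustrative helper name rather than anything from the patches, is:

    import os
    import numpy as np

    MAX_DEVICES = 8  # the patches assume at most 8 GPUs per host

    def pick_gpu(partition, num_gpu):
        """Map a partition index to a GPU id (illustrative helper, not project code)."""
        if int(num_gpu) > 1:
            # use the partition index directly when it fits the device pool,
            # otherwise fall back to a random device
            gpu = int(partition) if int(partition) < MAX_DEVICES else int(np.random.choice(np.arange(MAX_DEVICES)))
        else:
            gpu = 0
        # pin the process to a single visible device so TensorFlow does not grab them all
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
        return gpu

    if __name__ == '__main__':
        print(pick_gpu(partition=3, num_gpu=4))   # -> 3
        print(pick_gpu(partition=11, num_gpu=4))  # -> some device in 0..7
        print(pick_gpu(partition=5, num_gpu=1))   # -> 0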
--- data/gan.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/data/gan.py b/data/gan.py index 7952f23..bf27b3b 100644 --- a/data/gan.py +++ b/data/gan.py @@ -607,7 +607,7 @@ class Predict(GNet): # r = np.zeros((self.ROW_COUNT,len(columns))) # r = np.zeros(self.ROW_COUNT) if self.logger : - info = {"found":len(found),"selected":INDEX, "ratio": ratio[INDEX],"rows":df.shape[0],"cols":df.shape[1]} + info = {"found":len(found),"selected":INDEX, "ratio": ratio[INDEX],"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)} self.logger.write({"module":"gan-generate","action":"generate","input":info}) df.columns = self.values if len(found): diff --git a/setup.py b/setup.py index bcacb62..bf63cb0 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.1.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From d72fb6b4e34a1a88323fd37fb13211b3aea2bda1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Mar 2020 16:22:37 -0600 Subject: [PATCH 059/250] bug fix ... --- data/gan.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/data/gan.py b/data/gan.py index bf27b3b..1df26a3 100644 --- a/data/gan.py +++ b/data/gan.py @@ -578,18 +578,20 @@ class Predict(GNet): # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes # The code below will insure we have some acceptable cardinal relationships between id and synthetic values # - df = pd.DataFrame(np.round(f).astype(np.int32)) + + df = pd.DataFrame(np.round(f)).astype(np.int32) p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values - if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size and x.size == self.values.size: + if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size : ratio.append(np.divide( np.sum(x), x.size)) found.append(df) if i == CANDIDATE_COUNT: break else: + continue # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms @@ -597,9 +599,13 @@ class Predict(GNet): # # In case we are dealing with actual values like diagnosis codes we can perform # - - INDEX = np.random.choice(np.arange(len(found)),1)[0] - INDEX = ratio.index(np.max(ratio)) + _index = [found.index(item) for item in found if item.shape[1] == len(self.values)] + if not _index : + INDEX = np.random.choice(np.arange(len(found)),1)[0] + INDEX = ratio.index(np.max(ratio)) + else: + INDEX = _index[0] + df = found[INDEX] columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] From 718e57840159558cb0bc2c6773a41919369652d0 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 7 Mar 2020 09:16:17 -0600 Subject: [PATCH 060/250] bug fix, trainer --- data/gan.py | 4 +-- data/maker/__init__.py | 8 ++++-- pipeline.py | 60 +++++++++++++++++++++++++++++++++--------- 3 files changed, 55 insertions(+), 
17 deletions(-) diff --git a/data/gan.py b/data/gan.py index 1df26a3..898d4ea 100644 --- a/data/gan.py +++ b/data/gan.py @@ -581,7 +581,6 @@ class Predict(GNet): df = pd.DataFrame(np.round(f)).astype(np.int32) - p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values @@ -599,7 +598,8 @@ class Predict(GNet): # # In case we are dealing with actual values like diagnosis codes we can perform # - _index = [found.index(item) for item in found if item.shape[1] == len(self.values)] + N = len(found) + _index = [i for i in range(0,N) if found[i].shape[1] == len(self.values)] if not _index : INDEX = np.random.choice(np.arange(len(found)),1)[0] INDEX = ratio.index(np.max(ratio)) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 080939c..f4bce16 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -37,11 +37,14 @@ class ContinuousToDiscrete : index = BOUNDS.index(row) x_[index] = 1 break - + # + # for items in BOUNDS : + # index = BOUNDS.index(items) return _matrix @staticmethod def bounds(x,n): + # return np.array_split(x,n) return list(pd.cut(np.array( np.round(x,ContinuousToDiscrete.ROUND_UP) ),n).categories) @@ -175,7 +178,8 @@ def generate(**args): handler.load_meta(col) r = handler.apply() BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - _df[col] = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if 'float' in df[col].dtypes.name or col in CONTINUOUS else r[col] + _df[col] = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col] + # _df[col] = r[col] # # @TODO: log basic stats about the synthetic attribute # diff --git a/pipeline.py b/pipeline.py index 9d9c097..6234c26 100644 --- a/pipeline.py +++ b/pipeline.py @@ -50,11 +50,12 @@ class Components : """ # # @TODO: we need to log something here about the parameters being passed - pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) - df = pointer() - if df.shape[0] == 0 : - print ("CAN NOT TRAIN EMPTY DATASET ") - return + # pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) + df = args['reader']() + + # if df.shape[0] == 0 : + # print ("CAN NOT TRAIN EMPTY DATASET ") + # return # # Now we can parse the arguments and submit the entire thing to training # @@ -113,18 +114,29 @@ class Components : pass else: + print ('.....') partition = args['partition'] if 'partition' in args else '' - log_folder = os.sep.join([log_folder,args['context'],partition]) + log_folder = os.sep.join([log_folder,args['context'],str(partition)]) _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) - _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 - os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' + + # + # We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel + # + if int(args['num_gpu']) > 1 : + _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)[0] + else: + _args['gpu'] = 0 + _args['num_gpu'] = 1 + os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) + _args['data'] = df # # @log : # Logging information about the training process for this partition (or not) # + info = {"rows":df.shape[0],"cols":df.shape[1], "partition":int(partition),"logs":_args['logs']} 
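Patches 059 and 060 above also rework how Predict.apply() screens its generated candidates: among the candidate one-hot frames that were kept, prefer one whose column count matches the expected value list, and only fall back to the best row-coverage ratio when no candidate is full width. A small self-contained sketch of that selection rule, with toy candidates and an illustrative function name, is:

    import numpy as np
    import pandas as pd

    def pick_candidate(found, ratio, values):
        """Pick one candidate one-hot frame (illustrative version of the selection rule)."""
        # candidates that expose a column for every expected value
        full = [i for i in range(len(found)) if found[i].shape[1] == len(values)]
        if full:
            return found[full[0]]
        # otherwise fall back to the candidate with the best row-coverage ratio
        return found[ratio.index(max(ratio))]

    if __name__ == '__main__':
        values = ['a', 'b', 'c']
        c1 = pd.DataFrame(np.eye(3, 2, dtype=np.int32))  # only 2 of the 3 values represented
        c2 = pd.DataFrame(np.eye(3, 3, dtype=np.int32))  # all 3 values represented
        found, ratio = [c1, c2], [0.95, 0.90]
        print(pick_candidate(found, ratio, values).shape)  # (3, 3)

The full-width candidate wins even when a narrower one has a slightly better coverage ratio, which is the behavior the two patches converge on.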
logger.write({"module":"train","action":"train","input":info}) @@ -291,7 +303,7 @@ if __name__ == '__main__' : if ''.join(content).isnumeric() : # # we have partitions we are working with - make = lambda _args: (Components()).generate(_args) + jobs = [] del args['reader'] columns = DATA.columns.tolist() @@ -310,13 +322,13 @@ if __name__ == '__main__' : args['gpu'] = id else: args['gpu']=0 - + make = lambda _args: (Components()).generate(_args) job = Process(target=make,args=(args,)) job.name = 'generator # '+str(id) job.start() jobs.append(job) - print (["Started ",len(jobs),"generator"+"s" if len(jobs)>1 else "" ]) + print (["Started ",len(jobs),"generators" if len(jobs)>1 else "generator" ]) while len(jobs)> 0 : jobs = [job for job in jobs if job.is_alive()] time.sleep(2) @@ -358,9 +370,31 @@ if __name__ == '__main__' : # qreader.read(1) pass else: + PART_SIZE = int(args['jobs']) if 'jobs' in args else 8 + DATA = reader() + DATA = np.array_split(DATA[args['columns']],PART_SIZE) + jobs = [] + for index in range(0,int(args['jobs'])) : + if 'focus' in args and int(args['focus']) != index : + continue + args['partition'] = index + _df = pd.DataFrame(DATA[index],columns=args['columns']) + args['reader'] = lambda: _df + make = lambda _args: (Components()).train(**_args) + job = Process(target=make,args=(args,)) + job.name = 'Trainer # ' + str(index) + job.start() + jobs.append(job) + # args['gpu'] + print (["Started ",len(jobs),"trainers" if len(jobs)>1 else "trainer" ]) + while len(jobs)> 0 : + jobs = [job for job in jobs if job.is_alive()] + time.sleep(2) + + # trainer = Components() + # trainer.train(**args) + - trainer = Components() - trainer.train(**args) # Components.train(**args) #for args in PIPELINE : #args['dataset'] = 'combined20190510' From 330d6b6ae681dcc50f647d17a777354980fa3f58 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 8 Mar 2020 08:48:38 -0500 Subject: [PATCH 061/250] bug fix with partition & data -access --- data/gan.py | 43 +++++++--- data/maker/__init__.py | 27 ++++-- pipeline.py | 184 ++++++++++++++++++----------------------- 3 files changed, 131 insertions(+), 123 deletions(-) diff --git a/data/gan.py b/data/gan.py index 898d4ea..a6d35e1 100644 --- a/data/gan.py +++ b/data/gan.py @@ -532,10 +532,13 @@ class Predict(GNet): self.generator = Generator(**args) self.values = args['values'] self.ROW_COUNT = args['row_count'] + self.oROW_COUNT = self.ROW_COUNT + self.MISSING_VALUES = args['no_value'] def load_meta(self, column): super().load_meta(column) self.generator.load_meta(column) + self.ROW_COUNT = self.oROW_COUNT def apply(self,**args): # print (self.train_dir) # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] @@ -544,6 +547,7 @@ class Predict(GNet): demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] tf.compat.v1.reset_default_graph() z = tf.random.normal(shape=[self.ROW_COUNT, self.Z_DIM]) + y = tf.compat.v1.placeholder(shape=[self.ROW_COUNT, self.NUM_LABELS], dtype=tf.int32) if self._LABEL is not None : ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] @@ -569,6 +573,8 @@ class Predict(GNet): found = [] ratio = [] + __x__ = None + __ratio=0 for i in np.arange(CANDIDATE_COUNT) : if labels : f = sess.run(fake,feed_dict={y:labels}) @@ -590,7 +596,8 @@ class Predict(GNet): if i == CANDIDATE_COUNT: break else: - + __x__ = df if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __x__ + 
__ratio = np.divide( np.sum(x), x.size) if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __ratio continue # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms @@ -600,23 +607,33 @@ class Predict(GNet): # N = len(found) _index = [i for i in range(0,N) if found[i].shape[1] == len(self.values)] - if not _index : - INDEX = np.random.choice(np.arange(len(found)),1)[0] - INDEX = ratio.index(np.max(ratio)) - else: - INDEX = _index[0] + if not _index and not found : + df = __x__ + INDEX = -1 + else : + if not _index : + INDEX = np.random.choice(np.arange(len(found)),1)[0] + INDEX = ratio.index(np.max(ratio)) + else: + INDEX = _index[0] - df = found[INDEX] + df = found[INDEX] columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] # r = np.zeros((self.ROW_COUNT,len(columns))) # r = np.zeros(self.ROW_COUNT) if self.logger : - info = {"found":len(found),"selected":INDEX, "ratio": ratio[INDEX],"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)} + info = {"found":len(found),"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)} + if INDEX > 0 : + info =dict(info ,**{"selected":INDEX, "ratio": ratio[INDEX] }) + else : + + info['selected'] = -1 + info['ratio'] = __ratio self.logger.write({"module":"gan-generate","action":"generate","input":info}) df.columns = self.values - if len(found): + if len(found) or df.columns.size == len(self.values): # print (len(found),NTH_VALID_CANDIDATE) # x = df * self.values # @@ -639,10 +656,14 @@ class Predict(GNet): df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) df.columns = columns df = df[columns[0]].append(pd.Series(missing)) - + if self.logger : + + info= {"missing": i.size,"rows":df.shape[0],"cols":1} + self.logger.write({"module":"gan-generate","action":"compile.io","input":info}) + - + # print(df.head()) tf.compat.v1.reset_default_graph() df = pd.DataFrame(df) df.columns = columns diff --git a/data/maker/__init__.py b/data/maker/__init__.py index f4bce16..4be97b8 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -107,23 +107,33 @@ def train (**args) : # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values # if 'float' not in df[col].dtypes.name : # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values - if 'float' in df[col].dtypes.name and col in CONTINUOUS: + if col in CONTINUOUS: BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32) else: - args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values + df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False) + # print (df[col].dtypes) + # print (df[col].dropna/(axis=1).unique()) + args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values + + - args['column'] = col - args['context'] = col context = args['context'] if 'store' in args : args['store']['args']['doc'] = context logger = factory.instance(**args['store']) args['logger'] = logger + info = {"rows":args['real'].shape[0],"cols":args['real'].shape[1],"name":col} + logger.write({"module":"gan-train","action":"data-prep","input":info}) else: logger = None + args['column'] = col + args['context'] = col + + # + # If the s trainer = gan.Train(**args) trainer.apply() def post(**args): @@ -149,6 +159,7 @@ def generate(**args): """ # df = args['data'] df = 
args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) + CONTINUOUS = args['continous'] if 'continuous' in args else [] column = args['column'] if (isinstance(args['column'],list)) else [args['column']] # column_id = args['id'] @@ -168,7 +179,8 @@ def generate(**args): # values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE) # # values = np.unique(values).tolist() # else: - values = df[col].unique().tolist() + values = df[col].dropna().unique().tolist() + args['values'] = values args['row_count'] = df.shape[0] @@ -178,8 +190,9 @@ def generate(**args): handler.load_meta(col) r = handler.apply() BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - _df[col] = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col] - # _df[col] = r[col] + + # _df[col] = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col] + _df[col] = r[col] # # @TODO: log basic stats about the synthetic attribute # diff --git a/pipeline.py b/pipeline.py index 6234c26..0f2c258 100644 --- a/pipeline.py +++ b/pipeline.py @@ -30,11 +30,13 @@ class Components : condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')']) SQL = " ".join([SQL,'WHERE',condition]) - SQL = SQL.replace(':dataset',args['dataset']) #+ " LIMIT 1000 " + SQL = SQL.replace(':dataset',args['dataset']) #+ " LI " + if 'limit' in args : - SQL = SQL + 'LIMIT ' + args['limit'] + SQL = SQL + ' LIMIT ' + args['limit'] + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - df = pd.read_gbq(SQL,credentials=credentials,dialect='standard') + df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').astype(object) return df # return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna() @@ -51,7 +53,8 @@ class Components : # # @TODO: we need to log something here about the parameters being passed # pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) - df = args['reader']() + df = args['data'] + # if df.shape[0] == 0 : # print ("CAN NOT TRAIN EMPTY DATASET ") @@ -62,85 +65,43 @@ class Components : logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) log_folder = args['logs'] if 'logs' in args else 'logs' - _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + # _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} - _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) - _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 - _args['gpu'] = args['gpu'] if 'gpu' in args else 0 + # _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) + # _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + # _args['gpu'] = args['gpu'] if 'gpu' in args else 0 - # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 - PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 - - if 'partition' not in args: - lbound = 0 - # bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) - # bounds = Components.split(df,MAX_ROWS,PART_SIZE) - columns = args['columns'] - df = np.array_split(df[columns].values,PART_SIZE) - qwriter = 
factory.instance(type='queue.QueueWriter',args={'queue':'aou.io'}) - part_index = 0 - - # - # let's start n processes to listen & train this mother ... - # - #-- hopefully they learn as daemons + # # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 + PART_SIZE = int(args['part_size']) - for _df in df: - - # _args['logs'] = os.sep.join([log_folder,str(part_index)]) - _args['partition'] = str(part_index) - _args['logger'] = {'args':{'dbname':'aou','doc':args['context']},'type':'mongo.MongoWriter'} - - # - # We should post the the partitions to a queue server (at least the instructions on ): - # - where to get the data - # - and athe arguments to use (partition #,columns,gpu,epochs) - # + partition = args['partition'] + log_folder = os.sep.join([log_folder,args['context'],str(partition)]) + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) - _df = pd.DataFrame(_df,columns=columns) - # print (columns) - - info = {"rows":_df.shape[0],"cols":_df.shape[1], "partition":part_index,"logs":_args['logs'],"num_gpu":1,"part_size":PART_SIZE} - p = {"args":_args,"data":_df.to_dict(orient="records"),"input":info} - part_index += 1 - qwriter.write(p) - # - # @TODO: - # - Notify that information was just posted to the queue - # In case we want slow-mode, we can store the partitions in mongodb and process (Yes|No)? - # - - logger.write({"module":"train","action":"setup-partition","input":info}) - - pass + # + # We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel + # + if int(args['num_gpu']) > 1 : + _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)[0] else: - print ('.....') - partition = args['partition'] if 'partition' in args else '' - log_folder = os.sep.join([log_folder,args['context'],str(partition)]) - _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} - _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) - - # - # We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel - # - if int(args['num_gpu']) > 1 : - _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)[0] - else: - _args['gpu'] = 0 - _args['num_gpu'] = 1 - os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) - - - _args['data'] = df - # - # @log : - # Logging information about the training process for this partition (or not) - # - - info = {"rows":df.shape[0],"cols":df.shape[1], "partition":int(partition),"logs":_args['logs']} - - logger.write({"module":"train","action":"train","input":info}) - data.maker.train(**_args) + _args['gpu'] = 0 + _args['num_gpu'] = 1 + os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) + + _args['store'] = {'type':'mongo.MongoWriter','args':{'dbname':'aou','doc':args['context']}} + _args['data'] = args['data'] + + # print (['partition ',partition,df.value_source_concept_id.unique()]) + # + # @log : + # Logging information about the training process for this partition (or not) + # + + info = {"rows":df.shape[0],"cols":df.shape[1], "partition":int(partition),"logs":_args['logs']} + + logger.write({"module":"train","action":"train","input":info}) + data.maker.train(**_args) pass @@ -210,6 +171,7 
@@ class Components : # #-- Let us store all of this into bigquery prefix = args['notify']+'.'+_args['context'] + partition = str(partition) table = '_'.join([prefix,partition,'io']).replace('__','_') folder = os.sep.join([args['logs'],args['context'],partition,'output']) if 'file' in args : @@ -219,17 +181,19 @@ class Components : data_comp.to_csv( _pname,index=False) _args['data'].to_csv(_fname,index=False) - + _id = 'path' else: credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') _pname = os.sep.join([folder,table+'.csv']) _fname = table.replace('_io','_full_io') - data_comp.to_gbq(if_exists='replace',destination_table=_pname,credentials='credentials',chunk_size=50000) + partial = '.'.join(['io',args['context']+'_partial_io']) + complete= '.'.join(['io',args['context']+'_full_io']) + data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=50000) data_comp.to_csv(_pname,index=False) INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' - _args['data'].to_gbq(if_exists=INSERT_FLAG,destination_table=_fname,credentials='credentials',chunk_size=50000) - - info = {"full":{"path":_fname,"rows":_args['data'].shape[0]},"compare":{"name":_pname,"rows":data_comp.shape[0]} } + _args['data'].to_gbq(if_exists=INSERT_FLAG,destination_table=complete,credentials=credentials,chunksize=50000) + _id = 'dataset' + info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } if partition : info ['partition'] = int(partition) logger.write({"module":"generate","action":"write","input":info} ) @@ -280,18 +244,18 @@ if __name__ == '__main__' : args['logs'] = args['logs'] if 'logs' in args else 'logs' if 'dataset' not in args : args['dataset'] = 'combined20191004v2_deid' - + PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 # # @TODO: # Log what was initiated so we have context of this processing ... 
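The generate() rewrite above produces two artifacts per partition: a side-by-side comparison frame (each real column joined to its synthetic counterpart under an _io suffix) that is appended to a *_partial_io table, and the original rows with the synthetic columns swapped in, which go to *_full_io. A toy sketch of that assembly step, with made-up frame contents and a hypothetical context name, is:

    import pandas as pd

    # toy stand-ins for the real partition and the synthesized columns
    real = pd.DataFrame({'person_id': [1, 2, 3], 'gender': ['M', 'F', 'F']})
    synthetic = pd.DataFrame({'gender': ['F', 'F', 'M']})
    columns = ['gender']
    context = 'demo'  # hypothetical context name

    # side-by-side comparison frame: real value next to its synthetic counterpart
    data_comp = real[columns].join(synthetic[columns], rsuffix='_io')

    # full dataset with the synthetic columns swapped in
    full = real.copy()
    for name in columns:
        full[name] = synthetic[name]

    # destination tables mirror the naming convention used above (illustrative only)
    partial_table = '.'.join(['io', context + '_partial_io'])
    complete_table = '.'.join(['io', context + '_full_io'])
    print(partial_table, complete_table)
    print(data_comp)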
# if 'listen' not in SYS_ARGS : if 'file' in args : - reader = lambda: pd.read_csv(args['file']) ; + DATA = pd.read_csv(args['file']) ; else: DATA = Components().get(args) - reader = lambda: DATA - args['reader'] = reader + COLUMNS = DATA.columns + DATA = np.array_split(DATA,PART_SIZE) if 'generate' in SYS_ARGS : # @@ -299,32 +263,34 @@ if __name__ == '__main__' : content = os.listdir( os.sep.join([args['logs'],args['context']])) generator = Components() - DATA = reader() + if ''.join(content).isnumeric() : # # we have partitions we are working with jobs = [] - del args['reader'] - columns = DATA.columns.tolist() - DATA = np.array_split(DATA[args['columns']],len(content)) + + # columns = DATA.columns.tolist() + + # DATA = np.array_split(DATA,PART_SIZE) - for id in ''.join(content) : - if 'focus' in args and int(args['focus']) != int(id) : + for index in range(0,PART_SIZE) : + if 'focus' in args and int(args['focus']) != index : # # This handles failures/recoveries for whatever reason # If we are only interested in generating data for a given partition continue - - args['partition'] = id - args['data'] = pd.DataFrame(DATA[(int(id))],columns=args['columns']) + # index = id.index(id) + + args['partition'] = index + args['data'] = DATA[index] if int(args['num_gpu']) > 1 : - args['gpu'] = id + args['gpu'] = index else: args['gpu']=0 make = lambda _args: (Components()).generate(_args) job = Process(target=make,args=(args,)) - job.name = 'generator # '+str(id) + job.name = 'generator # '+str(index) job.start() jobs.append(job) @@ -370,18 +336,26 @@ if __name__ == '__main__' : # qreader.read(1) pass else: - PART_SIZE = int(args['jobs']) if 'jobs' in args else 8 - DATA = reader() - DATA = np.array_split(DATA[args['columns']],PART_SIZE) + + # DATA = np.array_split(DATA,PART_SIZE) + jobs = [] - for index in range(0,int(args['jobs'])) : + for index in range(0,PART_SIZE) : if 'focus' in args and int(args['focus']) != index : continue + args['part_size'] = PART_SIZE args['partition'] = index - _df = pd.DataFrame(DATA[index],columns=args['columns']) - args['reader'] = lambda: _df + # _df = pd.DataFrame(DATA[index],columns=args['columns']) + args['data'] = DATA[index] + args['data'].to_csv('aou-'+str(index)+'csv',index=False) + # args['reader'] = lambda: _df + if int(args['num_gpu']) > 1 : + args['gpu'] = index + else: + args['gpu']=0 + make = lambda _args: (Components()).train(**_args) - job = Process(target=make,args=(args,)) + job = Process(target=make,args=( dict(args),)) job.name = 'Trainer # ' + str(index) job.start() jobs.append(job) From 266bdc8bd282ca5b1588434a18f8dcbc3067fb1b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 8 Mar 2020 15:00:26 -0500 Subject: [PATCH 062/250] bug fix with batch_size (GPU load) --- pipeline.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 0f2c258..418ccbf 100644 --- a/pipeline.py +++ b/pipeline.py @@ -78,7 +78,8 @@ class Components : log_folder = os.sep.join([log_folder,args['context'],str(partition)]) _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) - + if 'batch_size' in args : + _args['batch_size'] = int(args['batch_size']) # # We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel # @@ -118,6 +119,8 @@ class Components : _args = 
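The driver loop above fans the split partitions out to one process each, pins a GPU per index, and then polls until every child exits. A compressed, runnable sketch of that dispatch pattern follows; the worker body is a placeholder for the real generate/train call:

    import time
    import numpy as np
    import pandas as pd
    from multiprocessing import Process

    def worker(args):
        # placeholder for Components().generate(args) or Components().train(**args)
        print('partition', args['partition'], 'gpu', args['gpu'], 'rows', args['data'].shape[0])

    if __name__ == '__main__':
        PART_SIZE = 4
        DATA = pd.DataFrame({'x': range(100)})
        parts = np.array_split(DATA, PART_SIZE)

        jobs = []
        for index in range(PART_SIZE):
            args = {'partition': index, 'data': parts[index], 'gpu': index}
            job = Process(target=worker, args=(args,))
            job.name = 'worker # ' + str(index)
            job.start()
            jobs.append(job)

        # poll until every child process has finished
        while jobs:
            jobs = [job for job in jobs if job.is_alive()]
            time.sleep(2)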
{"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) # _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + if 'batch_size' in args : + _args['batch_size'] = int(args['batch_size']) if int(args['num_gpu']) > 1 : _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)[0] From e07c3553884fc9726cc464e9523f28a1a7f55794 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 8 Mar 2020 19:33:08 -0500 Subject: [PATCH 063/250] bug fix, with logs and partitioning --- data/gan.py | 11 +++++++---- data/maker/__init__.py | 4 ++-- pipeline.py | 8 +++++--- setup.py | 2 +- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/data/gan.py b/data/gan.py index a6d35e1..3c41f59 100644 --- a/data/gan.py +++ b/data/gan.py @@ -59,6 +59,7 @@ class GNet : self.logs = {} self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] + self.PARTITION = args['partition'] # if self.NUM_GPUS > 1 : # os.environ['CUDA_VISIBLE_DEVICES'] = "4" @@ -356,7 +357,7 @@ class Train (GNet): self.meta = self.log_meta() if(self.logger): - self.logger.write({"module":"gan-train","action":"start","input":self.meta} ) + self.logger.write({"module":"gan-train","action":"start","input":{"partition":self.PARTITION,"meta":self.meta} } ) # self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta) def load_meta(self, column): @@ -408,7 +409,7 @@ class Train (GNet): # losses = tf.compat.v1.get_collection(flag, scope) total_loss = tf.add_n(losses, name='total_loss') - + print (total_loss) return total_loss, w def input_fn(self): """ @@ -514,7 +515,7 @@ class Train (GNet): # # if self.logger : - row = {"module":"gan-train","action":"logs","input":logs} #,"model":pickle.dump(sess)} + row = {"module":"gan-train","action":"logs","input":{"partition":self.PARTITION,"logs":logs}} #,"model":pickle.dump(sess)} self.logger.write(row) # # @TODO: @@ -623,6 +624,7 @@ class Predict(GNet): # r = np.zeros((self.ROW_COUNT,len(columns))) # r = np.zeros(self.ROW_COUNT) + if self.logger : info = {"found":len(found),"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)} if INDEX > 0 : @@ -631,6 +633,7 @@ class Predict(GNet): info['selected'] = -1 info['ratio'] = __ratio + info['partition'] = self.PARTITION self.logger.write({"module":"gan-generate","action":"generate","input":info}) df.columns = self.values if len(found) or df.columns.size == len(self.values): @@ -658,7 +661,7 @@ class Predict(GNet): df = df[columns[0]].append(pd.Series(missing)) if self.logger : - info= {"missing": i.size,"rows":df.shape[0],"cols":1} + info= {"missing": i.size,"rows":df.shape[0],"cols":1,'partition':self.PARTITION} self.logger.write({"module":"gan-generate","action":"compile.io","input":info}) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 4be97b8..729654f 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -111,7 +111,7 @@ def train (**args) : BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32) else: - df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False) + # df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False) # print (df[col].dtypes) # print (df[col].dropna/(axis=1).unique()) args['real'] = 
pd.get_dummies(df[col].dropna()).astype(np.float32).values @@ -124,7 +124,7 @@ def train (**args) : args['store']['args']['doc'] = context logger = factory.instance(**args['store']) args['logger'] = logger - info = {"rows":args['real'].shape[0],"cols":args['real'].shape[1],"name":col} + info = {"rows":args['real'].shape[0],"cols":args['real'].shape[1],"name":col,"partition":args['partition']} logger.write({"module":"gan-train","action":"data-prep","input":info}) else: diff --git a/pipeline.py b/pipeline.py index 418ccbf..89ba16f 100644 --- a/pipeline.py +++ b/pipeline.py @@ -89,7 +89,8 @@ class Components : _args['gpu'] = 0 _args['num_gpu'] = 1 os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) - + _args['partition'] = int(partition) + _args['continuous']= args['continuous'] if 'continuous' in args else [] _args['store'] = {'type':'mongo.MongoWriter','args':{'dbname':'aou','doc':args['context']}} _args['data'] = args['data'] @@ -144,7 +145,8 @@ class Components : # df = pd.DataFrame(df[ int (partition) ],columns = columns) info = {"parition":int(partition),"gpu":_args["gpu"],"rows":df.shape[0],"cols":df.shape[1],"part_size":PART_SIZE} logger.write({"module":"generate","action":"partition","input":info}) - + _args['partition'] = int(partition) + _args['continuous']= args['continuous'] if 'continuous' in args else [] _args['data'] = df # _args['data'] = reader() #_args['data'] = _args['data'].astype(object) @@ -194,7 +196,7 @@ class Components : data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=50000) data_comp.to_csv(_pname,index=False) INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' - _args['data'].to_gbq(if_exists=INSERT_FLAG,destination_table=complete,credentials=credentials,chunksize=50000) + _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=50000) _id = 'dataset' info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } if partition : diff --git a/setup.py b/setup.py index bf63cb0..5a8f7b6 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 8455cd7554acfac8927bab1d8a21015209ed14a3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 8 Mar 2020 20:27:27 -0500 Subject: [PATCH 064/250] bug fix: typo --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 729654f..354b78f 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -160,7 +160,7 @@ def generate(**args): # df = args['data'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) - CONTINUOUS = args['continous'] if 'continuous' in args else [] + CONTINUOUS = args['continuous'] if 
'continuous' in args else [] column = args['column'] if (isinstance(args['column'],list)) else [args['column']] # column_id = args['id'] # From bbd03c4a63aeb109dd878d4e50cd9b8568bf8b45 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 9 Mar 2020 13:10:26 -0500 Subject: [PATCH 065/250] bug fix with GPU allocation --- pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index 89ba16f..7a2cf3a 100644 --- a/pipeline.py +++ b/pipeline.py @@ -84,7 +84,7 @@ class Components : # We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel # if int(args['num_gpu']) > 1 : - _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)[0] + _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int) else: _args['gpu'] = 0 _args['num_gpu'] = 1 @@ -124,7 +124,7 @@ class Components : _args['batch_size'] = int(args['batch_size']) if int(args['num_gpu']) > 1 : - _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)[0] + _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int) else: _args['gpu'] = 0 _args['num_gpu'] = 1 @@ -215,7 +215,7 @@ class Components : df = pd.DataFrame(info['data']) args = info['args'] if args['num_gpu'] > 1 : - args['gpu'] = int(info['input']['partition']) if info['input']['partition'] < 8 else np.random.choice(np.arange(8),1).astype(int)[0] + args['gpu'] = int(info['input']['partition']) if info['input']['partition'] < 8 else np.random.choice(np.arange(8)).astype(int) else: args['gpu'] = 0 From fc08a8f643d0f38d12a728a4d3045f4f7be8f9bd Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 12 Mar 2020 09:41:54 -0500 Subject: [PATCH 066/250] bug fix: continuous variable handling --- data/gan.py | 2 +- data/maker/__init__.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/data/gan.py b/data/gan.py index 3c41f59..4f34634 100644 --- a/data/gan.py +++ b/data/gan.py @@ -409,7 +409,7 @@ class Train (GNet): # losses = tf.compat.v1.get_collection(flag, scope) total_loss = tf.add_n(losses, name='total_loss') - print (total_loss) + # print (total_loss) return total_loss, w def input_fn(self): """ diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 354b78f..97cc3dd 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -22,9 +22,10 @@ class ContinuousToDiscrete : This function will convert a continous stream of information into a variety a bit stream of bins """ # BOUNDS = np.repeat(np.divide(X.max(),n),n).cumsum().tolist() - - BOUNDS = ContinuousToDiscrete.bounds(np.round(X,ContinuousToDiscrete.ROUND_UP),n) - + # print ( X.values.astype(np.float32)) + # print ("___________________________") + values = X.values.astype(np.float32) + BOUNDS = ContinuousToDiscrete.bounds(values,n) # _map = [{"index":BOUNDS.index(i),"ubound":i} for i in BOUNDS] _matrix = [] m = [] @@ -40,12 +41,13 @@ class ContinuousToDiscrete : # # for items in BOUNDS : # index = BOUNDS.index(items) - return _matrix + return np.array(_matrix) @staticmethod def bounds(x,n): # return np.array_split(x,n) - return list(pd.cut(np.array( np.round(x,ContinuousToDiscrete.ROUND_UP) ),n).categories) + values = np.round(x,ContinuousToDiscrete.ROUND_UP) + return list(pd.cut(values,n).categories) From 60cbf2dd3fd32ae8f5712d22dcceb367945a24a1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 12 
Mar 2020 09:55:29 -0500 Subject: [PATCH 067/250] bug fix: continuous values --- data/maker/__init__.py | 1 + pipeline.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 97cc3dd..2b51670 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -112,6 +112,7 @@ def train (**args) : if col in CONTINUOUS: BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32) + # print ( pd.DataFrame(args['real']).head() ) else: # df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False) # print (df[col].dtypes) diff --git a/pipeline.py b/pipeline.py index 7a2cf3a..9eee8c5 100644 --- a/pipeline.py +++ b/pipeline.py @@ -143,7 +143,7 @@ class Components : # columns = args['columns'] # df = np.array_split(df[columns].values,PART_SIZE) # df = pd.DataFrame(df[ int (partition) ],columns = columns) - info = {"parition":int(partition),"gpu":_args["gpu"],"rows":df.shape[0],"cols":df.shape[1],"part_size":PART_SIZE} + info = {"parition":int(partition),"gpu":_args["gpu"],"rows":str(df.shape[0]),"cols":str(df.shape[1]),"part_size":int(PART_SIZE)} logger.write({"module":"generate","action":"partition","input":info}) _args['partition'] = int(partition) _args['continuous']= args['continuous'] if 'continuous' in args else [] @@ -352,7 +352,7 @@ if __name__ == '__main__' : args['partition'] = index # _df = pd.DataFrame(DATA[index],columns=args['columns']) args['data'] = DATA[index] - args['data'].to_csv('aou-'+str(index)+'csv',index=False) + # args['data'].to_csv('aou-'+str(index)+'csv',index=False) # args['reader'] = lambda: _df if int(args['num_gpu']) > 1 : args['gpu'] = index From d30d2233c865e2dc03a807f70b554734774150a3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 12 Mar 2020 09:56:08 -0500 Subject: [PATCH 068/250] versioning ... --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5a8f7b6..78c52ea 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.2","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 772d841ee80d9279c0348356ea268787d54ef44b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 12 Mar 2020 14:37:01 -0500 Subject: [PATCH 069/250] bug fix ... 
--- data/maker/__init__.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 2b51670..5b4cb7e 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -24,24 +24,25 @@ class ContinuousToDiscrete : # BOUNDS = np.repeat(np.divide(X.max(),n),n).cumsum().tolist() # print ( X.values.astype(np.float32)) # print ("___________________________") - values = X.values.astype(np.float32) + values = np.array(X).astype(np.float32) BOUNDS = ContinuousToDiscrete.bounds(values,n) # _map = [{"index":BOUNDS.index(i),"ubound":i} for i in BOUNDS] _matrix = [] m = [] for value in X : x_ = np.zeros(n) - _matrix.append(x_) + for row in BOUNDS : if value>= row.left and value <= row.right : index = BOUNDS.index(row) x_[index] = 1 break + _matrix += x_.tolist() # # for items in BOUNDS : # index = BOUNDS.index(items) - return np.array(_matrix) + return np.array(_matrix).reshape(len(X),n) @staticmethod def bounds(x,n): @@ -92,7 +93,7 @@ def train (**args) : :context label of what we are synthesizing """ column = args['column'] if (isinstance(args['column'],list)) else [args['column']] - CONTINUOUS = args['continuous'] if 'continuous' in args else [] + # CONTINUOUS = args['continuous'] if 'continuous' in args else [] # column_id = args['id'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) df.columns = [name.lower() for name in df.columns] @@ -109,15 +110,16 @@ def train (**args) : # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values # if 'float' not in df[col].dtypes.name : # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values - if col in CONTINUOUS: - BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32) - # print ( pd.DataFrame(args['real']).head() ) - else: + # if col in CONTINUOUS: + # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) + # args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32) + # # args['real'] = args['real'].reshape(df.shape[0],BIN_SIZE) + + # else: # df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False) # print (df[col].dtypes) # print (df[col].dropna/(axis=1).unique()) - args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values + args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values @@ -170,6 +172,7 @@ def generate(**args): #@TODO: # If the identifier is not present, we should fine a way to determine or make one # + BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) _df = df.copy() for col in column : args['context'] = col @@ -181,10 +184,15 @@ def generate(**args): # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) # values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE) # # values = np.unique(values).tolist() + # else: + # if col in CONTINUOUS : + # values = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32).T + # else: values = df[col].dropna().unique().tolist() + args['values'] = values args['row_count'] = df.shape[0] # @@ -192,10 +200,9 @@ def generate(**args): handler = gan.Predict (**args) handler.load_meta(col) r = handler.apply() - BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - # _df[col] = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col] - _df[col] = r[col] + _df[col] = 
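Patch 069, whose diff follows, keeps reshaping the ContinuousToDiscrete.binary helper introduced a few patches earlier: cut the continuous column into n interval bins with pd.cut and emit one one-hot row per value marking the bin it lands in. A standalone sketch of that encoding, under the same pd.cut assumption and with an illustrative rounding parameter, is:

    import numpy as np
    import pandas as pd

    def binary(X, n=4, round_up=2):
        """One-hot encode a continuous column into n pd.cut bins (illustrative sketch)."""
        values = np.round(np.array(X).astype(np.float32), round_up)
        bounds = list(pd.cut(values, n).categories)  # n interval bins over the observed range
        rows = []
        for value in values:
            row = np.zeros(n)
            for index, interval in enumerate(bounds):
                if interval.left <= value <= interval.right:  # mark the bin this value falls into
                    row[index] = 1
                    break
            rows.append(row)
        return np.array(rows)

    if __name__ == '__main__':
        x = [0.1, 0.4, 2.5, 9.7, 5.0]
        print(binary(x, n=4))  # shape (5, 4), one active bin per row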
ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col] + # _df[col] = r[col] # # @TODO: log basic stats about the synthetic attribute # From e81e50c94f8fdf051cbf76d9479cc68a40b1ef5d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 14 Mar 2020 11:12:13 -0500 Subject: [PATCH 070/250] Bug fix with the number of candidates generated --- data/gan.py | 10 +- data/maker/__init__.py | 1 + drive/pipeline.py | 303 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 311 insertions(+), 3 deletions(-) create mode 100644 drive/pipeline.py diff --git a/data/gan.py b/data/gan.py index 4f34634..28d5ea3 100644 --- a/data/gan.py +++ b/data/gan.py @@ -424,6 +424,7 @@ class Train (GNet): dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) # labels_placeholder = None dataset = dataset.repeat(10000) + print ([' ******* ',self.BATCHSIZE_PER_GPU]) dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) dataset = dataset.prefetch(1) # iterator = dataset.make_initializable_iterator() @@ -560,7 +561,7 @@ class Predict(GNet): init = tf.compat.v1.global_variables_initializer() saver = tf.compat.v1.train.Saver() df = pd.DataFrame() - CANDIDATE_COUNT = 1000 + CANDIDATE_COUNT = 10 #0 if self.ROW_COUNT < 1000 else 100 NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0] with tf.compat.v1.Session() as sess: @@ -594,13 +595,16 @@ class Predict(GNet): if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size : ratio.append(np.divide( np.sum(x), x.size)) found.append(df) - if i == CANDIDATE_COUNT: + + # break + if len(found) == CANDIDATE_COUNT: + break else: __x__ = df if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __x__ __ratio = np.divide( np.sum(x), x.size) if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __ratio continue - + # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms # df = (i * df).sum(axis=1) # diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 5b4cb7e..3a016cf 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -208,4 +208,5 @@ def generate(**args): # # print (r)s # break + return _df \ No newline at end of file diff --git a/drive/pipeline.py b/drive/pipeline.py new file mode 100644 index 0000000..04658da --- /dev/null +++ b/drive/pipeline.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python3 +import json +from transport import factory +import numpy as np +import os +from multiprocessing import Process +import pandas as pd +from google.oauth2 import service_account +import data.maker + +from data.params import SYS_ARGS + +# +# The configuration array is now loaded and we will execute the pipe line as follows +DATASET='combined20190510' + +class Components : + + @staticmethod + def get(args): + """ + This function returns a data-frame provided a bigquery sql statement with conditions (and limits for testing purposes) + The function must be wrapped around a lambda this makes testing easier and changing data stores transparent to the rest of the code. 
(Vital when testing) + :sql basic sql statement + :condition optional condition and filters + """ + SQL = args['sql'] + if 'condition' in args : + condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')']) + SQL = " ".join([SQL,'WHERE',condition]) + + SQL = SQL.replace(':dataset',args['dataset']) #+ " LIMIT 1000 " + if 'limit' in args : + SQL = SQL + 'LIMIT ' + args['limit'] + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() + return df + + # return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna() + @staticmethod + def split(X,MAX_ROWS=3,PART_SIZE=3): + + return list(pd.cut( np.arange(X.shape[0]+1),PART_SIZE).categories) + + def train(self,**args): + """ + This function will perform training on the basis of a given pointer that reads data + + """ + # + # @TODO: we need to log something here about the parameters being passed + pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) + df = pointer() + + # + # Now we can parse the arguments and submit the entire thing to training + # + + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + log_folder = args['logs'] if 'logs' in args else 'logs' + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + + MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 + PART_SIZE = args['part_size'] if 'part_size' in args else 0 + + if df.shape[0] > MAX_ROWS and 'partition' not in args: + lbound = 0 + bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) + # bounds = Components.split(df,MAX_ROWS,PART_SIZE) + + qwriter = factory.instance(type='queue.QueueWriter',args={'queue':'aou.io'}) + + for b in bounds : + part_index = bounds.index(b) + ubound = int(b.right) + + + _data = df.iloc[lbound:ubound][args['columns']] + lbound = ubound + + # _args['logs'] = os.sep.join([log_folder,str(part_index)]) + _args['partition'] = str(part_index) + _args['logger'] = {'args':{'dbname':'aou','doc':args['context']},'type':'mongo.MongoWriter'} + # + # We should post the the partitions to a queue server (at least the instructions on ): + # - where to get the data + # - and athe arguments to use (partition #,columns,gpu,epochs) + # + info = {"rows":_data.shape[0],"cols":_data.shape[1], "paritition":part_index,"logs":_args['logs']} + p = {"args":_args,"data":_data.to_dict(orient="records"),"info":info} + qwriter.write(p) + # + # @TODO: + # - Notify that information was just posted to the queue + info['max_rows'] = MAX_ROWS + info['part_size'] = PART_SIZE + logger.write({"module":"train","action":"setup-partition","input":info}) + + pass + else: + partition = args['partition'] if 'partition' in args else '' + log_folder = os.sep.join([log_folder,args['context'],partition]) + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + 
os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' + + _args['data'] = df + # + # @log : + # Logging information about the training process for this partition (or not) + # + info = {"rows":df.shape[0],"cols":df.shape[1], "partition":partition,"logs":_args['logs']} + logger.write({"module":"train","action":"train","input":info}) + data.maker.train(**_args) + + pass + + # @staticmethod + def generate(self,args): + """ + This function will generate data and store it to a given, + """ + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + log_folder = args['logs'] if 'logs' in args else 'logs' + partition = args['partition'] if 'partition' in args else '' + log_folder = os.sep.join([log_folder,args['context'],partition]) + _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' + _args['no_value']= args['no_value'] + MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 + PART_SIZE = args['part_size'] if 'part_size' in args else 0 + + # credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + # _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() + reader = args['reader'] + df = reader() + if 'partition' in args : + bounds = Components.split(df,MAX_ROWS,PART_SIZE) + # bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories) + lbound = int(bounds[int(partition)].left) + ubound = int(bounds[int(partition)].right) + df = df.iloc[lbound:ubound] + _args['data'] = df + # _args['data'] = reader() + #_args['data'] = _args['data'].astype(object) + _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 + _dc = data.maker.generate(**_args) + # + # We need to post the generate the data in order to : + # 1. compare immediately + # 2. 
synthetic copy + # + + cols = _dc.columns.tolist() + + data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) + base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) + + for name in cols : + _args['data'][name] = _dc[name] + info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} + if partition != '' : + info['partition'] = partition + logger.write(info) + # filename = os.sep.join([log_folder,'output',name+'.csv']) + # data_comp[[name]].to_csv(filename,index=False) + + # + #-- Let us store all of this into bigquery + prefix = args['notify']+'.'+_args['context'] + table = '_'.join([prefix,partition,'io']).replace('__','_') + folder = os.sep.join([args['logs'],args['context'],partition,'output']) + if 'file' in args : + + _fname = os.sep.join([folder,table.replace('_io','_full_io.csv')]) + _pname = os.sep.join([folder,table])+'.csv' + data_comp.to_csv( _pname,index=False) + _args['data'].to_csv(_fname,index=False) + + + else: + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + _pname = os.sep.join([folder,table+'.csv']) + _fname = table.replace('_io','_full_io') + data_comp.to_gbq(if_exists='replace',destination_table=_pname,credentials='credentials',chunk_size=50000) + data_comp.to_csv(_pname,index=False) + INSERT_FLAG = 'replace' if 'partition' not in args else 'append' + _args['data'].to_gbq(if_exists=INSERT_FLAG,destination_table=_fname,credentials='credentials',chunk_size=50000) + + info = {"full":{"path":_fname,"rows":_args['data'].shape[0]},"compare":{"name":_pname,"rows":data_comp.shape[0]} } + if partition : + info ['partition'] = partition + logger.write({"module":"generate","action":"write","info":info} ) + @staticmethod + def callback(channel,method,header,stream): + + info = json.loads(stream) + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':SYS_ARGS['context']}) + + logger.write({'module':'process','action':'read-partition','input':info['info']}) + df = pd.DataFrame(info['data']) + args = info['args'] + if int(args['num_gpu']) > 1 and args['gpu'] > 0: + args['gpu'] = args['gpu'] + args['num_gpu'] + args['reader'] = lambda: df + # + # @TODO: Fix + # There is an inconsistency in column/columns ... fix this shit! + # + args['columns'] = args['column'] + (Components()).train(**args) + logger.write({"module":"process","action":"exit","info":info["info"]}) + channel.close() + channel.connection.close() + pass + +if __name__ == '__main__' : + filename = SYS_ARGS['config'] if 'config' in SYS_ARGS else 'config.json' + f = open (filename) + PIPELINE = json.loads(f.read()) + f.close() + index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else 0 + + args = (PIPELINE[index]) + args['dataset'] = 'combined20190510' + args = dict(args,**SYS_ARGS) + args['max_rows'] = int(args['max_rows']) if 'max_rows' in args else 3 + args['part_size']= int(args['part_size']) if 'part_size' in args else 3 + + # + # @TODO: + # Log what was initiated so we have context of this processing ... 
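The drive/pipeline.py snapshot above keeps the older queue-based hand-off: the driver serializes each partition (its records plus the training arguments) into a message on the aou.io queue, and the listener callback rebuilds the DataFrame and hands it to the trainer. The sketch below shows only that payload round-trip with plain JSON and pandas, leaving out the data-transport factory and the trainer themselves:

    import json
    import pandas as pd

    def build_payload(df, args, partition):
        """What the driver would post to the queue for one partition (illustrative)."""
        info = {'rows': df.shape[0], 'cols': df.shape[1], 'partition': partition}
        return json.dumps({'args': args, 'data': df.to_dict(orient='records'), 'info': info})

    def callback(stream):
        """What the listener would do with one message (training call omitted)."""
        message = json.loads(stream)
        df = pd.DataFrame(message['data'])   # rebuild the partition
        args = message['args']
        args['columns'] = args['column']     # the file reconciles column vs columns the same way
        # here the real code would pass df and args on to the trainer
        return df, args

    if __name__ == '__main__':
        df = pd.DataFrame({'person_id': [1, 2], 'gender': ['M', 'F']})
        payload = build_payload(df, {'column': ['gender'], 'context': 'demo'}, partition=0)
        rebuilt, args = callback(payload)
        print(rebuilt.equals(df), args['columns'])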
+ # + if 'listen' not in SYS_ARGS : + if 'file' in args : + reader = lambda: pd.read_csv(args['file']) ; + else: + reader = lambda: Components().get(args) + args['reader'] = reader + + if 'generate' in SYS_ARGS : + # + # Let us see if we have partitions given the log folder + + content = os.listdir( os.sep.join([args['logs'],args['context']])) + generator = Components() + if ''.join(content).isnumeric() : + # + # we have partitions we are working with + + for id in ''.join(content) : + args['partition'] = id + + generator.generate(args) + else: + generator.generate(args) + # Components.generate(args) + elif 'listen' in args : + # + # This will start a worker just in case to listen to a queue + if 'read' in SYS_ARGS : + QUEUE_TYPE = 'queue.QueueReader' + pointer = lambda qreader: qreader.read(1) + else: + QUEUE_TYPE = 'queue.QueueListener' + pointer = lambda qlistener: qlistener.listen() + N = int(SYS_ARGS['jobs']) if 'jobs' in SYS_ARGS else 1 + + qhandlers = [factory.instance(type=QUEUE_TYPE,args={'queue':'aou.io'}) for i in np.arange(N)] + jobs = [] + for qhandler in qhandlers : + qhandler.callback = Components.callback + job = Process(target=pointer,args=(qhandler,)) + job.start() + jobs.append(job) + # + # let us wait for the jobs + print (["Started ",len(jobs)," trainers"]) + while len(jobs) > 0 : + + jobs = [job for job in jobs if job.is_alive()] + + # pointer(qhandler) + + + # qreader.read(1) + pass + else: + + trainer = Components() + trainer.train(**args) + # Components.train(**args) +#for args in PIPELINE : + #args['dataset'] = 'combined20190510' + #process = Process(target=Components.train,args=(args,)) + #process.name = args['context'] + #process.start() +# Components.train(args) From f9496ed8061cf0f1c452f75ffb3a421af119446d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 15 Mar 2020 10:25:19 -0500 Subject: [PATCH 071/250] bug fix with program dying --- pipeline.py | 20 +++++++++++--------- setup.py | 2 +- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pipeline.py b/pipeline.py index 9eee8c5..bfdd72e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -76,10 +76,11 @@ class Components : partition = args['partition'] log_folder = os.sep.join([log_folder,args['context'],str(partition)]) - _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) if 'batch_size' in args : _args['batch_size'] = int(args['batch_size']) + # # We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel # @@ -143,7 +144,7 @@ class Components : # columns = args['columns'] # df = np.array_split(df[columns].values,PART_SIZE) # df = pd.DataFrame(df[ int (partition) ],columns = columns) - info = {"parition":int(partition),"gpu":_args["gpu"],"rows":str(df.shape[0]),"cols":str(df.shape[1]),"part_size":int(PART_SIZE)} + info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"part_size":int(PART_SIZE)} logger.write({"module":"generate","action":"partition","input":info}) _args['partition'] = int(partition) _args['continuous']= args['continuous'] if 'continuous' in args else [] @@ -163,7 +164,6 @@ class Components : data_comp = 
_args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) - for name in cols : _args['data'][name] = _dc[name] info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} @@ -193,10 +193,14 @@ class Components : _fname = table.replace('_io','_full_io') partial = '.'.join(['io',args['context']+'_partial_io']) complete= '.'.join(['io',args['context']+'_full_io']) - data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=50000) data_comp.to_csv(_pname,index=False) - INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' - _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=50000) + if 'dump' in args : + print (_args['data'].head()) + else: + data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=50000) + + INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' + _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=50000) _id = 'dataset' info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } if partition : @@ -247,6 +251,7 @@ if __name__ == '__main__' : args = dict(args,**SYS_ARGS) args['logs'] = args['logs'] if 'logs' in args else 'logs' + args['batch_size'] = 2000 if 'batch_size' not in args else int(args['batch_size']) if 'dataset' not in args : args['dataset'] = 'combined20191004v2_deid' PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 @@ -350,10 +355,7 @@ if __name__ == '__main__' : continue args['part_size'] = PART_SIZE args['partition'] = index - # _df = pd.DataFrame(DATA[index],columns=args['columns']) args['data'] = DATA[index] - # args['data'].to_csv('aou-'+str(index)+'csv',index=False) - # args['reader'] = lambda: _df if int(args['num_gpu']) > 1 : args['gpu'] = index else: diff --git a/setup.py b/setup.py index 78c52ea..4a4e87b 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.2","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.3","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From af6ab356d832014d9608ff70812d47e07b24aa53 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 16 Mar 2020 16:22:34 -0500 Subject: [PATCH 072/250] bug fix: index number or context --- data/gan.py | 2 +- pipeline.py | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/data/gan.py b/data/gan.py index 28d5ea3..c85776a 100644 --- a/data/gan.py +++ b/data/gan.py @@ -424,7 +424,7 @@ class Train (GNet): dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) # labels_placeholder = None dataset = dataset.repeat(10000) - print ([' ******* 
',self.BATCHSIZE_PER_GPU]) + dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) dataset = dataset.prefetch(1) # iterator = dataset.make_initializable_iterator() diff --git a/pipeline.py b/pipeline.py index bfdd72e..b838043 100644 --- a/pipeline.py +++ b/pipeline.py @@ -244,8 +244,19 @@ if __name__ == '__main__' : f = open (filename) PIPELINE = json.loads(f.read()) f.close() - index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else 0 - + index = SYS_ARGS['index'] + if index.isnumeric() : + index = int(SYS_ARGS['index']) + else: + # + # The index provided is a key to a pipeline entry mainly the context + # + N = len(PIPELINE) + f = [i for i in range(0,N) if PIPELINE[i]['context'] == index] + index = f[0] if f else 0 + # + # print + print ("..::: ",PIPELINE[index]['context']) args = (PIPELINE[index]) args = dict(args,**SYS_ARGS) From 2f6f43c9c694383d02563b6e3fa4abe9471c4f95 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 18 Mar 2020 23:16:36 -0500 Subject: [PATCH 073/250] bug fix: statistics for quick assessment --- pipeline.py | 20 ++++++++++++++++++-- setup.py | 2 +- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index b838043..76496bd 100644 --- a/pipeline.py +++ b/pipeline.py @@ -163,6 +163,21 @@ class Components : cols = _dc.columns.tolist() data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) + # + # performing basic analytics on the synthetic data generated (easy to quickly asses) + # + info = {"module":"generate","action":"io-stats","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} + logs = [] + for name in data_comp.columns.tolist() : + g = pd.DataFrame(data_comp.groupby([name]).size()) + g.columns = ['counts'] + g[name] = g.index.tolist() + g.index = np.arange(g.shape[0]) + logs.append({"name":name,"counts": g.to_dict(orient='records')}) + info['input']['logs'] = logs + logger.write(info) + + base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) for name in cols : _args['data'][name] = _dc[name] @@ -170,6 +185,7 @@ class Components : if partition != '' : info['partition'] = int(partition) logger.write(info) + # filename = os.sep.join([log_folder,'output',name+'.csv']) # data_comp[[name]].to_csv(filename,index=False) @@ -197,10 +213,10 @@ class Components : if 'dump' in args : print (_args['data'].head()) else: - data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=50000) + data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' - _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=50000) + _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) _id = 'dataset' info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } if partition : diff --git a/setup.py b/setup.py index 4a4e87b..0f38464 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.3","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = 
{"name":"data-maker","version":"1.2.4","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 915601236cd0f06a99f2e7fbdbaa5153da7f25f6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 25 Mar 2020 17:43:23 -0500 Subject: [PATCH 074/250] bug fix with ICD and some minor improvements --- data/gan.py | 13 +++-- data/maker/__init__.py | 53 +++++++++++++----- pipeline.py | 124 ++++++++++++++++------------------------- setup.py | 2 +- 4 files changed, 97 insertions(+), 95 deletions(-) diff --git a/data/gan.py b/data/gan.py index c85776a..a6dece6 100644 --- a/data/gan.py +++ b/data/gan.py @@ -172,7 +172,7 @@ class GNet : root = [] for loc in path.split(os.sep) : root.append(loc) - if not os.path.exists(os.sep.join(root)) : + if not os.path.exists(os.sep.join(root)) : os.mkdir(os.sep.join(root)) elif not os.path.exists(path): @@ -535,8 +535,12 @@ class Predict(GNet): self.values = args['values'] self.ROW_COUNT = args['row_count'] self.oROW_COUNT = self.ROW_COUNT - - self.MISSING_VALUES = args['no_value'] + if args['no_value'] in ['na','','NA'] : + self.MISSING_VALUES = np.nan + else : + self.MISSING_VALUES = args['no_value'] + # self.MISSING_VALUES = args['no_value'] + # self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value'] def load_meta(self, column): super().load_meta(column) self.generator.load_meta(column) @@ -652,7 +656,8 @@ class Predict(GNet): if ii.shape[0] > 0 : # #@TODO Have this be a configurable variable - missing = np.repeat(0, np.where(ii==1)[0].size) + + missing = np.repeat(self.MISSING_VALUES, np.where(ii==1)[0].size) else: missing = [] # diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 3a016cf..e252de5 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -62,21 +62,28 @@ class ContinuousToDiscrete : BOUNDS = ContinuousToDiscrete.bounds(X,BIN_SIZE) values = [] - _BINARY= ContinuousToDiscrete.binary(X,BIN_SIZE) - # # print (BOUNDS) - - # values = [] - for row in _BINARY : - # ubound = BOUNDS[row.index(1)] - index = np.where(row == 1)[0][0] + # _BINARY= ContinuousToDiscrete.binary(X,BIN_SIZE) + # # # print (BOUNDS) + l = {} + for value in X : + values += [ np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP) for item in BOUNDS if value >= item.left and value <= item.right ] + - ubound = BOUNDS[ index ].right - lbound = BOUNDS[ index ].left + + # # values = [] + # for row in _BINARY : + # # ubound = BOUNDS[row.index(1)] + # index = np.where(row == 1)[0][0] + + # ubound = BOUNDS[ index ].right + # lbound = BOUNDS[ index ].left - x_ = np.round(np.random.uniform(lbound,ubound),ContinuousToDiscrete.ROUND_UP).astype(float) - values.append(x_) + # x_ = np.round(np.random.uniform(lbound,ubound),ContinuousToDiscrete.ROUND_UP).astype(float) + # values.append(x_) - lbound = ubound + # lbound = ubound + + # values = [np.random.uniform() for item in BOUNDS] return values @@ -173,6 +180,8 @@ def generate(**args): # If the identifier is not present, we should fine a way to determine or make one # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) + NO_VALUE = 
dict(args['no_value']) if type(args['no_value']) == dict else args['no_value'] + _df = df.copy() for col in column : args['context'] = col @@ -195,13 +204,29 @@ def generate(**args): args['values'] = values args['row_count'] = df.shape[0] + if col in NO_VALUE : + args['no_value'] = NO_VALUE[col] + else: + args['no_value'] = NO_VALUE + # # we can determine the cardinalities here so we know what to allow or disallow handler = gan.Predict (**args) handler.load_meta(col) r = handler.apply() - - _df[col] = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col] + if col in CONTINUOUS : + r[col] = np.array(r[col]) + MISSING= np.nan if args['no_value'] in ['na','','NA'] else args['no_value'] + + if np.isnan(MISSING): + i = np.isnan(r[col]) + i = np.where (i == False)[0] + else: + i = np.where( r[col] != None)[0] + _approx = ContinuousToDiscrete.continuous(r[col][i],BIN_SIZE) + r[col][i] = _approx + + _df[col] = r[col] #ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col] # _df[col] = r[col] # # @TODO: log basic stats about the synthetic attribute diff --git a/pipeline.py b/pipeline.py index 76496bd..0d19e60 100644 --- a/pipeline.py +++ b/pipeline.py @@ -16,7 +16,12 @@ from data.params import SYS_ARGS DATASET='combined20191004v2_deid' class Components : - + class KEYS : + PIPELINE_KEY = 'pipeline' + SQL_FILTER = 'filter' + @staticmethod + def get_logger(**args) : + return factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) @staticmethod def get(args): """ @@ -26,15 +31,19 @@ class Components : :condition optional condition and filters """ SQL = args['sql'] - if 'condition' in args : - condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')']) + if Components.KEYS.SQL_FILTER in args : + SQL_FILTER = Components.KEYS.SQL_FILTER + condition = ' '.join([args[SQL_FILTER]['field'],args[SQL_FILTER]['qualifier'],'(',args[SQL_FILTER]['value'],')']) SQL = " ".join([SQL,'WHERE',condition]) SQL = SQL.replace(':dataset',args['dataset']) #+ " LI " if 'limit' in args : SQL = SQL + ' LIMIT ' + args['limit'] - + # + # let's log the sql query that has been performed here + logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + logger.write({"module":"bigquery","action":"read","input":{"sql":SQL}}) credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').astype(object) return df @@ -131,6 +140,7 @@ class Components : _args['num_gpu'] = 1 os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) _args['no_value']= args['no_value'] + # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 @@ -166,19 +176,27 @@ class Components : # # performing basic analytics on the synthetic data generated (easy to quickly asses) # - info = {"module":"generate","action":"io-stats","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} - logs = [] - for name in data_comp.columns.tolist() : - g = pd.DataFrame(data_comp.groupby([name]).size()) - g.columns = ['counts'] - g[name] = g.index.tolist() - g.index = np.arange(g.shape[0]) - logs.append({"name":name,"counts": g.to_dict(orient='records')}) - info['input']['logs'] = logs + info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} + x 
= {} + for name in args['columns'] : + ident = data_comp.apply(lambda row: 1*(row[name]==row[name+'_io']),axis=1).sum() + count = data_comp[name].unique().size + _ident= data_comp.shape[1] - ident + _count= data_comp[name+'_io'].unique().size + + info['input']['logs'] += [{"name":name,"identical":int(ident),"no_identical":int(_ident),"original_count":count,"synthetic_count":_count}] + # for name in data_comp.columns.tolist() : + # g = pd.DataFrame(data_comp.groupby([name]).size()) + # g.columns = ['counts'] + # g[name] = g.index.tolist() + # g.index = np.arange(g.shape[0]) + # logs.append({"name":name,"counts": g.to_dict(orient='records')}) + # info['input']['logs'] = logs logger.write(info) base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) + cols = _dc.columns.tolist() for name in cols : _args['data'][name] = _dc[name] info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} @@ -223,43 +241,14 @@ class Components : info ['partition'] = int(partition) logger.write({"module":"generate","action":"write","input":info} ) - @staticmethod - def callback(channel,method,header,stream): - if stream.decode('utf8') in ['QUIT','EXIT','END'] : - channel.close() - channel.connection.close() - info = json.loads(stream) - logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':SYS_ARGS['context']}) - - logger.write({'module':'process','action':'read-partition','input':info['input']}) - df = pd.DataFrame(info['data']) - args = info['args'] - if args['num_gpu'] > 1 : - args['gpu'] = int(info['input']['partition']) if info['input']['partition'] < 8 else np.random.choice(np.arange(8)).astype(int) - - else: - args['gpu'] = 0 - args['num_gpu'] = 1 - # if int(args['num_gpu']) > 1 and args['gpu'] > 0: - # args['gpu'] = args['gpu'] + args['num_gpu'] if args['gpu'] + args['num_gpu'] < 8 else args['gpu'] #-- 8 max gpus - args['reader'] = lambda: df - # - # @TODO: Fix - # There is an inconsistency in column/columns ... fix this shit! 
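
# A standalone sketch of the per-column "io.metrics" computation introduced in
# this patch: for each synthesized column it counts rows where the synthetic
# value ("<name>_io") matches the original, plus unique-value counts on both
# sides. data_comp is assumed to be the comparison join built above. Note the
# patch computes no_identical from data_comp.shape[1]; shape[0] (row count),
# as used here, looks like the intended quantity.
def io_metrics(data_comp, columns):
    logs = []
    for name in columns:
        identical = int((data_comp[name] == data_comp[name + '_io']).sum())
        logs.append({
            "name": name,
            "identical": identical,
            "no_identical": int(data_comp.shape[0] - identical),
            "original_count": int(data_comp[name].unique().size),
            "synthetic_count": int(data_comp[name + '_io'].unique().size)
        })
    return logs
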
- # - channel.close() - channel.connection.close() - args['columns'] = args['column'] - (Components()).train(**args) - logger.write({"module":"process","action":"exit","input":info["input"]}) - - pass + if __name__ == '__main__' : filename = SYS_ARGS['config'] if 'config' in SYS_ARGS else 'config.json' f = open (filename) - PIPELINE = json.loads(f.read()) + _config = json.loads(f.read()) f.close() + PIPELINE = _config['pipeline'] index = SYS_ARGS['index'] if index.isnumeric() : index = int(SYS_ARGS['index']) @@ -274,10 +263,17 @@ if __name__ == '__main__' : # print print ("..::: ",PIPELINE[index]['context']) args = (PIPELINE[index]) - + for key in _config : + if key == 'pipeline' or key in args: + # + # skip in case of pipeline or if key exists in the selected pipeline (provided by index) + # + continue + + args[key] = _config[key] args = dict(args,**SYS_ARGS) - args['logs'] = args['logs'] if 'logs' in args else 'logs' + args['batch_size'] = 2000 if 'batch_size' not in args else int(args['batch_size']) if 'dataset' not in args : args['dataset'] = 'combined20191004v2_deid' @@ -340,38 +336,14 @@ if __name__ == '__main__' : else: generator.generate(args) # Components.generate(args) - elif 'listen' in args : + elif 'finalize' in args : # - # This will start a worker just in case to listen to a queue - SYS_ARGS = dict(args) #-- things get lost in context - if 'read' in SYS_ARGS : - QUEUE_TYPE = 'queue.QueueReader' - pointer = lambda qreader: qreader.read() - else: - QUEUE_TYPE = 'queue.QueueListener' - pointer = lambda qlistener: qlistener.listen() - N = int(SYS_ARGS['jobs']) if 'jobs' in SYS_ARGS else 1 - - qhandlers = [factory.instance(type=QUEUE_TYPE,args={'queue':'aou.io'}) for i in np.arange(N)] - jobs = [] - for qhandler in qhandlers : - qhandler.callback = Components.callback - job = Process(target=pointer,args=(qhandler,)) - job.start() - jobs.append(job) + # This will finalize a given set of synthetic operations into a table # - # let us wait for the jobs - print (["Started ",len(jobs)," trainers"]) - while len(jobs) > 0 : - - jobs = [job for job in jobs if job.is_alive()] - time.sleep(2) + idataset = args['input'] if 'input' in args else 'io' #-- input dataset + odataset = args['output'] #-- output dataset + labels = [name.strip() for name in args['labels'].split(',') ] - # pointer(qhandler) - - - # qreader.read(1) - pass else: # DATA = np.array_split(DATA,PART_SIZE) diff --git a/setup.py b/setup.py index 0f38464..c441e36 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.4","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.5","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From a1ac97fbca76c3ad3ec7299145ceb781b5a94296 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 25 Mar 2020 22:22:08 -0500 Subject: [PATCH 075/250] bug fix, multiple epochs --- data/gan.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index 
a6dece6..41daa3d 100644 --- a/data/gan.py +++ b/data/gan.py @@ -507,7 +507,8 @@ class Train (GNet): logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) - if epoch % self.MAX_EPOCHS == 0: + # if epoch % self.MAX_EPOCHS == 0: + if epoch in [5,10,50, self.MAX_EPOCHS] : # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] suffix = self.get.suffix() _name = os.sep.join([self.train_dir,suffix]) From 6e0f89cd3c3fd008f32018c85021f62c6f46696c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 26 Mar 2020 23:39:59 -0500 Subject: [PATCH 076/250] bug fix: epochs, process control (generator) --- data/gan.py | 2 +- pipeline.py | 27 ++++++++++++++------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/data/gan.py b/data/gan.py index 41daa3d..c54f5bd 100644 --- a/data/gan.py +++ b/data/gan.py @@ -508,7 +508,7 @@ class Train (GNet): logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) # if epoch % self.MAX_EPOCHS == 0: - if epoch in [5,10,50, self.MAX_EPOCHS] : + if epoch in [5,10,20,50,75, self.MAX_EPOCHS] : # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] suffix = self.get.suffix() _name = os.sep.join([self.train_dir,suffix]) diff --git a/pipeline.py b/pipeline.py index 0d19e60..884609f 100644 --- a/pipeline.py +++ b/pipeline.py @@ -178,13 +178,14 @@ class Components : # info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} x = {} - for name in args['columns'] : - ident = data_comp.apply(lambda row: 1*(row[name]==row[name+'_io']),axis=1).sum() - count = data_comp[name].unique().size - _ident= data_comp.shape[1] - ident - _count= data_comp[name+'_io'].unique().size + # for name in args['columns'] : + # ident = data_comp.apply(lambda row: 1*(row[name]==row[name+'_io']),axis=1).sum() + # count = data_comp[name].unique().size + # _ident= data_comp.shape[1] - ident + # _count= data_comp[name+'_io'].unique().size + # _count= len(set(data_comp[name+'_io'].values.tolist())) - info['input']['logs'] += [{"name":name,"identical":int(ident),"no_identical":int(_ident),"original_count":count,"synthetic_count":_count}] + # info['input']['logs'] += [{"name":name,"identical":int(ident),"no_identical":int(_ident),"original_count":count,"synthetic_count":_count}] # for name in data_comp.columns.tolist() : # g = pd.DataFrame(data_comp.groupby([name]).size()) # g.columns = ['counts'] @@ -192,17 +193,17 @@ class Components : # g.index = np.arange(g.shape[0]) # logs.append({"name":name,"counts": g.to_dict(orient='records')}) # info['input']['logs'] = logs - logger.write(info) + # logger.write(info) base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) cols = _dc.columns.tolist() - for name in cols : - _args['data'][name] = _dc[name] - info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} - if partition != '' : - info['partition'] = int(partition) - logger.write(info) + # for name in cols : + # _args['data'][name] = _dc[name] + # info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} + # if partition != '' : + # info['partition'] = int(partition) + # logger.write(info) # filename = os.sep.join([log_folder,'output',name+'.csv']) # data_comp[[name]].to_csv(filename,index=False) From 
205adf8fa65b73e6b1070da59ed6345ed68f1ae1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 26 Mar 2020 23:40:09 -0500 Subject: [PATCH 077/250] bug fix: epochs, process control (generator) --- pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipeline.py b/pipeline.py index 884609f..7620afd 100644 --- a/pipeline.py +++ b/pipeline.py @@ -327,6 +327,8 @@ if __name__ == '__main__' : job.name = 'generator # '+str(index) job.start() jobs.append(job) + if len(jobs) == 1 : + job.join() print (["Started ",len(jobs),"generators" if len(jobs)>1 else "generator" ]) while len(jobs)> 0 : From e8906d1646720294e08a8524587def89c36ce375 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 27 Mar 2020 00:34:05 -0500 Subject: [PATCH 078/250] bug fix: process causing error when writing to bigquery --- pipeline.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index 7620afd..6e847fb 100644 --- a/pipeline.py +++ b/pipeline.py @@ -4,7 +4,7 @@ from transport import factory import numpy as np import time import os -from multiprocessing import Process +from multiprocessing import Process, Lock import pandas as pd from google.oauth2 import service_account import data.maker @@ -16,9 +16,11 @@ from data.params import SYS_ARGS DATASET='combined20191004v2_deid' class Components : + lock = Lock() class KEYS : PIPELINE_KEY = 'pipeline' SQL_FILTER = 'filter' + @staticmethod def get_logger(**args) : return factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) @@ -232,10 +234,12 @@ class Components : if 'dump' in args : print (_args['data'].head()) else: + Components.lock.acquire() data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) + Components.lock.release() _id = 'dataset' info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } if partition : @@ -327,8 +331,8 @@ if __name__ == '__main__' : job.name = 'generator # '+str(index) job.start() jobs.append(job) - if len(jobs) == 1 : - job.join() + # if len(jobs) == 1 : + # job.join() print (["Started ",len(jobs),"generators" if len(jobs)>1 else "generator" ]) while len(jobs)> 0 : From 459afa2890291dd1c0bc2a8cc63c75e6b7bdd0ae Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 27 Mar 2020 01:52:16 -0500 Subject: [PATCH 079/250] bug fix:generated data has JSON object --- pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 6e847fb..066a418 100644 --- a/pipeline.py +++ b/pipeline.py @@ -200,8 +200,8 @@ class Components : base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) cols = _dc.columns.tolist() - # for name in cols : - # _args['data'][name] = _dc[name] + for name in cols : + _args['data'][name] = _dc[name] # info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} # if partition != '' : # info['partition'] = int(partition) From 4c297679dc197ace2aec0fe87ca5ea847c70e890 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 1 Apr 2020 00:21:51 -0500 Subject: [PATCH 080/250] bug fixes and optimizations --- data/maker/__init__.py | 46 +++++++++++++++++++++++++----------------- pipeline.py | 2 +- setup.py | 2 +- 3 files 
changed, 29 insertions(+), 21 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index e252de5..378c226 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -27,22 +27,25 @@ class ContinuousToDiscrete : values = np.array(X).astype(np.float32) BOUNDS = ContinuousToDiscrete.bounds(values,n) # _map = [{"index":BOUNDS.index(i),"ubound":i} for i in BOUNDS] - _matrix = [] - m = [] - for value in X : - x_ = np.zeros(n) + # _matrix = [] + # m = [] + # for value in X : + # x_ = np.zeros(n) - for row in BOUNDS : + # for row in BOUNDS : - if value>= row.left and value <= row.right : - index = BOUNDS.index(row) - x_[index] = 1 - break - _matrix += x_.tolist() - # - # for items in BOUNDS : - # index = BOUNDS.index(items) - return np.array(_matrix).reshape(len(X),n) + # if value>= row.left and value <= row.right : + # index = BOUNDS.index(row) + # x_[index] = 1 + # break + # _matrix += x_.tolist() + # # + # # for items in BOUNDS : + # # index = BOUNDS.index(items) + + # return np.array(_matrix).reshape(len(X),n) + matrix = np.repeat(np.zeros(n),len(X)).reshape(len(X),n) + @staticmethod def bounds(x,n): @@ -65,9 +68,15 @@ class ContinuousToDiscrete : # _BINARY= ContinuousToDiscrete.binary(X,BIN_SIZE) # # # print (BOUNDS) l = {} - for value in X : - values += [ np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP) for item in BOUNDS if value >= item.left and value <= item.right ] + for i in np.arange(len(X)): #value in X : + + value = X[i] + for item in BOUNDS : + if value >= item.left and value <= item.right : + values += [np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP)] + break + # values += [ np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP) for item in BOUNDS if value >= item.left and value <= item.right ] # # values = [] @@ -223,11 +232,10 @@ def generate(**args): i = np.where (i == False)[0] else: i = np.where( r[col] != None)[0] - _approx = ContinuousToDiscrete.continuous(r[col][i],BIN_SIZE) + _approx = ContinuousToDiscrete.continuous(r[col][i],BIN_SIZE) #-- approximating based on arbitrary bins r[col][i] = _approx - _df[col] = r[col] #ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col] - # _df[col] = r[col] + _df[col] = r[col] # # @TODO: log basic stats about the synthetic attribute # diff --git a/pipeline.py b/pipeline.py index 066a418..5af9550 100644 --- a/pipeline.py +++ b/pipeline.py @@ -47,7 +47,7 @@ class Components : logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) logger.write({"module":"bigquery","action":"read","input":{"sql":SQL}}) credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').astype(object) + df = pd.read_gbq(SQL,credentials=credentials,dialect='standard') return df # return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna() diff --git a/setup.py b/setup.py index c441e36..1c8aef0 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.5","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.6","author":"Vanderbilt University Medical 
Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 89ed5d5d46ae0d109cdd1d7b5415c1778f30bce6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 1 Apr 2020 00:22:21 -0500 Subject: [PATCH 081/250] simplify the CLI interface to leverage existing configuration --- finalize.py | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 finalize.py diff --git a/finalize.py b/finalize.py new file mode 100644 index 0000000..a375b37 --- /dev/null +++ b/finalize.py @@ -0,0 +1,159 @@ +""" +This file will perform basic tasks to finalize the GAN process by performing the following : + - basic stats & analytics + - rebuild io to another dataset +""" +import pandas as pd +import numpy as np +from google.oauth2 import service_account +from google.cloud import bigquery as bq +from data.params import SYS_ARGS +import json +class Analytics : + """ + This class will compile basic analytics about a given dataset i.e compare original/synthetic + """ + @staticmethod + def distribution(**args): + context = args['context'] + df = args['data'] + # + #-- This data frame counts unique values for each feature (space) + df_counts = pd.DataFrame(df.apply(lambda col: col.unique().size),columns=['counts']).T # unique counts + # + #-- Get the distributions for common values + # + names = [name for name in df_counts.columns.tolist() if name.endswith('_io') == False] + ddf = df.apply(lambda col: pd.DataFrame(col.values,columns=[col.name]).groupby([col.name]).size() ).fillna(0) + ddf[context] = ddf.index + + pass + def distance(**args): + """ + This function will measure the distance between + """ + df = args['data'] + names = [name for name in df_counts.columns.tolist() if name.endswith('_io') == False] +class Utils : + class get : + @staticmethod + def config(**args) : + contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts'] + pipeline = args['pipeline'] + return [ item for item in pipeline if item['context'] in contexts] + @staticmethod + def sql(**args) : + """ + This function is intended to build SQL query for the remainder of the table that was not synthesized + :config configuration entries + :from source of the table name + :dataset name of the source dataset + + """ + SQL = ["SELECT * FROM :from "] + SQL_FILTER = [] + NO_FILTERS_FOUND = True + pipeline = Utils.get.config(**args) + REVERSE_QUALIFIER = {'IN':'NOT IN','NOT IN':'IN','=':'<>','<>':'='} + for item in pipeline : + + + if 'filter' in item : + if NO_FILTERS_FOUND : + NO_FILTERS_FOUND = False + SQL += ['WHERE'] + # + # Let us load the filter in the SQL Query + FILTER = item['filter'] + QUALIFIER = REVERSE_QUALIFIER[FILTER['qualifier'].upper()] + SQL_FILTER += [" ".join([FILTER['field'], QUALIFIER,'(',FILTER['value'],')'])] + src = ".".join([args['dataset'],args['from']]) + SQL += [" AND ".join(SQL_FILTER)] + # + # let's pull the field schemas out of the table definition + # + + return " ".join(SQL).replace(":from",src) + + +def mk(**args) : + dataset = args['dataset'] + client = args['client'] if 'client' in args else bq.Client.from_service_account_file(args['private_key']) + # + # let us see if we have a dataset handy here + # + datasets 
= list(client.list_datasets()) + found = [item for item in datasets if item.dataset_id == dataset] + + if not found : + + return client.create_dataset(dataset) + return found[0] + +def move (**args): + """ + This function will move a table from the synthetic dataset into a designated location + This is the simplest case for finalizing a synthetic data set + :private_key + """ + private_key = args['private_key'] + client = bq.Client.from_service_account_json(private_key) + config = Utils.get.config(**args) + dataset = args['dataset'] + SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in config] + SQL += [Utils.get.sql(**args)] + SQL = ('\n UNION ALL \n'.join(SQL).replace(':dataset','io')) + + + # + # At this point we have gathered all the tables in the io folder and we should now see if we need to merge with the remainder from the original table + # + + + + odataset = mk(dataset=dataset+'_io',client=client) + # SQL = "SELECT * FROM io.:context_full_io".replace(':context',context) + config = bq.QueryJobConfig() + config.destination = client.dataset(odataset.dataset_id).table(args['from']) + config.use_query_cache = True + config.allow_large_results = True + config.priority = 'INTERACTIVE' + # + # + + schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema + fields = [" ".join(["CAST (",item.name,"AS",item.field_type.replace("INTEGER","INT64").replace("FLOAT","FLOAT64"),") ",item.name]) for item in schema] + SQL = SQL.replace("*"," , ".join(fields)) + # print (SQL) + out = client.query(SQL,location='US',job_config=config) + print (dir (out)) + + + + +import pandas as pd +import numpy as np +from google.oauth2 import service_account +import json + +# path = '../curation-prod.json' +# credentials = service_account.Credentials.from_service_account_file(path) +# df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard') +f = open('config.json') +config = json.loads(f.read()) +args = config['pipeline'] +f.close() + + +if __name__ == '__main__' : + """ + Usage : + finalize -- --contexts --from + """ + if 'move' in SYS_ARGS : + table = SYS_ARGS['from'] + contexts = [item['context'] for item in config['pipeline'] if item['from'] == args['from']] + args = dict(config,**{"private_key":"../curation-prod.json"}) + args = dict(args,**SYS_ARGS) + args['contexts'] = contexts + move(**args) \ No newline at end of file From 5c9fda4018f9fa37a965e42966d8e6d3b5ba1e49 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 1 Apr 2020 00:25:20 -0500 Subject: [PATCH 082/250] bug fix: CLI parameter handling (wrong reference) --- finalize.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/finalize.py b/finalize.py index a375b37..fd50d1d 100644 --- a/finalize.py +++ b/finalize.py @@ -139,7 +139,8 @@ import json # path = '../curation-prod.json' # credentials = service_account.Credentials.from_service_account_file(path) # df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard') -f = open('config.json') +filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config'] +f = open(filename) config = json.loads(f.read()) args = config['pipeline'] f.close() @@ -152,8 +153,10 @@ if __name__ == '__main__' : """ if 'move' in SYS_ARGS : table = SYS_ARGS['from'] - contexts = [item['context'] for item in config['pipeline'] if item['from'] == args['from']] + contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']] args = 
dict(config,**{"private_key":"../curation-prod.json"}) args = dict(args,**SYS_ARGS) args['contexts'] = contexts - move(**args) \ No newline at end of file + move(**args) + else: + print ("NOT YET READY !") \ No newline at end of file From debbd48627ff0dfb001d0bc083ec340853815a0b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 1 Apr 2020 00:53:56 -0500 Subject: [PATCH 083/250] bug fix: batch size per GPU --- pipeline.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index 5af9550..9a6b8aa 100644 --- a/pipeline.py +++ b/pipeline.py @@ -277,9 +277,8 @@ if __name__ == '__main__' : args[key] = _config[key] args = dict(args,**SYS_ARGS) - - - args['batch_size'] = 2000 if 'batch_size' not in args else int(args['batch_size']) + if 'batch_size' not in args : + args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size']) if 'dataset' not in args : args['dataset'] = 'combined20191004v2_deid' PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 From 87d54c508dd51a4929a593ef76dd723ee8396b60 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 1 Apr 2020 02:36:22 -0500 Subject: [PATCH 084/250] bug fix: with filter dataset --- finalize.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/finalize.py b/finalize.py index fd50d1d..5310e6d 100644 --- a/finalize.py +++ b/finalize.py @@ -66,7 +66,7 @@ class Utils : # Let us load the filter in the SQL Query FILTER = item['filter'] QUALIFIER = REVERSE_QUALIFIER[FILTER['qualifier'].upper()] - SQL_FILTER += [" ".join([FILTER['field'], QUALIFIER,'(',FILTER['value'],')'])] + SQL_FILTER += [" ".join([FILTER['field'], QUALIFIER,'(',FILTER['value'],')']).replace(":dataset",args['dataset'])] src = ".".join([args['dataset'],args['from']]) SQL += [" AND ".join(SQL_FILTER)] # @@ -126,7 +126,8 @@ def move (**args): SQL = SQL.replace("*"," , ".join(fields)) # print (SQL) out = client.query(SQL,location='US',job_config=config) - print (dir (out)) + print () + print (out.job_id) From 6d84b25d956dc90f0c8e6fdfec1f253e3adb98b9 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 2 Apr 2020 00:04:05 -0500 Subject: [PATCH 085/250] bug fix: multiple conditions on statement --- data/gan.py | 6 ++-- pipeline.py | 84 ++++++++++++++++++++++------------------------------- 2 files changed, 38 insertions(+), 52 deletions(-) diff --git a/data/gan.py b/data/gan.py index c54f5bd..a46740a 100644 --- a/data/gan.py +++ b/data/gan.py @@ -536,10 +536,10 @@ class Predict(GNet): self.values = args['values'] self.ROW_COUNT = args['row_count'] self.oROW_COUNT = self.ROW_COUNT - if args['no_value'] in ['na','','NA'] : - self.MISSING_VALUES = np.nan - else : + self.MISSING_VALUES = np.nan + if 'no_value' in args and args['no_value'] not in ['na','','NA'] : self.MISSING_VALUES = args['no_value'] + # self.MISSING_VALUES = args['no_value'] # self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value'] def load_meta(self, column): diff --git a/pipeline.py b/pipeline.py index 9a6b8aa..acb4f6c 100644 --- a/pipeline.py +++ b/pipeline.py @@ -20,7 +20,12 @@ class Components : class KEYS : PIPELINE_KEY = 'pipeline' SQL_FILTER = 'filter' - + @staticmethod + def get_filter (**args): + if args['qualifier'] == 'IN' : + return ' '.join([args['field'],args['qualifier'],'(',args['value'],')']) + else: + return ' '.join([args['field'],args['qualifier'],args['value']]) @staticmethod def get_logger(**args) : return 
factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) @@ -34,8 +39,11 @@ class Components : """ SQL = args['sql'] if Components.KEYS.SQL_FILTER in args : - SQL_FILTER = Components.KEYS.SQL_FILTER - condition = ' '.join([args[SQL_FILTER]['field'],args[SQL_FILTER]['qualifier'],'(',args[SQL_FILTER]['value'],')']) + FILTER_KEY = Components.KEYS.SQL_FILTER + SQL_FILTER = args[FILTER_KEY] if type(args[FILTER_KEY]) == list else [args[FILTER_KEY]] + # condition = ' '.join([args[FILTER_KEY]['field'],args[FILTER_KEY]['qualifier'],'(',args[FILTER_KEY]['value'],')']) + + condition = ' AND '.join([Components.get_filter(**item) for item in SQL_FILTER]) SQL = " ".join([SQL,'WHERE',condition]) SQL = SQL.replace(':dataset',args['dataset']) #+ " LI " @@ -76,13 +84,6 @@ class Components : logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) log_folder = args['logs'] if 'logs' in args else 'logs' - # _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} - - # _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) - # _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 - # _args['gpu'] = args['gpu'] if 'gpu' in args else 0 - - # # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 PART_SIZE = int(args['part_size']) partition = args['partition'] @@ -156,16 +157,22 @@ class Components : # columns = args['columns'] # df = np.array_split(df[columns].values,PART_SIZE) # df = pd.DataFrame(df[ int (partition) ],columns = columns) - info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"part_size":int(PART_SIZE)} + # max_rows = int(args['partition_max_rows']) if 'partition_max_rows' in args else 1000000 + # N = np.divide(df.shape[0],max_rows).astype(int) + 1 + info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"part_size":int(PART_SIZE),"partition-info":{"count":int(N),"max_rows":max_rows}} logger.write({"module":"generate","action":"partition","input":info}) _args['partition'] = int(partition) _args['continuous']= args['continuous'] if 'continuous' in args else [] - _args['data'] = df - # _args['data'] = reader() - #_args['data'] = _args['data'].astype(object) - # _args['num_gpu'] = 1 + # + # How many rows sub-partition must we divide this into ? + # -- Let us tray assessing - _dc = data.maker.generate(**_args) + + df = np.array_split(df,N) + _dc = pd.DataFrame() + # for mdf in df : + _args['data'] = df + _dc = _dc.append(data.maker.generate(**_args)) # # We need to post the generate the data in order to : # 1. 
compare immediately @@ -180,35 +187,13 @@ class Components : # info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} x = {} - # for name in args['columns'] : - # ident = data_comp.apply(lambda row: 1*(row[name]==row[name+'_io']),axis=1).sum() - # count = data_comp[name].unique().size - # _ident= data_comp.shape[1] - ident - # _count= data_comp[name+'_io'].unique().size - # _count= len(set(data_comp[name+'_io'].values.tolist())) - - # info['input']['logs'] += [{"name":name,"identical":int(ident),"no_identical":int(_ident),"original_count":count,"synthetic_count":_count}] - # for name in data_comp.columns.tolist() : - # g = pd.DataFrame(data_comp.groupby([name]).size()) - # g.columns = ['counts'] - # g[name] = g.index.tolist() - # g.index = np.arange(g.shape[0]) - # logs.append({"name":name,"counts": g.to_dict(orient='records')}) - # info['input']['logs'] = logs - # logger.write(info) - + # + # @TODO: Send data over to a process for analytics base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) cols = _dc.columns.tolist() for name in cols : _args['data'][name] = _dc[name] - # info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}} - # if partition != '' : - # info['partition'] = int(partition) - # logger.write(info) - - # filename = os.sep.join([log_folder,'output',name+'.csv']) - # data_comp[[name]].to_csv(filename,index=False) # #-- Let us store all of this into bigquery @@ -265,7 +250,7 @@ if __name__ == '__main__' : f = [i for i in range(0,N) if PIPELINE[i]['context'] == index] index = f[0] if f else 0 # - # print + print ("..::: ",PIPELINE[index]['context']) args = (PIPELINE[index]) for key in _config : @@ -274,8 +259,8 @@ if __name__ == '__main__' : # skip in case of pipeline or if key exists in the selected pipeline (provided by index) # continue - args[key] = _config[key] + args = dict(args,**SYS_ARGS) if 'batch_size' not in args : args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size']) @@ -286,13 +271,13 @@ if __name__ == '__main__' : # @TODO: # Log what was initiated so we have context of this processing ... 
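
# For orientation, an illustrative (hypothetical) shape of the config.json file
# this __main__ block expects: global keys merged into the selected pipeline
# entry, and a "pipeline" list addressed by --index (numeric position or context
# name). The values below are made up; only the key names come from the code in
# this patch series.
EXAMPLE_CONFIG = {
    "dataset": "combined20191004v2_deid",
    "logs": "logs",
    "batch_size": 2000,
    "part_size": 8,
    "notify": "io",
    "pipeline": [
        {
            "context": "icd10",
            "sql": "SELECT * FROM :dataset.observation",
            "from": "observation",
            "columns": ["value_as_string"],
            "filter": {"field": "observation_source_concept_id", "qualifier": "IN", "value": "1585250"},
            "max_epochs": 150
        }
    ]
}
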
# - if 'listen' not in SYS_ARGS : - if 'file' in args : - DATA = pd.read_csv(args['file']) ; - else: - DATA = Components().get(args) - COLUMNS = DATA.columns - DATA = np.array_split(DATA,PART_SIZE) + # if 'listen' not in SYS_ARGS : + if 'file' in args : + DATA = pd.read_csv(args['file']) ; + else: + DATA = Components().get(args) + COLUMNS = DATA.columns + DATA = np.array_split(DATA,PART_SIZE) if 'generate' in SYS_ARGS : # @@ -325,6 +310,7 @@ if __name__ == '__main__' : args['gpu'] = index else: args['gpu']=0 + make = lambda _args: (Components()).generate(_args) job = Process(target=make,args=(args,)) job.name = 'generator # '+str(index) From 8418208da013fc9cd6b204fd636f3d60b82e70b8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 2 Apr 2020 01:36:57 -0500 Subject: [PATCH 086/250] bug fix: max_rows missing --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index acb4f6c..78ffefe 100644 --- a/pipeline.py +++ b/pipeline.py @@ -159,7 +159,7 @@ class Components : # df = pd.DataFrame(df[ int (partition) ],columns = columns) # max_rows = int(args['partition_max_rows']) if 'partition_max_rows' in args else 1000000 # N = np.divide(df.shape[0],max_rows).astype(int) + 1 - info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"part_size":int(PART_SIZE),"partition-info":{"count":int(N),"max_rows":max_rows}} + info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"part_size":int(PART_SIZE)} logger.write({"module":"generate","action":"partition","input":info}) _args['partition'] = int(partition) _args['continuous']= args['continuous'] if 'continuous' in args else [] From 00e640df21cc780f3155e1d2a8064a4cf49ec953 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 2 Apr 2020 07:52:09 -0500 Subject: [PATCH 087/250] bug fix: @TODO: split partition data so we always process a decent sized data --- pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 78ffefe..4985156 100644 --- a/pipeline.py +++ b/pipeline.py @@ -152,6 +152,7 @@ class Components : # reader = args['reader'] # df = reader() df = args['reader']() if 'reader' in args else args['data'] + # bounds = Components.split(df,MAX_ROWS,PART_SIZE) # if partition != '' : # columns = args['columns'] @@ -168,7 +169,7 @@ class Components : # -- Let us tray assessing - df = np.array_split(df,N) + _dc = pd.DataFrame() # for mdf in df : _args['data'] = df From c758e840046f085a3ac5a53a0793f292c9bdd2b6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 9 Apr 2020 10:51:11 -0500 Subject: [PATCH 088/250] setup finalize --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 1c8aef0..939c241 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.6","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.7","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 
'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' @@ -12,5 +12,5 @@ args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' if sys.version_info[0] == 2 : args['use_2to3'] = False args['use_2to3_exclude_fixers'] = ['lib2to3.fixes.fix_import'] -args['scripts']=['pipeline.py'] +args['scripts']=['pipeline.py','finalize.py'] setup(**args) From e78d72af2107d462d42d01b1d44e97011cfb52ad Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 9 Apr 2020 10:52:48 -0500 Subject: [PATCH 089/250] setup finalize --- finalize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/finalize.py b/finalize.py index 5310e6d..b163b0d 100644 --- a/finalize.py +++ b/finalize.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """ This file will perform basic tasks to finalize the GAN process by performing the following : - basic stats & analytics From ba1f38770d317c6eb38a870debd543e431ad82b9 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 9 Apr 2020 12:42:29 -0500 Subject: [PATCH 090/250] bug fix ... --- pipeline.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 4985156..2b5f028 100644 --- a/pipeline.py +++ b/pipeline.py @@ -166,8 +166,11 @@ class Components : _args['continuous']= args['continuous'] if 'continuous' in args else [] # # How many rows sub-partition must we divide this into ? - # -- Let us tray assessing - + # let us fix the data types here every _id field will be an np.int64... + # + for name in df.columns.tolist(): + if name.endwith('_id') : + df[name] = df[name].astype(np.int64) _dc = pd.DataFrame() From 944a3edbf6b718ed1d256c59fd777be46e56d1f3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 9 Apr 2020 12:57:25 -0500 Subject: [PATCH 091/250] bug fix ... with data-typing in data-frame --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 2b5f028..4cac273 100644 --- a/pipeline.py +++ b/pipeline.py @@ -169,7 +169,7 @@ class Components : # let us fix the data types here every _id field will be an np.int64... # for name in df.columns.tolist(): - if name.endwith('_id') : + if name.endswith('_id') : df[name] = df[name].astype(np.int64) From e4b164a34b142212379692172ae18dcf9e802af3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 9 Apr 2020 19:21:25 -0500 Subject: [PATCH 092/250] bug fix with typing --- pipeline.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 4cac273..3950942 100644 --- a/pipeline.py +++ b/pipeline.py @@ -168,9 +168,14 @@ class Components : # How many rows sub-partition must we divide this into ? # let us fix the data types here every _id field will be an np.int64... 
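
# A minimal sketch of the sub-partitioning idea raised in the comment above (and
# in the earlier "@TODO: split partition data" note): cap each call to
# data.maker.generate at max_rows rows and stitch the generated pieces back
# together. max_rows and the surrounding _args dictionary are assumptions for
# illustration, not part of the patch itself.
import numpy as np
import pandas as pd
import data.maker

def generate_in_chunks(df, _args, max_rows=1000000):
    n_chunks = int(np.ceil(df.shape[0] / max_rows))
    pieces = []
    for chunk in np.array_split(df, max(n_chunks, 1)):
        _args['data'] = chunk
        pieces.append(data.maker.generate(**_args))
    return pd.concat(pieces, ignore_index=True)
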
# + for name in df.columns.tolist(): + if name.endswith('_id') : - df[name] = df[name].astype(np.int64) + if df[name].isnull().sum() > 0 : + df[name].fillna(0,inplace=True) + else: + df[name] = df[name].astype(np.int64) _dc = pd.DataFrame() From 0016aec576004c562b9901cfd8478f850360bec2 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 11 Apr 2020 13:10:01 -0500 Subject: [PATCH 093/250] adding autopilot/drone mode for training and automatic data generation --- finalize.py | 38 ++++++++++++++++++++++++++++---------- pipeline.py | 3 +++ setup.py | 2 +- 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/finalize.py b/finalize.py index b163b0d..079830d 100644 --- a/finalize.py +++ b/finalize.py @@ -101,10 +101,15 @@ def move (**args): client = bq.Client.from_service_account_json(private_key) config = Utils.get.config(**args) dataset = args['dataset'] - SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in config] - SQL += [Utils.get.sql(**args)] - SQL = ('\n UNION ALL \n'.join(SQL).replace(':dataset','io')) - + if 'contexts' in args : + SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in config] + SQL += [Utils.get.sql(**args)] + SQL = ('\n UNION ALL \n'.join(SQL).replace(':dataset','io')) + else: + # + # moving a table to a designated location + tablename = args['from'] + SQL = "SELECT * FROM :dataset.:table".replace(":dataset",dataset).replace(":table",tablename) # # At this point we have gathered all the tables in the io folder and we should now see if we need to merge with the remainder from the original table @@ -128,7 +133,7 @@ def move (**args): # print (SQL) out = client.query(SQL,location='US',job_config=config) print () - print (out.job_id) + return (out.job_id) @@ -154,11 +159,24 @@ if __name__ == '__main__' : finalize -- --contexts --from
""" if 'move' in SYS_ARGS : - table = SYS_ARGS['from'] + # table = SYS_ARGS['from'] + # args = dict(config,**{"private_key":"../curation-prod.json"}) + args = dict(args,**SYS_ARGS) contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']] - args = dict(config,**{"private_key":"../curation-prod.json"}) - args = dict(args,**SYS_ARGS) - args['contexts'] = contexts - move(**args) + log = [] + if contexts : + args['contexts'] = contexts + log = move(**args) + + else: + tables = args['from'].split(',') + for name in tables : + name = name.strip() + args['from'] = name + log += [move(**args)] + print ("\n".join(log)) + + + else: print ("NOT YET READY !") \ No newline at end of file diff --git a/pipeline.py b/pipeline.py index 3950942..e54e746 100644 --- a/pipeline.py +++ b/pipeline.py @@ -117,6 +117,9 @@ class Components : logger.write({"module":"train","action":"train","input":info}) data.maker.train(**_args) + if set(['drone','autopilot']) in set( list(args.keys())) : + print (['drone mode enabled ....']) + data.maker.generate(**args) pass diff --git a/setup.py b/setup.py index 939c241..71e14e0 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.7","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 0d0ebee9c0f9e340e047984340994f414931ecad Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 12 Apr 2020 04:50:54 -0500 Subject: [PATCH 094/250] bug fix with autopilot/drone mode --- pipeline.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index e54e746..22c637d 100644 --- a/pipeline.py +++ b/pipeline.py @@ -74,6 +74,13 @@ class Components : # pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) df = args['data'] + if 'slice' in args and 'max_rows' in args['slice']: + max_rows = args['slice']['max_rows'] + if df.shape[0] > max_rows : + print (".. 
slicing ") + i = np.random.choice(df.shape[0],max_rows,replace=False) + df = df.iloc[i] + # if df.shape[0] == 0 : # print ("CAN NOT TRAIN EMPTY DATASET ") @@ -117,9 +124,10 @@ class Components : logger.write({"module":"train","action":"train","input":info}) data.maker.train(**_args) + if set(['drone','autopilot']) in set( list(args.keys())) : print (['drone mode enabled ....']) - data.maker.generate(**args) + self.generate(**args) pass @@ -155,6 +163,7 @@ class Components : # reader = args['reader'] # df = reader() df = args['reader']() if 'reader' in args else args['data'] + # bounds = Components.split(df,MAX_ROWS,PART_SIZE) # if partition != '' : From 65a1fadfcaeefc67df9b84ab5053103a72f218c6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 12 Apr 2020 20:07:15 -0500 Subject: [PATCH 095/250] bug fix data type and pipeline --- data/gan.py | 3 ++- pipeline.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/data/gan.py b/data/gan.py index a46740a..5975255 100644 --- a/data/gan.py +++ b/data/gan.py @@ -592,7 +592,8 @@ class Predict(GNet): # The code below will insure we have some acceptable cardinal relationships between id and synthetic values # - df = pd.DataFrame(np.round(f)).astype(np.int32) + # df = pd.DataFrame(np.round(f)).astype(np.int32) + df = pd.DataFrame(np.round(f),dtype=np.int32) p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values diff --git a/pipeline.py b/pipeline.py index 22c637d..c243ec3 100644 --- a/pipeline.py +++ b/pipeline.py @@ -125,9 +125,9 @@ class Components : logger.write({"module":"train","action":"train","input":info}) data.maker.train(**_args) - if set(['drone','autopilot']) in set( list(args.keys())) : + if 'autopilot' in ( list(args.keys())) : print (['drone mode enabled ....']) - self.generate(**args) + self.generate(args) pass From 310d599d06ab74bccc745b1e66993b9efbae2ead Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 13 Apr 2020 01:30:59 -0500 Subject: [PATCH 096/250] bug fix: volume of data --- data/gan.py | 2 +- pipeline.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index 5975255..4a0fa48 100644 --- a/data/gan.py +++ b/data/gan.py @@ -593,7 +593,7 @@ class Predict(GNet): # # df = pd.DataFrame(np.round(f)).astype(np.int32) - df = pd.DataFrame(np.round(f),dtype=np.int32) + df = pd.DataFrame(np.round(f),dtype=int) p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values diff --git a/pipeline.py b/pipeline.py index c243ec3..7017592 100644 --- a/pipeline.py +++ b/pipeline.py @@ -163,6 +163,13 @@ class Components : # reader = args['reader'] # df = reader() df = args['reader']() if 'reader' in args else args['data'] + + if 'slice' in args and 'max_rows' in args['slice']: + max_rows = args['slice']['max_rows'] + if df.shape[0] > max_rows : + print (".. 
slicing ") + i = np.random.choice(df.shape[0],max_rows,replace=False) + df = df.iloc[i] # bounds = Components.split(df,MAX_ROWS,PART_SIZE) From e27624b697cbccb742c4adcb8f84d60e0e944594 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 13 Apr 2020 10:22:32 -0500 Subject: [PATCH 097/250] bug fix: volume of data --- data/gan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index 4a0fa48..ff51aa8 100644 --- a/data/gan.py +++ b/data/gan.py @@ -593,7 +593,7 @@ class Predict(GNet): # # df = pd.DataFrame(np.round(f)).astype(np.int32) - df = pd.DataFrame(np.round(f),dtype=int) + df = pd.DataFrame(np.round(f),dtype=np.uint8) p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values From 52e91ec0631a1cf31a35472cb0b2a294c18bfc1e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 00:18:23 -0500 Subject: [PATCH 098/250] bug fix ... --- data/gan.py | 4 +++- pipeline.py | 22 ++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/data/gan.py b/data/gan.py index ff51aa8..5559e4d 100644 --- a/data/gan.py +++ b/data/gan.py @@ -593,7 +593,7 @@ class Predict(GNet): # # df = pd.DataFrame(np.round(f)).astype(np.int32) - df = pd.DataFrame(np.round(f),dtype=np.uint8) + df = pd.DataFrame(np.round(f),dtype=int) p = 0 not in df.sum(axis=1).values x = df.sum(axis=1).values @@ -637,6 +637,8 @@ class Predict(GNet): if self.logger : info = {"found":len(found),"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)} + if df.shape[1] > len(self.values) : + df = df.iloc[:len(self.values)] if INDEX > 0 : info =dict(info ,**{"selected":INDEX, "ratio": ratio[INDEX] }) else : diff --git a/pipeline.py b/pipeline.py index 7017592..12746fa 100644 --- a/pipeline.py +++ b/pipeline.py @@ -82,6 +82,9 @@ class Components : df = df.iloc[i] + # + # Certain columns need to be removed too large of a matrix + # # if df.shape[0] == 0 : # print ("CAN NOT TRAIN EMPTY DATASET ") # return @@ -130,7 +133,7 @@ class Components : self.generate(args) pass - + # @staticmethod def generate(self,args): """ @@ -171,7 +174,7 @@ class Components : i = np.random.choice(df.shape[0],max_rows,replace=False) df = df.iloc[i] - + # bounds = Components.split(df,MAX_ROWS,PART_SIZE) # if partition != '' : # columns = args['columns'] @@ -194,13 +197,15 @@ class Components : if df[name].isnull().sum() > 0 : df[name].fillna(0,inplace=True) else: - df[name] = df[name].astype(np.int64) + df[name] = df[name].astype(int) _dc = pd.DataFrame() # for mdf in df : - _args['data'] = df + _args['data'] = df + _dc = _dc.append(data.maker.generate(**_args)) + # # We need to post the generate the data in order to : # 1. 
compare immediately @@ -356,14 +361,7 @@ if __name__ == '__main__' : else: generator.generate(args) # Components.generate(args) - elif 'finalize' in args : - # - # This will finalize a given set of synthetic operations into a table - # - idataset = args['input'] if 'input' in args else 'io' #-- input dataset - odataset = args['output'] #-- output dataset - labels = [name.strip() for name in args['labels'].split(',') ] - + else: # DATA = np.array_split(DATA,PART_SIZE) From 50da9098679984fd28314037c6c75c9da95f0430 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 00:31:54 -0500 Subject: [PATCH 099/250] bug fix: no value data-type np.nan_to_num --- data/gan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index 5559e4d..5fc7032 100644 --- a/data/gan.py +++ b/data/gan.py @@ -536,7 +536,7 @@ class Predict(GNet): self.values = args['values'] self.ROW_COUNT = args['row_count'] self.oROW_COUNT = self.ROW_COUNT - self.MISSING_VALUES = np.nan + self.MISSING_VALUES = np.nan_to_num(np.nan) if 'no_value' in args and args['no_value'] not in ['na','','NA'] : self.MISSING_VALUES = args['no_value'] From 821cec8dd77be3843503fdb788883fd9ee38a614 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 01:54:11 -0500 Subject: [PATCH 100/250] fixed issue around data-types/casting misbehavior with pandas and missing values --- data/gan.py | 11 ++++------- data/maker/__init__.py | 7 ++++++- pipeline.py | 5 +++-- setup.py | 2 +- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/data/gan.py b/data/gan.py index 5fc7032..8a0c7a7 100644 --- a/data/gan.py +++ b/data/gan.py @@ -647,13 +647,8 @@ class Predict(GNet): info['ratio'] = __ratio info['partition'] = self.PARTITION self.logger.write({"module":"gan-generate","action":"generate","input":info}) - df.columns = self.values - if len(found) or df.columns.size == len(self.values): - # print (len(found),NTH_VALID_CANDIDATE) - # x = df * self.values - # - # let's get the missing rows (if any) ... 
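The hunk being reworked here decodes the generator's one-hot candidate matrix back into values and back-fills the rows that came out empty. A toy version of that step, with an invented value space and a fake candidate matrix:

    import numpy as np
    import pandas as pd

    candidate = pd.DataFrame([[0, 1, 0], [0, 0, 0], [1, 0, 0]])   # fake generator output
    values = np.array(["female", "male", "unknown"])              # assumed value space

    blank = candidate.apply(lambda row: np.sum(row) == 0, axis=1)
    decoded = candidate[~blank].apply(
        lambda row: values[np.random.choice(np.where(row != 0)[0], 1)[0]], axis=1)
    # rows the generator left empty are filled by sampling from the same value space
    decoded = pd.concat([decoded, pd.Series(np.random.choice(values, blank.sum()))],
                        ignore_index=True)
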
- # + # df.columns = self.values + if len(found) or df.columns.size <= len(self.values): ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1) # print ([' **** ',ii.sum()]) @@ -669,6 +664,8 @@ class Predict(GNet): # Log the findings here in terms of ratio, missing, candidate count # print ([np.max(ratio),len(missing),len(found),i]) i = np.where(ii == 0)[0] + + df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) df.columns = columns df = df[columns[0]].append(pd.Series(missing)) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 378c226..25392f9 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -190,7 +190,7 @@ def generate(**args): # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value'] - + _df = df.copy() for col in column : args['context'] = col @@ -237,6 +237,11 @@ def generate(**args): _df[col] = r[col] # + # Let's cast the type to the original type (it makes the data more usable) + # + otype = df[col].dtype + _df[col] = _df[col].astype(otype) + # # @TODO: log basic stats about the synthetic attribute # # print (r)s diff --git a/pipeline.py b/pipeline.py index 12746fa..c678a89 100644 --- a/pipeline.py +++ b/pipeline.py @@ -195,8 +195,7 @@ class Components : if name.endswith('_id') : if df[name].isnull().sum() > 0 : - df[name].fillna(0,inplace=True) - else: + df[name].fillna(np.nan_to_num(np.nan),inplace=True) df[name] = df[name].astype(int) @@ -253,9 +252,11 @@ class Components : print (_args['data'].head()) else: Components.lock.acquire() + data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' + print (_args['data'].dtypes) _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) Components.lock.release() _id = 'dataset' diff --git a/setup.py b/setup.py index 71e14e0..207cb6f 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 3dde3bf4ef6eb14d8f094ec6561256d5dcb0001b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 07:26:24 -0500 Subject: [PATCH 101/250] fixed issue around data-types/casting misbehavior with pandas and missing values --- pipeline.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index c678a89..80fed9e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -133,7 +133,7 @@ class Components : self.generate(args) pass - + # @staticmethod def generate(self,args): """ @@ -168,11 +168,13 @@ class Components : df = args['reader']() if 'reader' in args else args['data'] if 'slice' in args and 'max_rows' in 
args['slice']: + max_rows = args['slice']['max_rows'] if df.shape[0] > max_rows : print (".. slicing ") i = np.random.choice(df.shape[0],max_rows,replace=False) df = df.iloc[i] + # bounds = Components.split(df,MAX_ROWS,PART_SIZE) @@ -182,7 +184,7 @@ class Components : # df = pd.DataFrame(df[ int (partition) ],columns = columns) # max_rows = int(args['partition_max_rows']) if 'partition_max_rows' in args else 1000000 # N = np.divide(df.shape[0],max_rows).astype(int) + 1 - info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"part_size":int(PART_SIZE)} + info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"space":df[args['columns'][0]].unique().size, "part_size":int(PART_SIZE)} logger.write({"module":"generate","action":"partition","input":info}) _args['partition'] = int(partition) _args['continuous']= args['continuous'] if 'continuous' in args else [] @@ -256,7 +258,7 @@ class Components : data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' - print (_args['data'].dtypes) + _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) Components.lock.release() _id = 'dataset' From f1076f441b712e860feb1b7a5ce0e16489c9b02d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 15:14:38 -0500 Subject: [PATCH 102/250] limitations on the matrix shape (feature space limitation) per partition --- data/bridge.py | 109 ++++++++++++++++++++++++++--------------- data/maker/__init__.py | 11 +++-- 2 files changed, 77 insertions(+), 43 deletions(-) diff --git a/data/bridge.py b/data/bridge.py index 019f065..41c0429 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -153,7 +153,7 @@ class Binary : """ This is a utility class to import and export a data to/from a binary matrix """ - def __stream(self,column) : + def __stream(self,column,size=-1) : """ This function will convert a column into a binary matrix with the value-space representing each column of the resulting matrix :column a column vector i.e every item is a row @@ -162,12 +162,19 @@ class Binary : values = column.dropna().unique() values.sort() + column = column.values # # Let's treat the case of missing values i.e nulls # row_count,col_count = column.size,values.size + if row_count * col_count > size and row_count < size: + N = np.divide(size,row_count).astype(int) + i = np.random.choice(col_count,N) + values = values[-i] + col_count = N + - matrix = [ np.zeros(col_count) for i in np.arange(row_count)] + matrix = [ np.zeros(col_count,dtype=np.float32) for i in np.arange(row_count)] # # let's create a binary matrix of the feature that was passed in # The indices of the matrix are inspired by classical x,y axis @@ -176,14 +183,31 @@ class Binary : for yi in np.arange(row_count) : value = column[yi] - if value not in values : - continue - xi = np.where(values == value) - xi = xi[0][0] #-- column index - matrix[yi][xi] = 1 + # if value not in values : + # continue + xi = np.where(values == value) + if xi and xi[0].size > 0: + xi = xi[0][0] #-- column index + matrix[yi][xi] = 1 + + return pd.DataFrame(matrix,columns=values) + def apply(self,column,size): + return self.__stream(column,size) + def get_column_values(self,column,size=-1): + values = column.dropna().unique() + values.sort() - return matrix - def Export(self,df) : + # + # Let's treat the case of 
missing values i.e nulls + # + row_count,col_count = column.size,values.size + if row_count * col_count > size and row_count < size: + N = np.divide(size,row_count).astype(int) + i = np.random.choice(col_count,N) + values = values[-i] + return values + + def _Export(self,df) : """ This function will convert a data-frame to a binary matrix :return _map,matrix @@ -192,8 +216,9 @@ class Binary : # This will give us a map of how each column was mapped to a bitstream # _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0) - _map = df.fillna('').apply(lambda column: self.__stream(column),axis=0) + # _map = df.fillna(np.nan).apply(lambda column: column,axis=0) + print (df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0)) # # We will merge this to have a healthy matrix _matrix = _map.apply(lambda row: list(list(itertools.chain(*row.values.tolist()))),axis=1) @@ -239,37 +264,41 @@ if __name__ == '__main__' : --pseudo will create pseudonyms for a given --export will export data to a specified location """ - has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys() - has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys() - if has_basic and has_action : - builder = Builder() - if 'export' in SYS_ARGS : - print () - print ("exporting ....") - if not os.path.exists(SYS_ARGS['export']) : - os.mkdir(SYS_ARGS['export']) - SQL = builder.encode(**SYS_ARGS) - # - # Assuming the user wants to filter the records returned : - # + df = pd.read_csv('sample.csv') + print ( pd.get_dummies(df.race)) + print ( (Binary()).apply(df.race, 30)) + + # has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys() + # has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys() + # if has_basic and has_action : + # builder = Builder() + # if 'export' in SYS_ARGS : + # print () + # print ("exporting ....") + # if not os.path.exists(SYS_ARGS['export']) : + # os.mkdir(SYS_ARGS['export']) + # SQL = builder.encode(**SYS_ARGS) + # # + # # Assuming the user wants to filter the records returned : + # # - credentials = service_account.Credentials.from_service_account_file(SYS_ARGS['key']) - df = pd.read_gbq(SQL,credentials =credentials,dialect='standard') - FILENAME = os.sep.join([SYS_ARGS['export'],SYS_ARGS['table']+'.csv']) - # - # This would allow us to export it to wherever we see fit - print (FILENAME) - df.to_csv(FILENAME,index=False) - f = open(FILENAME.replace('.csv','.sql'),'w+') - f.write(SQL) - f.close() - elif 'pseudo' in SYS_ARGS : - builder.process(**SYS_ARGS) - else: - print ("") - print (SYS_ARGS.keys()) - print ("has basic ",has_basic) - print ("has action ",has_action) + # credentials = service_account.Credentials.from_service_account_file(SYS_ARGS['key']) + # df = pd.read_gbq(SQL,credentials =credentials,dialect='standard') + # FILENAME = os.sep.join([SYS_ARGS['export'],SYS_ARGS['table']+'.csv']) + # # + # # This would allow us to export it to wherever we see fit + # print (FILENAME) + # df.to_csv(FILENAME,index=False) + # f = open(FILENAME.replace('.csv','.sql'),'w+') + # f.write(SQL) + # f.close() + # elif 'pseudo' in SYS_ARGS : + # builder.process(**SYS_ARGS) + # else: + # print ("") + # print (SYS_ARGS.keys()) + # print ("has basic ",has_basic) + # print ("has action ",has_action) # pseudonym.apply(table='person',dataset='wgan_original',key='./curation-test-2.json') # args = {"dataset":"wgan_original","table":"observation","key":"./curation-test-2.json"} # builder = 
Builder() diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 25392f9..072b2f2 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -135,7 +135,9 @@ def train (**args) : # df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False) # print (df[col].dtypes) # print (df[col].dropna/(axis=1).unique()) - args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values + # args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values + msize = args['matrix_size'] if 'matrix_size' in args else -1 + args['real'] = (Binary()).apply(df[col],msize) @@ -190,7 +192,7 @@ def generate(**args): # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value'] - + bhandler = Binary() _df = df.copy() for col in column : args['context'] = col @@ -207,7 +209,10 @@ def generate(**args): # values = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32).T # else: - values = df[col].dropna().unique().tolist() + # values = df[col].dropna().unique().tolist() + msize = args['matrix_size'] if 'matrix_size' in args else -1 + values = bhandler.get_column_values(df[col]) + From f91a58e534417f1826dce701d6fa2ae30d43f4ca Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 15:52:55 -0500 Subject: [PATCH 103/250] limitations on the matrix shape (feature space limitation) per partition --- data/bridge.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/data/bridge.py b/data/bridge.py index 41c0429..ac79272 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -167,9 +167,11 @@ class Binary : # Let's treat the case of missing values i.e nulls # row_count,col_count = column.size,values.size - if row_count * col_count > size and row_count < size: - N = np.divide(size,row_count).astype(int) - i = np.random.choice(col_count,N) + # if row_count * col_count > size and row_count < size: + if col_count > size : + # N = np.divide(size,row_count).astype(int) + # N = + i = np.random.choice(col_count,size) values = values[-i] col_count = N From ed86ff0add1e177b0b2f54139eef932cd3da1d7b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 15:54:11 -0500 Subject: [PATCH 104/250] limitations on the matrix shape (feature space limitation) per partition --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 207cb6f..44a59b1 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.3.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 1cf9c6e47ab608b2d067980e33663a622f8e1f6e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 16:01:23 -0500 Subject: [PATCH 105/250] bug fix ... 
forgot to update a redundancy --- data/bridge.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/data/bridge.py b/data/bridge.py index ac79272..a86deef 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -174,6 +174,7 @@ class Binary : i = np.random.choice(col_count,size) values = values[-i] col_count = N + matrix = [ np.zeros(col_count,dtype=np.float32) for i in np.arange(row_count)] @@ -203,10 +204,12 @@ class Binary : # Let's treat the case of missing values i.e nulls # row_count,col_count = column.size,values.size - if row_count * col_count > size and row_count < size: - N = np.divide(size,row_count).astype(int) - i = np.random.choice(col_count,N) + if col_count > size : + # N = np.divide(size,row_count).astype(int) + # N = + i = np.random.choice(col_count,size) values = values[-i] + col_count = N return values def _Export(self,df) : From 8f390931f33bc462f6b57603de65c9d604b6ed54 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 16:24:02 -0500 Subject: [PATCH 106/250] bug fix: matrix space restriction --- data/bridge.py | 6 +++--- data/maker/__init__.py | 4 ++-- pipeline.py | 24 +++++++----------------- 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/data/bridge.py b/data/bridge.py index a86deef..2e38431 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -173,7 +173,7 @@ class Binary : # N = i = np.random.choice(col_count,size) values = values[-i] - col_count = N + col_count = size @@ -209,7 +209,7 @@ class Binary : # N = i = np.random.choice(col_count,size) values = values[-i] - col_count = N + col_count = size return values def _Export(self,df) : @@ -271,7 +271,7 @@ if __name__ == '__main__' : """ df = pd.read_csv('sample.csv') print ( pd.get_dummies(df.race)) - print ( (Binary()).apply(df.race, 30)) + print ( (Binary()).apply(df.race, 2)) # has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys() # has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys() diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 072b2f2..78bc08d 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -136,7 +136,7 @@ def train (**args) : # print (df[col].dtypes) # print (df[col].dropna/(axis=1).unique()) # args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values - msize = args['matrix_size'] if 'matrix_size' in args else -1 + msize = args['matrix_size'] if 'matrix_size' in args else 128 args['real'] = (Binary()).apply(df[col],msize) @@ -210,7 +210,7 @@ def generate(**args): # else: # values = df[col].dropna().unique().tolist() - msize = args['matrix_size'] if 'matrix_size' in args else -1 + msize = args['matrix_size'] if 'matrix_size' in args else 128 values = bhandler.get_column_values(df[col]) diff --git a/pipeline.py b/pipeline.py index 80fed9e..54e12c4 100644 --- a/pipeline.py +++ b/pipeline.py @@ -73,21 +73,7 @@ class Components : # @TODO: we need to log something here about the parameters being passed # pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) df = args['data'] - - if 'slice' in args and 'max_rows' in args['slice']: - max_rows = args['slice']['max_rows'] - if df.shape[0] > max_rows : - print (".. 
slicing ") - i = np.random.choice(df.shape[0],max_rows,replace=False) - df = df.iloc[i] - - - # - # Certain columns need to be removed too large of a matrix - # - # if df.shape[0] == 0 : - # print ("CAN NOT TRAIN EMPTY DATASET ") - # return + # # Now we can parse the arguments and submit the entire thing to training # @@ -102,8 +88,8 @@ class Components : _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) if 'batch_size' in args : _args['batch_size'] = int(args['batch_size']) - - # + + _args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128 # # We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel # if int(args['num_gpu']) > 1 : @@ -157,6 +143,8 @@ class Components : _args['num_gpu'] = 1 os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) _args['no_value']= args['no_value'] + _args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128 + # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 @@ -298,6 +286,8 @@ if __name__ == '__main__' : args[key] = _config[key] args = dict(args,**SYS_ARGS) + if 'matrix_size' in args : + args['matrix_size'] = int(args['matrix_size']) if 'batch_size' not in args : args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size']) if 'dataset' not in args : From bddba3d908ba5b5680ec6a2c2d7c4101ceeb2807 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 16:32:11 -0500 Subject: [PATCH 107/250] bug fix ... --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 78bc08d..527d245 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -211,7 +211,7 @@ def generate(**args): # else: # values = df[col].dropna().unique().tolist() msize = args['matrix_size'] if 'matrix_size' in args else 128 - values = bhandler.get_column_values(df[col]) + values = bhandler.get_column_values(df[col],msize) From b8f59f85d50b4e82fd61cb7e7691c2f18632422e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 15 Apr 2020 09:18:06 -0500 Subject: [PATCH 108/250] bug fix: with column count --- data/bridge.py | 31 +++++++++++++++++++------------ data/maker/__init__.py | 6 +++--- setup.py | 2 +- 3 files changed, 23 insertions(+), 16 deletions(-) diff --git a/data/bridge.py b/data/bridge.py index 2e38431..137a504 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -160,20 +160,17 @@ class Binary : """ # values = np.unique(column) - values = column.dropna().unique() - values.sort() + # values = column.dropna().unique() + + # values.sort() + # column = column.values + values = self.get_column(column,size) column = column.values # # Let's treat the case of missing values i.e nulls # row_count,col_count = column.size,values.size # if row_count * col_count > size and row_count < size: - if col_count > size : - # N = np.divide(size,row_count).astype(int) - # N = - i = np.random.choice(col_count,size) - values = values[-i] - col_count = size @@ -196,7 +193,17 @@ class Binary : return pd.DataFrame(matrix,columns=values) def apply(self,column,size): return self.__stream(column,size) - def get_column_values(self,column,size=-1): + def get_column(self,column,size=-1): + """ + This function will return the columns that are available for processing ... 
+ """ + values = column.dropna().value_counts().index + if size > 0 : + values = values[:size] + values.sort_values() + return values + + def _get_column_values(self,column,size=-1): values = column.dropna().unique() values.sort() @@ -204,7 +211,7 @@ class Binary : # Let's treat the case of missing values i.e nulls # row_count,col_count = column.size,values.size - if col_count > size : + if col_count > size and size > 0: # N = np.divide(size,row_count).astype(int) # N = i = np.random.choice(col_count,size) @@ -270,8 +277,8 @@ if __name__ == '__main__' : --export will export data to a specified location """ df = pd.read_csv('sample.csv') - print ( pd.get_dummies(df.race)) - print ( (Binary()).apply(df.race, 2)) + print ( df.race.value_counts()) + print ( (Binary()).apply(df['race'], 3)) # has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys() # has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys() diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 527d245..26cc4de 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -136,7 +136,7 @@ def train (**args) : # print (df[col].dtypes) # print (df[col].dropna/(axis=1).unique()) # args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values - msize = args['matrix_size'] if 'matrix_size' in args else 128 + msize = args['matrix_size'] if 'matrix_size' in args else -1 args['real'] = (Binary()).apply(df[col],msize) @@ -210,8 +210,8 @@ def generate(**args): # else: # values = df[col].dropna().unique().tolist() - msize = args['matrix_size'] if 'matrix_size' in args else 128 - values = bhandler.get_column_values(df[col],msize) + msize = args['matrix_size'] if 'matrix_size' in args else -1 + values = bhandler.get_column(df[col],msize) diff --git a/setup.py b/setup.py index 44a59b1..0370cdc 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.3.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.3.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 02d8588f5b2bac248aa482a32f7d0fbe8ad312d2 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 15 Apr 2020 09:19:49 -0500 Subject: [PATCH 109/250] bug fix: with column count --- pipeline.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pipeline.py b/pipeline.py index 54e12c4..d218216 100644 --- a/pipeline.py +++ b/pipeline.py @@ -155,13 +155,13 @@ class Components : # df = reader() df = args['reader']() if 'reader' in args else args['data'] - if 'slice' in args and 'max_rows' in args['slice']: + # if 'slice' in args and 'max_rows' in args['slice']: - max_rows = args['slice']['max_rows'] - if df.shape[0] > max_rows : - print (".. slicing ") - i = np.random.choice(df.shape[0],max_rows,replace=False) - df = df.iloc[i] + # max_rows = args['slice']['max_rows'] + # if df.shape[0] > max_rows : + # print (".. 
slicing ") + # i = np.random.choice(df.shape[0],max_rows,replace=False) + # df = df.iloc[i] From 9fff0d123e9a1a4d89dd996f9c2a10db5fc78be7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 15 Apr 2020 10:23:14 -0500 Subject: [PATCH 110/250] bug fix urgh --- data/bridge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/bridge.py b/data/bridge.py index 137a504..902c6d3 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -198,9 +198,9 @@ class Binary : This function will return the columns that are available for processing ... """ values = column.dropna().value_counts().index - if size > 0 : + if size > 0 and column.size > size: values = values[:size] - values.sort_values() + values.sort_values() return values def _get_column_values(self,column,size=-1): From f9da0f1ce7fa27b53c607c21c3030bb7dd8762f5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 15 Apr 2020 15:22:43 -0500 Subject: [PATCH 111/250] fix: table schema (urgh) --- pipeline.py | 63 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 9 deletions(-) diff --git a/pipeline.py b/pipeline.py index d218216..d636c2f 100644 --- a/pipeline.py +++ b/pipeline.py @@ -7,6 +7,7 @@ import os from multiprocessing import Process, Lock import pandas as pd from google.oauth2 import service_account +from google.cloud import bigquery as bq import data.maker from data.params import SYS_ARGS @@ -115,11 +116,45 @@ class Components : data.maker.train(**_args) if 'autopilot' in ( list(args.keys())) : - print (['drone mode enabled ....']) + print (['autopilot mode enabled ....']) self.generate(args) pass + def shuffle(self,args): + """ + """ + df = args['reader']() if 'reader' in args else args['data'] + + + col = args['columns'][0] + distrib = df[col].value_counts() + values = np.array(distrib.index) + counts = np.array(distrib.values) + np.random.shuffle(values) + np.random.shuffle(counts) + N = len (values) + theta = np.random.sample() + pad = 0 + # print (values) + iovalues = np.zeros(df.shape[0],dtype=df[col].dtype) + for i in range(N) : + # n = int(counts[i] - counts[i]*theta) + n = counts[i] + print ([counts[i],theta,n]) + index = np.where(iovalues == 0)[0] + if index.size > 0 and index.size > n: + index = index[:n] + iovalues[index] = values[i] + + + np.random.shuffle(iovalues) + df[col] = iovalues + return df + def post(self,args): + pass + + # @staticmethod def generate(self,args): """ @@ -181,12 +216,12 @@ class Components : # let us fix the data types here every _id field will be an np.int64... 
# - for name in df.columns.tolist(): + # for name in df.columns.tolist(): - if name.endswith('_id') : - if df[name].isnull().sum() > 0 : - df[name].fillna(np.nan_to_num(np.nan),inplace=True) - df[name] = df[name].astype(int) + # if name.endswith('_id') : + # if df[name].isnull().sum() > 0 and name not in ['unique_device_id']: + # df[name].fillna(np.nan_to_num(np.nan),inplace=True) + # df[name] = df[name].astype(int) _dc = pd.DataFrame() @@ -232,6 +267,11 @@ class Components : _id = 'path' else: + client = bq.Client.from_service_account_json(args["private_key"]) + full_schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema + full_schema = [{'name':item.name,'type':item.field_type,'description':item.description} for item in full_schema] + io_schema = [{'name':item['name'],'type':item['type'],'description':item['description']} for item in full_schema if item['name'] in args['columns']] + credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') _pname = os.sep.join([folder,table+'.csv']) _fname = table.replace('_io','_full_io') @@ -243,11 +283,11 @@ class Components : else: Components.lock.acquire() - data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) + data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000,table_schema=io_schema) INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' - _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) + _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000,table_schema=full_schema) Components.lock.release() _id = 'dataset' info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } @@ -354,7 +394,12 @@ if __name__ == '__main__' : else: generator.generate(args) # Components.generate(args) - + elif 'shuffle' in SYS_ARGS: + args['data'] = DATA[0] + _df = (Components()).shuffle(args) + print (DATA[0][args['columns']]) + print () + print (_df[args['columns']]) else: # DATA = np.array_split(DATA,PART_SIZE) From f920ba0eda1844fab4d71d68a835ab5a5ca54782 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 15 Apr 2020 15:51:53 -0500 Subject: [PATCH 112/250] bug fix --- pipeline.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/pipeline.py b/pipeline.py index d636c2f..5ef3013 100644 --- a/pipeline.py +++ b/pipeline.py @@ -216,6 +216,15 @@ class Components : # let us fix the data types here every _id field will be an np.int64... 
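Taken together with the hunk that follows, the schema handling amounts to: read the destination table's schema once, coerce the frame's dtypes to it, then load with to_gbq. A condensed sketch mirroring the calls used in these patches; the key file, dataset and table names are placeholders:

    import numpy as np
    import pandas as pd
    from google.cloud import bigquery as bq

    client = bq.Client.from_service_account_json("curation-prod.json")   # placeholder key
    schema = client.get_table(client.dataset("raw").table("person")).schema

    df = pd.DataFrame({"person_id": ["1", "2"], "race": ["a", "b"]})     # toy frame
    for item in schema:
        if item.name not in df.columns:
            continue
        if item.field_type == 'INTEGER' and df[item.name].dtype != np.int64:
            df[item.name] = df[item.name].astype(np.int64)
        elif item.field_type == 'STRING' and df[item.name].dtype != object:
            df[item.name] = df[item.name].astype(object)
    # the coerced frame is then appended with df.to_gbq(...), as in the diff below
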
# + schema = args['schema'] + for item in schema : + if item.field_type == 'INTEGER' and df[item.name].dtype != np.int64: + df[item.name] = np.array(df[item.name].values,dtype=np.int64) + elif item.field_type == 'STRING' and df[item.name].dtype != object : + df[item.name] = np.array(df[item.name],dtype=object) + + + # for name in df.columns.tolist(): # if name.endswith('_id') : @@ -243,7 +252,7 @@ class Components : # performing basic analytics on the synthetic data generated (easy to quickly asses) # info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} - x = {} + # # @TODO: Send data over to a process for analytics @@ -267,10 +276,6 @@ class Components : _id = 'path' else: - client = bq.Client.from_service_account_json(args["private_key"]) - full_schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema - full_schema = [{'name':item.name,'type':item.field_type,'description':item.description} for item in full_schema] - io_schema = [{'name':item['name'],'type':item['type'],'description':item['description']} for item in full_schema if item['name'] in args['columns']] credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') _pname = os.sep.join([folder,table+'.csv']) @@ -282,12 +287,8 @@ class Components : print (_args['data'].head()) else: Components.lock.acquire() - - data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000,table_schema=io_schema) - - INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' - - _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000,table_schema=full_schema) + data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) + _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) Components.lock.release() _id = 'dataset' info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } @@ -340,11 +341,15 @@ if __name__ == '__main__' : # if 'listen' not in SYS_ARGS : if 'file' in args : DATA = pd.read_csv(args['file']) ; + schema = [] else: DATA = Components().get(args) + client = bq.Client.from_service_account_json(args["private_key"]) + schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema + COLUMNS = DATA.columns DATA = np.array_split(DATA,PART_SIZE) - + args['schema'] = schema if 'generate' in SYS_ARGS : # # Let us see if we have partitions given the log folder From 71097103da4d3b4618ee83e3ec50d5a96ccbc8ef Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 29 Apr 2020 01:27:25 -0500 Subject: [PATCH 113/250] fix: handling outliers and missing values --- data/bridge.py | 15 ++++-- data/gan.py | 61 +++++++-------------- data/maker/__init__.py | 81 ++++++---------------------- data/params.py | 4 +- finalize.py | 120 ++++++++++++++++++++++++++++++----------- pipeline.py | 47 +++------------- setup.py | 4 +- 7 files changed, 149 insertions(+), 183 deletions(-) diff --git a/data/bridge.py b/data/bridge.py index 902c6d3..3116a4b 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -197,12 +197,21 @@ class Binary : """ This function will return the columns that are available for processing ... 
""" - values = column.dropna().value_counts().index + values = column.dropna().value_counts().index.values + if size > 0 and column.size > size: values = values[:size] - values.sort_values() + values.sort() return values - + def get_missing(self,column,size=-1): + values = column.dropna().value_counts().index.values + if size > 0 and column.size > size : + values = values[size:] + else: + values = np.array([]) + values.sort() + return values.tolist(); + def _get_column_values(self,column,size=-1): values = column.dropna().unique() values.sort() diff --git a/data/gan.py b/data/gan.py index 8a0c7a7..1418a04 100644 --- a/data/gan.py +++ b/data/gan.py @@ -536,9 +536,10 @@ class Predict(GNet): self.values = args['values'] self.ROW_COUNT = args['row_count'] self.oROW_COUNT = self.ROW_COUNT - self.MISSING_VALUES = np.nan_to_num(np.nan) - if 'no_value' in args and args['no_value'] not in ['na','','NA'] : - self.MISSING_VALUES = args['no_value'] + # self.MISSING_VALUES = np.nan_to_num(np.nan) + # if 'no_value' in args and args['no_value'] not in ['na','','NA'] : + # self.MISSING_VALUES = args['no_value'] + self.MISSING_VALUES = args['missing'] # self.MISSING_VALUES = args['no_value'] # self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value'] @@ -650,15 +651,18 @@ class Predict(GNet): # df.columns = self.values if len(found) or df.columns.size <= len(self.values): ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1) - # print ([' **** ',ii.sum()]) - - if ii.shape[0] > 0 : + missing = [] + if ii.sum() > 0 : # - #@TODO Have this be a configurable variable - - missing = np.repeat(self.MISSING_VALUES, np.where(ii==1)[0].size) - else: - missing = [] + # If the generator had a reductive effect we should be able to get random values from either : + # - The space of outliers + # - existing values for smaller spaces that have suffered over training + # + + N = ii.sum() + missing_values = self.MISSING_VALUES if self.MISSING_VALUES else self.values + missing = np.random.choice(missing_values,N) + # missing = [] # # @TODO: # Log the findings here in terms of ratio, missing, candidate count @@ -669,6 +673,8 @@ class Predict(GNet): df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) df.columns = columns df = df[columns[0]].append(pd.Series(missing)) + + if self.logger : info= {"missing": i.size,"rows":df.shape[0],"cols":1,'partition':self.PARTITION} @@ -680,40 +686,9 @@ class Predict(GNet): tf.compat.v1.reset_default_graph() df = pd.DataFrame(df) df.columns = columns + np.random.shuffle(df[columns[0]].values) return df.to_dict(orient='list') - # return df.to_dict(orient='list') - # count = str(len(os.listdir(self.out_dir))) - # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv']) - # df.to_csv(_name,index=False) - - # output.extend(np.round(f)) - - # for m in range(2): - # for n in range(2, self.NUM_LABELS): - # idx1 = (demo[:, m] == 1) - # idx2 = (demo[:, n] == 1) - # idx = [idx1[j] and idx2[j] for j in range(len(idx1))] - # num = np.sum(idx) - # print ("___________________list__") - # print (idx1) - # print (idx2) - # print (idx) - # print (num) - # print ("_____________________") - # nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU)) - # label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS)) - # label_input[:, n] = 1 - # label_input[:, m] = 1 - # output = [] - # for i in range(nbatch): - # f = sess.run(fake,feed_dict={y: 
label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]}) - # output.extend(np.round(f)) - # output = np.array(output)[:num] - # print ([m,n,output]) - - # np.save(self.out_dir + str(m) + str(n), output) - if __name__ == '__main__' : # diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 26cc4de..3e2c9aa 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -21,29 +21,8 @@ class ContinuousToDiscrete : """ This function will convert a continous stream of information into a variety a bit stream of bins """ - # BOUNDS = np.repeat(np.divide(X.max(),n),n).cumsum().tolist() - # print ( X.values.astype(np.float32)) - # print ("___________________________") values = np.array(X).astype(np.float32) BOUNDS = ContinuousToDiscrete.bounds(values,n) - # _map = [{"index":BOUNDS.index(i),"ubound":i} for i in BOUNDS] - # _matrix = [] - # m = [] - # for value in X : - # x_ = np.zeros(n) - - # for row in BOUNDS : - - # if value>= row.left and value <= row.right : - # index = BOUNDS.index(row) - # x_[index] = 1 - # break - # _matrix += x_.tolist() - # # - # # for items in BOUNDS : - # # index = BOUNDS.index(items) - - # return np.array(_matrix).reshape(len(X),n) matrix = np.repeat(np.zeros(n),len(X)).reshape(len(X),n) @@ -123,25 +102,9 @@ def train (**args) : # @TODO : Consider performing this task on several threads/GPUs simulataneously # for col in column : - # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values - # if 'float' not in df[col].dtypes.name : - # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values - # if col in CONTINUOUS: - # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - # args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32) - # # args['real'] = args['real'].reshape(df.shape[0],BIN_SIZE) - - # else: - # df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False) - # print (df[col].dtypes) - # print (df[col].dropna/(axis=1).unique()) - # args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values msize = args['matrix_size'] if 'matrix_size' in args else -1 args['real'] = (Binary()).apply(df[col],msize) - - - context = args['context'] if 'store' in args : args['store']['args']['doc'] = context @@ -191,61 +154,49 @@ def generate(**args): # If the identifier is not present, we should fine a way to determine or make one # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value'] + # NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value'] bhandler = Binary() _df = df.copy() for col in column : args['context'] = col args['column'] = col - # if 'float' in df[col].dtypes.name or col in CONTINUOUS : - # # - # # We should create the bins for the values we are observing here - # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - # values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE) - # # values = np.unique(values).tolist() - # else: - # if col in CONTINUOUS : - # values = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32).T - - # else: - # values = df[col].dropna().unique().tolist() msize = args['matrix_size'] if 'matrix_size' in args else -1 values = bhandler.get_column(df[col],msize) - + MISSING= bhandler.get_missing(df[col],msize) args['values'] = values args['row_count'] = df.shape[0] - if col in NO_VALUE : - args['no_value'] = NO_VALUE[col] - else: - 
args['no_value'] = NO_VALUE - + # if col in NO_VALUE : + # args['no_value'] = NO_VALUE[col] + # else: + # args['no_value'] = NO_VALUE + # novalue = NO_VALUE[col] if NO_VALUE[col] in ['na',''] else NO_VALUE[col] + # MISSING += [NO_VALUE[col]] + args['missing'] = MISSING # # we can determine the cardinalities here so we know what to allow or disallow handler = gan.Predict (**args) handler.load_meta(col) r = handler.apply() if col in CONTINUOUS : - r[col] = np.array(r[col]) - MISSING= np.nan if args['no_value'] in ['na','','NA'] else args['no_value'] + r[col] = np.array(r[col]) + _approx = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) #-- approximating based on arbitrary bins + r[col] = _approx + - if np.isnan(MISSING): - i = np.isnan(r[col]) - i = np.where (i == False)[0] - else: - i = np.where( r[col] != None)[0] - _approx = ContinuousToDiscrete.continuous(r[col][i],BIN_SIZE) #-- approximating based on arbitrary bins - r[col][i] = _approx _df[col] = r[col] # # Let's cast the type to the original type (it makes the data more usable) # + # print (values) + # print ([col,df[col].dtype,_df[col].tolist()]) otype = df[col].dtype _df[col] = _df[col].astype(otype) + # # @TODO: log basic stats about the synthetic attribute # diff --git a/data/params.py b/data/params.py index c667063..f2c3536 100644 --- a/data/params.py +++ b/data/params.py @@ -9,8 +9,10 @@ if len(sys.argv) > 1: if sys.argv[i].startswith('--'): key = sys.argv[i][2:] #.replace('-','') SYS_ARGS[key] = 1 - if i + 1 < N: + if i + 1 < N and not sys.argv[i + 1].startswith('--'): value = sys.argv[i + 1] = sys.argv[i+1].strip() + else: + value = None if key and value: SYS_ARGS[key] = value diff --git a/finalize.py b/finalize.py index 079830d..d420d7d 100644 --- a/finalize.py +++ b/finalize.py @@ -6,10 +6,13 @@ This file will perform basic tasks to finalize the GAN process by performing the """ import pandas as pd import numpy as np +from multiprocessing import Process, Lock from google.oauth2 import service_account from google.cloud import bigquery as bq +import transport from data.params import SYS_ARGS import json + class Analytics : """ This class will compile basic analytics about a given dataset i.e compare original/synthetic @@ -33,15 +36,23 @@ class Analytics : """ This function will measure the distance between """ - df = args['data'] - names = [name for name in df_counts.columns.tolist() if name.endswith('_io') == False] + pass class Utils : + @staticmethod + def log(**args): + logger = transport.factory.instance(type="mongo.MongoWriter",args={"dbname":"aou","doc":"logs"}) + logger.write(args) + logger.close() class get : @staticmethod - def config(**args) : - contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts'] - pipeline = args['pipeline'] - return [ item for item in pipeline if item['context'] in contexts] + def pipeline(table,path) : + # contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts'] + config = json.loads((open(path)).read()) + pipeline = config['pipeline'] + # return [ item for item in pipeline if item['context'] in contexts] + pipeline = [item for item in pipeline if 'from' in item and item['from'].strip() == table] + Utils.log(module=table,action='init',input={"pipeline":pipeline}) + return pipeline @staticmethod def sql(**args) : """ @@ -54,7 +65,8 @@ class Utils : SQL = ["SELECT * FROM :from "] SQL_FILTER = [] NO_FILTERS_FOUND = True - pipeline = Utils.get.config(**args) + # pipeline = Utils.get.config(**args) + pipeline = 
args['pipeline'] REVERSE_QUALIFIER = {'IN':'NOT IN','NOT IN':'IN','=':'<>','<>':'='} for item in pipeline : @@ -73,7 +85,7 @@ class Utils : # # let's pull the field schemas out of the table definition # - + Utils.log(module=args['from'],action='sql',input={"sql":" ".join(SQL) }) return " ".join(SQL).replace(":from",src) @@ -91,26 +103,36 @@ def mk(**args) : return client.create_dataset(dataset) return found[0] -def move (**args): +def move (args): """ This function will move a table from the synthetic dataset into a designated location This is the simplest case for finalizing a synthetic data set :private_key """ - private_key = args['private_key'] - client = bq.Client.from_service_account_json(private_key) - config = Utils.get.config(**args) + pipeline = Utils.get.pipeline(args['from'],args['config']) + _args = json.loads((open(args['config'])).read()) + _args['pipeline'] = pipeline + # del _args['pipeline'] + args = dict(args,**_args) + # del args['pipeline'] + # private_key = args['private_key'] + client = bq.Client.from_service_account_json(args['private_key']) + dataset = args['dataset'] - if 'contexts' in args : - SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in config] + if pipeline : + SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in pipeline] SQL += [Utils.get.sql(**args)] SQL = ('\n UNION ALL \n'.join(SQL).replace(':dataset','io')) else: # # moving a table to a designated location tablename = args['from'] - SQL = "SELECT * FROM :dataset.:table".replace(":dataset",dataset).replace(":table",tablename) - + if 'sql' not in args : + SQL = "SELECT * FROM :dataset.:table" + else: + SQL = args['sql'] + SQL = SQL.replace(":dataset",dataset).replace(":table",tablename) + Utils.log(module=args['from'],action='sql',input={'sql':SQL}) # # At this point we have gathered all the tables in the io folder and we should now see if we need to merge with the remainder from the original table # @@ -132,7 +154,7 @@ def move (**args): SQL = SQL.replace("*"," , ".join(fields)) # print (SQL) out = client.query(SQL,location='US',job_config=config) - print () + Utils.log(module=args['from'],action='move',input={'job':out.job_id}) return (out.job_id) @@ -158,23 +180,59 @@ if __name__ == '__main__' : Usage : finalize -- --contexts --from
""" + if 'move' in SYS_ARGS : - # table = SYS_ARGS['from'] - # args = dict(config,**{"private_key":"../curation-prod.json"}) - args = dict(args,**SYS_ARGS) - contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']] - log = [] - if contexts : - args['contexts'] = contexts - log = move(**args) + + if 'init' in SYS_ARGS : + dep = config['dep'] if 'dep' in config else {} + info = [] + if 'queries' in dep : + info += dep['queries'] + print ('________') + if 'tables' in dep : + info += dep['tables'] + args = {} + jobs = [] + for item in info : + args = {} + if type(item) == str : + args['from'] = item + name = item + else: + args = item + name = item['from'] + args['config'] = SYS_ARGS['config'] + # args['pipeline'] = [] + job = Process(target=move,args=(args,)) + job.name = name + jobs.append(job) + job.start() + + + # while len(jobs) > 0 : + # jobs = [job for job in jobs if job.is_alive()] + # time.sleep(1) + + else: - tables = args['from'].split(',') - for name in tables : - name = name.strip() - args['from'] = name - log += [move(**args)] - print ("\n".join(log)) + move(SYS_ARGS) + # # table = SYS_ARGS['from'] + # # args = dict(config,**{"private_key":"../curation-prod.json"}) + # args = dict(args,**SYS_ARGS) + # contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']] + # log = [] + # if contexts : + # args['contexts'] = contexts + # log = move(**args) + + # else: + # tables = args['from'].split(',') + # for name in tables : + # name = name.strip() + # args['from'] = name + # log += [move(**args)] + # print ("\n".join(log)) diff --git a/pipeline.py b/pipeline.py index 5ef3013..00f558d 100644 --- a/pipeline.py +++ b/pipeline.py @@ -14,7 +14,6 @@ from data.params import SYS_ARGS # # The configuration array is now loaded and we will execute the pipe line as follows -DATASET='combined20191004v2_deid' class Components : lock = Lock() @@ -120,37 +119,7 @@ class Components : self.generate(args) pass - def shuffle(self,args): - """ - """ - df = args['reader']() if 'reader' in args else args['data'] - - - col = args['columns'][0] - distrib = df[col].value_counts() - values = np.array(distrib.index) - counts = np.array(distrib.values) - np.random.shuffle(values) - np.random.shuffle(counts) - N = len (values) - theta = np.random.sample() - pad = 0 - # print (values) - iovalues = np.zeros(df.shape[0],dtype=df[col].dtype) - for i in range(N) : - # n = int(counts[i] - counts[i]*theta) - n = counts[i] - print ([counts[i],theta,n]) - index = np.where(iovalues == 0)[0] - if index.size > 0 and index.size > n: - index = index[:n] - iovalues[index] = values[i] - - - np.random.shuffle(iovalues) - df[col] = iovalues - - return df + def post(self,args): pass @@ -177,7 +146,7 @@ class Components : _args['gpu'] = 0 _args['num_gpu'] = 1 os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) - _args['no_value']= args['no_value'] + # _args['no_value']= args['no_value'] _args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128 @@ -207,7 +176,7 @@ class Components : # df = pd.DataFrame(df[ int (partition) ],columns = columns) # max_rows = int(args['partition_max_rows']) if 'partition_max_rows' in args else 1000000 # N = np.divide(df.shape[0],max_rows).astype(int) + 1 - info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"space":df[args['columns'][0]].unique().size, "part_size":int(PART_SIZE)} + info = 
{"name":args['columns'],"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"space":df[args['columns'][0]].unique().size, "part_size":int(PART_SIZE)} logger.write({"module":"generate","action":"partition","input":info}) _args['partition'] = int(partition) _args['continuous']= args['continuous'] if 'continuous' in args else [] @@ -400,11 +369,11 @@ if __name__ == '__main__' : generator.generate(args) # Components.generate(args) elif 'shuffle' in SYS_ARGS: - args['data'] = DATA[0] - _df = (Components()).shuffle(args) - print (DATA[0][args['columns']]) - print () - print (_df[args['columns']]) + + + for data in DATA : + args['data'] = data + _df = (Components()).shuffle(args) else: # DATA = np.array_split(DATA,PART_SIZE) diff --git a/setup.py b/setup.py index 0370cdc..40e8d11 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.3.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.3.2","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' @@ -14,3 +14,5 @@ if sys.version_info[0] == 2 : args['use_2to3_exclude_fixers'] = ['lib2to3.fixes.fix_import'] args['scripts']=['pipeline.py','finalize.py'] setup(**args) + + From 97bae5ef92a9dbf9c53ec2dbfe854e099612d67e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 29 Mar 2021 11:10:57 -0500 Subject: [PATCH 114/250] bug fixes: design improvements --- data/__init__.py | 15 ++ data/gan.py | 225 ++++++++++++-------- data/maker/__init__.py | 110 +++++++++- data/maker/__main__.py | 32 --- pipeline.py | 471 +++++++++++++++++++++-------------------- 5 files changed, 500 insertions(+), 353 deletions(-) delete mode 100644 data/maker/__main__.py diff --git a/data/__init__.py b/data/__init__.py index 98124f1..0ca216d 100644 --- a/data/__init__.py +++ b/data/__init__.py @@ -1,2 +1,17 @@ import data.params as params +from data.params import SYS_ARGS +import transport +from multiprocessing import Process, Queue +from data.maker import prepare +class Trainer (Process) : + pass +class Maker(Process): + pass + +if __name__ == '__main__' : + + logger = transport.factory.instance(SYS_ARGS['store']['logger']) + + + \ No newline at end of file diff --git a/data/gan.py b/data/gan.py index 1418a04..e7ab6cf 100644 --- a/data/gan.py +++ b/data/gan.py @@ -111,15 +111,15 @@ class GNet : self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) - if self.logger : - # - # We will clear the logs from the data-store - # - column = self.ATTRIBUTES['synthetic'] - db = self.logger.db - if db[column].count() > 0 : - db.backup.insert({'name':column,'logs':list(db[column].find()) }) - db[column].drop() + # if self.logger : + + # We will clear the logs from the data-store + + # column = self.ATTRIBUTES['synthetic'] + # db = self.logger.db + # if db[column].count() > 0 : + # db.backup.insert({'name':column,'logs':list(db[column].find()) }) + # db[column].drop() def 
load_meta(self,column): """ @@ -127,7 +127,7 @@ class GNet : Because prediction and training can happen independently """ # suffix = "-".join(column) if isinstance(column,list)else column - suffix = self.get.suffix() + suffix = self.CONTEXT #self.get.suffix() _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json']) if os.path.exists(_name) : attr = json.loads((open(_name)).read()) @@ -159,7 +159,7 @@ class GNet : value= args['value'] object[key] = value # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column - suffix = self.get.suffix() + suffix = self.CONTEXT #self.get.suffix() _name = os.sep.join([self.out_dir,'meta-'+suffix]) f = open(_name+'.json','w') @@ -351,7 +351,7 @@ class Train (GNet): self.discriminator = Discriminator(**args) self._REAL = args['real'] self._LABEL= args['label'] if 'label' in args else None - self.column = args['column'] + # self.column = args['column'] # print ([" *** ",self.BATCHSIZE_PER_GPU]) self.meta = self.log_meta() @@ -438,6 +438,11 @@ class Train (GNet): per_gpu_w = [] iterator, features_placeholder, labels_placeholder = self.input_fn() with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()): + # + # @TODO: Find a way to handle this across multiple CPU in case the GPU are not available + # - abstract hardware specification + # - determine if the GPU/CPU are busy + # for i in range(self.NUM_GPUS): with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: @@ -510,7 +515,7 @@ class Train (GNet): # if epoch % self.MAX_EPOCHS == 0: if epoch in [5,10,20,50,75, self.MAX_EPOCHS] : # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - suffix = self.get.suffix() + suffix = self.CONTEXT #self.get.suffix() _name = os.sep.join([self.train_dir,suffix]) # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) saver.save(sess, _name, write_meta_graph=False, global_step=epoch) @@ -539,7 +544,8 @@ class Predict(GNet): # self.MISSING_VALUES = np.nan_to_num(np.nan) # if 'no_value' in args and args['no_value'] not in ['na','','NA'] : # self.MISSING_VALUES = args['no_value'] - self.MISSING_VALUES = args['missing'] + self.MISSING_VALUES = args['missing'] if 'missing' in args else [] + # self.MISSING_VALUES = args['no_value'] # self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value'] @@ -548,9 +554,56 @@ class Predict(GNet): self.generator.load_meta(column) self.ROW_COUNT = self.oROW_COUNT def apply(self,**args): + suffix = self.CONTEXT #self.get.suffix() + model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) + demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] + # + # setup computational graph + tf.compat.v1.reset_default_graph() + z = tf.random.normal(shape=[self.ROW_COUNT, self.Z_DIM]) + + y = tf.compat.v1.placeholder(shape=[self.ROW_COUNT, self.NUM_LABELS], dtype=tf.int32) + if self._LABEL is not None : + ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] + label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) + else: + label = None + + fake = self.generator.network(inputs=z, label=label) + init = tf.compat.v1.global_variables_initializer() + saver = tf.compat.v1.train.Saver() + df = pd.DataFrame() + CANDIDATE_COUNT = args['candidates'] if 'candidates' in args else 1 #0 if self.ROW_COUNT < 
1000 else 100 + candidates = [] + + with tf.compat.v1.Session() as sess: + saver.restore(sess, model_dir) + if self._LABEL is not None : + # labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) + labels= demo + else: + labels = None + + for i in np.arange(CANDIDATE_COUNT) : + if labels : + _matrix = sess.run(fake,feed_dict={y:labels}) + else: + _matrix = sess.run(fake) + # + # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes + # The code below will insure we have some acceptable cardinal relationships between id and synthetic values + # + + # df = pd.DataFrame(np.round(f)).astype(np.int32) + candidates.append (np.round(_matrix).astype(np.int64)) + # return candidates[0] if len(candidates) == 1 else candidates + + return candidates + + def _apply(self,**args): # print (self.train_dir) # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - suffix = self.get.suffix() + suffix = self.CONTEXT #self.get.suffix() model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] tf.compat.v1.reset_default_graph() @@ -567,11 +620,12 @@ class Predict(GNet): init = tf.compat.v1.global_variables_initializer() saver = tf.compat.v1.train.Saver() df = pd.DataFrame() - CANDIDATE_COUNT = 10 #0 if self.ROW_COUNT < 1000 else 100 + CANDIDATE_COUNT = 5 #0 if self.ROW_COUNT < 1000 else 100 NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0] with tf.compat.v1.Session() as sess: # sess.run(init) + saver.restore(sess, model_dir) if self._LABEL is not None : labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) @@ -585,109 +639,110 @@ class Predict(GNet): __ratio=0 for i in np.arange(CANDIDATE_COUNT) : if labels : - f = sess.run(fake,feed_dict={y:labels}) + _matrix = sess.run(fake,feed_dict={y:labels}) else: - f = sess.run(fake) + _matrix = sess.run(fake) # # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes # The code below will insure we have some acceptable cardinal relationships between id and synthetic values # # df = pd.DataFrame(np.round(f)).astype(np.int32) - df = pd.DataFrame(np.round(f),dtype=int) - + found.append (np.round(_matrix).astype(np.int64)) + # df = pd.DataFrame(np.round(_matrix),dtype=int) p = 0 not in df.sum(axis=1).values - x = df.sum(axis=1).values + # x = df.sum(axis=1).values - if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size : - ratio.append(np.divide( np.sum(x), x.size)) - found.append(df) + # if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size : + # ratio.append(np.divide( np.sum(x), x.size)) + # found.append(df) - # break - if len(found) == CANDIDATE_COUNT: + # # break + # if len(found) == CANDIDATE_COUNT: - break - else: - __x__ = df if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __x__ - __ratio = np.divide( np.sum(x), x.size) if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __ratio - continue + # break + # else: + # __x__ = df if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __x__ + # __ratio = np.divide( np.sum(x), x.size) if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __ratio + # continue # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms # df = (i * df).sum(axis=1) # # In case we are dealing 
with actual values like diagnosis codes we can perform # - N = len(found) - _index = [i for i in range(0,N) if found[i].shape[1] == len(self.values)] - if not _index and not found : - df = __x__ - INDEX = -1 - else : - if not _index : - INDEX = np.random.choice(np.arange(len(found)),1)[0] - INDEX = ratio.index(np.max(ratio)) - else: - INDEX = _index[0] + # N = len(found) + # _index = [i for i in range(0,N) if found[i].shape[1] == len(self.values)] + # if not _index and not found : + # df = __x__ + # INDEX = -1 + # else : + # if not _index : + # INDEX = np.random.choice(np.arange(len(found)),1)[0] + # INDEX = ratio.index(np.max(ratio)) + # else: + # INDEX = _index[0] - df = found[INDEX] - columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] + # df = found[INDEX] + # columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] # r = np.zeros((self.ROW_COUNT,len(columns))) # r = np.zeros(self.ROW_COUNT) - if self.logger : - info = {"found":len(found),"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)} - if df.shape[1] > len(self.values) : - df = df.iloc[:len(self.values)] - if INDEX > 0 : - info =dict(info ,**{"selected":INDEX, "ratio": ratio[INDEX] }) - else : + # if self.logger : + # info = {"found":len(found),"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)} + # if df.shape[1] > len(self.values) : + # df = df.iloc[:len(self.values)] + # if INDEX > 0 : + # info =dict(info ,**{"selected":INDEX, "ratio": ratio[INDEX] }) + # else : - info['selected'] = -1 - info['ratio'] = __ratio - info['partition'] = self.PARTITION - self.logger.write({"module":"gan-generate","action":"generate","input":info}) - # df.columns = self.values - if len(found) or df.columns.size <= len(self.values): - ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1) - missing = [] - if ii.sum() > 0 : - # - # If the generator had a reductive effect we should be able to get random values from either : - # - The space of outliers - # - existing values for smaller spaces that have suffered over training - # - - N = ii.sum() - missing_values = self.MISSING_VALUES if self.MISSING_VALUES else self.values - missing = np.random.choice(missing_values,N) - # missing = [] - # - # @TODO: - # Log the findings here in terms of ratio, missing, candidate count - # print ([np.max(ratio),len(missing),len(found),i]) - i = np.where(ii == 0)[0] + # info['selected'] = -1 + # info['ratio'] = __ratio + # info['partition'] = self.PARTITION + # self.logger.write({"module":"gan-generate","action":"generate","input":info}) + # # df.columns = self.values + # if len(found) or df.columns.size <= len(self.values): + # ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1) + # missing = [] + # if ii.sum() > 0 : + # # + # # If the generator had a reductive effect we should be able to get random values from either : + # # - The space of outliers + # # - existing values for smaller spaces that have suffered over training + # # + + # N = ii.sum() + # missing_values = self.MISSING_VALUES if self.MISSING_VALUES else self.values + # missing = np.random.choice(missing_values,N) + # # missing = [] + # # + # # @TODO: + # # Log the findings here in terms of ratio, missing, candidate count + # # print ([np.max(ratio),len(missing),len(found),i]) + # i = np.where(ii == 0)[0] - df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) - df.columns = columns - 
df = df[columns[0]].append(pd.Series(missing)) + # df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) + # df.columns = columns + # df = df[columns[0]].append(pd.Series(missing)) - if self.logger : + # if self.logger : - info= {"missing": i.size,"rows":df.shape[0],"cols":1,'partition':self.PARTITION} - self.logger.write({"module":"gan-generate","action":"compile.io","input":info}) + # info= {"missing": i.size,"rows":df.shape[0],"cols":1,'partition':self.PARTITION} + # self.logger.write({"module":"gan-generate","action":"compile.io","input":info}) # print(df.head()) tf.compat.v1.reset_default_graph() - df = pd.DataFrame(df) - df.columns = columns - np.random.shuffle(df[columns[0]].values) - return df.to_dict(orient='list') + # df = pd.DataFrame(df) + # df.columns = columns + # np.random.shuffle(df[columns[0]].values) + # return df.to_dict(orient='list') + return _matrix if __name__ == '__main__' : diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 3e2c9aa..086df3f 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -14,6 +14,11 @@ import data.gan as gan from transport import factory from data.bridge import Binary import threading as thread +from data.maker import prepare +import copy +import os +import json + class ContinuousToDiscrete : ROUND_UP = 2 @staticmethod @@ -77,8 +82,62 @@ class ContinuousToDiscrete : +def train (**_args): + """ + :params sql + :params store + """ + # + # Let us prepare the data by calling the utility function + # + if 'file' in _args : + # + # We are reading data from a file + _args['data'] = pd.read_csv(_args['file']) + else: + # + # data will be read from elsewhere (a data-store)... + pass + # if 'ignore' in _args and 'columns' in _args['ignore']: + + _inputhandler = prepare.Input(**_args) + values,_matrix = _inputhandler.convert() + args = {"real":_matrix,"context":_args['context']} + _map = {} + if 'store' in _args : + # + # This + args['store'] = copy.deepcopy(_args['store']['logs']) + args['store']['args']['doc'] = _args['context'] + logger = factory.instance(**args['store']) + args['logger'] = logger + + for key in _inputhandler._map : + beg = _inputhandler._map[key]['beg'] + end = _inputhandler._map[key]['end'] + values = _inputhandler._map[key]['values'].tolist() + _map[key] = {"beg":beg,"end":end,"values":np.array(values).astype(str).tolist()} + info = {"rows":_matrix.shape[0],"cols":_matrix.shape[1],"map":_map} + logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":info}) + + args['logs'] = _args['logs'] if 'logs' in _args else 'logs' + args ['max_epochs'] = _args['max_epochs'] + args['matrix_size'] = _matrix.shape[0] + args['batch_size'] = 2000 + args['partition'] = 0 if 'partition' not in _args else _args['partition'] + os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' -def train (**args) : + trainer = gan.Train(**args) + # + # @TODO: Write the map.json in the output directory for the logs + # + f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']),'w') + f.write(json.dumps(_map)) + f.close() + + trainer.apply() + pass +def _train (**args) : """ This function is intended to train the GAN in order to learn about the distribution of the features :column columns that need to be synthesized (discrete) @@ -122,18 +181,53 @@ def train (**args) : # If the s trainer = gan.Train(**args) trainer.apply() -def post(**args): - """ - This uploads the tensorflow checkpoint to a 
data-store (mongodb, biguqery, s3) - - """ - pass def get(**args): """ This function will restore a checkpoint from a persistant storage on to disk """ pass -def generate(**args): +def generate(**_args): + """ + This function will generate a set of records, before we must load the parameters needed + :param data + :param context + :param logs + """ + f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json'])) + _map = json.loads(f.read()) + f.close() + if 'file' in _args : + df = pd.read_csv(_args['file']) + else: + df = _args['data'] if not isinstance(_args['data'],str) else pd.read_csv(_args['data']) + args = {"context":_args['context'],"max_epochs":_args['max_epochs'],"candidates":_args['candidates']} + args['logs'] = _args['logs'] if 'logs' in _args else 'logs' + args ['max_epochs'] = _args['max_epochs'] + # args['matrix_size'] = _matrix.shape[0] + args['batch_size'] = 2000 + args['partition'] = 0 if 'partition' not in _args else _args['partition'] + args['row_count'] = df.shape[0] + # + # @TODO: perhaps get the space of values here ... (not sure it's a good idea) + # + _args['map'] = _map + _inputhandler = prepare.Input(**_args) + values,_matrix = _inputhandler.convert() + args['values'] = np.array(values) + if 'gpu' in _args : + os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu']) + handler = gan.Predict (**args) + handler.load_meta(None) + # + # Let us now format the matrices as we expect them to be + # + + candidates = handler.apply(candidates=args['candidates']) + return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates] + + + +def _generate(**args): """ This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset @return pandas.DataFrame diff --git a/data/maker/__main__.py b/data/maker/__main__.py deleted file mode 100644 index d71d400..0000000 --- a/data/maker/__main__.py +++ /dev/null @@ -1,32 +0,0 @@ -import pandas as pd -import data.maker -from data.params import SYS_ARGS -import json -from scipy.stats import wasserstein_distance as wd -import risk -import numpy as np -if 'config' in SYS_ARGS : - ARGS = json.loads(open(SYS_ARGS['config']).read()) - if 'generate' not in SYS_ARGS : - data.maker.train(**ARGS) - else: - # - # - ARGS['no_value'] = '' - _df = data.maker.generate(**ARGS) - odf = pd.read_csv (ARGS['data']) - odf.columns = [name.lower() for name in odf.columns] - column = ARGS['column'] if isinstance(ARGS['column'],list) else [ARGS['column']] - # print (odf.head()) - # print (_df.head()) - print(odf.join(_df[column],rsuffix='_io')) - # print (_df[column].risk.evaluate(flag='synth')) - # print (odf[column].risk.evaluate(flag='original')) - # _x = pd.get_dummies(_df[column]).values - # y = pd.get_dummies(odf[column]).values - # N = _df.shape[0] - # print (np.mean([ wd(_x[i],y[i])for i in range(0,N)])) - # print (wd(_x[0],y[0]) ) - - # column = SYS_ARGS['column'] - # odf = open(SYS_ARGS['data']) \ No newline at end of file diff --git a/pipeline.py b/pipeline.py index 00f558d..4a86d94 100644 --- a/pipeline.py +++ b/pipeline.py @@ -9,7 +9,7 @@ import pandas as pd from google.oauth2 import service_account from google.cloud import bigquery as bq import data.maker - +import copy from data.params import SYS_ARGS # @@ -69,53 +69,45 @@ class Components : This function will perform training on the basis of a given pointer that reads data """ - # - # @TODO: we need to log something here about the parameters being passed - # pointer = args['reader'] if 'reader' in args else lambda: 
Components.get(**args) - df = args['data'] - - # - # Now we can parse the arguments and submit the entire thing to training - # - - logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) - log_folder = args['logs'] if 'logs' in args else 'logs' - PART_SIZE = int(args['part_size']) - - partition = args['partition'] - log_folder = os.sep.join([log_folder,args['context'],str(partition)]) - _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} - _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) - if 'batch_size' in args : - _args['batch_size'] = int(args['batch_size']) - - _args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128 # - # We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel - # - if int(args['num_gpu']) > 1 : - _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int) + schema = None + if 'file' in args : + + df = pd.read_csv(args['file']) + del args['file'] + elif 'data' not in args : + reader = factory.instance(**args['store']['source']) + if 'row_limit' in args : + df = reader.read(sql=args['sql'],limit=args['row_limit']) + else: + df = reader.read(sql=args['sql']) + schema = reader.meta(table=args['from']) if hasattr(reader,'meta') and 'from' in args else None else: - _args['gpu'] = 0 - _args['num_gpu'] = 1 - os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) - _args['partition'] = int(partition) - _args['continuous']= args['continuous'] if 'continuous' in args else [] - _args['store'] = {'type':'mongo.MongoWriter','args':{'dbname':'aou','doc':args['context']}} - _args['data'] = args['data'] - - # print (['partition ',partition,df.value_source_concept_id.unique()]) - # - # @log : - # Logging information about the training process for this partition (or not) - # - - info = {"rows":df.shape[0],"cols":df.shape[1], "partition":int(partition),"logs":_args['logs']} + df = args['data'] + + + # df = df.fillna('') + if schema : + _schema = {} + for _item in schema : + _type = int + _value = 0 + if _item.field_type == 'FLOAT' : + _type =float + elif _item.field_type != 'INTEGER' : + _type = str + _value = '' + _schema[_item.name] = _type + df[_item.name] = df[_item.name].fillna(_value).astype(_type) + args['schema'] = _schema + # df[_item.name] = df[_item.name].astype(_type) + _args = copy.deepcopy(args) + # _args['store'] = args['store']['source'] + _args['data'] = df - logger.write({"module":"train","action":"train","input":info}) data.maker.train(**_args) if 'autopilot' in ( list(args.keys())) : - print (['autopilot mode enabled ....']) + print (['autopilot mode enabled ....',args['context']]) self.generate(args) pass @@ -129,141 +121,167 @@ class Components : """ This function will generate data and store it to a given, """ - logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) - log_folder = args['logs'] if 'logs' in args else 'logs' - partition = args['partition'] if 'partition' in args else '' - log_folder = os.sep.join([log_folder,args['context'],str(partition)]) - - _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} - _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) + store = args['store']['logs'] + store['doc'] = 
args['context'] + logger = factory.instance(**store) #type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) + + ostore = args['store']['target'] + writer = factory.instance(**ostore) + # log_folder = args['logs'] if 'logs' in args else 'logs' + # partition = args['partition'] if 'partition' in args else '' + # log_folder = os.sep.join([log_folder,args['context'],str(partition)]) + + # _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} + # _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) # _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 - if 'batch_size' in args : - _args['batch_size'] = int(args['batch_size']) - - if int(args['num_gpu']) > 1 : - _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int) - else: - _args['gpu'] = 0 - _args['num_gpu'] = 1 - os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) - # _args['no_value']= args['no_value'] - _args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128 + # if 'batch_size' in args : + # _args['batch_size'] = int(args['batch_size']) + + # if int(args['num_gpu']) > 1 : + # _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int) + # else: + # _args['gpu'] = 0 + # _args['num_gpu'] = 1 + # os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) + # # _args['no_value']= args['no_value'] + # _args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128 - # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 - PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 + # # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 + # PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 # credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') # _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() # reader = args['reader'] # df = reader() - df = args['reader']() if 'reader' in args else args['data'] - - # if 'slice' in args and 'max_rows' in args['slice']: - - # max_rows = args['slice']['max_rows'] - # if df.shape[0] > max_rows : - # print (".. 
slicing ") - # i = np.random.choice(df.shape[0],max_rows,replace=False) - # df = df.iloc[i] + schema = args['schema'] if 'schema' in args else None + if 'file' in args : + df = pd.read_csv(args['file']) + else: + if 'data' not in args : + reader = factory.instance(**args['store']['source']) + if 'row_limit' in args : + df = reader.read(sql=args['sql'],limit=args['row_limit']) + else: + df = reader.read(sql=args['sql']) + if 'schema' not in args and hasattr(reader,'meta'): + schema = reader.meta(table=args['from']) - - # bounds = Components.split(df,MAX_ROWS,PART_SIZE) - # if partition != '' : - # columns = args['columns'] - # df = np.array_split(df[columns].values,PART_SIZE) - # df = pd.DataFrame(df[ int (partition) ],columns = columns) - # max_rows = int(args['partition_max_rows']) if 'partition_max_rows' in args else 1000000 - # N = np.divide(df.shape[0],max_rows).astype(int) + 1 - info = {"name":args['columns'],"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"space":df[args['columns'][0]].unique().size, "part_size":int(PART_SIZE)} - logger.write({"module":"generate","action":"partition","input":info}) - _args['partition'] = int(partition) - _args['continuous']= args['continuous'] if 'continuous' in args else [] - # - # How many rows sub-partition must we divide this into ? - # let us fix the data types here every _id field will be an np.int64... - # - - schema = args['schema'] - for item in schema : - if item.field_type == 'INTEGER' and df[item.name].dtype != np.int64: - df[item.name] = np.array(df[item.name].values,dtype=np.int64) - elif item.field_type == 'STRING' and df[item.name].dtype != object : - df[item.name] = np.array(df[item.name],dtype=object) - - - - # for name in df.columns.tolist(): - - # if name.endswith('_id') : - # if df[name].isnull().sum() > 0 and name not in ['unique_device_id']: - # df[name].fillna(np.nan_to_num(np.nan),inplace=True) - # df[name] = df[name].astype(int) - + + else: + # + # This will account for autopilot mode ... 
+ df = args['data'] + + _info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[0]}} + _dc = pd.DataFrame() # for mdf in df : - _args['data'] = df - - _dc = _dc.append(data.maker.generate(**_args)) + args['data'] = df + args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) + + candidates = (data.maker.generate(**args)) + if 'sql.BQWriter' in ostore['type'] : + #table = ".".join([ostore['['dataset'],args['context']]) + # writer = factory.instance(**ostore) + _columns = None + skip_columns = [] + _schema = [{"name":field.name,"type":field.field_type,"description":field.description} for field in schema] if schema else [] + for _df in candidates : + # + # we need to format the fields here to make sure we have something cohesive + # + + if not skip_columns : + # _columns = set(df.columns) - set(_df.columns) + if 'ignore' in args and 'columns' in args['ignore'] : + + for name in args['ignore']['columns'] : + for _name in _df.columns: + if _name in name: + skip_columns.append(_name) + # + # We perform a series of set operations to insure that the following conditions are met: + # - the synthetic dataset only has fields that need to be synthesized + # - The original dataset has all the fields except those that need to be synthesized + # + + _df = _df[list(set(_df.columns) - set(skip_columns))] + + if set(df.columns) & set(_df.columns) : + _columns = set(df.columns) - set(_df.columns) + df = df[_columns] + + # + # Let us merge the dataset here and and have a comprehensive dataset + + _df = pd.DataFrame.join(df,_df) + + writer.write(_df,schema=_schema,table=args['from']) + # writer.write(df,table=table) + pass + else: + pass + - # - # We need to post the generate the data in order to : - # 1. compare immediately - # 2. synthetic copy - # + # # + # # We need to post the generate the data in order to : + # # 1. compare immediately + # # 2. 
synthetic copy + # # - cols = _dc.columns.tolist() + # cols = _dc.columns.tolist() - data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) - # - # performing basic analytics on the synthetic data generated (easy to quickly asses) - # - info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} + # data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) + # # + # # performing basic analytics on the synthetic data generated (easy to quickly asses) + # # + # info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} - # - # @TODO: Send data over to a process for analytics + # # + # # @TODO: Send data over to a process for analytics - base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) - cols = _dc.columns.tolist() - for name in cols : - _args['data'][name] = _dc[name] + # base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) + # cols = _dc.columns.tolist() + # for name in cols : + # _args['data'][name] = _dc[name] - # - #-- Let us store all of this into bigquery - prefix = args['notify']+'.'+_args['context'] - partition = str(partition) - table = '_'.join([prefix,partition,'io']).replace('__','_') - folder = os.sep.join([args['logs'],args['context'],partition,'output']) - if 'file' in args : + # # + # #-- Let us store all of this into bigquery + # prefix = args['notify']+'.'+_args['context'] + # partition = str(partition) + # table = '_'.join([prefix,partition,'io']).replace('__','_') + # folder = os.sep.join([args['logs'],args['context'],partition,'output']) + # if 'file' in args : - _fname = os.sep.join([folder,table.replace('_io','_full_io.csv')]) - _pname = os.sep.join([folder,table])+'.csv' - data_comp.to_csv( _pname,index=False) - _args['data'].to_csv(_fname,index=False) + # _fname = os.sep.join([folder,table.replace('_io','_full_io.csv')]) + # _pname = os.sep.join([folder,table])+'.csv' + # data_comp.to_csv( _pname,index=False) + # _args['data'].to_csv(_fname,index=False) - _id = 'path' - else: + # _id = 'path' + # else: - credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - _pname = os.sep.join([folder,table+'.csv']) - _fname = table.replace('_io','_full_io') - partial = '.'.join(['io',args['context']+'_partial_io']) - complete= '.'.join(['io',args['context']+'_full_io']) - data_comp.to_csv(_pname,index=False) - if 'dump' in args : - print (_args['data'].head()) - else: - Components.lock.acquire() - data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) - _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) - Components.lock.release() - _id = 'dataset' - info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } - if partition : - info ['partition'] = int(partition) - logger.write({"module":"generate","action":"write","input":info} ) + # credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') + # _pname = os.sep.join([folder,table+'.csv']) + # _fname = table.replace('_io','_full_io') + # partial = 
'.'.join(['io',args['context']+'_partial_io']) + # complete= '.'.join(['io',args['context']+'_full_io']) + # data_comp.to_csv(_pname,index=False) + # if 'dump' in args : + # print (_args['data'].head()) + # else: + # Components.lock.acquire() + # data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) + # _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) + # Components.lock.release() + # _id = 'dataset' + # info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } + # if partition : + # info ['partition'] = int(partition) + # logger.write({"module":"generate","action":"write","input":info} ) @@ -308,98 +326,95 @@ if __name__ == '__main__' : # Log what was initiated so we have context of this processing ... # # if 'listen' not in SYS_ARGS : - if 'file' in args : - DATA = pd.read_csv(args['file']) ; - schema = [] - else: - DATA = Components().get(args) - client = bq.Client.from_service_account_json(args["private_key"]) - schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema + # if 'file' in args : + # DATA = pd.read_csv(args['file']) ; + # schema = [] + # else: + # DATA = Components().get(args) + # client = bq.Client.from_service_account_json(args["private_key"]) + # schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema - COLUMNS = DATA.columns - DATA = np.array_split(DATA,PART_SIZE) - args['schema'] = schema + # COLUMNS = DATA.columns + # DATA = np.array_split(DATA,PART_SIZE) + # args['schema'] = schema if 'generate' in SYS_ARGS : # # Let us see if we have partitions given the log folder - content = os.listdir( os.sep.join([args['logs'],args['context']])) + content = os.listdir( os.sep.join([args['logs'],'train',args['context']])) generator = Components() - if ''.join(content).isnumeric() : - # - # we have partitions we are working with + # if ''.join(content).isnumeric() : + # # + # # we have partitions we are working with - jobs = [] + # jobs = [] - # columns = DATA.columns.tolist() + # # columns = DATA.columns.tolist() - # DATA = np.array_split(DATA,PART_SIZE) + # # DATA = np.array_split(DATA,PART_SIZE) - for index in range(0,PART_SIZE) : - if 'focus' in args and int(args['focus']) != index : - # - # This handles failures/recoveries for whatever reason - # If we are only interested in generating data for a given partition - continue - # index = id.index(id) + # for index in range(0,PART_SIZE) : + # if 'focus' in args and int(args['focus']) != index : + # # + # # This handles failures/recoveries for whatever reason + # # If we are only interested in generating data for a given partition + # continue + # # index = id.index(id) - args['partition'] = index - args['data'] = DATA[index] - if int(args['num_gpu']) > 1 : - args['gpu'] = index - else: - args['gpu']=0 + # args['partition'] = index + # args['data'] = DATA[index] + # if int(args['num_gpu']) > 1 : + # args['gpu'] = index + # else: + # args['gpu']=0 - make = lambda _args: (Components()).generate(_args) - job = Process(target=make,args=(args,)) - job.name = 'generator # '+str(index) - job.start() - jobs.append(job) - # if len(jobs) == 1 : - # job.join() + # make = lambda _args: (Components()).generate(_args) + # job = Process(target=make,args=(args,)) + # job.name = 'generator # '+str(index) + # job.start() + # jobs.append(job) + # # if len(jobs) == 1 : + # # job.join() - print (["Started ",len(jobs),"generators" if 
len(jobs)>1 else "generator" ]) - while len(jobs)> 0 : - jobs = [job for job in jobs if job.is_alive()] - time.sleep(2) + # print (["Started ",len(jobs),"generators" if len(jobs)>1 else "generator" ]) + # while len(jobs)> 0 : + # jobs = [job for job in jobs if job.is_alive()] + # time.sleep(2) - # generator.generate(args) - else: - generator.generate(args) + # # generator.generate(args) + # else: + # generator.generate(args) # Components.generate(args) - elif 'shuffle' in SYS_ARGS: - - - for data in DATA : - args['data'] = data - _df = (Components()).shuffle(args) + generator.generate(args) + else: # DATA = np.array_split(DATA,PART_SIZE) - - jobs = [] - for index in range(0,PART_SIZE) : - if 'focus' in args and int(args['focus']) != index : - continue - args['part_size'] = PART_SIZE - args['partition'] = index - args['data'] = DATA[index] - if int(args['num_gpu']) > 1 : - args['gpu'] = index - else: - args['gpu']=0 + agent = Components() + agent.train(**args) + # jobs = [] + # for index in range(0,PART_SIZE) : + # if 'focus' in args and int(args['focus']) != index : + # continue + # args['part_size'] = PART_SIZE + # args['partition'] = index + # args['data'] = DATA[index] + # if int(args['num_gpu']) > 1 : + # args['gpu'] = index + # else: + # args['gpu']=0 - make = lambda _args: (Components()).train(**_args) - job = Process(target=make,args=( dict(args),)) - job.name = 'Trainer # ' + str(index) - job.start() - jobs.append(job) - # args['gpu'] - print (["Started ",len(jobs),"trainers" if len(jobs)>1 else "trainer" ]) - while len(jobs)> 0 : - jobs = [job for job in jobs if job.is_alive()] - time.sleep(2) + # make = lambda _args: (Components()).train(**_args) + # job = Process(target=make,args=( dict(args),)) + # job.name = 'Trainer # ' + str(index) + # job.start() + # jobs.append(job) + # # args['gpu'] + # print (["Started ",len(jobs),"trainers" if len(jobs)>1 else "trainer" ]) + # while len(jobs)> 0 : + # jobs = [job for job in jobs if job.is_alive()] + # time.sleep(2) # trainer = Components() # trainer.train(**args) From 4725b6eff9176f18601bd1f381398d54a507ebe4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 29 Mar 2021 18:53:57 -0500 Subject: [PATCH 115/250] new features, bug fixes --- bin/data-maker | 1 + data/maker/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) create mode 120000 bin/data-maker diff --git a/bin/data-maker b/bin/data-maker new file mode 120000 index 0000000..f63f773 --- /dev/null +++ b/bin/data-maker @@ -0,0 +1 @@ +pipeline.py \ No newline at end of file diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 086df3f..cfdd8e2 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -219,7 +219,7 @@ def generate(**_args): handler = gan.Predict (**args) handler.load_meta(None) # - # Let us now format the matrices as we expect them to be + # Let us now format the matrices by reverting them to a data-frame with values # candidates = handler.apply(candidates=args['candidates']) diff --git a/setup.py b/setup.py index 40e8d11..7970f14 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.3.2","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.4.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", 
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 46f2fd7be406f0bcdda0525655b48e3d64fca398 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 29 Mar 2021 22:59:31 -0500 Subject: [PATCH 116/250] data preparation script (preconditions) --- data/maker/prepare/__init__.py | 252 +++++++++++++++++++++++++++++++++ data/maker/prepare/__main__.py | 1 + 2 files changed, 253 insertions(+) create mode 100644 data/maker/prepare/__init__.py create mode 120000 data/maker/prepare/__main__.py diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py new file mode 100644 index 0000000..2c773de --- /dev/null +++ b/data/maker/prepare/__init__.py @@ -0,0 +1,252 @@ +""" +(c) 2018 - 2021, Vanderbilt University Medical Center +Steve L. Nyemba, steve.l.nyemba@vumc.org + +This file is designed to handle preconditions for a generative adversarial network: + - The file will read/get data from a source specified by transport (or data-frame) + - The class will convert the data to a binary vector + - The class will also help rebuild the data from a binary matrix. +Usage : + +""" +import transport +import json +import pandas as pd +import numpy as np +import cupy as cp +import sys +import os +# from multiprocessing import Process, Queue + +# if 'GPU' in os.environ : +# import cupy as np +# else: +# import numpy as np +class void: + pass +class Hardware : + """ + This class is intended to allow the use of hardware i.e GPU, index or CPU + """ + pass + +class Input : + """ + This class is designed to read data from a source and and perform a variet of operations : + - provide a feature space, and rows (matrix profile) + - a data index map + """ + # def learn(self,**_args): + # """ + # This function is designed to learn about, the data and persist + # :param table + # :param store + # """ + # table = _args['table'] + # reader = transport.factory.instance(**_args['store']) + # df = reader.read(table=table,limit=1) + # self.columns = df.columns.tolist() + + # self._metadf = pd.DataFrame(self.df[self._columns].dtypes.values.astype(str)).T #,self._columns] + # self._metadf.columns = self._columns + + # sql = "SELECT :fields from :table".replace(":table",table) + + + def __init__(self,**_args): + """ + :param table + :param store data-store parameters/configuration + :param sql sql query that pulls a representative sample of the data + """ + self._schema = _args['schema'] if 'schema' in _args else {} + self.df = _args['data'] + if 'sql' not in _args : + # self._initdata(**_args) + # + pass + else: + self._initsql(**_args) + self._map = {} if 'map' not in _args else _args['map'] + # self._metadf = pd.DataFrame(self.df[self._columns].dtypes.values.astype(str)).T #,self._columns] + # self._metadf.columns = self._columns + if 'gpu' in _args and 'GPU' in os.environ: + + np = cp + index = int(_args['gpu']) + np.cuda.Device(index).use() + print(['..:: GPU ',index]) + + def _initsql(self,**_args): + """ + This function will initialize the class on the basis of a data-store and optionally pre-defined columns to be used to be synthesized + :param store data-store configuration + :param sql sql query to be applied to the transported data + :param columns list of columns to be + """ + # _store_args = _args['store'] + # reader = 
transport.factory.instance(**_store_args) + # sql = _args['sql'] + + # self.df = reader.read(sql=_args['sql']) + + + if 'columns' not in _args : + self._initcols(data=self.df) + else: + self._initcols(data=self.df,columns=_args['columns']) + + pass + def _initcols (self,**_args) : + """ + This function will initialize the columns to be synthesized and/or determine which ones can be synthesized + :param data data-frame that holds the data (matrix) + :param columns optional columns to be synthesized + """ + # df = _args['data'].copy() + row_count = self.df.shape[0] + cols = None if 'columns' not in _args else _args['columns'] + self.columns = self.df.columns.tolist() + if 'columns' in _args : + self._columns = _args['columns'] + else: + # + # We will look into the count and make a judgment call + _df = pd.DataFrame(self.df.apply(lambda col: col.dropna().unique().size )).T + MIN_SPACE_SIZE = 2 + self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() + def _initdata(self,**_args): + """ + This function will initialize the class with a data-frame and columns of interest (if any) + :param data data-frame that holds the data + :param columns columns that need to be synthesized if any + """ + # + # setting class-level variables to be reused across the class + # self.df = _args['data'] + row_count = self.df.shape[0] + # self.columns = self.df.columns + # self._metadf = self.df.apply(lambda col: col.unique().size) + # _df = pd.DataFrame(self.df.apply(lambda col: col.unique().size )).T + # cols = None if 'columns' not in _args else _args['columns'] + self._initcols(**_args) + + def convert(self,**_args): + """ + This function will convert a data-frame into a binary matrix and provide a map to be able to map the values back to the matrix + :param columns in case we specify the columns to account for (just in case the original assumptions don't hold) + """ + if 'columns' in _args or 'column' in _args : + columns = _args['columns'] if 'columns' in _args else [_args['column']] + else: + columns = self._columns + _df = self.df if 'data' not in _args else _args['data'] + # + # At this point we have the list of features we want to use + i = 0 + + _m = np.array([]) + _values = [] + for name in columns : + # + # In case we have dataset with incomplete value space, we should still be able to generate something meaningful + # + values = None if name not in self._map else list(self._map[name]['values']) + _type = self._schema[name] if name in self._schema else _df[name].dtype + cols, _matrix = self.tobinary(_df[name],values) + _beg,_end = i,i+len(cols) + if name not in self._map : + self._map[name] = {"beg":_beg,"end":_end ,"values":cols} + i += len(cols) + if not _m.shape[0]: + _m = _matrix ; + else: + _m = np.concatenate((_m,_matrix),axis=1) + if values : + _values += list(values) + # + # @NOTE: + # The map should allow us to be able to convert or reconvert the binary matrix to whatever we want ... 
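+            # As a rough illustration (hypothetical values, not taken from a real dataset), the map
+            # built here could look like :
+            #   self._map = {"race":{"beg":0,"end":3,"values":["asian","black","white"]},
+            #                "age" :{"beg":3,"end":10,"values":[20,30,40,50,60,70,80]}}
+            # i.e. every synthesized column owns a contiguous block of columns [beg,end) in the
+            # one-hot matrix, which is what revert() relies on when it slices matrix[:,beg:end].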
+ # + # self._matrix = _m + + return _values,_m + + def revert(self,**_args) : + """ + This function will take in a binary matrix and based on the map of values it will repopulate it with values + :param _matrix binary matrix + :param column|columns column name or columns if the column is specified + """ + _column = _args['column'] if 'column' in _args else None + + + matrix = _args['matrix'] + row_count = matrix.shape[0] + r = {} + for key in self._map : + if _column and key != _column : + continue + _item = self._map[key] + _beg = _item['beg'] + _end = _item['end'] + columns = np.array(_item['values']) + # + # @NOTE: We are accessing matrices in terms of [row,col], + # The beg,end variables are for the columns in the matrix (mini matrix) + # + # if not _column : + # _matrix = matrix[:,_beg:_end] #-- The understanding is that _end is not included + # else: + # _matrix = matrix + _matrix = matrix[:,_beg:_end] + # + # vectorize the matrix to replace the bits by their actual values (accounting for the data-types) + # @TODO: Find ways to do this on a GPU (for big data) or across threads + # + row_count = _matrix.shape[0] + # r[key] = [columns[np.where(row == 1) [0][0] ] for row in _matrix[:,_beg:_end]] + + r[key] = [columns[np.where(row==1)[0][0]] if np.where(row==1)[0].size > 0 else '' for row in _matrix] + + + return pd.DataFrame(r) + + def tobinary(self,rows,cols=None) : + """ + This function will compile a binary matrix from a row of values this allows hopefully this can be done in parallel, this function can be vectorized and processed + :param rows np.array or list of vector of values + :param cols a space of values if it were to be different fromt he current sample. + """ + + if not cols: + # + # In the advent the sample rows do NOT have the values of the + cols = rows.unique() + cols = np.array(cols) + row_count = len(rows) + # if 'GPU' not in os.environ : + _matrix = np.zeros([row_count,cols.size]) + + [np.put(_matrix[i], np.where(cols == rows[i]) ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0] + # else: + # _matrix = cp.zeros([row_count,cols.size]) + # [cp.put(_matrix[i], cp.where(cols == rows[i]),1)for i in cp.arange(row_count) ] + # _matrix = _matrix.asnumpy() + + + return cols,_matrix + +if __name__ == '__main__' : + df = pd.read_csv('../../sample.csv') + _input = Input(data=df,columns=['age','race']) + _m = _input.convert(column='age') + print (_m.shape) + print (_input.revert(matrix=_m,column='age')) + print (_input._metadf) + +# _args = {"store":{"type":"sql.BQReader","args":{"service_key":"/home/steve/dev/aou/accounts/curation-prod.json"}}} +# _args['table'] = 'io.observation' +# _i = Input(**_args) +# df = pd.read_csv('../../sample.csv') +# print (Input.ToBinary(df.age)) \ No newline at end of file diff --git a/data/maker/prepare/__main__.py b/data/maker/prepare/__main__.py new file mode 120000 index 0000000..93f5256 --- /dev/null +++ b/data/maker/prepare/__main__.py @@ -0,0 +1 @@ +__init__.py \ No newline at end of file From 43873697a0fb46e8ef961388c4bd5b4c9110cf93 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 04:56:01 -0500 Subject: [PATCH 117/250] bug fixes --- data/gan.py | 12 ++++++++++-- data/maker/__init__.py | 19 ++++++++++--------- data/maker/prepare/__init__.py | 18 +++++++++++------- setup.py | 2 +- 4 files changed, 32 insertions(+), 19 deletions(-) diff --git a/data/gan.py b/data/gan.py index e7ab6cf..c61d1b1 100644 --- a/data/gan.py +++ b/data/gan.py @@ -58,7 +58,14 @@ class GNet : self.layers.normalize = 
self.normalize self.logs = {} - self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] + # self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] + self.GPU_CHIPS = None if 'gpu' not in args else args['gpu'] + if self.GPU_CHIPS is None: + self.GPU_CHIPS = [0] + if 'CUDA_VISIBLE_DEVICES' in os.environ : + os.environ.pop('CUDA_VISIBLE_DEVICES') + self.NUM_GPUS = len(self.GPU_CHIPS) + self.PARTITION = args['partition'] # if self.NUM_GPUS > 1 : # os.environ['CUDA_VISIBLE_DEVICES'] = "4" @@ -150,6 +157,7 @@ class GNet : "D_STRUCTURE":self.D_STRUCTURE, "G_STRUCTURE":self.G_STRUCTURE, "NUM_GPUS":self.NUM_GPUS, + "GPU_CHIPS":self.GPU_CHIPS, "NUM_LABELS":self.NUM_LABELS, "MAX_EPOCHS":self.MAX_EPOCHS, "ROW_COUNT":self.ROW_COUNT @@ -443,7 +451,7 @@ class Train (GNet): # - abstract hardware specification # - determine if the GPU/CPU are busy # - for i in range(self.NUM_GPUS): + for i in self.GPU_CHIPS : #range(self.NUM_GPUS): with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: if self._LABEL is not None : diff --git a/data/maker/__init__.py b/data/maker/__init__.py index cfdd8e2..fbdf208 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -90,16 +90,16 @@ def train (**_args): # # Let us prepare the data by calling the utility function # - if 'file' in _args : - # - # We are reading data from a file - _args['data'] = pd.read_csv(_args['file']) - else: - # - # data will be read from elsewhere (a data-store)... - pass + # if 'file' in _args : + # # + # # We are reading data from a file + # _args['data'] = pd.read_csv(_args['file']) + # else: + # # + # # data will be read from elsewhere (a data-store)... + # pass # if 'ignore' in _args and 'columns' in _args['ignore']: - + _inputhandler = prepare.Input(**_args) values,_matrix = _inputhandler.convert() args = {"real":_matrix,"context":_args['context']} @@ -107,6 +107,7 @@ def train (**_args): if 'store' in _args : # # This + args['store'] = copy.deepcopy(_args['store']['logs']) args['store']['args']['doc'] = _args['context'] logger = factory.instance(**args['store']) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 2c773de..381dfc0 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -13,7 +13,7 @@ import transport import json import pandas as pd import numpy as np -import cupy as cp +# import cupy as cp import sys import os # from multiprocessing import Process, Queue @@ -62,7 +62,7 @@ class Input : self._schema = _args['schema'] if 'schema' in _args else {} self.df = _args['data'] if 'sql' not in _args : - # self._initdata(**_args) + self._initdata(**_args) # pass else: @@ -70,12 +70,12 @@ class Input : self._map = {} if 'map' not in _args else _args['map'] # self._metadf = pd.DataFrame(self.df[self._columns].dtypes.values.astype(str)).T #,self._columns] # self._metadf.columns = self._columns - if 'gpu' in _args and 'GPU' in os.environ: + # if 'gpu' in _args and 'GPU' in os.environ: - np = cp - index = int(_args['gpu']) - np.cuda.Device(index).use() - print(['..:: GPU ',index]) + # np = cp + # index = int(_args['gpu']) + # np.cuda.Device(index).use() + # print(['..:: GPU ',index]) def _initsql(self,**_args): """ @@ -107,6 +107,8 @@ class Input : row_count = self.df.shape[0] cols = None if 'columns' not in _args else _args['columns'] self.columns = self.df.columns.tolist() + self._io = [] + if 'columns' in _args : self._columns = _args['columns'] else: @@ -115,6 +117,8 @@ class Input : _df = pd.DataFrame(self.df.apply(lambda 
col: col.dropna().unique().size )).T MIN_SPACE_SIZE = 2 self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() + self._io = _df.to_dict(orient='records') + def _initdata(self,**_args): """ This function will initialize the class with a data-frame and columns of interest (if any) diff --git a/setup.py b/setup.py index 7970f14..4b96d08 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.4.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.4.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From 13053febb773143b90c5ff5d1122ef355c21a979 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 05:18:28 -0500 Subject: [PATCH 118/250] ... --- pipeline.py | 101 ++++++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 51 deletions(-) diff --git a/pipeline.py b/pipeline.py index 4a86d94..e643278 100644 --- a/pipeline.py +++ b/pipeline.py @@ -63,6 +63,24 @@ class Components : def split(X,MAX_ROWS=3,PART_SIZE=3): return list(pd.cut( np.arange(X.shape[0]+1),PART_SIZE).categories) + def format_schema(self,schema): + _schema = {} + for _item in schema : + _type = int + _value = 0 + if _item.field_type == 'FLOAT' : + _type =float + elif _item.field_type != 'INTEGER' : + _type = str + _value = '' + _schema[_item.name] = _type + return _schema + def get_ignore(self,**_args) : + if 'columns' in _args and 'data' in _args : + _df = _args['data'] + terms = _args['columns'] + return [name for name in _df.columns if name in terms] + return [] def train(self,**args): """ @@ -83,11 +101,15 @@ class Components : schema = reader.meta(table=args['from']) if hasattr(reader,'meta') and 'from' in args else None else: df = args['data'] - - + + # + # + if 'ignore' in args and 'columns' in args['ignore'] : + _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) + df = df[ list(set(df.columns)- set(_cols))] # df = df.fillna('') if schema : - _schema = {} + _schema = [] for _item in schema : _type = int _value = 0 @@ -96,7 +118,7 @@ class Components : elif _item.field_type != 'INTEGER' : _type = str _value = '' - _schema[_item.name] = _type + _schema += [{"name":_item.name,"type":_item.field_type}] df[_item.name] = df[_item.name].fillna(_value).astype(_type) args['schema'] = _schema # df[_item.name] = df[_item.name].astype(_type) @@ -107,6 +129,8 @@ class Components : data.maker.train(**_args) if 'autopilot' in ( list(args.keys())) : + + args['data'] = df print (['autopilot mode enabled ....',args['context']]) self.generate(args) @@ -127,52 +151,27 @@ class Components : ostore = args['store']['target'] writer = factory.instance(**ostore) - # log_folder = args['logs'] if 'logs' in args else 'logs' - # partition = args['partition'] if 'partition' in args else '' - # log_folder = os.sep.join([log_folder,args['context'],str(partition)]) - - # _args = 
{"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger} - # _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) - # _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 - # if 'batch_size' in args : - # _args['batch_size'] = int(args['batch_size']) - - # if int(args['num_gpu']) > 1 : - # _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int) - # else: - # _args['gpu'] = 0 - # _args['num_gpu'] = 1 - # os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) - # # _args['no_value']= args['no_value'] - # _args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128 - - - # # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 - # PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 - - # credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - # _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna() - # reader = args['reader'] - # df = reader() + schema = args['schema'] if 'schema' in args else None - if 'file' in args : + if 'data' in args : - df = pd.read_csv(args['file']) + df = args['data'] else: - if 'data' not in args : - reader = factory.instance(**args['store']['source']) - if 'row_limit' in args : - df = reader.read(sql=args['sql'],limit=args['row_limit']) - else: - df = reader.read(sql=args['sql']) - if 'schema' not in args and hasattr(reader,'meta'): - schema = reader.meta(table=args['from']) - + reader = factory.instance(**args['store']['source']) + if 'row_limit' in args : + df = reader.read(sql=args['sql'],limit=args['row_limit']) else: - # - # This will account for autopilot mode ... - df = args['data'] + df = reader.read(sql=args['sql']) + if 'schema' not in args and hasattr(reader,'meta'): + schema = reader.meta(table=args['from']) + schema = [{"name":_item.name,"type":_item.field_type} for _item in schema] + + + # else: + # # + # # This will account for autopilot mode ... 
+ # df = args['data'] _info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[0]}} @@ -188,7 +187,7 @@ class Components : # writer = factory.instance(**ostore) _columns = None skip_columns = [] - _schema = [{"name":field.name,"type":field.field_type,"description":field.description} for field in schema] if schema else [] + _schema = schema for _df in candidates : # # we need to format the fields here to make sure we have something cohesive @@ -197,11 +196,11 @@ class Components : if not skip_columns : # _columns = set(df.columns) - set(_df.columns) if 'ignore' in args and 'columns' in args['ignore'] : - - for name in args['ignore']['columns'] : - for _name in _df.columns: - if _name in name: - skip_columns.append(_name) + skip_columns = self.get_ignore(data=_df,columns=args['ignore']['columns']) + # for name in args['ignore']['columns'] : + # for _name in _df.columns: + # if _name in name: + # skip_columns.append(_name) # # We perform a series of set operations to insure that the following conditions are met: # - the synthetic dataset only has fields that need to be synthesized From ac8968c3e33cfe0408a67074bb7bbc0695981d21 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 05:23:55 -0500 Subject: [PATCH 119/250] bug fix: fields skipped (training) --- pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index e643278..f4db40c 100644 --- a/pipeline.py +++ b/pipeline.py @@ -104,9 +104,6 @@ class Components : # # - if 'ignore' in args and 'columns' in args['ignore'] : - _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) - df = df[ list(set(df.columns)- set(_cols))] # df = df.fillna('') if schema : _schema = [] @@ -125,6 +122,9 @@ class Components : _args = copy.deepcopy(args) # _args['store'] = args['store']['source'] _args['data'] = df + if 'ignore' in args and 'columns' in args['ignore'] : + _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) + _args['data'] = df[ list(set(df.columns)- set(_cols))] data.maker.train(**_args) From 0ef149f76b4d17d0b4bd78847b8c9403d895800a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 05:35:29 -0500 Subject: [PATCH 120/250] bug fix: fields skipped (training) --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index f4db40c..6cb1be9 100644 --- a/pipeline.py +++ b/pipeline.py @@ -79,7 +79,7 @@ class Components : if 'columns' in _args and 'data' in _args : _df = _args['data'] terms = _args['columns'] - return [name for name in _df.columns if name in terms] + return [name for name in _df.columns if np.sum( [int(field in name )for field in terms ]) > 0 ] return [] def train(self,**args): From 9bbe9b7ff908e8baa3ba9b38fc84a38c4b1d0d09 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 09:00:57 -0500 Subject: [PATCH 121/250] optimization (minor) --- data/maker/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index fbdf208..611b13d 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -197,17 +197,17 @@ def generate(**_args): f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json'])) _map = json.loads(f.read()) f.close() - if 'file' in _args : - df = pd.read_csv(_args['file']) - else: - df = _args['data'] if not isinstance(_args['data'],str) else pd.read_csv(_args['data']) + # if 'file' in _args : + # df = 
pd.read_csv(_args['file']) + # else: + # df = _args['data'] if not isinstance(_args['data'],str) else pd.read_csv(_args['data']) args = {"context":_args['context'],"max_epochs":_args['max_epochs'],"candidates":_args['candidates']} args['logs'] = _args['logs'] if 'logs' in _args else 'logs' args ['max_epochs'] = _args['max_epochs'] # args['matrix_size'] = _matrix.shape[0] args['batch_size'] = 2000 args['partition'] = 0 if 'partition' not in _args else _args['partition'] - args['row_count'] = df.shape[0] + args['row_count'] = _args['data'].shape[0] # # @TODO: perhaps get the space of values here ... (not sure it's a good idea) # From b283f72dc963a433152ab97b5b934b36d65471a8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 10:48:25 -0500 Subject: [PATCH 122/250] bug fix --- pipeline.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pipeline.py b/pipeline.py index 6cb1be9..d00ddb7 100644 --- a/pipeline.py +++ b/pipeline.py @@ -179,6 +179,10 @@ class Components : _dc = pd.DataFrame() # for mdf in df : args['data'] = df + if 'ignore' in args and 'columns' in args['ignore'] : + _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) + args['data'] = df[ list(set(df.columns)- set(_cols))] + args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) candidates = (data.maker.generate(**args)) From 341b9ffec165a280a00ddd71ad97a468f920e3ac Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 16:14:48 -0500 Subject: [PATCH 123/250] bug fix: log information about space --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 611b13d..59a7ff0 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -119,7 +119,7 @@ def train (**_args): values = _inputhandler._map[key]['values'].tolist() _map[key] = {"beg":beg,"end":end,"values":np.array(values).astype(str).tolist()} info = {"rows":_matrix.shape[0],"cols":_matrix.shape[1],"map":_map} - logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":info}) + logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":_inputhandler._io}) args['logs'] = _args['logs'] if 'logs' in _args else 'logs' args ['max_epochs'] = _args['max_epochs'] From bdd752550ef5f3b9bd63868d621471653e3848a7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 17:01:32 -0500 Subject: [PATCH 124/250] bug fix attempt: large matrix conversion fails --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4b96d08..a2e6744 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,9 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.4.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker", + "version":"1.4.2", + "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' From f65b082fb129cbb37091a35d8823218c766882e7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 
2021 17:01:59 -0500 Subject: [PATCH 125/250] bug fix attempt: large matrix conversion fails --- data/gan.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index c61d1b1..767a24b 100644 --- a/data/gan.py +++ b/data/gan.py @@ -603,7 +603,8 @@ class Predict(GNet): # # df = pd.DataFrame(np.round(f)).astype(np.int32) - candidates.append (np.round(_matrix).astype(np.int64)) + # candidates.append (np.round(_matrix).astype(np.int64)) + candidates.append( [np.round(row).astype(int) for row in _matrix]) # return candidates[0] if len(candidates) == 1 else candidates return candidates From 62a665464dca608439e2328447c9ea0880cd4ed8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 17:17:10 -0500 Subject: [PATCH 126/250] .. --- data/gan.py | 2 +- data/maker/prepare/__init__.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/data/gan.py b/data/gan.py index 767a24b..985e706 100644 --- a/data/gan.py +++ b/data/gan.py @@ -604,7 +604,7 @@ class Predict(GNet): # df = pd.DataFrame(np.round(f)).astype(np.int32) # candidates.append (np.round(_matrix).astype(np.int64)) - candidates.append( [np.round(row).astype(int) for row in _matrix]) + candidates.append(np.array([np.round(row).astype(int) for row in _matrix])) # return candidates[0] if len(candidates) == 1 else candidates return candidates diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 381dfc0..9fb0fa7 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -111,13 +111,13 @@ class Input : if 'columns' in _args : self._columns = _args['columns'] - else: - # - # We will look into the count and make a judgment call - _df = pd.DataFrame(self.df.apply(lambda col: col.dropna().unique().size )).T - MIN_SPACE_SIZE = 2 - self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() - self._io = _df.to_dict(orient='records') + # else: + # + # We will look into the count and make a judgment call + _df = pd.DataFrame(self.df.apply(lambda col: col.dropna().unique().size )).T + MIN_SPACE_SIZE = 2 + self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() + self._io = _df.to_dict(orient='records') def _initdata(self,**_args): """ From 846fa99743eae03d87acd12a3503064398db0e8d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 17:50:12 -0500 Subject: [PATCH 127/250] bug fix: data type and schema fields (order) --- pipeline.py | 3 ++- setup.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index d00ddb7..47f8547 100644 --- a/pipeline.py +++ b/pipeline.py @@ -192,6 +192,7 @@ class Components : _columns = None skip_columns = [] _schema = schema + cols = [_item['name'] for _item in _schema] for _df in candidates : # # we need to format the fields here to make sure we have something cohesive @@ -222,7 +223,7 @@ class Components : _df = pd.DataFrame.join(df,_df) - writer.write(_df,schema=_schema,table=args['from']) + writer.write(_df[cols],schema=_schema,table=args['from']) # writer.write(df,table=table) pass else: diff --git a/setup.py b/setup.py index a2e6744..450d0d9 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.2", + "version":"1.4.3", 
"author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From 20ee62178a60df87f161183bf13f89bb70f95c60 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 Mar 2021 22:00:01 -0500 Subject: [PATCH 128/250] bug fixes with data-types --- pipeline.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 47f8547..49b2039 100644 --- a/pipeline.py +++ b/pipeline.py @@ -222,7 +222,9 @@ class Components : # Let us merge the dataset here and and have a comprehensive dataset _df = pd.DataFrame.join(df,_df) - + for _item in _schema : + if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : + _df[_item['name']] = _df[_item['name']].astype(str) writer.write(_df[cols],schema=_schema,table=args['from']) # writer.write(df,table=table) pass From e0601edea547a28d06c6b82fe313a4f4e5930542 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 12:14:51 -0500 Subject: [PATCH 129/250] bug fix: zeros matrix and continuous variables --- data/maker/prepare/__init__.py | 10 ++++- pipeline.py | 70 ++++++++++++++++++++++++++++++---- setup.py | 2 +- 3 files changed, 71 insertions(+), 11 deletions(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 9fb0fa7..e15c63b 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -16,6 +16,9 @@ import numpy as np # import cupy as cp import sys import os +# +# The following is to address the issue over creating a large matrix ... +# # from multiprocessing import Process, Queue # if 'GPU' in os.environ : @@ -230,8 +233,11 @@ class Input : cols = np.array(cols) row_count = len(rows) # if 'GPU' not in os.environ : - _matrix = np.zeros([row_count,cols.size]) - + # _matrix = np.zeros([row_count,cols.size],dtype=int) + # + # @NOTE: For some reason, there is an out of memory error created here, this seems to fix it (go figure) + # + _matrix = np.array([np.zeros(cols.size) for i in np.arange(row_count)]) [np.put(_matrix[i], np.where(cols == rows[i]) ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0] # else: # _matrix = cp.zeros([row_count,cols.size]) diff --git a/pipeline.py b/pipeline.py index 49b2039..a38029d 100644 --- a/pipeline.py +++ b/pipeline.py @@ -122,10 +122,20 @@ class Components : _args = copy.deepcopy(args) # _args['store'] = args['store']['source'] _args['data'] = df + # + # The columns that are continuous should also be skipped because they don't need to be synthesied (like-that) + if 'continuous' in args : + x_cols = args['continuous'] + else: + x_cols = [] + if 'ignore' in args and 'columns' in args['ignore'] : _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) _args['data'] = df[ list(set(df.columns)- set(_cols))] - + # + # We need to make sure that continuous columns are removed + if x_cols : + _args['data'] = df[list(set(df.columns) - set(x_cols))] data.maker.train(**_args) if 'autopilot' in ( list(args.keys())) : @@ -136,7 +146,26 @@ class Components : pass - def post(self,args): + def approximate(self,values): + """ + :param values array of values to be approximated + """ + if values.dtype in [int,float] : + r = np.random.dirichlet(values) + x = [] + _type = values.dtype + for index in np.arange(values.size) : + + if 
np.random.choice([0,1],1)[0] : + value = values[index] + (values[index] * r[index]) + else : + value = values[index] - (values[index] * r[index]) + value = int(value) if _type == int else np.round(value,2) + x.append( value) + np.random.shuffle(x) + return np.array(x) + else: + return values pass @@ -179,10 +208,23 @@ class Components : _dc = pd.DataFrame() # for mdf in df : args['data'] = df + # + # The columns that are continuous should also be skipped because they don't need to be synthesied (like-that) + if 'continuous' in args : + x_cols = args['continuous'] + else: + x_cols = [] + if 'ignore' in args and 'columns' in args['ignore'] : _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) args['data'] = df[ list(set(df.columns)- set(_cols))] - + # + # We need to remove the continuous columns from the data-frame + # @TODO: Abstract this !! + # + if x_cols : + args['data'] = df[list(set(df.columns) - set(x_cols))] + args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) candidates = (data.maker.generate(**args)) @@ -192,7 +234,10 @@ class Components : _columns = None skip_columns = [] _schema = schema - cols = [_item['name'] for _item in _schema] + if schema : + cols = [_item['name'] for _item in _schema] + else: + cols = df.columns for _df in candidates : # # we need to format the fields here to make sure we have something cohesive @@ -206,6 +251,9 @@ class Components : # for _name in _df.columns: # if _name in name: # skip_columns.append(_name) + if x_cols : + for _col in x_cols : + _df[_col] = self.approximate(df[_col]) # # We perform a series of set operations to insure that the following conditions are met: # - the synthetic dataset only has fields that need to be synthesized @@ -222,10 +270,16 @@ class Components : # Let us merge the dataset here and and have a comprehensive dataset _df = pd.DataFrame.join(df,_df) - for _item in _schema : - if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : - _df[_item['name']] = _df[_item['name']].astype(str) - writer.write(_df[cols],schema=_schema,table=args['from']) + if _schema : + for _item in _schema : + if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : + _df[_item['name']] = _df[_item['name']].astype(str) + + pass + if _schema : + writer.write(_df[cols],schema=_schema,table=args['from']) + else: + writer.write(_df[cols],table=args['from']) # writer.write(df,table=table) pass else: diff --git a/setup.py b/setup.py index 450d0d9..544f4b3 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.3", + "version":"1.4.4", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From 3fb82acd32885856b4fd7abe5b1041e50c7e53c8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 12:21:57 -0500 Subject: [PATCH 130/250] ... 
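For context: the approximate() helper introduced just above replaces GAN synthesis for continuous columns with a Dirichlet-weighted perturbation of the observed values. A minimal standalone sketch of that idea (simplified, not the exact pipeline code; the +.001 offset is there because dirichlet() rejects zero-valued concentration parameters):

    import numpy as np

    def approximate(values):
        # perturb each non-negative measurement by a random fraction of itself,
        # with the fractions drawn from a Dirichlet distribution over the values
        values = np.asarray(values, dtype=float)
        r = np.random.dirichlet(values + .001)
        signs = np.random.choice([-1, 1], size=values.size)
        x = values + signs * values * r
        np.random.shuffle(x)
        return np.round(x, 2)

    print(approximate([10, 12, 0, 7]))   # values close to, but not equal to, the originals

The shuffle decouples the perturbed values from their source rows while keeping the column-level distribution close to the original.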
--- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index a38029d..c1c5719 100644 --- a/pipeline.py +++ b/pipeline.py @@ -253,7 +253,7 @@ class Components : # skip_columns.append(_name) if x_cols : for _col in x_cols : - _df[_col] = self.approximate(df[_col]) + _df[_col] = self.approximate(df[_col].fillna(-1)) # # We perform a series of set operations to insure that the following conditions are met: # - the synthetic dataset only has fields that need to be synthesized From cf478016b06a023c6058012465bb16779534aac0 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 12:28:09 -0500 Subject: [PATCH 131/250] ... --- pipeline.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index c1c5719..6f28eac 100644 --- a/pipeline.py +++ b/pipeline.py @@ -253,7 +253,10 @@ class Components : # skip_columns.append(_name) if x_cols : for _col in x_cols : - _df[_col] = self.approximate(df[_col].fillna(-1)) + if df[_col].unique().size > 0 : + _df[_col] = self.approximate(df[_col].fillna(-1)) + else: + _df[_col] = -1 # # We perform a series of set operations to insure that the following conditions are met: # - the synthetic dataset only has fields that need to be synthesized From 732ccb42e5cd886da632f5a0f523db13bbc07494 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 12:43:09 -0500 Subject: [PATCH 132/250] ... --- pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipeline.py b/pipeline.py index 6f28eac..8b1dd9e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -136,6 +136,8 @@ class Components : # We need to make sure that continuous columns are removed if x_cols : _args['data'] = df[list(set(df.columns) - set(x_cols))] + if 'gpu' in args : + _args['gpu'] = args['gpu'] data.maker.train(**_args) if 'autopilot' in ( list(args.keys())) : From 5a16e325ac10348eacc4e93eac6a283069ed9722 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 13:09:06 -0500 Subject: [PATCH 133/250] gpu indexing --- data/maker/__init__.py | 79 ++---------------------------------------- pipeline.py | 11 ++++-- 2 files changed, 11 insertions(+), 79 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 59a7ff0..3e42419 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -81,7 +81,6 @@ class ContinuousToDiscrete : return values - def train (**_args): """ :params sql @@ -126,7 +125,7 @@ def train (**_args): args['matrix_size'] = _matrix.shape[0] args['batch_size'] = 2000 args['partition'] = 0 if 'partition' not in _args else _args['partition'] - os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' + # os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' trainer = gan.Train(**args) # @@ -215,8 +214,7 @@ def generate(**_args): _inputhandler = prepare.Input(**_args) values,_matrix = _inputhandler.convert() args['values'] = np.array(values) - if 'gpu' in _args : - os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu']) + handler = gan.Predict (**args) handler.load_meta(None) # @@ -226,76 +224,3 @@ def generate(**_args): candidates = handler.apply(candidates=args['candidates']) return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates] - - -def _generate(**args): - """ - This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset - @return pandas.DataFrame - - :data data-frame to be synthesized - :column columns that need to be synthesized 
(discrete) - :id column identifying an entity - :logs location on disk where the learnt knowledge of the dataset is - """ - # df = args['data'] - df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) - - CONTINUOUS = args['continuous'] if 'continuous' in args else [] - column = args['column'] if (isinstance(args['column'],list)) else [args['column']] - # column_id = args['id'] - # - #@TODO: - # If the identifier is not present, we should fine a way to determine or make one - # - BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - # NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value'] - bhandler = Binary() - _df = df.copy() - for col in column : - args['context'] = col - args['column'] = col - - msize = args['matrix_size'] if 'matrix_size' in args else -1 - values = bhandler.get_column(df[col],msize) - MISSING= bhandler.get_missing(df[col],msize) - - - - args['values'] = values - args['row_count'] = df.shape[0] - # if col in NO_VALUE : - # args['no_value'] = NO_VALUE[col] - # else: - # args['no_value'] = NO_VALUE - # novalue = NO_VALUE[col] if NO_VALUE[col] in ['na',''] else NO_VALUE[col] - # MISSING += [NO_VALUE[col]] - args['missing'] = MISSING - # - # we can determine the cardinalities here so we know what to allow or disallow - handler = gan.Predict (**args) - handler.load_meta(col) - r = handler.apply() - if col in CONTINUOUS : - r[col] = np.array(r[col]) - _approx = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) #-- approximating based on arbitrary bins - r[col] = _approx - - - - _df[col] = r[col] - # - # Let's cast the type to the original type (it makes the data more usable) - # - # print (values) - # print ([col,df[col].dtype,_df[col].tolist()]) - otype = df[col].dtype - _df[col] = _df[col].astype(otype) - - # - # @TODO: log basic stats about the synthetic attribute - # - # print (r)s - # break - - return _df \ No newline at end of file diff --git a/pipeline.py b/pipeline.py index 8b1dd9e..6f39b55 100644 --- a/pipeline.py +++ b/pipeline.py @@ -81,7 +81,12 @@ class Components : terms = _args['columns'] return [name for name in _df.columns if np.sum( [int(field in name )for field in terms ]) > 0 ] return [] - + def set_gpu(self,**_args) : + if 'gpu' in _args : + gpu = _args['gpu'] if type(_args['gpu']) != str else [_args['gpu']] + _index = str(gpu[0]) + os.environ['CUDA_VISIBLE_DEVICES'] = _index + return gpu def train(self,**args): """ This function will perform training on the basis of a given pointer that reads data @@ -137,7 +142,7 @@ class Components : if x_cols : _args['data'] = df[list(set(df.columns) - set(x_cols))] if 'gpu' in args : - _args['gpu'] = args['gpu'] + _args['gpu'] = self.set_gpu(gpu=args['gpu']) data.maker.train(**_args) if 'autopilot' in ( list(args.keys())) : @@ -228,6 +233,8 @@ class Components : args['data'] = df[list(set(df.columns) - set(x_cols))] args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) + if 'gpu' in args : + args['gpu'] = self.set_gpu(gpu=args['gpu']) candidates = (data.maker.generate(**args)) if 'sql.BQWriter' in ostore['type'] : From a73e186f77d174056063760af65fbd25ad29ee44 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 13:20:35 -0500 Subject: [PATCH 134/250] gpu indexing --- data/gan.py | 4 ++- data/maker/__init__.py | 59 +++--------------------------------------- pipeline.py | 2 ++ 3 files changed, 9 insertions(+), 56 deletions(-) diff --git a/data/gan.py b/data/gan.py index 985e706..dd8ea6a 100644 --- 
a/data/gan.py +++ b/data/gan.py @@ -64,7 +64,9 @@ class GNet : self.GPU_CHIPS = [0] if 'CUDA_VISIBLE_DEVICES' in os.environ : os.environ.pop('CUDA_VISIBLE_DEVICES') - self.NUM_GPUS = len(self.GPU_CHIPS) + self.NUM_GPUS = 0 + else: + self.NUM_GPUS = len(self.GPU_CHIPS) self.PARTITION = args['partition'] # if self.NUM_GPUS > 1 : diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 3e42419..bfd6a5f 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -86,18 +86,6 @@ def train (**_args): :params sql :params store """ - # - # Let us prepare the data by calling the utility function - # - # if 'file' in _args : - # # - # # We are reading data from a file - # _args['data'] = pd.read_csv(_args['file']) - # else: - # # - # # data will be read from elsewhere (a data-store)... - # pass - # if 'ignore' in _args and 'columns' in _args['ignore']: _inputhandler = prepare.Input(**_args) values,_matrix = _inputhandler.convert() @@ -125,6 +113,8 @@ def train (**_args): args['matrix_size'] = _matrix.shape[0] args['batch_size'] = 2000 args['partition'] = 0 if 'partition' not in _args else _args['partition'] + if 'gpu' in _args : + args['gpu'] = _args['gpu'] # os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' trainer = gan.Train(**args) @@ -137,50 +127,7 @@ def train (**_args): trainer.apply() pass -def _train (**args) : - """ - This function is intended to train the GAN in order to learn about the distribution of the features - :column columns that need to be synthesized (discrete) - :logs where the output of the (location on disk) - :id identifier of the dataset - :data data-frame to be synthesized - :context label of what we are synthesizing - """ - column = args['column'] if (isinstance(args['column'],list)) else [args['column']] - # CONTINUOUS = args['continuous'] if 'continuous' in args else [] - # column_id = args['id'] - df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) - df.columns = [name.lower() for name in df.columns] - # - # @TODO: - # Consider sequential training of sub population for extremely large datasets - # - - # - # If we have several columns we will proceed one at a time (it could be done in separate threads) - # @TODO : Consider performing this task on several threads/GPUs simulataneously - # - for col in column : - msize = args['matrix_size'] if 'matrix_size' in args else -1 - args['real'] = (Binary()).apply(df[col],msize) - context = args['context'] - if 'store' in args : - args['store']['args']['doc'] = context - logger = factory.instance(**args['store']) - args['logger'] = logger - info = {"rows":args['real'].shape[0],"cols":args['real'].shape[1],"name":col,"partition":args['partition']} - logger.write({"module":"gan-train","action":"data-prep","input":info}) - - else: - logger = None - args['column'] = col - args['context'] = col - - # - # If the s - trainer = gan.Train(**args) - trainer.apply() def get(**args): """ This function will restore a checkpoint from a persistant storage on to disk @@ -214,6 +161,8 @@ def generate(**_args): _inputhandler = prepare.Input(**_args) values,_matrix = _inputhandler.convert() args['values'] = np.array(values) + if 'gpu' in _args : + args['gpu'] = _args['gpu'] handler = gan.Predict (**args) handler.load_meta(None) diff --git a/pipeline.py b/pipeline.py index 6f39b55..e2bbbec 100644 --- a/pipeline.py +++ b/pipeline.py @@ -87,6 +87,8 @@ class Components : _index = str(gpu[0]) os.environ['CUDA_VISIBLE_DEVICES'] = _index return gpu + else : + return None def 
train(self,**args): """ This function will perform training on the basis of a given pointer that reads data From 5b2aeb0e3e8781a35a590b621e714e5d9278514d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 13:38:28 -0500 Subject: [PATCH 135/250] continuous functions --- pipeline.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pipeline.py b/pipeline.py index e2bbbec..0583116 100644 --- a/pipeline.py +++ b/pipeline.py @@ -142,10 +142,15 @@ class Components : # # We need to make sure that continuous columns are removed if x_cols : - _args['data'] = df[list(set(df.columns) - set(x_cols))] + _args['data'] = _args['data'][list(set(df.columns) - set(x_cols))] if 'gpu' in args : _args['gpu'] = self.set_gpu(gpu=args['gpu']) - data.maker.train(**_args) + if df.shape[0] and df.shape[0] : + # + # We have a full blown matrix to be processed + data.maker.train(**_args) + else: + print ("... skipping training !!") if 'autopilot' in ( list(args.keys())) : @@ -216,7 +221,7 @@ class Components : _dc = pd.DataFrame() # for mdf in df : - args['data'] = df + args['data'] = df.copy() # # The columns that are continuous should also be skipped because they don't need to be synthesied (like-that) if 'continuous' in args : @@ -232,7 +237,7 @@ class Components : # @TODO: Abstract this !! # if x_cols : - args['data'] = df[list(set(df.columns) - set(x_cols))] + args['data'] = args['data'][list(set(df.columns) - set(x_cols))] args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) if 'gpu' in args : From db496f998341565d2895f1682ce1c0c4a995cabd Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 13:43:07 -0500 Subject: [PATCH 136/250] continuous functions skipping fields --- pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 0583116..f32a45e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -142,7 +142,7 @@ class Components : # # We need to make sure that continuous columns are removed if x_cols : - _args['data'] = _args['data'][list(set(df.columns) - set(x_cols))] + _args['data'] = _args['data'][list(set(_args['data'].columns) - set(x_cols))] if 'gpu' in args : _args['gpu'] = self.set_gpu(gpu=args['gpu']) if df.shape[0] and df.shape[0] : @@ -237,7 +237,7 @@ class Components : # @TODO: Abstract this !! # if x_cols : - args['data'] = args['data'][list(set(df.columns) - set(x_cols))] + args['data'] = args['data'][list(set(args['data'].columns) - set(x_cols))] args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) if 'gpu' in args : From e56254000e1ef89681dcbcd287a0efa770fb6071 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 13:53:29 -0500 Subject: [PATCH 137/250] .. --- pipeline.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pipeline.py b/pipeline.py index f32a45e..a09fbde 100644 --- a/pipeline.py +++ b/pipeline.py @@ -231,7 +231,7 @@ class Components : if 'ignore' in args and 'columns' in args['ignore'] : _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) - args['data'] = df[ list(set(df.columns)- set(_cols))] + args['data'] = args['data'][ list(set(df.columns)- set(_cols))] # # We need to remove the continuous columns from the data-frame # @TODO: Abstract this !! 
@@ -267,12 +267,6 @@ class Components : # for _name in _df.columns: # if _name in name: # skip_columns.append(_name) - if x_cols : - for _col in x_cols : - if df[_col].unique().size > 0 : - _df[_col] = self.approximate(df[_col].fillna(-1)) - else: - _df[_col] = -1 # # We perform a series of set operations to insure that the following conditions are met: # - the synthetic dataset only has fields that need to be synthesized @@ -284,6 +278,12 @@ class Components : if set(df.columns) & set(_df.columns) : _columns = set(df.columns) - set(_df.columns) df = df[_columns] + if x_cols : + for _col in x_cols : + if df[_col].unique().size > 0 : + _df[_col] = self.approximate(df[_col].fillna(-1)) + else: + _df[_col] = -1 # # Let us merge the dataset here and and have a comprehensive dataset From cd7841be92ab3c894bcf1a9082ce6f9c20c58d68 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 13:56:23 -0500 Subject: [PATCH 138/250] .. --- pipeline.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipeline.py b/pipeline.py index a09fbde..9f57d59 100644 --- a/pipeline.py +++ b/pipeline.py @@ -274,16 +274,16 @@ class Components : # _df = _df[list(set(_df.columns) - set(skip_columns))] - - if set(df.columns) & set(_df.columns) : - _columns = set(df.columns) - set(_df.columns) - df = df[_columns] if x_cols : for _col in x_cols : if df[_col].unique().size > 0 : _df[_col] = self.approximate(df[_col].fillna(-1)) else: _df[_col] = -1 + + if set(df.columns) & set(_df.columns) : + _columns = set(df.columns) - set(_df.columns) + df = df[_columns] # # Let us merge the dataset here and and have a comprehensive dataset From 7ccf9848b2d001b2249deb89fe84cacc12c58558 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 14:04:55 -0500 Subject: [PATCH 139/250] .. --- pipeline.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 9f57d59..72dea06 100644 --- a/pipeline.py +++ b/pipeline.py @@ -236,8 +236,10 @@ class Components : # We need to remove the continuous columns from the data-frame # @TODO: Abstract this !! # + real_df = pd.DataFrame() if x_cols : args['data'] = args['data'][list(set(args['data'].columns) - set(x_cols))] + real_df = args[x_cols].copy() args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) if 'gpu' in args : @@ -276,7 +278,7 @@ class Components : _df = _df[list(set(_df.columns) - set(skip_columns))] if x_cols : for _col in x_cols : - if df[_col].unique().size > 0 : + if real_df[_col].unique().size > 0 : _df[_col] = self.approximate(df[_col].fillna(-1)) else: _df[_col] = -1 @@ -289,6 +291,7 @@ class Components : # Let us merge the dataset here and and have a comprehensive dataset _df = pd.DataFrame.join(df,_df) + if _schema : for _item in _schema : if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : From 452014ec1783f81bbbc6abc7a19a9d5e051b8cc5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 14:07:21 -0500 Subject: [PATCH 140/250] ... 
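The reordering in the last few patches is about when the continuous columns get approximated relative to the set operations that assemble the output. The intended invariant, roughly (a sketch with made-up frames, not the pipeline code): the candidate frame keeps only the synthesized fields, the original frame keeps everything else, and the two are joined back into one comprehensive dataset.

    import pandas as pd

    df  = pd.DataFrame({'person_id': [1, 2], 'gender': ['M', 'F'], 'age': [30, 40]})
    _df = pd.DataFrame({'gender': ['F', 'M'], 'age': [31, 39]})      # synthetic candidate
    skip_columns = ['age']                                           # e.g. continuous fields handled separately

    _df = _df[list(set(_df.columns) - set(skip_columns))]            # synthesized fields only
    df  = df[list(set(df.columns) - set(_df.columns))]               # untouched originals only
    print(pd.DataFrame.join(df, _df))                                # all three columns, each exactly once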
--- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 72dea06..7082b71 100644 --- a/pipeline.py +++ b/pipeline.py @@ -239,7 +239,7 @@ class Components : real_df = pd.DataFrame() if x_cols : args['data'] = args['data'][list(set(args['data'].columns) - set(x_cols))] - real_df = args[x_cols].copy() + real_df = args['data'][x_cols].copy() args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) if 'gpu' in args : From 1178cb7343644b1c9b0c2158aad3c53452e0f3b8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 14:09:38 -0500 Subject: [PATCH 141/250] ... --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 7082b71..29d15a7 100644 --- a/pipeline.py +++ b/pipeline.py @@ -239,7 +239,7 @@ class Components : real_df = pd.DataFrame() if x_cols : args['data'] = args['data'][list(set(args['data'].columns) - set(x_cols))] - real_df = args['data'][x_cols].copy() + real_df = df['data'][x_cols].copy() args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) if 'gpu' in args : From 6bbe7677146b7914ff8ffa37dd44bfbce96837de Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 1 Apr 2021 14:13:00 -0500 Subject: [PATCH 142/250] ... --- pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 29d15a7..6476221 100644 --- a/pipeline.py +++ b/pipeline.py @@ -239,7 +239,7 @@ class Components : real_df = pd.DataFrame() if x_cols : args['data'] = args['data'][list(set(args['data'].columns) - set(x_cols))] - real_df = df['data'][x_cols].copy() + real_df = df[x_cols].copy() args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) if 'gpu' in args : @@ -279,7 +279,7 @@ class Components : if x_cols : for _col in x_cols : if real_df[_col].unique().size > 0 : - _df[_col] = self.approximate(df[_col].fillna(-1)) + _df[_col] = self.approximate(real_df[_col].fillna(-1)) else: _df[_col] = -1 From fa8915a990873086040e29908edc8f9beeb5f220 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 10:54:44 -0500 Subject: [PATCH 143/250] bug fix: matrix allocation error --- data/maker/prepare/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index e15c63b..ecb47bd 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -237,7 +237,7 @@ class Input : # # @NOTE: For some reason, there is an out of memory error created here, this seems to fix it (go figure) # - _matrix = np.array([np.zeros(cols.size) for i in np.arange(row_count)]) + _matrix = np.array([np.repeat(0,cols.size) for i in range(row_count)]) [np.put(_matrix[i], np.where(cols == rows[i]) ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0] # else: # _matrix = cp.zeros([row_count,cols.size]) From a43247ac65101128a8a2a07f6cce45511e5864ee Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 11:17:34 -0500 Subject: [PATCH 144/250] logging generator .... 
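For context on the matrix-allocation fix a couple of patches back: the one-hot matrix in data/maker/prepare is built row by row with np.repeat as a workaround for an out-of-memory error seen with np.zeros on large inputs. The surrounding logic amounts to this sketch (sample values assumed):

    import numpy as np

    rows = np.array(['a', 'c', 'a', 'b'])     # observed values of one column
    cols = np.unique(rows)                    # the value space of that column
    row_count = rows.size

    # allocate row by row instead of np.zeros([row_count, cols.size])
    _matrix = np.array([np.repeat(0, cols.size) for i in range(0, row_count)])
    [np.put(_matrix[i], np.where(cols == rows[i]), 1)
        for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0]
    print(_matrix)                            # each row is the one-hot encoding of rows[i]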
--- pipeline.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 6476221..0a9c549 100644 --- a/pipeline.py +++ b/pipeline.py @@ -217,6 +217,7 @@ class Components : # df = args['data'] _info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[0]}} + logger.write(_info) _dc = pd.DataFrame() @@ -244,7 +245,8 @@ class Components : args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) if 'gpu' in args : args['gpu'] = self.set_gpu(gpu=args['gpu']) - + _info = {"module":"gan-prep","action":"prune","shape":{"rows":args['data'].shape[0],"columns":args['data'].shape[0]}} + logger.write(_info) candidates = (data.maker.generate(**args)) if 'sql.BQWriter' in ostore['type'] : #table = ".".join([ostore['['dataset'],args['context']]) From efd2fd6a9a8419049e59473a62f2e99762d399c7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 11:26:24 -0500 Subject: [PATCH 145/250] logging generator .... --- pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index 0a9c549..bbc54bc 100644 --- a/pipeline.py +++ b/pipeline.py @@ -189,7 +189,7 @@ class Components : This function will generate data and store it to a given, """ store = args['store']['logs'] - store['doc'] = args['context'] + store['args']['doc'] = args['context'] logger = factory.instance(**store) #type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) ostore = args['store']['target'] @@ -216,7 +216,7 @@ class Components : # # This will account for autopilot mode ... # df = args['data'] - _info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[0]}} + _info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[1]}} logger.write(_info) @@ -245,7 +245,7 @@ class Components : args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) if 'gpu' in args : args['gpu'] = self.set_gpu(gpu=args['gpu']) - _info = {"module":"gan-prep","action":"prune","shape":{"rows":args['data'].shape[0],"columns":args['data'].shape[0]}} + _info = {"module":"gan-prep","action":"prune","shape":{"rows":args['data'].shape[0],"columns":args['data'].shape[1]}} logger.write(_info) candidates = (data.maker.generate(**args)) if 'sql.BQWriter' in ostore['type'] : From 88d602de1cb5db449c997314465ce9f93fc5cb7a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 11:45:20 -0500 Subject: [PATCH 146/250] ... 
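One of the changes in this patch touches get_ignore(), which matches ignore terms against column names by substring, so a single term such as 'date' skips every column whose name contains it. A plain-python equivalent of that matching rule (illustrative only; column names assumed):

    import pandas as pd

    def get_ignore(_df, terms):
        # a column is skipped as soon as any ignore term occurs in its name
        return [name for name in _df.columns if any(field in name for field in terms)]

    _df = pd.DataFrame(columns=['person_id', 'birth_date', 'visit_start_date', 'gender'])
    print(get_ignore(_df, ['date']))          # ['birth_date', 'visit_start_date']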
--- pipeline.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index bbc54bc..87d2af6 100644 --- a/pipeline.py +++ b/pipeline.py @@ -79,7 +79,8 @@ class Components : if 'columns' in _args and 'data' in _args : _df = _args['data'] terms = _args['columns'] - return [name for name in _df.columns if np.sum( [int(field in name )for field in terms ]) > 0 ] + return [name for name in _df.columns if np.sum( [int(field in name )for field in terms ]) ] + return [] def set_gpu(self,**_args) : if 'gpu' in _args : @@ -247,7 +248,10 @@ class Components : args['gpu'] = self.set_gpu(gpu=args['gpu']) _info = {"module":"gan-prep","action":"prune","shape":{"rows":args['data'].shape[0],"columns":args['data'].shape[1]}} logger.write(_info) - candidates = (data.maker.generate(**args)) + if args['data'].shape[0] > 0 and args['data'].shape[1] > 0 : + candidates = (data.maker.generate(**args)) + else: + candidate = [df] if 'sql.BQWriter' in ostore['type'] : #table = ".".join([ostore['['dataset'],args['context']]) # writer = factory.instance(**ostore) From 0a346d7abc72434f57c477d4a2816d67bb388f03 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 11:46:36 -0500 Subject: [PATCH 147/250] ... --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 87d2af6..ece3030 100644 --- a/pipeline.py +++ b/pipeline.py @@ -251,7 +251,7 @@ class Components : if args['data'].shape[0] > 0 and args['data'].shape[1] > 0 : candidates = (data.maker.generate(**args)) else: - candidate = [df] + candidates = [df] if 'sql.BQWriter' in ostore['type'] : #table = ".".join([ostore['['dataset'],args['context']]) # writer = factory.instance(**ostore) From 73115724fe2d91d4ffbc5b21dec50c24d6963480 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 12:05:23 -0500 Subject: [PATCH 148/250] ... --- pipeline.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index ece3030..da7b27e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -216,8 +216,22 @@ class Components : # # # # This will account for autopilot mode ... # df = args['data'] - - _info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[1]}} + _cast = {} + if schema : + dtype = str + name = schema['name'] + novalue = -1 + if schema['type'] == 'INTEGER' : + dtype = np.int64 + + elif schema['type'] == 'FLOAT' : + dtype = np.float64 + else: + novalue = '' + _cast[schema['name']] = dtype + df[name] = df[name].fillna(novalue).astype(dtype) + + _info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[1]},"schema":schema} logger.write(_info) From 856d1e4bd7650b74f07f047f193777766dd1b947 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 12:09:34 -0500 Subject: [PATCH 149/250] ... 
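The loop being reshaped in this patch casts each column of the source frame according to its BigQuery schema type so downstream steps see consistent dtypes. A condensed sketch of the schema-to-dtype mapping (the function name is ours, the pipeline does this inline; schema entries assumed to be dictionaries with 'name' and 'type' keys, as produced from the reader's meta()):

    import numpy as np
    import pandas as pd

    def cast_from_schema(df, schema):
        for _item in schema:
            name, ftype = _item['name'], _item['type']
            if ftype in ('INTEGER', 'NUMERIC'):
                dtype, novalue = np.int64, -1
            elif ftype == 'FLOAT':
                dtype, novalue = np.float64, -1
            else:
                dtype, novalue = str, ''
            df[name] = df[name].fillna(novalue).astype(dtype)
        return df

    df = pd.DataFrame({'age': [30.0, None], 'gender': ['M', None]})
    schema = [{'name': 'age', 'type': 'INTEGER'}, {'name': 'gender', 'type': 'STRING'}]
    print(cast_from_schema(df, schema).dtypes)   # age: int64, gender: object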
--- pipeline.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/pipeline.py b/pipeline.py index da7b27e..1ca19e5 100644 --- a/pipeline.py +++ b/pipeline.py @@ -218,18 +218,19 @@ class Components : # df = args['data'] _cast = {} if schema : - dtype = str - name = schema['name'] - novalue = -1 - if schema['type'] == 'INTEGER' : - dtype = np.int64 - - elif schema['type'] == 'FLOAT' : - dtype = np.float64 - else: - novalue = '' - _cast[schema['name']] = dtype - df[name] = df[name].fillna(novalue).astype(dtype) + for _item in schem : + dtype = str + name = _item['name'] + novalue = -1 + if _item['type'] == 'INTEGER' : + dtype = np.int64 + + elif _item['type'] == 'FLOAT' : + dtype = np.float64 + else: + novalue = '' + # _cast[schema['name']] = dtype + df[name] = df[name].fillna(novalue).astype(dtype) _info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[1]},"schema":schema} logger.write(_info) From 0f82f002dfb7512f358e786635f492251e714d20 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 12:11:40 -0500 Subject: [PATCH 150/250] ... --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 1ca19e5..8d35cd8 100644 --- a/pipeline.py +++ b/pipeline.py @@ -218,7 +218,7 @@ class Components : # df = args['data'] _cast = {} if schema : - for _item in schem : + for _item in schema : dtype = str name = _item['name'] novalue = -1 From dbbe0d94ced4649eefc4e245d50b4304c7a79bca Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 12:51:27 -0500 Subject: [PATCH 151/250] bg fix : approximation --- pipeline.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pipeline.py b/pipeline.py index 8d35cd8..00bb80c 100644 --- a/pipeline.py +++ b/pipeline.py @@ -166,7 +166,9 @@ class Components : :param values array of values to be approximated """ if values.dtype in [int,float] : - r = np.random.dirichlet(values) + # + # @TODO: create bins? 
+ r = np.random.dirichlet(values+.001) #-- dirichlet doesn't work on values with zeros x = [] _type = values.dtype for index in np.arange(values.size) : @@ -222,7 +224,7 @@ class Components : dtype = str name = _item['name'] novalue = -1 - if _item['type'] == 'INTEGER' : + if _item['type'] in ['INTEGER','NUMERIC']: dtype = np.int64 elif _item['type'] == 'FLOAT' : @@ -296,11 +298,11 @@ class Components : # - The original dataset has all the fields except those that need to be synthesized # - _df = _df[list(set(_df.columns) - set(skip_columns))] + _df = _df[list(set(_df.columns) - set(skip_columns))].copy() if x_cols : for _col in x_cols : if real_df[_col].unique().size > 0 : - _df[_col] = self.approximate(real_df[_col].fillna(-1)) + _df[_col] = self.approximate(real_df[_col]) else: _df[_col] = -1 From 8997a5ca10dc0a2f9dd58ea3e7ee13a1415298ae Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 13:29:57 -0500 Subject: [PATCH 152/250] bg fix : approximation --- pipeline.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 00bb80c..1bb0707 100644 --- a/pipeline.py +++ b/pipeline.py @@ -300,12 +300,19 @@ class Components : _df = _df[list(set(_df.columns) - set(skip_columns))].copy() if x_cols : + _approx = {} for _col in x_cols : if real_df[_col].unique().size > 0 : + + _df[_col] = self.approximate(real_df[_col]) + _approx[_col] = { + "io":{"min":_df[_col].min(),"max":_df[_col].max(),"mean":_df[_col].mean(),"sd":_df[_col].values.std(),"missing": _df[_col].where(_df[_col] == -1).dropna().count(),"zeros":_df[_col].where(_df[_col] == 0).dropna().count()}, + "real":{"min":real_df[_col].min(),"max":real_df[_col].max(),"mean":real_df[_col].mean(),"sd":real_df[_col].values.std(),"missing": real_df[_col].where(_df[_col] == -1).dropna().count(),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count()} + } else: _df[_col] = -1 - + logger.write({"module":"gan-generate","action":"approximate","status":_approx}) if set(df.columns) & set(_df.columns) : _columns = set(df.columns) - set(_df.columns) df = df[_columns] From fc7b694d0272b9c310b9d501d932e660fc44ab97 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 13:36:47 -0500 Subject: [PATCH 153/250] bg fix : approximation --- pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 1bb0707..2ed5cdc 100644 --- a/pipeline.py +++ b/pipeline.py @@ -307,8 +307,8 @@ class Components : _df[_col] = self.approximate(real_df[_col]) _approx[_col] = { - "io":{"min":_df[_col].min(),"max":_df[_col].max(),"mean":_df[_col].mean(),"sd":_df[_col].values.std(),"missing": _df[_col].where(_df[_col] == -1).dropna().count(),"zeros":_df[_col].where(_df[_col] == 0).dropna().count()}, - "real":{"min":real_df[_col].min(),"max":real_df[_col].max(),"mean":real_df[_col].mean(),"sd":real_df[_col].values.std(),"missing": real_df[_col].where(_df[_col] == -1).dropna().count(),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count()} + "io":{"min":_df[_col].min().astype(float),"max":_df[_col].max().astype(float),"mean":_df[_col].mean().astype(float),"sd":_df[_col].values.std().astype(float),"missing": _df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":_df[_col].where(_df[_col] == 0).dropna().count().astype(float)}, + "real":{"min":real_df[_col].min().astype(float),"max":real_df[_col].max().astype(float),"mean":real_df[_col].mean().astype(float),"sd":real_df[_col].values.std().astype(float),"missing": 
real_df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count().astype(float)} } else: _df[_col] = -1 From 12d7573ba8915f5ee43e81f46d5a556db899fbcb Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 13:52:15 -0500 Subject: [PATCH 154/250] bg fix : approximation --- pipeline.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 2ed5cdc..d73f1fc 100644 --- a/pipeline.py +++ b/pipeline.py @@ -169,14 +169,24 @@ class Components : # # @TODO: create bins? r = np.random.dirichlet(values+.001) #-- dirichlet doesn't work on values with zeros + _sd = values[values > 0].std() + _me = values[values > 0].mean() x = [] _type = values.dtype for index in np.arange(values.size) : if np.random.choice([0,1],1)[0] : value = values[index] + (values[index] * r[index]) + else : value = values[index] - (values[index] * r[index]) + # + # randomly shifting the measurements + if np.random.choice([0,1],1)[0] and _me > _sd: + if np.random.choice([0,1],1)[0] : + value = value * np.divide(_me,_sd) + else: + value = value + (np.divide(_me,_sd)) value = int(value) if _type == int else np.round(value,2) x.append( value) np.random.shuffle(x) @@ -305,7 +315,7 @@ class Components : if real_df[_col].unique().size > 0 : - _df[_col] = self.approximate(real_df[_col]) + _df[_col] = self.approximate(real_df[_col].values) _approx[_col] = { "io":{"min":_df[_col].min().astype(float),"max":_df[_col].max().astype(float),"mean":_df[_col].mean().astype(float),"sd":_df[_col].values.std().astype(float),"missing": _df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":_df[_col].where(_df[_col] == 0).dropna().count().astype(float)}, "real":{"min":real_df[_col].min().astype(float),"max":real_df[_col].max().astype(float),"mean":real_df[_col].mean().astype(float),"sd":real_df[_col].values.std().astype(float),"missing": real_df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count().astype(float)} From f26795387ef60690512ee33b3dcfaee2ca7c5c10 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 7 Apr 2021 15:30:59 -0500 Subject: [PATCH 155/250] feature: bootstrap-like with candidates --- data/gan.py | 26 +++++++++---- data/maker/__init__.py | 14 +++++-- data/maker/prepare/__init__.py | 2 +- pipeline.py | 71 +++++++++++++++++++++++++++++++--- 4 files changed, 94 insertions(+), 19 deletions(-) diff --git a/data/gan.py b/data/gan.py index dd8ea6a..643e838 100644 --- a/data/gan.py +++ b/data/gan.py @@ -67,8 +67,9 @@ class GNet : self.NUM_GPUS = 0 else: self.NUM_GPUS = len(self.GPU_CHIPS) + # os.environ['CUDA_VISIBLE_DEVICES'] = str(self.GPU_CHIPS[0]) - self.PARTITION = args['partition'] + self.PARTITION = args['partition'] if 'partition' in args else None # if self.NUM_GPUS > 1 : # os.environ['CUDA_VISIBLE_DEVICES'] = "4" @@ -117,9 +118,14 @@ class GNet : for key in ['train','output'] : self.mkdir(os.sep.join([self.log_dir,key])) self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT])) + if 'partition' in args : + self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT,str(args['partition'])])) self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + if 'partition' in args : + self.train_dir = os.sep.join([self.train_dir,str(args['partition'])]) + self.out_dir = os.sep.join([self.out_dir,str(args['partition'])]) # if self.logger : # We will clear the logs 
from the data-store @@ -130,7 +136,7 @@ class GNet : # db.backup.insert({'name':column,'logs':list(db[column].find()) }) # db[column].drop() - def load_meta(self,column): + def load_meta(self,**args): """ This function is designed to accomodate the uses of the sub-classes outside of a strict dependency model. Because prediction and training can happen independently @@ -145,6 +151,9 @@ class GNet : setattr(self,key,value) self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + if 'partition' in args : + self.train_dir = os.sep.join([self.train_dir,str(args['partition'])]) + self.out_dir = os.sep.join([self.out_dir,str(args['partition'])]) def log_meta(self,**args) : @@ -265,9 +274,9 @@ class Generator (GNet): #tf.add_to_collection('glosses', loss) tf.compat.v1.add_to_collection('glosses', loss) return loss, loss - def load_meta(self, column): - super().load_meta(column) - self.discriminator.load_meta(column) + def load_meta(self, **args): + super().load_meta(**args) + self.discriminator.load_meta(**args) def network(self,**args) : """ This function will build the network that will generate the synthetic candidates @@ -454,6 +463,7 @@ class Train (GNet): # - determine if the GPU/CPU are busy # for i in self.GPU_CHIPS : #range(self.NUM_GPUS): + with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: if self._LABEL is not None : @@ -559,9 +569,9 @@ class Predict(GNet): # self.MISSING_VALUES = args['no_value'] # self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value'] - def load_meta(self, column): - super().load_meta(column) - self.generator.load_meta(column) + def load_meta(self, **args): + super().load_meta(**args) + self.generator.load_meta(**args) self.ROW_COUNT = self.oROW_COUNT def apply(self,**args): suffix = self.CONTEXT #self.get.suffix() diff --git a/data/maker/__init__.py b/data/maker/__init__.py index bfd6a5f..803590a 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -112,7 +112,8 @@ def train (**_args): args ['max_epochs'] = _args['max_epochs'] args['matrix_size'] = _matrix.shape[0] args['batch_size'] = 2000 - args['partition'] = 0 if 'partition' not in _args else _args['partition'] + if 'partition' in _args : + args['partition'] = _args['partition'] if 'gpu' in _args : args['gpu'] = _args['gpu'] # os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' @@ -121,7 +122,8 @@ def train (**_args): # # @TODO: Write the map.json in the output directory for the logs # - f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']),'w') + # f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']),'w') + f = open(os.sep.join([trainer.out_dir,'map.json']),'w') f.write(json.dumps(_map)) f.close() @@ -140,7 +142,11 @@ def generate(**_args): :param context :param logs """ - f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json'])) + partition = _args['partition'] if 'partition' in _args else None + if not partition : + f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json'])) + else: + f = open(os.sep.join([_args['logs'],'output',_args['context'],str(partition),'map.json'])) _map = json.loads(f.read()) f.close() # if 'file' in _args : @@ -165,7 +171,7 @@ def generate(**_args): args['gpu'] = _args['gpu'] handler = gan.Predict (**args) - handler.load_meta(None) + handler.load_meta(column=None) 
# # Let us now format the matrices by reverting them to a data-frame with values # diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index ecb47bd..5ace56a 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -237,7 +237,7 @@ class Input : # # @NOTE: For some reason, there is an out of memory error created here, this seems to fix it (go figure) # - _matrix = np.array([np.repeat(0,cols.size) for i in range(row_count)]) + _matrix = np.array([np.repeat(0,cols.size) for i in range(0,row_count)]) [np.put(_matrix[i], np.where(cols == rows[i]) ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0] # else: # _matrix = cp.zeros([row_count,cols.size]) diff --git a/pipeline.py b/pipeline.py index d73f1fc..3f8358b 100644 --- a/pipeline.py +++ b/pipeline.py @@ -146,6 +146,8 @@ class Components : _args['data'] = _args['data'][list(set(_args['data'].columns) - set(x_cols))] if 'gpu' in args : _args['gpu'] = self.set_gpu(gpu=args['gpu']) + if 'partition' in args : + _args['partition'] = args['partition'] if df.shape[0] and df.shape[0] : # # We have a full blown matrix to be processed @@ -154,7 +156,7 @@ class Components : print ("... skipping training !!") if 'autopilot' in ( list(args.keys())) : - + args['data'] = df print (['autopilot mode enabled ....',args['context']]) self.generate(args) @@ -171,6 +173,7 @@ class Components : r = np.random.dirichlet(values+.001) #-- dirichlet doesn't work on values with zeros _sd = values[values > 0].std() _me = values[values > 0].mean() + _mi = values.min() x = [] _type = values.dtype for index in np.arange(values.size) : @@ -182,7 +185,7 @@ class Components : value = values[index] - (values[index] * r[index]) # # randomly shifting the measurements - if np.random.choice([0,1],1)[0] and _me > _sd: + if np.random.choice([0,1],1)[0] and _me > _sd : if np.random.choice([0,1],1)[0] : value = value * np.divide(_me,_sd) else: @@ -273,6 +276,9 @@ class Components : args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) if 'gpu' in args : args['gpu'] = self.set_gpu(gpu=args['gpu']) + # if 'partition' in args : + # args['logs'] = os.sep.join([args['logs'],str(args['partition'])]) + _info = {"module":"gan-prep","action":"prune","shape":{"rows":args['data'].shape[0],"columns":args['data'].shape[1]}} logger.write(_info) if args['data'].shape[0] > 0 and args['data'].shape[1] > 0 : @@ -459,12 +465,18 @@ if __name__ == '__main__' : # COLUMNS = DATA.columns # DATA = np.array_split(DATA,PART_SIZE) # args['schema'] = schema + GPU_CHIPS = SYS_ARGS['gpu'] if 'gpu' in SYS_ARGS else None + if GPU_CHIPS and type(GPU_CHIPS) != list : + GPU_CHIPS = [int(_id.strip()) for _id in GPU_CHIPS.split(',')] if type(GPU_CHIPS) == str else [GPU_CHIPS] + if 'gpu' in SYS_ARGS : + args['gpu'] = GPU_CHIPS + jobs = [] if 'generate' in SYS_ARGS : # # Let us see if we have partitions given the log folder content = os.listdir( os.sep.join([args['logs'],'train',args['context']])) - generator = Components() + # if ''.join(content).isnumeric() : # # @@ -508,13 +520,60 @@ if __name__ == '__main__' : # else: # generator.generate(args) # Components.generate(args) - generator.generate(args) + if '--all-chips' in SYS_ARGS and GPU_CHIPS: + index = 0 + jobs = [] + for _id in GPU_CHIPS : + _args = copy.deepcopy(args) + _args['gpu'] = [int(_gpu)] + _args['partition'] = index + index += 1 + make = lambda _params: (Components()).generate(_params) + job = Process(target=make,args=( dict(_args),)) + job.name = 'Trainer # ' + 
str(index) + job.start() + jobs.append(job) + pass + else: + generator = Components() + generator.generate(args) else: # DATA = np.array_split(DATA,PART_SIZE) - agent = Components() - agent.train(**args) + # + # Let us create n-jobs across n-gpus, The assumption here is the data that is produced will be a partition + # @TODO: Find better name for partition + # + if GPU_CHIPS and '--all-chips' in SYS_ARGS: + index = 0 + + for _gpu in GPU_CHIPS : + _args = copy.deepcopy(args) + _args['gpu'] = [int(_gpu)] + _args['partition'] = index + index += 1 + make = lambda _params: (Components()).train(**_params) + job = Process(target=make,args=( dict(_args),)) + job.name = 'Trainer # ' + str(index) + job.start() + jobs.append(job) + + + + + else: + # + # The choice of the chip will be made internally + agent = Components() + agent.train(**args) + # + # If we have any obs we should wait till they finish + # + while len(jobs)> 0 : + jobs = [job for job in jobs if job.is_alive()] + time.sleep(2) + # jobs = [] # for index in range(0,PART_SIZE) : # if 'focus' in args and int(args['focus']) != index : From 6a6352169c50beb4a12c39107fc3cbd32fdbc6c7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 12 Apr 2021 12:55:01 -0500 Subject: [PATCH 156/250] adding shuffle feature to be used for very large spaces --- pipeline.py | 88 +++++++++++++++++++++++++++++++++++++++++++++-------- setup.py | 2 +- 2 files changed, 77 insertions(+), 13 deletions(-) diff --git a/pipeline.py b/pipeline.py index 3f8358b..9d33873 100644 --- a/pipeline.py +++ b/pipeline.py @@ -198,6 +198,52 @@ class Components : return values pass + def shuffle(self,_args): + if 'data' in args : + df = data['data'] + else: + reader = factory.instance(**args['store']['source']) + if 'file' in args : + df = pd.read_csv(args['file']) + else: + if 'row_limit' in args and 'sql' in args: + df = reader.read(sql=args['sql'],limit=args['row_limit']) + else: + df = reader.read(sql=args['sql']) + schema = None + if 'schema' not in args and hasattr(reader,'meta') and 'file' not in args: + schema = reader.meta(table=args['from']) + schema = [{"name":_item.name,"type":_item.field_type} for _item in schema] + # + # We are shufling designated colmns and will be approximating the others + # + x_cols = [] #-- coumns tobe approximated. 
+ _cols = [] #-- columns to be ignored + if 'continuous' in args : + x_cols = args['continuous'] + if 'ignore' in args and 'columns' in args['ignore'] : + _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) + + + for name in list (set(df.columns) - set(_cols)) : + i = np.arange(df.shape[0]) + np.random.shuffle(i) + if name in x_cols : + df[name] = self.approximate(df[name].values) + df[name] = df.iloc[i][name] + self.post(data=df,schema=schema,store=args['store']['target']) + def post(self,**_args) : + _schema = _args['schema'] if 'schema' in _args else None + writer = factory.instance(**_args['store']) + _df = _args['data'] + if _schema : + + for _item in _schema : + if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : + _df[_item['name']] = _df[_item['name']].astype(str) + writer.write(_df,schema=_schema,table=args['from']) + else: + writer.write(_df,table=args['from']) # @staticmethod def generate(self,args): @@ -338,20 +384,25 @@ class Components : _df = pd.DataFrame.join(df,_df) - if _schema : - for _item in _schema : - if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : - _df[_item['name']] = _df[_item['name']].astype(str) + # if _schema : + # for _item in _schema : + # if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : + # _df[_item['name']] = _df[_item['name']].astype(str) - pass + # pass + _params = {'data':_df,'store' : ostore} if _schema : - writer.write(_df[cols],schema=_schema,table=args['from']) - else: - writer.write(_df[cols],table=args['from']) - # writer.write(df,table=table) - pass - else: + _params ['schema'] = _schema + self.post(**_params) + # if _schema : + # writer.write(_df[cols],schema=_schema,table=args['from']) + # self.post(data=_df,schema=) + # else: + # writer.write(_df[cols],table=args['from']) + pass + # else: + # pass # # @@ -537,7 +588,20 @@ if __name__ == '__main__' : else: generator = Components() generator.generate(args) - + elif 'shuffle' in SYS_ARGS : + index = 0 + if GPU_CHIPS and '--all-chips': + + for index in GPU_CHIPS : + publisher = lambda _params: ( Components() ).shuffle(_params) + job = Process (target = publisher,args=( dict(args))) + job.name = 'Shuffler #' + str(index) + job.start() + jobs.append(job) + else: + shuffler = Components() + shuffler.shuffle(args) + pass else: # DATA = np.array_split(DATA,PART_SIZE) diff --git a/setup.py b/setup.py index 544f4b3..4eb869f 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.4", + "version":"1.4.5", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From abed87db22ad47c1d8e9c717967692078248fb36 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 12 Apr 2021 15:11:41 -0500 Subject: [PATCH 157/250] bug fix: column specification for shuffle --- pipeline.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index 9d33873..dcf649c 100644 --- a/pipeline.py +++ b/pipeline.py @@ -224,12 +224,13 @@ class Components : if 'ignore' in args and 'columns' in args['ignore'] : _cols = self.get_ignore(data=df,columns=args['ignore']['columns']) - - for name in list (set(df.columns) - set(_cols)) : + columns = 
args['columns'] if 'columns' in args else df.columns + columns = list(set(columns) - set(_cols)) + for name in columns : i = np.arange(df.shape[0]) np.random.shuffle(i) if name in x_cols : - df[name] = self.approximate(df[name].values) + df[name] = self.approximate(df.iloc[i][name].values) df[name] = df.iloc[i][name] self.post(data=df,schema=schema,store=args['store']['target']) def post(self,**_args) : From 677a99425ae6c20c05f53869ad2c5bf628befb41 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 13 Apr 2021 10:24:36 -0500 Subject: [PATCH 158/250] bug fix: date formatting --- pipeline.py | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/pipeline.py b/pipeline.py index dcf649c..0aba799 100644 --- a/pipeline.py +++ b/pipeline.py @@ -11,7 +11,7 @@ from google.cloud import bigquery as bq import data.maker import copy from data.params import SYS_ARGS - + # # The configuration array is now loaded and we will execute the pipe line as follows @@ -205,6 +205,8 @@ class Components : reader = factory.instance(**args['store']['source']) if 'file' in args : df = pd.read_csv(args['file']) + elif 'data' in _args : + df = _args['data'] else: if 'row_limit' in args and 'sql' in args: df = reader.read(sql=args['sql'],limit=args['row_limit']) @@ -226,25 +228,45 @@ class Components : columns = args['columns'] if 'columns' in args else df.columns columns = list(set(columns) - set(_cols)) - for name in columns : - i = np.arange(df.shape[0]) - np.random.shuffle(i) - if name in x_cols : - df[name] = self.approximate(df.iloc[i][name].values) - df[name] = df.iloc[i][name] + # for name in columns: + # i = np.arange(df.shape[0]) + # np.random.shuffle(i) + # if name in x_cols : + # if df[name].unique().size > 0 : + # df[name] = self.approximate(df.iloc[i][name].fillna(0).values) + # df[name] = df[name].copy().astype(str) + # pass + + df.index = np.arange(df.shape[0]) self.post(data=df,schema=schema,store=args['store']['target']) def post(self,**_args) : _schema = _args['schema'] if 'schema' in _args else None writer = factory.instance(**_args['store']) _df = _args['data'] if _schema : - + columns = [] for _item in _schema : - if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : - _df[_item['name']] = _df[_item['name']].astype(str) + name = _item['name'] + _type = str + _value = 0 + if _item['type'] in ['DATE','TIMESTAMP','DATETIMESTAMP','DATETIME'] : + if _item['type'] == 'DATE' : + _df[name] = _df[name].dt.date + + + + else: + if _item['type'] == 'INTEGER' : + _type = np.int64 + elif _item['type'] in ['FLOAT','NUMERIC']: + _type = np.float64 + else: + _value = '' + _df[name] = _df[name].fillna(_value).astype(_type) + columns.append(name) writer.write(_df,schema=_schema,table=args['from']) else: - writer.write(_df,table=args['from']) + writer.write(_df[columns],table=args['from']) # @staticmethod def generate(self,args): From be55b14e2b5723f39c387d1ebd97e7daf333463d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 13 Apr 2021 17:41:30 -0500 Subject: [PATCH 159/250] bug fix --- pipeline.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pipeline.py b/pipeline.py index 0aba799..2a3919c 100644 --- a/pipeline.py +++ b/pipeline.py @@ -228,13 +228,13 @@ class Components : columns = args['columns'] if 'columns' in args else df.columns columns = list(set(columns) - set(_cols)) - # for name in columns: - # i = np.arange(df.shape[0]) - # np.random.shuffle(i) - # if name in x_cols : - # if df[name].unique().size 
> 0 : - # df[name] = self.approximate(df.iloc[i][name].fillna(0).values) - # df[name] = df[name].copy().astype(str) + for name in columns: + i = np.arange(df.shape[0]) + np.random.shuffle(i) + if name in x_cols : + if df[name].unique().size > 0 : + df[name] = self.approximate(df.iloc[i][name].fillna(0).values) + df[name] = df[name].astype(str) # pass df.index = np.arange(df.shape[0]) @@ -539,7 +539,7 @@ if __name__ == '__main__' : # COLUMNS = DATA.columns # DATA = np.array_split(DATA,PART_SIZE) # args['schema'] = schema - GPU_CHIPS = SYS_ARGS['gpu'] if 'gpu' in SYS_ARGS else None + GPU_CHIPS = args['gpu'] if 'gpu' in args else None if GPU_CHIPS and type(GPU_CHIPS) != list : GPU_CHIPS = [int(_id.strip()) for _id in GPU_CHIPS.split(',')] if type(GPU_CHIPS) == str else [GPU_CHIPS] if 'gpu' in SYS_ARGS : @@ -594,7 +594,7 @@ if __name__ == '__main__' : # else: # generator.generate(args) # Components.generate(args) - if '--all-chips' in SYS_ARGS and GPU_CHIPS: + if 'all-chips' in SYS_ARGS and GPU_CHIPS: index = 0 jobs = [] for _id in GPU_CHIPS : @@ -613,7 +613,7 @@ if __name__ == '__main__' : generator.generate(args) elif 'shuffle' in SYS_ARGS : index = 0 - if GPU_CHIPS and '--all-chips': + if GPU_CHIPS and 'all-chips' in SYS_ARGS: for index in GPU_CHIPS : publisher = lambda _params: ( Components() ).shuffle(_params) @@ -632,7 +632,7 @@ if __name__ == '__main__' : # Let us create n-jobs across n-gpus, The assumption here is the data that is produced will be a partition # @TODO: Find better name for partition # - if GPU_CHIPS and '--all-chips' in SYS_ARGS: + if GPU_CHIPS and 'all-chips' in SYS_ARGS: index = 0 for _gpu in GPU_CHIPS : From 567671c43ec783c97e65186c53536b2fc47b4fbd Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 13 Apr 2021 17:43:43 -0500 Subject: [PATCH 160/250] bug fix --- pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 2a3919c..ae6c2b8 100644 --- a/pipeline.py +++ b/pipeline.py @@ -634,14 +634,14 @@ if __name__ == '__main__' : # if GPU_CHIPS and 'all-chips' in SYS_ARGS: index = 0 - + print (['... 
launching ',len(GPU_CHIPS),' jobs',args['context']]) for _gpu in GPU_CHIPS : _args = copy.deepcopy(args) _args['gpu'] = [int(_gpu)] _args['partition'] = index index += 1 make = lambda _params: (Components()).train(**_params) - job = Process(target=make,args=( dict(_args),)) + job = Process(target=make,args=( _args,)) job.name = 'Trainer # ' + str(index) job.start() jobs.append(job) From e44fae01a6307ff4310505c2d5dddf6db69fd715 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 13 Apr 2021 17:46:24 -0500 Subject: [PATCH 161/250] bug fix --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index ae6c2b8..56b742f 100644 --- a/pipeline.py +++ b/pipeline.py @@ -617,7 +617,7 @@ if __name__ == '__main__' : for index in GPU_CHIPS : publisher = lambda _params: ( Components() ).shuffle(_params) - job = Process (target = publisher,args=( dict(args))) + job = Process (target = publisher,args=( args,)) job.name = 'Shuffler #' + str(index) job.start() jobs.append(job) From 3eb28dd798f53e551d0b178b9459ee935dd98e11 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 13 Apr 2021 17:53:15 -0500 Subject: [PATCH 162/250] bug fix: data-typing --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 56b742f..c9d01d0 100644 --- a/pipeline.py +++ b/pipeline.py @@ -234,7 +234,7 @@ class Components : if name in x_cols : if df[name].unique().size > 0 : df[name] = self.approximate(df.iloc[i][name].fillna(0).values) - df[name] = df[name].astype(str) + # df[name] = df[name].astype(str) # pass df.index = np.arange(df.shape[0]) From 94798fd9a2572245942b255cb850e620dc35b877 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 28 Apr 2021 16:47:38 -0500 Subject: [PATCH 163/250] bug fix: finalize to remove duplicate keys --- pipeline.py | 123 ++++++++++++++++++++++++++-------------------------- setup.py | 2 +- 2 files changed, 63 insertions(+), 62 deletions(-) diff --git a/pipeline.py b/pipeline.py index c9d01d0..78559cb 100644 --- a/pipeline.py +++ b/pipeline.py @@ -268,7 +268,48 @@ class Components : else: writer.write(_df[columns],table=args['from']) - # @staticmethod + def finalize(self,args): + """ + This function performs post-processing opertions on a synthetic table i.e : + - remove duplicate keys + - remove orphaned keys i.e + """ + reader = factory.instance(**args['store']['source']) + logger = factory.instance(**args['store']['logs']) + target = args['store']['target']['args']['dataset'] + source = args['store']['source']['args']['dataset'] + table = args['from'] + schema = reader.meta(table=args['from']) + # + # keys : + unique_field = "_".join([args['from'],'id']) if 'unique_fields' not in args else args['unique_fields'] + fields = [ item.name if item.name != unique_field else "y."+item.name for item in schema] + SQL = [ + "SELECT :fields FROM ", + "(SELECT ROW_NUMBER() OVER() AS row_number,* FROM :target.:table) x","INNER JOIN", + "(SELECT ROW_NUMBER() OVER() AS row_number, :unique_field FROM :source.:table) y", + "ON y.row_number = x.row_number" + ] + SQL = " ".join(SQL).replace(":fields",",".join(fields)).replace(":table",table).replace(":source",source).replace(":target",target) + SQL = SQL.replace(":unique_field",unique_field) + # + # Use a native job to get this done ... 
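# For illustration only: assuming a hypothetical table 'observation' with schema
# (observation_id, person_id, value_as_number), a source dataset 'original_ds' and a
# target dataset 'synthetic_ds', the template above would render roughly as
#
#   SELECT y.observation_id,person_id,value_as_number FROM
#   (SELECT ROW_NUMBER() OVER() AS row_number,* FROM synthetic_ds.observation) x INNER JOIN
#   (SELECT ROW_NUMBER() OVER() AS row_number, observation_id FROM original_ds.observation) y
#   ON y.row_number = x.row_number
#
# i.e. each synthetic row keeps its generated attributes but is re-keyed with an
# observation_id drawn from the original table, which is what removes duplicate keys.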
+ # + client = bq.Client.from_service_account_json(args['store']['source']['args']["private_key"]) + job = bq.QueryJobConfig() + job.destination = client.dataset(target).table(table) + job.use_query_cache = True + job.allow_large_results = True + # job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY) + job.write_disposition = "WRITE_TRUNCATE" + job.priority = 'BATCH' + r = client.query(SQL,location='US',job_config=job) + logger.write({"job":r.job_id,"action":"finalize", "args":{"sql":SQL,"source":"".join([source,table]),"destimation":".".join([target,table])}}) + # + # Keep a log of what just happened... + # + otable = ".".join([args['store']['source']['args']['dataset'],args['from']]) + dtable = ".".join([args['store']['target']['args']['dataset'],args['from']]) def generate(self,args): """ This function will generate data and store it to a given, @@ -527,18 +568,7 @@ if __name__ == '__main__' : # @TODO: # Log what was initiated so we have context of this processing ... # - # if 'listen' not in SYS_ARGS : - # if 'file' in args : - # DATA = pd.read_csv(args['file']) ; - # schema = [] - # else: - # DATA = Components().get(args) - # client = bq.Client.from_service_account_json(args["private_key"]) - # schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema - # COLUMNS = DATA.columns - # DATA = np.array_split(DATA,PART_SIZE) - # args['schema'] = schema GPU_CHIPS = args['gpu'] if 'gpu' in args else None if GPU_CHIPS and type(GPU_CHIPS) != list : GPU_CHIPS = [int(_id.strip()) for _id in GPU_CHIPS.split(',')] if type(GPU_CHIPS) == str else [GPU_CHIPS] @@ -550,50 +580,6 @@ if __name__ == '__main__' : # Let us see if we have partitions given the log folder content = os.listdir( os.sep.join([args['logs'],'train',args['context']])) - - - # if ''.join(content).isnumeric() : - # # - # # we have partitions we are working with - - # jobs = [] - - # # columns = DATA.columns.tolist() - - # # DATA = np.array_split(DATA,PART_SIZE) - - # for index in range(0,PART_SIZE) : - # if 'focus' in args and int(args['focus']) != index : - # # - # # This handles failures/recoveries for whatever reason - # # If we are only interested in generating data for a given partition - # continue - # # index = id.index(id) - - # args['partition'] = index - # args['data'] = DATA[index] - # if int(args['num_gpu']) > 1 : - # args['gpu'] = index - # else: - # args['gpu']=0 - - # make = lambda _args: (Components()).generate(_args) - # job = Process(target=make,args=(args,)) - # job.name = 'generator # '+str(index) - # job.start() - # jobs.append(job) - # # if len(jobs) == 1 : - # # job.join() - - # print (["Started ",len(jobs),"generators" if len(jobs)>1 else "generator" ]) - # while len(jobs)> 0 : - # jobs = [job for job in jobs if job.is_alive()] - # time.sleep(2) - - # # generator.generate(args) - # else: - # generator.generate(args) - # Components.generate(args) if 'all-chips' in SYS_ARGS and GPU_CHIPS: index = 0 jobs = [] @@ -625,7 +611,7 @@ if __name__ == '__main__' : shuffler = Components() shuffler.shuffle(args) pass - else: + elif 'train' in SYS_ARGS: # DATA = np.array_split(DATA,PART_SIZE) # @@ -657,10 +643,25 @@ if __name__ == '__main__' : # # If we have any obs we should wait till they finish # - while len(jobs)> 0 : - jobs = [job for job in jobs if job.is_alive()] - time.sleep(2) - + DIRTY = 0 + while len(jobs)> 0 : + DIRTY =1 + jobs = [job for job in jobs if job.is_alive()] + time.sleep(2) + if DIRTY: + print (["..:: jobs finished "]) + # + # We need to 
harmonize the keys if any at all in this case we do this for shuffle or generate operations + # + print (['finalize' in SYS_ARGS, ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) ]) + if 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) : + # + # We should pull all the primary keys and regenerate them in order to insure some form of consistency + # + + (Components()).finalize(args) + # finalize(args) + pass # jobs = [] # for index in range(0,PART_SIZE) : # if 'focus' in args and int(args['focus']) != index : diff --git a/setup.py b/setup.py index 4eb869f..d75f1d3 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.5", + "version":"1.4.6", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From 089c1d1d76b36ef6b054969cfc94c3141db7e3d9 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 28 Apr 2021 18:16:55 -0500 Subject: [PATCH 164/250] bug fix ... --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 78559cb..5759696 100644 --- a/pipeline.py +++ b/pipeline.py @@ -287,7 +287,7 @@ class Components : SQL = [ "SELECT :fields FROM ", "(SELECT ROW_NUMBER() OVER() AS row_number,* FROM :target.:table) x","INNER JOIN", - "(SELECT ROW_NUMBER() OVER() AS row_number, :unique_field FROM :source.:table) y", + "(SELECT ROW_NUMBER() OVER() AS row_number, :unique_field FROM :source.:table ORDER BY RAND()) y", "ON y.row_number = x.row_number" ] SQL = " ".join(SQL).replace(":fields",",".join(fields)).replace(":table",table).replace(":source",source).replace(":target",target) From 79c5f3ff259993e89de370f4847759b191e024e2 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 10 May 2021 14:10:31 -0500 Subject: [PATCH 165/250] bug fix ... 
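For orientation, a rough sketch of what Components.post() does with the schema it is
handed before writing (the two-column schema below is made up, not from the pipeline):

    _schema = [{"name":"person_id","type":"INTEGER"},
               {"name":"measurement_date","type":"DATE"}]
    # DATE-like columns are reduced to their date component, INTEGER columns are filled
    # with 0 and cast to np.int64, FLOAT/NUMERIC to np.float64, and the remaining
    # columns are filled with '' and cast to str before the data-transport writer
    # receives the frame together with _schema.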
--- pipeline.py | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index 5759696..2311007 100644 --- a/pipeline.py +++ b/pipeline.py @@ -244,7 +244,7 @@ class Components : writer = factory.instance(**_args['store']) _df = _args['data'] if _schema : - columns = [] + columns = _df.columns.tolist for _item in _schema : name = _item['name'] _type = str @@ -266,7 +266,7 @@ class Components : columns.append(name) writer.write(_df,schema=_schema,table=args['from']) else: - writer.write(_df[columns],table=args['from']) + writer.write(_df,table=args['from']) def finalize(self,args): """ diff --git a/setup.py b/setup.py index d75f1d3..1efc05e 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.6", + "version":"1.4.7", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From b10296246da4d944a723587cf8ed6183239c9cfd Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 10 May 2021 14:33:18 -0500 Subject: [PATCH 166/250] bug fix ... --- data/maker/__init__.py | 9 ++++++--- pipeline.py | 6 +++--- setup.py | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 803590a..4867bf6 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -144,9 +144,12 @@ def generate(**_args): """ partition = _args['partition'] if 'partition' in _args else None if not partition : - f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json'])) + LOG_DIR = os.sep.join([_args['logs'],'output',_args['context']]) + # f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json'])) else: - f = open(os.sep.join([_args['logs'],'output',_args['context'],str(partition),'map.json'])) + LOG_DIR = os.sep.join([_args['logs'],'output',_args['context'],str(partition)]) + # f = open(os.sep.join([_args['logs'],'output',_args['context'],str(partition),'map.json'])) + f = open(os.sep.join([LOG_DIR,'map.json'])) _map = json.loads(f.read()) f.close() # if 'file' in _args : @@ -154,7 +157,7 @@ def generate(**_args): # else: # df = _args['data'] if not isinstance(_args['data'],str) else pd.read_csv(_args['data']) args = {"context":_args['context'],"max_epochs":_args['max_epochs'],"candidates":_args['candidates']} - args['logs'] = _args['logs'] if 'logs' in _args else 'logs' + args['logs'] = LOG_DIR if 'logs' in _args else 'logs' args ['max_epochs'] = _args['max_epochs'] # args['matrix_size'] = _matrix.shape[0] args['batch_size'] = 2000 diff --git a/pipeline.py b/pipeline.py index 2311007..a958bb8 100644 --- a/pipeline.py +++ b/pipeline.py @@ -583,10 +583,10 @@ if __name__ == '__main__' : if 'all-chips' in SYS_ARGS and GPU_CHIPS: index = 0 jobs = [] - for _id in GPU_CHIPS : + for _gpu in GPU_CHIPS : _args = copy.deepcopy(args) _args['gpu'] = [int(_gpu)] - _args['partition'] = index + _args['partition'] = int(_gpu) #index index += 1 make = lambda _params: (Components()).generate(_params) job = Process(target=make,args=( dict(_args),)) @@ -624,7 +624,7 @@ if __name__ == '__main__' : for _gpu in GPU_CHIPS : _args = copy.deepcopy(args) _args['gpu'] = 
[int(_gpu)] - _args['partition'] = index + _args['partition'] = int(_gpu) #index index += 1 make = lambda _params: (Components()).train(**_params) job = Process(target=make,args=( _args,)) diff --git a/setup.py b/setup.py index 1efc05e..1c126f5 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.7", + "version":"1.4.7.1", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From 7de89a576ae0595ccc5eca2264bbc85d8a34afc1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 10 May 2021 14:43:29 -0500 Subject: [PATCH 167/250] bug fix --- data/maker/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 4867bf6..3a4caf6 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -174,7 +174,11 @@ def generate(**_args): args['gpu'] = _args['gpu'] handler = gan.Predict (**args) - handler.load_meta(column=None) + lparams = {'columns':None} + if partition : + lparams['partition'] = partition + + handler.load_meta(lparams) # # Let us now format the matrices by reverting them to a data-frame with values # From 28d919cade137a9c6498a18432de0f3d37f4e8e4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 10 May 2021 14:43:44 -0500 Subject: [PATCH 168/250] bug fix --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1c126f5..9f091c8 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.7.1", + "version":"1.4.7.2", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From 6e45704252be28c5c50aeada6ddeb14bfd9b39ff Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 10 May 2021 14:49:08 -0500 Subject: [PATCH 169/250] bug fixes .... 
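The one-line fix below unpacks the keyword dict instead of passing it positionally; a
minimal illustration of the difference (the partition value is only an example):

    lparams = {'columns': None, 'partition': 0}
    handler.load_meta(lparams)    # single positional dict -> TypeError, load_meta() only accepts **args
    handler.load_meta(**lparams)  # equivalent to load_meta(columns=None, partition=0)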
--- data/maker/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 3a4caf6..8180903 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -178,7 +178,7 @@ def generate(**_args): if partition : lparams['partition'] = partition - handler.load_meta(lparams) + handler.load_meta(**lparams) # # Let us now format the matrices by reverting them to a data-frame with values # diff --git a/setup.py b/setup.py index 9f091c8..d09d66d 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.7.2", + "version":"1.4.7.3", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From d54758aac30467b8534250a7bc58aaafdc3afb9f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 10 May 2021 15:02:55 -0500 Subject: [PATCH 170/250] bug fix ... --- data/maker/__init__.py | 11 ++++++----- pipeline.py | 2 +- setup.py | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 8180903..7439e45 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -144,20 +144,22 @@ def generate(**_args): """ partition = _args['partition'] if 'partition' in _args else None if not partition : - LOG_DIR = os.sep.join([_args['logs'],'output',_args['context']]) + MAP_FLDER = os.sep.join([_args['logs'],'output',_args['context']]) # f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json'])) else: - LOG_DIR = os.sep.join([_args['logs'],'output',_args['context'],str(partition)]) + MAP_FOLDER = os.sep.join([_args['logs'],'output',_args['context'],str(partition)]) # f = open(os.sep.join([_args['logs'],'output',_args['context'],str(partition),'map.json'])) - f = open(os.sep.join([LOG_DIR,'map.json'])) + f = open(os.sep.join([MAP_FOLDER,'map.json'])) _map = json.loads(f.read()) f.close() + # + # # if 'file' in _args : # df = pd.read_csv(_args['file']) # else: # df = _args['data'] if not isinstance(_args['data'],str) else pd.read_csv(_args['data']) args = {"context":_args['context'],"max_epochs":_args['max_epochs'],"candidates":_args['candidates']} - args['logs'] = LOG_DIR if 'logs' in _args else 'logs' + args['logs'] = _args['logs'] if 'logs' in _args else 'logs' args ['max_epochs'] = _args['max_epochs'] # args['matrix_size'] = _matrix.shape[0] args['batch_size'] = 2000 @@ -177,7 +179,6 @@ def generate(**_args): lparams = {'columns':None} if partition : lparams['partition'] = partition - handler.load_meta(**lparams) # # Let us now format the matrices by reverting them to a data-frame with values diff --git a/pipeline.py b/pipeline.py index a958bb8..27f23e6 100644 --- a/pipeline.py +++ b/pipeline.py @@ -244,7 +244,7 @@ class Components : writer = factory.instance(**_args['store']) _df = _args['data'] if _schema : - columns = _df.columns.tolist + columns = [] for _item in _schema : name = _item['name'] _type = str diff --git a/setup.py b/setup.py index d09d66d..7e014c7 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = 
{"name":"data-maker", - "version":"1.4.7.3", + "version":"1.4.7.4", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From 776a1103f294a9941f36dc6ba19191ad49b00f3f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 May 2021 09:33:57 -0500 Subject: [PATCH 171/250] bug fix with dates --- pipeline.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipeline.py b/pipeline.py index 27f23e6..252a850 100644 --- a/pipeline.py +++ b/pipeline.py @@ -252,6 +252,9 @@ class Components : if _item['type'] in ['DATE','TIMESTAMP','DATETIMESTAMP','DATETIME'] : if _item['type'] == 'DATE' : _df[name] = _df[name].dt.date + _df[name] = pd.to_datetime(_df[name],errors='coerce') + + From 79b83c71d5043427c37bd81f3beebc4637fac9eb Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 May 2021 10:14:53 -0500 Subject: [PATCH 172/250] bug fix: date, hack put in place --- pipeline.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pipeline.py b/pipeline.py index 252a850..b6e808f 100644 --- a/pipeline.py +++ b/pipeline.py @@ -251,13 +251,16 @@ class Components : _value = 0 if _item['type'] in ['DATE','TIMESTAMP','DATETIMESTAMP','DATETIME'] : if _item['type'] == 'DATE' : - _df[name] = _df[name].dt.date - _df[name] = pd.to_datetime(_df[name],errors='coerce') - - - - - + # + # There is an issue with missing dates that needs to be resolved. + # for some reason a missing date/time here will cause the types to turn into timestamp (problem) + # The following is a hack to address the issue (alas) assuming 10 digit dates and 'NaT' replaces missing date values (pandas specifications) + # + _df[name] = _df[name].apply(lambda value: '' if str(value) == 'NaT' else str(value)[:10]) + #_df[name] = _df[name].dt.date + # _df[name] = pd.to_datetime(_df[name].fillna(''),errors='coerce') + else: + print ([' ** ',name,_item['type']]) else: if _item['type'] == 'INTEGER' : _type = np.int64 From 14933b877f742fa6628e852fe3ef951d20ab6a2d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 May 2021 10:28:33 -0500 Subject: [PATCH 173/250] bug fix with dates --- pipeline.py | 4 +--- setup.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pipeline.py b/pipeline.py index b6e808f..3644a7e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -259,8 +259,6 @@ class Components : _df[name] = _df[name].apply(lambda value: '' if str(value) == 'NaT' else str(value)[:10]) #_df[name] = _df[name].dt.date # _df[name] = pd.to_datetime(_df[name].fillna(''),errors='coerce') - else: - print ([' ** ',name,_item['type']]) else: if _item['type'] == 'INTEGER' : _type = np.int64 @@ -660,7 +658,7 @@ if __name__ == '__main__' : # We need to harmonize the keys if any at all in this case we do this for shuffle or generate operations # print (['finalize' in SYS_ARGS, ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) ]) - if 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) : + if 'autopilot' in SYS_ARGS or 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) : # # We should pull all the primary keys and regenerate them in order to insure some form of consistency # diff --git a/setup.py b/setup.py index 7e014c7..eb8ea4d 100644 --- a/setup.py +++ b/setup.py @@ -5,7 
+5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.7.4", + "version":"1.4.7.5", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From 4ed0e31aa5c94ea11cf2d6e96e459e7a941cce44 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 May 2021 10:37:06 -0500 Subject: [PATCH 174/250] bug fix ... --- pipeline.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipeline.py b/pipeline.py index 3644a7e..9aad2de 100644 --- a/pipeline.py +++ b/pipeline.py @@ -652,17 +652,17 @@ if __name__ == '__main__' : DIRTY =1 jobs = [job for job in jobs if job.is_alive()] time.sleep(2) - if DIRTY: - print (["..:: jobs finished "]) + # if DIRTY: + # print (["..:: jobs finished "]) # # We need to harmonize the keys if any at all in this case we do this for shuffle or generate operations # - print (['finalize' in SYS_ARGS, ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) ]) + if 'autopilot' in SYS_ARGS or 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) : # # We should pull all the primary keys and regenerate them in order to insure some form of consistency # - + print (["..:: jobs finished "]) (Components()).finalize(args) # finalize(args) pass From 157df9334cff645116c176c95a7063832b690de1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 12 May 2021 10:37:40 -0500 Subject: [PATCH 175/250] bug fix ... --- pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index 9aad2de..56e522e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -652,8 +652,8 @@ if __name__ == '__main__' : DIRTY =1 jobs = [job for job in jobs if job.is_alive()] time.sleep(2) - # if DIRTY: - # print (["..:: jobs finished "]) + if DIRTY: + print (["..:: jobs finished "]) # # We need to harmonize the keys if any at all in this case we do this for shuffle or generate operations # @@ -662,7 +662,7 @@ if __name__ == '__main__' : # # We should pull all the primary keys and regenerate them in order to insure some form of consistency # - print (["..:: jobs finished "]) + print (["..:: Finalizing process"]) (Components()).finalize(args) # finalize(args) pass From f99af3655d0c8792c34f11a246e437c7d00ae46c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 13 Jan 2022 15:05:00 -0600 Subject: [PATCH 176/250] bug fix: misc. 
improvements --- data/gan.py | 21 ++++++++++++--------- data/maker/__init__.py | 5 ++++- pipeline.py | 18 ++++++++++++++---- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/data/gan.py b/data/gan.py index 643e838..0008489 100644 --- a/data/gan.py +++ b/data/gan.py @@ -20,7 +20,9 @@ EMBEDDED IN CODE : """ import tensorflow as tf -from tensorflow.contrib.layers import l2_regularizer +# from tensorflow.contrib.layers import l2_regularizer +from tensorflow.keras import layers +from tensorflow.keras.regularizers import L2 as l2_regularizer import numpy as np import pandas as pd import time @@ -34,7 +36,7 @@ import pickle os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ['CUDA_VISIBLE_DEVICES'] = "0" os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' - +tf.compat.v1.disable_eager_execution() # STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256 # NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu']) # BATCHSIZE_PER_GPU = 2000 @@ -211,13 +213,14 @@ class GNet : labels = None if 'labels' not in args else args['labels'] n_labels= None if 'n_labels' not in args else args['n_labels'] shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing - mean, var = tf.nn.moments(inputs, shift, keep_dims=True) - shape = inputs.shape[1].value + # mean, var = tf.nn.moments(inputs, shift, keep_dims=True) + mean, var = tf.nn.moments(inputs, shift,keepdims=True) + # shape = inputs.shape[1].value + shape = inputs.shape[1] + if labels is not None: - offset_m = self.get.variables(shape=[1,shape], name='offset'+name, - initializer=tf.zeros_initializer) - scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, - initializer=tf.ones_initializer) + offset_m = self.get.variables(shape=[1,shape], name='offset'+name,initializer=tf.zeros_initializer) + scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name,initializer=tf.ones_initializer) offset = tf.nn.embedding_lookup(offset_m, labels) scale = tf.nn.embedding_lookup(scale_m, labels) @@ -595,7 +598,7 @@ class Predict(GNet): df = pd.DataFrame() CANDIDATE_COUNT = args['candidates'] if 'candidates' in args else 1 #0 if self.ROW_COUNT < 1000 else 100 candidates = [] - + with tf.compat.v1.Session() as sess: saver.restore(sess, model_dir) if self._LABEL is not None : diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 7439e45..9db2b8d 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -106,6 +106,8 @@ def train (**_args): values = _inputhandler._map[key]['values'].tolist() _map[key] = {"beg":beg,"end":end,"values":np.array(values).astype(str).tolist()} info = {"rows":_matrix.shape[0],"cols":_matrix.shape[1],"map":_map} + print() + # print ([_args['context'],_inputhandler._io]) logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":_inputhandler._io}) args['logs'] = _args['logs'] if 'logs' in _args else 'logs' @@ -142,9 +144,10 @@ def generate(**_args): :param context :param logs """ + _args['logs'] = _args['logs'] if 'logs' in _args else 'logs' partition = _args['partition'] if 'partition' in _args else None if not partition : - MAP_FLDER = os.sep.join([_args['logs'],'output',_args['context']]) + MAP_FOLDER = os.sep.join([_args['logs'],'output',_args['context']]) # f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json'])) else: MAP_FOLDER = os.sep.join([_args['logs'],'output',_args['context'],str(partition)]) diff --git a/pipeline.py b/pipeline.py index 56e522e..296d4d5 
100644 --- a/pipeline.py +++ b/pipeline.py @@ -151,6 +151,7 @@ class Components : if df.shape[0] and df.shape[0] : # # We have a full blown matrix to be processed + print ('-- Training --') data.maker.train(**_args) else: print ("... skipping training !!") @@ -259,16 +260,23 @@ class Components : _df[name] = _df[name].apply(lambda value: '' if str(value) == 'NaT' else str(value)[:10]) #_df[name] = _df[name].dt.date # _df[name] = pd.to_datetime(_df[name].fillna(''),errors='coerce') + else: + pass + _df[name] = pd.to_datetime(_df[name]) else: + value = 0 if _item['type'] == 'INTEGER' : _type = np.int64 elif _item['type'] in ['FLOAT','NUMERIC']: _type = np.float64 else: + _value = '' - _df[name] = _df[name].fillna(_value).astype(_type) + _df[name] = _df[name].fillna(_value) #.astype(_type) columns.append(name) - writer.write(_df,schema=_schema,table=args['from']) + print () + print (_df) + writer.write(_df.astype(object),schema=_schema,table=args['from']) else: writer.write(_df,table=args['from']) @@ -350,7 +358,7 @@ class Components : for _item in schema : dtype = str name = _item['name'] - novalue = -1 + novalue = 0 if _item['type'] in ['INTEGER','NUMERIC']: dtype = np.int64 @@ -550,7 +558,7 @@ if __name__ == '__main__' : index = f[0] if f else 0 # - print ("..::: ",PIPELINE[index]['context']) + print ("..::: ",PIPELINE[index]['context'],':::..') args = (PIPELINE[index]) for key in _config : if key == 'pipeline' or key in args: @@ -567,6 +575,7 @@ if __name__ == '__main__' : args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size']) if 'dataset' not in args : args['dataset'] = 'combined20191004v2_deid' + args['logs'] = args['logs'] if 'logs' in args else 'logs' PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 # # @TODO: @@ -599,6 +608,7 @@ if __name__ == '__main__' : jobs.append(job) pass else: + generator = Components() generator.generate(args) elif 'shuffle' in SYS_ARGS : From cad54d7b45d08b8d4749a736ed9fe6ef6762949e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 13 Jan 2022 17:36:53 -0600 Subject: [PATCH 177/250] version upgrade --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index eb8ea4d..c43bd15 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.7.5", + "version":"1.4.7.6", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] From ee0165de0188faba09c55e518fca6c2e5761f287 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 24 Mar 2022 11:38:52 -0500 Subject: [PATCH 178/250] bug fixes: enhancements --- binder.py | 377 +++++++++++++++++++++++++++++++++ data/gan.py | 54 +---- data/maker/__init__.py | 6 +- data/maker/prepare/__init__.py | 58 +---- pipeline.py | 330 ++++++++++++++--------------- 5 files changed, 543 insertions(+), 282 deletions(-) create mode 100644 binder.py diff --git a/binder.py b/binder.py new file mode 100644 index 0000000..5379d62 --- /dev/null +++ b/binder.py @@ -0,0 +1,377 @@ +#!/usr/bin/env python3 +""" +This file will perform basic tasks to finalize the GAN process by performing the following : + - basic stats & analytics + - 
rebuild io to another dataset +""" +import pandas as pd +import numpy as np +from multiprocessing import Process, Lock +from google.oauth2 import service_account +from google.cloud import bigquery as bq +import transport +from data.params import SYS_ARGS +import json + +import pandas as pd +import numpy as np +from google.oauth2 import service_account +import json + +# path = '../curation-prod.json' +# credentials = service_account.Credentials.from_service_account_file(path) +# df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard') +filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config'] +f = open(filename) +config = json.loads(f.read()) +args = config['pipeline'] +f.close() + +def _formatSQL(**_args): + """ + This function will build the _map for a given segment + """ + sql = """ + select DISTINCT x.person_id synthetic,y.person_id original + FROM :synthetic.:table x + INNER JOIN :original.:table y on x.person_id in (:ids) + AND x.person_id <> y.person_id AND x.gender_source_value = y.gender_source_value + AND x.year_of_birth = y.year_of_birth + ORDER BY 1 + """ + table= _args['table'] + original,synthetic = _args['schema']['original'],_args['schema']['synthetic'] + _ids = np.array(_args['ids']).astype(str) + return sql.replace(":ids",",".join(_ids)).replace(":synthetic",synthetic).replace(":original",original).replace(":table",table) +def _addCounts(**_args) : + store = _args['store'] + sql = _args['sql'] + reader = transport.factory.instance(**store['source']) + _df = reader.read(sql=sql) + _ids = _df.synthetic.unique() + _counts = [ np.sum(_df.synthetic == value) for value in _ids] + original = [_df[_df.synthetic == value].iloc[np.random.choice(np.arange(_counts[_ids.tolist().index(value)]),1),:].original.values[0] for value in _ids] + _df = pd.DataFrame({"synthetic":_ids,"original":original,"counts":_counts}) + + # + # We can post this to the backend ... + # + table = '_map' #-- Yes this is hard-coded + writer = transport.factory.instance(**dict(store['target'],**{"parallel":True,"table":table})) + # if writer.has(table=table) is False: + # writer.write(_df) + # else: + _schema = [{"name":name,"type":"INTEGER"} for name in _df.columns] + writer.write(_df,schema=_schema) + + + + + +def Init(**_args) : + """ + This function will build a map of the synthetic to real individuals. + The assumption is that the synthesized data is stored in the same data-store as the original the parameters provided are : + :param store object from the configuration file with source,target entries + :param table name of the original/synthetic tables (they should be the same) + :param feat. featuress/attributes ... 
demographics to account for + """ + store = _args['store'] + reader = transport.factory.instance(**store['source']) + original,synthetic = _args['schema']['original'],_args['schema']['synthetic'] + table = _args['table'] + sql = _args['sql'].replace(':synthetic',synthetic).replace(':original',original).replace(':table',table) + + _map = reader.read(sql=sql) + + + + k = _args['k'] if 'k' in _args else 2 + # _iodf = reader.read(table=table) + # _ids = _iodf['person_id'].unique().tolist() + # x_ = np.array_split(_ids,1000) + jobs = [] + # for _items in x_ : + # _p = {"ids":_items,"schema":_args['schema'],'store':store,'table':table} + # sql = _formatSQL(**_p) + # _p['sql'] = sql + # _apply = lambda params: _addCounts(**params) + # thread = Process(target=_apply,args=(_p,)) + # thread.start() + # jobs.append(thread) + + # return jobs + # + # We have performed a m:m (many-to-many) relationship with original participants and synthetic participants + # The goal is to obtain a singular map against which records will be migrated + # + print (['... computing counts (k)']) + _ids = _map.synthetic.unique() + _counts = [ np.sum(_map.synthetic == value) for value in _ids] + original = [_map[_map.synthetic == value].iloc[np.random.choice(np.arange(_counts[_ids.tolist().index(value)]),1),:].original.values[0] for value in _ids] + print (['Building k-classes/groups']) + _mdf = pd.DataFrame({"synthetic":_ids,"original":original,"counts":_counts}) + i = _mdf.apply(lambda row: row.counts >= k,axis=1) + _mdf = _mdf[i] + # + # Log what just happened here so we know about the equivalence classes, + # {"module":"binder","action":"map-generation","input":{"k":k,"rows":{"synthetic":_mdf.shape[0],"original":len(_counts)}}} + + return _mdf + # + # now we are posting this to target storage ... 
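# Toy illustration (made-up person ids) of the k filter above, with k=2:
#
#   synthetic 1001 matched originals {17, 42, 63} -> counts=3, kept; one of the three
#                                                    is drawn at random as its 'original'
#   synthetic 1002 matched only original {85}     -> counts=1, dropped from _mdf
#
# Every synthetic person that survives therefore maps to one original drawn from an
# equivalence class of at least k candidates (matched on the demographics in the
# configured SQL, e.g. gender and year of birth as in _formatSQL above).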
+ # +def ApplyOn (**_args): + """ + This function will rewrite SQL that applies the synthetic identifier to the entries of the pipeline + We assume that the _map has two attributes (synthetic and original) + :param store + :param _config + """ + store_args = _args['store'] + _config = _args['config'] + + table = _config['from'] + reader = transport.factory.instance(**dict(store_args['source'],**{"table":table})) + attr = reader.read(limit=1).columns.tolist() + original_key = _args['original_key'] #-- assuming referential integrity + + # synthetic_key= columns['synthetic'] + # mapped_original=columns['orginal'] + fields = list(set(attr) - set([original_key])) + sql = "select _map.synthetic as :original_key,:fields from :original_schema.:table inner join :synthetic_schema._map on _map.original = :table.:original_key" + sql = sql.replace(":table",table).replace(":fields",",".join(fields)) + sql = sql.replace(":original_key",original_key) + _schema = _args['schema'] + sql = sql.replace(":original_schema",_schema['original']).replace(":synthetic_schema",_schema['synthetic']) + + return reader.read (sql=sql) + +if __name__ == '__main__' : + pass + +# class Analytics : +# """ +# This class will compile basic analytics about a given dataset i.e compare original/synthetic +# """ +# @staticmethod +# def distribution(**args): +# context = args['context'] +# df = args['data'] +# # +# #-- This data frame counts unique values for each feature (space) +# df_counts = pd.DataFrame(df.apply(lambda col: col.unique().size),columns=['counts']).T # unique counts +# # +# #-- Get the distributions for common values +# # +# names = [name for name in df_counts.columns.tolist() if name.endswith('_io') == False] +# ddf = df.apply(lambda col: pd.DataFrame(col.values,columns=[col.name]).groupby([col.name]).size() ).fillna(0) +# ddf[context] = ddf.index + +# pass +# def distance(**args): +# """ +# This function will measure the distance between +# """ +# pass +# class Utils : +# @staticmethod +# def log(**args): +# logger = transport.factory.instance(type="mongo.MongoWriter",args={"dbname":"aou","doc":"logs"}) +# logger.write(args) +# logger.close() +# class get : +# @staticmethod +# def pipeline(table,path) : +# # contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts'] +# config = json.loads((open(path)).read()) +# pipeline = config['pipeline'] +# # return [ item for item in pipeline if item['context'] in contexts] +# pipeline = [item for item in pipeline if 'from' in item and item['from'].strip() == table] +# Utils.log(module=table,action='init',input={"pipeline":pipeline}) +# return pipeline +# @staticmethod +# def sql(**args) : +# """ +# This function is intended to build SQL query for the remainder of the table that was not synthesized +# :config configuration entries +# :from source of the table name +# :dataset name of the source dataset + +# """ +# SQL = ["SELECT * FROM :from "] +# SQL_FILTER = [] +# NO_FILTERS_FOUND = True +# # pipeline = Utils.get.config(**args) +# pipeline = args['pipeline'] +# REVERSE_QUALIFIER = {'IN':'NOT IN','NOT IN':'IN','=':'<>','<>':'='} +# for item in pipeline : + + +# if 'filter' in item : +# if NO_FILTERS_FOUND : +# NO_FILTERS_FOUND = False +# SQL += ['WHERE'] +# # +# # Let us load the filter in the SQL Query +# FILTER = item['filter'] +# QUALIFIER = REVERSE_QUALIFIER[FILTER['qualifier'].upper()] +# SQL_FILTER += [" ".join([FILTER['field'], QUALIFIER,'(',FILTER['value'],')']).replace(":dataset",args['dataset'])] +# src = 
".".join([args['dataset'],args['from']]) +# SQL += [" AND ".join(SQL_FILTER)] +# # +# # let's pull the field schemas out of the table definition +# # +# Utils.log(module=args['from'],action='sql',input={"sql":" ".join(SQL) }) +# return " ".join(SQL).replace(":from",src) + + +# def mk(**args) : +# dataset = args['dataset'] +# client = args['client'] if 'client' in args else bq.Client.from_service_account_file(args['private_key']) +# # +# # let us see if we have a dataset handy here +# # +# datasets = list(client.list_datasets()) +# found = [item for item in datasets if item.dataset_id == dataset] + +# if not found : + +# return client.create_dataset(dataset) +# return found[0] + +# def move (args): +# """ +# This function will move a table from the synthetic dataset into a designated location +# This is the simplest case for finalizing a synthetic data set +# :private_key +# """ +# pipeline = Utils.get.pipeline(args['from'],args['config']) +# _args = json.loads((open(args['config'])).read()) +# _args['pipeline'] = pipeline +# # del _args['pipeline'] +# args = dict(args,**_args) +# # del args['pipeline'] +# # private_key = args['private_key'] +# client = bq.Client.from_service_account_json(args['private_key']) + +# dataset = args['dataset'] +# if pipeline : +# SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in pipeline] +# SQL += [Utils.get.sql(**args)] +# SQL = ('\n UNION ALL \n'.join(SQL).replace(':dataset','io')) +# else: +# # +# # moving a table to a designated location +# tablename = args['from'] +# if 'sql' not in args : +# SQL = "SELECT * FROM :dataset.:table" +# else: +# SQL = args['sql'] +# SQL = SQL.replace(":dataset",dataset).replace(":table",tablename) +# Utils.log(module=args['from'],action='sql',input={'sql':SQL}) +# # +# # At this point we have gathered all the tables in the io folder and we should now see if we need to merge with the remainder from the original table +# # + + + +# odataset = mk(dataset=dataset+'_io',client=client) +# # SQL = "SELECT * FROM io.:context_full_io".replace(':context',context) +# config = bq.QueryJobConfig() +# config.destination = client.dataset(odataset.dataset_id).table(args['from']) +# config.use_query_cache = True +# config.allow_large_results = True +# config.priority = 'INTERACTIVE' +# # +# # + +# schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema +# fields = [" ".join(["CAST (",item.name,"AS",item.field_type.replace("INTEGER","INT64").replace("FLOAT","FLOAT64"),") ",item.name]) for item in schema] +# SQL = SQL.replace("*"," , ".join(fields)) +# # print (SQL) +# out = client.query(SQL,location='US',job_config=config) +# Utils.log(module=args['from'],action='move',input={'job':out.job_id}) +# return (out.job_id) + + + + +# import pandas as pd +# import numpy as np +# from google.oauth2 import service_account +# import json + +# # path = '../curation-prod.json' +# # credentials = service_account.Credentials.from_service_account_file(path) +# # df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard') +# filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config'] +# f = open(filename) +# config = json.loads(f.read()) +# args = config['pipeline'] +# f.close() + + +# if __name__ == '__main__' : +# """ +# Usage : +# finalize -- --contexts --from
+# """ + +# if 'move' in SYS_ARGS : + +# if 'init' in SYS_ARGS : +# dep = config['dep'] if 'dep' in config else {} +# info = [] + +# if 'queries' in dep : +# info += dep['queries'] +# print ('________') +# if 'tables' in dep : +# info += dep['tables'] +# args = {} +# jobs = [] +# for item in info : +# args = {} +# if type(item) == str : +# args['from'] = item +# name = item +# else: +# args = item +# name = item['from'] +# args['config'] = SYS_ARGS['config'] +# # args['pipeline'] = [] +# job = Process(target=move,args=(args,)) +# job.name = name +# jobs.append(job) +# job.start() + + +# # while len(jobs) > 0 : +# # jobs = [job for job in jobs if job.is_alive()] +# # time.sleep(1) + + +# else: +# move(SYS_ARGS) +# # # table = SYS_ARGS['from'] +# # # args = dict(config,**{"private_key":"../curation-prod.json"}) +# # args = dict(args,**SYS_ARGS) +# # contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']] +# # log = [] +# # if contexts : +# # args['contexts'] = contexts +# # log = move(**args) + +# # else: +# # tables = args['from'].split(',') +# # for name in tables : +# # name = name.strip() +# # args['from'] = name +# # log += [move(**args)] +# # print ("\n".join(log)) + + + +# else: +# print ("NOT YET READY !") \ No newline at end of file diff --git a/data/gan.py b/data/gan.py index 0008489..f5705ea 100644 --- a/data/gan.py +++ b/data/gan.py @@ -622,7 +622,7 @@ class Predict(GNet): candidates.append(np.array([np.round(row).astype(int) for row in _matrix])) # return candidates[0] if len(candidates) == 1 else candidates - return candidates + return [candidates [0]] def _apply(self,**args): # print (self.train_dir) @@ -768,55 +768,3 @@ class Predict(GNet): # return df.to_dict(orient='list') return _matrix - -if __name__ == '__main__' : - # - # Now we get things done ... 
- column = SYS_ARGS['column'] - column_id = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id' - column_id = column_id.split(',') if ',' in column_id else column_id - df = pd.read_csv(SYS_ARGS['raw-data']) - LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - - context = SYS_ARGS['raw-data'].split(os.sep)[-1:][0][:-4] - if set(['train','learn']) & set(SYS_ARGS.keys()): - - df = pd.read_csv(SYS_ARGS['raw-data']) - - # cols = SYS_ARGS['column'] - # _map,_df = (Binary()).Export(df) - # i = np.arange(_map[column]['start'],_map[column]['end']) - max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10 - # REAL = _df[:,i] - REAL = pd.get_dummies(df[column]).astype(np.float32).values - LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - trainer = Train(context=context,max_epochs=max_epochs,real=REAL,label=LABEL,column=column,column_id=column_id) - trainer.apply() - - - - - # - # We should train upon this data - # - # -- we need to convert the data-frame to binary matrix, given a column - # - pass - elif 'generate' in SYS_ARGS: - values = df[column].unique().tolist() - values.sort() - - p = Predict(context=context,label=LABEL,values=values,column=column) - p.load_meta(column) - r = p.apply() - # print (df) - # print () - df[column] = r[column] - # print (df) - - - else: - print (SYS_ARGS.keys()) - print (__doc__) - pass - diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 9db2b8d..a7d8d69 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -96,7 +96,11 @@ def train (**_args): # This args['store'] = copy.deepcopy(_args['store']['logs']) - args['store']['args']['doc'] = _args['context'] + if 'args' in _args['store']: + args['store']['args']['doc'] = _args['context'] + else: + + args['store']['doc'] = _args['context'] logger = factory.instance(**args['store']) args['logger'] = logger diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 5ace56a..6e67cb2 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -39,26 +39,10 @@ class Input : - provide a feature space, and rows (matrix profile) - a data index map """ - # def learn(self,**_args): - # """ - # This function is designed to learn about, the data and persist - # :param table - # :param store - # """ - # table = _args['table'] - # reader = transport.factory.instance(**_args['store']) - # df = reader.read(table=table,limit=1) - # self.columns = df.columns.tolist() - - # self._metadf = pd.DataFrame(self.df[self._columns].dtypes.values.astype(str)).T #,self._columns] - # self._metadf.columns = self._columns - - # sql = "SELECT :fields from :table".replace(":table",table) - def __init__(self,**_args): """ - :param table + :param data :param store data-store parameters/configuration :param sql sql query that pulls a representative sample of the data """ @@ -70,29 +54,18 @@ class Input : pass else: self._initsql(**_args) + # + # We need to have a means to map of values,columns and vector positions in order + # to perform convert and revert to and from binary + # self._map = {} if 'map' not in _args else _args['map'] - # self._metadf = pd.DataFrame(self.df[self._columns].dtypes.values.astype(str)).T #,self._columns] - # self._metadf.columns = self._columns - # if 'gpu' in _args and 'GPU' in os.environ: - - # np = cp - # index = int(_args['gpu']) - # np.cuda.Device(index).use() - # print(['..:: GPU ',index]) def _initsql(self,**_args): """ This function will initialize the class on the basis of a data-store and 
optionally pre-defined columns to be used to be synthesized :param store data-store configuration - :param sql sql query to be applied to the transported data :param columns list of columns to be """ - # _store_args = _args['store'] - # reader = transport.factory.instance(**_store_args) - # sql = _args['sql'] - - # self.df = reader.read(sql=_args['sql']) - if 'columns' not in _args : self._initcols(data=self.df) @@ -128,14 +101,6 @@ class Input : :param data data-frame that holds the data :param columns columns that need to be synthesized if any """ - # - # setting class-level variables to be reused across the class - # self.df = _args['data'] - row_count = self.df.shape[0] - # self.columns = self.df.columns - # self._metadf = self.df.apply(lambda col: col.unique().size) - # _df = pd.DataFrame(self.df.apply(lambda col: col.unique().size )).T - # cols = None if 'columns' not in _args else _args['columns'] self._initcols(**_args) def convert(self,**_args): @@ -247,16 +212,3 @@ class Input : return cols,_matrix -if __name__ == '__main__' : - df = pd.read_csv('../../sample.csv') - _input = Input(data=df,columns=['age','race']) - _m = _input.convert(column='age') - print (_m.shape) - print (_input.revert(matrix=_m,column='age')) - print (_input._metadf) - -# _args = {"store":{"type":"sql.BQReader","args":{"service_key":"/home/steve/dev/aou/accounts/curation-prod.json"}}} -# _args['table'] = 'io.observation' -# _i = Input(**_args) -# df = pd.read_csv('../../sample.csv') -# print (Input.ToBinary(df.age)) \ No newline at end of file diff --git a/pipeline.py b/pipeline.py index 296d4d5..5fb62fe 100644 --- a/pipeline.py +++ b/pipeline.py @@ -101,11 +101,14 @@ class Components : df = pd.read_csv(args['file']) del args['file'] elif 'data' not in args : + reader = factory.instance(**args['store']['source']) + + if 'row_limit' in args : df = reader.read(sql=args['sql'],limit=args['row_limit']) else: - df = reader.read(sql=args['sql']) + df = reader.read(sql=args['sql']) schema = reader.meta(table=args['from']) if hasattr(reader,'meta') and 'from' in args else None else: df = args['data'] @@ -241,6 +244,7 @@ class Components : df.index = np.arange(df.shape[0]) self.post(data=df,schema=schema,store=args['store']['target']) def post(self,**_args) : + table = _args['from'] if 'from' in _args else _args['store']['table'] _schema = _args['schema'] if 'schema' in _args else None writer = factory.instance(**_args['store']) _df = _args['data'] @@ -251,13 +255,13 @@ class Components : _type = str _value = 0 if _item['type'] in ['DATE','TIMESTAMP','DATETIMESTAMP','DATETIME'] : - if _item['type'] == 'DATE' : + if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : # # There is an issue with missing dates that needs to be resolved. 
# for some reason a missing date/time here will cause the types to turn into timestamp (problem) # The following is a hack to address the issue (alas) assuming 10 digit dates and 'NaT' replaces missing date values (pandas specifications) # - _df[name] = _df[name].apply(lambda value: '' if str(value) == 'NaT' else str(value)[:10]) + _df[name] = _df[name].apply(lambda value: None if str(value) == 'NaT' else (str(value)[:10]) if _item['type'] in ['DATE','DATETIME'] else str(value)) #_df[name] = _df[name].dt.date # _df[name] = pd.to_datetime(_df[name].fillna(''),errors='coerce') else: @@ -274,11 +278,33 @@ class Components : _value = '' _df[name] = _df[name].fillna(_value) #.astype(_type) columns.append(name) - print () - print (_df) - writer.write(_df.astype(object),schema=_schema,table=args['from']) + + fields = _df.columns.tolist() + if not writer.has(table=table) and _args['store']['provider'] != 'bigquery': + + _map = {'STRING':'VARCHAR(256)','INTEGER':'BIGINT'} if 'provider' in _args['store'] and _args['store']['provider'] != 'bigquery' else {} + _params = {'map':_map,'table':args['from']} + if _schema : + _params['schema'] = _schema + + else: + _params['fields'] = fields + + writer.make(**_params) + + fields = _df.columns.tolist() + _df = _df[fields] + # writer.fields = fields + if _args['store']['provider'] == 'bigquery' : + print (['_______ POSTING ______________ ',table]) + print (['_______________ ',_df.shape[0],' ___________________']) + writer.write(_df.astype(object),schema=_schema,table=table) else: - writer.write(_df,table=args['from']) + writer.table = table + writer.write(_df) + # else: + # writer.write(_df,table=args['from']) + def finalize(self,args): """ @@ -288,8 +314,9 @@ class Components : """ reader = factory.instance(**args['store']['source']) logger = factory.instance(**args['store']['logs']) - target = args['store']['target']['args']['dataset'] - source = args['store']['source']['args']['dataset'] + + target = args['store']['target']['args']['dataset'] + source = args['store']['source']['args']['dataset'] table = args['from'] schema = reader.meta(table=args['from']) # @@ -327,7 +354,10 @@ class Components : This function will generate data and store it to a given, """ store = args['store']['logs'] - store['args']['doc'] = args['context'] + if 'args' in store : + store['args']['doc'] = args['context'] + else: + store['doc'] = args['context'] logger = factory.instance(**store) #type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) ostore = args['store']['target'] @@ -348,13 +378,13 @@ class Components : schema = reader.meta(table=args['from']) schema = [{"name":_item.name,"type":_item.field_type} for _item in schema] - # else: # # # # This will account for autopilot mode ... 
# df = args['data'] _cast = {} if schema : + for _item in schema : dtype = str name = _item['name'] @@ -405,139 +435,72 @@ class Components : logger.write(_info) if args['data'].shape[0] > 0 and args['data'].shape[1] > 0 : candidates = (data.maker.generate(**args)) + else: candidates = [df] - if 'sql.BQWriter' in ostore['type'] : - #table = ".".join([ostore['['dataset'],args['context']]) - # writer = factory.instance(**ostore) - _columns = None - skip_columns = [] - _schema = schema - if schema : - cols = [_item['name'] for _item in _schema] - else: - cols = df.columns - for _df in candidates : - # - # we need to format the fields here to make sure we have something cohesive - # + + # if 'sql.BQWriter' in ostore['type'] : + _columns = None + skip_columns = [] + _schema = schema + if schema : + cols = [_item['name'] for _item in _schema] + else: + cols = df.columns.tolist() + _info = {"module":"gan-prep","action":"selection","input":{"candidates":len(candidates),"features":cols}} + logger.write(_info) + for _df in candidates : + # + # we need to format the fields here to make sure we have something cohesive + # - if not skip_columns : - # _columns = set(df.columns) - set(_df.columns) - if 'ignore' in args and 'columns' in args['ignore'] : - skip_columns = self.get_ignore(data=_df,columns=args['ignore']['columns']) - # for name in args['ignore']['columns'] : - # for _name in _df.columns: - # if _name in name: - # skip_columns.append(_name) - # - # We perform a series of set operations to insure that the following conditions are met: - # - the synthetic dataset only has fields that need to be synthesized - # - The original dataset has all the fields except those that need to be synthesized - # - - _df = _df[list(set(_df.columns) - set(skip_columns))].copy() - if x_cols : - _approx = {} - for _col in x_cols : - if real_df[_col].unique().size > 0 : - + if not skip_columns : + if 'ignore' in args and 'columns' in args['ignore'] : + skip_columns = self.get_ignore(data=_df,columns=args['ignore']['columns']) + # + # We perform a series of set operations to insure that the following conditions are met: + # - the synthetic dataset only has fields that need to be synthesized + # - The original dataset has all the fields except those that need to be synthesized + # + + _df = _df[list(set(_df.columns) - set(skip_columns))].copy() + if x_cols : + _approx = {} + for _col in x_cols : + if real_df[_col].unique().size > 0 : + - _df[_col] = self.approximate(real_df[_col].values) - _approx[_col] = { - "io":{"min":_df[_col].min().astype(float),"max":_df[_col].max().astype(float),"mean":_df[_col].mean().astype(float),"sd":_df[_col].values.std().astype(float),"missing": _df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":_df[_col].where(_df[_col] == 0).dropna().count().astype(float)}, - "real":{"min":real_df[_col].min().astype(float),"max":real_df[_col].max().astype(float),"mean":real_df[_col].mean().astype(float),"sd":real_df[_col].values.std().astype(float),"missing": real_df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count().astype(float)} - } - else: - _df[_col] = -1 - logger.write({"module":"gan-generate","action":"approximate","status":_approx}) - if set(df.columns) & set(_df.columns) : - _columns = set(df.columns) - set(_df.columns) - df = df[_columns] + _df[_col] = self.approximate(real_df[_col].values) + _approx[_col] = { + 
"io":{"min":_df[_col].min().astype(float),"max":_df[_col].max().astype(float),"mean":_df[_col].mean().astype(float),"sd":_df[_col].values.std().astype(float),"missing": _df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":_df[_col].where(_df[_col] == 0).dropna().count().astype(float)}, + "real":{"min":real_df[_col].min().astype(float),"max":real_df[_col].max().astype(float),"mean":real_df[_col].mean().astype(float),"sd":real_df[_col].values.std().astype(float),"missing": real_df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count().astype(float)} + } + else: + _df[_col] = -1 + logger.write({"module":"gan-generate","action":"approximate","status":_approx}) + if set(df.columns) & set(_df.columns) : + _columns = list(set(df.columns) - set(_df.columns)) + df = df[_columns] - # - # Let us merge the dataset here and and have a comprehensive dataset + # + # Let us merge the dataset here and and have a comprehensive dataset - _df = pd.DataFrame.join(df,_df) - - # if _schema : - # for _item in _schema : - # if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] : - # _df[_item['name']] = _df[_item['name']].astype(str) - - # pass - _params = {'data':_df,'store' : ostore} - if _schema : - _params ['schema'] = _schema - self.post(**_params) - # if _schema : - # writer.write(_df[cols],schema=_schema,table=args['from']) - # self.post(data=_df,schema=) - # else: - # writer.write(_df[cols],table=args['from']) + _df = pd.DataFrame.join(df,_df) + _params = {'data':_df,'store' : ostore} + if _schema : + _params ['schema'] = _schema + _info = {"module":"gan-prep","action":"write","input":{"rows":_df.shape[0],"cols":_df.shape[1]}} + logger.write(_info) + self.post(**_params) + # print (['_______ posting _________________',_df.shape]) + break + pass # else: # pass - - - # # - # # We need to post the generate the data in order to : - # # 1. compare immediately - # # 2. 
synthetic copy - # # - - # cols = _dc.columns.tolist() - - # data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) - # # - # # performing basic analytics on the synthetic data generated (easy to quickly asses) - # # - # info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} - - # # - # # @TODO: Send data over to a process for analytics - - # base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) - # cols = _dc.columns.tolist() - # for name in cols : - # _args['data'][name] = _dc[name] - - # # - # #-- Let us store all of this into bigquery - # prefix = args['notify']+'.'+_args['context'] - # partition = str(partition) - # table = '_'.join([prefix,partition,'io']).replace('__','_') - # folder = os.sep.join([args['logs'],args['context'],partition,'output']) - # if 'file' in args : - - # _fname = os.sep.join([folder,table.replace('_io','_full_io.csv')]) - # _pname = os.sep.join([folder,table])+'.csv' - # data_comp.to_csv( _pname,index=False) - # _args['data'].to_csv(_fname,index=False) - - # _id = 'path' - # else: - - # credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - # _pname = os.sep.join([folder,table+'.csv']) - # _fname = table.replace('_io','_full_io') - # partial = '.'.join(['io',args['context']+'_partial_io']) - # complete= '.'.join(['io',args['context']+'_full_io']) - # data_comp.to_csv(_pname,index=False) - # if 'dump' in args : - # print (_args['data'].head()) - # else: - # Components.lock.acquire() - # data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) - # _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) - # Components.lock.release() - # _id = 'dataset' - # info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } - # if partition : - # info ['partition'] = int(partition) - # logger.write({"module":"generate","action":"write","input":info} ) - + def bind(self,**_args): + print (_args) if __name__ == '__main__' : @@ -611,6 +574,50 @@ if __name__ == '__main__' : generator = Components() generator.generate(args) + elif 'bind' in SYS_ARGS : + import binder + _args = _config['_map'] + _args['store'] = copy.deepcopy(_config['store']) + if 'init' in SYS_ARGS : + # + # Creating and persisting the map ... + print (['.... 
Binding Initialization']) + # jobs = binder.Init(**_args) + _mapped = binder.Init(**_args) + + + _schema = [{"name":_name,"type":"INTEGER"} for _name in _mapped.columns.tolist()] + publisher = lambda _params: (Components()).post(**_params) + _args = {'data':_mapped,'store':_config['store']['target']} + _args['store']['table'] = '_map' + if _args['store']['provider'] =='bigquery' : + _args['schema'] = _schema + + job = Process (target = publisher,args=(_args,)) + job.start() + jobs = [job] + else: + # + # Applying the map of k on a particular dataset + # + index = int(SYS_ARGS['index']) + _args['config'] = _config['pipeline'][index] + _args['original_key'] = 'person_id' if 'original_key' in _config else 'person_id' + table = _config['pipeline'][index]['from'] + _df = binder.ApplyOn(**_args) + _df = np.array_split(_df,PART_SIZE) + jobs = [] + print (['Publishing ',PART_SIZE,' PARTITION']) + for data in _df : + publisher = lambda _params: ( Components() ).post(**_params) + _args = {'data':data,'store':_config['store']['target']} + _args['store']['table'] = table + print (_args['store']) + job = Process(target = publisher,args=(_args,)) + job.name = "Publisher "+str(len(jobs)+1) + job.start() + jobs.append(job) + elif 'shuffle' in SYS_ARGS : index = 0 if GPU_CHIPS and 'all-chips' in SYS_ARGS: @@ -632,6 +639,7 @@ if __name__ == '__main__' : # Let us create n-jobs across n-gpus, The assumption here is the data that is produced will be a partition # @TODO: Find better name for partition # + if GPU_CHIPS and 'all-chips' in SYS_ARGS: index = 0 print (['... launching ',len(GPU_CHIPS),' jobs',args['context']]) @@ -652,12 +660,15 @@ if __name__ == '__main__' : else: # # The choice of the chip will be made internally + agent = Components() agent.train(**args) # # If we have any obs we should wait till they finish # DIRTY = 0 + if (len(jobs)) : + print (['.... 
waiting on ',len(jobs),' jobs']) while len(jobs)> 0 : DIRTY =1 jobs = [job for job in jobs if job.is_alive()] @@ -666,47 +677,16 @@ if __name__ == '__main__' : print (["..:: jobs finished "]) # # We need to harmonize the keys if any at all in this case we do this for shuffle or generate operations - # - - if 'autopilot' in SYS_ARGS or 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) : - # - # We should pull all the primary keys and regenerate them in order to insure some form of consistency - # - print (["..:: Finalizing process"]) - (Components()).finalize(args) - # finalize(args) - pass - # jobs = [] - # for index in range(0,PART_SIZE) : - # if 'focus' in args and int(args['focus']) != index : - # continue - # args['part_size'] = PART_SIZE - # args['partition'] = index - # args['data'] = DATA[index] - # if int(args['num_gpu']) > 1 : - # args['gpu'] = index - # else: - # args['gpu']=0 + # This holds true for bigquery - bigquery only + IS_BIGQUERY = _config['store']['source']['provider'] == _config['store']['target']['provider'] and _config['store']['source']['provider'] == 'bigquery' - # make = lambda _args: (Components()).train(**_args) - # job = Process(target=make,args=( dict(args),)) - # job.name = 'Trainer # ' + str(index) - # job.start() - # jobs.append(job) - # # args['gpu'] - # print (["Started ",len(jobs),"trainers" if len(jobs)>1 else "trainer" ]) - # while len(jobs)> 0 : - # jobs = [job for job in jobs if job.is_alive()] - # time.sleep(2) + # if 'bind' not in SYS_ARGS and IS_BIGQUERY and ('autopilot' in SYS_ARGS or 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS)) : + # # + # # We should pull all the primary keys and regenerate them in order to insure some form of consistency + # # - # trainer = Components() - # trainer.train(**args) - + # # + # # - # Components.train(**args) -#for args in PIPELINE : - #args['dataset'] = 'combined20190510' - #process = Process(target=Components.train,args=(args,)) - #process.name = args['context'] - #process.start() -# Components.train(args) + # print (["..:: Finalizing process"]) + # (Components()).finalize(args) From 964ddb06abec16de023a05d754837b1410bb80f1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 24 Mar 2022 11:47:02 -0500 Subject: [PATCH 179/250] version increment --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index c43bd15..d3f0d4b 100644 --- a/setup.py +++ b/setup.py @@ -5,10 +5,10 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker", - "version":"1.4.7.6", + "version":"1.4.7.8", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} -args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] +args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' if sys.version_info[0] == 2 : From 0384a2e96f40d98bb28aae0b723e2cec865fe9cd Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 11 Apr 2022 18:33:07 -0500 Subject: [PATCH 180/250] bug fixes and simplified interface --- data/maker/__init__.py | 177 +++++++++++++++++++++++- data/maker/prepare/__init__.py | 4 
+- finalize.py | 240 --------------------------------- pipeline.py | 2 +- 4 files changed, 177 insertions(+), 246 deletions(-) delete mode 100644 finalize.py diff --git a/data/maker/__init__.py b/data/maker/__init__.py index a7d8d69..bf388a6 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -11,13 +11,15 @@ This package is designed to generate synthetic data from a dataset from an origi import pandas as pd import numpy as np import data.gan as gan -from transport import factory +import transport from data.bridge import Binary import threading as thread from data.maker import prepare import copy import os import json +from multiprocessing import Process, RLock + class ContinuousToDiscrete : ROUND_UP = 2 @@ -101,7 +103,7 @@ def train (**_args): else: args['store']['doc'] = _args['context'] - logger = factory.instance(**args['store']) + logger = transport.factory.instance(**args['store']) args['logger'] = logger for key in _inputhandler._map : @@ -193,4 +195,173 @@ def generate(**_args): candidates = handler.apply(candidates=args['candidates']) return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates] - + +class Learner(Process): + def __init__(self,**_args): + + + super(Learner, self).__init__() + if 'gpu' in _args : + print (_args['gpu']) + os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu']) + self.gpu = int(_args['gpu']) + else: + self.gpu = None + self.info = _args['info'] + self.columns = self.info['columns'] if 'columns' in self.info else None + self.store = _args['store'] + if 'network_args' not in _args : + self.network_args ={ + 'context':_args['context'] if 'context' in _args else 'GENERAL', + 'logs':_args['logpath'] if 'logpath' in _args else 'logs', + 'max_epochs':int(_args['epochs']) if 'epochs' in _args else 2, + 'batch_size':int (_args['batch']) if 'batch' in _args else 2000 + } + else: + self.network_args = _args['network_args'] + self._encoder = None + self._map = None + self._df = _args['data'] if 'data' in _args else None + # + # @TODO: allow for verbose mode so we have a sens of what is going on within the newtork + # + + # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' + # sel.max_epoc + def get_schema(self): + return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] + def initalize(self): + reader = transport.factory.instance(**self.store['source']) + _read_args= self.info + if self._df is None : + self._df = reader.read(**_read_args) + columns = self.columns if self.columns else self._df.columns + # + # convert the data to binary here ... 
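+        # (prepare.Input builds the binary matrix the network consumes, reusing the
+        #  value map when one was supplied so generation can apply the same encoding)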
+ + _args = {"schema":self.get_schema(),"data":self._df,"columns":columns} + if self._map : + _args['map'] = self._map + self._encoder = prepare.Input(**_args) +class Trainer(Learner): + """ + This will perform training using a GAN + """ + def __init__(self,**_args): + super().__init__(**_args) + # self.info = _args['info'] + self.limit = int(_args['limit']) if 'limit' in _args else None + self.name = _args['name'] + self.autopilot = _args['autopilot'] if 'autopilot' in _args else False + self.generate = None + self.candidates = int(_args['candidates']) if 'candidates' in _args else 1 + def run(self): + self.initalize() + _space,_matrix = self._encoder.convert() + + _args = self.network_args + if self.gpu : + _args['gpu'] = self.gpu + _args['real'] = _matrix + _args['candidates'] = self.candidates + # + # At this point we have the binary matrix, we can initiate training + # + + gTrain = gan.Train(**_args) + gTrain.apply() + + writer = transport.factory.instance(provider='file',context='write',path=os.sep.join([gTrain.out_dir,'map.json'])) + writer.write(self._encoder._map,overwrite=True) + writer.close() + + # + # @TODO: At this point we need to generate another some other objects + # + _args = {"network_args":self.network_args,"store":self.store,"info":self.info,"candidates":self.candidates,"data":self._df} + if self.gpu : + _args['gpu'] = self.gpu + g = Generator(**_args) + # g.run() + self.generate = g + if self.autopilot : + self.generate.run() + def generate (self): + if self.autopilot : + print( "Autopilot is set ... No need to call this function") + else: + raise Exception( "Autopilot has not been, Wait till training is finished. Use is_alive function on process object") + +class Generator (Learner): + def __init__(self,**_args): + super().__init__(**_args) + # + # We need to load the mapping information for the space we are working with ... 
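+        # The map is written by Trainer.run at training time and records, for each
+        # column, the bit positions (beg/end) and the values they encode, so that
+        # generated matrices can be reverted to raw values.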
+ # + self.network_args['candidates'] = int(_args['candidates']) if 'candidates' in _args else 1 + filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'map.json']) + file = open(filename) + self._map = json.loads(file.read()) + file.close() + def run(self): + self.initalize() + # + # The values will be returned because we have provided _map information from the constructor + # + values,_matrix = self._encoder.convert() + _args = self.network_args + _args['map'] = self._map + _args['values'] = np.array(values) + _args['row_count'] = self._df.shape[0] + + gHandler = gan.Predict(**_args) + gHandler.load_meta(columns=None) + _iomatrix = gHandler.apply() + _candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix] + self.post(_candidates) + def appriximate(self,_df): + _columns = self.info['approximate'] + _schema = {} + for _info in self.get_schema() : + _schema[_info['name']] = _info['type'] + + + for name in _columns : + batches = np.array_split(_df[name].values,10) + x = [] + for values in batches : + _values = np.random.dirichlet(values) + x += list(values + _values )if np.random.randint(0,2) else list(values - _values) + _df[name] = np.int64(x) if 'int' in _schema[name] else np.float64(x) + return _df + def format(self,_df): + pass + def post(self,_candidates): + + _store = self.store['target'] if 'target' in self.store else {'provider':'console'} + _store['lock'] = True + writer = transport.factory.instance(**_store) + + for _iodf in _candidates : + _df = self._df.copy() + _df[self.columns] = _iodf[self.columns] + if 'approximate' in self.info : + + _df = self.appriximate(_df) + writer.write(_df,schema=self.get_schema()) + pass +class factory : + _infocache = {} + @staticmethod + def instance(**_args): + """ + An instance of an object that trains and generates candidate datasets + :param gpu (optional) index of the gpu to be used if using one + :param store {source,target} if no target is provided console will be output + :param epochs (default 2) number of epochs to train + :param candidates(default 1) number of candidates to generate + :param info {columns,sql,from} + :param autopilot will generate output automatically + :param batch (default 2k) size of the batch + """ + return Trainer(**_args) \ No newline at end of file diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 6e67cb2..478d435 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -128,7 +128,7 @@ class Input : cols, _matrix = self.tobinary(_df[name],values) _beg,_end = i,i+len(cols) if name not in self._map : - self._map[name] = {"beg":_beg,"end":_end ,"values":cols} + self._map[name] = {"beg":_beg,"end":_end ,"values":cols.tolist()} i += len(cols) if not _m.shape[0]: _m = _matrix ; @@ -196,7 +196,7 @@ class Input : # In the advent the sample rows do NOT have the values of the cols = rows.unique() cols = np.array(cols) - row_count = len(rows) + row_count = np.int64(len(rows)) # if 'GPU' not in os.environ : # _matrix = np.zeros([row_count,cols.size],dtype=int) # diff --git a/finalize.py b/finalize.py deleted file mode 100644 index d420d7d..0000000 --- a/finalize.py +++ /dev/null @@ -1,240 +0,0 @@ -#!/usr/bin/env python3 -""" -This file will perform basic tasks to finalize the GAN process by performing the following : - - basic stats & analytics - - rebuild io to another dataset -""" -import pandas as pd -import numpy as np -from multiprocessing import Process, Lock -from google.oauth2 import service_account -from 
google.cloud import bigquery as bq -import transport -from data.params import SYS_ARGS -import json - -class Analytics : - """ - This class will compile basic analytics about a given dataset i.e compare original/synthetic - """ - @staticmethod - def distribution(**args): - context = args['context'] - df = args['data'] - # - #-- This data frame counts unique values for each feature (space) - df_counts = pd.DataFrame(df.apply(lambda col: col.unique().size),columns=['counts']).T # unique counts - # - #-- Get the distributions for common values - # - names = [name for name in df_counts.columns.tolist() if name.endswith('_io') == False] - ddf = df.apply(lambda col: pd.DataFrame(col.values,columns=[col.name]).groupby([col.name]).size() ).fillna(0) - ddf[context] = ddf.index - - pass - def distance(**args): - """ - This function will measure the distance between - """ - pass -class Utils : - @staticmethod - def log(**args): - logger = transport.factory.instance(type="mongo.MongoWriter",args={"dbname":"aou","doc":"logs"}) - logger.write(args) - logger.close() - class get : - @staticmethod - def pipeline(table,path) : - # contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts'] - config = json.loads((open(path)).read()) - pipeline = config['pipeline'] - # return [ item for item in pipeline if item['context'] in contexts] - pipeline = [item for item in pipeline if 'from' in item and item['from'].strip() == table] - Utils.log(module=table,action='init',input={"pipeline":pipeline}) - return pipeline - @staticmethod - def sql(**args) : - """ - This function is intended to build SQL query for the remainder of the table that was not synthesized - :config configuration entries - :from source of the table name - :dataset name of the source dataset - - """ - SQL = ["SELECT * FROM :from "] - SQL_FILTER = [] - NO_FILTERS_FOUND = True - # pipeline = Utils.get.config(**args) - pipeline = args['pipeline'] - REVERSE_QUALIFIER = {'IN':'NOT IN','NOT IN':'IN','=':'<>','<>':'='} - for item in pipeline : - - - if 'filter' in item : - if NO_FILTERS_FOUND : - NO_FILTERS_FOUND = False - SQL += ['WHERE'] - # - # Let us load the filter in the SQL Query - FILTER = item['filter'] - QUALIFIER = REVERSE_QUALIFIER[FILTER['qualifier'].upper()] - SQL_FILTER += [" ".join([FILTER['field'], QUALIFIER,'(',FILTER['value'],')']).replace(":dataset",args['dataset'])] - src = ".".join([args['dataset'],args['from']]) - SQL += [" AND ".join(SQL_FILTER)] - # - # let's pull the field schemas out of the table definition - # - Utils.log(module=args['from'],action='sql',input={"sql":" ".join(SQL) }) - return " ".join(SQL).replace(":from",src) - - -def mk(**args) : - dataset = args['dataset'] - client = args['client'] if 'client' in args else bq.Client.from_service_account_file(args['private_key']) - # - # let us see if we have a dataset handy here - # - datasets = list(client.list_datasets()) - found = [item for item in datasets if item.dataset_id == dataset] - - if not found : - - return client.create_dataset(dataset) - return found[0] - -def move (args): - """ - This function will move a table from the synthetic dataset into a designated location - This is the simplest case for finalizing a synthetic data set - :private_key - """ - pipeline = Utils.get.pipeline(args['from'],args['config']) - _args = json.loads((open(args['config'])).read()) - _args['pipeline'] = pipeline - # del _args['pipeline'] - args = dict(args,**_args) - # del args['pipeline'] - # private_key = args['private_key'] - client = 
bq.Client.from_service_account_json(args['private_key']) - - dataset = args['dataset'] - if pipeline : - SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in pipeline] - SQL += [Utils.get.sql(**args)] - SQL = ('\n UNION ALL \n'.join(SQL).replace(':dataset','io')) - else: - # - # moving a table to a designated location - tablename = args['from'] - if 'sql' not in args : - SQL = "SELECT * FROM :dataset.:table" - else: - SQL = args['sql'] - SQL = SQL.replace(":dataset",dataset).replace(":table",tablename) - Utils.log(module=args['from'],action='sql',input={'sql':SQL}) - # - # At this point we have gathered all the tables in the io folder and we should now see if we need to merge with the remainder from the original table - # - - - - odataset = mk(dataset=dataset+'_io',client=client) - # SQL = "SELECT * FROM io.:context_full_io".replace(':context',context) - config = bq.QueryJobConfig() - config.destination = client.dataset(odataset.dataset_id).table(args['from']) - config.use_query_cache = True - config.allow_large_results = True - config.priority = 'INTERACTIVE' - # - # - - schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema - fields = [" ".join(["CAST (",item.name,"AS",item.field_type.replace("INTEGER","INT64").replace("FLOAT","FLOAT64"),") ",item.name]) for item in schema] - SQL = SQL.replace("*"," , ".join(fields)) - # print (SQL) - out = client.query(SQL,location='US',job_config=config) - Utils.log(module=args['from'],action='move',input={'job':out.job_id}) - return (out.job_id) - - - - -import pandas as pd -import numpy as np -from google.oauth2 import service_account -import json - -# path = '../curation-prod.json' -# credentials = service_account.Credentials.from_service_account_file(path) -# df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard') -filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config'] -f = open(filename) -config = json.loads(f.read()) -args = config['pipeline'] -f.close() - - -if __name__ == '__main__' : - """ - Usage : - finalize -- --contexts --from
- """ - - if 'move' in SYS_ARGS : - - if 'init' in SYS_ARGS : - dep = config['dep'] if 'dep' in config else {} - info = [] - - if 'queries' in dep : - info += dep['queries'] - print ('________') - if 'tables' in dep : - info += dep['tables'] - args = {} - jobs = [] - for item in info : - args = {} - if type(item) == str : - args['from'] = item - name = item - else: - args = item - name = item['from'] - args['config'] = SYS_ARGS['config'] - # args['pipeline'] = [] - job = Process(target=move,args=(args,)) - job.name = name - jobs.append(job) - job.start() - - - # while len(jobs) > 0 : - # jobs = [job for job in jobs if job.is_alive()] - # time.sleep(1) - - - else: - move(SYS_ARGS) - # # table = SYS_ARGS['from'] - # # args = dict(config,**{"private_key":"../curation-prod.json"}) - # args = dict(args,**SYS_ARGS) - # contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']] - # log = [] - # if contexts : - # args['contexts'] = contexts - # log = move(**args) - - # else: - # tables = args['from'].split(',') - # for name in tables : - # name = name.strip() - # args['from'] = name - # log += [move(**args)] - # print ("\n".join(log)) - - - - else: - print ("NOT YET READY !") \ No newline at end of file diff --git a/pipeline.py b/pipeline.py index 5fb62fe..9d095d9 100644 --- a/pipeline.py +++ b/pipeline.py @@ -486,7 +486,7 @@ class Components : # Let us merge the dataset here and and have a comprehensive dataset _df = pd.DataFrame.join(df,_df) - _params = {'data':_df,'store' : ostore} + _params = {'data':_df,'store' : ostore,'from':args['from']} if _schema : _params ['schema'] = _schema _info = {"module":"gan-prep","action":"write","input":{"rows":_df.shape[0],"cols":_df.shape[1]}} From ee518316c07d26ac546c0d7870de9584239c7b47 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 11 Apr 2022 18:52:46 -0500 Subject: [PATCH 181/250] verion update --- setup.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index d3f0d4b..3822df5 100644 --- a/setup.py +++ b/setup.py @@ -4,17 +4,12 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker", - "version":"1.4.7.8", - "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.5.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} -args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow','pandas','pandas-gbq','pymongo'] -args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' +args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] +args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' if sys.version_info[0] == 2 : args['use_2to3'] = False args['use_2to3_exclude_fixers'] = ['lib2to3.fixes.fix_import'] -args['scripts']=['pipeline.py','finalize.py'] setup(**args) - - From 0797e3dba18580d275a668e9732534e5414c6eb3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 11 Apr 2022 23:27:25 -0500 Subject: [PATCH 182/250] post processing features with dates --- data/maker/__init__.py | 78 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 15 deletions(-) diff --git a/data/maker/__init__.py 
b/data/maker/__init__.py index bf388a6..9d3bdb5 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -12,14 +12,14 @@ import pandas as pd import numpy as np import data.gan as gan import transport -from data.bridge import Binary +# from data.bridge import Binary import threading as thread from data.maker import prepare import copy import os import json from multiprocessing import Process, RLock - +from datetime import datetime, timedelta class ContinuousToDiscrete : ROUND_UP = 2 @@ -229,7 +229,11 @@ class Learner(Process): # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' # sel.max_epoc def get_schema(self): - return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] + if self.store['source']['provider'] != 'bigquery' : + return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] + else: + reader = transport.factory.instance(**self.store['source']) + return reader.meta(table=self.info['from']) def initalize(self): reader = transport.factory.instance(**self.store['source']) _read_args= self.info @@ -319,21 +323,56 @@ class Generator (Learner): _iomatrix = gHandler.apply() _candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix] self.post(_candidates) - def appriximate(self,_df): + def approximate(self,_df): _columns = self.info['approximate'] - _schema = {} - for _info in self.get_schema() : - _schema[_info['name']] = _info['type'] + # _schema = {} + # for _info in self.get_schema() : + # _schema[_info['name']] = _info['type'] for name in _columns : - batches = np.array_split(_df[name].values,10) + batches = np.array_split(_df[name].fillna(np.nan).values,2) + _type = np.int64 if 'int' in self.info['approximate'][name]else np.float64 x = [] for values in batches : - _values = np.random.dirichlet(values) - x += list(values + _values )if np.random.randint(0,2) else list(values - _values) - _df[name] = np.int64(x) if 'int' in _schema[name] else np.float64(x) + + index = np.where(values != '') + _values = np.random.dirichlet(values[index].astype(_type)) + values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values) + values[index] = values[index].astype(_type) + x += values.tolist() + if x : + _df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64) return _df + def make_date(self,**_args) : + """ + :param year initial value + """ + if _args['year'] in ['',None,np.nan] : + return None + year = int(_args['year']) + offset = _args['offset'] if 'offset' in _args else 0 + month = np.random.randint(1,13) + if month == 2: + _end = 28 if year % 4 != 0 else 29 + else: + _end = 31 if month in [1,3,5,7,8,10,12] else 30 + day = np.random.randint(1,_end) + + #-- synthetic date + _date = datetime(year=year,month=month,day=day) + FORMAT = _args['format'] if 'format' in _args else '%Y-%m-%d' + r = [] + if offset : + r = [_date.strftime(FORMAT)] + for _delta in offset : + _date = _date + timedelta(_delta) + r.append(_date.strftime(FORMAT)) + return r + else: + return _date.strftime(FORMAT) + + pass def format(self,_df): pass def post(self,_candidates): @@ -345,10 +384,19 @@ class Generator (Learner): for _iodf in _candidates : _df = self._df.copy() _df[self.columns] = _iodf[self.columns] - if 'approximate' in self.info : - - _df = self.appriximate(_df) - writer.write(_df,schema=self.get_schema()) + if 'approximate' in 
self.info : + _df = self.approximate(_df) + if 'make_date' in self.info : + for name in self.info['make_date'] : + # iname = self.info['make_date']['init_field'] + iname = self.info['make_date'][name] + + years = _df[iname] + _dates = [self.make_date(year=year) for year in years] + if _dates : + _df[name] = _dates + + writer.write(_df[['birth_datetime']+self.columns],schema=self.get_schema()) pass class factory : _infocache = {} From a35c0ed6a28dbd66e274ca04be0bd70bc34e408f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 11 Apr 2022 23:40:23 -0500 Subject: [PATCH 183/250] bug fix ... --- data/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/__init__.py b/data/__init__.py index 0ca216d..0f84ec8 100644 --- a/data/__init__.py +++ b/data/__init__.py @@ -1,4 +1,4 @@ -import data.params as params +# import data.params as params from data.params import SYS_ARGS import transport from multiprocessing import Process, Queue From 260f1021863a00c3bf36e41a59b179ad8c04883c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 12 Apr 2022 13:16:48 -0500 Subject: [PATCH 184/250] bug fixes, added logger (not yet using though) --- data/__init__.py | 12 ------------ data/maker/__init__.py | 6 ++++++ 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/data/__init__.py b/data/__init__.py index 0f84ec8..2b4a6aa 100644 --- a/data/__init__.py +++ b/data/__init__.py @@ -3,15 +3,3 @@ from data.params import SYS_ARGS import transport from multiprocessing import Process, Queue from data.maker import prepare - -class Trainer (Process) : - pass -class Maker(Process): - pass - -if __name__ == '__main__' : - - logger = transport.factory.instance(SYS_ARGS['store']['logger']) - - - \ No newline at end of file diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 9d3bdb5..d91c89e 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -207,9 +207,11 @@ class Learner(Process): self.gpu = int(_args['gpu']) else: self.gpu = None + self.info = _args['info'] self.columns = self.info['columns'] if 'columns' in self.info else None self.store = _args['store'] + self.logger = transport.factory.instance(_args['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) if 'network_args' not in _args : self.network_args ={ 'context':_args['context'] if 'context' in _args else 'GENERAL', @@ -379,11 +381,15 @@ class Generator (Learner): _store = self.store['target'] if 'target' in self.store else {'provider':'console'} _store['lock'] = True + _store['context'] = 'write' #-- Just in case writer = transport.factory.instance(**_store) for _iodf in _candidates : _df = self._df.copy() _df[self.columns] = _iodf[self.columns] + # + #@TODO: + # Improve formatting with better post-processing pipeline if 'approximate' in self.info : _df = self.approximate(_df) if 'make_date' in self.info : From d6fd7bceba5f9d1634432d2404f65f1a4a656d16 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 12 Apr 2022 14:00:03 -0500 Subject: [PATCH 185/250] bug fix --- data/maker/__init__.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index d91c89e..e2a072a 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -214,7 +214,7 @@ class Learner(Process): self.logger = transport.factory.instance(_args['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) if 'network_args' not 
in _args : self.network_args ={ - 'context':_args['context'] if 'context' in _args else 'GENERAL', + 'context':self.info['context'] , 'logs':_args['logpath'] if 'logpath' in _args else 'logs', 'max_epochs':int(_args['epochs']) if 'epochs' in _args else 2, 'batch_size':int (_args['batch']) if 'batch' in _args else 2000 @@ -363,7 +363,13 @@ class Generator (Learner): #-- synthetic date _date = datetime(year=year,month=month,day=day) - FORMAT = _args['format'] if 'format' in _args else '%Y-%m-%d' + FORMAT = '%Y-%m-%d' + if 'format' in self.info and 'field' in _args and _args['field'] in self.info['format']: + _name = _args['field'] + FORMAT = self.info['format'][_name] + + + r = [] if offset : r = [_date.strftime(FORMAT)] @@ -382,6 +388,8 @@ class Generator (Learner): _store = self.store['target'] if 'target' in self.store else {'provider':'console'} _store['lock'] = True _store['context'] = 'write' #-- Just in case + if 'table' not in _store : + _store['table'] = self.info['from'] writer = transport.factory.instance(**_store) for _iodf in _candidates : @@ -398,11 +406,12 @@ class Generator (Learner): iname = self.info['make_date'][name] years = _df[iname] - _dates = [self.make_date(year=year) for year in years] + _dates = [self.make_date(year=year,field=name) for year in years] if _dates : _df[name] = _dates - - writer.write(_df[['birth_datetime']+self.columns],schema=self.get_schema()) + _schema = self.get_schema() + _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] + writer.write(_df[['birth_datetime']+self.columns],schema=_schema) pass class factory : _infocache = {} From 838c7978de6f85ce7fe1affadbb9b6b60b4a633c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 12 Apr 2022 14:32:39 -0500 Subject: [PATCH 186/250] bug fix: gpu visibility --- data/gan.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/data/gan.py b/data/gan.py index f5705ea..e0f97b1 100644 --- a/data/gan.py +++ b/data/gan.py @@ -61,16 +61,19 @@ class GNet : self.logs = {} # self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] - self.GPU_CHIPS = None if 'gpu' not in args else args['gpu'] - if self.GPU_CHIPS is None: - self.GPU_CHIPS = [0] - if 'CUDA_VISIBLE_DEVICES' in os.environ : - os.environ.pop('CUDA_VISIBLE_DEVICES') - self.NUM_GPUS = 0 - else: - self.NUM_GPUS = len(self.GPU_CHIPS) + # self.GPU_CHIPS = None if 'gpu' not in args else args['gpu'] + # if self.GPU_CHIPS is None: + # self.GPU_CHIPS = [0] + # if 'CUDA_VISIBLE_DEVICES' in os.environ : + # os.environ.pop('CUDA_VISIBLE_DEVICES') + # self.NUM_GPUS = 0 + # else: + # self.NUM_GPUS = len(self.GPU_CHIPS) # os.environ['CUDA_VISIBLE_DEVICES'] = str(self.GPU_CHIPS[0]) - + self.NUM_GPUS = 0 if 'gpu' not in args else args['gpu'] + self.GPU_CHIPS = None if self.NUM_GPUS == 0 else [args['gpu']] + if self.GPU_CHIPS : + os.environ['CUDA_VISIBLE_DEVICES'] = str(self.GPU_CHIPS[0]) self.PARTITION = args['partition'] if 'partition' in args else None # if self.NUM_GPUS > 1 : # os.environ['CUDA_VISIBLE_DEVICES'] = "4" From 4aaefedce02a063abc2f512031d9f9cf40e51ff5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 12 Apr 2022 14:35:58 -0500 Subject: [PATCH 187/250] bug fix --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index e2a072a..b7608d7 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -411,7 +411,7 @@ class Generator (Learner): _df[name] = _dates _schema = self.get_schema() _schema 
= [{'name':_item.name,'type':_item.field_type} for _item in _schema] - writer.write(_df[['birth_datetime']+self.columns],schema=_schema) + writer.write(_df[self.columns],schema=_schema) pass class factory : _infocache = {} From bbbeb5172a274d1ed15718a6a55878bd5a45eba0 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 12 Apr 2022 14:50:19 -0500 Subject: [PATCH 188/250] bug fix --- data/gan.py | 21 +++++++++------------ data/maker/__init__.py | 3 ++- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/data/gan.py b/data/gan.py index e0f97b1..26f19a2 100644 --- a/data/gan.py +++ b/data/gan.py @@ -61,19 +61,16 @@ class GNet : self.logs = {} # self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] - # self.GPU_CHIPS = None if 'gpu' not in args else args['gpu'] - # if self.GPU_CHIPS is None: - # self.GPU_CHIPS = [0] - # if 'CUDA_VISIBLE_DEVICES' in os.environ : - # os.environ.pop('CUDA_VISIBLE_DEVICES') - # self.NUM_GPUS = 0 - # else: - # self.NUM_GPUS = len(self.GPU_CHIPS) + self.GPU_CHIPS = None if 'gpu' not in args else [args['gpu']] + if self.GPU_CHIPS is None: + self.GPU_CHIPS = [0] + if 'CUDA_VISIBLE_DEVICES' in os.environ : + os.environ.pop('CUDA_VISIBLE_DEVICES') + self.NUM_GPUS = 0 + else: + self.NUM_GPUS = len(self.GPU_CHIPS) # os.environ['CUDA_VISIBLE_DEVICES'] = str(self.GPU_CHIPS[0]) - self.NUM_GPUS = 0 if 'gpu' not in args else args['gpu'] - self.GPU_CHIPS = None if self.NUM_GPUS == 0 else [args['gpu']] - if self.GPU_CHIPS : - os.environ['CUDA_VISIBLE_DEVICES'] = str(self.GPU_CHIPS[0]) + self.PARTITION = args['partition'] if 'partition' in args else None # if self.NUM_GPUS > 1 : # os.environ['CUDA_VISIBLE_DEVICES'] = "4" diff --git a/data/maker/__init__.py b/data/maker/__init__.py index b7608d7..4c175e9 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -319,7 +319,8 @@ class Generator (Learner): _args['map'] = self._map _args['values'] = np.array(values) _args['row_count'] = self._df.shape[0] - + if self.gpu : + _args['gpu'] = self.gpu gHandler = gan.Predict(**_args) gHandler.load_meta(columns=None) _iomatrix = gHandler.apply() From 9b3031af1c8b17a8b5a6c2d12d9cbcdc25e79ecf Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 12 Apr 2022 14:59:46 -0500 Subject: [PATCH 189/250] bug fix: preconditions --- data/maker/__init__.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 4c175e9..fba1361 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -248,7 +248,7 @@ class Learner(Process): _args = {"schema":self.get_schema(),"data":self._df,"columns":columns} if self._map : _args['map'] = self._map - self._encoder = prepare.Input(**_args) + self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None class Trainer(Learner): """ This will perform training using a GAN @@ -263,6 +263,10 @@ class Trainer(Learner): self.candidates = int(_args['candidates']) if 'candidates' in _args else 1 def run(self): self.initalize() + if self._encoder is None : + # + # @TODO Log that the dataset was empty or not statistically relevant + return _space,_matrix = self._encoder.convert() _args = self.network_args @@ -311,9 +315,15 @@ class Generator (Learner): file.close() def run(self): self.initalize() + if self._encoder is None : + # + # @TODO Log that the dataset was empty or not statistically relevant + return + # # The values will be returned because we have provided _map information from the constructor # + values,_matrix = self._encoder.convert() 
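        # `values` is the raw value space of the synthesized columns and `_matrix` its
        # binary encoding; both, together with the saved map and row count, are handed
        # to gan.Predict below, and each generated matrix is reverted by the encoder.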
_args = self.network_args _args['map'] = self._map From becc30ff4279e2b547123e1f3b6819a0e87b0af5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 13 Apr 2022 09:36:21 -0500 Subject: [PATCH 190/250] bug fix added logger and approximation fix --- data/maker/__init__.py | 56 ++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index fba1361..382c209 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -202,7 +202,7 @@ class Learner(Process): super(Learner, self).__init__() if 'gpu' in _args : - print (_args['gpu']) + os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu']) self.gpu = int(_args['gpu']) else: @@ -224,9 +224,13 @@ class Learner(Process): self._encoder = None self._map = None self._df = _args['data'] if 'data' in _args else None + self.name = self.__class__.__name__+'::'+self.info['context']+'::'+self.info['from'] # # @TODO: allow for verbose mode so we have a sens of what is going on within the newtork # + if self.logger : + _args = {'module':self.name,'action':'init','context':self.info['context'],'gpu':(self.gpu if self.gpu is not None else -1)} + self.logger.write(_args) # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' # sel.max_epoc @@ -249,6 +253,9 @@ class Learner(Process): if self._map : _args['map'] = self._map self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None + if self.logger : + _args = {'module':self.name,'action':'data-prep','input':{'rows':self._df.shape[0],'cols':self._df.shape[1]} } + self.logger.write(_args) class Trainer(Learner): """ This will perform training using a GAN @@ -257,10 +264,11 @@ class Trainer(Learner): super().__init__(**_args) # self.info = _args['info'] self.limit = int(_args['limit']) if 'limit' in _args else None - self.name = _args['name'] + self.autopilot = _args['autopilot'] if 'autopilot' in _args else False self.generate = None self.candidates = int(_args['candidates']) if 'candidates' in _args else 1 + def run(self): self.initalize() if self._encoder is None : @@ -277,7 +285,7 @@ class Trainer(Learner): # # At this point we have the binary matrix, we can initiate training # - + beg = datetime.now().strftime('%Y-%m-%d %H:%M:%S') gTrain = gan.Train(**_args) gTrain.apply() @@ -293,6 +301,10 @@ class Trainer(Learner): _args['gpu'] = self.gpu g = Generator(**_args) # g.run() + if self.logger : + end = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + logs = {'module':self.name,'action':'train','input':{'start':beg,'end':end}} + self.logger.write(logs) self.generate = g if self.autopilot : self.generate.run() @@ -333,29 +345,38 @@ class Generator (Learner): _args['gpu'] = self.gpu gHandler = gan.Predict(**_args) gHandler.load_meta(columns=None) - _iomatrix = gHandler.apply() + _iomatrix = gHandler.apply() _candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix] + if self.logger : + _size = np.sum([len(_item) for _item in _iomatrix]) + _log = {'module':self.name,'action':'io-data','input':{'candidates':len(_candidates),'rows':_size}} + self.logger.write(_log) self.post(_candidates) def approximate(self,_df): _columns = self.info['approximate'] - # _schema = {} - # for _info in self.get_schema() : - # _schema[_info['name']] = _info['type'] - - + for name in _columns : - batches = np.array_split(_df[name].fillna(np.nan).values,2) + if _df[name].size > 100 : + BATCH_SIZE = 10 + + else: + BATCH_SIZE = 1 + batches = np.array_split(_df[name].fillna(np.nan).values,BATCH_SIZE) 
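+            # a Dirichlet sample (one component per non-missing entry) is added to or
+            # subtracted from each batch at random; missing entries are left untouched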
_type = np.int64 if 'int' in self.info['approximate'][name]else np.float64 x = [] + _log = {'module':self.name,'action':'approximate','input':{'batch':BATCH_SIZE,'col':name}} for values in batches : - - index = np.where(values != '') + + index = [ _x not in ['',None,np.nan] for _x in values] _values = np.random.dirichlet(values[index].astype(_type)) values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values) values[index] = values[index].astype(_type) x += values.tolist() - if x : + if x : + _log['input']['diff'] = 1 - np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size) _df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64) + if self.logger : + self.logger.write(_log) return _df def make_date(self,**_args) : """ @@ -402,10 +423,11 @@ class Generator (Learner): if 'table' not in _store : _store['table'] = self.info['from'] writer = transport.factory.instance(**_store) - + N = 0 for _iodf in _candidates : _df = self._df.copy() _df[self.columns] = _iodf[self.columns] + N += _df.shape[0] # #@TODO: # Improve formatting with better post-processing pipeline @@ -422,8 +444,10 @@ class Generator (Learner): _df[name] = _dates _schema = self.get_schema() _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] - writer.write(_df[self.columns],schema=_schema) - pass + + writer.write(_df,schema=_schema) + if self.logger : + self.logger.write({'module':self.name,'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) class factory : _infocache = {} @staticmethod From 2fdc7c8f5c92c1159dc8d716f10d23a352d61892 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 13 Apr 2022 10:07:27 -0500 Subject: [PATCH 191/250] bug fix --- data/maker/__init__.py | 221 +++++------------------------------------ 1 file changed, 26 insertions(+), 195 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 382c209..3acddc1 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -21,181 +21,6 @@ import json from multiprocessing import Process, RLock from datetime import datetime, timedelta -class ContinuousToDiscrete : - ROUND_UP = 2 - @staticmethod - def binary(X,n=4) : - """ - This function will convert a continous stream of information into a variety a bit stream of bins - """ - values = np.array(X).astype(np.float32) - BOUNDS = ContinuousToDiscrete.bounds(values,n) - matrix = np.repeat(np.zeros(n),len(X)).reshape(len(X),n) - - - @staticmethod - def bounds(x,n): - # return np.array_split(x,n) - values = np.round(x,ContinuousToDiscrete.ROUND_UP) - return list(pd.cut(values,n).categories) - - - - @staticmethod - def continuous(X,BIN_SIZE=4) : - """ - This function will approximate a binary vector given boundary information - :X binary matrix - :BIN_SIZE - """ - BOUNDS = ContinuousToDiscrete.bounds(X,BIN_SIZE) - - values = [] - # _BINARY= ContinuousToDiscrete.binary(X,BIN_SIZE) - # # # print (BOUNDS) - l = {} - for i in np.arange(len(X)): #value in X : - - value = X[i] - - for item in BOUNDS : - if value >= item.left and value <= item.right : - values += [np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP)] - break - # values += [ np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP) for item in BOUNDS if value >= item.left and value <= item.right ] - - - # # values = [] - # for row in _BINARY : - # # ubound = BOUNDS[row.index(1)] - # index = np.where(row == 1)[0][0] - - # ubound = BOUNDS[ index ].right - 
# lbound = BOUNDS[ index ].left - - # x_ = np.round(np.random.uniform(lbound,ubound),ContinuousToDiscrete.ROUND_UP).astype(float) - # values.append(x_) - - # lbound = ubound - - # values = [np.random.uniform() for item in BOUNDS] - - return values - - -def train (**_args): - """ - :params sql - :params store - """ - - _inputhandler = prepare.Input(**_args) - values,_matrix = _inputhandler.convert() - args = {"real":_matrix,"context":_args['context']} - _map = {} - if 'store' in _args : - # - # This - - args['store'] = copy.deepcopy(_args['store']['logs']) - if 'args' in _args['store']: - args['store']['args']['doc'] = _args['context'] - else: - - args['store']['doc'] = _args['context'] - logger = transport.factory.instance(**args['store']) - args['logger'] = logger - - for key in _inputhandler._map : - beg = _inputhandler._map[key]['beg'] - end = _inputhandler._map[key]['end'] - values = _inputhandler._map[key]['values'].tolist() - _map[key] = {"beg":beg,"end":end,"values":np.array(values).astype(str).tolist()} - info = {"rows":_matrix.shape[0],"cols":_matrix.shape[1],"map":_map} - print() - # print ([_args['context'],_inputhandler._io]) - logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":_inputhandler._io}) - - args['logs'] = _args['logs'] if 'logs' in _args else 'logs' - args ['max_epochs'] = _args['max_epochs'] - args['matrix_size'] = _matrix.shape[0] - args['batch_size'] = 2000 - if 'partition' in _args : - args['partition'] = _args['partition'] - if 'gpu' in _args : - args['gpu'] = _args['gpu'] - # os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' - - trainer = gan.Train(**args) - # - # @TODO: Write the map.json in the output directory for the logs - # - # f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']),'w') - f = open(os.sep.join([trainer.out_dir,'map.json']),'w') - f.write(json.dumps(_map)) - f.close() - - trainer.apply() - pass - -def get(**args): - """ - This function will restore a checkpoint from a persistant storage on to disk - """ - pass -def generate(**_args): - """ - This function will generate a set of records, before we must load the parameters needed - :param data - :param context - :param logs - """ - _args['logs'] = _args['logs'] if 'logs' in _args else 'logs' - partition = _args['partition'] if 'partition' in _args else None - if not partition : - MAP_FOLDER = os.sep.join([_args['logs'],'output',_args['context']]) - # f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json'])) - else: - MAP_FOLDER = os.sep.join([_args['logs'],'output',_args['context'],str(partition)]) - # f = open(os.sep.join([_args['logs'],'output',_args['context'],str(partition),'map.json'])) - f = open(os.sep.join([MAP_FOLDER,'map.json'])) - _map = json.loads(f.read()) - f.close() - # - # - # if 'file' in _args : - # df = pd.read_csv(_args['file']) - # else: - # df = _args['data'] if not isinstance(_args['data'],str) else pd.read_csv(_args['data']) - args = {"context":_args['context'],"max_epochs":_args['max_epochs'],"candidates":_args['candidates']} - args['logs'] = _args['logs'] if 'logs' in _args else 'logs' - args ['max_epochs'] = _args['max_epochs'] - # args['matrix_size'] = _matrix.shape[0] - args['batch_size'] = 2000 - args['partition'] = 0 if 'partition' not in _args else _args['partition'] - args['row_count'] = _args['data'].shape[0] - # - # @TODO: perhaps get the space of values here ... 
(not sure it's a good idea) - # - _args['map'] = _map - _inputhandler = prepare.Input(**_args) - values,_matrix = _inputhandler.convert() - args['values'] = np.array(values) - if 'gpu' in _args : - args['gpu'] = _args['gpu'] - - handler = gan.Predict (**args) - lparams = {'columns':None} - if partition : - lparams['partition'] = partition - handler.load_meta(**lparams) - # - # Let us now format the matrices by reverting them to a data-frame with values - # - - candidates = handler.apply(candidates=args['candidates']) - return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates] - class Learner(Process): def __init__(self,**_args): @@ -211,7 +36,7 @@ class Learner(Process): self.info = _args['info'] self.columns = self.info['columns'] if 'columns' in self.info else None self.store = _args['store'] - self.logger = transport.factory.instance(_args['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) + if 'network_args' not in _args : self.network_args ={ 'context':self.info['context'] , @@ -228,12 +53,18 @@ class Learner(Process): # # @TODO: allow for verbose mode so we have a sens of what is going on within the newtork # - if self.logger : - _args = {'module':self.name,'action':'init','context':self.info['context'],'gpu':(self.gpu if self.gpu is not None else -1)} - self.logger.write(_args) + + _log = {'module':self.name,'action':'init','context':self.info['context'],'gpu':(self.gpu if self.gpu is not None else -1)} + self.log(**_log) # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' # sel.max_epoc + def log(self,**_args): + logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) + logger.write(_args) + if hasattr(logger,'close') : + logger.close() + def get_schema(self): if self.store['source']['provider'] != 'bigquery' : return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] @@ -253,9 +84,9 @@ class Learner(Process): if self._map : _args['map'] = self._map self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None - if self.logger : - _args = {'module':self.name,'action':'data-prep','input':{'rows':self._df.shape[0],'cols':self._df.shape[1]} } - self.logger.write(_args) + + _log = {'module':self.name,'action':'data-prep','input':{'rows':self._df.shape[0],'cols':self._df.shape[1]} } + self.log(**_log) class Trainer(Learner): """ This will perform training using a GAN @@ -301,10 +132,10 @@ class Trainer(Learner): _args['gpu'] = self.gpu g = Generator(**_args) # g.run() - if self.logger : - end = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - logs = {'module':self.name,'action':'train','input':{'start':beg,'end':end}} - self.logger.write(logs) + + end = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + _logs = {'module':self.name,'action':'train','input':{'start':beg,'end':end}} + self.log(**_logs) self.generate = g if self.autopilot : self.generate.run() @@ -347,10 +178,10 @@ class Generator (Learner): gHandler.load_meta(columns=None) _iomatrix = gHandler.apply() _candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix] - if self.logger : - _size = np.sum([len(_item) for _item in _iomatrix]) - _log = {'module':self.name,'action':'io-data','input':{'candidates':len(_candidates),'rows':_size}} - self.logger.write(_log) + + _size = np.sum([len(_item) for _item in _iomatrix]) + _log = 
{'module':self.name,'action':'io-data','input':{'candidates':len(_candidates),'rows':int(_size)}} + self.log(**_log) self.post(_candidates) def approximate(self,_df): _columns = self.info['approximate'] @@ -373,10 +204,10 @@ class Generator (Learner): values[index] = values[index].astype(_type) x += values.tolist() if x : - _log['input']['diff'] = 1 - np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size) + _log['input']['diff_pct'] = 100 * (1 - np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size)) _df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64) - if self.logger : - self.logger.write(_log) + + self.log(**_log) return _df def make_date(self,**_args) : """ @@ -446,8 +277,8 @@ class Generator (Learner): _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] writer.write(_df,schema=_schema) - if self.logger : - self.logger.write({'module':self.name,'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) + + self.log(**{'module':self.name,'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) class factory : _infocache = {} @staticmethod From 1bdf6cc8b3adbb200e3f2e318553ddbef5b55e2f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 13 Apr 2022 10:55:55 -0500 Subject: [PATCH 192/250] bug fix --- data/maker/__init__.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 3acddc1..ff93104 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -244,7 +244,14 @@ class Generator (Learner): return _date.strftime(FORMAT) pass - def format(self,_df): + def format(self,_df,_schema): + for _item in _schema : + name = _item['name'] + if _item['type'].upper() in ['DATETIME','TIMESTAMP'] : + + _df[name] = pd.to_datetime(_df[name], format='%Y-%m-%d %H:%M:%S').astype('datetime64[ns]') + return _df + pass def post(self,_candidates): @@ -272,10 +279,10 @@ class Generator (Learner): years = _df[iname] _dates = [self.make_date(year=year,field=name) for year in years] if _dates : - _df[name] = _dates + _df[name] = _dates _schema = self.get_schema() _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] - + _df = self.format(_df,_schema) writer.write(_df,schema=_schema) self.log(**{'module':self.name,'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) From 1bffb8d7be70e4b1af868977d07964c700f0acff Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 13 Apr 2022 11:11:23 -0500 Subject: [PATCH 193/250] bug fix (exception handling) --- data/maker/__init__.py | 19 ++++++++++++------- data/maker/prepare/__init__.py | 13 ++++++++----- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index ff93104..807bd84 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -22,10 +22,12 @@ from multiprocessing import Process, RLock from datetime import datetime, timedelta class Learner(Process): + def __init__(self,**_args): super(Learner, self).__init__() + self.ndx = 0 if 'gpu' in _args : os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu']) @@ -49,19 +51,22 @@ class Learner(Process): self._encoder = None self._map = None self._df = _args['data'] if 'data' in _args else None - self.name = self.__class__.__name__+'::'+self.info['context']+'::'+self.info['from'] + self.name = self.__class__.__name__+'::'+self.info['from'] + self.name = self.name.replace('?','') # # @TODO: allow for verbose mode so we have 
a sens of what is going on within the newtork # - _log = {'module':self.name,'action':'init','context':self.info['context'],'gpu':(self.gpu if self.gpu is not None else -1)} + _log = {'action':'init','context':self.info['context'],'gpu':(self.gpu if self.gpu is not None else -1)} self.log(**_log) # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' # sel.max_epoc def log(self,**_args): logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) + _args = dict({'ndx':self.ndx,'module':self.name,'info':self.info['context'],**_args}) logger.write(_args) + self.ndx += 1 if hasattr(logger,'close') : logger.close() @@ -85,7 +90,7 @@ class Learner(Process): _args['map'] = self._map self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None - _log = {'module':self.name,'action':'data-prep','input':{'rows':self._df.shape[0],'cols':self._df.shape[1]} } + _log = {'action':'data-prep','input':{'rows':self._df.shape[0],'cols':self._df.shape[1]} } self.log(**_log) class Trainer(Learner): """ @@ -134,7 +139,7 @@ class Trainer(Learner): # g.run() end = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - _logs = {'module':self.name,'action':'train','input':{'start':beg,'end':end}} + _logs = {'action':'train','input':{'start':beg,'end':end,"unique_counts":self._encoder._io[0]}} self.log(**_logs) self.generate = g if self.autopilot : @@ -180,7 +185,7 @@ class Generator (Learner): _candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix] _size = np.sum([len(_item) for _item in _iomatrix]) - _log = {'module':self.name,'action':'io-data','input':{'candidates':len(_candidates),'rows':int(_size)}} + _log = {'action':'io-data','input':{'candidates':len(_candidates),'rows':int(_size)}} self.log(**_log) self.post(_candidates) def approximate(self,_df): @@ -195,7 +200,7 @@ class Generator (Learner): batches = np.array_split(_df[name].fillna(np.nan).values,BATCH_SIZE) _type = np.int64 if 'int' in self.info['approximate'][name]else np.float64 x = [] - _log = {'module':self.name,'action':'approximate','input':{'batch':BATCH_SIZE,'col':name}} + _log = {'action':'approximate','input':{'batch':BATCH_SIZE,'col':name}} for values in batches : index = [ _x not in ['',None,np.nan] for _x in values] @@ -285,7 +290,7 @@ class Generator (Learner): _df = self.format(_df,_schema) writer.write(_df,schema=_schema) - self.log(**{'module':self.name,'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) + self.log(**{'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) class factory : _infocache = {} @staticmethod diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 478d435..bc316e9 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -90,11 +90,14 @@ class Input : # else: # # We will look into the count and make a judgment call - _df = pd.DataFrame(self.df.apply(lambda col: col.dropna().unique().size )).T - MIN_SPACE_SIZE = 2 - self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() - self._io = _df.to_dict(orient='records') - + try: + _df = pd.DataFrame(self.df.apply(lambda col: col.dropna().unique().size )).T + MIN_SPACE_SIZE = 2 + self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() + self._io = _df.to_dict(orient='records') + 
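            # The intent is for self._io to capture, per column, the share of distinct values
            # relative to the row count; the trainer later surfaces it in its "unique_counts" log entry.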
except Exception as e: + print (e) + self._io = [] def _initdata(self,**_args): """ This function will initialize the class with a data-frame and columns of interest (if any) From 15e53cb6569eec301a3eb75637f7233a9c8a6ee4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 13 Apr 2022 11:19:36 -0500 Subject: [PATCH 194/250] bug fix (exception handling) --- data/maker/prepare/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index bc316e9..f7ae3f7 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -91,9 +91,11 @@ class Input : # # We will look into the count and make a judgment call try: - _df = pd.DataFrame(self.df.apply(lambda col: col.dropna().unique().size )).T - MIN_SPACE_SIZE = 2 - self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() + # _df = pd.DataFrame(self.df.apply(lambda col: col.dropna().unique().size )).T + # MIN_SPACE_SIZE = 2 + # self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() + # self._io = _df.to_dict(orient='records') + _df = self.df.nunique().T / self.df.shape[0] self._io = _df.to_dict(orient='records') except Exception as e: print (e) From 167e4b873d550860d772d3c608a8352f3d78f0db Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 13 Apr 2022 11:39:54 -0500 Subject: [PATCH 195/250] bug fix --- data/maker/prepare/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index f7ae3f7..3ef494e 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -96,7 +96,7 @@ class Input : # self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() # self._io = _df.to_dict(orient='records') _df = self.df.nunique().T / self.df.shape[0] - self._io = _df.to_dict(orient='records') + self._io = pd.DataFrame(_df).to_dict(orient='records') except Exception as e: print (e) self._io = [] From 289f2e7b895885fd95cf8f0991aeaf7dbfcae198 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 13 Apr 2022 11:45:39 -0500 Subject: [PATCH 196/250] bugfix --- data/maker/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 807bd84..21e38c5 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -51,8 +51,8 @@ class Learner(Process): self._encoder = None self._map = None self._df = _args['data'] if 'data' in _args else None - self.name = self.__class__.__name__+'::'+self.info['from'] - self.name = self.name.replace('?','') + self.name = self.__class__.__name__ + # # @TODO: allow for verbose mode so we have a sens of what is going on within the newtork # @@ -64,7 +64,7 @@ class Learner(Process): # sel.max_epoc def log(self,**_args): logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) - _args = dict({'ndx':self.ndx,'module':self.name,'info':self.info['context'],**_args}) + _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'info':self.info['context'],**_args}) logger.write(_args) self.ndx += 1 if hasattr(logger,'close') : From e93fe7fea8f45db056642f913d82732279c77149 Mon 
Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 10:06:27 -0500 Subject: [PATCH 197/250] bug fixes --- data/maker/__init__.py | 14 +++++++++++--- data/maker/prepare/__init__.py | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 21e38c5..77effb3 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -121,7 +121,7 @@ class Trainer(Learner): # # At this point we have the binary matrix, we can initiate training # - beg = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + beg = datetime.now() #.strftime('%Y-%m-%d %H:%M:%S') gTrain = gan.Train(**_args) gTrain.apply() @@ -138,8 +138,9 @@ class Trainer(Learner): g = Generator(**_args) # g.run() - end = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - _logs = {'action':'train','input':{'start':beg,'end':end,"unique_counts":self._encoder._io[0]}} + end = datetime.now() #.strftime('%Y-%m-%d %H:%M:%S') + _min = float(timedelta(end,beg).seconds/ 60) + _logs = {'action':'train','input':{'start':beg.strftime('%Y-%m-%d %H:%M:%S'),'minutes':_min,"unique_counts":self._encoder._io[0]}} self.log(**_logs) self.generate = g if self.autopilot : @@ -158,6 +159,7 @@ class Generator (Learner): # self.network_args['candidates'] = int(_args['candidates']) if 'candidates' in _args else 1 filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'map.json']) + self.log(**{'action':'init-map','input':{'filename':filename,'exists':os.path.exists(filename)}}) file = open(filename) self._map = json.loads(file.read()) file.close() @@ -291,6 +293,12 @@ class Generator (Learner): writer.write(_df,schema=_schema) self.log(**{'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) +class Shuffle(Trainer): + """ + This is a method that will yield data with low utility + """ + def __init__(self,**_args): + super().__init__(self) class factory : _infocache = {} @staticmethod diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 3ef494e..1bf4872 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -96,7 +96,7 @@ class Input : # self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() # self._io = _df.to_dict(orient='records') _df = self.df.nunique().T / self.df.shape[0] - self._io = pd.DataFrame(_df).to_dict(orient='records') + self._io = pd.DataFrame(_df).astype(float).to_dict(orient='records') except Exception as e: print (e) self._io = [] From 4345146f3a7f6bd6ffea63c0d416844b261a6103 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 11:06:28 -0500 Subject: [PATCH 198/250] bug fix: logger and io space --- data/maker/__init__.py | 54 ++++++++++++++++++++++++++-------- data/maker/prepare/__init__.py | 5 ++-- setup.py | 3 +- 3 files changed, 46 insertions(+), 16 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 77effb3..bce8d65 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -28,6 +28,7 @@ class Learner(Process): super(Learner, self).__init__() self.ndx = 0 + self.lock = RLock() if 'gpu' in _args : os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu']) @@ -63,13 +64,21 @@ class Learner(Process): # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' # sel.max_epoc def log(self,**_args): - logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else 
transport.factory.instance(provider='console',context='write',lock=True) - _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'info':self.info['context'],**_args}) - logger.write(_args) - self.ndx += 1 - if hasattr(logger,'close') : - logger.close() - + self.lock.acquire() + try: + logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) + _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'info':self.info['context'],**_args}) + logger.write(_args) + self.ndx += 1 + if hasattr(logger,'close') : + logger.close() + except Exception as e: + print () + print (_args) + print (e) + pass + finally: + self.lock.release() def get_schema(self): if self.store['source']['provider'] != 'bigquery' : return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] @@ -88,9 +97,8 @@ class Learner(Process): _args = {"schema":self.get_schema(),"data":self._df,"columns":columns} if self._map : _args['map'] = self._map - self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None - - _log = {'action':'data-prep','input':{'rows':self._df.shape[0],'cols':self._df.shape[1]} } + self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None + _log = {'action':'data-prep','input':{'rows':int(self._df.shape[0]),'cols':int(self._df.shape[1]) } } self.log(**_log) class Trainer(Learner): """ @@ -139,7 +147,7 @@ class Trainer(Learner): # g.run() end = datetime.now() #.strftime('%Y-%m-%d %H:%M:%S') - _min = float(timedelta(end,beg).seconds/ 60) + _min = float((end-beg).seconds/ 60) _logs = {'action':'train','input':{'start':beg.strftime('%Y-%m-%d %H:%M:%S'),'minutes':_min,"unique_counts":self._encoder._io[0]}} self.log(**_logs) self.generate = g @@ -293,12 +301,27 @@ class Generator (Learner): writer.write(_df,schema=_schema) self.log(**{'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) -class Shuffle(Trainer): +class Shuffle(Generator): """ This is a method that will yield data with low utility """ def __init__(self,**_args): super().__init__(self) + def run(self): + + + self.initalize() + _index = np.arange(self._df.shape[0]) + np.random.shuffle(_index) + _iocolumns = self.info['columns'] + _ocolumns = list(set(self._df.columns) - set(_iocolumns) ) + _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(self._df.shape[0])) + self._df = self._df[_ocolumns].join(_iodf) + + + _log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}} + self.log(**_log) + self.post([self._df]) class factory : _infocache = {} @staticmethod @@ -313,4 +336,9 @@ class factory : :param autopilot will generate output automatically :param batch (default 2k) size of the batch """ - return Trainer(**_args) \ No newline at end of file + if 'apply' not in _args : + return Trainer(**_args) + elif _args['apply'] == 'shuffe' : + return Shuffle(**_args) + elif _args['apply'] == 'generate' : + return Generator(**_args) \ No newline at end of file diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 1bf4872..50fcfdf 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -95,8 +95,9 @@ class Input : # MIN_SPACE_SIZE = 2 # self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist() # self._io = 
_df.to_dict(orient='records') - _df = self.df.nunique().T / self.df.shape[0] - self._io = pd.DataFrame(_df).astype(float).to_dict(orient='records') + _df = pd.DataFrame(self.df.nunique().T / self.df.shape[0]).T + self._io = (_df.to_dict(orient='records')) + except Exception as e: print (e) self._io = [] diff --git a/setup.py b/setup.py index 3822df5..c96877b 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,8 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.5.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.5.1", + "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' From 7a22314a46ca5c806428f63cb3f90419e99b1af1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 11:27:55 -0500 Subject: [PATCH 199/250] bugfix --- data/maker/__init__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index bce8d65..eb4c02d 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -336,9 +336,10 @@ class factory : :param autopilot will generate output automatically :param batch (default 2k) size of the batch """ - if 'apply' not in _args : - return Trainer(**_args) - elif _args['apply'] == 'shuffe' : + + if _args['apply'] == 'shuffe' : return Shuffle(**_args) elif _args['apply'] == 'generate' : - return Generator(**_args) \ No newline at end of file + return Generator(**_args) + else: + return Trainer(**_args) \ No newline at end of file From 528e6db0b8fa09ee21c1e4598fed32d2fba6c3db Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 11:41:30 -0500 Subject: [PATCH 200/250] bug fix --- data/maker/__init__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index eb4c02d..49227a6 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -306,7 +306,7 @@ class Shuffle(Generator): This is a method that will yield data with low utility """ def __init__(self,**_args): - super().__init__(self) + super().__init__(**_args) def run(self): @@ -315,7 +315,8 @@ class Shuffle(Generator): np.random.shuffle(_index) _iocolumns = self.info['columns'] _ocolumns = list(set(self._df.columns) - set(_iocolumns) ) - _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(self._df.shape[0])) + # _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(_index.size)) + _iodf = pd.DataFrame(self._df[_iocolumns],index = np.arange(_index.size)) self._df = self._df[_ocolumns].join(_iodf) @@ -336,8 +337,8 @@ class factory : :param autopilot will generate output automatically :param batch (default 2k) size of the batch """ - - if _args['apply'] == 'shuffe' : + + if _args['apply'] == 'shuffle' : return Shuffle(**_args) elif _args['apply'] == 'generate' : return Generator(**_args) From 9f198f3b1556f411afda04b76359ce4ec0f47334 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 12:03:59 -0500 Subject: [PATCH 201/250] bug fix: generator iherited by shuffle --- 
data/maker/__init__.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 49227a6..3f437d2 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -168,9 +168,12 @@ class Generator (Learner): self.network_args['candidates'] = int(_args['candidates']) if 'candidates' in _args else 1 filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'map.json']) self.log(**{'action':'init-map','input':{'filename':filename,'exists':os.path.exists(filename)}}) - file = open(filename) - self._map = json.loads(file.read()) - file.close() + if os.path.exists(filename): + file = open(filename) + self._map = json.loads(file.read()) + file.close() + else: + self._map = {} def run(self): self.initalize() if self._encoder is None : From 1ff4145eeaaff2c7e901bb73cc3c1c650298f2a3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 12:36:20 -0500 Subject: [PATCH 202/250] bugfix: formatter --- data/maker/__init__.py | 13 +++++++++++-- setup.py | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 3f437d2..2b53def 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -265,9 +265,17 @@ class Generator (Learner): def format(self,_df,_schema): for _item in _schema : name = _item['name'] - if _item['type'].upper() in ['DATETIME','TIMESTAMP'] : + + if _item['type'].upper() in ['DATE','DATETIME','TIMESTAMP'] : + FORMAT = '%Y-%d-%m' + if 'format' in self.info and name in self.info['format'] : + FORMAT = self.info['format'][name] + else: + if _item['type'] == ['DATETIME','TIMESTAMP'] : + FORMAT = '%Y-%d-%m %H:%M:%S' + self.log(**{'action':'format','input':{'name':name,'format':FORMAT}}) - _df[name] = pd.to_datetime(_df[name], format='%Y-%m-%d %H:%M:%S').astype('datetime64[ns]') + _df[name] = pd.to_datetime(_df[name], format=FORMAT).astype(str) #.astype('datetime64[ns]') return _df pass @@ -298,6 +306,7 @@ class Generator (Learner): _dates = [self.make_date(year=year,field=name) for year in years] if _dates : _df[name] = _dates + _schema = self.get_schema() _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] _df = self.format(_df,_schema) diff --git a/setup.py b/setup.py index c96877b..1991bde 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.5.1", +args = {"name":"data-maker","version":"1.5.2", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] From febcaa588395f5ee84c4cc7bec08683ba368d765 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 12:42:11 -0500 Subject: [PATCH 203/250] bugfix: logs for formatting dates --- data/maker/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 2b53def..2921b46 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -263,6 +263,7 @@ class Generator (Learner): pass def format(self,_df,_schema): + r = {} for _item in _schema : name = _item['name'] @@ -273,9 +274,12 @@ class Generator (Learner): else: if _item['type'] == ['DATETIME','TIMESTAMP'] : FORMAT = 
'%Y-%d-%m %H:%M:%S' - self.log(**{'action':'format','input':{'name':name,'format':FORMAT}}) + r[name] = FORMAT + _df[name] = pd.to_datetime(_df[name], format=FORMAT).astype(str) #.astype('datetime64[ns]') + if r : + self.log(**{'action':'format','input':r}) return _df pass From 0e4148d4e79e23267be6a71285d5001104ec401d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 13:37:47 -0500 Subject: [PATCH 204/250] bugfix: date/timestamp conversions --- data/maker/__init__.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 2921b46..184bca4 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -64,7 +64,7 @@ class Learner(Process): # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' # sel.max_epoc def log(self,**_args): - self.lock.acquire() + # self.lock.acquire() try: logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'info':self.info['context'],**_args}) @@ -78,7 +78,8 @@ class Learner(Process): print (e) pass finally: - self.lock.release() + # self.lock.release() + pass def get_schema(self): if self.store['source']['provider'] != 'bigquery' : return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] @@ -222,7 +223,7 @@ class Generator (Learner): values[index] = values[index].astype(_type) x += values.tolist() if x : - _log['input']['diff_pct'] = 100 * (1 - np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size)) + _log['input']['identical_percentage'] = 100 * (1 - np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size)) _df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64) self.log(**_log) @@ -243,14 +244,15 @@ class Generator (Learner): day = np.random.randint(1,_end) #-- synthetic date - _date = datetime(year=year,month=month,day=day) - FORMAT = '%Y-%m-%d' - if 'format' in self.info and 'field' in _args and _args['field'] in self.info['format']: + _date = datetime(year=year,month=month,day=day) #,minute=0,hour=0,second=0) + FORMAT = '%Y-%d-%m' + _name = _args['field'] if 'field' in _args else None + if 'format' in self.info and _name in self.info['format']: _name = _args['field'] FORMAT = self.info['format'][_name] - + # print ([_name,FORMAT, _date.strftime(FORMAT)]) r = [] if offset : r = [_date.strftime(FORMAT)] @@ -277,7 +279,7 @@ class Generator (Learner): r[name] = FORMAT - _df[name] = pd.to_datetime(_df[name], format=FORMAT).astype(str) #.astype('datetime64[ns]') + _df[name] = pd.to_datetime(_df[name], format=FORMAT).astype('datetime64[ns]') if r : self.log(**{'action':'format','input':r}) return _df @@ -308,12 +310,13 @@ class Generator (Learner): years = _df[iname] _dates = [self.make_date(year=year,field=name) for year in years] - if _dates : - _df[name] = _dates + if _dates : + _df[name] = _dates _schema = self.get_schema() _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] _df = self.format(_df,_schema) + writer.write(_df,schema=_schema) self.log(**{'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) From 93ebe8ee1b2b20f29c80281799b48eec65bf90eb Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 18:07:17 -0500 Subject: [PATCH 205/250] bugfix: date type casting 
bug --- data/maker/__init__.py | 43 ++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 184bca4..6c2a463 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -235,6 +235,7 @@ class Generator (Learner): if _args['year'] in ['',None,np.nan] : return None year = int(_args['year']) + offset = _args['offset'] if 'offset' in _args else 0 month = np.random.randint(1,13) if month == 2: @@ -244,13 +245,13 @@ class Generator (Learner): day = np.random.randint(1,_end) #-- synthetic date - _date = datetime(year=year,month=month,day=day) #,minute=0,hour=0,second=0) - FORMAT = '%Y-%d-%m' + _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0) + FORMAT = '%Y-%m-%d' _name = _args['field'] if 'field' in _args else None if 'format' in self.info and _name in self.info['format']: - _name = _args['field'] + # _name = _args['field'] FORMAT = self.info['format'][_name] - + # print ([_name,FORMAT, _date.strftime(FORMAT)]) r = [] @@ -258,7 +259,7 @@ class Generator (Learner): r = [_date.strftime(FORMAT)] for _delta in offset : _date = _date + timedelta(_delta) - r.append(_date.strftime(FORMAT)) + r.append(_date.strptime(FORMAT)) return r else: return _date.strftime(FORMAT) @@ -270,16 +271,19 @@ class Generator (Learner): name = _item['name'] if _item['type'].upper() in ['DATE','DATETIME','TIMESTAMP'] : - FORMAT = '%Y-%d-%m' + FORMAT = '%Y-%m-%d' + if 'format' in self.info and name in self.info['format'] : FORMAT = self.info['format'][name] - else: - if _item['type'] == ['DATETIME','TIMESTAMP'] : - FORMAT = '%Y-%d-%m %H:%M:%S' - r[name] = FORMAT + elif _item['type'] in ['DATETIME','TIMESTAMP'] : + FORMAT = '%Y-%m-%d %H:%M:%S' - - _df[name] = pd.to_datetime(_df[name], format=FORMAT).astype('datetime64[ns]') + r[name] = FORMAT + _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') + if _item['type'] in ['DATETIME','TIMESTAMP']: + _df[name] = _df[name].astype('datetime64[ns]') + else: + _df[name] = _df[name].astype(str) if r : self.log(**{'action':'format','input':r}) return _df @@ -309,10 +313,12 @@ class Generator (Learner): iname = self.info['make_date'][name] years = _df[iname] - _dates = [self.make_date(year=year,field=name) for year in years] - if _dates : + _dates = [self.make_date(year=_year,field=name) for _year in years] + if _dates : _df[name] = _dates - + + + _schema = self.get_schema() _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] _df = self.format(_df,_schema) @@ -341,7 +347,12 @@ class Shuffle(Generator): _log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}} self.log(**_log) - self.post([self._df]) + try: + self.post([self._df]) + self.log(**{'action':'completed','input':{'candidates':1,'rows':int(self._df.shape[0])}}) + except Exception as e : + # print (e) + self.log(**{'action':'failed','input':{'msg':e,'info':self.info}}) class factory : _infocache = {} @staticmethod From 01ca780c99d2e3dc7a42b1e4642d756f0bd74f15 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 18:23:43 -0500 Subject: [PATCH 206/250] bugfix: date type casting bug --- data/maker/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 6c2a463..c65cbcf 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -284,6 +284,7 @@ class Generator (Learner): _df[name] = _df[name].astype('datetime64[ns]') else: _df[name] = 
_df[name].astype(str) + _df[name] = _df[name].fillna('') if r : self.log(**{'action':'format','input':r}) return _df From 133b0120db26643f374004e3f4eb0e9a622861d3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 14 Apr 2022 18:29:28 -0500 Subject: [PATCH 207/250] bugfix: date type casting bug --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index c65cbcf..cde3928 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -284,7 +284,7 @@ class Generator (Learner): _df[name] = _df[name].astype('datetime64[ns]') else: _df[name] = _df[name].astype(str) - _df[name] = _df[name].fillna('') + _df[name] = _df[name].replace('NaT','') if r : self.log(**{'action':'format','input':r}) return _df From 5d4c534faeac12c19ca39a1564c0ccd19b9a22cd Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 21 Apr 2022 10:14:00 -0500 Subject: [PATCH 208/250] bug fix: approximation null values --- data/maker/__init__.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index cde3928..723991f 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -218,7 +218,13 @@ class Generator (Learner): for values in batches : index = [ _x not in ['',None,np.nan] for _x in values] - _values = np.random.dirichlet(values[index].astype(_type)) + + if len(index) == len(values): + # + # Sometimes messy data has unpleasant surprises + continue + _values = np.random.dirichlet(values[index].astype(_type)) + values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values) values[index] = values[index].astype(_type) x += values.tolist() @@ -284,7 +290,7 @@ class Generator (Learner): _df[name] = _df[name].astype('datetime64[ns]') else: _df[name] = _df[name].astype(str) - _df[name] = _df[name].replace('NaT','') + _df = _df.replace('NaT','') if r : self.log(**{'action':'format','input':r}) return _df From 4aacb74f29967eb483082258d56285a5f7bda094 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 21 Apr 2022 10:53:19 -0500 Subject: [PATCH 209/250] bug fix with shuffler --- data/maker/__init__.py | 13 +++++++++---- setup.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 723991f..630aa41 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -341,16 +341,21 @@ class Shuffle(Generator): super().__init__(**_args) def run(self): - + np.random.seed(1) self.initalize() _index = np.arange(self._df.shape[0]) np.random.shuffle(_index) + np.random.shuffle(_index) _iocolumns = self.info['columns'] _ocolumns = list(set(self._df.columns) - set(_iocolumns) ) # _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(_index.size)) - _iodf = pd.DataFrame(self._df[_iocolumns],index = np.arange(_index.size)) - self._df = self._df[_ocolumns].join(_iodf) - + _iodf = pd.DataFrame(self._df[_iocolumns].copy(),index = np.arange(_index.size)) + # self._df = self._df.loc[_index][_ocolumns].join(_iodf) + self._df = self._df.loc[_index][_ocolumns] + self._df.index = np.arange(self._df.shape[0]) + self._df = self._df.join(_iodf) + + _log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}} self.log(**_log) diff --git a/setup.py b/setup.py index 1991bde..801dc48 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return 
open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.5.2", +args = {"name":"data-maker","version":"1.5.3", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] From 587248c63b84b010b6c481ac8e64692e950dfaf3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 21 Apr 2022 11:07:56 -0500 Subject: [PATCH 210/250] bug fix --- data/maker/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 630aa41..35a8967 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -219,7 +219,7 @@ class Generator (Learner): index = [ _x not in ['',None,np.nan] for _x in values] - if len(index) == len(values): + if np.sum(index) == 0: # # Sometimes messy data has unpleasant surprises continue @@ -228,6 +228,7 @@ class Generator (Learner): values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values) values[index] = values[index].astype(_type) x += values.tolist() + print (batches) if x : _log['input']['identical_percentage'] = 100 * (1 - np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size)) _df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64) From aa41d371f4a02bf97bbb66d6bcdd8060635778a9 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 21 Apr 2022 11:12:09 -0500 Subject: [PATCH 211/250] bug fix --- data/maker/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 35a8967..7f1c896 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -355,7 +355,10 @@ class Shuffle(Generator): self._df = self._df.loc[_index][_ocolumns] self._df.index = np.arange(self._df.shape[0]) self._df = self._df.join(_iodf) - + # + # The following is a full shuffle + self._df = self._df.loc[_index] + self._df.index = np.arange(self._df.shape[0]) _log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}} From 88b4fdd8610d62c82b72aef36436d402c8b81673 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 21 Apr 2022 11:40:41 -0500 Subject: [PATCH 212/250] bug fix --- data/maker/__init__.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 7f1c896..60141d0 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -223,16 +223,22 @@ class Generator (Learner): # # Sometimes messy data has unpleasant surprises continue - _values = np.random.dirichlet(values[index].astype(_type)) + + _values = np.random.rand( len(values[index])) + _values += np.std(values[index]) / 4 values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values) values[index] = values[index].astype(_type) x += values.tolist() - print (batches) + if x : - _log['input']['identical_percentage'] = 100 * (1 - np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size)) - _df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64) + _log['input']['identical_percentage'] = 100 * (np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size)) + print (_df[name] == x) + print (_log) + + + _df[name] = 
x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64) self.log(**_log) return _df def make_date(self,**_args) : From 4b4647d200a41a075b6be61fcb5256a238e9d6bb Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 21 Apr 2022 12:17:32 -0500 Subject: [PATCH 213/250] bug fix --- data/maker/__init__.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 60141d0..3c4d45f 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -233,11 +233,7 @@ class Generator (Learner): if x : _log['input']['identical_percentage'] = 100 * (np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size)) - print (_df[name] == x) - - print (_log) - - + _df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64) self.log(**_log) return _df From 42ccca5f8dd1c707ba56567cb58d9863d348a9e8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 16 May 2022 11:11:33 -0500 Subject: [PATCH 214/250] bug fixes can now be used as a library --- data/maker/__init__.py | 42 ++++++++++++++++++++++++++++-------------- setup.py | 2 +- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 3c4d45f..50abfd2 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -82,7 +82,7 @@ class Learner(Process): pass def get_schema(self): if self.store['source']['provider'] != 'bigquery' : - return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] + return [] #{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] else: reader = transport.factory.instance(**self.store['source']) return reader.meta(table=self.info['from']) @@ -276,24 +276,35 @@ class Generator (Learner): pass def format(self,_df,_schema): r = {} + for _item in _schema : name = _item['name'] if _item['type'].upper() in ['DATE','DATETIME','TIMESTAMP'] : FORMAT = '%Y-%m-%d' - - if 'format' in self.info and name in self.info['format'] : - FORMAT = self.info['format'][name] - elif _item['type'] in ['DATETIME','TIMESTAMP'] : - FORMAT = '%Y-%m-%d %H:%M:%S' - - r[name] = FORMAT - _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') - if _item['type'] in ['DATETIME','TIMESTAMP']: - _df[name] = _df[name].astype('datetime64[ns]') - else: - _df[name] = _df[name].astype(str) + try: + # + #-- Sometimes data isn't all it's meant to be + if 'format' in self.info and name in self.info['format'] : + FORMAT = self.info['format'][name] + elif _item['type'] in ['DATETIME','TIMESTAMP'] : + FORMAT = '%Y-%m-%d %H:%M:%S' + + r[name] = FORMAT + _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') + if _item['type'] in ['DATETIME','TIMESTAMP']: + _df[name] = _df[name].fillna('').astype('datetime64[ns]') + else: + _df[name] = _df[name].astype(str) + except Exception as e: + pass + finally: + pass + else: + # print (_item) + pass _df = _df.replace('NaT','') + if r : self.log(**{'action':'format','input':r}) return _df @@ -391,4 +402,7 @@ class factory : elif _args['apply'] == 'generate' : return Generator(**_args) else: - return Trainer(**_args) \ No newline at end of file + pthread= Trainer(**_args) + if 'start' in _args and _args['start'] == True : + pthread.start() + return pthread \ No newline at end of file diff --git a/setup.py b/setup.py index 801dc48..b5d3733 100644 --- 
a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.5.3", +args = {"name":"data-maker","version":"1.5.4", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] From 1d0bbce74819bd83763ca833a29c87ee842e0fa7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 16 May 2022 13:59:58 -0500 Subject: [PATCH 215/250] bug fixes data format --- data/maker/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 50abfd2..42af8f9 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -287,13 +287,15 @@ class Generator (Learner): #-- Sometimes data isn't all it's meant to be if 'format' in self.info and name in self.info['format'] : FORMAT = self.info['format'][name] + SIZE = 10 elif _item['type'] in ['DATETIME','TIMESTAMP'] : FORMAT = '%Y-%m-%d %H:%M:%S' + SIZE = 19 r[name] = FORMAT _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') if _item['type'] in ['DATETIME','TIMESTAMP']: - _df[name] = _df[name].fillna('').astype('datetime64[ns]') + pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]') else: _df[name] = _df[name].astype(str) except Exception as e: From 1e3e0eac45b689602639397213488f9b43e5e84e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 16 May 2022 14:02:40 -0500 Subject: [PATCH 216/250] bug fixes data format --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 42af8f9..1eea945 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -305,7 +305,7 @@ class Generator (Learner): else: # print (_item) pass - _df = _df.replace('NaT','') + _df = _df.replace('NaT','').replace('NA','') if r : self.log(**{'action':'format','input':r}) From 2b228f60750b4521e864b25f0bf36f63262cf2a5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 17 May 2022 03:05:44 -0500 Subject: [PATCH 217/250] bug fix with type inference --- data/maker/__init__.py | 32 +++++++++++++++++++++++++------- data/maker/prepare/__init__.py | 1 + 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 1eea945..24fabe8 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -282,9 +282,11 @@ class Generator (Learner): if _item['type'].upper() in ['DATE','DATETIME','TIMESTAMP'] : FORMAT = '%Y-%m-%d' + try: # #-- Sometimes data isn't all it's meant to be + SIZE = -1 if 'format' in self.info and name in self.info['format'] : FORMAT = self.info['format'][name] SIZE = 10 @@ -292,20 +294,34 @@ class Generator (Learner): FORMAT = '%Y-%m-%d %H:%M:%S' SIZE = 19 + if SIZE > 0 : + + values = pd.to_datetime(_df[name], format=FORMAT).astype(str) + _df[name] = [_date[:SIZE] for _date in values] + + r[name] = FORMAT - _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') + # _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') if _item['type'] in ['DATETIME','TIMESTAMP']: pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]') - else: - _df[name] = _df[name].astype(str) + except Exception as e: pass 
finally: pass else: - # print (_item) - pass - _df = _df.replace('NaT','').replace('NA','') + + # + # Because types are inferred on the basis of the sample being processed they can sometimes be wrong + # To help disambiguate we add the schema information + _type = None + if 'int' in _df[name].dtypes.name or 'int' in _item['type'].lower(): + _type = np.int + elif 'float' in _df[name].dtypes.name or 'float' in _item['type'].lower(): + _type = np.float + if _type : + _df[name] = _df[name].fillna(0).replace('',0).astype(_type) + # _df = _df.replace('NaT','').replace('NA','') if r : self.log(**{'action':'format','input':r}) @@ -319,7 +335,7 @@ class Generator (Learner): _store['context'] = 'write' #-- Just in case if 'table' not in _store : _store['table'] = self.info['from'] - writer = transport.factory.instance(**_store) + N = 0 for _iodf in _candidates : _df = self._df.copy() @@ -346,7 +362,9 @@ class Generator (Learner): _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] _df = self.format(_df,_schema) + writer = transport.factory.instance(**_store) writer.write(_df,schema=_schema) + # _df.to_csv('foo.csv') self.log(**{'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) class Shuffle(Generator): diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 50fcfdf..17da778 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -209,6 +209,7 @@ class Input : # @NOTE: For some reason, there is an out of memory error created here, this seems to fix it (go figure) # _matrix = np.array([np.repeat(0,cols.size) for i in range(0,row_count)]) + [np.put(_matrix[i], np.where(cols == rows[i]) ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0] # else: # _matrix = cp.zeros([row_count,cols.size]) From 6841ccbd5e4abb8322df6da8b55904f99bcae89c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 17 May 2022 13:24:24 -0500 Subject: [PATCH 218/250] bug fix: missing data, adding an additional type: pandas._lib.missing.NAType in addition to np.nan, np.na --- data/maker/__init__.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 24fabe8..b9b48e4 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -52,6 +52,7 @@ class Learner(Process): self._encoder = None self._map = None self._df = _args['data'] if 'data' in _args else None + self.name = self.__class__.__name__ # @@ -92,10 +93,22 @@ class Learner(Process): if self._df is None : self._df = reader.read(**_read_args) columns = self.columns if self.columns else self._df.columns + # + # Below is a source of inefficiency, unfortunately python's type inference doesn't work well in certain cases + # - The code below tries to address the issue (Perhaps better suited for the reading components) + for name in columns : + _index = np.random.choice(np.arange(self._df[name].size),5,False) + no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]] + print ([name,np.sum(no_value)]) + no_value = 0 if np.sum(no_value) > 0 else '' + + self._df[name] = self._df[name].fillna(no_value) + + # # convert the data to binary here ... 
- - _args = {"schema":self.get_schema(),"data":self._df,"columns":columns} + _schema = self.get_schema() + _args = {"schema":_schema,"data":self._df,"columns":columns} if self._map : _args['map'] = self._map self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None From 1dae4ffba8cba71f1cf8daf792cd0aa8b795431c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 17 May 2022 13:27:13 -0500 Subject: [PATCH 219/250] bug fix: missing data, adding an additional type: pandas._lib.missing.NAType in addition to np.nan, np.na --- data/maker/__init__.py | 3 +-- data/maker/prepare/__init__.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index b9b48e4..c8dc02a 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -98,8 +98,7 @@ class Learner(Process): # - The code below tries to address the issue (Perhaps better suited for the reading components) for name in columns : _index = np.random.choice(np.arange(self._df[name].size),5,False) - no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]] - print ([name,np.sum(no_value)]) + no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]] no_value = 0 if np.sum(no_value) > 0 else '' self._df[name] = self._df[name].fillna(no_value) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 17da778..45fc61c 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -196,7 +196,6 @@ class Input : :param rows np.array or list of vector of values :param cols a space of values if it were to be different fromt he current sample. """ - if not cols: # # In the advent the sample rows do NOT have the values of the From 377e84daea23ad126ea787102449b4ffa09b1fd3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 17 May 2022 18:04:05 -0500 Subject: [PATCH 220/250] bug fix: uploading data --- data/maker/__init__.py | 72 ++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 27 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index c8dc02a..d05509d 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -96,14 +96,17 @@ class Learner(Process): # # Below is a source of inefficiency, unfortunately python's type inference doesn't work well in certain cases # - The code below tries to address the issue (Perhaps better suited for the reading components) + _log = {} for name in columns : _index = np.random.choice(np.arange(self._df[name].size),5,False) no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]] no_value = 0 if np.sum(no_value) > 0 else '' self._df[name] = self._df[name].fillna(no_value) - - + + _log[name] = self._df[name].dtypes.name + _log = {'action':'structure','input':_log} + self.log(**_log) # # convert the data to binary here ... 
_schema = self.get_schema() @@ -293,46 +296,52 @@ class Generator (Learner): name = _item['name'] if _item['type'].upper() in ['DATE','DATETIME','TIMESTAMP'] : - FORMAT = '%Y-%m-%d' + FORMAT = '%m-%d-%Y' - try: - # - #-- Sometimes data isn't all it's meant to be - SIZE = -1 - if 'format' in self.info and name in self.info['format'] : - FORMAT = self.info['format'][name] - SIZE = 10 - elif _item['type'] in ['DATETIME','TIMESTAMP'] : - FORMAT = '%Y-%m-%d %H:%M:%S' - SIZE = 19 + # try: + # # + # #-- Sometimes data isn't all it's meant to be + # SIZE = -1 + # if 'format' in self.info and name in self.info['format'] : + # FORMAT = self.info['format'][name] + # SIZE = 10 + # elif _item['type'] in ['DATETIME','TIMESTAMP'] : + # FORMAT = '%m-%d-%Y %H:%M:%S' + # SIZE = 19 - if SIZE > 0 : + # if SIZE > 0 : + + # values = pd.to_datetime(_df[name], format=FORMAT).astype(str) + # _df[name] = [_date[:SIZE].strip() for _date in values] - values = pd.to_datetime(_df[name], format=FORMAT).astype(str) - _df[name] = [_date[:SIZE] for _date in values] + # # _df[name] = _df[name].astype(str) + # r[name] = FORMAT + # # _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') + # if _item['type'] in ['DATETIME','TIMESTAMP']: + # pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]') - r[name] = FORMAT - # _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') - if _item['type'] in ['DATETIME','TIMESTAMP']: - pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]') - - except Exception as e: - pass - finally: - pass + # except Exception as e: + # pass + # finally: + # pass else: # # Because types are inferred on the basis of the sample being processed they can sometimes be wrong # To help disambiguate we add the schema information _type = None + if 'int' in _df[name].dtypes.name or 'int' in _item['type'].lower(): _type = np.int + elif 'float' in _df[name].dtypes.name or 'float' in _item['type'].lower(): _type = np.float if _type : - _df[name] = _df[name].fillna(0).replace('',0).astype(_type) + + _df[name] = _df[name].fillna(0).replace('',0).replace('NA',0).replace('nan',0).astype(_type) + # else: + # _df[name] = _df[name].astype(str) # _df = _df.replace('NaT','').replace('NA','') if r : @@ -373,10 +382,19 @@ class Generator (Learner): _schema = self.get_schema() _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] _df = self.format(_df,_schema) + _log = [{"name":_schema[i]['name'],"dataframe":_df[_df.columns[i]].dtypes.name,"schema":_schema[i]['type']} for i in np.arange(len(_schema)) ] + self.log(**{"action":"consolidate","input":_log}) + + # w = transport.factory.instance(doc='observation',provider='mongodb',context='write',db='IOV01_LOGS',auth_file='/home/steve/dev/transport/mongo.json') + # w.write(_df) + # print (_df[cols]) writer = transport.factory.instance(**_store) writer.write(_df,schema=_schema) - # _df.to_csv('foo.csv') + + + + self.log(**{'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) class Shuffle(Generator): From f3598efa0d3399516d4d4178671abe55605419de Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 17 May 2022 19:10:33 -0500 Subject: [PATCH 221/250] bug fix: date conversions --- data/maker/__init__.py | 49 +++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index d05509d..403255c 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -296,35 +296,35 @@ class Generator 
(Learner): name = _item['name'] if _item['type'].upper() in ['DATE','DATETIME','TIMESTAMP'] : - FORMAT = '%m-%d-%Y' + FORMAT = '%Y-%m-%d' - # try: - # # - # #-- Sometimes data isn't all it's meant to be - # SIZE = -1 - # if 'format' in self.info and name in self.info['format'] : - # FORMAT = self.info['format'][name] - # SIZE = 10 - # elif _item['type'] in ['DATETIME','TIMESTAMP'] : - # FORMAT = '%m-%d-%Y %H:%M:%S' - # SIZE = 19 + try: + # + #-- Sometimes data isn't all it's meant to be + SIZE = -1 + if 'format' in self.info and name in self.info['format'] : + FORMAT = self.info['format'][name] + SIZE = 10 + elif _item['type'] in ['DATETIME','TIMESTAMP'] : + FORMAT = '%Y-%m-%-d %H:%M:%S' + SIZE = 19 - # if SIZE > 0 : + if SIZE > 0 : - # values = pd.to_datetime(_df[name], format=FORMAT).astype(str) - # _df[name] = [_date[:SIZE].strip() for _date in values] + values = pd.to_datetime(_df[name], format=FORMAT).astype(np.datetime64) + # _df[name] = [_date[:SIZE].strip() for _date in values] - # # _df[name] = _df[name].astype(str) - # r[name] = FORMAT - # # _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') - # if _item['type'] in ['DATETIME','TIMESTAMP']: - # pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]') + # _df[name] = _df[name].astype(str) + r[name] = FORMAT + # _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') + if _item['type'] in ['DATETIME','TIMESTAMP']: + pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]') - # except Exception as e: - # pass - # finally: - # pass + except Exception as e: + pass + finally: + pass else: # @@ -387,10 +387,11 @@ class Generator (Learner): # w = transport.factory.instance(doc='observation',provider='mongodb',context='write',db='IOV01_LOGS',auth_file='/home/steve/dev/transport/mongo.json') # w.write(_df) + cols = [name for name in _df.columns if name.endswith('datetime')] # print (_df[cols]) writer = transport.factory.instance(**_store) - writer.write(_df,schema=_schema) + writer.write(_df[cols],schema=[_item for _item in _schema if _item['name'] in cols]) From e8edf886adfc0bd4e05b9ff40b137cc9667beb77 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 10 Jun 2022 13:00:28 -0500 Subject: [PATCH 222/250] bug fix: write data --- data/maker/__init__.py | 7 ++++--- setup.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 403255c..1666a42 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -380,18 +380,19 @@ class Generator (Learner): _schema = self.get_schema() - _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] + _schema = [{'name':_item.name,'type':(_item.field_type if has_attr(_item,'field_type') else 'VARCHAR(256)')} for _item in _schema] _df = self.format(_df,_schema) _log = [{"name":_schema[i]['name'],"dataframe":_df[_df.columns[i]].dtypes.name,"schema":_schema[i]['type']} for i in np.arange(len(_schema)) ] self.log(**{"action":"consolidate","input":_log}) # w = transport.factory.instance(doc='observation',provider='mongodb',context='write',db='IOV01_LOGS',auth_file='/home/steve/dev/transport/mongo.json') # w.write(_df) - cols = [name for name in _df.columns if name.endswith('datetime')] + # cols = [name for name in _df.columns if name.endswith('datetime')] + # print (_df[cols]) writer = transport.factory.instance(**_store) - writer.write(_df[cols],schema=[_item for _item in _schema if _item['name'] in cols]) + writer.write(_df,schema=[_item for 
_item in _schema if _item['name'] in cols]) diff --git a/setup.py b/setup.py index b5d3733..8da19f3 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.5.4", +args = {"name":"data-maker","version":"1.5.5", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] From 44d621941d12c768025bf3f1394f6ec06b6ef411 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 10 Jun 2022 13:16:11 -0500 Subject: [PATCH 223/250] bug fix --- data/maker/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 1666a42..e8e5363 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -380,7 +380,7 @@ class Generator (Learner): _schema = self.get_schema() - _schema = [{'name':_item.name,'type':(_item.field_type if has_attr(_item,'field_type') else 'VARCHAR(256)')} for _item in _schema] + _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] _df = self.format(_df,_schema) _log = [{"name":_schema[i]['name'],"dataframe":_df[_df.columns[i]].dtypes.name,"schema":_schema[i]['type']} for i in np.arange(len(_schema)) ] self.log(**{"action":"consolidate","input":_log}) @@ -388,11 +388,10 @@ class Generator (Learner): # w = transport.factory.instance(doc='observation',provider='mongodb',context='write',db='IOV01_LOGS',auth_file='/home/steve/dev/transport/mongo.json') # w.write(_df) # cols = [name for name in _df.columns if name.endswith('datetime')] - # print (_df[cols]) writer = transport.factory.instance(**_store) - writer.write(_df,schema=[_item for _item in _schema if _item['name'] in cols]) + writer.write(_df[:],schema=[_item for _item in _schema if _item['name'] in cols]) From 3087e98bc06a2099f6c271f2dd896783930ccf3a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 10 Jun 2022 13:33:51 -0500 Subject: [PATCH 224/250] bug fix --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index e8e5363..056cbbc 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -391,7 +391,7 @@ class Generator (Learner): # print (_df[cols]) writer = transport.factory.instance(**_store) - writer.write(_df[:],schema=[_item for _item in _schema if _item['name'] in cols]) + writer.write(_df[:],schema=_schema) From 7bf0b8e5839f95502f94280e5ad93d75e979a26f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 10 Jun 2022 14:52:55 -0500 Subject: [PATCH 225/250] bug fix --- data/maker/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 056cbbc..71d9c7b 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -98,6 +98,10 @@ class Learner(Process): # - The code below tries to address the issue (Perhaps better suited for the reading components) _log = {} for name in columns : + # + # randomly sampling 5 elements to make sense of data-types + if self._df[name].size < 5 : + continue _index = np.random.choice(np.arange(self._df[name].size),5,False) no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]] no_value 
= 0 if np.sum(no_value) > 0 else '' From 7e92571d0acf99d042a2c43b3621c50eb831cafe Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Jun 2022 12:24:56 -0500 Subject: [PATCH 226/250] bug fix: errors occasionally --- data/maker/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 71d9c7b..60fc418 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -105,8 +105,10 @@ class Learner(Process): _index = np.random.choice(np.arange(self._df[name].size),5,False) no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]] no_value = 0 if np.sum(no_value) > 0 else '' - - self._df[name] = self._df[name].fillna(no_value) + try: + self._df[name] = self._df[name].fillna(no_value) + finally: + pass _log[name] = self._df[name].dtypes.name _log = {'action':'structure','input':_log} From d89daf76d6758424a6c316b1690fce364f32ef4f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 16 Jun 2022 23:56:16 -0500 Subject: [PATCH 227/250] bug fixes --- data/maker/__init__.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 60fc418..541db37 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -59,7 +59,7 @@ class Learner(Process): # @TODO: allow for verbose mode so we have a sens of what is going on within the newtork # - _log = {'action':'init','context':self.info['context'],'gpu':(self.gpu if self.gpu is not None else -1)} + _log = {'action':'init','gpu':(self.gpu if self.gpu is not None else -1)} self.log(**_log) # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' @@ -67,8 +67,10 @@ class Learner(Process): def log(self,**_args): # self.lock.acquire() try: + _context = self.info['context'] + _label = self.info['info'] if 'info' in self.info else _context logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) - _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'info':self.info['context'],**_args}) + _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_info,**_args}) logger.write(_args) self.ndx += 1 if hasattr(logger,'close') : @@ -345,7 +347,7 @@ class Generator (Learner): _type = np.float if _type : - _df[name] = _df[name].fillna(0).replace('',0).replace('NA',0).replace('nan',0).astype(_type) + _df[name] = _df[name].fillna(0).replace(' ',0).replace('',0).replace('NA',0).replace('nan',0).astype(_type) # else: # _df[name] = _df[name].astype(str) # _df = _df.replace('NaT','').replace('NA','') @@ -397,7 +399,10 @@ class Generator (Learner): # print (_df[cols]) writer = transport.factory.instance(**_store) - writer.write(_df[:],schema=_schema) + if _store['provider'] == 'bigquery': + writer.write(_df,schema=[],table=self.info['from']) + else: + writer.write(_df,table=self.info['from']) From 899db5c0368e5d4e3c04ddfb618c4e38ee1ae5da Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 17 Jun 2022 00:17:00 -0500 Subject: [PATCH 228/250] bug fixes --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 541db37..2d1e1f8 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -70,7 +70,7 @@ class Learner(Process): _context = self.info['context'] _label = 
self.info['info'] if 'info' in self.info else _context logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) - _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_info,**_args}) + _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_label,**_args}) logger.write(_args) self.ndx += 1 if hasattr(logger,'close') : From 322b21aaacccaf2458caff72fd0a090c48c7d371 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 9 Aug 2022 12:22:07 -0500 Subject: [PATCH 229/250] bug fix: encoding/decoding to improve correlations between attributes --- data/maker/__init__.py | 123 ++++++++++++++++++++++----------- data/maker/prepare/__init__.py | 57 ++++++++++++++- 2 files changed, 136 insertions(+), 44 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 2d1e1f8..0d8bf33 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -13,13 +13,17 @@ import numpy as np import data.gan as gan import transport # from data.bridge import Binary -import threading as thread +import threading from data.maker import prepare import copy import os -import json +import nujson as json from multiprocessing import Process, RLock from datetime import datetime, timedelta +from multiprocessing import Queue + +import time + class Learner(Process): @@ -28,6 +32,7 @@ class Learner(Process): super(Learner, self).__init__() self.ndx = 0 + self._queue = Queue() self.lock = RLock() if 'gpu' in _args : @@ -61,34 +66,38 @@ class Learner(Process): _log = {'action':'init','gpu':(self.gpu if self.gpu is not None else -1)} self.log(**_log) - + self.cache = [] # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' # sel.max_epoc def log(self,**_args): - # self.lock.acquire() + try: - _context = self.info['context'] - _label = self.info['info'] if 'info' in self.info else _context - logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider='console',context='write',lock=True) - _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_label,**_args}) - logger.write(_args) - self.ndx += 1 - if hasattr(logger,'close') : - logger.close() + # _context = self.info['context'] + # _label = self.info['info'] if 'info' in self.info else _context + # logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider=transport.providers.CONSOLE,context='write',lock=True) + # _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_label,**_args}) + # logger.write(_args) + # self.ndx += 1 + # if hasattr(logger,'close') : + # logger.close() + pass except Exception as e: print () print (_args) print (e) pass finally: - # self.lock.release() + pass def get_schema(self): - if self.store['source']['provider'] != 'bigquery' : - return [] #{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] - else: - reader = transport.factory.instance(**self.store['source']) - return reader.meta(table=self.info['from']) + # if self.store['source']['provider'] != 'bigquery' : + # return [] #{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] + # else: + # reader = 
transport.factory.instance(**self.store['source']) + # return reader.meta(table=self.info['from']) + reader = transport.factory.instance(**self.store['source']) + return reader.meta(table=self.info['from']) + def initalize(self): reader = transport.factory.instance(**self.store['source']) _read_args= self.info @@ -124,6 +133,25 @@ class Learner(Process): self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None _log = {'action':'data-prep','input':{'rows':int(self._df.shape[0]),'cols':int(self._df.shape[1]) } } self.log(**_log) + def get(self): + + if self.cache : + return self.cache if len(self.cache) > 0 else(self.cache if not self.cache else self.cache[0]) + else: + return self._queue.get() if self._queue.qsize() > 0 else [] + + def listen(self): + while True : + _info = self._queue.get() + self.cache.append(_info) + self._queue.task_done() + def publish(self,caller): + if hasattr(caller,'_queue') : + _queue = caller._queue + _queue.put(self.cache) + + # _queue.join() + pass class Trainer(Learner): """ This will perform training using a GAN @@ -157,7 +185,8 @@ class Trainer(Learner): gTrain = gan.Train(**_args) gTrain.apply() - writer = transport.factory.instance(provider='file',context='write',path=os.sep.join([gTrain.out_dir,'map.json'])) + writer = transport.factory.instance(provider=transport.providers.FILE,context='write',path=os.sep.join([gTrain.out_dir,'map.json'])) + writer.write(self._encoder._map,overwrite=True) writer.close() @@ -174,9 +203,14 @@ class Trainer(Learner): _min = float((end-beg).seconds/ 60) _logs = {'action':'train','input':{'start':beg.strftime('%Y-%m-%d %H:%M:%S'),'minutes':_min,"unique_counts":self._encoder._io[0]}} self.log(**_logs) - self.generate = g - if self.autopilot : - self.generate.run() + self._g = g + if self.autopilot : + self._g.run() + # + #@TODO Find a way to have the data in the object .... + + + def generate (self): if self.autopilot : print( "Autopilot is set ... 
No need to call this function") @@ -224,6 +258,7 @@ class Generator (Learner): _size = np.sum([len(_item) for _item in _iomatrix]) _log = {'action':'io-data','input':{'candidates':len(_candidates),'rows':int(_size)}} self.log(**_log) + # self.cache = _candidates self.post(_candidates) def approximate(self,_df): _columns = self.info['approximate'] @@ -359,12 +394,14 @@ class Generator (Learner): pass def post(self,_candidates): - _store = self.store['target'] if 'target' in self.store else {'provider':'console'} - _store['lock'] = True - _store['context'] = 'write' #-- Just in case - if 'table' not in _store : - _store['table'] = self.info['from'] - + if 'target' in self.store : + _store = self.store['target'] if 'target' in self.store else {'provider':'console'} + _store['lock'] = True + _store['context'] = 'write' #-- Just in case + if 'table' not in _store : + _store['table'] = self.info['from'] + else: + _store = None N = 0 for _iodf in _candidates : _df = self._df.copy() @@ -397,13 +434,15 @@ class Generator (Learner): # w.write(_df) # cols = [name for name in _df.columns if name.endswith('datetime')] # print (_df[cols]) - - writer = transport.factory.instance(**_store) - if _store['provider'] == 'bigquery': - writer.write(_df,schema=[],table=self.info['from']) + if _store : + writer = transport.factory.instance(**_store) + if _store['provider'] == 'bigquery': + writer.write(_df,schema=[],table=self.info['from']) + else: + writer.write(_df,table=self.info['from']) else: - writer.write(_df,table=self.info['from']) - + self.cache.append(_df) + @@ -444,6 +483,8 @@ class Shuffle(Generator): except Exception as e : # print (e) self.log(**{'action':'failed','input':{'msg':e,'info':self.info}}) +class apply : + TRAIN,GENERATE,RANDOM = 'train','generate','random' class factory : _infocache = {} @staticmethod @@ -459,12 +500,12 @@ class factory : :param batch (default 2k) size of the batch """ - if _args['apply'] == 'shuffle' : - return Shuffle(**_args) - elif _args['apply'] == 'generate' : - return Generator(**_args) + if _args['apply'] in [apply.RANDOM] : + pthread = Shuffle(**_args) + elif _args['apply'] == apply.GENERATE : + pthread = Generator(**_args) else: pthread= Trainer(**_args) - if 'start' in _args and _args['start'] == True : - pthread.start() - return pthread \ No newline at end of file + if 'start' in _args and _args['start'] == True : + pthread.start() + return pthread \ No newline at end of file diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 45fc61c..d589c17 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -47,6 +47,15 @@ class Input : :param sql sql query that pulls a representative sample of the data """ self._schema = _args['schema'] if 'schema' in _args else {} + # + # schema data should be in a hash map for these purposes + # + if self._schema : + r = {} + for _item in self._schema : + r[_item['name']] = r[_item['type']] + self._schema = r + self.df = _args['data'] if 'sql' not in _args : self._initdata(**_args) @@ -60,6 +69,7 @@ class Input : # self._map = {} if 'map' not in _args else _args['map'] + def _initsql(self,**_args): """ This function will initialize the class on the basis of a data-store and optionally pre-defined columns to be used to be synthesized @@ -73,6 +83,10 @@ class Input : self._initcols(data=self.df,columns=_args['columns']) pass + def _init_map(self,values): + self._map = dict(zip(np.arange(len(values)),values)) + for key in self._map : + self._map[key] = self._map[key].tolist() 
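# --- [illustrative sketch added by the editor; not part of PATCH 229/250] ---
# _init_map (added above) keeps the distinct value-tuples of the synthesized columns,
# keyed by their index, and converts the numpy rows to plain lists so the map can be
# serialized later (e.g. into the map.json written after training). Hypothetical values:
import numpy as np

values = np.array([['F', 'full-time'], ['M', 'part-time'], ['F', 'part-time']])
_map = dict(zip(np.arange(len(values)), values))
_map = {key: _map[key].tolist() for key in _map}
# each integer key now maps to a plain list, e.g. _map[0] == ['F', 'full-time']
# --- [end of editor's sketch] ---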
def _initcols (self,**_args) : """ This function will initialize the columns to be synthesized and/or determine which ones can be synthesized @@ -109,7 +123,7 @@ class Input : """ self._initcols(**_args) - def convert(self,**_args): + def _convert(self,**_args): """ This function will convert a data-frame into a binary matrix and provide a map to be able to map the values back to the matrix :param columns in case we specify the columns to account for (just in case the original assumptions don't hold) @@ -150,7 +164,7 @@ class Input : return _values,_m - def revert(self,**_args) : + def _revert(self,**_args) : """ This function will take in a binary matrix and based on the map of values it will repopulate it with values :param _matrix binary matrix @@ -186,7 +200,9 @@ class Input : # r[key] = [columns[np.where(row == 1) [0][0] ] for row in _matrix[:,_beg:_end]] r[key] = [columns[np.where(row==1)[0][0]] if np.where(row==1)[0].size > 0 else '' for row in _matrix] - + # + # we should consider decoding the matrix if possible + # return pd.DataFrame(r) @@ -217,4 +233,39 @@ class Input : return cols,_matrix + def convert(self,**_args): + if 'columns' in _args or 'column' in _args : + columns = _args['columns'] if 'columns' in _args else [_args['column']] + else: + columns = self._columns + _df = self.df if 'data' not in _args else _args['data'] + _values,_matrix = self.encode(_df,columns) + _, _matrix = self.tobinary(_matrix) + self._init_map(_values) + return _values,_matrix #-- matrix has been updated ! + def revert(self,**_args): + # _columns = _args['column'] if 'column' in _args else None + _matrix = _args['matrix'] + # print (_matrix) + return self.decode(_matrix,columns=self._columns) + pass + def encode(self,df,columns) : + _df = df[columns].drop_duplicates() + _values = _df.values.tolist() + _encoded = df[columns].apply(lambda row: _values.index( list(row)) ,axis=1) + return np.array(_values),_encoded + def decode (self,_matrix,**_args): + # + # _matrix binary matrix + # _values value space given the columns + # columns name of the columns ... 
+ # + + columns = _args['columns'] + _values = np.array( list(self._map.values())) + _matrix = pd.DataFrame(_matrix) #if type(_matrix) != pd.DataFrame else _matrix + x = _matrix.apply(lambda row: _values[row.values == 1 ].tolist()[0] if row.values.sum() > 0 else np.repeat(None,row.size), axis=1).tolist() + return pd.DataFrame(x,columns=columns) + + From 44d3f4989a7b0a6b401dba47d66725733932b467 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 09:33:12 -0500 Subject: [PATCH 230/250] version update --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8da19f3..0e70341 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.5.5", +args = {"name":"data-maker","version":"1.5.6", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] From 3aee3e2caea14465ee878cc577098659c90a9303 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 10:56:40 -0500 Subject: [PATCH 231/250] bug fix: schema --- data/maker/prepare/__init__.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index d589c17..8da73c3 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -46,15 +46,15 @@ class Input : :param store data-store parameters/configuration :param sql sql query that pulls a representative sample of the data """ - self._schema = _args['schema'] if 'schema' in _args else {} - # - # schema data should be in a hash map for these purposes - # - if self._schema : - r = {} - for _item in self._schema : - r[_item['name']] = r[_item['type']] - self._schema = r + # self._schema = _args['schema'] if 'schema' in _args else {} + # # + # # schema data should be in a hash map for these purposes + # # + # if self._schema : + # r = {} + # for _item in self._schema : + # r[_item['name']] = r[_item['type']] + # self._schema = r self.df = _args['data'] if 'sql' not in _args : From 4013fb8fd5b5ab3ed06f3ca3b28e4de922e8848f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 11:29:21 -0500 Subject: [PATCH 232/250] minor bug fix, got fixed in data-transport returning properly formatted meta data --- data/maker/__init__.py | 2 +- data/maker/prepare/__init__.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 0d8bf33..7ea2c74 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -425,7 +425,7 @@ class Generator (Learner): _schema = self.get_schema() - _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] + # _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] _df = self.format(_df,_schema) _log = [{"name":_schema[i]['name'],"dataframe":_df[_df.columns[i]].dtypes.name,"schema":_schema[i]['type']} for i in np.arange(len(_schema)) ] self.log(**{"action":"consolidate","input":_log}) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 8da73c3..4b0bfd3 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -46,15 +46,15 @@ class Input : :param 
store data-store parameters/configuration :param sql sql query that pulls a representative sample of the data """ - # self._schema = _args['schema'] if 'schema' in _args else {} - # # - # # schema data should be in a hash map for these purposes - # # + self._schema = _args['schema'] if 'schema' in _args else {} + # + # schema data should be in a hash map for these purposes + # # if self._schema : # r = {} # for _item in self._schema : # r[_item['name']] = r[_item['type']] - # self._schema = r + # self._schema = r self.df = _args['data'] if 'sql' not in _args : From 23b3c52230b7109cf8696877059262aafad90ca3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 12:42:11 -0500 Subject: [PATCH 233/250] bug fix: decoding matrix --- data/maker/prepare/__init__.py | 3 ++- setup.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 4b0bfd3..c91c773 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -264,7 +264,8 @@ class Input : columns = _args['columns'] _values = np.array( list(self._map.values())) _matrix = pd.DataFrame(_matrix) #if type(_matrix) != pd.DataFrame else _matrix - x = _matrix.apply(lambda row: _values[row.values == 1 ].tolist()[0] if row.values.sum() > 0 else np.repeat(None,row.size), axis=1).tolist() + # x = _matrix.apply(lambda row: _values[row.values == 1 ].tolist()[0] if row.values.sum() > 0 else None, axis=1).tolist() + x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else None ,axis=1).tolist() return pd.DataFrame(x,columns=columns) diff --git a/setup.py b/setup.py index 0e70341..ba52b61 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.5.6", +args = {"name":"data-maker","version":"1.5.8", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] From 7ad00166178a50db734ead47ddfebcb5b2324448 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 14:44:29 -0500 Subject: [PATCH 234/250] bug fix: empty row handling --- data/maker/prepare/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index c91c773..f025294 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -265,7 +265,7 @@ class Input : _values = np.array( list(self._map.values())) _matrix = pd.DataFrame(_matrix) #if type(_matrix) != pd.DataFrame else _matrix # x = _matrix.apply(lambda row: _values[row.values == 1 ].tolist()[0] if row.values.sum() > 0 else None, axis=1).tolist() - x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else None ,axis=1).tolist() + x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.repeat(None,len(self._columns)) ,axis=1).tolist() return pd.DataFrame(x,columns=columns) From e47ffb3fae96adc99f52b601a891e3d65fd4ae31 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 17:06:33 -0500 Subject: [PATCH 235/250] bug fix: random empty values --- data/maker/prepare/__init__.py | 6 +++++- 1 file 
changed, 5 insertions(+), 1 deletion(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index f025294..1fae46c 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -265,7 +265,11 @@ class Input : _values = np.array( list(self._map.values())) _matrix = pd.DataFrame(_matrix) #if type(_matrix) != pd.DataFrame else _matrix # x = _matrix.apply(lambda row: _values[row.values == 1 ].tolist()[0] if row.values.sum() > 0 else None, axis=1).tolist() - x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.repeat(None,len(self._columns)) ,axis=1).tolist() + #@TODO: Provide random values for things that are missing + + # x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.repeat(None,len(self._columns)) ,axis=1).tolist() + + x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.random.choice(np.matrix.flatten(_values,1)).tolist() ,axis=1).tolist() return pd.DataFrame(x,columns=columns) From afad88411811c67000514f75c3cc4b79c6a38455 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 17:23:42 -0500 Subject: [PATCH 236/250] bug fix: random empty values --- data/maker/prepare/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 1fae46c..a19fd31 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -269,7 +269,7 @@ class Input : # x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.repeat(None,len(self._columns)) ,axis=1).tolist() - x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.random.choice(np.matrix.flatten(_values,1)).tolist() ,axis=1).tolist() + x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else _values[np.random.choice(_values.size,1 )].tolist() ,axis=1).tolist() return pd.DataFrame(x,columns=columns) From d8aad070eeea60d7ed3f59340386287e8ec01c1e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 17:30:28 -0500 Subject: [PATCH 237/250] bug fix: random empty values --- data/maker/prepare/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index a19fd31..15cbe99 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -268,8 +268,9 @@ class Input : #@TODO: Provide random values for things that are missing # x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.repeat(None,len(self._columns)) ,axis=1).tolist() - - x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else _values[np.random.choice(_values.size,1 )].tolist() ,axis=1).tolist() + novalues = _values[np.random.choice(_values.size,1)[0]].tolist() + # novalues = np.repeat(None,len(self._columns)) + x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else novalues ,axis=1).tolist() return pd.DataFrame(x,columns=columns) From 96ac4cd9cbc5b3e1609c9381c017d6a2b7645951 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 17:33:48 -0500 Subject: [PATCH 238/250] bug fix: random empty values --- data/maker/prepare/__init__.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 15cbe99..1adc44d 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -268,7 +268,7 @@ class Input : #@TODO: Provide random values for things that are missing # x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.repeat(None,len(self._columns)) ,axis=1).tolist() - novalues = _values[np.random.choice(_values.size,1)[0]].tolist() + novalues = _values[np.random.choice( len(_values),1)[0]].tolist() # novalues = np.repeat(None,len(self._columns)) x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else novalues ,axis=1).tolist() return pd.DataFrame(x,columns=columns) From d42d601be7adeb6573a3824d607f300bcf271fda Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 31 Aug 2022 12:51:48 -0500 Subject: [PATCH 239/250] bug fix & enhancements --- data/gan.py | 3 ++- data/maker/__init__.py | 18 +++++++++++++----- data/maker/prepare/__init__.py | 15 +++++++++++---- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/data/gan.py b/data/gan.py index 26f19a2..812426a 100644 --- a/data/gan.py +++ b/data/gan.py @@ -533,7 +533,7 @@ class Train (GNet): print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) # print (dir (w_distance)) - logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) + logs.append({"epoch": int(epoch),"distance":float(-w_sum/(self.STEPS_PER_EPOCH*2)) }) # if epoch % self.MAX_EPOCHS == 0: if epoch in [5,10,20,50,75, self.MAX_EPOCHS] : @@ -547,6 +547,7 @@ class Train (GNet): if self.logger : row = {"module":"gan-train","action":"logs","input":{"partition":self.PARTITION,"logs":logs}} #,"model":pickle.dump(sess)} self.logger.write(row) + # # @TODO: # We should upload the files in the checkpoint diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 7ea2c74..50ac8c1 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -69,15 +69,19 @@ class Learner(Process): self.cache = [] # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' # sel.max_epoc + self.logger = None + if 'logger' in self.store : + self.logger = transport.factory.instance(**self.store['logger']) def log(self,**_args): try: - # _context = self.info['context'] - # _label = self.info['info'] if 'info' in self.info else _context + _context = self.info['context'] + _label = self.info['info'] if 'info' in self.info else _context # logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider=transport.providers.CONSOLE,context='write',lock=True) - # _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_label,**_args}) - # logger.write(_args) - # self.ndx += 1 + _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_label,**_args}) + if self.logger: + self.logger.write(_args) + self.ndx += 1 # if hasattr(logger,'close') : # logger.close() pass @@ -178,6 +182,8 @@ class Trainer(Learner): _args['gpu'] = self.gpu _args['real'] = _matrix _args['candidates'] = self.candidates + if self.logger : + _args['logger'] = transport.factory.instance(**self.store['logger']) # # At this point we have the binary matrix, we can initiate training # @@ -250,6 +256,8 @@ class Generator (Learner): _args['row_count'] = self._df.shape[0] if self.gpu : _args['gpu'] = self.gpu + if 
self.logger : + _args['logger'] = transport.factory.instance(**self.store['logger']) gHandler = gan.Predict(**_args) gHandler.load_meta(columns=None) _iomatrix = gHandler.apply() diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 1adc44d..c8331bd 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -34,6 +34,8 @@ class Hardware : pass class Input : + class NOVALUES : + RANDOM,IGNORE,ALWAYS = ['random','ignore','always'] """ This class is designed to read data from a source and and perform a variet of operations : - provide a feature space, and rows (matrix profile) @@ -257,8 +259,6 @@ class Input : def decode (self,_matrix,**_args): # # _matrix binary matrix - # _values value space given the columns - # columns name of the columns ... # columns = _args['columns'] @@ -268,8 +268,15 @@ class Input : #@TODO: Provide random values for things that are missing # x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.repeat(None,len(self._columns)) ,axis=1).tolist() - novalues = _values[np.random.choice( len(_values),1)[0]].tolist() - # novalues = np.repeat(None,len(self._columns)) + # + # @TODO: Provide a parameter to either: + # - missing = {outlier,random,none} + # - outlier: select an outlier, random: randomly select a value, none: do nothing ... + # + if np.random.choice([0,1],1)[0] : + novalues = _values[np.random.choice( len(_values),1)[0]].tolist() + else: + novalues = np.repeat(None,len(self._columns)) x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else novalues ,axis=1).tolist() return pd.DataFrame(x,columns=columns) From 7af3c3db6ac20465df98430e0429964b8f164b75 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 31 Aug 2022 17:26:45 -0500 Subject: [PATCH 240/250] bug fix --- data/maker/__init__.py | 12 ++++++------ setup.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 50ac8c1..bc5d9cc 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -79,16 +79,16 @@ class Learner(Process): _label = self.info['info'] if 'info' in self.info else _context # logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider=transport.providers.CONSOLE,context='write',lock=True) _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_label,**_args}) - if self.logger: + if hasattr(self,'logger') : self.logger.write(_args) self.ndx += 1 # if hasattr(logger,'close') : # logger.close() pass except Exception as e: - print () - print (_args) - print (e) + # print () + # print (_args) + # print (e) pass finally: @@ -182,7 +182,7 @@ class Trainer(Learner): _args['gpu'] = self.gpu _args['real'] = _matrix _args['candidates'] = self.candidates - if self.logger : + if 'logger' in self.store : _args['logger'] = transport.factory.instance(**self.store['logger']) # # At this point we have the binary matrix, we can initiate training @@ -256,7 +256,7 @@ class Generator (Learner): _args['row_count'] = self._df.shape[0] if self.gpu : _args['gpu'] = self.gpu - if self.logger : + if 'logger' in self.store : _args['logger'] = transport.factory.instance(**self.store['logger']) gHandler = gan.Predict(**_args) gHandler.load_meta(columns=None) diff --git a/setup.py b/setup.py index ba52b61..7b06af8 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def 
read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.5.8", +args = {"name":"data-maker","version":"1.5.9", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] From 4398212cafec8c8454b25ea85a4d06c3d2f154b9 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 31 Aug 2022 17:34:43 -0500 Subject: [PATCH 241/250] bug fix --- data/maker/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index bc5d9cc..cdc48e2 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -77,10 +77,11 @@ class Learner(Process): try: _context = self.info['context'] _label = self.info['info'] if 'info' in self.info else _context - # logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider=transport.providers.CONSOLE,context='write',lock=True) + # logger = _args = dict({'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_label,**_args}) - if hasattr(self,'logger') : - self.logger.write(_args) + if 'logger' in self.store : + logger = transport.factory.instance(**self.store['logger']) if 'logger' in self.store else transport.factory.instance(provider=transport.providers.CONSOLE,context='write',lock=True) + logger.write(_args) self.ndx += 1 # if hasattr(logger,'close') : # logger.close() From 0efd4b13bc01fd7ebc0a000997f67c4e9defce38 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 16 Sep 2022 18:18:15 -0500 Subject: [PATCH 242/250] bug fix: crash with dataset & epochs --- README.md | 12 ++-- data/__init__.py | 1 + data/gan.py | 91 ++++++++++++++++++++++------ data/maker/__init__.py | 105 ++++++++++++++++++++++++++------- data/maker/prepare/__init__.py | 2 +- 5 files changed, 167 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index f3c92ed..32224c2 100644 --- a/README.md +++ b/README.md @@ -13,17 +13,19 @@ This package is designed to generate synthetic data from a dataset from an origi After installing the easiest way to get started is as follows (using pandas). The process is as follows: +Read about [data-transport on github](https://github.com/lnyemba/data-transport) or on [healthcareio.the-phi.com/git/code/transport](https://healthcareio.the-phi.com/git/code/transport.git) + **Train the GAN on the original/raw dataset** +1. We define the data sources + +The sources will consists in source, target and logger20. import pandas as pd import data.maker + import transport + from transport import providers - df = pd.read_csv('sample.csv') - column = 'gender' - id = 'id' - context = 'demo' - data.maker.train(context=context,data=df,column=column,id=id,logs='logs') The trainer will store the data on disk (for now) in a structured folder that will hold training models that will be used to generate the synthetic data. 
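[Editor's sketch, not part of the patch: the README fragment above introduces the source/target/logger definition, but the diff only shows the imports. Based solely on the fields these patches reference (store.source, store.target, store.logger, info.context, info.from, epochs, apply), a configuration could look roughly like the following; every provider, database and table name is a hypothetical placeholder, and the authoritative keys are those documented by data-maker and data-transport.]

    # hypothetical configuration handed to the data-maker factory/Trainer
    _args = {
        "apply": "train",                           # 'train', 'generate' or 'random' (see the apply class)
        "store": {
            "source": {"provider": "bigquery"},     # plus provider-specific connection settings
            "target": {"provider": "bigquery", "table": "people_io"},
            "logger": {"provider": "mongodb", "context": "write", "db": "logs", "doc": "maker"}
        },
        "info": {"context": "demo", "from": "people"},
        "epochs": 2
    }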
diff --git a/data/__init__.py b/data/__init__.py index 2b4a6aa..91b566d 100644 --- a/data/__init__.py +++ b/data/__init__.py @@ -3,3 +3,4 @@ from data.params import SYS_ARGS import transport from multiprocessing import Process, Queue from data.maker import prepare +from data.maker import state diff --git a/data/gan.py b/data/gan.py index 812426a..3727edb 100644 --- a/data/gan.py +++ b/data/gan.py @@ -100,6 +100,13 @@ class GNet : self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) + CHECKPOINT_SKIPS = 10 + if self.MAX_EPOCHS < 2*CHECKPOINT_SKIPS : + CHECKPOINT_SKIPS = 2 + self.CHECKPOINTS = np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist() + + + self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100 self.CONTEXT = args['context'] self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} @@ -120,14 +127,18 @@ class GNet : for key in ['train','output'] : self.mkdir(os.sep.join([self.log_dir,key])) self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT])) - if 'partition' in args : - self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT,str(args['partition'])])) - + # if 'partition' in args : + # self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT,str(args['partition'])])) self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) if 'partition' in args : self.train_dir = os.sep.join([self.train_dir,str(args['partition'])]) self.out_dir = os.sep.join([self.out_dir,str(args['partition'])]) + + for checkpoint in self.CHECKPOINTS : + self.mkdir (os.sep.join([self.train_dir,str(checkpoint)])) + self.mkdir (os.sep.join([self.out_dir,str(checkpoint)])) + # if self.logger : # We will clear the logs from the data-store @@ -150,12 +161,13 @@ class GNet : attr = json.loads((open(_name)).read()) for key in attr : value = attr[key] - setattr(self,key,value) + if not hasattr(self,key): + setattr(self,key,value) self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) - if 'partition' in args : - self.train_dir = os.sep.join([self.train_dir,str(args['partition'])]) - self.out_dir = os.sep.join([self.out_dir,str(args['partition'])]) + # if 'partition' in args : + # self.train_dir = os.sep.join([self.train_dir,str(args['partition'])]) + # self.out_dir = os.sep.join([self.out_dir,str(args['partition'])]) def log_meta(self,**args) : @@ -183,15 +195,24 @@ class GNet : suffix = self.CONTEXT #self.get.suffix() _name = os.sep.join([self.out_dir,'meta-'+suffix]) - f = open(_name+'.json','w') - f.write(json.dumps(_object)) + # f = open(_name+'.json','w') + # f.write(json.dumps(_object)) + # f.close() + + for _info in [{"name":os.sep.join([self.out_dir,'meta-'+suffix+'.json']),"data":_object},{"name":os.sep.join([self.out_dir,'epochs.json']),"data":self.logs['epochs'] if 'epochs' in self.logs else []}] : + f = open(_info['name'],'w') + f.write(json.dumps(_info['data'])) + f.close() return _object def mkdir (self,path): if not os.path.exists(path) : if os.sep in path : pass root = [] - for loc in path.split(os.sep) : + + for loc in path.strip().split(os.sep) : + if loc == '' : + root.append(os.sep) root.append(loc) if not os.path.exists(os.sep.join(root)) : 
os.mkdir(os.sep.join(root)) @@ -278,8 +299,10 @@ class Generator (GNet): tf.compat.v1.add_to_collection('glosses', loss) return loss, loss def load_meta(self, **args): - super().load_meta(**args) + # super().load_meta(**args) self.discriminator.load_meta(**args) + + def network(self,**args) : """ This function will build the network that will generate the synthetic candidates @@ -381,6 +404,7 @@ class Train (GNet): self.logger.write({"module":"gan-train","action":"start","input":{"partition":self.PARTITION,"meta":self.meta} } ) + # self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta) def load_meta(self, column): """ @@ -445,7 +469,7 @@ class Train (GNet): else : dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) # labels_placeholder = None - dataset = dataset.repeat(10000) + dataset = dataset.repeat(20000) dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) dataset = dataset.prefetch(1) @@ -472,9 +496,11 @@ class Train (GNet): if self._LABEL is not None : (real, label) = iterator.get_next() else: + real = iterator.get_next() label= None loss, w = self.loss(scope=scope, stage=stage, real=real, label=label) + #tf.get_variable_scope().reuse_variables() tf.compat.v1.get_variable_scope().reuse_variables() #vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage) @@ -507,6 +533,7 @@ class Train (GNet): # init = tf.global_variables_initializer() init = tf.compat.v1.global_variables_initializer() logs = [] + self.logs['epochs'] = [] #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: @@ -536,25 +563,41 @@ class Train (GNet): logs.append({"epoch": int(epoch),"distance":float(-w_sum/(self.STEPS_PER_EPOCH*2)) }) # if epoch % self.MAX_EPOCHS == 0: - if epoch in [5,10,20,50,75, self.MAX_EPOCHS] : + # if epoch in [5,10,20,50,75, self.MAX_EPOCHS] : + if epoch in self.CHECKPOINTS or int(epoch) == 1: # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] suffix = self.CONTEXT #self.get.suffix() - _name = os.sep.join([self.train_dir,suffix]) + _name = os.sep.join([self.train_dir,str(epoch),suffix]) # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) saver.save(sess, _name, write_meta_graph=False, global_step=epoch) + # # + + logs = [{"path":_name,"epochs":int(epoch),"loss":float(-w_sum/(self.STEPS_PER_EPOCH*2))}] if self.logger : - row = {"module":"gan-train","action":"logs","input":{"partition":self.PARTITION,"logs":logs}} #,"model":pickle.dump(sess)} - self.logger.write(row) - + # row = {"module":"gan-train","action":"epochs","input":{"logs":logs}} #,"model":pickle.dump(sess)} + # self.logger.write(row) + self.logs['epochs'] += logs # # @TODO: # We should upload the files in the checkpoint # This would allow the learnt model to be portable to another system # tf.compat.v1.reset_default_graph() - + # + # let's sort the epochs we've logged thus far (if any) + # + self.logs['epochs'].sort(key=lambda _item: _item['loss']) + if self.logger : + _log = {'module':'gan-train','action':'epochs','input':self.logs['epochs']} + self.logger.write(_log) + + # + # @TODO: + # Make another copy of this on disk to be able to load it should we not have a logger setup + # + self.log_meta() class Predict(GNet): """ This class uses synthetic data given a learned model @@ 
-565,6 +608,7 @@ class Predict(GNet): self.values = args['values'] self.ROW_COUNT = args['row_count'] self.oROW_COUNT = self.ROW_COUNT + # self.MISSING_VALUES = np.nan_to_num(np.nan) # if 'no_value' in args and args['no_value'] not in ['na','','NA'] : # self.MISSING_VALUES = args['no_value'] @@ -577,9 +621,20 @@ class Predict(GNet): super().load_meta(**args) self.generator.load_meta(**args) self.ROW_COUNT = self.oROW_COUNT + # + # updating the input/output for the generator, so it points properly + # + + for object in [self,self.generator] : + _train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT,str(self.MAX_EPOCHS)]) + _out_dir= os.sep.join([self.log_dir,'output',self.CONTEXT,str(self.MAX_EPOCHS)]) + setattr(object,'train_dir',_train_dir) + setattr(object,'out_dir',_out_dir) def apply(self,**args): suffix = self.CONTEXT #self.get.suffix() model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) + # model_dir = os.sep.join([self.train_dir,str(self.MAX_EPOCHS)]) + demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] # # setup computational graph diff --git a/data/maker/__init__.py b/data/maker/__init__.py index cdc48e2..dea44eb 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -15,6 +15,7 @@ import transport # from data.bridge import Binary import threading from data.maker import prepare +from data.maker.state import State import copy import os import nujson as json @@ -25,6 +26,7 @@ from multiprocessing import Queue import time + class Learner(Process): def __init__(self,**_args): @@ -48,7 +50,7 @@ class Learner(Process): if 'network_args' not in _args : self.network_args ={ 'context':self.info['context'] , - 'logs':_args['logpath'] if 'logpath' in _args else 'logs', + 'logs':_args['logs'] if 'logs' in _args else 'logs', 'max_epochs':int(_args['epochs']) if 'epochs' in _args else 2, 'batch_size':int (_args['batch']) if 'batch' in _args else 2000 } @@ -72,6 +74,36 @@ class Learner(Process): self.logger = None if 'logger' in self.store : self.logger = transport.factory.instance(**self.store['logger']) + self.autopilot = False #-- to be set by caller + self._initStateSpace() + def _initStateSpace(self): + """ + Initializing state-space for the data-maker, The state-space functions are used as pre-post processing functions applied to the data accordingly i.e + - Trainer -> pre-processing + - Generation -> post processing + The specifications of a state space in the configuration file is as such + state:{pre:{path,pipeline:[]}, post:{path,pipeline:[]}} + """ + self._states = None + + if 'state' in self.info : + try: + _config = self.info ['state'] + self._states = State.instance(_config) + except Exception as e: + print (e) + pass + finally: + # __info = (pd.DataFrame(self._states)[['name','path','args']]).to_dict(orient='records') + if self._states : + __info = {} + + for key in self._states : + __info[key] = [{"name":_item['name'],"args":_item['args'],"path":_item['path']} for _item in self._states[key]] + self.log(object='state-space',action='load',input=__info) + + + def log(self,**_args): try: @@ -108,11 +140,36 @@ class Learner(Process): _read_args= self.info if self._df is None : self._df = reader.read(**_read_args) + # + # NOTE : PRE + # At this point we apply pre-processing of the data if there were ever a need for it + # + _log = {} + HAS_STATES = self._states is not None and 'pre' in self._states + NOT_GENERATING = self.name in ['Trainer','Shuffle'] + IS_AUTOPILOT = 
self.autopilot + # + # allow calling pre-conditions if either of the conditions is true + # 1. states and not generating + # 2. IS_GENERATING and states and not autopilot + _ALLOW_PRE_CALL = (HAS_STATES and NOT_GENERATING) or (NOT_GENERATING is False and HAS_STATES and IS_AUTOPILOT is False) + if _ALLOW_PRE_CALL : + # if HAS_STATES and NOT_GENERATING or (HAS_STATES and IS_AUTOPILOT is False and NOT_GENERATING is False): + _logs = {'action':'status','input':{'pre':self._states['pre']}} + _beg = list(self._df.shape) + self._df = State.apply(self._df,self._states['pre']) + _end = list(self._df.shape) + _logs['input']['size'] = _beg,_end + self.log(**_log) + + # + # + columns = self.columns if self.columns else self._df.columns # # Below is a source of inefficiency, unfortunately python's type inference doesn't work well in certain cases # - The code below tries to address the issue (Perhaps better suited for the reading components) - _log = {} + for name in columns : # # randomly sampling 5 elements to make sense of data-types @@ -201,8 +258,14 @@ class Trainer(Learner): # @TODO: At this point we need to generate another some other objects # _args = {"network_args":self.network_args,"store":self.store,"info":self.info,"candidates":self.candidates,"data":self._df} + _args['logs'] = self.network_args['logs'] + _args['autopilot'] = self.autopilot if self.gpu : _args['gpu'] = self.gpu + + # + # Let us find the smallest, the item is sorted by loss ... + _args['epochs'] = gTrain.logs['epochs'][0]['epochs'] g = Generator(**_args) # g.run() @@ -239,6 +302,7 @@ class Generator (Learner): file.close() else: self._map = {} + self.autopilot = False if 'autopilot' not in _args else _args['autopilot'] def run(self): self.initalize() if self._encoder is None : @@ -416,33 +480,32 @@ class Generator (Learner): _df = self._df.copy() _df[self.columns] = _iodf[self.columns] N += _df.shape[0] - # - #@TODO: - # Improve formatting with better post-processing pipeline - if 'approximate' in self.info : - _df = self.approximate(_df) - if 'make_date' in self.info : - for name in self.info['make_date'] : - # iname = self.info['make_date']['init_field'] - iname = self.info['make_date'][name] + if self._states : + _df = State.apply(_df,self._states['post']) + # # + # #@TODO: + # # Improve formatting with better post-processing pipeline + # if 'approximate' in self.info : + # _df = self.approximate(_df) + # if 'make_date' in self.info : + # for name in self.info['make_date'] : + # # iname = self.info['make_date']['init_field'] + # iname = self.info['make_date'][name] - years = _df[iname] - _dates = [self.make_date(year=_year,field=name) for _year in years] - if _dates : - _df[name] = _dates + # years = _df[iname] + # _dates = [self.make_date(year=_year,field=name) for _year in years] + # if _dates : + # _df[name] = _dates _schema = self.get_schema() - # _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] + _df = self.format(_df,_schema) _log = [{"name":_schema[i]['name'],"dataframe":_df[_df.columns[i]].dtypes.name,"schema":_schema[i]['type']} for i in np.arange(len(_schema)) ] self.log(**{"action":"consolidate","input":_log}) - # w = transport.factory.instance(doc='observation',provider='mongodb',context='write',db='IOV01_LOGS',auth_file='/home/steve/dev/transport/mongo.json') - # w.write(_df) - # cols = [name for name in _df.columns if name.endswith('datetime')] - # print (_df[cols]) + if _store : writer = transport.factory.instance(**_store) if _store['provider'] == 'bigquery': @@ -507,8 +570,10 
@@ class factory : :param info {columns,sql,from} :param autopilot will generate output automatically :param batch (default 2k) size of the batch + """ + if _args['apply'] in [apply.RANDOM] : pthread = Shuffle(**_args) elif _args['apply'] == apply.GENERATE : diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index c8331bd..b11be57 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -276,7 +276,7 @@ class Input : if np.random.choice([0,1],1)[0] : novalues = _values[np.random.choice( len(_values),1)[0]].tolist() else: - novalues = np.repeat(None,len(self._columns)) + novalues = np.repeat(None,len(self._columns)) x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else novalues ,axis=1).tolist() return pd.DataFrame(x,columns=columns) From 936bd3ee0be7e01352e364a5dd91337e09cc797c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 16 Sep 2022 19:10:49 -0500 Subject: [PATCH 243/250] bug fix with model saving, and pre/post processing --- data/gan.py | 6 +- data/maker/__init__.py | 2 +- data/maker/apply.py | 76 +++++++++++++++++++++++ data/maker/state/__init__.py | 105 +++++++++++++++++++++++++++++++ data/maker/state/default.py | 116 +++++++++++++++++++++++++++++++++++ 5 files changed, 301 insertions(+), 4 deletions(-) create mode 100644 data/maker/apply.py create mode 100644 data/maker/state/__init__.py create mode 100644 data/maker/state/default.py diff --git a/data/gan.py b/data/gan.py index 3727edb..f864dbf 100644 --- a/data/gan.py +++ b/data/gan.py @@ -469,7 +469,7 @@ class Train (GNet): else : dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) # labels_placeholder = None - dataset = dataset.repeat(20000) + dataset = dataset.repeat(80000) dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) dataset = dataset.prefetch(1) @@ -564,12 +564,12 @@ class Train (GNet): # if epoch % self.MAX_EPOCHS == 0: # if epoch in [5,10,20,50,75, self.MAX_EPOCHS] : - if epoch in self.CHECKPOINTS or int(epoch) == 1: + if epoch in self.CHECKPOINTS : # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] suffix = self.CONTEXT #self.get.suffix() _name = os.sep.join([self.train_dir,str(epoch),suffix]) # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) - saver.save(sess, _name, write_meta_graph=False, global_step=epoch) + saver.save(sess, _name, write_meta_graph=False, global_step=np.int64(epoch)) # # diff --git a/data/maker/__init__.py b/data/maker/__init__.py index dea44eb..21b3017 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -265,7 +265,7 @@ class Trainer(Learner): # # Let us find the smallest, the item is sorted by loss ... - _args['epochs'] = gTrain.logs['epochs'][0]['epochs'] + _args['network_args']['max_epochs'] = gTrain.logs['epochs'][0]['epochs'] g = Generator(**_args) # g.run() diff --git a/data/maker/apply.py b/data/maker/apply.py new file mode 100644 index 0000000..bb6a085 --- /dev/null +++ b/data/maker/apply.py @@ -0,0 +1,76 @@ +""" +This file is designed to specify the appliction of pre/post-processing code. 
+ The pre-processing code gets applied after the data has been loaded + The post-processing code get applied after the data has been generated for instance: + -approximation code/logic; date shifting; suppression; adding noise + - +""" +import numpy as np +from datetime import datetime, timedelta +import time + +class Phase: + def __init__(self,**_args): + self._df = _args['data'] + self.callback = _args['callback'] + def apply(self,**_args): + """ + :param data data-frame + :param _info arguments needed to be applied + :param callback callback function once done + """ + raise Exception ("Function needs to be Implemented") +class Pre(Phase): + pass +class Post(Phase): + def __init__(self,**_args): + super().__init__(**_args) + pass + +class Date(Post): + def __init__(self,**_args): + super().__init__(**_args) + def make(self,**_args): + """ + This function generates a random date given a year and optionally a set of days from the randomly generated date + :param year initial value of a year + :param offset list of days between initial date + """ + if _args['year'] in ['',None,np.nan] : + return None + year = int(_args['year']) + + offset = _args['offset'] if 'offset' in _args else 0 + month = np.random.randint(1,13) + if month == 2: + _end = 28 if year % 4 != 0 else 29 + else: + _end = 31 if month in [1,3,5,7,8,10,12] else 30 + day = np.random.randint(1,_end) + + #-- synthetic date + _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0) + FORMAT = '%Y-%m-%d' if 'format' not in _args else _args['format'] + + + + # print ([_name,FORMAT, _date.strftime(FORMAT)]) + r = [] + if offset : + r = [_date.strftime(FORMAT)] + for _delta in offset : + _date = _date + timedelta(_delta) + r.append(_date.strptime(FORMAT)) + return r + else: + return _date.strftime(FORMAT) + + def apply(self,**_args): + """ + + """ + pass +class Approximate(Post): + def apply(**_args): + pass + def applyWithRange(**_args): diff --git a/data/maker/state/__init__.py b/data/maker/state/__init__.py new file mode 100644 index 0000000..adf9837 --- /dev/null +++ b/data/maker/state/__init__.py @@ -0,0 +1,105 @@ +""" +This file handles state-space of the data training/generation process i.e Upon specification of the pre/post conditiions +""" +""" +This file handles state-space of the data training/generation process i.e Upon specification of the pre/post conditions, +The specifications for this are as follows (within an entry of the configuration) + { + "state":{ + "pre":[{"approximate":{"field":"int"}},{"newdate":{"field":"format"}}],"post":[{"limit":10}] + } + } +""" +import importlib +import importlib.util +import sys +from datetime import datetime +from data.maker.state.default import * +import os + + +class State : + @staticmethod + def apply(_data,lpointers): + """ + This function applies a pipeline against a given data-frame, the calling code must decide whether it is a pre/post + :_data data-frame + :_lpointers functions modules returned by instance (module,_args) + """ + for _item in lpointers : + if _item is None : + continue + + pointer = _item['module'] + _args = _item['args'] + + _data = pointer(_data,_args) + return _data + @staticmethod + def instance(_args): + pre = [] + post=[] + + out = {} + for key in _args : + # + # If the item has a path property is should be ignored + path = _args[key]['path'] if 'path' in _args[key] else '' + out[key] = [ State._build(dict(_item,**{'path':path})) if 'path' not in _item else State._build(_item) for _item in _args[key]['pipeline']] + + return out + # if 'pre' 
in _args: + # path = _args['pre']['path'] if 'path' in _args['pre'] else '' + + # pre = [ State._build(dict(_item,**{'path':path})) for _item in _args['pre']['pipeline']] + # else: + # path = _args['post']['path'] if 'path' in _args['post'] else '' + + # post = [ State._build(dict(_item,**{'path':path})) for _item in _args['post']['pipeline']] + # return {'pre':pre,'post':post} + + @staticmethod + def _extract(_entry): + + _name = list(set(_entry.keys()) - set(['path']) ) + _name = _name[0] + path = _entry['path'] if 'path' in _entry and os.path.exists(_entry['path']) else '' + return {"module": _name,"args": _entry[_name],'name':_name,'path':path} + pass + @staticmethod + def _build(_args): + + _info = State._extract(_args) + # _info = dict(_args,**_info) + + _info['module'] = State._instance(_info) + return _info if _info['module'] is not None else None + + @staticmethod + def _instance(_args): + """ + :path optional path of the file on disk + :module name of the function + """ + + _name = _args['module'] + + if 'path' in _args and os.path.exists(_args['path']): + path= _args['path'] + + spec = importlib.util.spec_from_file_location(_name, path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + else: + # + # Probably calling a built-in module (should be in this file) + + module = sys.modules['data.maker.state.default'] + + return getattr(module,_name) if hasattr(module,_name) else None + +# +# Adding a few custom functions that should be able to help .... +# These functions can be called without specifying a path +# + diff --git a/data/maker/state/default.py b/data/maker/state/default.py new file mode 100644 index 0000000..75c2c4b --- /dev/null +++ b/data/maker/state/default.py @@ -0,0 +1,116 @@ +""" +This file contains default functions applied to a data-frame/dataset as pre/post processing jobs. 
+The functions are organized in a pipeline i.e the data will be applied to each function + +Custom functions : + functions must tak 2 arguments (_data,_args) : where _data is a data frame and _arg is a object describing the input parameters +""" +import pandas as pd +import numpy as np +from datetime import datetime, timedelta + + +def limit(_data,size): + """ + ...,{limit:size} + """ + + # size = int(_args['limit']) + return _data.iloc[:size] +def format(_data,_schema): + """ + This function enforces a schema against a data-frame, this may or may not work depending on the persistence storage + :_data data-frame containing all data + :_args schema to enforce the data, we are expecting the format as a list of {name,type,description} + """ + return _data + +def approximate(_data,_args): + """ + :_args Object of {field:type} + This function will approximate n-fields in the data given it's distribution + """ + _m = {'int':int,'float':float,'integer':int,'double':float} + columns = list(_args.keys()) + for _name in columns : + if _name not in _data : + continue + otype = _args[_name] + otype = str if otype not in _m else _m[otype] + _data.loc[:,_name] = np.random.uniform(_data[_name].values).astype(otype) + + return _data +def split_date(_data,_args): + """ + This function takes a field and applies the format from other fields + :_data data-frame + :_config configuration entry {column:{format,column:format,type}} + """ + _columns = list(_args.keys()) + _m = {'int':int,'float':float,'integer':int,'double':float} + for _name in _columns : + _iname = _args[_name]['column'] + _iformat = _args[_name]['format']['in'] + _oformat = _args[_name]['format']['out'] + _otype = str if 'type' not in _args[_name] else _args[_name]['type'] + _data.loc[:,_name] = _data[_iname].apply(lambda _date: datetime.strftime(datetime.strptime(str(_date),_iformat),_oformat)).astype(_otype) + return _data +def newdate(_data,_args): + """ + This function creates a new data on a given column from another + :_data data frame + :_args configuration column:{format,column} + """ + _columns = list(_args.keys()) + for _name in _columns : + + format = _args[_name]['format'] + ROW_COUNT = _data[_name].size + if 'column' in _args[_name] : + srcName = _args[_name]['column'] + years = _data[srcName].values + else: + years = np.random.choice(np.arange(datetime.now().year- 90,datetime.now().year),ROW_COUNT) + _data.loc[:,_name] = [ _makedate(year = years[_index],format = format) for _index in np.arange(ROW_COUNT)] + + return _data +def _makedate(**_args): + """ + This function creates a new date and applies it to a column + :_data data-frame with columns + :_args arguments for col1:format + """ + _columns = list(_args.keys()) + + # if _args['year'] in ['',None,np.nan] : + # year = np.random.choice(np.arange(1920,222),1) + # else: + # year = int(_args['year']) + year = int(_args['year']) + offset = _args['offset'] if 'offset' in _args else 0 + month = np.random.randint(1,13) + if month == 2: + _end = 28 if year % 4 != 0 else 29 + else: + _end = 31 if month in [1,3,5,7,8,10,12] else 30 + day = np.random.randint(1,_end) + + #-- synthetic date + _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0) + FORMAT = '%Y-%m-%d' + + if 'format' in _args: + FORMAT = _args['format'] + + + # print ([_name,FORMAT, _date.strftime(FORMAT)]) + r = [] + if offset : + r = [_date.strftime(FORMAT)] + for _delta in offset : + _date = _date + timedelta(_delta) + r.append(_date.strptime(FORMAT)) + return r + else: + return _date.strftime(FORMAT) + 
From 4be340ec082509d645ef8e05a8ae18848eafd589 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 16 Sep 2022 19:13:22 -0500 Subject: [PATCH 244/250] version update --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7b06af8..c28f366 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.5.9", +args = {"name":"data-maker","version":"1.6.0", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] From 209a7b8ee5c04f094efa8ef33841e8464fd3f52c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 16 Sep 2022 22:39:25 -0500 Subject: [PATCH 245/250] bug fix: checkpoints --- data/gan.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/data/gan.py b/data/gan.py index f864dbf..dae6ea0 100644 --- a/data/gan.py +++ b/data/gan.py @@ -103,7 +103,7 @@ class GNet : CHECKPOINT_SKIPS = 10 if self.MAX_EPOCHS < 2*CHECKPOINT_SKIPS : CHECKPOINT_SKIPS = 2 - self.CHECKPOINTS = np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist() + self.CHECKPOINTS = [1,self.MAX_EPOCHS] + np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist() @@ -529,7 +529,7 @@ class Train (GNet): train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d) train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g) # saver = tf.train.Saver() - saver = tf.compat.v1.train.Saver() + saver = tf.compat.v1.train.Saver(max_to_keep=len(self.CHECKPOINTS)) # init = tf.global_variables_initializer() init = tf.compat.v1.global_variables_initializer() logs = [] @@ -564,7 +564,7 @@ class Train (GNet): # if epoch % self.MAX_EPOCHS == 0: # if epoch in [5,10,20,50,75, self.MAX_EPOCHS] : - if epoch in self.CHECKPOINTS : + if epoch in self.CHECKPOINTS : # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] suffix = self.CONTEXT #self.get.suffix() _name = os.sep.join([self.train_dir,str(epoch),suffix]) @@ -587,7 +587,9 @@ class Train (GNet): tf.compat.v1.reset_default_graph() # # let's sort the epochs we've logged thus far (if any) + # Take on the last five checkpoints https://stackoverflow.com/questions/41018454/tensorflow-checkpoint-models-getting-deleted # + # self.logs['epochs'] = self.logs['epochs'][-5:] self.logs['epochs'].sort(key=lambda _item: _item['loss']) if self.logger : _log = {'module':'gan-train','action':'epochs','input':self.logs['epochs']} From 3b0903bd4af7073d37094b5db4f63ad6e60a9073 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 19 Sep 2022 13:10:28 -0500 Subject: [PATCH 246/250] minor bug fix --- data/maker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 21b3017..7f9c0f6 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -480,7 +480,7 @@ class Generator (Learner): _df = self._df.copy() _df[self.columns] = _iodf[self.columns] N += _df.shape[0] - if self._states : + if self._states and 'post' in 
self._states: _df = State.apply(_df,self._states['post']) # # # #@TODO: From ce594634e848a1956a5ff3dbd2c08a34028592de Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 11 Oct 2022 18:18:59 -0500 Subject: [PATCH 247/250] checkpoint enhancement --- data/gan.py | 49 ++++++++++++++++++++++-------------------- data/maker/__init__.py | 13 ++++++++--- setup.py | 4 ++-- 3 files changed, 38 insertions(+), 28 deletions(-) diff --git a/data/gan.py b/data/gan.py index dae6ea0..eaf5124 100644 --- a/data/gan.py +++ b/data/gan.py @@ -100,13 +100,12 @@ class GNet : self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) - CHECKPOINT_SKIPS = 10 - if self.MAX_EPOCHS < 2*CHECKPOINT_SKIPS : - CHECKPOINT_SKIPS = 2 - self.CHECKPOINTS = [1,self.MAX_EPOCHS] + np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist() - - - + CHECKPOINT_SKIPS = int(args['checkpoint_skips']) if 'checkpoint_skips' in args else int(self.MAX_EPOCHS/10) + # if self.MAX_EPOCHS < 2*CHECKPOINT_SKIPS : + # CHECKPOINT_SKIPS = 2 + # self.CHECKPOINTS = [1,self.MAX_EPOCHS] + np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist() + self.CHECKPOINTS = np.repeat(CHECKPOINT_SKIPS, self.MAX_EPOCHS/ CHECKPOINT_SKIPS).cumsum().astype(int).tolist() + self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100 self.CONTEXT = args['context'] self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} @@ -469,7 +468,7 @@ class Train (GNet): else : dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) # labels_placeholder = None - dataset = dataset.repeat(80000) + dataset = dataset.repeat(800000) dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) dataset = dataset.prefetch(1) @@ -560,39 +559,43 @@ class Train (GNet): print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) # print (dir (w_distance)) - logs.append({"epoch": int(epoch),"distance":float(-w_sum/(self.STEPS_PER_EPOCH*2)) }) - + # logs.append({"epoch": int(epoch),"distance":float(-w_sum/(self.STEPS_PER_EPOCH*2)) }) + + suffix = str(self.CONTEXT) + _name = os.sep.join([self.train_dir,str(epoch),suffix]) if epoch in self.CHECKPOINTS else '' + _logentry = {"path":_name,"epochs":int(epoch),"loss":float(-w_sum/(self.STEPS_PER_EPOCH*2))} # if epoch % self.MAX_EPOCHS == 0: # if epoch in [5,10,20,50,75, self.MAX_EPOCHS] : if epoch in self.CHECKPOINTS : # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - suffix = self.CONTEXT #self.get.suffix() - _name = os.sep.join([self.train_dir,str(epoch),suffix]) + # suffix = self.CONTEXT #self.get.suffix() + # _name = os.sep.join([self.train_dir,str(epoch),suffix]) # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) saver.save(sess, _name, write_meta_graph=False, global_step=np.int64(epoch)) # # - logs = [{"path":_name,"epochs":int(epoch),"loss":float(-w_sum/(self.STEPS_PER_EPOCH*2))}] - if self.logger : - # row = {"module":"gan-train","action":"epochs","input":{"logs":logs}} #,"model":pickle.dump(sess)} - # self.logger.write(row) - self.logs['epochs'] += logs - # - # @TODO: - # We should upload the files in the checkpoint - # This would allow the learnt model to be portable to another system 
+ # logs = [] + # if self.logger : + # # row = {"module":"gan-train","action":"epochs","input":{"logs":logs}} #,"model":pickle.dump(sess)} + # # self.logger.write(row) + # self.logs['epochs'] += logs + # # + # # @TODO: + # # We should upload the files in the checkpoint + # # This would allow the learnt model to be portable to another system # + self.logs['epochs'].append(_logentry) tf.compat.v1.reset_default_graph() # # let's sort the epochs we've logged thus far (if any) # Take on the last five checkpoints https://stackoverflow.com/questions/41018454/tensorflow-checkpoint-models-getting-deleted # # self.logs['epochs'] = self.logs['epochs'][-5:] - self.logs['epochs'].sort(key=lambda _item: _item['loss']) + if self.logger : - _log = {'module':'gan-train','action':'epochs','input':self.logs['epochs']} + _log = {'module':'gan-train','context':self.CONTEXT,'action':'epochs','input':self.logs['epochs']} self.logger.write(_log) # diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 7f9c0f6..fdf2305 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -226,7 +226,7 @@ class Trainer(Learner): self.autopilot = _args['autopilot'] if 'autopilot' in _args else False self.generate = None self.candidates = int(_args['candidates']) if 'candidates' in _args else 1 - + self.checkpoint_skips = _args['checkpoint_skips'] if 'checkpoint_skips' in _args else None def run(self): self.initalize() if self._encoder is None : @@ -242,6 +242,8 @@ class Trainer(Learner): _args['candidates'] = self.candidates if 'logger' in self.store : _args['logger'] = transport.factory.instance(**self.store['logger']) + if self.checkpoint_skips : + _args['checkpoint_skips'] = self.checkpoint_skips # # At this point we have the binary matrix, we can initiate training # @@ -264,8 +266,13 @@ class Trainer(Learner): _args['gpu'] = self.gpu # - # Let us find the smallest, the item is sorted by loss ... 
- _args['network_args']['max_epochs'] = gTrain.logs['epochs'][0]['epochs'] + # Let us find the smallest, the item is sorted by loss on disk + # + _epochs = [_e for _e in gTrain.logs['epochs'] if _e['path'] != ''] + _epochs.sort(key=lambda _item: _item['loss'],reverse=False) + + _args['network_args']['max_epochs'] = _epochs[0]['epochs'] + self.log(action='autopilot',input={'epoch':_epochs[0]}) g = Generator(**_args) # g.run() diff --git a/setup.py b/setup.py index c28f366..3a2aaba 100644 --- a/setup.py +++ b/setup.py @@ -4,10 +4,10 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.6.0", +args = {"name":"data-maker","version":"1.6.2", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} -args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow'] +args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' if sys.version_info[0] == 2 : From d469a4904fb5aaa090948ead3172c2d0eeb326f4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 9 Nov 2022 14:28:34 -0600 Subject: [PATCH 248/250] fixes with new features --- data/gan.py | 2 ++ setup.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index eaf5124..d2cc3ea 100644 --- a/data/gan.py +++ b/data/gan.py @@ -101,6 +101,8 @@ class GNet : self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) CHECKPOINT_SKIPS = int(args['checkpoint_skips']) if 'checkpoint_skips' in args else int(self.MAX_EPOCHS/10) + + CHECKPOINT_SKIPS = 1 if CHECKPOINT_SKIPS < 1 else CHECKPOINT_SKIPS # if self.MAX_EPOCHS < 2*CHECKPOINT_SKIPS : # CHECKPOINT_SKIPS = 2 # self.CHECKPOINTS = [1,self.MAX_EPOCHS] + np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist() diff --git a/setup.py b/setup.py index 3a2aaba..6327b10 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.6.2", +args = {"name":"data-maker","version":"1.6.3", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow'] From e196991c54d4207ab9c30507171748331d96c622 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 15 Nov 2022 11:01:11 -0600 Subject: [PATCH 249/250] plugin handling ... 
--- data/maker/__init__.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index fdf2305..7b3a347 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -589,4 +589,14 @@ class factory : pthread= Trainer(**_args) if 'start' in _args and _args['start'] == True : pthread.start() - return pthread \ No newline at end of file + return pthread + +class plugins: + @staticmethod + def load(_config): + """ + This function attempts to load the plugins to insure they are valid + _config configuration for plugin specifications {pre:{pipeline,path},post:{pipeline,path}} + """ + + From b2cf5ead53b51bfbe6dd331cf7f2c271605a0d0c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 15 Nov 2022 11:01:33 -0600 Subject: [PATCH 250/250] version # --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6327b10..8ad1b09 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.6.3", +args = {"name":"data-maker","version":"1.6.4", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow']
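
The plugins.load helper added in the plugin-handling patch above is left as an empty stub. What follows is a minimal sketch of one way it could validate the documented {pre:{pipeline,path},post:{pipeline,path}} specification by reusing State.instance from data/maker/state; it is an assumption about intent, not the author's implementation.

    # sketch only: not part of the patch series; assumes the State helpers
    # from data/maker/state/__init__.py are importable
    from data.maker.state import State

    def load(_config):
        """
        Resolve every pipeline entry to a callable, raising if any entry cannot be mapped
        """
        _resolved = State.instance(_config)            # {'pre':[...], 'post':[...]}
        for _key in _resolved :
            if any([_item is None for _item in _resolved[_key]]) :
                raise ValueError("unresolvable entry in the '"+_key+"' pipeline")
        return _resolved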