From e4b2f943970faa4197427d5d6ef8f6e3a7faaae3 Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Thu, 12 Dec 2019 11:42:17 -0600
Subject: [PATCH 01/16] making the installation stuff

---
 README.md => data-maker/README.md | 0
 WGAN.py => data-maker/WGAN.py     | 0
 bridge.py => data-maker/bridge.py | 0
 gan.py => data-maker/gan.py       | 0
 params.py => data-maker/params.py | 0
 5 files changed, 0 insertions(+), 0 deletions(-)
 rename README.md => data-maker/README.md (100%)
 rename WGAN.py => data-maker/WGAN.py (100%)
 rename bridge.py => data-maker/bridge.py (100%)
 rename gan.py => data-maker/gan.py (100%)
 rename params.py => data-maker/params.py (100%)

diff --git a/README.md b/data-maker/README.md
similarity index 100%
rename from README.md
rename to data-maker/README.md
diff --git a/WGAN.py b/data-maker/WGAN.py
similarity index 100%
rename from WGAN.py
rename to data-maker/WGAN.py
diff --git a/bridge.py b/data-maker/bridge.py
similarity index 100%
rename from bridge.py
rename to data-maker/bridge.py
diff --git a/gan.py b/data-maker/gan.py
similarity index 100%
rename from gan.py
rename to data-maker/gan.py
diff --git a/params.py b/data-maker/params.py
similarity index 100%
rename from params.py
rename to data-maker/params.py

From bba3b94a308808c9baa8c2bedeac2c40c4faec32 Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Thu, 12 Dec 2019 11:43:08 -0600
Subject: [PATCH 02/16] structuring the repository

---
 Dockerfile                        |  2 +-
 data-maker/README.md => README.md |  0
 setup.py                          | 13 +++++++++++++
 3 files changed, 14 insertions(+), 1 deletion(-)
 rename data-maker/README.md => README.md (100%)
 create mode 100644 setup.py

diff --git a/Dockerfile b/Dockerfile
index dd02d11..f308336 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,4 +5,4 @@ RUN ["apt-get","install","-y","git", "python3-dev","tmux","locales","python3-pip
 RUN ["pip3","install","pandas-gbq","tensorflow"]
 RUN ["mkdir","-p","/usr/apps"]
 WORKDIR /usr/apps
-RUN ["git","clone","https://hiplab.mc.vanderbilt.edu/git/gan.git","aou-gan"]
+RUN ["git","clone","https://hiplab.mc.vanderbilt.edu/git/aou/gan.git@release","aou-gan"]
diff --git a/data-maker/README.md b/README.md
similarity index 100%
rename from data-maker/README.md
rename to README.md
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..daa8f9e
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,13 @@
+from setuptools import setup, find_packages
+import os
+import sys
+
+def read(fname):
+    return open(os.path.join(os.path.dirname(__file__), fname)).read() 
+args = {"name":"data-maker","version":"1.0.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT","packages":["edi"],"keywords":["healthcare","edi","x12","data","transport","protocol"]}
+args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','numpy','pandas','pandas-gbq','pymongo']
+args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/gan.git'
+if sys.version_info[0] == 2 :
+    args['use_2to3'] = False
+    args['use_2to3_exclude_fixers'] = ['lib2to3.fixes.fix_import']
+setup(**args)

From 99bc98aba59d64c591e9073450ed74c64a7c442f Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Thu, 12 Dec 2019 11:49:06 -0600
Subject: [PATCH 03/16] bug fix

---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index daa8f9e..afbd3f0 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,8 @@ import sys
 
 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read() 
-args = {"name":"data-maker","version":"1.0.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT","packages":["edi"],"keywords":["healthcare","edi","x12","data","transport","protocol"]}
+args = {"name":"data-maker","version":"1.0.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
+        "packages":["data-maker"],"keywords":["healthcare","edi","x12","data","transport","protocol"]}
 args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','numpy','pandas','pandas-gbq','pymongo']
 args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/gan.git'
 if sys.version_info[0] == 2 :

From 8a5307c242b47cf93eda593e2811af86ff4c32b8 Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Thu, 12 Dec 2019 12:13:31 -0600
Subject: [PATCH 04/16] bug fix, and documentation

---
 Dockerfile |  4 ++--
 README.md  | 49 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index f308336..d489f50 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@ from ubuntu
 RUN ["apt-get","update"]
 RUN ["apt-get","upgrade","-y"]
 RUN ["apt-get","install","-y","git", "python3-dev","tmux","locales","python3-pip","python3-numpy","python3-pandas","locales"]
-RUN ["pip3","install","pandas-gbq","tensorflow"]
+RUN ["pip3","install","pandas-gbq","tensorflow","git+https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git@release"]
 RUN ["mkdir","-p","/usr/apps"]
 WORKDIR /usr/apps
-RUN ["git","clone","https://hiplab.mc.vanderbilt.edu/git/aou/gan.git@release","aou-gan"]
+RUN ["git","clone","--branch","release","https://hiplab.mc.vanderbilt.edu/git/aou/bridge.git","aou-gan"]
diff --git a/README.md b/README.md
index 8eb92d1..48b01aa 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,49 @@
-# bridge
+## Introduction
+---
+
+This package is designed to generate synthetic data from a dataset from an original dataset using deep learning techniques
+
+    - Generative Adversarial Networks
+    - With "Earth mover's distance"
+
+## Installation
+---
+
+    pip install git+https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git@release
+
+## Usage
+---
+
+After installing the easiest way to get started is as follows (using pandas). The process is as follows:
+1. Train the GAN on the original/raw dataset
+
+
+        import pandas as pd
+        import data.maker
+        
+        df  = pd.read_csv('myfile.csv')
+        cols= ['f1','f2','f2']  
+        data.maker.train(data=df,cols=cols,logs='logs')
+
+2. Generate a candidate dataset from the learnt features
+
+
+    import pandas as pd
+    import data.maker
+
+    df = data.maker.generate(logs='logs')
+    df.head()
+    
+
+## Limitations
+---
+
+GANS will generate data assuming the original data has all the value space needed:
+    
+- No new data will be created
+    
+        Assuming we have a dataset with an gender attribute with values [M,F]. The synthetic data will not be able to generate genders outside [M,F]
+- Not advised on continuous values
+
+        GANS work well on discrete values and thus are not advised to be used to synthesize things like measurements (height, blood pressure, ...)
 

From caab8800dd09bfe54314dd60e1c83e0abdeb110a Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Thu, 12 Dec 2019 12:15:05 -0600
Subject: [PATCH 05/16] documentation/layout

---
 README.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 48b01aa..9589ddd 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ After installing the easiest way to get started is as follows (using pandas). Th
 
     df = data.maker.generate(logs='logs')
     df.head()
-    
+
 
 ## Limitations
 ---
@@ -42,8 +42,10 @@ GANS will generate data assuming the original data has all the value space neede
     
 - No new data will be created
     
-        Assuming we have a dataset with an gender attribute with values [M,F]. The synthetic data will not be able to generate genders outside [M,F]
+        Assuming we have a dataset with an gender attribute with values [M,F]. 
+        The synthetic data will not be able to generate genders outside [M,F]
 - Not advised on continuous values
 
-        GANS work well on discrete values and thus are not advised to be used to synthesize things like measurements (height, blood pressure, ...)
+        GANS work well on discrete values and thus are not advised to be used.
+        e.g:measurements (height, blood pressure, ...)
 

From 363bd376e178e2019f99f17b55a8268be8c77d00 Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Thu, 12 Dec 2019 12:16:52 -0600
Subject: [PATCH 06/16] credits

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index 9589ddd..411d297 100644
--- a/README.md
+++ b/README.md
@@ -49,3 +49,9 @@ GANS will generate data assuming the original data has all the value space neede
         GANS work well on discrete values and thus are not advised to be used.
         e.g:measurements (height, blood pressure, ...)
 
+## Credits :
+---
+
+- [Ziqi Zhang](ziqi.zhang@vanderbilt.edu)
+- [Brad Malin](b.malin@vanderbilt.edu)
+- [Steve L. Nyemba](steve.l.nyemba@vanderbilt.edu)
\ No newline at end of file

From 3750d8b40860585357d323a44357a9be96bb5942 Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Thu, 12 Dec 2019 12:17:38 -0600
Subject: [PATCH 07/16] README: remove blank line under Usage separator

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index 411d297..141502d 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,6 @@ This package is designed to generate synthetic data from a dataset from an origi
 
 ## Usage
 ---
-
 After installing the easiest way to get started is as follows (using pandas). The process is as follows:
 1. Train the GAN on the original/raw dataset
 

From 766ef9694e8d7d85d5a471e74d7bb28349136907 Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Thu, 12 Dec 2019 12:19:37 -0600
Subject: [PATCH 08/16] doc

---
 README.md | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 141502d..8475923 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,4 @@
 ## Introduction
----
 
 This package is designed to generate synthetic data from a dataset from an original dataset using deep learning techniques
 
@@ -7,12 +6,11 @@ This package is designed to generate synthetic data from a dataset from an origi
     - With "Earth mover's distance"
 
 ## Installation
----
 
     pip install git+https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git@release
 
 ## Usage
----
+
 After installing the easiest way to get started is as follows (using pandas). The process is as follows:
 1. Train the GAN on the original/raw dataset
 
@@ -35,7 +33,6 @@ After installing the easiest way to get started is as follows (using pandas). Th
 
 
 ## Limitations
----
 
 GANS will generate data assuming the original data has all the value space needed:
     
@@ -49,7 +46,6 @@ GANS will generate data assuming the original data has all the value space neede
         e.g:measurements (height, blood pressure, ...)
 
 ## Credits :
----
 
 - [Ziqi Zhang](ziqi.zhang@vanderbilt.edu)
 - [Brad Malin](b.malin@vanderbilt.edu)

From 7f3748121ca8e4e7584096d51f5569a844d1d751 Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Tue, 31 Dec 2019 23:26:42 -0600
Subject: [PATCH 09/16] bug fix and refactoring with documentation

---
 {data-maker => data}/WGAN.py   | 0
 {data-maker => data}/bridge.py | 0
 {data-maker => data}/gan.py    | 0
 {data-maker => data}/params.py | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 rename {data-maker => data}/WGAN.py (100%)
 rename {data-maker => data}/bridge.py (100%)
 rename {data-maker => data}/gan.py (100%)
 rename {data-maker => data}/params.py (100%)

diff --git a/data-maker/WGAN.py b/data/WGAN.py
similarity index 100%
rename from data-maker/WGAN.py
rename to data/WGAN.py
diff --git a/data-maker/bridge.py b/data/bridge.py
similarity index 100%
rename from data-maker/bridge.py
rename to data/bridge.py
diff --git a/data-maker/gan.py b/data/gan.py
similarity index 100%
rename from data-maker/gan.py
rename to data/gan.py
diff --git a/data-maker/params.py b/data/params.py
similarity index 100%
rename from data-maker/params.py
rename to data/params.py

From 685c5676616449e78089c1d9d23099045f1636fa Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Tue, 31 Dec 2019 23:27:53 -0600
Subject: [PATCH 10/16] fixes with the framework - only supports single feature

---
 README.md              | 29 +++++++++++-------
 data/__init__.py       |  1 +
 data/gan.py            | 55 +++++++++++++++++++++++-----------
 data/maker/__init__.py | 68 ++++++++++++++++++++++++++++++++++++++++++
 setup.py               |  2 +-
 5 files changed, 125 insertions(+), 30 deletions(-)
 create mode 100644 data/__init__.py
 create mode 100644 data/maker/__init__.py

diff --git a/README.md b/README.md
index 8475923..b42b1f7 100644
--- a/README.md
+++ b/README.md
@@ -15,22 +15,29 @@ After installing the easiest way to get started is as follows (using pandas). Th
 1. Train the GAN on the original/raw dataset
 
 
-        import pandas as pd
-        import data.maker
-        
-        df  = pd.read_csv('myfile.csv')
-        cols= ['f1','f2','f2']  
-        data.maker.train(data=df,cols=cols,logs='logs')
+import pandas as pd
+import data.maker
 
-2. Generate a candidate dataset from the learnt features
+df      = pd.read_csv('sample.csv')
+column  = 'gender'
+id      = 'id' 
+context = 'demo'
+data.maker.train(context=context,data=df,column=column,id=id,logs='logs')
+
+The trainer will store the data on disk (for now) in a structured folder that will hold training models that will be used to generate the synthetic data.
 
 
-    import pandas as pd
-    import data.maker
+2. Generate a candidate dataset from the learnt features
+
 
-    df = data.maker.generate(logs='logs')
-    df.head()
+import pandas as pd
+import data.maker
 
+df  = pd.read_csv('sample.csv')
+id  = 'id'
+column = 'gender'
+context = 'demo'
+data.maker.generate(data=df,id=id,column=column,logs='logs')
 
 ## Limitations
 
diff --git a/data/__init__.py b/data/__init__.py
new file mode 100644
index 0000000..88bbded
--- /dev/null
+++ b/data/__init__.py
@@ -0,0 +1 @@
+import data.params as params
diff --git a/data/gan.py b/data/gan.py
index 0981411..e349018 100644
--- a/data/gan.py
+++ b/data/gan.py
@@ -11,8 +11,8 @@ import pandas as pd
 import time
 import os
 import sys
-from params import SYS_ARGS
-from bridge import Binary
+from data.params import SYS_ARGS
+from data.bridge import Binary
 import json
 
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
@@ -37,8 +37,6 @@ class GNet :
         self.layers = void()
         self.layers.normalize = self.normalize
 
-        self.get = void()
-        self.get.variables = self._variable_on_cpu
 
         self.NUM_GPUS = 1
        
@@ -63,7 +61,11 @@ class GNet :
         self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None}
         self._REAL = args['real'] if 'real' in args else None
         self._LABEL = args['label'] if 'label' in args else None
-            
+
+        self.get = void()
+        self.get.variables = self._variable_on_cpu
+
+        self.get.suffix = lambda : "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
         self.init_logs(**args)
 
     def init_logs(self,**args):
@@ -83,7 +85,9 @@ class GNet :
         This function is designed to accomodate the uses of the sub-classes outside of a strict dependency model.
         Because prediction and training can happen independently
         """
-        _name = os.sep.join([self.out_dir,'meta-'+column+'.json'])
+        # suffix = "-".join(column) if isinstance(column,list)else column
+        suffix = self.get.suffix()
+        _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json'])
         if os.path.exists(_name) :
             attr = json.loads((open(_name)).read())
             for key in attr :
@@ -111,7 +115,10 @@ class GNet :
             key = args['key']
             value= args['value']
             object[key] = value
-        _name = os.sep.join([self.out_dir,'meta-'+SYS_ARGS['column']])
+        # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column
+        suffix = self.get.suffix()
+        _name = os.sep.join([self.out_dir,'meta-'+suffix])
+        
         f = open(_name+'.json','w')
         f.write(json.dumps(object))
     def mkdir (self,path):
@@ -285,7 +292,9 @@ class Train (GNet):
         self.discriminator = Discriminator(**args)
         self._REAL = args['real']
         self._LABEL= args['label']
+        self.column = args['column']
         # print ([" *** ",self.BATCHSIZE_PER_GPU])
+        
         self.log_meta()
     def load_meta(self, column):
         """
@@ -407,8 +416,9 @@ class Train (GNet):
                     format_str = 'epoch: %d, w_distance = %f (%.1f)'
                     print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration))
                     if epoch % self.MAX_EPOCHS == 0:
-                        
-                        _name  = os.sep.join([self.train_dir,self.ATTRIBUTES['synthetic']])
+                        # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
+                        suffix = self.get.suffix()
+                        _name  = os.sep.join([self.train_dir,suffix])
                         # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch)
                         saver.save(sess, _name, write_meta_graph=False, global_step=epoch)
                         #
@@ -420,14 +430,16 @@ class Predict(GNet):
     """
     def __init__(self,**args):
         GNet.__init__(self,**args)        
-        self.generator = Generator(**args)
-        self.values  = values
+        self.generator = Generator(**args)        
+        self.values  = args['values']
     def load_meta(self, column):
         super().load_meta(column)
         self.generator.load_meta(column)
     def apply(self,**args):
         # print (self.train_dir)
-        model_dir = os.sep.join([self.train_dir,self.ATTRIBUTES['synthetic']+'-'+str(self.MAX_EPOCHS)])
+        # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
+        suffix = self.get.suffix()
+        model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)])
         demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo']
         tf.compat.v1.reset_default_graph()
         z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM])
@@ -450,19 +462,24 @@ class Predict(GNet):
             # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes
             #
 
-            df =  ( pd.DataFrame(np.round(f).astype(np.int32),columns=values))
+            df =  ( pd.DataFrame(np.round(f).astype(np.int32)))
             # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms
             # df = (i * df).sum(axis=1)
             #
             # In case we are dealing with actual values like diagnosis codes we can perform 
             #
-            r = np.zeros((self.ROW_COUNT,1))
+            columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']]
+            
+            r = np.zeros((self.ROW_COUNT,len(columns)))
             for col in df :
                 i = np.where(df[col])[0]
                 r[i] = col
-            df = pd.DataFrame(r,columns=[self.ATTRIBUTES['synthetic']])
             
-            return df.to_dict(orient='list')
+            df = pd.DataFrame(r,columns=columns)
+            
+            df[df.columns] = (df.apply(lambda value: self.values[ int(value)],axis=1))
+            return df.to_dict(orient='list')
+            # return df.to_dict(orient='list')
             # count = str(len(os.listdir(self.out_dir)))
             # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv'])
             # df.to_csv(_name,index=False)
@@ -476,7 +493,7 @@ class Predict(GNet):
             #         idx2 = (demo[:, n] == 1)
             #         idx = [idx1[j] and idx2[j] for j in range(len(idx1))]
             #         num = np.sum(idx)
-            #         print ("_____________________")
+            #         print ("___________________list__")
             #         print (idx1)
             #         print (idx2)
             #         print (idx)
@@ -531,7 +548,8 @@ if __name__ == '__main__' :
     elif 'generate' in SYS_ARGS:
         values = df[column].unique().tolist()
         values.sort()
-        p = Predict(context=context,label=LABEL,values=values)
+        
+        p = Predict(context=context,label=LABEL,values=values,column=column)
         p.load_meta(column)
         r = p.apply()
         print (df)
@@ -539,6 +557,7 @@ if __name__ == '__main__' :
         df[column] = r[column]
         print (df)
         
+        
     else:
         print (SYS_ARGS.keys())
         print (__doc__)
diff --git a/data/maker/__init__.py b/data/maker/__init__.py
new file mode 100644
index 0000000..7a441f8
--- /dev/null
+++ b/data/maker/__init__.py
@@ -0,0 +1,68 @@
+"""
+(c) 2019 Data Maker, hiplab.mc.vanderbilt.edu
+version 1.0.0
+
+This package serves as a proxy to the overall usage of the framework.
+This package is designed to generate synthetic data from an original dataset using deep learning techniques
+
+@TODO:
+    - Make configurable GPU, EPOCHS
+"""
+import pandas as pd
+import numpy as np
+from data import gan
+
+def train (**args) :
+    """
+    This function is intended to train the GAN in order to learn about the distribution of the features
+    :column     columns that need to be synthesized (discrete)
+    :logs       where the output of the training is stored (location on disk)
+    :id         identifier of the dataset
+    :data       data-frame to be synthesized
+    :context    label of what we are synthesizing
+    """
+    column = args['column']
+    
+    column_id  = args['id']
+    df  = args['data']
+    logs    = args['logs']
+    real    = pd.get_dummies(df[column]).astype(np.float32).values
+    
+    labels  = pd.get_dummies(df[column_id]).astype(np.float32).values
+    max_epochs = 10
+    context = args['context']
+    trainer = gan.Train(context=context,max_epochs=max_epochs,real=real,label=labels,column=column,column_id=column_id)
+    return trainer.apply()
+
+def generate(**args):
+    """
+    This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset
+    @return pandas.DataFrame
+
+    :data   data-frame to be synthesized
+    :column   columns that need to be synthesized (discrete)
+    :id     column identifying an entity
+    :logs   location on disk where the learnt knowledge of the dataset is
+    """
+    df      = args['data']
+    
+    column      = args['column'] 
+    column_id   = args['id']
+    logs        = args['logs']
+    context = args['context']
+    #
+    #@TODO:
+    #   If the identifier is not present, we should find a way to determine or make one
+    #
+    #ocolumns= list(set(df.columns.tolist())- set(columns))
+    
+    values = df[column].unique().tolist()
+    values.sort()
+
+    labels = pd.get_dummies(df[column_id]).astype(np.float32).values
+    handler = gan.Predict (context=context,label=labels,values=values,column=column)
+    handler.load_meta(column)
+    r =  handler.apply()
+    _df = df.copy()
+    _df[column] = r[column]
+    return _df
diff --git a/setup.py b/setup.py
index afbd3f0..c7dc48f 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ import sys
 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read() 
 args = {"name":"data-maker","version":"1.0.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
-        "packages":["data-maker"],"keywords":["healthcare","edi","x12","data","transport","protocol"]}
+        "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
 args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','numpy','pandas','pandas-gbq','pymongo']
 args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/gan.git'
 if sys.version_info[0] == 2 :

From df47ed4cb2b7e1d05f86b20899244f2350bac05c Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Tue, 31 Dec 2019 23:34:04 -0600
Subject: [PATCH 11/16] documentation

---
 README.md | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index b42b1f7..f5c5e5d 100644
--- a/README.md
+++ b/README.md
@@ -12,32 +12,33 @@ This package is designed to generate synthetic data from a dataset from an origi
 ## Usage
 
 After installing the easiest way to get started is as follows (using pandas). The process is as follows:
-1. Train the GAN on the original/raw dataset
 
+**Train the GAN on the original/raw dataset**
 
-import pandas as pd
-import data.maker
 
-df      = pd.read_csv('sample.csv')
-column  = 'gender'
-id      = 'id' 
-context = 'demo'
-data.maker.train(context=context,data=df,column=column,id=id,logs='logs')
+    import pandas as pd
+    import data.maker
+
+    df      = pd.read_csv('sample.csv')
+    column  = 'gender'
+    id      = 'id' 
+    context = 'demo'
+    data.maker.train(context=context,data=df,column=column,id=id,logs='logs')
 
 The trainer will store the data on disk (for now) in a structured folder that will hold training models that will be used to generate the synthetic data.
 
 
-2. Generate a candidate dataset from the learnt features
+**Generate a candidate dataset from the learned features**
 
 
-import pandas as pd
-import data.maker
+        import pandas as pd
+        import data.maker
 
-df  = pd.read_csv('sample.csv')
-id  = 'id'
-column = 'gender'
-context = 'demo'
-data.maker.generate(data=df,id=id,column=column,logs='logs')
+        df  = pd.read_csv('sample.csv')
+        id  = 'id'
+        column = 'gender'
+        context = 'demo'
+        data.maker.generate(data=df,id=id,column=column,logs='logs')
 
 ## Limitations
 
@@ -46,11 +47,14 @@ GANS will generate data assuming the original data has all the value space neede
 - No new data will be created
     
         Assuming we have a dataset with an gender attribute with values [M,F]. 
+            
         The synthetic data will not be able to generate genders outside [M,F]
+        
 - Not advised on continuous values
 
         GANS work well on discrete values and thus are not advised to be used.
         e.g:measurements (height, blood pressure, ...)
+- For now will only perform on a single feature.
 
 ## Credits :
 

From ffc4a8a191cdcf88c0ac25a64dd509fa5797f203 Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Tue, 31 Dec 2019 23:38:52 -0600
Subject: [PATCH 12/16] layout issue

---
 README.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index f5c5e5d..46d2425 100644
--- a/README.md
+++ b/README.md
@@ -31,14 +31,14 @@ The trainer will store the data on disk (for now) in a structured folder that wi
 **Generate a candidate dataset from the learned features**
 
 
-        import pandas as pd
-        import data.maker
+    import pandas as pd
+    import data.maker
 
-        df  = pd.read_csv('sample.csv')
-        id  = 'id'
-        column = 'gender'
-        context = 'demo'
-        data.maker.generate(data=df,id=id,column=column,logs='logs')
+    df  = pd.read_csv('sample.csv')
+    id  = 'id'
+    column = 'gender'
+    context = 'demo'
+    data.maker.generate(data=df,id=id,column=column,logs='logs')
 
 ## Limitations
 
@@ -49,7 +49,7 @@ GANS will generate data assuming the original data has all the value space neede
         Assuming we have a dataset with an gender attribute with values [M,F]. 
             
         The synthetic data will not be able to generate genders outside [M,F]
-        
+
 - Not advised on continuous values
 
         GANS work well on discrete values and thus are not advised to be used.

From 65a3e84c8f584df71738d70820726084e4970e1e Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Tue, 31 Dec 2019 23:39:54 -0600
Subject: [PATCH 13/16] documentation usage

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 46d2425..3dfb291 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ The trainer will store the data on disk (for now) in a structured folder that wi
     id  = 'id'
     column = 'gender'
     context = 'demo'
-    data.maker.generate(data=df,id=id,column=column,logs='logs')
+    data.maker.generate(context=context,data=df,id=id,column=column,logs='logs')
 
 ## Limitations
 

From 6de816fc5053386e961af7d92a06367eb0bd57e3 Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Fri, 3 Jan 2020 21:47:05 -0600
Subject: [PATCH 14/16] bug fixes with operations

---
 data/gan.py            | 22 ++++++++++++++++------
 data/maker/__init__.py | 27 +++++++++++++++++----------
 2 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/data/gan.py b/data/gan.py
index e349018..3391b78 100644
--- a/data/gan.py
+++ b/data/gan.py
@@ -14,6 +14,7 @@ import sys
 from data.params import SYS_ARGS
 from data.bridge import Binary
 import json
+import pickle
 
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ['CUDA_VISIBLE_DEVICES'] = "0"
@@ -38,7 +39,7 @@ class GNet :
         self.layers.normalize = self.normalize
 
 
-        self.NUM_GPUS = 1
+        self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu']
        
 
         self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854
@@ -64,8 +65,8 @@ class GNet :
 
         self.get = void()
         self.get.variables = self._variable_on_cpu
-
         self.get.suffix = lambda : "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
+        self.logger = args['logger'] if 'logger' in args and args['logger'] else None
         self.init_logs(**args)
 
     def init_logs(self,**args):
@@ -98,7 +99,7 @@ class GNet :
                 
             
     def log_meta(self,**args) :
-        object = {
+        _object = {
             'CONTEXT':self.CONTEXT,
             'ATTRIBUTES':self.ATTRIBUTES,
             'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU,
@@ -120,7 +121,8 @@ class GNet :
         _name = os.sep.join([self.out_dir,'meta-'+suffix])
         
         f = open(_name+'.json','w')
-        f.write(json.dumps(object))
+        f.write(json.dumps(_object))
+        return _object
     def mkdir (self,path):
         if not os.path.exists(path) :
             os.mkdir(path)        
@@ -295,7 +297,7 @@ class Train (GNet):
         self.column = args['column']
         # print ([" *** ",self.BATCHSIZE_PER_GPU])
         
-        self.log_meta()
+        self.meta = self.log_meta()
     def load_meta(self, column):
         """
         This function will delegate the calls to load meta data to it's dependents
@@ -393,7 +395,7 @@ class Train (GNet):
             # saver = tf.train.Saver()
             saver   = tf.compat.v1.train.Saver()
             init    = tf.global_variables_initializer()
-
+            logs = []
             with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:
                 sess.run(init)
                 sess.run(iterator_d.initializer,
@@ -415,6 +417,10 @@ class Train (GNet):
 
                     format_str = 'epoch: %d, w_distance = %f (%.1f)'
                     print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration))
+                    # print (dir (w_distance))
+
+                    logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) })
+
                     if epoch % self.MAX_EPOCHS == 0:
                         # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
                         suffix = self.get.suffix()
@@ -423,6 +429,10 @@ class Train (GNet):
                         saver.save(sess, _name, write_meta_graph=False, global_step=epoch)
                         #
                         #
+                        if self.logger :
+                            row = {"logs":logs} #,"model":pickle.dump(sess)}
+                            
+                            self.logger.write(row=row)
 
 class Predict(GNet):
     """
diff --git a/data/maker/__init__.py b/data/maker/__init__.py
index 7a441f8..075bfd3 100644
--- a/data/maker/__init__.py
+++ b/data/maker/__init__.py
@@ -11,7 +11,7 @@ This package is designed to generate synthetic data from a dataset from an origi
 import pandas as pd
 import numpy as np
 from data import gan
-
+from transport import factory
 def train (**args) :
     """
     This function is intended to train the GAN in order to learn about the distribution of the features
@@ -21,17 +21,24 @@ def train (**args) :
     :data       data-frame to be synthesized
     :context    label of what we are synthesizing
     """
-    column = args['column']
+    column      = args['column']
     
-    column_id  = args['id']
-    df  = args['data']
-    logs    = args['logs']
-    real    = pd.get_dummies(df[column]).astype(np.float32).values
+    column_id   = args['id']
+    df          = args['data']
+    logs        = args['logs']
+    real        = pd.get_dummies(df[column]).astype(np.float32).values
+    labels      = pd.get_dummies(df[column_id]).astype(np.float32).values
     
-    labels  = pd.get_dummies(df[column_id]).astype(np.float32).values
-    max_epochs = 10
-    context = args['context']
-    trainer = gan.Train(context=context,max_epochs=max_epochs,real=real,label=labels,column=column,column_id=column_id)
+    max_epochs  = 10 if 'max_epochs' not in args else args['max_epochs']
+    context     = args['context']
+    if 'store' in args :
+        args['store']['args']['doc'] = context
+        logger = factory.instance(**args['store'])
+        
+    else:
+        logger = None
+        
+    trainer     = gan.Train(context=context,max_epochs=max_epochs,real=real,label=labels,column=column,column_id=column_id,logger = logger,logs=logs)
     return trainer.apply()
 
 def generate(**args):

From 4165fabe57c1abdf67cbb27b2ba6b0a03c672e92 Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Fri, 3 Jan 2020 21:50:32 -0600
Subject: [PATCH 15/16] bug fix with installer (PIP)

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c7dc48f..1bfc141 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ import sys
 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read() 
 args = {"name":"data-maker","version":"1.0.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
-        "packages":["data-maker"],"keywords":["healthcare","data","transport","protocol"]}
+        "packages":["data"],"keywords":["healthcare","data","transport","protocol"]}
 args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','numpy','pandas','pandas-gbq','pymongo']
 args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/gan.git'
 if sys.version_info[0] == 2 :

From ef399690822cfbd8ab948aea10bae651173363b6 Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Sat, 4 Jan 2020 23:02:15 -0600
Subject: [PATCH 16/16] bug fix with imports

---
 data/__init__.py       |  1 +
 data/maker/__init__.py |  2 +-
 data/maker/__main__.py | 10 ++++++++++
 setup.py               |  3 ++-
 4 files changed, 14 insertions(+), 2 deletions(-)
 create mode 100644 data/maker/__main__.py

diff --git a/data/__init__.py b/data/__init__.py
index 88bbded..98124f1 100644
--- a/data/__init__.py
+++ b/data/__init__.py
@@ -1 +1,2 @@
 import data.params as params
+
diff --git a/data/maker/__init__.py b/data/maker/__init__.py
index 075bfd3..469a65a 100644
--- a/data/maker/__init__.py
+++ b/data/maker/__init__.py
@@ -10,7 +10,7 @@ This package is designed to generate synthetic data from a dataset from an origi
 """
 import pandas as pd
 import numpy as np
-from data import gan
+import data.gan as gan
 from transport import factory
 def train (**args) :
     """
diff --git a/data/maker/__main__.py b/data/maker/__main__.py
new file mode 100644
index 0000000..e77bf0a
--- /dev/null
+++ b/data/maker/__main__.py
@@ -0,0 +1,10 @@
+import pandas as pd
+import data.maker
+# Demo script: calls data.maker.train() on a local CSV file using the hard-coded settings below.
+df      = pd.read_csv('sample.csv')  # relative path: the file is looked up from the current working directory
+column  = 'gender'  # column that train() one-hot encodes (pd.get_dummies) as the "real" training data
+id      = 'id'  # column that train() one-hot encodes as the labels; NOTE: this name shadows the builtin id()
+context = 'demo'  # label of what is being synthesized; train() also writes it into store['args']['doc']
+store = {"type":"mongo.MongoWriter","args":{"host":"localhost:27017","dbname":"GAN"}}  # train() passes this to transport.factory.instance(**store) to build the logger
+max_epochs = 11  # overrides the default of 10 that train() uses when 'max_epochs' is absent
+data.maker.train(store=store,max_epochs=max_epochs,context=context,data=df,column=column,id=id,logs='foo')  # 'logs' must be present: train() reads args['logs'] and forwards it to gan.Train
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 1bfc141..a7befdf 100644
--- a/setup.py
+++ b/setup.py
@@ -5,9 +5,10 @@ import sys
 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read() 
 args = {"name":"data-maker","version":"1.0.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
-        "packages":["data"],"keywords":["healthcare","data","transport","protocol"]}
+        "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
 args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','numpy','pandas','pandas-gbq','pymongo']
 args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/gan.git'
+
 if sys.version_info[0] == 2 :
     args['use_2to3'] = False
     args['use_2to3_exclude_fixers'] = ['lib2to3.fixes.fix_import']