From e4b2f943970faa4197427d5d6ef8f6e3a7faaae3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 12 Dec 2019 11:42:17 -0600 Subject: [PATCH 01/16] making the installation stuff --- README.md => data-maker/README.md | 0 WGAN.py => data-maker/WGAN.py | 0 bridge.py => data-maker/bridge.py | 0 gan.py => data-maker/gan.py | 0 params.py => data-maker/params.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename README.md => data-maker/README.md (100%) rename WGAN.py => data-maker/WGAN.py (100%) rename bridge.py => data-maker/bridge.py (100%) rename gan.py => data-maker/gan.py (100%) rename params.py => data-maker/params.py (100%) diff --git a/README.md b/data-maker/README.md similarity index 100% rename from README.md rename to data-maker/README.md diff --git a/WGAN.py b/data-maker/WGAN.py similarity index 100% rename from WGAN.py rename to data-maker/WGAN.py diff --git a/bridge.py b/data-maker/bridge.py similarity index 100% rename from bridge.py rename to data-maker/bridge.py diff --git a/gan.py b/data-maker/gan.py similarity index 100% rename from gan.py rename to data-maker/gan.py diff --git a/params.py b/data-maker/params.py similarity index 100% rename from params.py rename to data-maker/params.py From bba3b94a308808c9baa8c2bedeac2c40c4faec32 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 12 Dec 2019 11:43:08 -0600 Subject: [PATCH 02/16] structuring the repository --- Dockerfile | 2 +- data-maker/README.md => README.md | 0 setup.py | 13 +++++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) rename data-maker/README.md => README.md (100%) create mode 100644 setup.py diff --git a/Dockerfile b/Dockerfile index dd02d11..f308336 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,4 +5,4 @@ RUN ["apt-get","install","-y","git", "python3-dev","tmux","locales","python3-pip RUN ["pip3","install","pandas-gbq","tensorflow"] RUN ["mkdir","-p","/usr/apps"] WORKDIR /usr/apps -RUN ["git","clone","https://hiplab.mc.vanderbilt.edu/git/gan.git","aou-gan"] +RUN ["git","clone","https://hiplab.mc.vanderbilt.edu/git/aou/gan.git@release","aou-gan"] diff --git a/data-maker/README.md b/README.md similarity index 100% rename from data-maker/README.md rename to README.md diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..daa8f9e --- /dev/null +++ b/setup.py @@ -0,0 +1,13 @@ +from setuptools import setup, find_packages +import os +import sys + +def read(fname): + return open(os.path.join(os.path.dirname(__file__), fname)).read() +args = {"name":"data-maker","version":"1.0.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT","packages":["edi"],"keywords":["healthcare","edi","x12","data","transport","protocol"]} +args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','numpy','pandas','pandas-gbq','pymongo'] +args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/gan.git' +if sys.version_info[0] == 2 : + args['use_2to3'] = False + args['use_2to3_exclude_fixers'] = ['lib2to3.fixes.fix_import'] +setup(**args) From 99bc98aba59d64c591e9073450ed74c64a7c442f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 12 Dec 2019 11:49:06 -0600 Subject: [PATCH 03/16] bug fix --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index daa8f9e..afbd3f0 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,8 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.0.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT","packages":["edi"],"keywords":["healthcare","edi","x12","data","transport","protocol"]} +args = {"name":"data-maker","version":"1.0.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", + "packages":["data-maker"],"keywords":["healthcare","edi","x12","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','numpy','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/gan.git' if sys.version_info[0] == 2 : From 8a5307c242b47cf93eda593e2811af86ff4c32b8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 12 Dec 2019 12:13:31 -0600 Subject: [PATCH 04/16] bug fix, and documentation --- Dockerfile | 4 ++-- README.md | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index f308336..d489f50 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ from ubuntu RUN ["apt-get","update"] RUN ["apt-get","upgrade","-y"] RUN ["apt-get","install","-y","git", "python3-dev","tmux","locales","python3-pip","python3-numpy","python3-pandas","locales"] -RUN ["pip3","install","pandas-gbq","tensorflow"] +RUN ["pip3","install","pandas-gbq","tensorflow","git+https://hiplab.mc.vanderbilt.edu/git/aou/"] RUN ["mkdir","-p","/usr/apps"] WORKDIR /usr/apps -RUN ["git","clone","https://hiplab.mc.vanderbilt.edu/git/aou/gan.git@release","aou-gan"] +RUN ["git","clone","https://hiplab.mc.vanderbilt.edu/git/aou/bridge.git@release","aou-gan"] diff --git a/README.md b/README.md index 8eb92d1..48b01aa 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,49 @@ -# bridge +## Introduction +--- + +This package is designed to generate synthetic data from a dataset from an original dataset using deep learning techniques + + - Generative Adversarial Networks + - With "Earth mover's distance" + +## Installation +--- + + pip install git+https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git@release + +## Usage +--- + +After installing the easiest way to get started is as follows (using pandas). The process is as follows: +1. Train the GAN on the original/raw dataset + + + import pandas as pd + import data.maker + + df = pd.read_csv('myfile.csv') + cols= ['f1','f2','f2'] + data.maker.train(data=df,cols=cols,logs='logs') + +2. Generate a candidate dataset from the learnt features + + + import pandas as pd + import data.maker + + df = data.maker.generate(logs='logs') + df.head() + + +## Limitations +--- + +GANS will generate data assuming the original data has all the value space needed: + +- No new data will be created + + Assuming we have a dataset with an gender attribute with values [M,F]. The synthetic data will not be able to generate genders outside [M,F] +- Not advised on continuous values + + GANS work well on discrete values and thus are not advised to be used to synthesize things like measurements (height, blood pressure, ...) From caab8800dd09bfe54314dd60e1c83e0abdeb110a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 12 Dec 2019 12:15:05 -0600 Subject: [PATCH 05/16] documentation/layout --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 48b01aa..9589ddd 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ After installing the easiest way to get started is as follows (using pandas). Th df = data.maker.generate(logs='logs') df.head() - + ## Limitations --- @@ -42,8 +42,10 @@ GANS will generate data assuming the original data has all the value space neede - No new data will be created - Assuming we have a dataset with an gender attribute with values [M,F]. The synthetic data will not be able to generate genders outside [M,F] + Assuming we have a dataset with an gender attribute with values [M,F]. + The synthetic data will not be able to generate genders outside [M,F] - Not advised on continuous values - GANS work well on discrete values and thus are not advised to be used to synthesize things like measurements (height, blood pressure, ...) + GANS work well on discrete values and thus are not advised to be used. + e.g:measurements (height, blood pressure, ...) From 363bd376e178e2019f99f17b55a8268be8c77d00 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 12 Dec 2019 12:16:52 -0600 Subject: [PATCH 06/16] credits --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 9589ddd..411d297 100644 --- a/README.md +++ b/README.md @@ -49,3 +49,9 @@ GANS will generate data assuming the original data has all the value space neede GANS work well on discrete values and thus are not advised to be used. e.g:measurements (height, blood pressure, ...) +## Credits : +--- + +- [Ziqi Zhang](ziqi.zhang@vanderbilt.edu) +- [Brad Malin](b.malin@vanderbilt.edu) +- [Steve L. Nyemba](steve.l.nyemba@vanderbilt.edu) \ No newline at end of file From 3750d8b40860585357d323a44357a9be96bb5942 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 12 Dec 2019 12:17:38 -0600 Subject: [PATCH 07/16] s --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 411d297..141502d 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,6 @@ This package is designed to generate synthetic data from a dataset from an origi ## Usage --- - After installing the easiest way to get started is as follows (using pandas). The process is as follows: 1. Train the GAN on the original/raw dataset From 766ef9694e8d7d85d5a471e74d7bb28349136907 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 12 Dec 2019 12:19:37 -0600 Subject: [PATCH 08/16] doc --- README.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/README.md b/README.md index 141502d..8475923 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ ## Introduction ---- This package is designed to generate synthetic data from a dataset from an original dataset using deep learning techniques @@ -7,12 +6,11 @@ This package is designed to generate synthetic data from a dataset from an origi - With "Earth mover's distance" ## Installation ---- pip install git+https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git@release ## Usage ---- + After installing the easiest way to get started is as follows (using pandas). The process is as follows: 1. Train the GAN on the original/raw dataset @@ -35,7 +33,6 @@ After installing the easiest way to get started is as follows (using pandas). Th ## Limitations ---- GANS will generate data assuming the original data has all the value space needed: @@ -49,7 +46,6 @@ GANS will generate data assuming the original data has all the value space neede e.g:measurements (height, blood pressure, ...) ## Credits : ---- - [Ziqi Zhang](ziqi.zhang@vanderbilt.edu) - [Brad Malin](b.malin@vanderbilt.edu) From 7f3748121ca8e4e7584096d51f5569a844d1d751 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 31 Dec 2019 23:26:42 -0600 Subject: [PATCH 09/16] bug fix and refactoring with documentation --- {data-maker => data}/WGAN.py | 0 {data-maker => data}/bridge.py | 0 {data-maker => data}/gan.py | 0 {data-maker => data}/params.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename {data-maker => data}/WGAN.py (100%) rename {data-maker => data}/bridge.py (100%) rename {data-maker => data}/gan.py (100%) rename {data-maker => data}/params.py (100%) diff --git a/data-maker/WGAN.py b/data/WGAN.py similarity index 100% rename from data-maker/WGAN.py rename to data/WGAN.py diff --git a/data-maker/bridge.py b/data/bridge.py similarity index 100% rename from data-maker/bridge.py rename to data/bridge.py diff --git a/data-maker/gan.py b/data/gan.py similarity index 100% rename from data-maker/gan.py rename to data/gan.py diff --git a/data-maker/params.py b/data/params.py similarity index 100% rename from data-maker/params.py rename to data/params.py From 685c5676616449e78089c1d9d23099045f1636fa Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 31 Dec 2019 23:27:53 -0600 Subject: [PATCH 10/16] fixes with the framework - only supports single feature --- README.md | 29 +++++++++++------- data/__init__.py | 1 + data/gan.py | 55 +++++++++++++++++++++++----------- data/maker/__init__.py | 68 ++++++++++++++++++++++++++++++++++++++++++ setup.py | 2 +- 5 files changed, 125 insertions(+), 30 deletions(-) create mode 100644 data/__init__.py create mode 100644 data/maker/__init__.py diff --git a/README.md b/README.md index 8475923..b42b1f7 100644 --- a/README.md +++ b/README.md @@ -15,22 +15,29 @@ After installing the easiest way to get started is as follows (using pandas). Th 1. Train the GAN on the original/raw dataset - import pandas as pd - import data.maker - - df = pd.read_csv('myfile.csv') - cols= ['f1','f2','f2'] - data.maker.train(data=df,cols=cols,logs='logs') +import pandas as pd +import data.maker -2. Generate a candidate dataset from the learnt features +df = pd.read_csv('sample.csv') +column = 'gender' +id = 'id' +context = 'demo' +data.maker.train(context=context,data=df,column=column,id=id,logs='logs') + +The trainer will store the data on disk (for now) in a structured folder that will hold training models that will be used to generate the synthetic data. - import pandas as pd - import data.maker +2. Generate a candidate dataset from the learnt features + - df = data.maker.generate(logs='logs') - df.head() +import pandas as pd +import data.maker +df = pd.read_csv('sample.csv') +id = 'id' +column = 'gender' +context = 'demo' +data.maker.generate(data=df,id=id,column=column,logs='logs') ## Limitations diff --git a/data/__init__.py b/data/__init__.py new file mode 100644 index 0000000..88bbded --- /dev/null +++ b/data/__init__.py @@ -0,0 +1 @@ +import data.params as params diff --git a/data/gan.py b/data/gan.py index 0981411..e349018 100644 --- a/data/gan.py +++ b/data/gan.py @@ -11,8 +11,8 @@ import pandas as pd import time import os import sys -from params import SYS_ARGS -from bridge import Binary +from data.params import SYS_ARGS +from data.bridge import Binary import json os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" @@ -37,8 +37,6 @@ class GNet : self.layers = void() self.layers.normalize = self.normalize - self.get = void() - self.get.variables = self._variable_on_cpu self.NUM_GPUS = 1 @@ -63,7 +61,11 @@ class GNet : self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} self._REAL = args['real'] if 'real' in args else None self._LABEL = args['label'] if 'label' in args else None - + + self.get = void() + self.get.variables = self._variable_on_cpu + + self.get.suffix = lambda : "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] self.init_logs(**args) def init_logs(self,**args): @@ -83,7 +85,9 @@ class GNet : This function is designed to accomodate the uses of the sub-classes outside of a strict dependency model. Because prediction and training can happen independently """ - _name = os.sep.join([self.out_dir,'meta-'+column+'.json']) + # suffix = "-".join(column) if isinstance(column,list)else column + suffix = self.get.suffix() + _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json']) if os.path.exists(_name) : attr = json.loads((open(_name)).read()) for key in attr : @@ -111,7 +115,10 @@ class GNet : key = args['key'] value= args['value'] object[key] = value - _name = os.sep.join([self.out_dir,'meta-'+SYS_ARGS['column']]) + # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column + suffix = self.get.suffix() + _name = os.sep.join([self.out_dir,'meta-'+suffix]) + f = open(_name+'.json','w') f.write(json.dumps(object)) def mkdir (self,path): @@ -285,7 +292,9 @@ class Train (GNet): self.discriminator = Discriminator(**args) self._REAL = args['real'] self._LABEL= args['label'] + self.column = args['column'] # print ([" *** ",self.BATCHSIZE_PER_GPU]) + self.log_meta() def load_meta(self, column): """ @@ -407,8 +416,9 @@ class Train (GNet): format_str = 'epoch: %d, w_distance = %f (%.1f)' print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) if epoch % self.MAX_EPOCHS == 0: - - _name = os.sep.join([self.train_dir,self.ATTRIBUTES['synthetic']]) + # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + suffix = self.get.suffix() + _name = os.sep.join([self.train_dir,suffix]) # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) saver.save(sess, _name, write_meta_graph=False, global_step=epoch) # @@ -420,14 +430,16 @@ class Predict(GNet): """ def __init__(self,**args): GNet.__init__(self,**args) - self.generator = Generator(**args) - self.values = values + self.generator = Generator(**args) + self.values = args['values'] def load_meta(self, column): super().load_meta(column) self.generator.load_meta(column) def apply(self,**args): # print (self.train_dir) - model_dir = os.sep.join([self.train_dir,self.ATTRIBUTES['synthetic']+'-'+str(self.MAX_EPOCHS)]) + # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + suffix = self.get.suffix() + model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] tf.compat.v1.reset_default_graph() z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) @@ -450,19 +462,24 @@ class Predict(GNet): # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes # - df = ( pd.DataFrame(np.round(f).astype(np.int32),columns=values)) + df = ( pd.DataFrame(np.round(f).astype(np.int32))) # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms # df = (i * df).sum(axis=1) # # In case we are dealing with actual values like diagnosis codes we can perform # - r = np.zeros((self.ROW_COUNT,1)) + columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] + + r = np.zeros((self.ROW_COUNT,len(columns))) for col in df : i = np.where(df[col])[0] r[i] = col - df = pd.DataFrame(r,columns=[self.ATTRIBUTES['synthetic']]) - return df.to_dict(orient='list') + df = pd.DataFrame(r,columns=columns) + + df[df.columns] = (df.apply(lambda value: self.values[ int(value)],axis=1)) + return df.to_dict(orient='lists') + # return df.to_dict(orient='list') # count = str(len(os.listdir(self.out_dir))) # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv']) # df.to_csv(_name,index=False) @@ -476,7 +493,7 @@ class Predict(GNet): # idx2 = (demo[:, n] == 1) # idx = [idx1[j] and idx2[j] for j in range(len(idx1))] # num = np.sum(idx) - # print ("_____________________") + # print ("___________________list__") # print (idx1) # print (idx2) # print (idx) @@ -531,7 +548,8 @@ if __name__ == '__main__' : elif 'generate' in SYS_ARGS: values = df[column].unique().tolist() values.sort() - p = Predict(context=context,label=LABEL,values=values) + + p = Predict(context=context,label=LABEL,values=values,column=column) p.load_meta(column) r = p.apply() print (df) @@ -539,6 +557,7 @@ if __name__ == '__main__' : df[column] = r[column] print (df) + else: print (SYS_ARGS.keys()) print (__doc__) diff --git a/data/maker/__init__.py b/data/maker/__init__.py new file mode 100644 index 0000000..7a441f8 --- /dev/null +++ b/data/maker/__init__.py @@ -0,0 +1,68 @@ +""" +(c) 2019 Data Maker, hiplab.mc.vanderbilt.edu +version 1.0.0 + +This package serves as a proxy to the overall usage of the framework. +This package is designed to generate synthetic data from a dataset from an original dataset using deep learning techniques + +@TODO: + - Make configurable GPU, EPOCHS +""" +import pandas as pd +import numpy as np +from data import gan + +def train (**args) : + """ + This function is intended to train the GAN in order to learn about the distribution of the features + :column columns that need to be synthesized (discrete) + :logs where the output of the (location on disk) + :id identifier of the dataset + :data data-frame to be synthesized + :context label of what we are synthesizing + """ + column = args['column'] + + column_id = args['id'] + df = args['data'] + logs = args['logs'] + real = pd.get_dummies(df[column]).astype(np.float32).values + + labels = pd.get_dummies(df[column_id]).astype(np.float32).values + max_epochs = 10 + context = args['context'] + trainer = gan.Train(context=context,max_epochs=max_epochs,real=real,label=labels,column=column,column_id=column_id) + return trainer.apply() + +def generate(**args): + """ + This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset + @return pandas.DataFrame + + :data data-frame to be synthesized + :column columns that need to be synthesized (discrete) + :id column identifying an entity + :logs location on disk where the learnt knowledge of the dataset is + """ + df = args['data'] + + column = args['column'] + column_id = args['id'] + logs = args['logs'] + context = args['context'] + # + #@TODO: + # If the identifier is not present, we should fine a way to determine or make one + # + #ocolumns= list(set(df.columns.tolist())- set(columns)) + + values = df[column].unique().tolist() + values.sort() + + labels = pd.get_dummies(df[column_id]).astype(np.float32).values + handler = gan.Predict (context=context,label=labels,values=values,column=column) + handler.load_meta(column) + r = handler.apply() + _df = df.copy() + _df[column] = r[column] + return _df diff --git a/setup.py b/setup.py index afbd3f0..c7dc48f 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker","version":"1.0.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", - "packages":["data-maker"],"keywords":["healthcare","edi","x12","data","transport","protocol"]} + "packages":["data-maker"],"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','numpy','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/gan.git' if sys.version_info[0] == 2 : From df47ed4cb2b7e1d05f86b20899244f2350bac05c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 31 Dec 2019 23:34:04 -0600 Subject: [PATCH 11/16] documentation --- README.md | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index b42b1f7..f5c5e5d 100644 --- a/README.md +++ b/README.md @@ -12,32 +12,33 @@ This package is designed to generate synthetic data from a dataset from an origi ## Usage After installing the easiest way to get started is as follows (using pandas). The process is as follows: -1. Train the GAN on the original/raw dataset +**Train the GAN on the original/raw dataset** -import pandas as pd -import data.maker -df = pd.read_csv('sample.csv') -column = 'gender' -id = 'id' -context = 'demo' -data.maker.train(context=context,data=df,column=column,id=id,logs='logs') + import pandas as pd + import data.maker + + df = pd.read_csv('sample.csv') + column = 'gender' + id = 'id' + context = 'demo' + data.maker.train(context=context,data=df,column=column,id=id,logs='logs') The trainer will store the data on disk (for now) in a structured folder that will hold training models that will be used to generate the synthetic data. -2. Generate a candidate dataset from the learnt features +**Generate a candidate dataset from the learned features** -import pandas as pd -import data.maker + import pandas as pd + import data.maker -df = pd.read_csv('sample.csv') -id = 'id' -column = 'gender' -context = 'demo' -data.maker.generate(data=df,id=id,column=column,logs='logs') + df = pd.read_csv('sample.csv') + id = 'id' + column = 'gender' + context = 'demo' + data.maker.generate(data=df,id=id,column=column,logs='logs') ## Limitations @@ -46,11 +47,14 @@ GANS will generate data assuming the original data has all the value space neede - No new data will be created Assuming we have a dataset with an gender attribute with values [M,F]. + The synthetic data will not be able to generate genders outside [M,F] + - Not advised on continuous values GANS work well on discrete values and thus are not advised to be used. e.g:measurements (height, blood pressure, ...) +- For now will only perform on a single feature. ## Credits : From ffc4a8a191cdcf88c0ac25a64dd509fa5797f203 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 31 Dec 2019 23:38:52 -0600 Subject: [PATCH 12/16] layout issue --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index f5c5e5d..46d2425 100644 --- a/README.md +++ b/README.md @@ -31,14 +31,14 @@ The trainer will store the data on disk (for now) in a structured folder that wi **Generate a candidate dataset from the learned features** - import pandas as pd - import data.maker + import pandas as pd + import data.maker - df = pd.read_csv('sample.csv') - id = 'id' - column = 'gender' - context = 'demo' - data.maker.generate(data=df,id=id,column=column,logs='logs') + df = pd.read_csv('sample.csv') + id = 'id' + column = 'gender' + context = 'demo' + data.maker.generate(data=df,id=id,column=column,logs='logs') ## Limitations @@ -49,7 +49,7 @@ GANS will generate data assuming the original data has all the value space neede Assuming we have a dataset with an gender attribute with values [M,F]. The synthetic data will not be able to generate genders outside [M,F] - + - Not advised on continuous values GANS work well on discrete values and thus are not advised to be used. From 65a3e84c8f584df71738d70820726084e4970e1e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 31 Dec 2019 23:39:54 -0600 Subject: [PATCH 13/16] documentation usage --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 46d2425..3dfb291 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ The trainer will store the data on disk (for now) in a structured folder that wi id = 'id' column = 'gender' context = 'demo' - data.maker.generate(data=df,id=id,column=column,logs='logs') + data.maker.generate(context=context,data=df,id=id,column=column,logs='logs') ## Limitations From 6de816fc5053386e961af7d92a06367eb0bd57e3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 3 Jan 2020 21:47:05 -0600 Subject: [PATCH 14/16] bug fixes with operations --- data/gan.py | 22 ++++++++++++++++------ data/maker/__init__.py | 27 +++++++++++++++++---------- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/data/gan.py b/data/gan.py index e349018..3391b78 100644 --- a/data/gan.py +++ b/data/gan.py @@ -14,6 +14,7 @@ import sys from data.params import SYS_ARGS from data.bridge import Binary import json +import pickle os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ['CUDA_VISIBLE_DEVICES'] = "0" @@ -38,7 +39,7 @@ class GNet : self.layers.normalize = self.normalize - self.NUM_GPUS = 1 + self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854 @@ -64,8 +65,8 @@ class GNet : self.get = void() self.get.variables = self._variable_on_cpu - self.get.suffix = lambda : "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + self.logger = args['logger'] if 'logger' in args and args['logger'] else None self.init_logs(**args) def init_logs(self,**args): @@ -98,7 +99,7 @@ class GNet : def log_meta(self,**args) : - object = { + _object = { 'CONTEXT':self.CONTEXT, 'ATTRIBUTES':self.ATTRIBUTES, 'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU, @@ -120,7 +121,8 @@ class GNet : _name = os.sep.join([self.out_dir,'meta-'+suffix]) f = open(_name+'.json','w') - f.write(json.dumps(object)) + f.write(json.dumps(_object)) + return _object def mkdir (self,path): if not os.path.exists(path) : os.mkdir(path) @@ -295,7 +297,7 @@ class Train (GNet): self.column = args['column'] # print ([" *** ",self.BATCHSIZE_PER_GPU]) - self.log_meta() + self.meta = self.log_meta() def load_meta(self, column): """ This function will delegate the calls to load meta data to it's dependents @@ -393,7 +395,7 @@ class Train (GNet): # saver = tf.train.Saver() saver = tf.compat.v1.train.Saver() init = tf.global_variables_initializer() - + logs = [] with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: sess.run(init) sess.run(iterator_d.initializer, @@ -415,6 +417,10 @@ class Train (GNet): format_str = 'epoch: %d, w_distance = %f (%.1f)' print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) + # print (dir (w_distance)) + + logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) + if epoch % self.MAX_EPOCHS == 0: # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] suffix = self.get.suffix() @@ -423,6 +429,10 @@ class Train (GNet): saver.save(sess, _name, write_meta_graph=False, global_step=epoch) # # + if self.logger : + row = {"logs":logs} #,"model":pickle.dump(sess)} + + self.logger.write(row=row) class Predict(GNet): """ diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 7a441f8..075bfd3 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -11,7 +11,7 @@ This package is designed to generate synthetic data from a dataset from an origi import pandas as pd import numpy as np from data import gan - +from transport import factory def train (**args) : """ This function is intended to train the GAN in order to learn about the distribution of the features @@ -21,17 +21,24 @@ def train (**args) : :data data-frame to be synthesized :context label of what we are synthesizing """ - column = args['column'] + column = args['column'] - column_id = args['id'] - df = args['data'] - logs = args['logs'] - real = pd.get_dummies(df[column]).astype(np.float32).values + column_id = args['id'] + df = args['data'] + logs = args['logs'] + real = pd.get_dummies(df[column]).astype(np.float32).values + labels = pd.get_dummies(df[column_id]).astype(np.float32).values - labels = pd.get_dummies(df[column_id]).astype(np.float32).values - max_epochs = 10 - context = args['context'] - trainer = gan.Train(context=context,max_epochs=max_epochs,real=real,label=labels,column=column,column_id=column_id) + max_epochs = 10 if 'max_epochs' not in args else args['max_epochs'] + context = args['context'] + if 'store' in args : + args['store']['args']['doc'] = context + logger = factory.instance(**args['store']) + + else: + logger = None + + trainer = gan.Train(context=context,max_epochs=max_epochs,real=real,label=labels,column=column,column_id=column_id,logger = logger,logs=logs) return trainer.apply() def generate(**args): From 4165fabe57c1abdf67cbb27b2ba6b0a03c672e92 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 3 Jan 2020 21:50:32 -0600 Subject: [PATCH 15/16] bug fix with installer (PIP) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c7dc48f..1bfc141 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker","version":"1.0.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", - "packages":["data-maker"],"keywords":["healthcare","data","transport","protocol"]} + "packages":["data"],"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','numpy','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/gan.git' if sys.version_info[0] == 2 : From ef399690822cfbd8ab948aea10bae651173363b6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 4 Jan 2020 23:02:15 -0600 Subject: [PATCH 16/16] bug fix with imports --- data/__init__.py | 1 + data/maker/__init__.py | 2 +- data/maker/__main__.py | 10 ++++++++++ setup.py | 3 ++- 4 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 data/maker/__main__.py diff --git a/data/__init__.py b/data/__init__.py index 88bbded..98124f1 100644 --- a/data/__init__.py +++ b/data/__init__.py @@ -1 +1,2 @@ import data.params as params + diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 075bfd3..469a65a 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -10,7 +10,7 @@ This package is designed to generate synthetic data from a dataset from an origi """ import pandas as pd import numpy as np -from data import gan +import data.gan as gan from transport import factory def train (**args) : """ diff --git a/data/maker/__main__.py b/data/maker/__main__.py new file mode 100644 index 0000000..e77bf0a --- /dev/null +++ b/data/maker/__main__.py @@ -0,0 +1,10 @@ +import pandas as pd +import data.maker + +df = pd.read_csv('sample.csv') +column = 'gender' +id = 'id' +context = 'demo' +store = {"type":"mongo.MongoWriter","args":{"host":"localhost:27017","dbname":"GAN"}} +max_epochs = 11 +data.maker.train(store=store,max_epochs=max_epochs,context=context,data=df,column=column,id=id,logs='foo') \ No newline at end of file diff --git a/setup.py b/setup.py index 1bfc141..a7befdf 100644 --- a/setup.py +++ b/setup.py @@ -5,9 +5,10 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = {"name":"data-maker","version":"1.0.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", - "packages":["data"],"keywords":["healthcare","data","transport","protocol"]} + "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','numpy','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/gan.git' + if sys.version_info[0] == 2 : args['use_2to3'] = False args['use_2to3_exclude_fixers'] = ['lib2to3.fixes.fix_import']