From e07c3553884fc9726cc464e9523f28a1a7f55794 Mon Sep 17 00:00:00 2001
From: Steve Nyemba
Date: Sun, 8 Mar 2020 19:33:08 -0500
Subject: [PATCH] bug fix, with logs and partitioning

---
 data/gan.py            | 11 +++++++----
 data/maker/__init__.py |  4 ++--
 pipeline.py            |  8 +++++---
 setup.py               |  2 +-
 4 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/data/gan.py b/data/gan.py
index a6d35e1..3c41f59 100644
--- a/data/gan.py
+++ b/data/gan.py
@@ -59,6 +59,7 @@ class GNet :
         self.logs = {}
 
         self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu']
+        self.PARTITION = args['partition']
         # if self.NUM_GPUS > 1 :
         #     os.environ['CUDA_VISIBLE_DEVICES'] = "4"
 
@@ -356,7 +357,7 @@ class Train (GNet):
 
         self.meta = self.log_meta()
         if(self.logger):
-            self.logger.write({"module":"gan-train","action":"start","input":self.meta} )
+            self.logger.write({"module":"gan-train","action":"start","input":{"partition":self.PARTITION,"meta":self.meta} } )
 
         # self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta)
     def load_meta(self, column):
@@ -408,7 +409,7 @@ class Train (GNet):
             # losses = tf.compat.v1.get_collection(flag, scope)
 
             total_loss = tf.add_n(losses, name='total_loss')
-
+            print (total_loss)
             return total_loss, w
     def input_fn(self):
         """
@@ -514,7 +515,7 @@ class Train (GNet):
                     #
                     #
                     if self.logger :
-                        row = {"module":"gan-train","action":"logs","input":logs} #,"model":pickle.dump(sess)}
+                        row = {"module":"gan-train","action":"logs","input":{"partition":self.PARTITION,"logs":logs}} #,"model":pickle.dump(sess)}
                         self.logger.write(row)
                     #
                     # @TODO:
@@ -623,6 +624,7 @@ class Predict(GNet):
         # r = np.zeros((self.ROW_COUNT,len(columns)))
         # r = np.zeros(self.ROW_COUNT)
 
+
         if self.logger :
             info = {"found":len(found),"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)}
             if INDEX > 0 :
@@ -631,6 +633,7 @@ class Predict(GNet):
 
                 info['selected'] = -1
                 info['ratio'] = __ratio
+            info['partition'] = self.PARTITION
             self.logger.write({"module":"gan-generate","action":"generate","input":info})
         df.columns = self.values
         if len(found) or df.columns.size == len(self.values):
@@ -658,7 +661,7 @@ class Predict(GNet):
 
             df = df[columns[0]].append(pd.Series(missing))
         if self.logger :
-            info= {"missing": i.size,"rows":df.shape[0],"cols":1}
+            info= {"missing": i.size,"rows":df.shape[0],"cols":1,'partition':self.PARTITION}
             self.logger.write({"module":"gan-generate","action":"compile.io","input":info})
 
 
diff --git a/data/maker/__init__.py b/data/maker/__init__.py
index 4be97b8..729654f 100644
--- a/data/maker/__init__.py
+++ b/data/maker/__init__.py
@@ -111,7 +111,7 @@ def train (**args) :
             BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size'])
             args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32)
         else:
-            df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False)
+            # df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False)
             # print (df[col].dtypes)
             # print (df[col].dropna/(axis=1).unique())
             args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values
@@ -124,7 +124,7 @@ def train (**args) :
         args['store']['args']['doc'] = context
         logger = factory.instance(**args['store'])
         args['logger'] = logger
-        info = {"rows":args['real'].shape[0],"cols":args['real'].shape[1],"name":col}
+        info = {"rows":args['real'].shape[0],"cols":args['real'].shape[1],"name":col,"partition":args['partition']}
         logger.write({"module":"gan-train","action":"data-prep","input":info})
 
     else:
diff --git a/pipeline.py b/pipeline.py
index 418ccbf..89ba16f 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -89,7 +89,8 @@ class Components :
             _args['gpu'] = 0
             _args['num_gpu'] = 1
         os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu'])
-
+        _args['partition'] = int(partition)
+        _args['continuous']= args['continuous'] if 'continuous' in args else []
         _args['store'] = {'type':'mongo.MongoWriter','args':{'dbname':'aou','doc':args['context']}}
         _args['data'] = args['data']
 
@@ -144,7 +145,8 @@ class Components :
             # df = pd.DataFrame(df[ int (partition) ],columns = columns)
             info = {"parition":int(partition),"gpu":_args["gpu"],"rows":df.shape[0],"cols":df.shape[1],"part_size":PART_SIZE}
             logger.write({"module":"generate","action":"partition","input":info})
-
+            _args['partition'] = int(partition)
+            _args['continuous']= args['continuous'] if 'continuous' in args else []
             _args['data'] = df
             # _args['data'] = reader()
             #_args['data'] = _args['data'].astype(object)
@@ -194,7 +196,7 @@ class Components :
                 data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=50000)
                 data_comp.to_csv(_pname,index=False)
                 INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append'
-                _args['data'].to_gbq(if_exists=INSERT_FLAG,destination_table=complete,credentials=credentials,chunksize=50000)
+                _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=50000)
                 _id = 'dataset'
                 info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} }
                 if partition :
diff --git a/setup.py b/setup.py
index bf63cb0..5a8f7b6 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@ import sys
 
 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()
-args = {"name":"data-maker","version":"1.2.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
+args = {"name":"data-maker","version":"1.2.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
 "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
 args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
 args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'