bug fix: crash with dataset & epochs

dev
Steve Nyemba 2 years ago
parent 4398212caf
commit 0efd4b13bc

@@ -13,17 +13,19 @@ This package is designed to generate synthetic data from a dataset from an origi
After installing, the easiest way to get started is as follows (using pandas).
Read about [data-transport on github](https://github.com/lnyemba/data-transport) or on [healthcareio.the-phi.com/git/code/transport](https://healthcareio.the-phi.com/git/code/transport.git)

**Train the GAN on the original/raw dataset**

1. We define the data sources
   The sources consist of a source, a target and a logger.

    import pandas as pd
    import data.maker
    import transport
    from transport import providers

    df = pd.read_csv('sample.csv')
    column = 'gender'
    id = 'id'
    context = 'demo'
    data.maker.train(context=context,data=df,column=column,id=id,logs='logs')

The trainer will store the data on disk (for now) in a structured folder that will hold training models that will be used to generate the synthetic data.
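The quickstart reads a local CSV, but step 1 mentions a source, a target and a logger; those are data-transport stores. A hypothetical sketch of such a configuration is shown below: the provider names, database and collection names are illustrative assumptions rather than part of this commit, so check the data-transport documentation for the keys your backend expects.

```python
# hypothetical data-transport store definition (illustrative values only)
store = {
    "source": {"provider": "mongodb", "context": "read",  "db": "sample",    "doc": "patients"},
    "target": {"provider": "mongodb", "context": "write", "db": "sample_io", "doc": "patients"},
    "logger": {"provider": "mongodb", "context": "write", "db": "logs",      "doc": "observation"}
}
```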

@@ -3,3 +3,4 @@ from data.params import SYS_ARGS
import transport
from multiprocessing import Process, Queue
from data.maker import prepare
+from data.maker import state

@@ -100,6 +100,13 @@ class GNet :
        self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS
        self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000)
        self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs'])
+       CHECKPOINT_SKIPS = 10
+       if self.MAX_EPOCHS < 2*CHECKPOINT_SKIPS :
+           CHECKPOINT_SKIPS = 2
+       self.CHECKPOINTS = np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist()
        self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100
        self.CONTEXT = args['context']
        self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None}
@@ -120,14 +127,18 @@ class GNet :
        for key in ['train','output'] :
            self.mkdir(os.sep.join([self.log_dir,key]))
            self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT]))
-           if 'partition' in args :
-               self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT,str(args['partition'])]))
+           # if 'partition' in args :
+           #     self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT,str(args['partition'])]))
        self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT])
        self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT])
        if 'partition' in args :
            self.train_dir = os.sep.join([self.train_dir,str(args['partition'])])
            self.out_dir = os.sep.join([self.out_dir,str(args['partition'])])
+       for checkpoint in self.CHECKPOINTS :
+           self.mkdir (os.sep.join([self.train_dir,str(checkpoint)]))
+           self.mkdir (os.sep.join([self.out_dir,str(checkpoint)]))
        # if self.logger :
        # We will clear the logs from the data-store
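Combined with the schedule above, the new loop pre-creates one training and one output folder per checkpoint. For the README's 'demo' context and a hypothetical 100-epoch run with checkpoints [10, 20, ..., 100], the layout would look roughly like the sketch below (assuming the default 'logs' directory).

```python
import os

# illustrative only: a few of the directories the loop above would create for context='demo'
for checkpoint in [10, 20, 100]:
    print(os.sep.join(['logs', 'train',  'demo', str(checkpoint)]))   # logs/train/demo/10 ...
    print(os.sep.join(['logs', 'output', 'demo', str(checkpoint)]))   # logs/output/demo/10 ...
```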
@@ -150,12 +161,13 @@ class GNet :
        attr = json.loads((open(_name)).read())
        for key in attr :
            value = attr[key]
-           setattr(self,key,value)
+           if not hasattr(self,key):
+               setattr(self,key,value)
        self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT])
        self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT])
-       if 'partition' in args :
-           self.train_dir = os.sep.join([self.train_dir,str(args['partition'])])
-           self.out_dir = os.sep.join([self.out_dir,str(args['partition'])])
+       # if 'partition' in args :
+       #     self.train_dir = os.sep.join([self.train_dir,str(args['partition'])])
+       #     self.out_dir = os.sep.join([self.out_dir,str(args['partition'])])
    def log_meta(self,**args) :
@@ -183,15 +195,24 @@ class GNet :
        suffix = self.CONTEXT #self.get.suffix()
        _name = os.sep.join([self.out_dir,'meta-'+suffix])
-       f = open(_name+'.json','w')
-       f.write(json.dumps(_object))
+       # f = open(_name+'.json','w')
+       # f.write(json.dumps(_object))
+       # f.close()
+       for _info in [{"name":os.sep.join([self.out_dir,'meta-'+suffix+'.json']),"data":_object},{"name":os.sep.join([self.out_dir,'epochs.json']),"data":self.logs['epochs'] if 'epochs' in self.logs else []}] :
+           f = open(_info['name'],'w')
+           f.write(json.dumps(_info['data']))
+           f.close()
        return _object
    def mkdir (self,path):
        if not os.path.exists(path) :
            if os.sep in path :
                pass
                root = []
-               for loc in path.split(os.sep) :
+               for loc in path.strip().split(os.sep) :
+                   if loc == '' :
+                       root.append(os.sep)
                    root.append(loc)
                    if not os.path.exists(os.sep.join(root)) :
                        os.mkdir(os.sep.join(root))
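The `strip()` call and the empty-string guard added to `mkdir` matter for absolute log paths: splitting an absolute path on `os.sep` yields an empty first element, so the old loop's first `os.mkdir(os.sep.join(root))` call received an empty string and raised. A minimal standalone sketch of the difference (my own illustration, assuming a POSIX separator):

```python
import os

path  = '/opt/logs'                      # hypothetical absolute log directory
parts = path.strip().split(os.sep)
print(parts)                             # ['', 'opt', 'logs'] -- absolute paths yield an empty first element
print(repr(os.sep.join(parts[:1])))      # '' -- the old loop handed this empty string to os.mkdir, which raises

root = []
for loc in parts:
    if loc == '':
        root.append(os.sep)              # the fix: re-anchor at the filesystem root instead of ''
    root.append(loc)
    print(repr(os.sep.join(root)))       # '//', '///opt', '///opt/logs' -- extra slashes are harmless to exists()/mkdir() on POSIX
```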
@@ -278,8 +299,10 @@ class Generator (GNet):
        tf.compat.v1.add_to_collection('glosses', loss)
        return loss, loss
    def load_meta(self, **args):
-       super().load_meta(**args)
+       # super().load_meta(**args)
        self.discriminator.load_meta(**args)
    def network(self,**args) :
        """
        This function will build the network that will generate the synthetic candidates
@@ -381,6 +404,7 @@ class Train (GNet):
            self.logger.write({"module":"gan-train","action":"start","input":{"partition":self.PARTITION,"meta":self.meta} } )
        # self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta)
    def load_meta(self, column):
        """
@@ -445,7 +469,7 @@ class Train (GNet):
        else :
            dataset = tf.data.Dataset.from_tensor_slices(features_placeholder)
            # labels_placeholder = None
-       dataset = dataset.repeat(10000)
+       dataset = dataset.repeat(20000)
        dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU)
        dataset = dataset.prefetch(1)
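A plausible reading of the `repeat` bump, hedged since the commit message only says "crash with dataset & epochs": a finite, repeated `tf.data` pipeline raises `OutOfRangeError` once it is exhausted, so the repeat count has to cover every row the training loop will ever request. A back-of-the-envelope check, mixing the defaults visible in this diff with hypothetical sizes:

```python
# illustrative arithmetic only: STEPS_PER_EPOCH and the 2000 batch default appear in this diff,
# MAX_EPOCHS and ROW_COUNT are hypothetical
STEPS_PER_EPOCH   = 256
BATCHSIZE_PER_GPU = 2000
MAX_EPOCHS        = 100
ROW_COUNT         = 5000

rows_needed = STEPS_PER_EPOCH * MAX_EPOCHS * BATCHSIZE_PER_GPU   # 51,200,000 rows consumed over the run
print(ROW_COUNT * 10000 >= rows_needed)   # False -- repeat(10000) would run dry before the last epochs
print(ROW_COUNT * 20000 >= rows_needed)   # True  -- repeat(20000) covers the run
```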
@@ -472,9 +496,11 @@ class Train (GNet):
                if self._LABEL is not None :
                    (real, label) = iterator.get_next()
                else:
                    real = iterator.get_next()
                    label= None
                loss, w = self.loss(scope=scope, stage=stage, real=real, label=label)
                #tf.get_variable_scope().reuse_variables()
                tf.compat.v1.get_variable_scope().reuse_variables()
                #vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage)
@@ -507,6 +533,7 @@ class Train (GNet):
        # init = tf.global_variables_initializer()
        init = tf.compat.v1.global_variables_initializer()
        logs = []
+       self.logs['epochs'] = []
        #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:
        with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:
@@ -536,25 +563,41 @@ class Train (GNet):
                logs.append({"epoch": int(epoch),"distance":float(-w_sum/(self.STEPS_PER_EPOCH*2)) })
                # if epoch % self.MAX_EPOCHS == 0:
-               if epoch in [5,10,20,50,75, self.MAX_EPOCHS] :
+               # if epoch in [5,10,20,50,75, self.MAX_EPOCHS] :
+               if epoch in self.CHECKPOINTS or int(epoch) == 1:
                    # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
                    suffix = self.CONTEXT #self.get.suffix()
-                   _name = os.sep.join([self.train_dir,suffix])
+                   _name = os.sep.join([self.train_dir,str(epoch),suffix])
                    # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch)
                    saver.save(sess, _name, write_meta_graph=False, global_step=epoch)
                    #
                    #
+                   logs = [{"path":_name,"epochs":int(epoch),"loss":float(-w_sum/(self.STEPS_PER_EPOCH*2))}]
                    if self.logger :
-                       row = {"module":"gan-train","action":"logs","input":{"partition":self.PARTITION,"logs":logs}} #,"model":pickle.dump(sess)}
-                       self.logger.write(row)
+                       # row = {"module":"gan-train","action":"epochs","input":{"logs":logs}} #,"model":pickle.dump(sess)}
+                       # self.logger.write(row)
+                       self.logs['epochs'] += logs
                    #
                    # @TODO:
                    # We should upload the files in the checkpoint
                    # This would allow the learnt model to be portable to another system
                    #
        tf.compat.v1.reset_default_graph()
+       #
+       # let's sort the epochs we've logged thus far (if any)
+       #
+       self.logs['epochs'].sort(key=lambda _item: _item['loss'])
+       if self.logger :
+           _log = {'module':'gan-train','action':'epochs','input':self.logs['epochs']}
+           self.logger.write(_log)
+       #
+       # @TODO:
+       # Make another copy of this on disk to be able to load it should we not have a logger setup
+       #
+       self.log_meta()
class Predict(GNet):
    """
    This class uses synthetic data given a learned model
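The per-checkpoint records introduced above are what later let the Trainer hand the best checkpoint to the Generator: each scheduled epoch appends a `{path, epochs, loss}` entry, the list is sorted by loss ascending, and element 0 wins. A small standalone illustration with made-up numbers:

```python
# hypothetical epoch records, shaped like the entries built in the checkpoint block above
epochs = [
    {"path": "logs/train/demo/20/demo",  "epochs": 20,  "loss": 0.31},
    {"path": "logs/train/demo/100/demo", "epochs": 100, "loss": 0.12},
    {"path": "logs/train/demo/60/demo",  "epochs": 60,  "loss": 0.45},
]
epochs.sort(key=lambda _item: _item["loss"])   # same key as the code above
best = epochs[0]                               # what Trainer later passes along as _args['epochs']
print(best["epochs"], best["path"])            # 100 logs/train/demo/100/demo
```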
@@ -565,6 +608,7 @@ class Predict(GNet):
        self.values = args['values']
        self.ROW_COUNT = args['row_count']
        self.oROW_COUNT = self.ROW_COUNT
        # self.MISSING_VALUES = np.nan_to_num(np.nan)
        # if 'no_value' in args and args['no_value'] not in ['na','','NA'] :
        #     self.MISSING_VALUES = args['no_value']
@@ -577,9 +621,20 @@ class Predict(GNet):
        super().load_meta(**args)
        self.generator.load_meta(**args)
        self.ROW_COUNT = self.oROW_COUNT
+       #
+       # updating the input/output for the generator, so it points properly
+       #
+       for object in [self,self.generator] :
+           _train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT,str(self.MAX_EPOCHS)])
+           _out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT,str(self.MAX_EPOCHS)])
+           setattr(object,'train_dir',_train_dir)
+           setattr(object,'out_dir',_out_dir)
    def apply(self,**args):
        suffix = self.CONTEXT #self.get.suffix()
        model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)])
+       # model_dir = os.sep.join([self.train_dir,str(self.MAX_EPOCHS)])
        demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo']
        #
        # setup computational graph
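Tracing the paths with the README's 'demo' context, the default 'logs' directory and a hypothetical best epoch of 100: the training loop saves checkpoints under `logs/train/demo/<epoch>/demo` with `global_step=epoch` (so the files are prefixed `demo-<epoch>`), `load_meta` above repoints `train_dir` to the per-epoch folder, and `apply` resolves the matching prefix. A sketch of that composition:

```python
import os

log_dir, context = 'logs', 'demo'   # README defaults; 'demo' comes from the quickstart
best_epoch = 100                    # hypothetical: the epoch selected by lowest loss

train_dir = os.sep.join([log_dir, 'train', context, str(best_epoch)])
model_dir = os.sep.join([train_dir, context + '-' + str(best_epoch)])
print(model_dir)   # logs/train/demo/100/demo-100 -- matches saver.save(_name, global_step=epoch)
```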

@@ -15,6 +15,7 @@ import transport
# from data.bridge import Binary
import threading
from data.maker import prepare
+from data.maker.state import State
import copy
import os
import nujson as json
@@ -25,6 +26,7 @@ from multiprocessing import Queue
import time
class Learner(Process):
    def __init__(self,**_args):
@@ -48,7 +50,7 @@ class Learner(Process):
        if 'network_args' not in _args :
            self.network_args ={
                'context':self.info['context'] ,
-               'logs':_args['logpath'] if 'logpath' in _args else 'logs',
+               'logs':_args['logs'] if 'logs' in _args else 'logs',
                'max_epochs':int(_args['epochs']) if 'epochs' in _args else 2,
                'batch_size':int (_args['batch']) if 'batch' in _args else 2000
            }
@@ -72,6 +74,36 @@ class Learner(Process):
        self.logger = None
        if 'logger' in self.store :
            self.logger = transport.factory.instance(**self.store['logger'])
+       self.autopilot = False #-- to be set by caller
+       self._initStateSpace()
+   def _initStateSpace(self):
+       """
+       Initializing the state-space for the data-maker. The state-space functions are pre/post-processing functions applied to the data accordingly, i.e.
+           - Trainer    -> pre-processing
+           - Generation -> post-processing
+       The specification of a state-space in the configuration file is as follows:
+           state:{pre:{path,pipeline:[]}, post:{path,pipeline:[]}}
+       """
+       self._states = None
+       if 'state' in self.info :
+           try:
+               _config = self.info ['state']
+               self._states = State.instance(_config)
+           except Exception as e:
+               print (e)
+               pass
+           finally:
+               # __info = (pd.DataFrame(self._states)[['name','path','args']]).to_dict(orient='records')
+               if self._states :
+                   __info = {}
+                   for key in self._states :
+                       __info[key] = [{"name":_item['name'],"args":_item['args'],"path":_item['path']} for _item in self._states[key]]
+                   self.log(object='state-space',action='load',input=__info)
    def log(self,**_args):
        try:
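A concrete, hypothetical example of the `state` block described in the docstring above, as it might appear in the configuration handed to the Learner via `self.info`; the module path and function names are placeholders, and each pipeline entry is assumed to resolve to a callable that takes and returns a DataFrame:

```python
# illustrative only -- field names follow the docstring: state:{pre:{path,pipeline:[]}, post:{path,pipeline:[]}}
info = {
    "context": "demo",
    "state": {
        "pre":  {"path": "etc/states.py", "pipeline": ["clean_dates", "drop_phi"]},
        "post": {"path": "etc/states.py", "pipeline": ["approximate_ages"]}
    }
}
```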
@@ -108,11 +140,36 @@ class Learner(Process):
        _read_args= self.info
        if self._df is None :
            self._df = reader.read(**_read_args)
+       #
+       # NOTE : PRE
+       # At this point we apply pre-processing of the data if there were ever a need for it
+       #
+       _log = {}
+       HAS_STATES = self._states is not None and 'pre' in self._states
+       NOT_GENERATING = self.name in ['Trainer','Shuffle']
+       IS_AUTOPILOT = self.autopilot
+       #
+       # allow calling pre-conditions if either of the conditions is true
+       #   1. states and not generating
+       #   2. IS_GENERATING and states and not autopilot
+       _ALLOW_PRE_CALL = (HAS_STATES and NOT_GENERATING) or (NOT_GENERATING is False and HAS_STATES and IS_AUTOPILOT is False)
+       if _ALLOW_PRE_CALL :
+           # if HAS_STATES and NOT_GENERATING or (HAS_STATES and IS_AUTOPILOT is False and NOT_GENERATING is False):
+           _logs = {'action':'status','input':{'pre':self._states['pre']}}
+           _beg = list(self._df.shape)
+           self._df = State.apply(self._df,self._states['pre'])
+           _end = list(self._df.shape)
+           _logs['input']['size'] = _beg,_end
+           self.log(**_log)
+       #
+       #
        columns = self.columns if self.columns else self._df.columns
        #
        # Below is a source of inefficiency, unfortunately python's type inference doesn't work well in certain cases
        #   - The code below tries to address the issue (Perhaps better suited for the reading components)
+       _log = {}
        for name in columns :
            #
            # randomly sampling 5 elements to make sense of data-types
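The two-clause `_ALLOW_PRE_CALL` expression above reduces to a shorter equivalent, which may be easier to audit: pre-processing runs whenever states are configured, except when generating under autopilot. A sketch of the equivalence (my own simplification, not in the commit):

```python
from itertools import product

def original(has_states, not_generating, is_autopilot):
    # verbatim structure of the condition in the diff
    return (has_states and not_generating) or (not_generating is False and has_states and is_autopilot is False)

def simplified(has_states, not_generating, is_autopilot):
    # states are required; skip only when generating under autopilot
    return has_states and (not_generating or not is_autopilot)

# exhaustive check over all eight boolean combinations
assert all(original(*flags) == simplified(*flags) for flags in product([True, False], repeat=3))
```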
@@ -201,8 +258,14 @@ class Trainer(Learner):
        # @TODO: At this point we need to generate some other objects
        #
        _args = {"network_args":self.network_args,"store":self.store,"info":self.info,"candidates":self.candidates,"data":self._df}
+       _args['logs'] = self.network_args['logs']
+       _args['autopilot'] = self.autopilot
        if self.gpu :
            _args['gpu'] = self.gpu
+       #
+       # Let us find the smallest loss; the items are sorted by loss ...
+       _args['epochs'] = gTrain.logs['epochs'][0]['epochs']
        g = Generator(**_args)
        # g.run()
@@ -239,6 +302,7 @@ class Generator (Learner):
            file.close()
        else:
            self._map = {}
+       self.autopilot = False if 'autopilot' not in _args else _args['autopilot']
    def run(self):
        self.initalize()
        if self._encoder is None :
@@ -416,33 +480,32 @@ class Generator (Learner):
            _df = self._df.copy()
            _df[self.columns] = _iodf[self.columns]
            N += _df.shape[0]
-           #
-           #@TODO:
-           # Improve formatting with better post-processing pipeline
-           if 'approximate' in self.info :
-               _df = self.approximate(_df)
-           if 'make_date' in self.info :
-               for name in self.info['make_date'] :
-                   # iname = self.info['make_date']['init_field']
-                   iname = self.info['make_date'][name]
-                   years = _df[iname]
-                   _dates = [self.make_date(year=_year,field=name) for _year in years]
-                   if _dates :
-                       _df[name] = _dates
+           if self._states :
+               _df = State.apply(_df,self._states['post'])
+           # #
+           # #@TODO:
+           # # Improve formatting with better post-processing pipeline
+           # if 'approximate' in self.info :
+           #     _df = self.approximate(_df)
+           # if 'make_date' in self.info :
+           #     for name in self.info['make_date'] :
+           #         # iname = self.info['make_date']['init_field']
+           #         iname = self.info['make_date'][name]
+           #         years = _df[iname]
+           #         _dates = [self.make_date(year=_year,field=name) for _year in years]
+           #         if _dates :
+           #             _df[name] = _dates
            _schema = self.get_schema()
            # _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema]
            _df = self.format(_df,_schema)
            _log = [{"name":_schema[i]['name'],"dataframe":_df[_df.columns[i]].dtypes.name,"schema":_schema[i]['type']} for i in np.arange(len(_schema)) ]
            self.log(**{"action":"consolidate","input":_log})
            # w = transport.factory.instance(doc='observation',provider='mongodb',context='write',db='IOV01_LOGS',auth_file='/home/steve/dev/transport/mongo.json')
            # w.write(_df)
            # cols = [name for name in _df.columns if name.endswith('datetime')]
            # print (_df[cols])
            if _store :
                writer = transport.factory.instance(**_store)
                if _store['provider'] == 'bigquery':
@@ -507,8 +570,10 @@ class factory :
    :param info {columns,sql,from}
    :param autopilot will generate output automatically
    :param batch (default 2k) size of the batch
    """
    if _args['apply'] in [apply.RANDOM] :
        pthread = Shuffle(**_args)
    elif _args['apply'] == apply.GENERATE :

@@ -276,7 +276,7 @@ class Input :
        if np.random.choice([0,1],1)[0] :
            novalues = _values[np.random.choice( len(_values),1)[0]].tolist()
        else:
            novalues = np.repeat(None,len(self._columns))
        x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else novalues ,axis=1).tolist()
        return pd.DataFrame(x,columns=columns)
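To make the decoding line above concrete, here is a tiny standalone example with synthetic values, simplified to a single encoded attribute (in the library, `_values` and `novalues` may hold tuples spanning several columns): each row of `_matrix` is a one-hot vector over `_values`, and rows with no bit set fall back to `novalues`.

```python
import numpy as np
import pandas as pd

_values  = np.array(['F', 'M'])                    # the categories the one-hot columns stand for
_matrix  = pd.DataFrame([[1, 0], [0, 1], [0, 0]])  # last row: nothing was set by the generator
novalues = np.repeat(None, 1).tolist()             # fallback, mirroring the no-value branch above

x = _matrix.apply(
    lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else novalues,
    axis=1).tolist()
print(x)   # e.g. ['F', 'M', [None]] -- matched label per row, fallback where no bit was set
```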
