From ce55848cc8d8fa06aad95ce8f75274ae968e657d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 13 Feb 2020 17:30:56 -0600 Subject: [PATCH] bug fix with dimensions @TODO: GPU workload --- data/gan.py | 1185 ++++++++++++++++++++++++++------------------------- setup.py | 2 +- 2 files changed, 598 insertions(+), 589 deletions(-) diff --git a/data/gan.py b/data/gan.py index e54daa8..367d63c 100644 --- a/data/gan.py +++ b/data/gan.py @@ -10,11 +10,11 @@ python data/maker --config CONFIGURATION FILE STRUCTURE : - context what it is you are loading (stroke, hypertension, ...) - data path of the file to be loaded - logs folder to store training model and meta data about learning - max_epochs number of iterations in learning - num_gpu number of gpus to be used (will still run if the GPUs are not available) + context what it is you are loading (stroke, hypertension, ...) + data path of the file to be loaded + logs folder to store training model and meta data about learning + max_epochs number of iterations in learning + num_gpu number of gpus to be used (will still run if the GPUs are not available) EMBEDDED IN CODE : @@ -35,619 +35,628 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ['CUDA_VISIBLE_DEVICES'] = "0" os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' -# STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256 -# NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu']) +# STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256 +# NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu']) # BATCHSIZE_PER_GPU = 2000 -# TOTAL_BATCHSIZE = BATCHSIZE_PER_GPU * NUM_GPUS +# TOTAL_BATCHSIZE = BATCHSIZE_PER_GPU * NUM_GPUS class void : - pass + pass class GNet : - def log(self,**args): - self.logs = dict(args,**self.logs) - - - """ - This is the base class of a generative network functions, the details will be implemented in the subclasses. 
- An instance of this class is accessed as follows - object.layers.normalize applies batch normalization or otherwise - obect.get.variables instanciate variables on cpu and return a reference (tensor) - """ - def __init__(self,**args): - self.layers = void() - self.layers.normalize = self.normalize - self.logs = {} - - self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] - - - self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854 - self.G_STRUCTURE = [128,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE] - self.D_STRUCTURE = [self.X_SPACE_SIZE,256,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE*2, self.X_SPACE_SIZE] #-- change 854 to number of diagnosis - # self.NUM_LABELS = 8 if 'label' not in args elif len(args['label'].shape) args['label'].shape[1] - if 'label' in args and len(args['label'].shape) == 2 : - self.NUM_LABELS = args['label'].shape[1] - elif 'label' in args and len(args['label']) == 1 : - self.NUM_LABELS = args['label'].shape[0] - else: - self.NUM_LABELS = 8 - self.Z_DIM = 128 #self.X_SPACE_SIZE - self.BATCHSIZE_PER_GPU = args['real'].shape[0] if 'real' in args else 256 - self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS - self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) - self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) - self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100 - self.CONTEXT = args['context'] - self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} - self._REAL = args['real'] if 'real' in args else None - self._LABEL = args['label'] if 'label' in args else None - - self.get = void() - self.get.variables = self._variable_on_cpu - self.get.suffix = lambda : "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - self.logger = args['logger'] if 'logger' in args and args['logger'] else None - self.init_logs(**args) - - def init_logs(self,**args): - self.log_dir = args['logs'] if 'logs' in args else 'logs' - self.mkdir(self.log_dir) - # - # - for key in ['train','output'] : - self.mkdir(os.sep.join([self.log_dir,key])) - self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT])) - - self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) - self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) - if self.logger : - # - # We will clear the logs from the data-store - # - column = self.ATTRIBUTES['synthetic'] - db = self.logger.db - if db[column].count() > 0 : - db.backup.insert({'name':column,'logs':list(db[column].find()) }) - db[column].drop() - - def load_meta(self,column): + def log(self,**args): + self.logs = dict(args,**self.logs) + + """ - This function is designed to accomodate the uses of the sub-classes outside of a strict dependency model. - Because prediction and training can happen independently + This is the base class of a generative network functions, the details will be implemented in the subclasses. 
+ An instance of this class is accessed as follows + object.layers.normalize applies batch normalization or otherwise + obect.get.variables instanciate variables on cpu and return a reference (tensor) """ - # suffix = "-".join(column) if isinstance(column,list)else column - suffix = self.get.suffix() - _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json']) - if os.path.exists(_name) : - attr = json.loads((open(_name)).read()) - for key in attr : - value = attr[key] - setattr(self,key,value) - self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) - self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + def __init__(self,**args): + self.layers = void() + self.layers.normalize = self.normalize + self.logs = {} + + self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu'] + if self.NUM_GPUS > 1 : + os.environ['CUDA_VISIBLE_DEVICES'] = "4" + + self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854 + self.G_STRUCTURE = [128,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE] + self.D_STRUCTURE = [self.X_SPACE_SIZE,256,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE*2, self.X_SPACE_SIZE] #-- change 854 to number of diagnosis + # self.NUM_LABELS = 8 if 'label' not in args elif len(args['label'].shape) args['label'].shape[1] + + if 'label' in args and len(args['label'].shape) == 2 : + self.NUM_LABELS = args['label'].shape[1] + elif 'label' in args and len(args['label']) == 1 : + self.NUM_LABELS = args['label'].shape[0] + else: + self.NUM_LABELS = 8 + # self.Z_DIM = 128 #self.X_SPACE_SIZE + self.Z_DIM = 128 #-- used as rows down stream + self.G_STRUCTURE = [self.Z_DIM,self.Z_DIM] + if 'real' in args : + self.D_STRUCTURE = [args['real'].shape[1],256,self.Z_DIM] + + self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) if 'real' in args else 256 + self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS + self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000) + self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs']) + self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100 + self.CONTEXT = args['context'] + self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} + self._REAL = args['real'] if 'real' in args else None + self._LABEL = args['label'] if 'label' in args else None + + self.get = void() + self.get.variables = self._variable_on_cpu + self.get.suffix = lambda : "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + self.logger = args['logger'] if 'logger' in args and args['logger'] else None + self.init_logs(**args) + + def init_logs(self,**args): + self.log_dir = args['logs'] if 'logs' in args else 'logs' + self.mkdir(self.log_dir) + # + # + for key in ['train','output'] : + self.mkdir(os.sep.join([self.log_dir,key])) + self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT])) + + self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) + self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + if self.logger : + # + # We will clear the logs from the data-store + # + column = self.ATTRIBUTES['synthetic'] + db = self.logger.db + if db[column].count() > 0 : + db.backup.insert({'name':column,'logs':list(db[column].find()) }) + db[column].drop() + + def load_meta(self,column): + """ + This function is designed to accomodate the uses of the sub-classes outside of a strict dependency model. 
+ Because prediction and training can happen independently + """ + # suffix = "-".join(column) if isinstance(column,list)else column + suffix = self.get.suffix() + _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json']) + if os.path.exists(_name) : + attr = json.loads((open(_name)).read()) + for key in attr : + value = attr[key] + setattr(self,key,value) + self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT]) + self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT]) + + + def log_meta(self,**args) : + + _object = { + # '_id':'meta', + 'CONTEXT':self.CONTEXT, + 'ATTRIBUTES':self.ATTRIBUTES, + 'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU, + 'Z_DIM':self.Z_DIM, + "X_SPACE_SIZE":self.X_SPACE_SIZE, + "D_STRUCTURE":self.D_STRUCTURE, + "G_STRUCTURE":self.G_STRUCTURE, + "NUM_GPUS":self.NUM_GPUS, + "NUM_LABELS":self.NUM_LABELS, + "MAX_EPOCHS":self.MAX_EPOCHS, + "ROW_COUNT":self.ROW_COUNT + } + if args and 'key' in args and 'value' in args : + key = args['key'] + value= args['value'] + object[key] = value + # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column + suffix = self.get.suffix() + _name = os.sep.join([self.out_dir,'meta-'+suffix]) + + f = open(_name+'.json','w') + f.write(json.dumps(_object)) + return _object + def mkdir (self,path): + if not os.path.exists(path) : + os.mkdir(path) - - def log_meta(self,**args) : - - _object = { - # '_id':'meta', - 'CONTEXT':self.CONTEXT, - 'ATTRIBUTES':self.ATTRIBUTES, - 'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU, - 'Z_DIM':self.Z_DIM, - "X_SPACE_SIZE":self.X_SPACE_SIZE, - "D_STRUCTURE":self.D_STRUCTURE, - "G_STRUCTURE":self.G_STRUCTURE, - "NUM_GPUS":self.NUM_GPUS, - "NUM_LABELS":self.NUM_LABELS, - "MAX_EPOCHS":self.MAX_EPOCHS, - "ROW_COUNT":self.ROW_COUNT - } - if args and 'key' in args and 'value' in args : - key = args['key'] - value= args['value'] - object[key] = value - # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column - suffix = self.get.suffix() - _name = os.sep.join([self.out_dir,'meta-'+suffix]) - - f = open(_name+'.json','w') - f.write(json.dumps(_object)) - return _object - def mkdir (self,path): - if not os.path.exists(path) : - os.mkdir(path) - - - def normalize(self,**args): - """ - This function will perform a batch normalization on an network layer - inputs input layer of the neural network - name name of the scope the - labels labels (attributes not synthesized) by default None - n_labels number of labels default None - """ - inputs = args['inputs'] - name = args['name'] - labels = None if 'labels' not in args else args['labels'] - n_labels= None if 'n_labels' not in args else args['n_labels'] - shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing - mean, var = tf.nn.moments(inputs, shift, keep_dims=True) - shape = inputs.shape[1].value - offset_m = self.get.variables(shape=[n_labels,shape], name='offset'+name, - initializer=tf.zeros_initializer) - scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, - initializer=tf.ones_initializer) - - offset = tf.nn.embedding_lookup(offset_m, labels) - scale = tf.nn.embedding_lookup(scale_m, labels) - result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-8) - return result - - def _variable_on_cpu(self,**args): - """ - This function makes sure variables/tensors are not created on the GPU but rather on the CPU - """ - name = args['name'] - shape = args['shape'] - initializer=None if 'initializer' not in args else args['initializer'] - 
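For reference, the label-conditioned batch normalization implemented by GNet.normalize (a per-label offset and scale fetched with tf.nn.embedding_lookup, then applied by tf.nn.batch_normalization) can be sketched in plain NumPy. The helper below is illustrative only and assumes the generator case, where the moments are taken over axis 0; shapes and the 1e-8 epsilon mirror the code in this patch.

import numpy as np

def conditional_batch_norm(x, labels, offset_m, scale_m, eps=1e-8):
    # x: (batch, dim) activations; labels: (batch,) integer label ids
    # offset_m / scale_m: (n_labels, dim) per-label parameters, as created by get.variables
    mean = x.mean(axis=0, keepdims=True)
    var = x.var(axis=0, keepdims=True)
    x_hat = (x - mean) / np.sqrt(var + eps)
    offset = offset_m[labels]          # stands in for tf.nn.embedding_lookup(offset_m, labels)
    scale = scale_m[labels]
    return scale * x_hat + offset

x = np.random.randn(4, 3).astype(np.float32)
labels = np.array([0, 1, 1, 0])
print(conditional_batch_norm(x, labels, np.zeros((2, 3)), np.ones((2, 3))))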
with tf.device('/cpu:0') : - cpu_var = tf.compat.v1.get_variable(name,shape,initializer= initializer) - return cpu_var - def average_gradients(self,tower_grads): - average_grads = [] - for grad_and_vars in zip(*tower_grads): - grads = [] - for g, _ in grad_and_vars: - expanded_g = tf.expand_dims(g, 0) - grads.append(expanded_g) - - grad = tf.concat(axis=0, values=grads) - grad = tf.reduce_mean(grad, 0) - - v = grad_and_vars[0][1] - grad_and_var = (grad, v) - average_grads.append(grad_and_var) - return average_grads + def normalize(self,**args): + """ + This function will perform a batch normalization on an network layer + inputs input layer of the neural network + name name of the scope the + labels labels (attributes not synthesized) by default None + n_labels number of labels default None + """ + inputs = args['inputs'] + name = args['name'] + labels = None if 'labels' not in args else args['labels'] + n_labels= None if 'n_labels' not in args else args['n_labels'] + shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing + mean, var = tf.nn.moments(inputs, shift, keep_dims=True) + shape = inputs.shape[1].value + offset_m = self.get.variables(shape=[n_labels,shape], name='offset'+name, + initializer=tf.zeros_initializer) + scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, + initializer=tf.ones_initializer) + + offset = tf.nn.embedding_lookup(offset_m, labels) + scale = tf.nn.embedding_lookup(scale_m, labels) + result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-8) + return result + + def _variable_on_cpu(self,**args): + """ + This function makes sure variables/tensors are not created on the GPU but rather on the CPU + """ + + name = args['name'] + shape = args['shape'] + initializer=None if 'initializer' not in args else args['initializer'] + with tf.device('/cpu:0') : + cpu_var = tf.compat.v1.get_variable(name,shape,initializer= initializer) + return cpu_var + def average_gradients(self,tower_grads): + average_grads = [] + for grad_and_vars in zip(*tower_grads): + grads = [] + for g, _ in grad_and_vars: + expanded_g = tf.expand_dims(g, 0) + grads.append(expanded_g) + + grad = tf.concat(axis=0, values=grads) + grad = tf.reduce_mean(grad, 0) + + v = grad_and_vars[0][1] + grad_and_var = (grad, v) + average_grads.append(grad_and_var) + return average_grads class Generator (GNet): - """ - This class is designed to handle generation of candidate datasets for this it will aggregate a discriminator, this allows the generator not to be random - - """ - def __init__(self,**args): - GNet.__init__(self,**args) - self.discriminator = Discriminator(**args) - def loss(self,**args): - fake = args['fake'] - label = args['label'] - y_hat_fake = self.discriminator.network(inputs=fake, label=label) - #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - loss = -tf.reduce_mean(y_hat_fake) + sum(all_regs) - #tf.add_to_collection('glosses', loss) - tf.compat.v1.add_to_collection('glosses', loss) - return loss, loss - def load_meta(self, column): - super().load_meta(column) - self.discriminator.load_meta(column) - def network(self,**args) : - """ - This function will build the network that will generate the synthetic candidates - :inputs matrix of data that we need - :dim dimensions of ... 
""" - x = args['inputs'] - tmp_dim = self.Z_DIM if 'dim' not in args else args['dim'] - label = args['label'] + This class is designed to handle generation of candidate datasets for this it will aggregate a discriminator, this allows the generator not to be random - with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): - for i, dim in enumerate(self.G_STRUCTURE[:-1]): - kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim]) - h1 = self.normalize(inputs=tf.matmul(x, kernel),shift=0, name='cbn' + str(i), labels=label, n_labels=self.NUM_LABELS) - h2 = tf.nn.relu(h1) - x = x + h2 - tmp_dim = dim - i = len(self.G_STRUCTURE) - 1 - # - # This seems to be an extra hidden layer: - # It's goal is to map continuous values to discrete values (pre-trained to do this) - kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, self.G_STRUCTURE[-1]]) - h1 = self.normalize(inputs=tf.matmul(x, kernel), name='cbn' + str(i), - labels=label, n_labels=self.NUM_LABELS) - h2 = tf.nn.tanh(h1) - x = x + h2 - # This seems to be the output layer - # - kernel = self.get.variables(name='W_' + str(i+1), shape=[self.Z_DIM, self.X_SPACE_SIZE]) - bias = self.get.variables(name='b_' + str(i+1), shape=[self.X_SPACE_SIZE]) - x = tf.nn.sigmoid(tf.add(tf.matmul(x, kernel), bias)) - return x + """ + def __init__(self,**args): + GNet.__init__(self,**args) + self.discriminator = Discriminator(**args) + def loss(self,**args): + fake = args['fake'] + label = args['label'] + y_hat_fake = self.discriminator.network(inputs=fake, label=label) + #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) + loss = -tf.reduce_mean(y_hat_fake) + sum(all_regs) + #tf.add_to_collection('glosses', loss) + tf.compat.v1.add_to_collection('glosses', loss) + return loss, loss + def load_meta(self, column): + super().load_meta(column) + self.discriminator.load_meta(column) + def network(self,**args) : + """ + This function will build the network that will generate the synthetic candidates + :inputs matrix of data that we need + :dim dimensions of ... 
+ """ + x = args['inputs'] + tmp_dim = self.Z_DIM if 'dim' not in args else args['dim'] + label = args['label'] + + with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): + for i, dim in enumerate(self.G_STRUCTURE[:-1]): + kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim]) + h1 = self.normalize(inputs=tf.matmul(x, kernel),shift=0, name='cbn' + str(i), labels=label, n_labels=self.NUM_LABELS) + h2 = tf.nn.relu(h1) + x = x + h2 + tmp_dim = dim + i = len(self.G_STRUCTURE) - 1 + # + # This seems to be an extra hidden layer: + # It's goal is to map continuous values to discrete values (pre-trained to do this) + kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, self.G_STRUCTURE[-1]]) + h1 = self.normalize(inputs=tf.matmul(x, kernel), name='cbn' + str(i), + labels=label, n_labels=self.NUM_LABELS) + h2 = tf.nn.tanh(h1) + x = x + h2 + # This seems to be the output layer + # + kernel = self.get.variables(name='W_' + str(i+1), shape=[self.Z_DIM, self.X_SPACE_SIZE]) + bias = self.get.variables(name='b_' + str(i+1), shape=[self.X_SPACE_SIZE]) + x = tf.nn.sigmoid(tf.add(tf.matmul(x, kernel), bias)) + return x class Discriminator(GNet): - def __init__(self,**args): - GNet.__init__(self,**args) - def network(self,**args): - """ - This function will apply a computational graph on a dataset passed in with the associated labels and the last layer must have a single output (neuron) - :inputs - :label - """ - x = args['inputs'] - label = args['label'] - with tf.compat.v1.variable_scope('D', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): - for i, dim in enumerate(self.D_STRUCTURE[1:]): - kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[i], dim]) - bias = self.get.variables(name='b_' + str(i), shape=[dim]) - # print (["\t",bias,kernel]) - x = tf.nn.relu(tf.add(tf.matmul(x, kernel), bias)) - x = self.normalize(inputs=x, name='cln' + str(i), shift=1,labels=label, n_labels=self.NUM_LABELS) - i = len(self.D_STRUCTURE) - kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[-1], 1]) - bias = self.get.variables(name='b_' + str(i), shape=[1]) - y = tf.add(tf.matmul(x, kernel), bias) - return y - - def loss(self,**args) : - """ - This function compute the loss of - :real - :fake - :label - """ - real = args['real'] - fake = args['fake'] - label = args['label'] - epsilon = tf.random.uniform(shape=[self.BATCHSIZE_PER_GPU,1],minval=0,maxval=1) + def __init__(self,**args): + GNet.__init__(self,**args) + def network(self,**args): + """ + This function will apply a computational graph on a dataset passed in with the associated labels and the last layer must have a single output (neuron) + :inputs + :label + """ + x = args['inputs'] + label = args['label'] + with tf.compat.v1.variable_scope('D', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): + for i, dim in enumerate(self.D_STRUCTURE[1:]): + kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[i], dim]) + bias = self.get.variables(name='b_' + str(i), shape=[dim]) + # print (["\t",bias,kernel]) + x = tf.nn.relu(tf.add(tf.matmul(x, kernel), bias)) + x = self.normalize(inputs=x, name='cln' + str(i), shift=1,labels=label, n_labels=self.NUM_LABELS) + i = len(self.D_STRUCTURE) + kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[-1], 1]) + bias = self.get.variables(name='b_' + str(i), shape=[1]) + y = tf.add(tf.matmul(x, kernel), bias) + return y - x_hat = real + epsilon * 
(fake - real) - y_hat_fake = self.network(inputs=fake, label=label) - - y_hat_real = self.network(inputs=real, label=label) - y_hat = self.network(inputs=x_hat, label=label) - - grad = tf.gradients(y_hat, [x_hat])[0] - slopes = tf.sqrt(tf.reduce_sum(tf.square(grad), 1)) - gradient_penalty = tf.reduce_mean((slopes - 1.) ** 2) - #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - w_distance = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake) - loss = w_distance + 10 * gradient_penalty + sum(all_regs) - #tf.add_to_collection('dlosses', loss) - tf.compat.v1.add_to_collection('dlosses', loss) - - return w_distance, loss + def loss(self,**args) : + """ + This function compute the loss of + :real + :fake + :label + """ + real = args['real'] + fake = args['fake'] + label = args['label'] + epsilon = tf.random.uniform(shape=[self.BATCHSIZE_PER_GPU,1],minval=0,maxval=1) + + x_hat = real + epsilon * (fake - real) + y_hat_fake = self.network(inputs=fake, label=label) + + y_hat_real = self.network(inputs=real, label=label) + y_hat = self.network(inputs=x_hat, label=label) + + grad = tf.gradients(y_hat, [x_hat])[0] + slopes = tf.sqrt(tf.reduce_sum(tf.square(grad), 1)) + gradient_penalty = tf.reduce_mean((slopes - 1.) ** 2) + #all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) + w_distance = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake) + loss = w_distance + 10 * gradient_penalty + sum(all_regs) + #tf.add_to_collection('dlosses', loss) + tf.compat.v1.add_to_collection('dlosses', loss) + + return w_distance, loss class Train (GNet): - def __init__(self,**args): - GNet.__init__(self,**args) - self.generator = Generator(**args) - self.discriminator = Discriminator(**args) - self._REAL = args['real'] - self._LABEL= args['label'] - self.column = args['column'] - # print ([" *** ",self.BATCHSIZE_PER_GPU]) - - self.meta = self.log_meta() - if(self.logger): - - self.logger.write( self.meta ) - - self.log (real_shape=list(self._REAL.shape),label_shape = list(self._LABEL.shape),meta_data=self.meta) - def load_meta(self, column): - """ - This function will delegate the calls to load meta data to it's dependents - column name - """ - super().load_meta(column) - self.generator.load_meta(column) - self.discriminator.load_meta(column) - def loss(self,**args): - """ - This function will compute a "tower" loss of the generated candidate against real data - Training will consist in having both generator and discriminators - :scope - :stage - :real - :label - """ - - scope = args['scope'] - stage = args['stage'] - real = args['real'] - label = args['label'] - label = tf.cast(label, tf.int32) - # - # @TODO: Ziqi needs to explain what's going on here - m = [[i] for i in np.arange(self._LABEL.shape[1]-2)] - label = label[:, 1] * len(m) + tf.squeeze( - tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32)) - ) - # label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] ) - z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) + def __init__(self,**args): + GNet.__init__(self,**args) + self.generator = Generator(**args) + self.discriminator = Discriminator(**args) + self._REAL = args['real'] + self._LABEL= args['label'] + self.column = args['column'] + # print ([" *** ",self.BATCHSIZE_PER_GPU]) + + self.meta = self.log_meta() + if(self.logger): + + self.logger.write( self.meta ) + + 
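The discriminator loss above is the WGAN-GP objective: a Wasserstein-distance estimate plus a gradient penalty weighted by 10, computed on points interpolated between real and fake rows. A minimal NumPy sketch with a linear critic (purely illustrative, chosen so the input gradient has the closed form w and no autodiff is needed) reproduces the same quantities.

import numpy as np

def wgan_gp_loss(real, fake, w, b, lam=10.0):
    # Same construction as Discriminator.loss above, for a linear critic y = x @ w + b.
    def critic(x):
        return x @ w + b
    eps = np.random.uniform(size=(real.shape[0], 1))
    x_hat = real + eps * (fake - real)                       # interpolate real/fake rows
    w_distance = -critic(real).mean() + critic(fake).mean()  # Wasserstein estimate
    grad = np.broadcast_to(w.reshape(1, -1), x_hat.shape)    # d critic / d x_hat
    slopes = np.sqrt(np.sum(np.square(grad), axis=1))
    gradient_penalty = np.mean((slopes - 1.0) ** 2)
    return w_distance + lam * gradient_penalty

real, fake = np.random.rand(8, 5), np.random.rand(8, 5)
print(wgan_gp_loss(real, fake, w=np.random.randn(5, 1), b=0.0))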
self.log (real_shape=list(self._REAL.shape),label_shape = list(self._LABEL.shape),meta_data=self.meta) + def load_meta(self, column): + """ + This function will delegate the calls to load meta data to it's dependents + column name + """ + super().load_meta(column) + self.generator.load_meta(column) + self.discriminator.load_meta(column) + def loss(self,**args): + """ + This function will compute a "tower" loss of the generated candidate against real data + Training will consist in having both generator and discriminators + :scope + :stage + :real + :label + """ + + scope = args['scope'] + stage = args['stage'] + real = args['real'] + label = args['label'] + label = tf.cast(label, tf.int32) + # + # @TODO: Ziqi needs to explain what's going on here + m = [[i] for i in np.arange(self._LABEL.shape[1]-2)] + label = label[:, 1] * len(m) + tf.squeeze( + tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32)) + ) + # label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] ) + z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) + + fake = self.generator.network(inputs=z, label=label) + if stage == 'D': + w, loss = self.discriminator.loss(real=real, fake=fake, label=label) + #losses = tf.get_collection('dlosses', scope) + flag = 'dlosses' + losses = tf.compat.v1.get_collection('dlosses', scope) + else: + w, loss = self.generator.loss(fake=fake, label=label) + #losses = tf.get_collection('glosses', scope) + flag = 'glosses' + losses = tf.compat.v1.get_collection('glosses', scope) + # losses = tf.compat.v1.get_collection(flag, scope) + + total_loss = tf.add_n(losses, name='total_loss') + + return total_loss, w + def input_fn(self): + """ + This function seems to produce + """ + features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32) + labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32) + dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) + dataset = dataset.repeat(10000) + dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) + dataset = dataset.prefetch(1) + # iterator = dataset.make_initializable_iterator() + iterator = tf.compat.v1.data.make_initializable_iterator(dataset) + # next_element = iterator.get_next() + # init_op = iterator.initializer + return iterator, features_placeholder, labels_placeholder - fake = self.generator.network(inputs=z, label=label) - if stage == 'D': - w, loss = self.discriminator.loss(real=real, fake=fake, label=label) - #losses = tf.get_collection('dlosses', scope) - flag = 'dlosses' - losses = tf.compat.v1.get_collection('dlosses', scope) - else: - w, loss = self.generator.loss(fake=fake, label=label) - #losses = tf.get_collection('glosses', scope) - flag = 'glosses' - losses = tf.compat.v1.get_collection('glosses', scope) - # losses = tf.compat.v1.get_collection(flag, scope) - - total_loss = tf.add_n(losses, name='total_loss') + def network(self,**args): + # def graph(stage, opt): + # global_step = tf.get_variable(stage+'_step', [], initializer=tf.constant_initializer(0), trainable=False) + stage = args['stage'] + opt = args['opt'] + tower_grads = [] + per_gpu_w = [] + iterator, features_placeholder, labels_placeholder = self.input_fn() + with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()): + for i in range(self.NUM_GPUS): + with tf.device('/gpu:%d' % i): + with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: + (real, label) = iterator.get_next() + loss, w = self.loss(scope=scope, stage=stage, real=self._REAL, 
label=self._LABEL) + #tf.get_variable_scope().reuse_variables() + tf.compat.v1.get_variable_scope().reuse_variables() + #vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage) + vars_ = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=stage) + grads = opt.compute_gradients(loss, vars_) + tower_grads.append(grads) + per_gpu_w.append(w) + + grads = self.average_gradients(tower_grads) + apply_gradient_op = opt.apply_gradients(grads) + + mean_w = tf.reduce_mean(per_gpu_w) + train_op = apply_gradient_op + return train_op, mean_w, iterator, features_placeholder, labels_placeholder + def apply(self,**args): + # max_epochs = args['max_epochs'] if 'max_epochs' in args else 10 + REAL = self._REAL + LABEL= self._LABEL + if (self.logger): + pass + + with tf.device('/cpu:0'): + opt_d = tf.compat.v1.train.AdamOptimizer(1e-4) + opt_g = tf.compat.v1.train.AdamOptimizer(1e-4) + + train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d) + train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g) + # saver = tf.train.Saver() + saver = tf.compat.v1.train.Saver() + # init = tf.global_variables_initializer() + init = tf.compat.v1.global_variables_initializer() + logs = [] + #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: + with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: + sess.run(init) + sess.run(iterator_d.initializer, + feed_dict={features_placeholder_d: REAL, labels_placeholder_d: LABEL}) + sess.run(iterator_g.initializer, + feed_dict={features_placeholder_g: REAL, labels_placeholder_g: LABEL}) + + for epoch in range(1, self.MAX_EPOCHS + 1): + start_time = time.time() + w_sum = 0 + for i in range(self.STEPS_PER_EPOCH): + for _ in range(2): + _, w = sess.run([train_d, w_distance]) + w_sum += w + sess.run(train_g) + duration = time.time() - start_time + + assert not np.isnan(w_sum), 'Model diverged with loss = NaN' + + format_str = 'epoch: %d, w_distance = %f (%.1f)' + print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) + # print (dir (w_distance)) + + logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) + + if epoch % self.MAX_EPOCHS == 0: + # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + suffix = self.get.suffix() + _name = os.sep.join([self.train_dir,suffix]) + # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) + saver.save(sess, _name, write_meta_graph=False, global_step=epoch) + # + # + if self.logger : + row = {"logs":logs} #,"model":pickle.dump(sess)} + self.logger.write(row) + # + # @TODO: + # We should upload the files in the checkpoint + # This would allow the learnt model to be portable to another system + # + tf.compat.v1.reset_default_graph() - return total_loss, w - def input_fn(self): +class Predict(GNet): """ - This function seems to produce + This class uses synthetic data given a learned model """ - features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32) - labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32) - dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) - dataset = dataset.repeat(10000) - dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) - 
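The tower pattern in Train.network above relies on average_gradients to merge per-GPU gradients before opt.apply_gradients. A small NumPy sketch of that reduction, with toy gradients standing in for tensors and strings standing in for variables, shows the same stacking-and-mean step performed by tf.concat and tf.reduce_mean.

import numpy as np

def average_gradients(tower_grads):
    # tower_grads: one [(grad, var), ...] list per GPU tower, same variable order.
    averaged = []
    for grad_and_vars in zip(*tower_grads):
        grads = np.stack([g for g, _ in grad_and_vars], axis=0)
        averaged.append((grads.mean(axis=0), grad_and_vars[0][1]))
    return averaged

tower_0 = [(np.array([1.0, 2.0]), 'W_0'), (np.array([0.5]), 'b_0')]
tower_1 = [(np.array([3.0, 4.0]), 'W_0'), (np.array([1.5]), 'b_0')]
print(average_gradients([tower_0, tower_1]))   # W_0 -> [2., 3.], b_0 -> [1.]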
dataset = dataset.prefetch(1) - # iterator = dataset.make_initializable_iterator() - iterator = tf.compat.v1.data.make_initializable_iterator(dataset) - # next_element = iterator.get_next() - # init_op = iterator.initializer - return iterator, features_placeholder, labels_placeholder - - def network(self,**args): - # def graph(stage, opt): - # global_step = tf.get_variable(stage+'_step', [], initializer=tf.constant_initializer(0), trainable=False) - stage = args['stage'] - opt = args['opt'] - tower_grads = [] - per_gpu_w = [] - iterator, features_placeholder, labels_placeholder = self.input_fn() - with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()): - for i in range(self.NUM_GPUS): - with tf.device('/gpu:%d' % i): - with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: - (real, label) = iterator.get_next() - loss, w = self.loss(scope=scope, stage=stage, real=self._REAL, label=self._LABEL) - #tf.get_variable_scope().reuse_variables() - tf.compat.v1.get_variable_scope().reuse_variables() - #vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage) - vars_ = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=stage) - grads = opt.compute_gradients(loss, vars_) - tower_grads.append(grads) - per_gpu_w.append(w) - - grads = self.average_gradients(tower_grads) - apply_gradient_op = opt.apply_gradients(grads) - - mean_w = tf.reduce_mean(per_gpu_w) - train_op = apply_gradient_op - return train_op, mean_w, iterator, features_placeholder, labels_placeholder - def apply(self,**args): - # max_epochs = args['max_epochs'] if 'max_epochs' in args else 10 - REAL = self._REAL - LABEL= self._LABEL - if (self.logger): - pass - - with tf.device('/cpu:0'): - opt_d = tf.compat.v1.train.AdamOptimizer(1e-4) - opt_g = tf.compat.v1.train.AdamOptimizer(1e-4) - - train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d) - train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g) - # saver = tf.train.Saver() - saver = tf.compat.v1.train.Saver() - # init = tf.global_variables_initializer() - init = tf.compat.v1.global_variables_initializer() - logs = [] - #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: - with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: - sess.run(init) - sess.run(iterator_d.initializer, - feed_dict={features_placeholder_d: REAL, labels_placeholder_d: LABEL}) - sess.run(iterator_g.initializer, - feed_dict={features_placeholder_g: REAL, labels_placeholder_g: LABEL}) - - for epoch in range(1, self.MAX_EPOCHS + 1): - start_time = time.time() - w_sum = 0 - for i in range(self.STEPS_PER_EPOCH): - for _ in range(2): - _, w = sess.run([train_d, w_distance]) - w_sum += w - sess.run(train_g) - duration = time.time() - start_time - - assert not np.isnan(w_sum), 'Model diverged with loss = NaN' - - format_str = 'epoch: %d, w_distance = %f (%.1f)' - print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration)) - # print (dir (w_distance)) - - logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) }) - - if epoch % self.MAX_EPOCHS == 0: - # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - suffix = self.get.suffix() - _name = os.sep.join([self.train_dir,suffix]) - # saver.save(sess, self.train_dir, write_meta_graph=False, 
global_step=epoch) - saver.save(sess, _name, write_meta_graph=False, global_step=epoch) + def __init__(self,**args): + GNet.__init__(self,**args) + self.generator = Generator(**args) + self.values = args['values'] + def load_meta(self, column): + super().load_meta(column) + self.generator.load_meta(column) + def apply(self,**args): + # print (self.train_dir) + # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] + suffix = self.get.suffix() + model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) + demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] + tf.compat.v1.reset_default_graph() + z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) + y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32) + ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] + label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) + + fake = self.generator.network(inputs=z, label=label) + init = tf.compat.v1.global_variables_initializer() + saver = tf.compat.v1.train.Saver() + df = pd.DataFrame() + CANDIDATE_COUNT = 1000 + NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0] + with tf.compat.v1.Session() as sess: + + # sess.run(init) + saver.restore(sess, model_dir) + labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) + + found = [] + labels= demo + for i in np.arange(CANDIDATE_COUNT) : + + f = sess.run(fake,feed_dict={y:labels}) + # + # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes + # The code below will insure we have some acceptable cardinal relationships between id and synthetic values + # + df = ( pd.DataFrame(np.round(f).astype(np.int32))) + print (df.head()) + print () + p = 0 not in df.sum(axis=1).values + + if p: + found.append(df) + if len(found) == NTH_VALID_CANDIDATE or i == CANDIDATE_COUNT: + break + else: + continue + + # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms + # df = (i * df).sum(axis=1) # + # In case we are dealing with actual values like diagnosis codes we can perform # - if self.logger : - row = {"logs":logs} #,"model":pickle.dump(sess)} - self.logger.write(row) - # - # @TODO: - # We should upload the files in the checkpoint - # This would allow the learnt model to be portable to another system - # - tf.compat.v1.reset_default_graph() + df = found[np.random.choice(np.arange(len(found)),1)[0]] + columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] + + # r = np.zeros((self.ROW_COUNT,len(columns))) + r = np.zeros(self.ROW_COUNT) + df.columns = self.values + if len(found): + print (len(found),NTH_VALID_CANDIDATE) + # x = df * self.values + + df = pd.DataFrame( df.apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) + df.columns = columns + + + + + tf.compat.v1.reset_default_graph() + + return df.to_dict(orient='list') + # return df.to_dict(orient='list') + # count = str(len(os.listdir(self.out_dir))) + # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv']) + # df.to_csv(_name,index=False) + + + # output.extend(np.round(f)) + + # for m in range(2): + # for n in range(2, self.NUM_LABELS): + # idx1 = (demo[:, m] == 1) + # idx2 = (demo[:, n] == 1) + # idx = [idx1[j] and idx2[j] for j in range(len(idx1))] + # num = np.sum(idx) + # print 
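The candidate-selection step in Predict.apply above keeps a generated batch only if every row has at least one non-zero entry, then maps each row back to one of the original categorical values by sampling among its non-zero columns. A pandas sketch of that mapping, using made-up category values and a made-up rounded generator batch:

import numpy as np
import pandas as pd

values = ['I10', 'E11.9', 'Z00.00']                     # hypothetical category values
candidate = pd.DataFrame([[0, 1, 0],
                          [1, 0, 1],
                          [0, 0, 1]], columns=values)   # a rounded generator batch

acceptable = 0 not in candidate.sum(axis=1).values      # reject batches with an all-zero row
synthetic = candidate.apply(
    lambda row: values[np.random.choice(np.where(row != 0)[0], 1)[0]], axis=1)
print(acceptable, list(synthetic))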
("___________________list__") + # print (idx1) + # print (idx2) + # print (idx) + # print (num) + # print ("_____________________") + # nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU)) + # label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS)) + # label_input[:, n] = 1 + # label_input[:, m] = 1 + # output = [] + # for i in range(nbatch): + # f = sess.run(fake,feed_dict={y: label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]}) + # output.extend(np.round(f)) + # output = np.array(output)[:num] + # print ([m,n,output]) + + # np.save(self.out_dir + str(m) + str(n), output) + -class Predict(GNet): - """ - This class uses synthetic data given a learned model - """ - def __init__(self,**args): - GNet.__init__(self,**args) - self.generator = Generator(**args) - self.values = args['values'] - def load_meta(self, column): - super().load_meta(column) - self.generator.load_meta(column) - def apply(self,**args): - # print (self.train_dir) - # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] - suffix = self.get.suffix() - model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) - demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] - tf.compat.v1.reset_default_graph() - z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) - y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32) - ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] - label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) +if __name__ == '__main__' : + # + # Now we get things done ... + column = SYS_ARGS['column'] + column_id = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id' + column_id = column_id.split(',') if ',' in column_id else column_id + df = pd.read_csv(SYS_ARGS['raw-data']) + LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - fake = self.generator.network(inputs=z, label=label) - init = tf.compat.v1.global_variables_initializer() - saver = tf.compat.v1.train.Saver() - df = pd.DataFrame() - CANDIDATE_COUNT = 1000 - NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0] - with tf.compat.v1.Session() as sess: - - # sess.run(init) - saver.restore(sess, model_dir) - labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) - - found = [] - labels= demo - for i in np.arange(CANDIDATE_COUNT) : + context = SYS_ARGS['raw-data'].split(os.sep)[-1:][0][:-4] + if set(['train','learn']) & set(SYS_ARGS.keys()): + + df = pd.read_csv(SYS_ARGS['raw-data']) + + # cols = SYS_ARGS['column'] + # _map,_df = (Binary()).Export(df) + # i = np.arange(_map[column]['start'],_map[column]['end']) + max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10 + # REAL = _df[:,i] + REAL = pd.get_dummies(df[column]).astype(np.float32).values + LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values + trainer = Train(context=context,max_epochs=max_epochs,real=REAL,label=LABEL,column=column,column_id=column_id) + trainer.apply() - f = sess.run(fake,feed_dict={y:labels}) + + + + # + # We should train upon this data # - # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes - # The code below will insure we have some acceptable cardinal relationships between id and synthetic values + # -- we need to convert the data-frame to binary matrix, given a column # - df = ( 
pd.DataFrame(np.round(f).astype(np.int32))) - p = 0 not in df.sum(axis=1).values + pass + elif 'generate' in SYS_ARGS: + values = df[column].unique().tolist() + values.sort() - if p: - found.append(df) - if len(found) == NTH_VALID_CANDIDATE or i == CANDIDATE_COUNT: - break - else: - continue - - # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms - # df = (i * df).sum(axis=1) - # - # In case we are dealing with actual values like diagnosis codes we can perform - # - df = found[np.random.choice(np.arange(len(found)),1)[0]] - columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] - - # r = np.zeros((self.ROW_COUNT,len(columns))) - r = np.zeros(self.ROW_COUNT) - df.columns = self.values - if len(found): - print (len(found),NTH_VALID_CANDIDATE) - # x = df * self.values + p = Predict(context=context,label=LABEL,values=values,column=column) + p.load_meta(column) + r = p.apply() + print (df) + print () + df[column] = r[column] + print (df) - df = pd.DataFrame( df.apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) - df.columns = columns - - - - tf.compat.v1.reset_default_graph() - - return df.to_dict(orient='list') - # return df.to_dict(orient='list') - # count = str(len(os.listdir(self.out_dir))) - # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv']) - # df.to_csv(_name,index=False) - - - # output.extend(np.round(f)) - - # for m in range(2): - # for n in range(2, self.NUM_LABELS): - # idx1 = (demo[:, m] == 1) - # idx2 = (demo[:, n] == 1) - # idx = [idx1[j] and idx2[j] for j in range(len(idx1))] - # num = np.sum(idx) - # print ("___________________list__") - # print (idx1) - # print (idx2) - # print (idx) - # print (num) - # print ("_____________________") - # nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU)) - # label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS)) - # label_input[:, n] = 1 - # label_input[:, m] = 1 - # output = [] - # for i in range(nbatch): - # f = sess.run(fake,feed_dict={y: label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]}) - # output.extend(np.round(f)) - # output = np.array(output)[:num] - # print ([m,n,output]) - - # np.save(self.out_dir + str(m) + str(n), output) - - -if __name__ == '__main__' : - # - # Now we get things done ... 
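For orientation, the __main__ block reads its parameters from SYS_ARGS (--raw-data, --column, --id, --max_epochs; the parser itself is imported and not shown in this patch). A rough end-to-end sketch of the same flow through the Python API follows; the file name, column names, and the data.gan import path are placeholders/assumptions, not values taken from the patch.

import numpy as np
import pandas as pd
from data.gan import Train, Predict   # assumes this repository's package layout

df = pd.read_csv('patients.csv')      # placeholder for SYS_ARGS['raw-data']
column, column_id, context = 'diagnosis', 'person_id', 'patients'

REAL = pd.get_dummies(df[column]).astype(np.float32).values
LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values
Train(context=context, max_epochs=10, real=REAL, label=LABEL,
      column=column, column_id=column_id).apply()

values = sorted(df[column].unique().tolist())
p = Predict(context=context, label=LABEL, values=values, column=column)
p.load_meta(column)
df[column] = p.apply()[column]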
- column = SYS_ARGS['column'] - column_id = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id' - column_id = column_id.split(',') if ',' in column_id else column_id - df = pd.read_csv(SYS_ARGS['raw-data']) - LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - - context = SYS_ARGS['raw-data'].split(os.sep)[-1:][0][:-4] - if set(['train','learn']) & set(SYS_ARGS.keys()): - - df = pd.read_csv(SYS_ARGS['raw-data']) - - # cols = SYS_ARGS['column'] - # _map,_df = (Binary()).Export(df) - # i = np.arange(_map[column]['start'],_map[column]['end']) - max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10 - # REAL = _df[:,i] - REAL = pd.get_dummies(df[column]).astype(np.float32).values - LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values - trainer = Train(context=context,max_epochs=max_epochs,real=REAL,label=LABEL,column=column,column_id=column_id) - trainer.apply() - - - - - # - # We should train upon this data - # - # -- we need to convert the data-frame to binary matrix, given a column - # + else: + print (SYS_ARGS.keys()) + print (__doc__) pass - elif 'generate' in SYS_ARGS: - values = df[column].unique().tolist() - values.sort() - - p = Predict(context=context,label=LABEL,values=values,column=column) - p.load_meta(column) - r = p.apply() - print (df) - print () - df[column] = r[column] - print (df) - - - else: - print (SYS_ARGS.keys()) - print (__doc__) - pass diff --git a/setup.py b/setup.py index 8034249..a0b96c7 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.0.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.0.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git'
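One detail worth spelling out: both Train.loss and Predict.apply collapse the one-hot label matrix into a single integer id before the conditional embedding lookups in normalize, via label[:, 1] * len(m) + tf.squeeze(tf.matmul(label[:, 2:], m)). A NumPy sketch with a toy label matrix (made up for illustration) shows the resulting ids.

import numpy as np

label = np.array([[1, 0, 0, 1, 0],      # toy one-hot label rows (5 columns)
                  [1, 1, 0, 0, 1]], dtype=np.int32)
m = np.array([[i] for i in np.arange(label.shape[1] - 2)])    # [[0], [1], [2]]
ids = label[:, 1] * len(m) + np.squeeze(label[:, 2:] @ m)
print(ids)    # -> [1 5]: column 1 selects the block, columns 2.. select the offset within it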