diff --git a/data/gan.py b/data/gan.py index 3d600a3..77fcf3d 100644 --- a/data/gan.py +++ b/data/gan.py @@ -72,7 +72,7 @@ class GNet : elif 'label' in args and len(args['label']) == 1 : self.NUM_LABELS = args['label'].shape[0] else: - self.NUM_LABELS = 8 + self.NUM_LABELS = None # self.Z_DIM = 128 #self.X_SPACE_SIZE self.Z_DIM = 128 #-- used as rows down stream self.G_STRUCTURE = [self.Z_DIM,self.Z_DIM] @@ -180,14 +180,19 @@ class GNet : shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing mean, var = tf.nn.moments(inputs, shift, keep_dims=True) shape = inputs.shape[1].value - offset_m = self.get.variables(shape=[n_labels,shape], name='offset'+name, - initializer=tf.zeros_initializer) - scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, - initializer=tf.ones_initializer) - - offset = tf.nn.embedding_lookup(offset_m, labels) - scale = tf.nn.embedding_lookup(scale_m, labels) - result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-8) + if labels is not None: + offset_m = self.get.variables(shape=[1,shape], name='offset'+name, + initializer=tf.zeros_initializer) + scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name, + initializer=tf.ones_initializer) + offset = tf.nn.embedding_lookup(offset_m, labels) + scale = tf.nn.embedding_lookup(scale_m, labels) + + else: + offset = None + scale = None + + result = tf.nn.batch_normalization(inputs, mean, var,offset,scale, 1e-8) return result def _variable_on_cpu(self,**args): @@ -248,7 +253,7 @@ class Generator (GNet): x = args['inputs'] tmp_dim = self.Z_DIM if 'dim' not in args else args['dim'] label = args['label'] - + print (self.NUM_LABELS) with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)): for i, dim in enumerate(self.G_STRUCTURE[:-1]): kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim]) @@ -331,7 +336,7 @@ class Train (GNet): self.generator = Generator(**args) self.discriminator = Discriminator(**args) self._REAL = args['real'] - self._LABEL= args['label'] + self._LABEL= args['label'] if 'label' in args else None self.column = args['column'] # print ([" *** ",self.BATCHSIZE_PER_GPU]) @@ -340,7 +345,7 @@ class Train (GNet): self.logger.write( self.meta ) - self.log (real_shape=list(self._REAL.shape),label_shape = list(self._LABEL.shape),meta_data=self.meta) + # self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta) def load_meta(self, column): """ This function will delegate the calls to load meta data to it's dependents @@ -363,13 +368,16 @@ class Train (GNet): stage = args['stage'] real = args['real'] label = args['label'] - label = tf.cast(label, tf.int32) - # - # @TODO: Ziqi needs to explain what's going on here - m = [[i] for i in np.arange(self._LABEL.shape[1]-2)] - label = label[:, 1] * len(m) + tf.squeeze( - tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32)) - ) + + + if label is not None : + label = tf.cast(label, tf.int32) + # + # @TODO: Ziqi needs to explain what's going on here + m = [[i] for i in np.arange(self._LABEL.shape[1]-2)] + label = label[:, 1] * len(m) + tf.squeeze( + tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32)) + ) # label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] ) z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) @@ -394,8 +402,13 @@ class Train (GNet): This function seems to produce """ features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32) - labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32) - dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) + LABEL_SHAPE = [None,None] if self._LABEL is None else self._LABEL.shape + labels_placeholder = tf.compat.v1.placeholder(shape=LABEL_SHAPE, dtype=tf.float32) + if self._LABEL is not None : + dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) + else : + dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) + # labels_placeholder = None dataset = dataset.repeat(10000) dataset = dataset.batch(batch_size=3000) dataset = dataset.prefetch(1) @@ -413,7 +426,10 @@ class Train (GNet): for i in range(self.NUM_GPUS): with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % ('TOWER', i)) as scope: - (real, label) = iterator.get_next() + if self._LABEL is not None : + (real, label) = iterator.get_next() + else: + real = iterator.get_next() loss, w = self.loss(scope=scope, stage=stage, real=self._REAL, label=self._LABEL) #tf.get_variable_scope().reuse_variables() tf.compat.v1.get_variable_scope().reuse_variables() @@ -450,11 +466,12 @@ class Train (GNet): #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: sess.run(init) + sess.run(iterator_d.initializer, - feed_dict={features_placeholder_d: REAL, labels_placeholder_d: LABEL}) + feed_dict={features_placeholder_d: REAL}) sess.run(iterator_g.initializer, - feed_dict={features_placeholder_g: REAL, labels_placeholder_g: LABEL}) - + feed_dict={features_placeholder_g: REAL}) + for epoch in range(1, self.MAX_EPOCHS + 1): start_time = time.time() w_sum = 0 @@ -511,9 +528,11 @@ class Predict(GNet): tf.compat.v1.reset_default_graph() z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32) - ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] - label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) - + if self._LABEL is not None : + ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] + label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) + else: + label = None fake = self.generator.network(inputs=z, label=label) init = tf.compat.v1.global_variables_initializer() saver = tf.compat.v1.train.Saver() @@ -524,13 +543,19 @@ class Predict(GNet): # sess.run(init) saver.restore(sess, model_dir) - labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) + if self._LABEL is not None : + labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) ) + labels= demo + else: + labels = None found = [] - labels= demo + for i in np.arange(CANDIDATE_COUNT) : - - f = sess.run(fake,feed_dict={y:labels}) + if labels : + f = sess.run(fake,feed_dict={y:labels}) + else: + f = sess.run(fake) # # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes # The code below will insure we have some acceptable cardinal relationships between id and synthetic values diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 74ae718..71fdc68 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -25,7 +25,7 @@ def train (**args) : """ column = args['column'] if (isinstance(args['column'],list)) else [args['column']] - column_id = args['id'] + # column_id = args['id'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) df.columns = [name.lower() for name in df.columns] @@ -35,7 +35,8 @@ def train (**args) : # handler = Binary() # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values - args['label'] = handler.Export(df[[column_id]]) + # args['label'] = handler.Export(df[[column_id]]) + # args['label'] = np.ones(df.shape[0]).reshape(df.shape[0],1) for col in column : # args['real'] = pd.get_dummies(df[col]).astype(np.float32).values args['real'] = handler.Export(df[[col]]) @@ -83,7 +84,7 @@ def generate(**args): # # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values bwrangler = Binary() - args['label'] = bwrangler.Export(df[[column_id]]) + # args['label'] = bwrangler.Export(df[[column_id]]) _df = df.copy() for col in column : args['context'] = col diff --git a/setup.py b/setup.py index fcc12c1..50155cc 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ def read(fname): args = {"name":"data-maker","version":"1.1.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] -args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git' +args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' if sys.version_info[0] == 2 : args['use_2to3'] = False