"""
import pickle

        else:
            #
            # Clear this synthetic column's logs from the data store, keeping a
            # backup copy before dropping the collection.
            # NOTE: Collection.count() and insert() are pymongo 3.x calls; pymongo 4
            # replaced them with count_documents({}) and insert_one().
            #
            column = self.ATTRIBUTES['synthetic']
            db = self.logger.db
            if db[column].count() > 0 :
                db.backup.insert({'name':column,'logs':list(db[column].find()) })
                db[column].drop()
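            # Illustrative shape of the backup document (column name is hypothetical):
            #
            #   {'name': 'gender', 'logs': [<every document previously logged for 'gender'>]}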

    def load_meta(self,column):
        """
        This function is designed to accommodate the use of the sub-classes outside of a
        strict dependency model, because prediction and training can happen independently.
        It reloads the attributes persisted by log_meta and re-derives the working directories.
        """
        # suffix = "-".join(column) if isinstance(column,list) else column
        suffix = self.get.suffix()
        _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json'])
        if os.path.exists(_name) :
            attr = json.loads((open(_name)).read())
            for key in attr :
                value = attr[key]
                setattr(self,key,value)
            self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT])
            self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT])
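
    # Usage sketch (illustrative; assumes `net` is an already-constructed subclass
    # instance and that a prior run wrote meta-<suffix>.json via log_meta):
    #
    #   net.load_meta('gender')   # restores CONTEXT, ATTRIBUTES, Z_DIM, ... as attributes
    #   print(net.train_dir)      # now points at <log_dir>/train/<CONTEXT>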

    def log_meta(self,**args) :
        _object = {
            # '_id':'meta',
            'CONTEXT':self.CONTEXT,
            'ATTRIBUTES':self.ATTRIBUTES,
            'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU,
            'Z_DIM':self.Z_DIM,
            "X_SPACE_SIZE":self.X_SPACE_SIZE,
            "D_STRUCTURE":self.D_STRUCTURE,
            "G_STRUCTURE":self.G_STRUCTURE,
            "NUM_GPUS":self.NUM_GPUS,
            "NUM_LABELS":self.NUM_LABELS,
            "MAX_EPOCHS":self.MAX_EPOCHS,
            "ROW_COUNT":self.ROW_COUNT
        }
        if args and 'key' in args and 'value' in args :
            key = args['key']
            value = args['value']
            _object[key] = value
        # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column
        suffix = self.get.suffix()
        _name = os.sep.join([self.out_dir,'meta-'+suffix])

        f = open(_name+'.json','w')
        f.write(json.dumps(_object))
        f.close()
        return _object
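
    # Illustrative content of the persisted meta-<suffix>.json (all values hypothetical):
    #
    #   {"CONTEXT": "test", "ATTRIBUTES": {"synthetic": "gender", "real": ["age"]},
    #    "BATCHSIZE_PER_GPU": 2000, "Z_DIM": 128, "X_SPACE_SIZE": 128,
    #    "D_STRUCTURE": [128, 128, 128], "G_STRUCTURE": [128, 128],
    #    "NUM_GPUS": 1, "NUM_LABELS": 8, "MAX_EPOCHS": 10, "ROW_COUNT": 10000}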

    def mkdir (self,path):
        if not os.path.exists(path) :
            os.mkdir(path)

        # `y_hat_fake` is the critic's score of the generated batch; the generator's
        # loss is the negated mean score plus the accumulated L2 regularizers.
        all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES)
        loss = -tf.reduce_mean(y_hat_fake) + sum(all_regs)
        # tf.add_to_collection('glosses', loss)
        tf.compat.v1.add_to_collection('glosses', loss)
        return loss, loss
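
    # In WGAN terms this is the generator objective (a sketch; D is the critic network):
    #
    #   L_G = -E_z[ D(G(z)) ] + sum(L2 regularizers)
    #
    # Both returned values are the same tensor because, unlike the critic, the
    # generator has no separate Wasserstein-distance estimate to report.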

    def load_meta(self, column):
        super().load_meta(column)
        self.discriminator.load_meta(column)

    def network(self,**args) :
        """
        This function will build the network that generates the synthetic candidates.
        :inputs matrix of input data (typically the latent/noise batch)
        :dim    dimension of the input (defaults to Z_DIM when not provided)
        :label  conditioning labels used by the conditional batch normalization
        """
        x = args['inputs']
        tmp_dim = self.Z_DIM if 'dim' not in args else args['dim']
        label = args['label']

        with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)):
            for i, dim in enumerate(self.G_STRUCTURE[:-1]):
                kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim])
                h1 = self.normalize(inputs=tf.matmul(x, kernel),shift=0, name='cbn' + str(i), labels=label, n_labels=self.NUM_LABELS)
                h2 = tf.nn.relu(h1)
                x = x + h2    # residual connection; assumes dim matches the running width
                tmp_dim = dim
            i = len(self.G_STRUCTURE) - 1
            #
            # This seems to be an extra hidden layer:
            # its goal is to map continuous values to discrete values (pre-trained to do this)
            kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, self.G_STRUCTURE[-1]])
            h1 = self.normalize(inputs=tf.matmul(x, kernel), name='cbn' + str(i),
                                labels=label, n_labels=self.NUM_LABELS)
            h2 = tf.nn.tanh(h1)
            x = x + h2
            #
            # This seems to be the output layer
            # (the kernel shape assumes the running width still equals Z_DIM)
            #
            kernel = self.get.variables(name='W_' + str(i+1), shape=[self.Z_DIM, self.X_SPACE_SIZE])
            bias = self.get.variables(name='b_' + str(i+1), shape=[self.X_SPACE_SIZE])
            x = tf.nn.sigmoid(tf.add(tf.matmul(x, kernel), bias))
        return x
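
    # Shape walk-through (a sketch with hypothetical sizes Z_DIM=128, X_SPACE_SIZE=300,
    # G_STRUCTURE=[128,128]): z:[batch,128] -> residual ReLU block keeps [batch,128]
    # -> residual tanh block keeps [batch,128] -> output layer maps to sigmoid
    # activations [batch,300], one per column of the binarized feature space.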

    def load_meta(self, column):
        self.generator.load_meta(column)
        self.discriminator.load_meta(column)

    def loss(self,**args):
        """
        This function will compute a "tower" loss of the generated candidates against real data.
        Training consists in alternating between the generator and the discriminator.
        :scope  variable scope of the current tower
        :stage  'D' to compute the discriminator loss, anything else for the generator loss
        :real   batch of real (binarized) data
        :label  conditioning labels for the batch
        """

        scope = args['scope']
        stage = args['stage']
        real = args['real']
        label = args['label']
        label = tf.cast(label, tf.int32)
        #
        # @TODO: Ziqi needs to explain what's going on here
        # Fold the label matrix into a single integer class index per row: the matmul
        # against m = [[0],[1],...] converts the one-hot columns 2: into an index,
        # which is then offset by label[:, 1] * len(m).
        m = [[i] for i in np.arange(self._LABEL.shape[1]-2)]
        label = label[:, 1] * len(m) + tf.squeeze(
            tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32))
        )
        # label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] )
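
        # Worked example (hypothetical 6-column label row, so len(m) == 4):
        #   row = [1, 1, 0, 0, 1, 0]  ->  columns 2: are one-hot for index 2,
        #   so the combined index is 1 * 4 + 2 = 6.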

        z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM])

        fake = self.generator.network(inputs=z, label=label)
        if stage == 'D':
            w, loss = self.discriminator.loss(real=real, fake=fake, label=label)
            # losses = tf.get_collection('dlosses', scope)
            flag = 'dlosses'
            losses = tf.compat.v1.get_collection('dlosses', scope)
        else:
            w, loss = self.generator.loss(fake=fake, label=label)
            # losses = tf.get_collection('glosses', scope)
            flag = 'glosses'
            losses = tf.compat.v1.get_collection('glosses', scope)
        # losses = tf.compat.v1.get_collection(flag, scope)

        total_loss = tf.add_n(losses, name='total_loss')

        return total_loss, w
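
    # Sketch of how the two stages are typically wired per tower (illustrative;
    # the driver code and variable names here are assumptions, not shown above):
    #
    #   d_loss, w_distance = self.loss(scope=scope, stage='D', real=real, label=label)
    #   g_loss, _          = self.loss(scope=scope, stage='G', real=real, label=label)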

    def input_fn(self):
        """
        This function builds the input pipeline: placeholder-backed tensors for the
        real data and labels, sliced into a tf.data.Dataset that repeats for many epochs.
        """
        features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32)
        labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32)
        dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder))
        dataset = dataset.repeat(10000)
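
    # How such a placeholder-backed dataset is typically consumed in TF1 (a sketch;
    # the iterator and feed step are assumptions, not part of the excerpt above):
    #
    #   iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
    #   sess.run(iterator.initializer, feed_dict={features_placeholder: self._REAL,
    #                                             labels_placeholder: self._LABEL})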

                w_sum = 0
                for i in range(self.STEPS_PER_EPOCH):
                    # train the critic twice for every generator step, accumulating
                    # its reported Wasserstein-distance estimate
                    for _ in range(2):
                        _, w = sess.run([train_d, w_distance])
                        w_sum += w
                    sess.run(train_g)
                duration = time.time() - start_time

                assert not np.isnan(w_sum), 'Model diverged with loss = NaN'

                format_str = 'epoch: %d, w_distance = %f (%.1f)'
                print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration))
                # print (dir (w_distance))

                logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) })
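
                # The reported distance averages w over the 2 * STEPS_PER_EPOCH critic
                # updates and flips its sign; assuming the critic minimizes
                # -E[D(real)] + E[D(fake)] (not shown in this excerpt), the printed
                # value approximates the Wasserstein-1 estimate E[D(real)] - E[D(fake)].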

                if epoch % self.MAX_EPOCHS == 0:
                    # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
                    suffix = self.get.suffix()
                    _name = os.sep.join([self.train_dir,suffix])
                    # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch)
                    saver.save(sess, _name, write_meta_graph=False, global_step=epoch)
                    #
                    #
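                    # Restoring elsewhere (a sketch; assumes a compatible graph was rebuilt):
                    #   saver = tf.compat.v1.train.Saver()
                    #   saver.restore(sess, tf.train.latest_checkpoint(self.train_dir))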

            if self.logger :
                row = {"logs":logs} #,"model":pickle.dump(sess)}
                self.logger.write(row)
            #
            # @TODO:
            # We should upload the files in the checkpoint
            # This would allow the learnt model to be portable to another system
            #
        tf.compat.v1.reset_default_graph()

        for i in np.arange(CANDIDATE_COUNT) :

        tf.compat.v1.reset_default_graph()

        return df.to_dict(orient='list')
        # count = str(len(os.listdir(self.out_dir)))
        # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv'])
        # df.to_csv(_name,index=False)

        # for m in range(2):
        #     for n in range(2, self.NUM_LABELS):
        #         idx1 = (demo[:, m] == 1)
        #         idx2 = (demo[:, n] == 1)
        #         idx = [idx1[j] and idx2[j] for j in range(len(idx1))]
        #         num = np.sum(idx)
        #         print ("___________________list__")
        #         print (idx1)
        #         print (idx2)
        #         print (idx)
        #         print (num)
        #         print ("_____________________")
        #         nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU))
        #         label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS))
        #         label_input[:, n] = 1
        #         label_input[:, m] = 1
        #         output = []
        #         for i in range(nbatch):
        #             f = sess.run(fake,feed_dict={y: label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]})
        #             output.extend(np.round(f))
        #         output = np.array(output)[:num]
        #         print ([m,n,output])
        #
        #         np.save(self.out_dir + str(m) + str(n), output)

    p.load_meta(column)