bug fix with binary matrix generation

6 years ago · 0f0c2642c2
parent ce55848cc8
commit 0f0c2642c2
3 changed files with 15 additions and 13 deletions
--- a/data/bridge.py
+++ b/data/bridge.py
@ -191,12 +191,13 @@ class Binary :
        #
        # This will give us a map of how each column was mapped to a bitstream
        
-        _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0)
+        # _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0)
+        _map = df.fillna('').apply(lambda column: self.__stream(column),axis=0)
        
        #
        # We will merge this to have a healthy matrix
        _matrix =  _map.apply(lambda row: list(list(itertools.chain(*row.values.tolist()))),axis=1)
-        _matrix = np.matrix([list(item) for item in _matrix])
+        _matrix = np.matrix([list(item) for item in _matrix]).astype(np.float32)
        #
        # let's format the map so we don't have an unreasonable amount of data
        #
@ -210,7 +211,8 @@ class Binary :
            _m[name] = {"start":beg,"end":end}
            beg = end

-        return _m,_matrix.astype(np.float32)        
+        # return _m,_matrix.astype(np.float32)        
+        return _matrix
        
    def Import(self,df,values,_map):
        """
--- a/data/gan.py
+++ b/data/gan.py
@ -397,17 +397,13 @@ class Train (GNet):
                labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32)
                dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder))
                dataset = dataset.repeat(10000)
-                dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU)
+                dataset = dataset.batch(batch_size=3000)
                dataset = dataset.prefetch(1)
                # iterator = dataset.make_initializable_iterator()
                iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
-                # next_element = iterator.get_next()
-                # init_op = iterator.initializer
                return iterator, features_placeholder, labels_placeholder
        
        def network(self,**args):
-        # def graph(stage, opt):
-                # global_step = tf.get_variable(stage+'_step', [], initializer=tf.constant_initializer(0), trainable=False)
                stage   = args['stage']
                opt             = args['opt']
                tower_grads = []
@ -540,8 +536,6 @@ class Predict(GNet):
                                # The code below will ensure we have some acceptable cardinal relationships between id and synthetic values
                                #
                                df =  ( pd.DataFrame(np.round(f).astype(np.int32)))
-                                print (df.head())
-                                print ()
                                p = 0 not in df.sum(axis=1).values
                                
                                if      p:
--- a/data/maker/init.py
+++ b/data/maker/init.py
@ -12,6 +12,7 @@ import pandas as pd
 import numpy as np
 import data.gan as gan
 from transport import factory
+from data.bridge import Binary
 import threading as thread
 def train (**args) :
    """
@ -32,9 +33,12 @@ def train (**args) :
    # If we have several columns we will proceed one at a time (it could be done in separate threads)
    # @TODO : Consider performing this task on several threads/GPUs simultaneously
    # 
-    args['label']      = pd.get_dummies(df[column_id]).astype(np.float32).values
+    handler = Binary()
+    # args['label']      = pd.get_dummies(df[column_id]).astype(np.float32).values
+    args['label']   = handler.Export(df[[column_id]])
    for col in column :    
-        args['real']        = pd.get_dummies(df[col]).astype(np.float32).values 
+        # args['real']        = pd.get_dummies(df[col]).astype(np.float32).values 
+        args['real']    = handler.Export(df[[col]])
        args['column']  = col
        args['context'] = col
        context     = args['context']
@ -77,7 +81,9 @@ def generate(**args):
    #@TODO:
    #   If the identifier is not present, we should find a way to determine or make one
    #
-    args['label']  = pd.get_dummies(df[column_id]).astype(np.float32).values
+    # args['label']  = pd.get_dummies(df[column_id]).astype(np.float32).values
+    bwrangler = Binary()
+    args['label']   = bwrangler.Export(df[[column_id]])
    _df     = df.copy()
    for col in column :
        args['context'] = col