Handling of continuous values

5 years ago · 3fbd68309f
parent bd6fb03f8d
commit 3fbd68309f
3 changed files with 103 additions and 25 deletions
--- a/data/gan.py
+++ b/data/gan.py
@ -604,7 +604,7 @@ class Predict(GNet):
                        r = np.zeros(self.ROW_COUNT)
                        df.columns = self.values
                        if len(found):
-                                print (len(found),NTH_VALID_CANDIDATE)    
+                                # print (len(found),NTH_VALID_CANDIDATE)    
                                # x = df * self.values 
                                #
                                # let's get the missing rows (if any) ...
@ -704,10 +704,10 @@ if __name__ == '__main__' :
                p = Predict(context=context,label=LABEL,values=values,column=column)
                p.load_meta(column)
                r = p.apply()
-                print (df)
+                # print (df)
-                print ()
+                # print ()
                df[column] = r[column]
-                print (df)
+                # print (df)
        else:
--- a/data/maker/init.py
+++ b/data/maker/init.py
@ -14,6 +14,68 @@ import data.gan as gan
 from transport import factory
 from data.bridge import Binary
 import threading as thread
 class ContinuousToDiscrete :
    """
    Helpers that move a numeric column between its raw (continuous) form and an
    equal-width, one-hot encoded (discrete) form. The binning itself is delegated to pandas.cut
    """
    @staticmethod
    def _cut(X,n) :
        """
        This function bins the values into n equal-width intervals (single call to pandas.cut)
        :X  one dimensional collection of numeric values
        :n  number of bins
        :return a pandas Categorical: .codes holds the bin index of every value (-1 for NaN),
                .categories holds the n intervals
        """
        return pd.cut(np.array(X),n)
    @staticmethod
    def binary(X,n=4) :
        """
        This function will convert a continuous stream of information into a one-hot matrix of bins
        :X  one dimensional collection of numeric values
        :n  number of bins (columns of the returned matrix)
        :return numpy array of shape (len(X),n); each row has a single 1 in the column of the bin
                the value falls in. A NaN value yields a row of zeros.
        """
        #
        # The bin index is read from the codes computed by pandas against the exact bin edges.
        # The edges exposed by the categories are rounded for display, comparing values against
        # them can leave a value (typically the maximum) outside of every bin
        codes = ContinuousToDiscrete._cut(X,n).codes
        #
        # An array (not a list of rows) is returned so callers can cast it i.e .astype(np.float32)
        _matrix = np.zeros((len(codes),n))
        found   = np.flatnonzero(codes >= 0)
        _matrix[found,codes[found]] = 1
        return _matrix
    @staticmethod
    def bounds(x,n):
        """
        This function returns the boundaries of the bins
        :x  one dimensional collection of numeric values
        :n  number of bins
        :return list of n pandas Interval objects (with .left and .right)
        """
        return list(ContinuousToDiscrete._cut(x,n).categories)
    @staticmethod
    def continuous(X,BIN_SIZE=4) :
        """
        This function will approximate every value by a random draw from the bin it belongs to
        :X          one dimensional collection of numeric values (they are binned here)
        :BIN_SIZE   number of bins
        :return list with one value per entry of X, rounded to 3 decimals. A NaN entry stays NaN
        """
        _bins   = ContinuousToDiscrete._cut(X,BIN_SIZE)
        BOUNDS  = list(_bins.categories)
        values  = []
        for index in _bins.codes :
            if index < 0 :
                #
                # The value is missing, there is no bin to draw from
                values.append(np.nan)
                continue
            lbound = BOUNDS[ index ].left
            ubound = BOUNDS[ index ].right
            x_ = np.round(np.random.uniform(lbound,ubound),3).astype(float)
            values.append(x_)
        return values
 def train (**args) :
    """
    This function is intended to train the GAN in order to learn about the distribution of the features
@ -24,22 +86,30 @@ def train (**args) :
    :context    label of what we are synthesizing
    """
    column      = args['column'] if (isinstance(args['column'],list)) else [args['column']]
-    
+    CONTINUOUS  = args['continuous'] if 'continuous' in args else []
    # column_id   = args['id']
    df          = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])
    df.columns = [name.lower() for name in df.columns]
    #
    # @TODO:
    # Consider sequential training of sub population for extremely large datasets
    #
    #
    # If we have several columns we will proceed one at a time (it could be done in separate threads)
    # @TODO : Consider performing this task on several threads/GPUs simultaneously
    # 
    handler = Binary()
    # args['label']      = pd.get_dummies(df[column_id]).astype(np.float32).values
    # args['label']   = handler.Export(df[[column_id]])
    # args['label'] = np.ones(df.shape[0]).reshape(df.shape[0],1)
    for col in column : 
        # args['real']        = pd.get_dummies(df[col].fillna('')).astype(np.float32).values 
        # if 'float' not in df[col].dtypes.name :
            # args['real']        = pd.get_dummies(df[col].fillna('')).astype(np.float32).values 
        if 'float' in df[col].dtypes.name and col in CONTINUOUS:
            BIN_SIZE = 10 if 'bin_size' not in args else int(args['bin_size'])
            args['real']    = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32)
        else:
            args['real']        = pd.get_dummies(df[col].fillna('')).astype(np.float32).values 
-        # args['real']    = handler.Export(df[[col]])
+        
        args['column']  = col
        args['context'] = col
        context     = args['context']
@ -75,7 +145,7 @@ def generate(**args):
    """
    # df      = args['data']
    df      = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])
-    
+    CONTINUOUS = args['continous'] if 'continuous' in args else []
    column      = args['column'] if (isinstance(args['column'],list)) else [args['column']]
    # column_id   = args['id']
    #
@ -86,18 +156,26 @@ def generate(**args):
    for col in column :
        args['context'] = col
        args['column']  = col
        if 'float' in df[col].dtypes.name or col in CONTINUOUS :
            #
            # We should create the bins for the values we are observing here
            BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size'])
            values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE)
        else:
            values          = df[col].unique().tolist()
        args['values']      = values    
        args['row_count']   = df.shape[0]
        #
        # we can determine the cardinalities here so we know what to allow or disallow
        handler         = gan.Predict (**args)
        handler.load_meta(col)
        # handler.ROW_COUNT = df[col].shape[0]
        r       =  handler.apply()                
        # print (r)      
        # 
        print ([_df.shape,len(r[col])])  
        _df[col] = r[col]
        #
        # @TODO: log basic stats about the synthetic attribute
        #
        # break
    return _df
--- a/data/maker/main.py
+++ b/data/maker/main.py
@ -17,9 +17,9 @@ if 'config' in SYS_ARGS :
        odf = pd.read_csv (ARGS['data'])
        odf.columns = [name.lower() for name in odf.columns]
        column = ARGS['column']  if isinstance(ARGS['column'],list) else [ARGS['column']]
-        print (odf.head())
+        # print (odf.head())
-        print (_df.head())
+        # print (_df.head())
-        # print(pd.merge(odf,_df,rsuffix='_io'))
+        print(odf.join(_df[column],rsuffix='_io'))
        # print (_df[column].risk.evaluate(flag='synth'))
        # print (odf[column].risk.evaluate(flag='original'))
        # _x = pd.get_dummies(_df[column]).values