From 3fbd68309fb57b467063e9ee0b79eb06ff35c7d7 Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Fri, 28 Feb 2020 21:37:26 -0600
Subject: [PATCH] Handling of continuous values

---
 data/gan.py            |   8 +--
 data/maker/__init__.py | 114 ++++++++++++++++++++++++++++++++++-------
 data/maker/__main__.py |   6 +--
 3 files changed, 103 insertions(+), 25 deletions(-)

diff --git a/data/gan.py b/data/gan.py
index 204f8af..c2aadb5 100644
--- a/data/gan.py
+++ b/data/gan.py
@@ -604,7 +604,7 @@ class Predict(GNet):
                         r = np.zeros(self.ROW_COUNT)
                         df.columns = self.values
                         if len(found):
-                                print (len(found),NTH_VALID_CANDIDATE)    
+                                # print (len(found),NTH_VALID_CANDIDATE)    
                                 # x = df * self.values 
                                 #
                                 # let's get the missing rows (if any) ...
@@ -704,10 +704,10 @@ if __name__ == '__main__' :
                 p = Predict(context=context,label=LABEL,values=values,column=column)
                 p.load_meta(column)
                 r = p.apply()
-                print (df)
-                print ()
+                # print (df)
+                # print ()
                 df[column] = r[column]
-                print (df)
+                # print (df)
                 
                 
         else:
diff --git a/data/maker/__init__.py b/data/maker/__init__.py
index d5a4308..6114ad2 100644
--- a/data/maker/__init__.py
+++ b/data/maker/__init__.py
@@ -14,6 +14,68 @@ import data.gan as gan
 from transport import factory
 from data.bridge import Binary
 import threading as thread
+class ContinuousToDiscrete :
+    """
+    Helpers that bin a continuous column into one-hot vectors and sample values back from the bins
+    """
+    @staticmethod
+    def binary(X,n=4) :
+        """
+        This function converts a stream of continuous values into a one-hot matrix of bins
+        :X  sized iterable of numeric values (list, numpy array, pandas Series)
+        :n  number of equal-width bins
+        :return numpy array of shape (len(X),n), a row is all zeros if the value is missing (NaN)
+        """
+        BOUNDS = ContinuousToDiscrete.bounds(X,n)
+        #
+        # An array (not a list of rows) is returned so callers can cast it e.g. .astype(np.float32)
+        _matrix = np.zeros((len(X),n))
+        for i,value in enumerate(X) :
+            for index,row in enumerate(BOUNDS) :
+                #
+                # Bins are scanned in ascending order, a value on a shared edge lands in the lower bin
+                if value >= row.left and value <= row.right :
+                    _matrix[i,index] = 1
+                    break
+
+        return _matrix
+
+    @staticmethod
+    def bounds(x,n):
+        """
+        This function computes the bins of a set of values
+        :x  iterable of numeric values
+        :n  number of equal-width bins
+        :return list of n pandas Interval objects as computed by pandas.cut
+        """
+        return list(pd.cut(np.array(x),n).categories)
+
+    @staticmethod
+    def continuous(X,BIN_SIZE=4) :
+        """
+        This function approximates continuous values by sampling uniformly within the bin of each value
+        :X          sized iterable of raw numeric values (they are binned by this function)
+        :BIN_SIZE   number of bins
+        :return list of floats rounded to 3 decimals, NaN where no bin matched (missing value)
+        """
+        BOUNDS = ContinuousToDiscrete.bounds(X,BIN_SIZE)
+        values = []
+        for row in ContinuousToDiscrete.binary(X,BIN_SIZE) :
+            found = np.where(row == 1)[0]
+            if found.size == 0 :
+                #
+                # No bin matched (missing value), it is kept missing instead of raising an IndexError
+                values.append(np.nan)
+                continue
+            _bin = BOUNDS[ found[0] ]
+            x_ = np.round(np.random.uniform(_bin.left,_bin.right),3).astype(float)
+            values.append(x_)
+
+        return values
+
+
+
+
 def train (**args) :
     """
     This function is intended to train the GAN in order to learn about the distribution of the features
@@ -24,22 +86,30 @@ def train (**args) :
     :context    label of what we are synthesizing
     """
     column      = args['column'] if (isinstance(args['column'],list)) else [args['column']]
-    
+    CONTINUOUS  = args['continuous'] if 'continuous' in args else []
     # column_id   = args['id']
     df          = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])
     df.columns = [name.lower() for name in df.columns]
-
+    #
+    # @TODO:
+    # Consider sequential training of sub population for extremely large datasets
+    #
+    
     #
     # If we have several columns we will proceed one at a time (it could be done in separate threads)
     # @TODO : Consider performing this task on several threads/GPUs simulataneously
     # 
-    handler = Binary()
-    # args['label']      = pd.get_dummies(df[column_id]).astype(np.float32).values
-    # args['label']   = handler.Export(df[[column_id]])
-    # args['label'] = np.ones(df.shape[0]).reshape(df.shape[0],1)
-    for col in column :    
-        args['real']        = pd.get_dummies(df[col].fillna('')).astype(np.float32).values 
-        # args['real']    = handler.Export(df[[col]])
+    for col in column : 
+        # args['real']        = pd.get_dummies(df[col].fillna('')).astype(np.float32).values 
+        # if 'float' not in df[col].dtypes.name :
+            # args['real']        = pd.get_dummies(df[col].fillna('')).astype(np.float32).values 
+        if 'float' in df[col].dtypes.name and col in CONTINUOUS:
+            BIN_SIZE = 10 if 'bin_size' not in args else int(args['bin_size'])
+            args['real']    = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32)
+        else:
+            args['real']        = pd.get_dummies(df[col].fillna('')).astype(np.float32).values 
+        
+
         args['column']  = col
         args['context'] = col
         context     = args['context']
@@ -75,7 +145,7 @@ def generate(**args):
     """
     # df      = args['data']
     df      = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])
-    
+    CONTINUOUS = args['continuous'] if 'continuous' in args else []
     column      = args['column'] if (isinstance(args['column'],list)) else [args['column']]
     # column_id   = args['id']
     #
@@ -86,18 +156,26 @@ def generate(**args):
     for col in column :
         args['context'] = col
         args['column']  = col
-        values          = df[col].unique().tolist()
-        args['values']  = values
-        args['row_count'] = df.shape[0]
+        
+        if 'float' in df[col].dtypes.name and col in CONTINUOUS :
+            #
+            # Same rule and default bin size as train() so both functions treat a column alike
+            BIN_SIZE = 10 if 'bin_size' not in args else int(args['bin_size'])
+            values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE)
+        else:
+            values          = df[col].unique().tolist()
+        
+        args['values']      = values    
+        args['row_count']   = df.shape[0]
         #
         # we can determine the cardinalities here so we know what to allow or disallow
         handler         = gan.Predict (**args)
         handler.load_meta(col)
-        # handler.ROW_COUNT = df[col].shape[0]
-        r       =  handler.apply()        
-        # print (r)      
-        # 
-        print ([_df.shape,len(r[col])])  
+        r       =  handler.apply()                
         _df[col] = r[col]
+        #
+        # @TODO: log basic stats about the synthetic attribute
+        #
+        
         # break
     return _df
\ No newline at end of file
diff --git a/data/maker/__main__.py b/data/maker/__main__.py
index 583be60..d71d400 100644
--- a/data/maker/__main__.py
+++ b/data/maker/__main__.py
@@ -17,9 +17,9 @@ if 'config' in SYS_ARGS :
         odf = pd.read_csv (ARGS['data'])
         odf.columns = [name.lower() for name in odf.columns]
         column = ARGS['column']  if isinstance(ARGS['column'],list) else [ARGS['column']]
-        print (odf.head())
-        print (_df.head())
-        # print(pd.merge(odf,_df,rsuffix='_io'))
+        # print (odf.head())
+        # print (_df.head())
+        print(odf.join(_df[column],rsuffix='_io'))
         # print (_df[column].risk.evaluate(flag='synth'))
         # print (odf[column].risk.evaluate(flag='original'))
         # _x = pd.get_dummies(_df[column]).values