From f1076f441b712e860feb1b7a5ce0e16489c9b02d Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Tue, 14 Apr 2020 15:14:38 -0500
Subject: [PATCH] limitations on the matrix shape (feature space limitation)
 per partition

---
 data/bridge.py         | 109 ++++++++++++++++++++++++++---------------
 data/maker/__init__.py |  11 +++--
 2 files changed, 77 insertions(+), 43 deletions(-)

diff --git a/data/bridge.py b/data/bridge.py
index 019f065..41c0429 100644
--- a/data/bridge.py
+++ b/data/bridge.py
@@ -153,7 +153,7 @@ class Binary :
     """
     This is a utility class to import and export a data to/from a binary matrix
     """
-    def __stream(self,column) :
+    def __stream(self,column,size=-1) :
         """
         This function will convert a column into a binary matrix with the value-space representing each column of the resulting matrix        
         :column a column vector i.e every item is a row
@@ -162,12 +162,19 @@ class Binary :
         
         values = column.dropna().unique() 
         values.sort()
+        column = column.values
         #
         # Let's treat the case of missing values i.e nulls 
         #       
         row_count,col_count = column.size,values.size
+        if row_count * col_count > size and row_count < size:
+            N = np.divide(size,row_count).astype(int) 
+            i = np.random.choice(col_count,N)
+            values = values[-i]
+            col_count = N
+
        
-        matrix = [ np.zeros(col_count) for i in np.arange(row_count)]
+        matrix = [ np.zeros(col_count,dtype=np.float32) for i in np.arange(row_count)]
         #
         # let's create a binary matrix of the feature that was passed in
         # The indices of the matrix are inspired by classical x,y axis 
@@ -176,14 +183,31 @@ class Binary :
             
             for yi in np.arange(row_count) :
                 value   = column[yi]
-                if value not in values :
-                    continue
-                xi = np.where(values == value)                
-                xi      = xi[0][0] #-- column index            
-                matrix[yi][xi] = 1
+                # if value not in values :
+                #     continue
+                xi = np.where(values == value)    
+                if xi and xi[0].size > 0:         
+                    xi      = xi[0][0] #-- column index            
+                    matrix[yi][xi] = 1
+        
+        return pd.DataFrame(matrix,columns=values)
+    def apply(self,column,size=-1): # public entry point to __stream; size caps rows x columns, -1 (default, as in __stream) leaves it uncapped
+        return self.__stream(column,size)
+    def get_column_values(self,column,size=-1):
+        """Return the sorted unique non-null values of column; when rows x values exceeds size (and rows < size) keep only size // rows of them, sampled without repeats."""
+        values = column.dropna().unique() 
+        values.sort()
         
-        return matrix
-    def Export(self,df) :
+        # Cap the feature space: N = size // row_count is the largest value count
+        # for which row_count * N <= size; positions are drawn WITHOUT replacement.
+        row_count,col_count = column.size,values.size
+        if row_count * col_count > size and row_count < size:
+            N = int(size // row_count)
+            i = np.sort(np.random.choice(col_count,N,replace=False)) # distinct, ascending => no duplicate values, order kept
+            values = values[i]
+        return values
+ 
+    def _Export(self,df) :
         """
         This function will convert a data-frame to a binary matrix
         :return _map,matrix
@@ -192,8 +216,9 @@ class Binary :
         # This will give us a map of how each column was mapped to a bitstream
         
         # _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0)
-        _map = df.fillna('').apply(lambda column: self.__stream(column),axis=0)
+        # _map = df.fillna(np.nan).apply(lambda column: column,axis=0)
         
+        print (df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0))
         #
         # We will merge this to have a healthy matrix
         _matrix =  _map.apply(lambda row: list(list(itertools.chain(*row.values.tolist()))),axis=1)
@@ -239,37 +264,41 @@ if __name__ == '__main__' :
         --pseudo    will create pseudonyms for a given
         --export    will export data to a specified location
     """
-    has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys()
-    has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys()
-    if has_basic and has_action :
-        builder = Builder()
-        if 'export' in SYS_ARGS :
-            print ()
-            print ("exporting ....")
-            if not os.path.exists(SYS_ARGS['export']) :
-                os.mkdir(SYS_ARGS['export'])
-            SQL = builder.encode(**SYS_ARGS)
-            #
-            # Assuming the user wants to filter the records returned :
-            #
+    df = pd.read_csv('sample.csv')
+    print ( pd.get_dummies(df.race))
+    print ( (Binary()).apply(df.race, 30))
+
+    # has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys()
+    # has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys()
+    # if has_basic and has_action :
+    #     builder = Builder()
+    #     if 'export' in SYS_ARGS :
+    #         print ()
+    #         print ("exporting ....")
+    #         if not os.path.exists(SYS_ARGS['export']) :
+    #             os.mkdir(SYS_ARGS['export'])
+    #         SQL = builder.encode(**SYS_ARGS)
+    #         #
+    #         # Assuming the user wants to filter the records returned :
+    #         #
             
-            credentials = service_account.Credentials.from_service_account_file(SYS_ARGS['key'])
-            df  = pd.read_gbq(SQL,credentials =credentials,dialect='standard')
-            FILENAME = os.sep.join([SYS_ARGS['export'],SYS_ARGS['table']+'.csv'])
-            #
-            # This would allow us to export it to wherever we see fit
-            print (FILENAME)
-            df.to_csv(FILENAME,index=False)
-            f = open(FILENAME.replace('.csv','.sql'),'w+')
-            f.write(SQL)
-            f.close()
-        elif 'pseudo' in SYS_ARGS :
-            builder.process(**SYS_ARGS)
-    else:
-        print ("")
-        print (SYS_ARGS.keys())
-        print ("has basic ",has_basic)
-        print ("has action ",has_action)
+    #         credentials = service_account.Credentials.from_service_account_file(SYS_ARGS['key'])
+    #         df  = pd.read_gbq(SQL,credentials =credentials,dialect='standard')
+    #         FILENAME = os.sep.join([SYS_ARGS['export'],SYS_ARGS['table']+'.csv'])
+    #         #
+    #         # This would allow us to export it to wherever we see fit
+    #         print (FILENAME)
+    #         df.to_csv(FILENAME,index=False)
+    #         f = open(FILENAME.replace('.csv','.sql'),'w+')
+    #         f.write(SQL)
+    #         f.close()
+    #     elif 'pseudo' in SYS_ARGS :
+    #         builder.process(**SYS_ARGS)
+    # else:
+    #     print ("")
+    #     print (SYS_ARGS.keys())
+    #     print ("has basic ",has_basic)
+    #     print ("has action ",has_action)
 # pseudonym.apply(table='person',dataset='wgan_original',key='./curation-test-2.json')        
 # args = {"dataset":"wgan_original","table":"observation","key":"./curation-test-2.json"}
 # builder = Builder()
diff --git a/data/maker/__init__.py b/data/maker/__init__.py
index 25392f9..072b2f2 100644
--- a/data/maker/__init__.py
+++ b/data/maker/__init__.py
@@ -135,7 +135,9 @@ def train (**args) :
             # df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False)
             # print (df[col].dtypes)
             # print (df[col].dropna/(axis=1).unique())
-        args['real']  = pd.get_dummies(df[col].dropna()).astype(np.float32).values
+        # args['real']  = pd.get_dummies(df[col].dropna()).astype(np.float32).values
+        msize = args['matrix_size'] if 'matrix_size' in args else -1
+        args['real'] = (Binary()).apply(df[col],msize)
 
             
         
@@ -190,7 +192,7 @@ def generate(**args):
     #
     BIN_SIZE    = 4 if 'bin_size' not in args else int(args['bin_size'])
     NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value']
-    
+    bhandler = Binary()    
     _df     = df.copy()
     for col in column :
         args['context'] = col
@@ -207,7 +209,10 @@ def generate(**args):
         #     values = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32).T
             
         # else:
-        values          = df[col].dropna().unique().tolist()
+        # values          = df[col].dropna().unique().tolist()
+        msize = args['matrix_size'] if 'matrix_size' in args else -1
+        values = bhandler.get_column_values(df[col])
+