From 8f390931f33bc462f6b57603de65c9d604b6ed54 Mon Sep 17 00:00:00 2001
From: Steve Nyemba <steve@the-phi.com>
Date: Tue, 14 Apr 2020 16:24:02 -0500
Subject: [PATCH] bug fix: matrix space restriction (col_count = size; matrix_size defaults to 128)

---
 data/bridge.py         |  6 +++---
 data/maker/__init__.py |  4 ++--
 pipeline.py            | 24 +++++++-----------------
 3 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/data/bridge.py b/data/bridge.py
index a86deef..2e38431 100644
--- a/data/bridge.py
+++ b/data/bridge.py
@@ -173,7 +173,7 @@ class Binary :
             # N = 
             i = np.random.choice(col_count,size)
             values = values[-i]
-            col_count = N
+            col_count = size
             
 
        
@@ -209,7 +209,7 @@ class Binary :
             # N = 
             i = np.random.choice(col_count,size)
             values = values[-i]
-            col_count = N
+            col_count = size
         return values
  
     def _Export(self,df) :
@@ -271,7 +271,7 @@ if __name__ == '__main__' :
     """
     df = pd.read_csv('sample.csv')
     print ( pd.get_dummies(df.race))
-    print ( (Binary()).apply(df.race, 30))
+    print ( (Binary()).apply(df.race, 2))
 
     # has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys()
     # has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys()
diff --git a/data/maker/__init__.py b/data/maker/__init__.py
index 072b2f2..78bc08d 100644
--- a/data/maker/__init__.py
+++ b/data/maker/__init__.py
@@ -136,7 +136,7 @@ def train (**args) :
             # print (df[col].dtypes)
             # print (df[col].dropna/(axis=1).unique())
         # args['real']  = pd.get_dummies(df[col].dropna()).astype(np.float32).values
-        msize = args['matrix_size'] if 'matrix_size' in args else -1
+        msize = args['matrix_size'] if 'matrix_size' in args else 128
         args['real'] = (Binary()).apply(df[col],msize)
 
             
@@ -210,7 +210,7 @@ def generate(**args):
             
         # else:
         # values          = df[col].dropna().unique().tolist()
-        msize = args['matrix_size'] if 'matrix_size' in args else -1
+        msize = args['matrix_size'] if 'matrix_size' in args else 128
         values = bhandler.get_column_values(df[col])
 
         
diff --git a/pipeline.py b/pipeline.py
index 80fed9e..54e12c4 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -73,21 +73,7 @@ class Components :
 		# @TODO: we need to log something here about the parameters being passed
 		# pointer  = args['reader'] if 'reader' in args else lambda: Components.get(**args)
 		df = args['data']
-		
-		if 'slice' in args and 'max_rows' in args['slice']:
-			max_rows = args['slice']['max_rows']
-			if df.shape[0] > max_rows :
-				print (".. slicing ")
-				i = np.random.choice(df.shape[0],max_rows,replace=False)
-				df = df.iloc[i]
-		
-		
-			#
-			# Certain columns need to be removed too large of a matrix
-			#
-		# if df.shape[0] == 0 :
-		# 	print ("CAN NOT TRAIN EMPTY DATASET ")
-		# 	return 
+
 		#
 		# Now we can parse the arguments and submit the entire thing to training
 		#
@@ -102,8 +88,8 @@ class Components :
 		_args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs'])
 		if 'batch_size' in args :
 			_args['batch_size'] = int(args['batch_size'])
-			
-		#
+		
+		_args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128		#
 		# We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel
 		#
 		if int(args['num_gpu']) > 1 :
@@ -157,6 +143,8 @@ class Components :
 		_args['num_gpu'] 	= 1
 		os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) 
 		_args['no_value']= args['no_value']
+		_args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128
+			
 		
 		# MAX_ROWS = args['max_rows']  	if 'max_rows' in args else 0
 		PART_SIZE = int(args['part_size']) if 'part_size' in args else 8
@@ -298,6 +286,8 @@ if __name__ == '__main__' :
 		args[key] = _config[key]
 	
 	args = dict(args,**SYS_ARGS)
+	if 'matrix_size' in args :
+		args['matrix_size'] = int(args['matrix_size'])
 	if 'batch_size' not in args :
 		args['batch_size']	= 2000 #if 'batch_size' not in args else int(args['batch_size'])
 	if 'dataset' not in args :