From 8f390931f33bc462f6b57603de65c9d604b6ed54 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 16:24:02 -0500 Subject: [PATCH] bug fix: matrix space restriction --- data/bridge.py | 6 +++--- data/maker/__init__.py | 4 ++-- pipeline.py | 24 +++++++----------------- 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/data/bridge.py b/data/bridge.py index a86deef..2e38431 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -173,7 +173,7 @@ class Binary : # N = i = np.random.choice(col_count,size) values = values[-i] - col_count = N + col_count = size @@ -209,7 +209,7 @@ class Binary : # N = i = np.random.choice(col_count,size) values = values[-i] - col_count = N + col_count = size return values def _Export(self,df) : @@ -271,7 +271,7 @@ if __name__ == '__main__' : """ df = pd.read_csv('sample.csv') print ( pd.get_dummies(df.race)) - print ( (Binary()).apply(df.race, 30)) + print ( (Binary()).apply(df.race, 2)) # has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys() # has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys() diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 072b2f2..78bc08d 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -136,7 +136,7 @@ def train (**args) : # print (df[col].dtypes) # print (df[col].dropna/(axis=1).unique()) # args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values - msize = args['matrix_size'] if 'matrix_size' in args else -1 + msize = args['matrix_size'] if 'matrix_size' in args else 128 args['real'] = (Binary()).apply(df[col],msize) @@ -210,7 +210,7 @@ def generate(**args): # else: # values = df[col].dropna().unique().tolist() - msize = args['matrix_size'] if 'matrix_size' in args else -1 + msize = args['matrix_size'] if 'matrix_size' in args else 128 values = bhandler.get_column_values(df[col]) diff --git a/pipeline.py b/pipeline.py index 80fed9e..54e12c4 100644 --- a/pipeline.py +++ b/pipeline.py @@ -73,21 +73,7 @@ class Components : # @TODO: we need to log something here about the parameters being passed # pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) df = args['data'] - - if 'slice' in args and 'max_rows' in args['slice']: - max_rows = args['slice']['max_rows'] - if df.shape[0] > max_rows : - print (".. slicing ") - i = np.random.choice(df.shape[0],max_rows,replace=False) - df = df.iloc[i] - - - # - # Certain columns need to be removed too large of a matrix - # - # if df.shape[0] == 0 : - # print ("CAN NOT TRAIN EMPTY DATASET ") - # return + # # Now we can parse the arguments and submit the entire thing to training # @@ -102,8 +88,8 @@ class Components : _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) if 'batch_size' in args : _args['batch_size'] = int(args['batch_size']) - - # + + _args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128 # # We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel # if int(args['num_gpu']) > 1 : @@ -157,6 +143,8 @@ class Components : _args['num_gpu'] = 1 os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) _args['no_value']= args['no_value'] + _args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128 + # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 @@ -298,6 +286,8 @@ if __name__ == '__main__' : args[key] = _config[key] args = dict(args,**SYS_ARGS) + if 'matrix_size' in args : + args['matrix_size'] = int(args['matrix_size']) if 'batch_size' not in args : args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size']) if 'dataset' not in args :