From f1076f441b712e860feb1b7a5ce0e16489c9b02d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 15:14:38 -0500 Subject: [PATCH] limitations on the matrix shape (feature space limitation) per partition --- data/bridge.py | 109 ++++++++++++++++++++++++++--------------- data/maker/__init__.py | 11 +++-- 2 files changed, 77 insertions(+), 43 deletions(-) diff --git a/data/bridge.py b/data/bridge.py index 019f065..41c0429 100644 --- a/data/bridge.py +++ b/data/bridge.py @@ -153,7 +153,7 @@ class Binary : """ This is a utility class to import and export a data to/from a binary matrix """ - def __stream(self,column) : + def __stream(self,column,size=-1) : """ This function will convert a column into a binary matrix with the value-space representing each column of the resulting matrix :column a column vector i.e every item is a row @@ -162,12 +162,19 @@ class Binary : values = column.dropna().unique() values.sort() + column = column.values # # Let's treat the case of missing values i.e nulls # row_count,col_count = column.size,values.size + if row_count * col_count > size and row_count < size: + N = np.divide(size,row_count).astype(int) + i = np.random.choice(col_count,N) + values = values[-i] + col_count = N + - matrix = [ np.zeros(col_count) for i in np.arange(row_count)] + matrix = [ np.zeros(col_count,dtype=np.float32) for i in np.arange(row_count)] # # let's create a binary matrix of the feature that was passed in # The indices of the matrix are inspired by classical x,y axis @@ -176,14 +183,31 @@ class Binary : for yi in np.arange(row_count) : value = column[yi] - if value not in values : - continue - xi = np.where(values == value) - xi = xi[0][0] #-- column index - matrix[yi][xi] = 1 + # if value not in values : + # continue + xi = np.where(values == value) + if xi and xi[0].size > 0: + xi = xi[0][0] #-- column index + matrix[yi][xi] = 1 + + return pd.DataFrame(matrix,columns=values) + def apply(self,column,size): + return self.__stream(column,size) + def get_column_values(self,column,size=-1): + values = column.dropna().unique() + values.sort() - return matrix - def Export(self,df) : + # + # Let's treat the case of missing values i.e nulls + # + row_count,col_count = column.size,values.size + if row_count * col_count > size and row_count < size: + N = np.divide(size,row_count).astype(int) + i = np.random.choice(col_count,N) + values = values[-i] + return values + + def _Export(self,df) : """ This function will convert a data-frame to a binary matrix :return _map,matrix @@ -192,8 +216,9 @@ class Binary : # This will give us a map of how each column was mapped to a bitstream # _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0) - _map = df.fillna('').apply(lambda column: self.__stream(column),axis=0) + # _map = df.fillna(np.nan).apply(lambda column: column,axis=0) + print (df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0)) # # We will merge this to have a healthy matrix _matrix = _map.apply(lambda row: list(list(itertools.chain(*row.values.tolist()))),axis=1) @@ -239,37 +264,41 @@ if __name__ == '__main__' : --pseudo will create pseudonyms for a given --export will export data to a specified location """ - has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys() - has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys() - if has_basic and has_action : - builder = Builder() - if 'export' in SYS_ARGS : - print () - print ("exporting ....") - if not os.path.exists(SYS_ARGS['export']) : - os.mkdir(SYS_ARGS['export']) - SQL = builder.encode(**SYS_ARGS) - # - # Assuming the user wants to filter the records returned : - # + df = pd.read_csv('sample.csv') + print ( pd.get_dummies(df.race)) + print ( (Binary()).apply(df.race, 30)) + + # has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys() + # has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys() + # if has_basic and has_action : + # builder = Builder() + # if 'export' in SYS_ARGS : + # print () + # print ("exporting ....") + # if not os.path.exists(SYS_ARGS['export']) : + # os.mkdir(SYS_ARGS['export']) + # SQL = builder.encode(**SYS_ARGS) + # # + # # Assuming the user wants to filter the records returned : + # # - credentials = service_account.Credentials.from_service_account_file(SYS_ARGS['key']) - df = pd.read_gbq(SQL,credentials =credentials,dialect='standard') - FILENAME = os.sep.join([SYS_ARGS['export'],SYS_ARGS['table']+'.csv']) - # - # This would allow us to export it to wherever we see fit - print (FILENAME) - df.to_csv(FILENAME,index=False) - f = open(FILENAME.replace('.csv','.sql'),'w+') - f.write(SQL) - f.close() - elif 'pseudo' in SYS_ARGS : - builder.process(**SYS_ARGS) - else: - print ("") - print (SYS_ARGS.keys()) - print ("has basic ",has_basic) - print ("has action ",has_action) + # credentials = service_account.Credentials.from_service_account_file(SYS_ARGS['key']) + # df = pd.read_gbq(SQL,credentials =credentials,dialect='standard') + # FILENAME = os.sep.join([SYS_ARGS['export'],SYS_ARGS['table']+'.csv']) + # # + # # This would allow us to export it to wherever we see fit + # print (FILENAME) + # df.to_csv(FILENAME,index=False) + # f = open(FILENAME.replace('.csv','.sql'),'w+') + # f.write(SQL) + # f.close() + # elif 'pseudo' in SYS_ARGS : + # builder.process(**SYS_ARGS) + # else: + # print ("") + # print (SYS_ARGS.keys()) + # print ("has basic ",has_basic) + # print ("has action ",has_action) # pseudonym.apply(table='person',dataset='wgan_original',key='./curation-test-2.json') # args = {"dataset":"wgan_original","table":"observation","key":"./curation-test-2.json"} # builder = Builder() diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 25392f9..072b2f2 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -135,7 +135,9 @@ def train (**args) : # df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False) # print (df[col].dtypes) # print (df[col].dropna/(axis=1).unique()) - args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values + # args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values + msize = args['matrix_size'] if 'matrix_size' in args else -1 + args['real'] = (Binary()).apply(df[col],msize) @@ -190,7 +192,7 @@ def generate(**args): # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value'] - + bhandler = Binary() _df = df.copy() for col in column : args['context'] = col @@ -207,7 +209,10 @@ def generate(**args): # values = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32).T # else: - values = df[col].dropna().unique().tolist() + # values = df[col].dropna().unique().tolist() + msize = args['matrix_size'] if 'matrix_size' in args else -1 + values = bhandler.get_column_values(df[col]) +