Limit the binary matrix shape (feature-space size) per partition

dev
Steve L. Nyemba 5 years ago
parent 3dde3bf4ef
commit f1076f441b

@ -153,7 +153,7 @@ class Binary :
""" """
This is a utility class to import and export a data to/from a binary matrix This is a utility class to import and export a data to/from a binary matrix
""" """
def __stream(self,column) : def __stream(self,column,size=-1) :
""" """
This function will convert a column into a binary matrix with the value-space representing each column of the resulting matrix This function will convert a column into a binary matrix with the value-space representing each column of the resulting matrix
:column a column vector i.e every item is a row :column a column vector i.e every item is a row
@ -162,12 +162,19 @@ class Binary :
values = column.dropna().unique() values = column.dropna().unique()
values.sort() values.sort()
column = column.values
# #
# Let's treat the case of missing values i.e nulls # Let's treat the case of missing values i.e nulls
# #
row_count,col_count = column.size,values.size row_count,col_count = column.size,values.size
if row_count * col_count > size and row_count < size:
N = np.divide(size,row_count).astype(int)
i = np.random.choice(col_count,N)
values = values[-i]
col_count = N
matrix = [ np.zeros(col_count) for i in np.arange(row_count)] matrix = [ np.zeros(col_count,dtype=np.float32) for i in np.arange(row_count)]
# #
# let's create a binary matrix of the feature that was passed in # let's create a binary matrix of the feature that was passed in
# The indices of the matrix are inspired by classical x,y axis # The indices of the matrix are inspired by classical x,y axis
@ -176,14 +183,31 @@ class Binary :
for yi in np.arange(row_count) : for yi in np.arange(row_count) :
value = column[yi] value = column[yi]
if value not in values : # if value not in values :
continue # continue
xi = np.where(values == value) xi = np.where(values == value)
xi = xi[0][0] #-- column index if xi and xi[0].size > 0:
matrix[yi][xi] = 1 xi = xi[0][0] #-- column index
matrix[yi][xi] = 1
return pd.DataFrame(matrix,columns=values)
def apply(self,column,size):
return self.__stream(column,size)
def get_column_values(self,column,size=-1):
values = column.dropna().unique()
values.sort()
return matrix #
def Export(self,df) : # Let's treat the case of missing values i.e nulls
#
row_count,col_count = column.size,values.size
if row_count * col_count > size and row_count < size:
N = np.divide(size,row_count).astype(int)
i = np.random.choice(col_count,N)
values = values[-i]
return values
def _Export(self,df) :
""" """
This function will convert a data-frame to a binary matrix This function will convert a data-frame to a binary matrix
:return _map,matrix :return _map,matrix
@ -192,8 +216,9 @@ class Binary :
# This will give us a map of how each column was mapped to a bitstream # This will give us a map of how each column was mapped to a bitstream
# _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0) # _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0)
_map = df.fillna('').apply(lambda column: self.__stream(column),axis=0) # _map = df.fillna(np.nan).apply(lambda column: column,axis=0)
print (df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0))
# #
# We will merge this to have a healthy matrix # We will merge this to have a healthy matrix
_matrix = _map.apply(lambda row: list(list(itertools.chain(*row.values.tolist()))),axis=1) _matrix = _map.apply(lambda row: list(list(itertools.chain(*row.values.tolist()))),axis=1)
@ -239,37 +264,41 @@ if __name__ == '__main__' :
--pseudo will create pseudonyms for a given --pseudo will create pseudonyms for a given
--export will export data to a specified location --export will export data to a specified location
""" """
has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys() df = pd.read_csv('sample.csv')
has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys() print ( pd.get_dummies(df.race))
if has_basic and has_action : print ( (Binary()).apply(df.race, 30))
builder = Builder()
if 'export' in SYS_ARGS : # has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys()
print () # has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys()
print ("exporting ....") # if has_basic and has_action :
if not os.path.exists(SYS_ARGS['export']) : # builder = Builder()
os.mkdir(SYS_ARGS['export']) # if 'export' in SYS_ARGS :
SQL = builder.encode(**SYS_ARGS) # print ()
# # print ("exporting ....")
# Assuming the user wants to filter the records returned : # if not os.path.exists(SYS_ARGS['export']) :
# # os.mkdir(SYS_ARGS['export'])
# SQL = builder.encode(**SYS_ARGS)
# #
# # Assuming the user wants to filter the records returned :
# #
credentials = service_account.Credentials.from_service_account_file(SYS_ARGS['key']) # credentials = service_account.Credentials.from_service_account_file(SYS_ARGS['key'])
df = pd.read_gbq(SQL,credentials =credentials,dialect='standard') # df = pd.read_gbq(SQL,credentials =credentials,dialect='standard')
FILENAME = os.sep.join([SYS_ARGS['export'],SYS_ARGS['table']+'.csv']) # FILENAME = os.sep.join([SYS_ARGS['export'],SYS_ARGS['table']+'.csv'])
# # #
# This would allow us to export it to wherever we see fit # # This would allow us to export it to wherever we see fit
print (FILENAME) # print (FILENAME)
df.to_csv(FILENAME,index=False) # df.to_csv(FILENAME,index=False)
f = open(FILENAME.replace('.csv','.sql'),'w+') # f = open(FILENAME.replace('.csv','.sql'),'w+')
f.write(SQL) # f.write(SQL)
f.close() # f.close()
elif 'pseudo' in SYS_ARGS : # elif 'pseudo' in SYS_ARGS :
builder.process(**SYS_ARGS) # builder.process(**SYS_ARGS)
else: # else:
print ("") # print ("")
print (SYS_ARGS.keys()) # print (SYS_ARGS.keys())
print ("has basic ",has_basic) # print ("has basic ",has_basic)
print ("has action ",has_action) # print ("has action ",has_action)
# pseudonym.apply(table='person',dataset='wgan_original',key='./curation-test-2.json') # pseudonym.apply(table='person',dataset='wgan_original',key='./curation-test-2.json')
# args = {"dataset":"wgan_original","table":"observation","key":"./curation-test-2.json"} # args = {"dataset":"wgan_original","table":"observation","key":"./curation-test-2.json"}
# builder = Builder() # builder = Builder()

@ -135,7 +135,9 @@ def train (**args) :
# df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False) # df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False)
# print (df[col].dtypes) # print (df[col].dtypes)
# print (df[col].dropna/(axis=1).unique()) # print (df[col].dropna/(axis=1).unique())
args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values # args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values
msize = args['matrix_size'] if 'matrix_size' in args else -1
args['real'] = (Binary()).apply(df[col],msize)
@ -190,7 +192,7 @@ def generate(**args):
# #
BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size'])
NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value'] NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value']
bhandler = Binary()
_df = df.copy() _df = df.copy()
for col in column : for col in column :
args['context'] = col args['context'] = col
@ -207,7 +209,10 @@ def generate(**args):
# values = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32).T # values = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32).T
# else: # else:
values = df[col].dropna().unique().tolist() # values = df[col].dropna().unique().tolist()
msize = args['matrix_size'] if 'matrix_size' in args else -1
values = bhandler.get_column_values(df[col])

Loading…
Cancel
Save