|
|
@ -153,7 +153,7 @@ class Binary :
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
This is a utility class to import and export a data to/from a binary matrix
|
|
|
|
This is a utility class to import and export a data to/from a binary matrix
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
def __stream(self,column) :
|
|
|
|
def __stream(self,column,size=-1) :
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
This function will convert a column into a binary matrix with the value-space representing each column of the resulting matrix
|
|
|
|
This function will convert a column into a binary matrix with the value-space representing each column of the resulting matrix
|
|
|
|
:column a column vector i.e every item is a row
|
|
|
|
:column a column vector i.e every item is a row
|
|
|
@ -162,12 +162,19 @@ class Binary :
|
|
|
|
|
|
|
|
|
|
|
|
values = column.dropna().unique()
|
|
|
|
values = column.dropna().unique()
|
|
|
|
values.sort()
|
|
|
|
values.sort()
|
|
|
|
|
|
|
|
column = column.values
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# Let's treat the case of missing values i.e nulls
|
|
|
|
# Let's treat the case of missing values i.e nulls
|
|
|
|
#
|
|
|
|
#
|
|
|
|
row_count,col_count = column.size,values.size
|
|
|
|
row_count,col_count = column.size,values.size
|
|
|
|
|
|
|
|
if row_count * col_count > size and row_count < size:
|
|
|
|
|
|
|
|
N = np.divide(size,row_count).astype(int)
|
|
|
|
|
|
|
|
i = np.random.choice(col_count,N)
|
|
|
|
|
|
|
|
values = values[-i]
|
|
|
|
|
|
|
|
col_count = N
|
|
|
|
|
|
|
|
|
|
|
|
matrix = [ np.zeros(col_count) for i in np.arange(row_count)]
|
|
|
|
|
|
|
|
|
|
|
|
matrix = [ np.zeros(col_count,dtype=np.float32) for i in np.arange(row_count)]
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# let's create a binary matrix of the feature that was passed in
|
|
|
|
# let's create a binary matrix of the feature that was passed in
|
|
|
|
# The indices of the matrix are inspired by classical x,y axis
|
|
|
|
# The indices of the matrix are inspired by classical x,y axis
|
|
|
@ -176,14 +183,31 @@ class Binary :
|
|
|
|
|
|
|
|
|
|
|
|
for yi in np.arange(row_count) :
|
|
|
|
for yi in np.arange(row_count) :
|
|
|
|
value = column[yi]
|
|
|
|
value = column[yi]
|
|
|
|
if value not in values :
|
|
|
|
# if value not in values :
|
|
|
|
continue
|
|
|
|
# continue
|
|
|
|
xi = np.where(values == value)
|
|
|
|
xi = np.where(values == value)
|
|
|
|
|
|
|
|
if xi and xi[0].size > 0:
|
|
|
|
xi = xi[0][0] #-- column index
|
|
|
|
xi = xi[0][0] #-- column index
|
|
|
|
matrix[yi][xi] = 1
|
|
|
|
matrix[yi][xi] = 1
|
|
|
|
|
|
|
|
|
|
|
|
return matrix
|
|
|
|
return pd.DataFrame(matrix,columns=values)
|
|
|
|
def Export(self,df) :
|
|
|
|
def apply(self, column, size=-1):
    """
    Public entry point that encodes a single column through __stream.

    :column     a column vector i.e every item is a row
    :size       size budget forwarded unchanged to __stream; defaults to -1,
                the same default __stream itself declares, so callers may omit it
    :return     whatever __stream returns for this column
    """
    return self.__stream(column, size)
|
|
|
|
|
|
|
|
def get_column_values(self, column, size=-1):
    """
    Return the sorted distinct non-null values of a column, down-sampled when a size budget is given.

    :column     a pandas Series, every item is a row
    :size       budget for row-count x distinct-value-count; with the default (-1)
                the sampling condition below is never met and all values are returned
    :return     array of the retained distinct values, in ascending order
    """
    values = column.dropna().unique()
    values.sort()
    #
    # Nulls were dropped above, they never become a value of their own
    #
    row_count, col_count = column.size, values.size
    if row_count * col_count > size and row_count < size:
        #
        # Keep only as many distinct values as fit within the budget.
        # The condition guarantees 1 <= N < col_count, so sampling without
        # replacement is always possible; it yields exactly N distinct values
        # (sampling with replacement could return duplicates), and sorting the
        # picked positions preserves the ascending order of the values.
        #
        N = int(size // row_count)
        picked = np.random.choice(col_count, N, replace=False)
        picked.sort()
        values = values[picked]
    return values
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _Export(self,df) :
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
This function will convert a data-frame to a binary matrix
|
|
|
|
This function will convert a data-frame to a binary matrix
|
|
|
|
:return _map,matrix
|
|
|
|
:return _map,matrix
|
|
|
@ -192,8 +216,9 @@ class Binary :
|
|
|
|
# This will give us a map of how each column was mapped to a bitstream
|
|
|
|
# This will give us a map of how each column was mapped to a bitstream
|
|
|
|
|
|
|
|
|
|
|
|
# _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0)
|
|
|
|
# _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0)
|
|
|
|
_map = df.fillna('').apply(lambda column: self.__stream(column),axis=0)
|
|
|
|
# _map = df.fillna(np.nan).apply(lambda column: column,axis=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print (df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0))
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# We will merge this to have a healthy matrix
|
|
|
|
# We will merge this to have a healthy matrix
|
|
|
|
_matrix = _map.apply(lambda row: list(list(itertools.chain(*row.values.tolist()))),axis=1)
|
|
|
|
_matrix = _map.apply(lambda row: list(list(itertools.chain(*row.values.tolist()))),axis=1)
|
|
|
@ -239,37 +264,41 @@ if __name__ == '__main__' :
|
|
|
|
--pseudo will create pseudonyms for a given
|
|
|
|
--pseudo will create pseudonyms for a given
|
|
|
|
--export will export data to a specified location
|
|
|
|
--export will export data to a specified location
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys()
|
|
|
|
df = pd.read_csv('sample.csv')
|
|
|
|
has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys()
|
|
|
|
print ( pd.get_dummies(df.race))
|
|
|
|
if has_basic and has_action :
|
|
|
|
print ( (Binary()).apply(df.race, 30))
|
|
|
|
builder = Builder()
|
|
|
|
|
|
|
|
if 'export' in SYS_ARGS :
|
|
|
|
|
|
|
|
print ()
|
|
|
|
|
|
|
|
print ("exporting ....")
|
|
|
|
|
|
|
|
if not os.path.exists(SYS_ARGS['export']) :
|
|
|
|
|
|
|
|
os.mkdir(SYS_ARGS['export'])
|
|
|
|
|
|
|
|
SQL = builder.encode(**SYS_ARGS)
|
|
|
|
|
|
|
|
#
|
|
|
|
|
|
|
|
# Assuming the user wants to filter the records returned :
|
|
|
|
|
|
|
|
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
credentials = service_account.Credentials.from_service_account_file(SYS_ARGS['key'])
|
|
|
|
# has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys()
|
|
|
|
df = pd.read_gbq(SQL,credentials =credentials,dialect='standard')
|
|
|
|
# has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys()
|
|
|
|
FILENAME = os.sep.join([SYS_ARGS['export'],SYS_ARGS['table']+'.csv'])
|
|
|
|
# if has_basic and has_action :
|
|
|
|
#
|
|
|
|
# builder = Builder()
|
|
|
|
# This would allow us to export it to wherever we see fit
|
|
|
|
# if 'export' in SYS_ARGS :
|
|
|
|
print (FILENAME)
|
|
|
|
# print ()
|
|
|
|
df.to_csv(FILENAME,index=False)
|
|
|
|
# print ("exporting ....")
|
|
|
|
f = open(FILENAME.replace('.csv','.sql'),'w+')
|
|
|
|
# if not os.path.exists(SYS_ARGS['export']) :
|
|
|
|
f.write(SQL)
|
|
|
|
# os.mkdir(SYS_ARGS['export'])
|
|
|
|
f.close()
|
|
|
|
# SQL = builder.encode(**SYS_ARGS)
|
|
|
|
elif 'pseudo' in SYS_ARGS :
|
|
|
|
# #
|
|
|
|
builder.process(**SYS_ARGS)
|
|
|
|
# # Assuming the user wants to filter the records returned :
|
|
|
|
else:
|
|
|
|
# #
|
|
|
|
print ("")
|
|
|
|
|
|
|
|
print (SYS_ARGS.keys())
|
|
|
|
# credentials = service_account.Credentials.from_service_account_file(SYS_ARGS['key'])
|
|
|
|
print ("has basic ",has_basic)
|
|
|
|
# df = pd.read_gbq(SQL,credentials =credentials,dialect='standard')
|
|
|
|
print ("has action ",has_action)
|
|
|
|
# FILENAME = os.sep.join([SYS_ARGS['export'],SYS_ARGS['table']+'.csv'])
|
|
|
|
|
|
|
|
# #
|
|
|
|
|
|
|
|
# # This would allow us to export it to wherever we see fit
|
|
|
|
|
|
|
|
# print (FILENAME)
|
|
|
|
|
|
|
|
# df.to_csv(FILENAME,index=False)
|
|
|
|
|
|
|
|
# f = open(FILENAME.replace('.csv','.sql'),'w+')
|
|
|
|
|
|
|
|
# f.write(SQL)
|
|
|
|
|
|
|
|
# f.close()
|
|
|
|
|
|
|
|
# elif 'pseudo' in SYS_ARGS :
|
|
|
|
|
|
|
|
# builder.process(**SYS_ARGS)
|
|
|
|
|
|
|
|
# else:
|
|
|
|
|
|
|
|
# print ("")
|
|
|
|
|
|
|
|
# print (SYS_ARGS.keys())
|
|
|
|
|
|
|
|
# print ("has basic ",has_basic)
|
|
|
|
|
|
|
|
# print ("has action ",has_action)
|
|
|
|
# pseudonym.apply(table='person',dataset='wgan_original',key='./curation-test-2.json')
|
|
|
|
# pseudonym.apply(table='person',dataset='wgan_original',key='./curation-test-2.json')
|
|
|
|
# args = {"dataset":"wgan_original","table":"observation","key":"./curation-test-2.json"}
|
|
|
|
# args = {"dataset":"wgan_original","table":"observation","key":"./curation-test-2.json"}
|
|
|
|
# builder = Builder()
|
|
|
|
# builder = Builder()
|
|
|
|