|
|
|
"""
|
|
|
|
(c) 2018 - 2021, Vanderbilt University Medical Center
|
|
|
|
Steve L. Nyemba, steve.l.nyemba@vumc.org
|
|
|
|
|
|
|
|
This file is designed to handle preconditions for a generative adversarial network:
|
|
|
|
- The file will read/get data from a source specified by transport (or data-frame)
|
|
|
|
- The class will convert the data to a binary vector
|
|
|
|
- The class will also help rebuild the data from a binary matrix.
|
|
|
|
Usage :
|
|
|
|
|
|
|
|
"""
|
|
|
|
import transport
|
|
|
|
import json
|
|
|
|
import pandas as pd
|
|
|
|
import numpy as np
|
|
|
|
|
|
|
|
if 'columns' not in _args :
|
|
|
|
self._initcols(data=self.df)
|
|
|
|
else:
|
|
|
|
self._initcols(data=self.df,columns=_args['columns'])
|
|
|
|
|
|
|
|
pass
|
|
|
|
def _initcols(self, **_args):
    """
    This function will initialize the columns held by this instance from the current data-frame

    :param data     data-frame that holds the data (matrix); accepted for compatibility with callers
    :param columns  optional columns to be synthesized; accepted for compatibility with callers
    """
    #
    # NOTE(review): neither 'data' nor 'columns' is consumed by the logic below, the column list is
    # always taken from self.df. Confirm whether a narrowing to 'columns' is expected here.
    #
    self.columns = self.df.columns.tolist()
|
|
|
|
|
|
|
|
def revert(self, **_args):
    """
    This function will take in a binary matrix and based on the map of values (self._map) it will
    repopulate it with the original values

    :param matrix   binary matrix (np.array or anything np.asarray accepts), one row per record
    :param column   optional column name, if specified only this column is rebuilt
    :return         pd.DataFrame with one column per rebuilt key; a row without any bit set yields ''
    """
    _column = _args.get('column')
    matrix = np.asarray(_args['matrix'])
    r = {}
    for key in self._map:
        if _column and key != _column:
            continue
        _item = self._map[key]
        values = np.array(_item['values'])
        #
        # @NOTE: We are accessing matrices in terms of [row,col],
        # The beg,end variables are for the columns in the matrix (mini matrix), end is not included
        #
        _matrix = matrix[:, _item['beg']:_item['end']]
        #
        # Replace the bits by their actual values (accounting for the data-types)
        # The first bit set on a row selects the value, the lookup is performed once per row
        # @TODO: Find ways to do this on a GPU (for big data) or across threads
        #
        decoded = []
        for row in _matrix:
            hits = np.flatnonzero(row == 1)
            decoded.append(values[hits[0]] if hits.size > 0 else '')
        r[key] = decoded

    return pd.DataFrame(r)
|
|
|
|
|
|
|
|
def tobinary(self, rows, cols=None):
    """
    This function will compile a binary (one-hot) matrix from a vector of values

    :param rows     pd.Series, np.array or list holding the vector of values
    :param cols     a space of values if it were to be different from the current sample,
                    by default the distinct values found in rows are used
    :return         (cols, matrix) where cols is an np.array of the space of values and matrix is an
                    np.array of zeros with shape (len(rows), cols.size) holding 1 wherever the
                    value of a row equals the value of a column
    """
    #
    # Values are accessed by position, a pandas index would otherwise be looked up by label
    if isinstance(rows, (pd.Series, pd.Index)):
        values = rows.to_numpy()
    elif isinstance(rows, np.ndarray):
        values = rows
    else:
        values = np.array(rows, dtype=object)

    if cols is None or len(cols) == 0:
        #
        # In the event no space of values is provided, it is inferred from the sample rows.
        # The explicit test avoids the ambiguous truth value of arrays/series
        cols = rows.unique() if hasattr(rows, 'unique') else pd.unique(values)
    cols = np.array(cols)

    _matrix = np.zeros([len(values), cols.size])
    for i, value in enumerate(values):
        #
        # A value absent from cols leaves the row filled with zeros
        _matrix[i, np.flatnonzero(cols == value)] = 1
    #
    # @TODO: consider a GPU path for large inputs
    return cols, _matrix
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    #
    # Manual smoke test: encode the 'age' column of a sample file, rebuild it and print the outcome
    df = pd.read_csv('../../sample.csv')
    _input = Input(data=df, columns=['age', 'race'])
    _m = _input.convert(column='age')
    print(_m.shape)
    print(_input.revert(matrix=_m, column='age'))
    print(_input._metadf)

    # _args = {"store":{"type":"sql.BQReader","args":{"service_key":"/home/steve/dev/aou/accounts/curation-prod.json"}}}
    # _args['table'] = 'io.observation'
    # _i = Input(**_args)
    # df = pd.read_csv('../../sample.csv')
    # print (Input.ToBinary(df.age))
|