You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

262 lines
10 KiB
Python

"""
(c) 2018 - 2021, Vanderbilt University Medical Center
Steve L. Nyemba, steve.l.nyemba@vumc.org
This file is designed to handle preconditions for a generative adversarial network:
- The file will read/get data from a source specified by transport (or data-frame)
- The class will convert the data to a binary vector
- The class will also help rebuild the data from a binary matrix.
Usage :
"""
import transport
import json
import pandas as pd
import numpy as np
"""
# _store_args = _args['store']
# reader = transport.factory.instance(**_store_args)
# sql = _args['sql']
# self.df = reader.read(sql=_args['sql'])
if 'columns' not in _args :
self._initcols(data=self.df)
else:
self._initcols(data=self.df,columns=_args['columns'])
pass
def _initcols (self,**_args) :
"""
This function will initialize the columns to be synthesized and/or determine which ones can be synthesized
:param data data-frame that holds the data (matrix)
:param columns optional columns to be synthesized
"""
# df = _args['data'].copy()
row_count = self.df.shape[0]
cols = None if 'columns' not in _args else _args['columns']
self.columns = self.df.columns.tolist()
4 years ago
"""
#
# @NOTE:
# The map should allow us to be able to convert or reconvert the binary matrix to whatever we want ...
#
# self._matrix = _m
return _values,_m
def revert(self, **_args):
    """
    This function will take in a binary matrix and, based on the map of values (self._map), rebuild a data-frame of actual values
    :param matrix   binary matrix (2-D, indexed as [row,col]) laid out according to the beg/end offsets stored in self._map
    :param column   optional column name; when provided (and truthy) only that entry of the map is reverted
    :return         pandas data-frame with one column per reverted map entry; a row that holds no 1 in its slice yields ''
    :raises KeyError if the 'matrix' argument is not provided
    """
    _column = _args.get('column')
    matrix = _args['matrix']
    r = {}
    for key in self._map:
        if _column and key != _column:
            continue
        _item = self._map[key]
        _beg = _item['beg']
        _end = _item['end']
        columns = np.array(_item['values'])
        #
        # @NOTE: We are accessing matrices in terms of [row,col],
        # The beg,end variables are for the columns in the matrix (mini matrix), _end is not included
        # The slice is applied even when a single column is requested, so the offsets are always
        # interpreted relative to the matrix that was passed in
        #
        _matrix = matrix[:, _beg:_end]
        #
        # Replace the bits by their actual values (accounting for the data-types)
        # The position of the first 1 found in a row selects the value; the lookup is performed once per row
        # @TODO: Find ways to do this on a GPU (for big data) or across threads
        #
        _values = []
        for row in _matrix:
            _hits = np.where(row == 1)[0]
            _values.append(columns[_hits[0]] if _hits.size > 0 else '')
        r[key] = _values
    return pd.DataFrame(r)
def tobinary(self,rows,cols=None) :
"""
This function will compile a binary matrix from a row of values. Hopefully this can be done in parallel: this function can be vectorized and processed
:param rows np.array or list of vector of values
:param cols a space of values if it were to be different from the current sample.
"""
if not cols:
#
# In the advent the sample rows do NOT have the values of the
cols = rows.unique()
cols = np.array(cols)
row_count = len(rows)
# if 'GPU' not in os.environ :