Handling of continuous values

dev
Steve L. Nyemba 5 years ago
parent bd6fb03f8d
commit 3fbd68309f

@ -604,7 +604,7 @@ class Predict(GNet):
r = np.zeros(self.ROW_COUNT) r = np.zeros(self.ROW_COUNT)
df.columns = self.values df.columns = self.values
if len(found): if len(found):
print (len(found),NTH_VALID_CANDIDATE) # print (len(found),NTH_VALID_CANDIDATE)
# x = df * self.values # x = df * self.values
# #
# let's get the missing rows (if any) ... # let's get the missing rows (if any) ...
@ -704,10 +704,10 @@ if __name__ == '__main__' :
p = Predict(context=context,label=LABEL,values=values,column=column) p = Predict(context=context,label=LABEL,values=values,column=column)
p.load_meta(column) p.load_meta(column)
r = p.apply() r = p.apply()
print (df) # print (df)
print () # print ()
df[column] = r[column] df[column] = r[column]
print (df) # print (df)
else: else:

@ -14,6 +14,68 @@ import data.gan as gan
from transport import factory from transport import factory
from data.bridge import Binary from data.bridge import Binary
import threading as thread import threading as thread
class ContinuousToDiscrete :
    """
    Helpers to turn a continuous series into one-hot encoded bins and to draw
    approximate continuous values back out of those bins.

    The bins are the equal-width bins computed by pandas.cut over the observed values.
    """
    @staticmethod
    def binary(X,n=4) :
        """
        This function converts a continuous stream of values into a one-hot matrix of bins
        :X  one dimensional sequence of numeric values (list, numpy array or pandas Series)
        :n  number of equal-width bins
        :return numpy array of shape (len(X),n); the row of a value has a single 1 at the
                position of its bin, a missing value (NaN) yields a row of zeros
        """
        values = np.array(X)
        _matrix = np.zeros((len(values),n))
        if len(values) == 0 :
            return _matrix
        #
        # labels=False returns the position of the bin for every value. The bin is resolved
        # against the exact edges, whereas the Interval labels are rounded (precision=3) and
        # may leave out values sitting close to an edge (e.g. the maximum)
        codes = np.asarray(pd.cut(values,n,labels=False))
        rows = np.flatnonzero(~pd.isnull(codes))
        _matrix[rows,codes[rows].astype(int)] = 1
        #
        # A numpy array (not a list) is returned so callers can chain .astype(np.float32)
        return _matrix
    @staticmethod
    def bounds(x,n):
        """
        This function returns the boundaries of the bins
        :x  one dimensional sequence of numeric values
        :n  number of equal-width bins
        :return list of n pandas Interval objects, as labelled by pandas.cut
        """
        return list(pd.cut(np.array(x),n).categories)
    @staticmethod
    def continuous(X,BIN_SIZE=4) :
        """
        This function will approximate every value with a random draw from the bin it falls in
        :X          one dimensional sequence of numeric values (the raw observations)
        :BIN_SIZE   number of equal-width bins
        :return list with one value per entry of X, drawn uniformly between the bounds of
                its bin and rounded to 3 decimals; NaN when the entry has no bin (missing value)
        """
        observed = np.array(X)
        if len(observed) == 0 :
            return []
        BOUNDS = ContinuousToDiscrete.bounds(observed,BIN_SIZE)
        _BINARY= ContinuousToDiscrete.binary(observed,BIN_SIZE)
        values = []
        for row in _BINARY :
            hits = np.flatnonzero(row == 1)
            if hits.size == 0 :
                #
                # no bin was found (missing value), there is nothing to sample from
                values.append(np.nan)
                continue
            interval = BOUNDS[ hits[0] ]
            x_ = np.round(np.random.uniform(interval.left,interval.right),3).astype(float)
            values.append(x_)
        return values
def train (**args) : def train (**args) :
""" """
This function is intended to train the GAN in order to learn about the distribution of the features This function is intended to train the GAN in order to learn about the distribution of the features
@ -24,22 +86,30 @@ def train (**args) :
:context label of what we are synthesizing :context label of what we are synthesizing
""" """
column = args['column'] if (isinstance(args['column'],list)) else [args['column']] column = args['column'] if (isinstance(args['column'],list)) else [args['column']]
CONTINUOUS = args['continuous'] if 'continuous' in args else []
# column_id = args['id'] # column_id = args['id']
df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])
df.columns = [name.lower() for name in df.columns] df.columns = [name.lower() for name in df.columns]
#
# @TODO:
# Consider sequential training of sub population for extremely large datasets
#
# #
# If we have several columns we will proceed one at a time (it could be done in separate threads) # If we have several columns we will proceed one at a time (it could be done in separate threads)
# @TODO : Consider performing this task on several threads/GPUs simulataneously # @TODO : Consider performing this task on several threads/GPUs simulataneously
# #
handler = Binary()
# args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values
# args['label'] = handler.Export(df[[column_id]])
# args['label'] = np.ones(df.shape[0]).reshape(df.shape[0],1)
for col in column : for col in column :
# args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values
# if 'float' not in df[col].dtypes.name :
# args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values
if 'float' in df[col].dtypes.name and col in CONTINUOUS:
BIN_SIZE = 10 if 'bin_size' not in args else int(args['bin_size'])
args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32)
else:
args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values
# args['real'] = handler.Export(df[[col]])
args['column'] = col args['column'] = col
args['context'] = col args['context'] = col
context = args['context'] context = args['context']
@ -75,7 +145,7 @@ def generate(**args):
""" """
# df = args['data'] # df = args['data']
df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])
CONTINUOUS = args['continous'] if 'continuous' in args else []
column = args['column'] if (isinstance(args['column'],list)) else [args['column']] column = args['column'] if (isinstance(args['column'],list)) else [args['column']]
# column_id = args['id'] # column_id = args['id']
# #
@ -86,18 +156,26 @@ def generate(**args):
for col in column : for col in column :
args['context'] = col args['context'] = col
args['column'] = col args['column'] = col
if 'float' in df[col].dtypes.name or col in CONTINUOUS :
#
# We should create the bins for the values we are observing here
BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size'])
values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE)
else:
values = df[col].unique().tolist() values = df[col].unique().tolist()
args['values'] = values args['values'] = values
args['row_count'] = df.shape[0] args['row_count'] = df.shape[0]
# #
# we can determine the cardinalities here so we know what to allow or disallow # we can determine the cardinalities here so we know what to allow or disallow
handler = gan.Predict (**args) handler = gan.Predict (**args)
handler.load_meta(col) handler.load_meta(col)
# handler.ROW_COUNT = df[col].shape[0]
r = handler.apply() r = handler.apply()
# print (r)
#
print ([_df.shape,len(r[col])])
_df[col] = r[col] _df[col] = r[col]
#
# @TODO: log basic stats about the synthetic attribute
#
# break # break
return _df return _df

@ -17,9 +17,9 @@ if 'config' in SYS_ARGS :
odf = pd.read_csv (ARGS['data']) odf = pd.read_csv (ARGS['data'])
odf.columns = [name.lower() for name in odf.columns] odf.columns = [name.lower() for name in odf.columns]
column = ARGS['column'] if isinstance(ARGS['column'],list) else [ARGS['column']] column = ARGS['column'] if isinstance(ARGS['column'],list) else [ARGS['column']]
print (odf.head()) # print (odf.head())
print (_df.head()) # print (_df.head())
# print(pd.merge(odf,_df,rsuffix='_io')) print(odf.join(_df[column],rsuffix='_io'))
# print (_df[column].risk.evaluate(flag='synth')) # print (_df[column].risk.evaluate(flag='synth'))
# print (odf[column].risk.evaluate(flag='original')) # print (odf[column].risk.evaluate(flag='original'))
# _x = pd.get_dummies(_df[column]).values # _x = pd.get_dummies(_df[column]).values

Loading…
Cancel
Save