From 3fbd68309fb57b467063e9ee0b79eb06ff35c7d7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 28 Feb 2020 21:37:26 -0600 Subject: [PATCH] Handling of continous values --- data/gan.py | 8 +-- data/maker/__init__.py | 114 ++++++++++++++++++++++++++++++++++------- data/maker/__main__.py | 6 +-- 3 files changed, 103 insertions(+), 25 deletions(-) diff --git a/data/gan.py b/data/gan.py index 204f8af..c2aadb5 100644 --- a/data/gan.py +++ b/data/gan.py @@ -604,7 +604,7 @@ class Predict(GNet): r = np.zeros(self.ROW_COUNT) df.columns = self.values if len(found): - print (len(found),NTH_VALID_CANDIDATE) + # print (len(found),NTH_VALID_CANDIDATE) # x = df * self.values # # let's get the missing rows (if any) ... @@ -704,10 +704,10 @@ if __name__ == '__main__' : p = Predict(context=context,label=LABEL,values=values,column=column) p.load_meta(column) r = p.apply() - print (df) - print () + # print (df) + # print () df[column] = r[column] - print (df) + # print (df) else: diff --git a/data/maker/__init__.py b/data/maker/__init__.py index d5a4308..6114ad2 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -14,6 +14,68 @@ import data.gan as gan from transport import factory from data.bridge import Binary import threading as thread +class ContinuousToDiscrete : + @staticmethod + def binary(X,n=4) : + """ + This function will convert a continuous stream of values into a list of one-hot rows (one bit per bin, n bins) + """ + # BOUNDS = np.repeat(np.divide(X.max(),n),n).cumsum().tolist() + + BOUNDS = ContinuousToDiscrete.bounds(X,n) + + # _map = [{"index":BOUNDS.index(i),"ubound":i} for i in BOUNDS] + _matrix = [] + m = [] + for value in X : + x_ = np.zeros(n) + _matrix.append(x_) + for row in BOUNDS : + + if value>= row.left and value <= row.right : + index = BOUNDS.index(row) + x_[index] = 1 + break + + return _matrix + + @staticmethod + def bounds(x,n): + return list(pd.cut(np.array(x),n).categories) + + + + @staticmethod + def continuous(X,BIN_SIZE=4) : + """ + This
function will bin every value of X and draw a random value (rounded to 3 decimals) within the boundaries of its bin + :X continuous values (they are binned internally) + :BIN_SIZE number of bins + """ + BOUNDS = ContinuousToDiscrete.bounds(X,BIN_SIZE) + + values = [] + _BINARY= ContinuousToDiscrete.binary(X,BIN_SIZE) + # # print (BOUNDS) + + # values = [] + for row in _BINARY : + # ubound = BOUNDS[row.index(1)] + index = np.where(row == 1)[0][0] + + ubound = BOUNDS[ index ].right + lbound = BOUNDS[ index ].left + + x_ = np.round(np.random.uniform(lbound,ubound),3).astype(float) + values.append(x_) + + lbound = ubound + + return values + + + + def train (**args) : """ This function is intended to train the GAN in order to learn about the distribution of the features @@ -24,22 +86,30 @@ def train (**args) : :context label of what we are synthesizing """ column = args['column'] if (isinstance(args['column'],list)) else [args['column']] - + CONTINUOUS = args['continuous'] if 'continuous' in args else [] # column_id = args['id'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) df.columns = [name.lower() for name in df.columns] - + # + # @TODO: + # Consider sequential training of sub population for extremely large datasets + # + # # If we have several columns we will proceed one at a time (it could be done in separate threads) # @TODO : Consider performing this task on several threads/GPUs simulataneously # - handler = Binary() - # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values - # args['label'] = handler.Export(df[[column_id]]) - # args['label'] = np.ones(df.shape[0]).reshape(df.shape[0],1) - for col in column : - args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values - # args['real'] = handler.Export(df[[col]]) + for col in column : + # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values + # if 'float' not in df[col].dtypes.name : + # args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values + if 'float' in df[col].dtypes.name and col in
CONTINUOUS: + BIN_SIZE = 10 if 'bin_size' not in args else int(args['bin_size']) + args['real'] = np.array(ContinuousToDiscrete.binary(df[col],BIN_SIZE)).astype(np.float32) + else: + args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values + + args['column'] = col args['context'] = col context = args['context'] @@ -75,7 +145,7 @@ def generate(**args): """ # df = args['data'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) - + CONTINUOUS = args['continuous'] if 'continuous' in args else [] column = args['column'] if (isinstance(args['column'],list)) else [args['column']] # column_id = args['id'] # @@ -86,18 +156,26 @@ def generate(**args): for col in column : args['context'] = col args['column'] = col - values = df[col].unique().tolist() - args['values'] = values - args['row_count'] = df.shape[0] + + if 'float' in df[col].dtypes.name and col in CONTINUOUS : + # + # We should create the bins for the values we are observing here + BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) + values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE) + else: + values = df[col].unique().tolist() + + args['values'] = values + args['row_count'] = df.shape[0] # # we can determine the cardinalities here so we know what to allow or disallow handler = gan.Predict (**args) handler.load_meta(col) - # handler.ROW_COUNT = df[col].shape[0] - r = handler.apply() - # print (r) - # - print ([_df.shape,len(r[col])]) + r = handler.apply() _df[col] = r[col] + # + # @TODO: log basic stats about the synthetic attribute + # + # break return _df \ No newline at end of file diff --git a/data/maker/__main__.py b/data/maker/__main__.py index 583be60..d71d400 100644 --- a/data/maker/__main__.py +++ b/data/maker/__main__.py @@ -17,9 +17,9 @@ if 'config' in SYS_ARGS : odf = pd.read_csv (ARGS['data']) odf.columns = [name.lower() for name in odf.columns] column = ARGS['column'] if isinstance(ARGS['column'],list) else [ARGS['column']] - print
(odf.head()) - print (_df.head()) - # print(pd.merge(odf,_df,rsuffix='_io')) + # print (odf.head()) + # print (_df.head()) + print(odf.join(_df[column],rsuffix='_io')) # print (_df[column].risk.evaluate(flag='synth')) # print (odf[column].risk.evaluate(flag='original')) # _x = pd.get_dummies(_df[column]).values