From 12d7573ba8915f5ee43e81f46d5a556db899fbcb Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 4 Apr 2021 13:52:15 -0500 Subject: [PATCH] bug fix : approximation --- pipeline.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 2ed5cdc..d73f1fc 100644 --- a/pipeline.py +++ b/pipeline.py @@ -169,14 +169,24 @@ class Components : # # @TODO: create bins? r = np.random.dirichlet(values+.001) #-- dirichlet doesn't work on values with zeros + _sd = values[values > 0].std() + _me = values[values > 0].mean() x = [] _type = values.dtype for index in np.arange(values.size) : if np.random.choice([0,1],1)[0] : value = values[index] + (values[index] * r[index]) + else : value = values[index] - (values[index] * r[index]) + # + # randomly shifting the measurements + if np.random.choice([0,1],1)[0] and _me > _sd: + if np.random.choice([0,1],1)[0] : + value = value * np.divide(_me,_sd) + else: + value = value + (np.divide(_me,_sd)) value = int(value) if _type == int else np.round(value,2) x.append( value) np.random.shuffle(x) @@ -305,7 +315,7 @@ class Components : if real_df[_col].unique().size > 0 : - _df[_col] = self.approximate(real_df[_col]) + _df[_col] = self.approximate(real_df[_col].values) _approx[_col] = { "io":{"min":_df[_col].min().astype(float),"max":_df[_col].max().astype(float),"mean":_df[_col].mean().astype(float),"sd":_df[_col].values.std().astype(float),"missing": _df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":_df[_col].where(_df[_col] == 0).dropna().count().astype(float)}, "real":{"min":real_df[_col].min().astype(float),"max":real_df[_col].max().astype(float),"mean":real_df[_col].mean().astype(float),"sd":real_df[_col].values.std().astype(float),"missing": real_df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count().astype(float)}