@@ -122,10 +122,20 @@ class Components :
         _args = copy.deepcopy(args)
         # _args['store'] = args['store']['source']
         _args['data'] = df
+        #
+        # The columns that are continuous should also be skipped because they don't need to be synthesied (like-that)
+        if 'continuous' in args :
+            x_cols = args['continuous']
+        else:
+            x_cols = []
 
         if 'ignore' in args and 'columns' in args['ignore'] :
             _cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
             _args['data'] = df[ list(set(df.columns)- set(_cols))]
+        #
+        # We need to make sure that continuous columns are removed
+        if x_cols :
+            _args['data'] = df[list(set(df.columns) - set(x_cols))]
         data.maker.train(**_args)
 
         if 'autopilot' in ( list(args.keys())) :
@@ -136,7 +146,26 @@ class Components :
         pass
-    def post(self,args):
+    def approximate(self,values):
+        """
+        :param values array of values to be approximated
+        """
+        if values.dtype in [int,float] :
+            r = np.random.dirichlet(values)
+            x = []
+            _type = values.dtype
+            for index in np.arange(values.size) :
+                if np.random.choice([0,1],1)[0] :
+                    value = values[index] + (values[index] * r[index])
+                else :
+                    value = values[index] - (values[index] * r[index])
+                value = int(value) if _type == int else np.round(value,2)
+                x.append( value)
+            np.random.shuffle(x)
+            return np.array(x)
+        else:
+            return values
         pass
@@ -179,9 +208,22 @@ class Components :
         _dc = pd.DataFrame()
         # for mdf in df :
         args['data'] = df
+        #
+        # The columns that are continuous should also be skipped because they don't need to be synthesied (like-that)
+        if 'continuous' in args :
+            x_cols = args['continuous']
+        else:
+            x_cols = []
 
         if 'ignore' in args and 'columns' in args['ignore'] :
             _cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
             args['data'] = df[ list(set(df.columns)- set(_cols))]
+        #
+        # We need to remove the continuous columns from the data-frame
+        # @TODO: Abstract this !!
+        #
+        if x_cols :
+            args['data'] = df[list(set(df.columns) - set(x_cols))]
 
         args['candidates'] = 1 if 'candidates' not in args else int(args['candidates'])
@@ -192,7 +234,10 @@ class Components :
         _columns = None
         skip_columns = []
         _schema = schema
-        cols = [_item['name'] for _item in _schema]
+        if schema :
+            cols = [_item['name'] for _item in _schema]
+        else:
+            cols = df.columns
         for _df in candidates :
             #
             # we need to format the fields here to make sure we have something cohesive
@@ -206,6 +251,9 @@ class Components :
             # for _name in _df.columns:
             # if _name in name:
             # skip_columns.append(_name)
+            if x_cols :
+                for _col in x_cols :
+                    _df[_col] = self.approximate(df[_col])
             #
             # We perform a series of set operations to insure that the following conditions are met:
             # - the synthetic dataset only has fields that need to be synthesized
@@ -222,10 +270,16 @@ class Components :
             # Let us merge the dataset here and and have a comprehensive dataset
             _df = pd.DataFrame.join(df,_df)
-            for _item in _schema :
-                if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] :
-                    _df[_item['name']] = _df[_item['name']].astype(str)
-            writer.write(_df[cols],schema=_schema,table=args['from'])
+            if _schema :
+                for _item in _schema :
+                    if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] :
+                        _df[_item['name']] = _df[_item['name']].astype(str)
+                pass
+
+            if _schema :
+                writer.write(_df[cols],schema=_schema,table=args['from'])
+            else:
+                writer.write(_df[cols],table=args['from'])
             # writer.write(df,table=table)
             pass
         else: