|
|
|
@ -178,13 +178,14 @@ class Components :
|
|
|
|
|
#
|
|
|
|
|
info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}}
|
|
|
|
|
x = {}
|
|
|
|
|
for name in args['columns'] :
|
|
|
|
|
ident = data_comp.apply(lambda row: 1*(row[name]==row[name+'_io']),axis=1).sum()
|
|
|
|
|
count = data_comp[name].unique().size
|
|
|
|
|
_ident= data_comp.shape[1] - ident
|
|
|
|
|
_count= data_comp[name+'_io'].unique().size
|
|
|
|
|
|
|
|
|
|
info['input']['logs'] += [{"name":name,"identical":int(ident),"no_identical":int(_ident),"original_count":count,"synthetic_count":_count}]
|
|
|
|
|
# for name in args['columns'] :
|
|
|
|
|
# ident = data_comp.apply(lambda row: 1*(row[name]==row[name+'_io']),axis=1).sum()
|
|
|
|
|
# count = data_comp[name].unique().size
|
|
|
|
|
# _ident= data_comp.shape[1] - ident
|
|
|
|
|
# _count= data_comp[name+'_io'].unique().size
|
|
|
|
|
# _count= len(set(data_comp[name+'_io'].values.tolist()))
|
|
|
|
|
|
|
|
|
|
# info['input']['logs'] += [{"name":name,"identical":int(ident),"no_identical":int(_ident),"original_count":count,"synthetic_count":_count}]
|
|
|
|
|
# for name in data_comp.columns.tolist() :
|
|
|
|
|
# g = pd.DataFrame(data_comp.groupby([name]).size())
|
|
|
|
|
# g.columns = ['counts']
|
|
|
|
@ -192,17 +193,17 @@ class Components :
|
|
|
|
|
# g.index = np.arange(g.shape[0])
|
|
|
|
|
# logs.append({"name":name,"counts": g.to_dict(orient='records')})
|
|
|
|
|
# info['input']['logs'] = logs
|
|
|
|
|
logger.write(info)
|
|
|
|
|
# logger.write(info)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it)
|
|
|
|
|
cols = _dc.columns.tolist()
|
|
|
|
|
for name in cols :
|
|
|
|
|
_args['data'][name] = _dc[name]
|
|
|
|
|
info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}}
|
|
|
|
|
if partition != '' :
|
|
|
|
|
info['partition'] = int(partition)
|
|
|
|
|
logger.write(info)
|
|
|
|
|
# for name in cols :
|
|
|
|
|
# _args['data'][name] = _dc[name]
|
|
|
|
|
# info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}}
|
|
|
|
|
# if partition != '' :
|
|
|
|
|
# info['partition'] = int(partition)
|
|
|
|
|
# logger.write(info)
|
|
|
|
|
|
|
|
|
|
# filename = os.sep.join([log_folder,'output',name+'.csv'])
|
|
|
|
|
# data_comp[[name]].to_csv(filename,index=False)
|
|
|
|
|