@@ -163,6 +163,21 @@ class Components:
cols = _dc.columns.tolist()
data_comp = _args['data'][args['columns']].join(_dc[args['columns']], rsuffix='_io') #-- will be used for comparison (store this in big query)
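#
# Illustration (hypothetical column names): if args['columns'] == ['gender'],
# the join above lines up each original column with its synthetic counterpart,
# yielding columns like ['gender', 'gender_io'], one row per record, for a
# side-by-side comparison.
#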
#
# performing basic analytics on the synthetic data generated (easy to quickly assess)
#
info = { " module " : " generate " , " action " : " io-stats " , " input " : { " rows " : data_comp . shape [ 0 ] , " partition " : partition , " logs " : [ ] } }
logs = []
for name in data_comp.columns.tolist():
    g = pd.DataFrame(data_comp.groupby([name]).size())
    g.columns = ['counts']
    g[name] = g.index.tolist()
    g.index = np.arange(g.shape[0])
    logs.append({"name": name, "counts": g.to_dict(orient='records')})
info['input']['logs'] = logs
logger.write(info)
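#
# Each entry in logs is the value-frequency table for one column, reshaped
# into records, e.g. (hypothetical values):
#   {"name": "gender", "counts": [{"counts": 120, "gender": "female"}, ...]}
#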
base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuild the dataset (and store it)
for name in cols:
    _args['data'][name] = _dc[name]
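# At this point _args['data'] is the original frame with the synthesized
# columns swapped in, i.e. the full rebuilt dataset persisted further below.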
@@ -170,6 +185,7 @@ class Components:
if partition != '':
    info['partition'] = int(partition)
logger.write(info)
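# NOTE (assumption): partition arrives as a string; casting it to int lets
# downstream consumers of the logs filter on it numerically.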
# filename = os.sep.join([log_folder,'output',name+'.csv'])
# data_comp[[name]].to_csv(filename,index=False)
@@ -197,10 +213,10 @@ class Components:
if 'dump' in args:
    print(_args['data'].head())
else:
    data_comp.to_gbq(if_exists='append', destination_table=partial, credentials=credentials, chunksize=90000)
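    # data_comp (real and synthetic values side by side) goes to the 'partial'
    # comparison table, as flagged in the join comment above.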
    INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append'
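    # NOTE: INSERT_FLAG is assigned here but not used within this hunk; in
    # pandas-gbq terms, 'replace' overwrites the target table while 'append' adds rows.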
    _args['data'].to_gbq(if_exists='append', destination_table=complete, credentials=credentials, chunksize=90000)
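    # Both writes batch rows via chunksize; this change raises it from 50000
    # to 90000 rows per upload request.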
    _id = 'dataset'
    info = {"full": {_id: _fname, "rows": _args['data'].shape[0]}, "partial": {"path": _pname, "rows": data_comp.shape[0]}}
    if partition: