From 2f6f43c9c694383d02563b6e3fa4abe9471c4f95 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 18 Mar 2020 23:16:36 -0500 Subject: [PATCH] bug fix: statistics for quick assessment --- pipeline.py | 20 ++++++++++++++++++-- setup.py | 2 +- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index b838043..76496bd 100644 --- a/pipeline.py +++ b/pipeline.py @@ -163,6 +163,21 @@ class Components : cols = _dc.columns.tolist() data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query) + # + # performing basic analytics on the synthetic data generated (easy to quickly assess) + # + info = {"module":"generate","action":"io-stats","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} + logs = [] + for name in data_comp.columns.tolist() : + g = pd.DataFrame(data_comp.groupby([name]).size()) + g.columns = ['counts'] + g[name] = g.index.tolist() + g.index = np.arange(g.shape[0]) + logs.append({"name":name,"counts": g.to_dict(orient='records')}) + info['input']['logs'] = logs + logger.write(info) + + base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) for name in cols : _args['data'][name] = _dc[name] @@ -170,6 +185,7 @@ class Components : if partition != '' : info['partition'] = int(partition) logger.write(info) + # filename = os.sep.join([log_folder,'output',name+'.csv']) # data_comp[[name]].to_csv(filename,index=False) @@ -197,10 +213,10 @@ class Components : if 'dump' in args : print (_args['data'].head()) else: - data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=50000) + data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' - 
_args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=50000) + _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) _id = 'dataset' info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} } if partition : diff --git a/setup.py b/setup.py index 4a4e87b..0f38464 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.3","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.4","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'