From 915601236cd0f06a99f2e7fbdbaa5153da7f25f6 Mon Sep 17 00:00:00 2001
From: Steve Nyemba
Date: Wed, 25 Mar 2020 17:43:23 -0500
Subject: [PATCH] bug fix with ICD and some minor improvements

---
 data/gan.py            |  13 +++--
 data/maker/__init__.py |  53 +++++++++++++-----
 pipeline.py            | 124 ++++++++++++++++-------------------
 setup.py               |   2 +-
 4 files changed, 97 insertions(+), 95 deletions(-)

diff --git a/data/gan.py b/data/gan.py
index c85776a..a6dece6 100644
--- a/data/gan.py
+++ b/data/gan.py
@@ -172,7 +172,7 @@ class GNet :
             root = []
             for loc in path.split(os.sep) :
                 root.append(loc)
-                if not os.path.exists(os.sep.join(root)) :
+                if not os.path.exists(os.sep.join(root)) :
                     os.mkdir(os.sep.join(root))
 
         elif not os.path.exists(path):
@@ -535,8 +535,12 @@ class Predict(GNet):
         self.values = args['values']
         self.ROW_COUNT = args['row_count']
         self.oROW_COUNT = self.ROW_COUNT
-
-        self.MISSING_VALUES = args['no_value']
+        if args['no_value'] in ['na','','NA'] :
+            self.MISSING_VALUES = np.nan
+        else :
+            self.MISSING_VALUES = args['no_value']
+        # self.MISSING_VALUES = args['no_value']
+        # self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value']
     def load_meta(self, column):
         super().load_meta(column)
         self.generator.load_meta(column)
@@ -652,7 +656,8 @@ class Predict(GNet):
             if ii.shape[0] > 0 :
                 #
                 #@TODO Have this be a configurable variable
-                missing = np.repeat(0, np.where(ii==1)[0].size)
+
+                missing = np.repeat(self.MISSING_VALUES, np.where(ii==1)[0].size)
             else:
                 missing = []
             #
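The data/gan.py change above replaces the hard-coded 0 that previously filled suppressed cells with the configured missing-value marker, normalizing the string markers 'na', '' and 'NA' to np.nan. A minimal standalone sketch of that substitution (the no_value and mask values below are hypothetical, not taken from the patch):

    import numpy as np

    no_value = 'na'                                  # hypothetical args['no_value']
    MISSING_VALUES = np.nan if no_value in ['na','','NA'] else no_value
    ii = np.array([0, 1, 1, 0, 1])                   # hypothetical mask of suppressed rows
    missing = np.repeat(MISSING_VALUES, np.where(ii == 1)[0].size)
    # previously: np.repeat(0, 3) -> array([0, 0, 0]); now -> array([nan, nan, nan])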
diff --git a/data/maker/__init__.py b/data/maker/__init__.py
index 3a016cf..e252de5 100644
--- a/data/maker/__init__.py
+++ b/data/maker/__init__.py
@@ -62,21 +62,28 @@ class ContinuousToDiscrete :
         BOUNDS = ContinuousToDiscrete.bounds(X,BIN_SIZE)
 
         values = []
-        _BINARY= ContinuousToDiscrete.binary(X,BIN_SIZE)
-        # # print (BOUNDS)
-
-        # values = []
-        for row in _BINARY :
-            # ubound = BOUNDS[row.index(1)]
-            index = np.where(row == 1)[0][0]
+        # _BINARY= ContinuousToDiscrete.binary(X,BIN_SIZE)
+        # # # print (BOUNDS)
+        l = {}
+        for value in X :
+            values += [ np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP) for item in BOUNDS if value >= item.left and value <= item.right ]
+
-            ubound = BOUNDS[ index ].right
-            lbound = BOUNDS[ index ].left
+
+        # # values = []
+        # for row in _BINARY :
+        #     # ubound = BOUNDS[row.index(1)]
+        #     index = np.where(row == 1)[0][0]
+
+        #     ubound = BOUNDS[ index ].right
+        #     lbound = BOUNDS[ index ].left
 
-            x_ = np.round(np.random.uniform(lbound,ubound),ContinuousToDiscrete.ROUND_UP).astype(float)
-            values.append(x_)
+        #     x_ = np.round(np.random.uniform(lbound,ubound),ContinuousToDiscrete.ROUND_UP).astype(float)
+        #     values.append(x_)
 
-            lbound = ubound
+        #     lbound = ubound
+
+        #     values = [np.random.uniform() for item in BOUNDS]
 
         return values
@@ -173,6 +180,8 @@ def generate(**args):
     # If the identifier is not present, we should fine a way to determine or make one
     #
     BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size'])
+    NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value']
+
     _df = df.copy()
     for col in column :
         args['context'] = col
@@ -195,13 +204,29 @@ def generate(**args):
 
         args['values'] = values
         args['row_count'] = df.shape[0]
+        if col in NO_VALUE :
+            args['no_value'] = NO_VALUE[col]
+        else:
+            args['no_value'] = NO_VALUE
+
         #
         # we can determine the cardinalities here so we know what to allow or disallow
         handler = gan.Predict (**args)
         handler.load_meta(col)
         r = handler.apply()
-
-        _df[col] = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col]
+        if col in CONTINUOUS :
+            r[col] = np.array(r[col])
+            MISSING= np.nan if args['no_value'] in ['na','','NA'] else args['no_value']
+
+            if np.isnan(MISSING):
+                i = np.isnan(r[col])
+                i = np.where (i == False)[0]
+            else:
+                i = np.where( r[col] != None)[0]
+            _approx = ContinuousToDiscrete.continuous(r[col][i],BIN_SIZE)
+            r[col][i] = _approx
+
+        _df[col] = r[col] #ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col]
         # _df[col] = r[col]
         #
         # @TODO: log basic stats about the synthetic attribute
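In data/maker/__init__.py, generate() now accepts no_value either as a single marker or as a per-column dictionary, and continuous columns are re-approximated from the bin boundaries only where the generated value is not missing. A minimal sketch of the per-column lookup, with hypothetical column names and markers:

    # hypothetical args['no_value']; a plain string would apply to every column
    raw = {'icd9_code': 'na', 'age': 0}
    NO_VALUE = dict(raw) if type(raw) == dict else raw

    col = 'icd9_code'
    no_value = NO_VALUE[col] if col in NO_VALUE else NO_VALUE
    # no_value -> 'na', which data/gan.py then normalizes to np.nan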
diff --git a/pipeline.py b/pipeline.py
index 76496bd..0d19e60 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -16,7 +16,12 @@ from data.params import SYS_ARGS
 DATASET='combined20191004v2_deid'
 
 class Components :
-
+    class KEYS :
+        PIPELINE_KEY = 'pipeline'
+        SQL_FILTER = 'filter'
+    @staticmethod
+    def get_logger(**args) :
+        return factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
     @staticmethod
     def get(args):
         """
@@ -26,15 +31,19 @@ class Components :
         :condition optional condition and filters
         """
         SQL = args['sql']
-        if 'condition' in args :
-            condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')'])
+        if Components.KEYS.SQL_FILTER in args :
+            SQL_FILTER = Components.KEYS.SQL_FILTER
+            condition = ' '.join([args[SQL_FILTER]['field'],args[SQL_FILTER]['qualifier'],'(',args[SQL_FILTER]['value'],')'])
             SQL = " ".join([SQL,'WHERE',condition])
 
         SQL = SQL.replace(':dataset',args['dataset']) #+ " LI "
 
         if 'limit' in args :
             SQL = SQL + ' LIMIT ' + args['limit']
-
+        #
+        # let's log the sql query that has been performed here
+        logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
+        logger.write({"module":"bigquery","action":"read","input":{"sql":SQL}})
         credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
         df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').astype(object)
         return df
@@ -131,6 +140,7 @@ class Components :
                 _args['num_gpu'] = 1
                 os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu'])
             _args['no_value']= args['no_value']
+
             # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0
             PART_SIZE = int(args['part_size']) if 'part_size' in args else 8
 
@@ -166,19 +176,27 @@ class Components :
             #
             # performing basic analytics on the synthetic data generated (easy to quickly asses)
             #
-            info = {"module":"generate","action":"io-stats","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}}
-            logs = []
-            for name in data_comp.columns.tolist() :
-                g = pd.DataFrame(data_comp.groupby([name]).size())
-                g.columns = ['counts']
-                g[name] = g.index.tolist()
-                g.index = np.arange(g.shape[0])
-                logs.append({"name":name,"counts": g.to_dict(orient='records')})
-            info['input']['logs'] = logs
+            info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}}
+            x = {}
+            for name in args['columns'] :
+                ident = data_comp.apply(lambda row: 1*(row[name]==row[name+'_io']),axis=1).sum()
+                count = data_comp[name].unique().size
+                _ident= data_comp.shape[1] - ident
+                _count= data_comp[name+'_io'].unique().size
+
+                info['input']['logs'] += [{"name":name,"identical":int(ident),"no_identical":int(_ident),"original_count":count,"synthetic_count":_count}]
+            # for name in data_comp.columns.tolist() :
+            #     g = pd.DataFrame(data_comp.groupby([name]).size())
+            #     g.columns = ['counts']
+            #     g[name] = g.index.tolist()
+            #     g.index = np.arange(g.shape[0])
+            #     logs.append({"name":name,"counts": g.to_dict(orient='records')})
+            # info['input']['logs'] = logs
             logger.write(info)
             base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it)
+
             cols = _dc.columns.tolist()
             for name in cols :
                 _args['data'][name] = _dc[name]
                 info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}}
@@ -223,43 +241,14 @@ class Components :
             info ['partition'] = int(partition)
             logger.write({"module":"generate","action":"write","input":info} )
 
-    @staticmethod
-    def callback(channel,method,header,stream):
-        if stream.decode('utf8') in ['QUIT','EXIT','END'] :
-            channel.close()
-            channel.connection.close()
-        info = json.loads(stream)
-        logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':SYS_ARGS['context']})
-
-        logger.write({'module':'process','action':'read-partition','input':info['input']})
-        df = pd.DataFrame(info['data'])
-        args = info['args']
-        if args['num_gpu'] > 1 :
-            args['gpu'] = int(info['input']['partition']) if info['input']['partition'] < 8 else np.random.choice(np.arange(8)).astype(int)
-
-        else:
-            args['gpu'] = 0
-            args['num_gpu'] = 1
-        # if int(args['num_gpu']) > 1 and args['gpu'] > 0:
-        #     args['gpu'] = args['gpu'] + args['num_gpu'] if args['gpu'] + args['num_gpu'] < 8 else args['gpu'] #-- 8 max gpus
-        args['reader'] = lambda: df
-        #
-        # @TODO: Fix
-        # There is an inconsistency in column/columns ... fix this shit!
-        #
-        channel.close()
-        channel.connection.close()
-        args['columns'] = args['column']
-        (Components()).train(**args)
-        logger.write({"module":"process","action":"exit","input":info["input"]})
-
-        pass
+

 if __name__ == '__main__' :
     filename = SYS_ARGS['config'] if 'config' in SYS_ARGS else 'config.json'
     f = open (filename)
-    PIPELINE = json.loads(f.read())
+    _config = json.loads(f.read())
     f.close()
+    PIPELINE = _config['pipeline']
     index = SYS_ARGS['index']
     if index.isnumeric() :
         index = int(SYS_ARGS['index'])
@@ -274,10 +263,17 @@ if __name__ == '__main__' :
     #    print
     print ("..::: ",PIPELINE[index]['context'])
     args = (PIPELINE[index])
-
+    for key in _config :
+        if key == 'pipeline' or key in args:
+            #
+            # skip in case of pipeline or if key exists in the selected pipeline (provided by index)
+            #
+            continue
+
+        args[key] = _config[key]
     args = dict(args,**SYS_ARGS)
 
-    args['logs'] = args['logs'] if 'logs' in args else 'logs'
+
    args['batch_size'] = 2000 if 'batch_size' not in args else int(args['batch_size'])
     if 'dataset' not in args :
         args['dataset'] = 'combined20191004v2_deid'
@@ -340,38 +336,14 @@ if __name__ == '__main__' :
 
         else:
             generator.generate(args)
         # Components.generate(args)
-    elif 'listen' in args :
+    elif 'finalize' in args :
         #
-        # This will start a worker just in case to listen to a queue
-        SYS_ARGS = dict(args) #-- things get lost in context
-        if 'read' in SYS_ARGS :
-            QUEUE_TYPE = 'queue.QueueReader'
-            pointer = lambda qreader: qreader.read()
-        else:
-            QUEUE_TYPE = 'queue.QueueListener'
-            pointer = lambda qlistener: qlistener.listen()
-        N = int(SYS_ARGS['jobs']) if 'jobs' in SYS_ARGS else 1
-
-        qhandlers = [factory.instance(type=QUEUE_TYPE,args={'queue':'aou.io'}) for i in np.arange(N)]
-        jobs = []
-        for qhandler in qhandlers :
-            qhandler.callback = Components.callback
-            job = Process(target=pointer,args=(qhandler,))
-            job.start()
-            jobs.append(job)
+        # This will finalize a given set of synthetic operations into a table
         #
-        # let us wait for the jobs
-        print (["Started ",len(jobs)," trainers"])
-        while len(jobs) > 0 :
-
-            jobs = [job for job in jobs if job.is_alive()]
-            time.sleep(2)
+        idataset = args['input'] if 'input' in args else 'io' #-- input dataset
+        odataset = args['output'] #-- output dataset
+        labels = [name.strip() for name in args['labels'].split(',') ]
 
-        #     pointer(qhandler)
-
-
-        # qreader.read(1)
-        pass
     else:
 
         # DATA = np.array_split(DATA,PART_SIZE)
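With the pipeline.py changes above, config.json nests the list of jobs under a 'pipeline' key, every other top-level key is merged into the selected entry unless that entry already defines it, and the optional SQL restriction is read from a 'filter' object (field, qualifier, value) instead of 'condition'. A sketch of the expected shape, written as a Python dict; the table, field and values are made up and only the key names come from the code:

    _config = {
        "no_value": "na",                        # merged into every pipeline entry that does not set it
        "pipeline": [
            {
                "context": "observation",        # hypothetical job name
                "sql": "SELECT * FROM :dataset.observation",
                "columns": ["observation_source_value"],
                "filter": {"field": "observation_source_value", "qualifier": "IN", "value": "'x','y'"}
            }
        ]
    }
    # Components.get() would expand the filter to: ... WHERE observation_source_value IN ( 'x','y' )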
diff --git a/setup.py b/setup.py
index 0f38464..c441e36 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@ import sys
 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 
-args = {"name":"data-maker","version":"1.2.4","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
+args = {"name":"data-maker","version":"1.2.5","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
     "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
 args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
 args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'