bug fix: multiple conditions on statement

5 years ago · 6d84b25d95
parent 87d54c508d
commit 6d84b25d95
2 changed files with 38 additions and 52 deletions
--- a/data/gan.py
+++ b/data/gan.py
@ -536,10 +536,10 @@ class Predict(GNet):
                self.values     = args['values']
                self.ROW_COUNT  = args['row_count']
                self.oROW_COUNT = self.ROW_COUNT
-                if args['no_value'] in ['na','','NA'] :
                self.MISSING_VALUES = np.nan
-                else :
+                if 'no_value' in args and args['no_value'] not in ['na','','NA'] :
                        self.MISSING_VALUES = args['no_value']
+                        
                # self.MISSING_VALUES = args['no_value']
                # self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else  np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value']
        def load_meta(self, column):
--- a/pipeline.py
+++ b/pipeline.py
@ -20,7 +20,12 @@ class Components :
 	class KEYS :
 		PIPELINE_KEY = 'pipeline'
 		SQL_FILTER = 'filter'
-		
+	@staticmethod
+	def get_filter (**args):
+		if args['qualifier'] == 'IN' :
+			return ' '.join([args['field'],args['qualifier'],'(',args['value'],')'])
+		else:
+			return ' '.join([args['field'],args['qualifier'],args['value']])
 	@staticmethod
 	def get_logger(**args) :
 		return factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
@ -34,8 +39,11 @@ class Components :
 		"""
 		SQL = args['sql']
 		if Components.KEYS.SQL_FILTER in args :
-			SQL_FILTER = Components.KEYS.SQL_FILTER
-			condition = ' '.join([args[SQL_FILTER]['field'],args[SQL_FILTER]['qualifier'],'(',args[SQL_FILTER]['value'],')'])
+			FILTER_KEY = Components.KEYS.SQL_FILTER
+			SQL_FILTER = args[FILTER_KEY] if type(args[FILTER_KEY]) == list else [args[FILTER_KEY]]
+			# condition = ' '.join([args[FILTER_KEY]['field'],args[FILTER_KEY]['qualifier'],'(',args[FILTER_KEY]['value'],')'])
+			
+			condition = ' AND '.join([Components.get_filter(**item) for item in SQL_FILTER])
 			SQL = " ".join([SQL,'WHERE',condition])

 		SQL = SQL.replace(':dataset',args['dataset']) #+ " LI "
@ -76,13 +84,6 @@ class Components :
 		
 		logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
 		log_folder = args['logs'] if 'logs' in args else 'logs'
-		# _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger}
-		
-		# _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs'])
-		# _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 
-		# _args['gpu'] = args['gpu'] if 'gpu' in args else 0
-
-		# # MAX_ROWS = args['max_rows'] 	if 'max_rows' in args else 0
 		PART_SIZE = int(args['part_size']) 

 		partition = args['partition'] 
@ -156,16 +157,22 @@ class Components :
 		# 	columns = args['columns']
 		# 	df = np.array_split(df[columns].values,PART_SIZE)
 		# 	df = pd.DataFrame(df[ int (partition) ],columns = columns)
-		info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"part_size":int(PART_SIZE)}
+		# max_rows = int(args['partition_max_rows']) if 'partition_max_rows' in args else 1000000
+		# N = np.divide(df.shape[0],max_rows).astype(int) + 1
+		info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"part_size":int(PART_SIZE),"partition-info":{"count":int(N),"max_rows":max_rows}}
 		logger.write({"module":"generate","action":"partition","input":info})
 		_args['partition'] = int(partition)
 		_args['continuous']= args['continuous'] if 'continuous' in args else []
-		_args['data'] = df
-			# _args['data'] = reader()
-		#_args['data'] = _args['data'].astype(object)
-		# _args['num_gpu'] = 1
+		#
+		# How many rows sub-partition must we divide this into ?
+		# -- Let us tray assessing
+		
 		
-		_dc = data.maker.generate(**_args) 
+		df = np.array_split(df,N)
+		_dc = pd.DataFrame()
+		# for mdf in df :
+		_args['data'] = df		
+		_dc = _dc.append(data.maker.generate(**_args))
 		#
 		# We need to post the generate the data in order to :
 		#	1. compare immediately
@ -180,35 +187,13 @@ class Components :
 		#
 		info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}}
 		x = {}
-		# for name in args['columns'] :
-		# 	ident = data_comp.apply(lambda row: 1*(row[name]==row[name+'_io']),axis=1).sum()
-		# 	count = data_comp[name].unique().size
-		# 	_ident= data_comp.shape[1] - ident
-		# 	_count= data_comp[name+'_io'].unique().size
-		# 	_count= len(set(data_comp[name+'_io'].values.tolist()))
-			
-		# 	info['input']['logs'] += [{"name":name,"identical":int(ident),"no_identical":int(_ident),"original_count":count,"synthetic_count":_count}]
-		# for name in data_comp.columns.tolist() :
-			# g = pd.DataFrame(data_comp.groupby([name]).size())						
-			# g.columns = ['counts']
-			# g[name] = g.index.tolist()
-			# g.index = np.arange(g.shape[0])
-			# logs.append({"name":name,"counts": g.to_dict(orient='records')})
-		# info['input']['logs'] = logs
-		# logger.write(info)
-
+		#
+		# @TODO: Send data over to a process for analytics
 		
 		base_cols = list(set(_args['data'].columns) - set(args['columns']))	#-- rebuilt the dataset (and store it)
 		cols = _dc.columns.tolist()
 		for name in cols :
 			_args['data'][name] = _dc[name]
-		# 	info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}}
-		# 	if partition != '' :
-		# 		info['partition'] = int(partition)
-		# 	logger.write(info)
-			
-			# filename = os.sep.join([log_folder,'output',name+'.csv'])
-			# data_comp[[name]].to_csv(filename,index=False)

 		#
 		#-- Let us store all of this into bigquery
@ -265,7 +250,7 @@ if __name__ == '__main__' :
 		f = [i for i in range(0,N) if PIPELINE[i]['context'] == index]
 		index = f[0] if f else 0
 	#
-	# print 	
+	
 	print ("..::: ",PIPELINE[index]['context'])
 	args =  (PIPELINE[index])
 	for key in _config :
@ -274,8 +259,8 @@ if __name__ == '__main__' :
 			# skip in case of pipeline or if key exists in the selected pipeline (provided by index)
 			# 
 			continue
-		
 		args[key] = _config[key]
+	
 	args = dict(args,**SYS_ARGS)
 	if 'batch_size' not in args :
 		args['batch_size']	= 2000 #if 'batch_size' not in args else int(args['batch_size'])
@ -286,7 +271,7 @@ if __name__ == '__main__' :
 	# @TODO:
 	#	Log what was initiated so we have context of this processing ...
 	#
-	if 'listen' not in SYS_ARGS :
+	# if 'listen' not in SYS_ARGS :
 	if 'file' in args :
 		DATA = pd.read_csv(args['file']) ;
 	else:
@ -325,6 +310,7 @@ if __name__ == '__main__' :
 					args['gpu'] = index
 				else:
 					args['gpu']=0
+
 				make = lambda _args: (Components()).generate(_args)
 				job = Process(target=make,args=(args,))
 				job.name = 'generator # '+str(index)