| 
						
						
							
								
							
						
						
					 | 
					 | 
					@ -81,7 +81,6 @@ class ContinuousToDiscrete :
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        return values
 | 
					 | 
					 | 
					 | 
					        return values
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					            
 | 
					 | 
					 | 
					 | 
					            
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					def train (**_args):
 | 
					 | 
					 | 
					 | 
					def train (**_args):
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    """
 | 
					 | 
					 | 
					 | 
					    """
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    :params sql
 | 
					 | 
					 | 
					 | 
					    :params sql
 | 
				
			
			
		
	
	
		
		
			
				
					| 
						
							
								
							
						
						
							
								
							
						
						
					 | 
					 | 
					@ -126,7 +125,7 @@ def train (**_args):
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    args['matrix_size'] = _matrix.shape[0]
 | 
					 | 
					 | 
					 | 
					    args['matrix_size'] = _matrix.shape[0]
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    args['batch_size'] = 2000
 | 
					 | 
					 | 
					 | 
					    args['batch_size'] = 2000
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    args['partition'] = 0 if 'partition' not in _args else _args['partition']
 | 
					 | 
					 | 
					 | 
					    args['partition'] = 0 if 'partition' not in _args else _args['partition']
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0'
 | 
					 | 
					 | 
					 | 
					    # os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0'
 | 
				
			
			
				
				
			
		
	
		
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    
 | 
					 | 
					 | 
					 | 
					    
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    trainer = gan.Train(**args)   
 | 
					 | 
					 | 
					 | 
					    trainer = gan.Train(**args)   
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    #
 | 
					 | 
					 | 
					 | 
					    #
 | 
				
			
			
		
	
	
		
		
			
				
					| 
						
							
								
							
						
						
							
								
							
						
						
					 | 
					 | 
					@ -215,8 +214,7 @@ def generate(**_args):
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    _inputhandler = prepare.Input(**_args)
 | 
					 | 
					 | 
					 | 
					    _inputhandler = prepare.Input(**_args)
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    values,_matrix = _inputhandler.convert()    
 | 
					 | 
					 | 
					 | 
					    values,_matrix = _inputhandler.convert()    
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    args['values'] = np.array(values)
 | 
					 | 
					 | 
					 | 
					    args['values'] = np.array(values)
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    if 'gpu' in _args :
 | 
					 | 
					 | 
					 | 
					       
 | 
				
			
			
				
				
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu'])
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    handler     = gan.Predict (**args)
 | 
					 | 
					 | 
					 | 
					    handler     = gan.Predict (**args)
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    handler.load_meta(None)
 | 
					 | 
					 | 
					 | 
					    handler.load_meta(None)
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    #
 | 
					 | 
					 | 
					 | 
					    #
 | 
				
			
			
		
	
	
		
		
			
				
					| 
						
						
						
							
								
							
						
					 | 
					 | 
					@ -226,76 +224,3 @@ def generate(**_args):
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    candidates = handler.apply(candidates=args['candidates'])       
 | 
					 | 
					 | 
					 | 
					    candidates = handler.apply(candidates=args['candidates'])       
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates]
 | 
					 | 
					 | 
					 | 
					    return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates]
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    
 | 
					 | 
					 | 
					 | 
					    
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					def _generate(**args):
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    """
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    @return pandas.DataFrame
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    :data   data-frame to be synthesized
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    :column   columns that need to be synthesized (discrete)
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    :id     column identifying an entity
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    :logs   location on disk where the learnt knowledge of the dataset is
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    """
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    # df      = args['data']
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    df      = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    CONTINUOUS = args['continuous'] if 'continuous' in args else []
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    column      = args['column'] if (isinstance(args['column'],list)) else [args['column']]
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    # column_id   = args['id']
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    #
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    #@TODO:
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    #   If the identifier is not present, we should fine a way to determine or make one
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    #
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    BIN_SIZE    = 4 if 'bin_size' not in args else int(args['bin_size'])
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    # NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value']
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    bhandler = Binary()    
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    _df     = df.copy()
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    for col in column :
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        args['context'] = col
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        args['column']  = col
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        msize = args['matrix_size'] if 'matrix_size' in args else -1        
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        values = bhandler.get_column(df[col],msize)
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        MISSING= bhandler.get_missing(df[col],msize)
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        args['values']      = values    
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        args['row_count']   = df.shape[0]
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        # if col in NO_VALUE :
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        #     args['no_value'] = NO_VALUE[col] 
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        # else:
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        #     args['no_value'] = NO_VALUE
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        # novalue = NO_VALUE[col] if NO_VALUE[col] in ['na',''] else NO_VALUE[col]
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        # MISSING += [NO_VALUE[col]]
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        args['missing'] = MISSING
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        #
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        # we can determine the cardinalities here so we know what to allow or disallow
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        handler     = gan.Predict (**args)
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        handler.load_meta(col)
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        r           =  handler.apply()                
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        if col in CONTINUOUS :
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					            r[col] = np.array(r[col])            
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					            _approx = ContinuousToDiscrete.continuous(r[col],BIN_SIZE)  #-- approximating based on arbitrary bins                                
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					            r[col] = _approx
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					            
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					            
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					            
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        _df[col]    = r[col]
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        #
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        # Let's cast the type to the original type (it makes the data more usable)
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        #
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        # print (values)
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        # print ([col,df[col].dtype,_df[col].tolist()])
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        otype       = df[col].dtype
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        _df[col]    = _df[col].astype(otype)
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        #
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        # @TODO: log basic stats about the synthetic attribute
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        #
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        # print (r)s
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        # break
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    return _df
 | 
					 | 
					 | 
					 | 
					 |