parent
							
								
									7f3748121c
								
							
						
					
					
						commit
						685c567661
					
				@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					import data.params as params
 | 
				
			||||||
@ -0,0 +1,68 @@
 | 
				
			|||||||
 | 
					"""
 | 
				
			||||||
 | 
					(c) 2019 Data Maker, hiplab.mc.vanderbilt.edu
 | 
				
			||||||
 | 
					version 1.0.0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This package serves as a proxy to the overall usage of the framework.
 | 
				
			||||||
 | 
					This package is designed to generate synthetic data from a dataset from an original dataset using deep learning techniques
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@TODO:
 | 
				
			||||||
 | 
					    - Make configurable GPU, EPOCHS
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					import pandas as pd
 | 
				
			||||||
 | 
					import numpy as np
 | 
				
			||||||
 | 
					from data import gan
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def train (**args) :
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    This function is intended to train the GAN in order to learn about the distribution of the features
 | 
				
			||||||
 | 
					    :column     columns that need to be synthesized (discrete)
 | 
				
			||||||
 | 
					    :logs       where the output of the (location on disk)
 | 
				
			||||||
 | 
					    :id         identifier of the dataset
 | 
				
			||||||
 | 
					    :data       data-frame to be synthesized
 | 
				
			||||||
 | 
					    :context    label of what we are synthesizing
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    column = args['column']
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    column_id  = args['id']
 | 
				
			||||||
 | 
					    df  = args['data']
 | 
				
			||||||
 | 
					    logs    = args['logs']
 | 
				
			||||||
 | 
					    real    = pd.get_dummies(df[column]).astype(np.float32).values
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    labels  = pd.get_dummies(df[column_id]).astype(np.float32).values
 | 
				
			||||||
 | 
					    max_epochs = 10
 | 
				
			||||||
 | 
					    context = args['context']
 | 
				
			||||||
 | 
					    trainer = gan.Train(context=context,max_epochs=max_epochs,real=real,label=labels,column=column,column_id=column_id)
 | 
				
			||||||
 | 
					    return trainer.apply()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def generate(**args):
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset
 | 
				
			||||||
 | 
					    @return pandas.DataFrame
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    :data   data-frame to be synthesized
 | 
				
			||||||
 | 
					    :column   columns that need to be synthesized (discrete)
 | 
				
			||||||
 | 
					    :id     column identifying an entity
 | 
				
			||||||
 | 
					    :logs   location on disk where the learnt knowledge of the dataset is
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    df      = args['data']
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    column      = args['column'] 
 | 
				
			||||||
 | 
					    column_id   = args['id']
 | 
				
			||||||
 | 
					    logs        = args['logs']
 | 
				
			||||||
 | 
					    context = args['context']
 | 
				
			||||||
 | 
					    #
 | 
				
			||||||
 | 
					    #@TODO:
 | 
				
			||||||
 | 
					    #   If the identifier is not present, we should fine a way to determine or make one
 | 
				
			||||||
 | 
					    #
 | 
				
			||||||
 | 
					    #ocolumns= list(set(df.columns.tolist())- set(columns))
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    values = df[column].unique().tolist()
 | 
				
			||||||
 | 
					    values.sort()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    labels = pd.get_dummies(df[column_id]).astype(np.float32).values
 | 
				
			||||||
 | 
					    handler = gan.Predict (context=context,label=labels,values=values,column=column)
 | 
				
			||||||
 | 
					    handler.load_meta(column)
 | 
				
			||||||
 | 
					    r =  handler.apply()
 | 
				
			||||||
 | 
					    _df = df.copy()
 | 
				
			||||||
 | 
					    _df[column] = r[column]
 | 
				
			||||||
 | 
					    return _df
 | 
				
			||||||
					Loading…
					
					
				
		Reference in new issue