parent
							
								
									99bc98aba5
								
							
						
					
					
						commit
						8a5307c242
					
				@ -1,2 +1,49 @@
 | 
				
			|||||||
# bridge
 | 
					## Introduction
 | 
				
			||||||
 | 
					---
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This package is designed to generate synthetic data from an original dataset using deep learning techniques.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    - Generative Adversarial Networks
 | 
				
			||||||
 | 
					    - With "Earth mover's distance"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## Installation
 | 
				
			||||||
 | 
					---
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    pip install git+https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git@release
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## Usage
 | 
				
			||||||
 | 
					---
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					After installing, the easiest way to get started is with pandas. The process is as follows:
 | 
				
			||||||
 | 
					1. Train the GAN on the original/raw dataset
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        import pandas as pd
 | 
				
			||||||
 | 
					        import data.maker
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
 | 
					        df  = pd.read_csv('myfile.csv')
 | 
				
			||||||
 | 
					        cols= ['f1','f2','f2']  
 | 
				
			||||||
 | 
					        data.maker.train(data=df,cols=cols,logs='logs')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					2. Generate a candidate dataset from the learnt features
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    import pandas as pd
 | 
				
			||||||
 | 
					    import data.maker
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    df = data.maker.generate(logs='logs')
 | 
				
			||||||
 | 
					    df.head()
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## Limitations
 | 
				
			||||||
 | 
					---
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					GANs generate data on the assumption that the original data already covers the entire value space needed:
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					- No new data will be created
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					        Suppose we have a dataset with a gender attribute whose values are [M,F]. The synthetic data will not be able to contain genders outside [M,F].
 | 
				
			||||||
 | 
					- Not advised for continuous values
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        GANs work well on discrete values and are therefore not advised for synthesizing continuous measurements (height, blood pressure, ...).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
					Loading…
					
					
				
		Reference in new issue