parent
							
								
									4df27a251c
								
							
						
					
					
						commit
						886230e647
					
				@ -0,0 +1,91 @@
 | 
				
			||||
"""
 | 
				
			||||
    Health Information Privacy Lab
 | 
				
			||||
    Steve L. Nyemba & Brad. Malin
 | 
				
			||||
 | 
				
			||||
 | 
				
			||||
    This is an extension to the pandas data-frame that will perform a risk assessment on a variety of attributes
 | 
				
			||||
    This implementation puts the responsibility on the user of the framework to join datasets and load the final results into a pandas data-frame.
 | 
				
			||||
 | 
				
			||||
    The code will randomly select fields and compute the risk (marketer and prosecutor) and perform a given number of runs.
 | 
				
			||||
 | 
				
			||||
    Usage:
 | 
				
			||||
    from pandas_risk import *
 | 
				
			||||
 | 
				
			||||
    mydataframe = pd.read_csv('/myfile.csv')
 | 
				
			||||
    risk = mydataframe.deid.risk(id=<name of patient field>,num_runs=<number of runs>)
 | 
				
			||||
 | 
				
			||||
 | 
				
			||||
    @TODO:
 | 
				
			||||
        - Provide a selected number of fields and risk will be computed for those fields.
 | 
				
			||||
        - include journalist risk
 | 
				
			||||
 | 
				
			||||
"""
 | 
				
			||||
import pandas as pd
 | 
				
			||||
import numpy as np
 | 
				
			||||
 | 
				
			||||
@pd.api.extensions.register_dataframe_accessor("deid")
class deid :
    """
        De-identification accessor that estimates re-identification risk (marketer, prosecutor)
        for a pandas data-frame. It is registered under the "deid" namespace, i.e df.deid.risk(...)
    """
    def __init__(self,df):
        # The data-frame the accessor is attached to
        self._df = df

    def risk(self,**args):
        """
            Compute marketer and prosecutor risk over randomly selected subsets of fields.
            Each run picks a random number of fields (other than the id field), groups the
            data-frame by these fields and derives the risk from the group sizes.

            @param  id          name of patient field (required)
            @param  num_runs    number of runs (default will be 100)
            @return             data-frame with one row per run and the columns
                                selected (number of fields picked), marketer, prosecutor
            @raise  KeyError    if id is not provided or is not a column of the data-frame
            @raise  ValueError  if the data-frame has no field other than id
        """
        id_field = args['id']
        num_runs = args.get('num_runs', 100)
        #
        # The order of the columns of the data-frame is preserved (a set has no stable order),
        # so that runs can be reproduced with a fixed numpy seed
        #
        columns = [name for name in self._df.columns if name != id_field]
        k = len(columns)
        if k == 0 :
            raise ValueError("risk requires at least one field other than '{0}'".format(id_field))
        #
        # We pick at least 2 fields whenever possible. numpy's randint excludes the upper bound,
        # hence k + 1 so that the full set of fields can be selected too
        #
        low = min(2,k)
        rows = []
        for _ in range(0,num_runs) :
            #
            # let's choose a random number of columns and compute marketer and prosecutor risk
            # Once the fields are selected we run a groupby clause
            #
            n   = np.random.randint(low,k + 1) #-- number of random fields we are picking
            ii  = np.random.choice(k,n,replace=False)
            cols = [columns[i] for i in ii]
            # number of (non-null) ids found in each group
            x_ = self._df.groupby(cols)[id_field].count().values
            rows.append(
                {
                    "selected":n,
                    # number of groups over number of records
                    "marketer": x_.size / np.float64(np.sum(x_)),
                    # the smallest group is the most exposed one
                    "prosecutor":1 / np.float64(np.min(x_))
                }
            )
        #
        # The data-frame is built once at the end: DataFrame.append copies the frame
        # at every call and no longer exists in recent versions of pandas
        #
        return pd.DataFrame(rows,columns=["selected","marketer","prosecutor"])
 | 
				
			||||
 | 
				
			||||
 | 
				
			||||
import pandas as pd
 | 
				
			||||
import numpy as np
 | 
				
			||||
from io import StringIO
 | 
				
			||||
#
# Demo: load a small in-memory dataset and print the risk computed over a single run.
# NOTE: these statements sit at module level, so they run whenever this file is imported.
# The u prefix keeps the literal a unicode string on Python 2 (io.StringIO only accepts
# unicode text) and is a no-op on Python 3, where the unicode builtin does not exist.
#
csv = u"""
id,sex,age,profession,drug_test
1,M,37,doctor,-
2,F,28,doctor,+
3,M,37,doctor,-
4,M,28,doctor,+
5,M,28,doctor,-
6,M,37,doctor,-
"""
# The buffer is created with its content, so it is already positioned at the start
f = StringIO(csv)
df = pd.read_csv(f)
# print with parentheses and a single argument behaves the same on Python 2 and Python 3
print(df.deid.risk(id='id',num_runs=1))
 | 
				
			||||
					Loading…
					
					
				
		Reference in new issue