pandas extension for risk

7 years ago · 886230e647
parent 4df27a251c
commit 886230e647
1 changed files with 91 additions and 0 deletions
--- a/src/pandas_risk.py
+++ b/src/pandas_risk.py
@ -0,0 +1,91 @@
+"""
+    Health Information Privacy Lab
+    Steve L. Nyemba & Brad. Malin
+
+
+    This is an extension to the pandas data-frame that will perform a risk assessment on a variety of attributes
+    This implementation puts the responsibility on the user of the framework to join datasets and load the final results into a pandas data-frame.
+
+    For each of a given number of runs, the code randomly selects a subset of fields and computes the risk (marketer and prosecutor) for that subset.
+
+    Usage:
+    from pandas_risk import *
+
+    mydataframe = pd.read_csv('/myfile.csv')
+    risk = mydataframe.deid.risk(id=<name of patient field>,num_runs=<number of runs>)
+
+
+    @TODO:
+        - Provide a selected number of fields and risk will be computed for those fields.
+        - include journalist risk
+
+"""
+import pandas as pd
+import numpy as np
+
+@pd.api.extensions.register_dataframe_accessor("deid")
+class deid :
+    """
+        De-identification accessor, available on any pandas data-frame as ``df.deid``.
+
+        It estimates re-identification risk (marketer, prosecutor) over randomly
+        selected combinations of the columns of the wrapped data-frame.
+    """
+    def __init__(self,df):
+        self._df = df
+
+    def risk(self,**args):
+        """
+            Compute marketer and prosecutor risk over random subsets of columns.
+
+            For every run a random set of columns (the id field is never drawn) is
+            selected, the rows are grouped by these columns and the non-null ids are
+            counted per group. From these group counts :
+                marketer    = number of groups / sum of the group counts
+                prosecutor  = 1 / smallest group count
+
+            @param  id          name of patient field (required)
+            @param  num_runs    number of runs (default will be 100)
+            @return data-frame with one row per run (index 0..num_runs-1) and the
+                    columns selected (number of columns drawn), marketer, prosecutor
+            @raise  KeyError    if id is not provided or is not a column of the data-frame
+            @raise  ValueError  if the data-frame has no column other than id
+        """
+        id  = args['id']
+        num_runs  = args.get('num_runs',100)
+        #
+        # The candidate columns are kept in data-frame order (not in set order) so
+        # that a seeded numpy generator draws the same columns on every execution
+        #
+        columns = [name for name in self._df.columns if name != id]
+        k = len(columns)
+        if k == 0 :
+            raise ValueError("risk requires at least one column other than the id field")
+
+        rows = []
+        for _ in range(num_runs) :
+            #
+            # let's choose a random number of columns and compute marketer and prosecutor risk
+            # Once the fields are selected we run a groupby clause
+            #
+            # NOTE: the upper bound of randint is exclusive, so with more than two
+            # candidates between 2 and k-1 columns are drawn. With one or two
+            # candidates randint(2,k) would raise, hence all of them are used.
+            #
+            n   = int(np.random.randint(2,k)) if k > 2 else k
+            ii  = np.random.choice(k,n,replace=False)
+            # plain list indexing keeps the labels as they are (np.array would coerce mixed labels to strings)
+            cols = [columns[i] for i in ii]
+            x_ = self._df.groupby(cols)[id].count().values
+            rows.append(
+                {
+                    "selected":n,
+                    "marketer": x_.size / np.float64(np.sum(x_)),
+                    "prosecutor":1 / np.float64(np.min(x_))
+                }
+            )
+        #
+        # The data-frame is built once from all the runs; appending a frame per run
+        # copies the whole result every time and DataFrame.append no longer exists in pandas 2.x
+        #
+        return pd.DataFrame(rows,columns=["selected","marketer","prosecutor"])
+
+
+import pandas as pd
+import numpy as np
+from io import StringIO
+#
+# Demo : compute the risk once on a small in-memory dataset and print the result.
+# The statements below are valid on python 2 and python 3 (unicode literal, print with parentheses).
+# NOTE(review): this runs every time the module is imported and leaves csv, f and df in the
+# module namespace; consider moving it under `if __name__ == "__main__":` once no caller relies on that.
+#
+csv = u"""
+id,sex,age,profession,drug_test
+1,M,37,doctor,-
+2,F,28,doctor,+
+3,M,37,doctor,-
+4,M,28,doctor,+
+5,M,28,doctor,-
+6,M,37,doctor,-
+"""
+# io.StringIO accepts the initial text directly and starts reading at position 0
+f = StringIO(csv)
+df = pd.read_csv(f)
+print(df.deid.risk(id='id',num_runs=1))