You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
privacykit/notebooks/Untitled.ipynb

7.7 KiB

None <html lang="en"> <head> </head>
In [1]:
import itertools 
import pandas as pd
import numpy as np
# from pandas_risk import *
from time import time
import os

# Individual attribute names (not referenced elsewhere in the visible cells).
attr = 'gender race zip year_of_birth'.split()

# Each entry is one set of columns that is handed to risk() as `cols`,
# i.e. the columns the records are grouped by when the risk is computed.
comb_attr = [
    'zip gender birth_datetime race'.split(),
    'zip gender year_of_birth race'.split(),
    'gender race zip'.split(),
    'race year_of_birth zip'.split(),
]
In [2]:
# Load the controlled-tier table from BigQuery into `dfc`.
SQL_CONTROLLED="SELECT * FROM deid_risk.basic_risk60k"
# The service-account key location is taken from the environment so the notebook
# is not tied to one developer's machine; the original path is kept as a fallback
# so existing setups keep working unchanged.
PRIVATE_KEY = os.environ.get(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/home/steve/dev/google-cloud-sdk/accounts/curation-test.json'
)
dfc = pd.read_gbq(SQL_CONTROLLED,private_key=PRIVATE_KEY)
In [3]:
def risk(**args):
    """
    Compute a marketer-risk estimate on repeated random samples of a dataset.

    For every percentage in ``prop`` the function draws ``num_runs`` random samples
    (without replacement) of that percentage of the rows. Both the sample and the
    full dataset are grouped by the quasi-identifier columns; for every group found
    in both, ``(sample group size / full group size) / sample row count`` is computed
    and the values are summed into one 'marketer' figure per sample.

    Keyword arguments:
        data      DataFrame to evaluate (required). Missing values are replaced
                  by ' ' before grouping so that they form their own group.
        num_runs  number of samples drawn per percentage (required).
        prop      iterable of sample percentages; defaults to np.arange(5,100,5).
        flag      label written to the 'tier' column; defaults to 'UNFLAGGED'.
        cols      columns to group by; defaults to every column except 'person_id'.

    Returns:
        DataFrame with one row per sample and the columns
        'sample %', 'marketer', 'sample marketer', 'tier' (index 0..n-1).
        'sample marketer' is the first 'marketer' value returned by the ``deid.risk``
        DataFrame accessor for the sample. An empty DataFrame is returned when no
        sample produced a result.

    Raises:
        KeyError when 'data' or 'num_runs' is not supplied.

    NOTE(review): the ``deid`` accessor is not defined in this cell; presumably it is
    registered by the pandas_risk import elsewhere in the notebook -- confirm.
    """
    Yi    = args['data'].fillna(' ')
    sizes = args['prop'] if 'prop' in args else np.arange(5,100,5)
    FLAG  = args['flag'] if 'flag' in args else 'UNFLAGGED'
    N     = args['num_runs']
    if 'cols' in args :
        columns = args['cols']
    else:
        # keep the frame's column order so the default is deterministic
        columns = [name for name in Yi.columns.tolist() if name != 'person_id']

    total_rows = Yi.shape[0]
    # size of every quasi-identifier group in the full dataset
    y_i = Yi.groupby(columns).size().reset_index(name='group_size')

    frames = []
    for index in sizes :
        # floor division keeps the sample size an integer on Python 2 and 3 alike
        sample_size = int((total_rows * index) // 100)
        if sample_size == 0 :
            # nothing to sample at this percentage; there can be no matching group
            continue
        for _ in range(N):
            # we will randomly sample index% rows from the dataset
            i = np.random.choice(total_rows,sample_size,replace=False)
            # the drawn values are row positions, hence iloc rather than loc
            sample = Yi.iloc[i]
            sample_risk = sample.deid.risk(id='person_id',quasi_id = columns)
            x_i = sample.groupby(columns).size().reset_index(name='group_size')

            r = pd.merge(x_i,y_i,on=columns,how='inner')
            if r.shape[0] == 0 :
                continue
            # group_size_x comes from the sample, group_size_y from the full dataset
            r['marketer'] = (r['group_size_x'] / r['group_size_y'].astype(np.float64)) / x_i['group_size'].sum()
            r['sample %'] = index
            r['tier'] = FLAG
            r['sample marketer'] = sample_risk['marketer'].values[0]
            # collapse the per-group rows into a single row for this sample
            r = r.groupby(['sample %','tier','sample marketer'],as_index=False)['marketer'].sum()
            frames.append(r[['sample %','marketer','sample marketer','tier']])

    if not frames :
        return pd.DataFrame()
    # one concatenation at the end instead of growing a frame inside the loop
    return pd.concat(frames,ignore_index=True)
In [4]:
from pandas_risk import *
# Run the sampling experiment once per quasi-identifier combination and write each
# result to its own sheet of a single Excel workbook.
# `comb_attr` is the list of column combinations defined in the first cell.
PATH="out/experiment-phase-2.xlsx"

# make sure the target folder exists before the writer tries to create the file
out_dir = os.path.dirname(PATH)
if out_dir and not os.path.isdir(out_dir):
    os.makedirs(out_dir)

o = pd.DataFrame()
# the context manager saves and closes the workbook even if an iteration fails
with pd.ExcelWriter(PATH,engine='xlsxwriter') as writer:
    for cols in comb_attr :
        o = risk(data=dfc,cols=cols,flag='CONTROLLED',num_runs=5)
        #
        # adding the policy: one 0/1 column per column of dfc, set to 1 when the
        # column is part of the current combination
        x = [1* dfc.columns.isin(cols) for i in range(o.shape[0])]
        o = o.join(pd.DataFrame(x,columns = dfc.columns))
        #
        # Write this to excel notebook, the sheet is named after the combination
        o.to_excel(writer,sheet_name="-".join(cols))
In [20]:
# Preview of the policy columns for the last combination processed:
# one identical 0/1 row (1 where the dfc column is in `cols`) per row of `o`.
x = [1 * dfc.columns.isin(cols)] * o.shape[0]
o.join(pd.DataFrame(data=x, columns=dfc.columns))
Out[20]:
person_id year_of_birth month_of_birth day_of_birth birth_datetime race_concept_id ethnicity_concept_id location_id care_site_id person_source_value ... gender_source_concept_id race_source_value ethnicity_source_value sex_at_birth birth_date race zip city state gender

0 rows × 21 columns

In [6]:
# NOTE(review): `columns` is only ever assigned inside risk() as a local variable,
# so it does not exist at notebook scope and this cell raises NameError.
columns
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-8e7b9895361f> in <module>()
----> 1 columns

NameError: name 'columns' is not defined
In [ ]:

</html>