You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
7.7 KiB
7.7 KiB
None
<html lang="en">
<head>
</head>
</html>
In [1]:
import itertools
import pandas as pd
import numpy as np
# from pandas_risk import *
from time import time
import os
# Individual attribute names. NOTE(review): `attr` is not referenced by any
# other cell visible in this notebook -- confirm whether it is still needed.
attr = ['gender', 'race', 'zip', 'year_of_birth']

# Column combinations ("policies") to evaluate. Each inner list is passed to
# risk() as `cols`, which forwards it as the quasi-identifier set.
comb_attr = [
    ['zip', 'gender', 'birth_datetime', 'race'],
    ['zip', 'gender', 'year_of_birth', 'race'],
    ['gender', 'race', 'zip'],
    ['race', 'year_of_birth', 'zip'],
]
In [2]:
# Query returning the full table used for the CONTROLLED tier experiment.
SQL_CONTROLLED = "SELECT * FROM deid_risk.basic_risk60k"

# Path to the service-account key file. Set the GBQ_PRIVATE_KEY environment
# variable to run this notebook on another machine; when it is unset the
# original hard-coded location is used, so existing behaviour is unchanged.
# NOTE(review): newer pandas / pandas-gbq releases appear to replace the
# `private_key` argument with `credentials` -- confirm against the installed version.
GBQ_PRIVATE_KEY = os.environ.get(
    'GBQ_PRIVATE_KEY',
    '/home/steve/dev/google-cloud-sdk/accounts/curation-test.json'
)

# Load the whole query result into memory; every later cell reads `dfc`.
dfc = pd.read_gbq(SQL_CONTROLLED, private_key=GBQ_PRIVATE_KEY)
In [3]:
def risk(**args):
    """Compute a marketer-risk figure on repeated random samples of a dataset.

    For every sampling percentage in ``prop``, ``num_runs`` random samples are
    drawn from ``data`` without replacement. For each sample the rows are
    grouped by the quasi-identifier columns, matched against the same grouping
    of the full dataset, and summarised as::

        marketer = sum(sample_group_size / full_group_size) / sample_row_count

    Keyword arguments:
        data      (required) pandas DataFrame to sample from. Missing values
                  are replaced by ``' '`` before grouping.
        num_runs  (required) number of samples drawn per sampling percentage.
        cols      list of columns used as quasi-identifiers. Defaults to every
                  column of ``data`` except ``person_id``.
        prop      iterable of sampling percentages. Defaults to
                  ``np.arange(5, 100, 5)``.
        flag      label copied into the ``tier`` column. Defaults to
                  ``'UNFLAGGED'``.

    Returns:
        DataFrame indexed 0..n-1 with at most one row per run and the columns
        ``'sample %'``, ``'marketer'``, ``'sample marketer'`` and ``'tier'``.
        ``'sample marketer'`` is the first ``marketer`` value returned by
        ``DataFrame.deid.risk(id='person_id', quasi_id=cols)`` on the sample.
        An empty DataFrame is returned when no run produced a row.

    Raises:
        KeyError: if ``data`` or ``num_runs`` is not supplied.

    Requires the ``deid`` DataFrame accessor to be registered before the call
    (presumably done by importing ``pandas_risk`` -- confirm).
    """
    # Presumably fills gaps so that groupby does not drop rows with missing keys.
    data = args['data'].fillna(' ')
    sizes = args.get('prop', np.arange(5, 100, 5))
    flag = args.get('flag', 'UNFLAGGED')
    num_runs = args['num_runs']
    if 'cols' in args:
        columns = args['cols']
    else:
        # Keep the frame's own column order so the default is deterministic.
        columns = [name for name in data.columns.tolist() if name != 'person_id']

    n_rows = data.shape[0]
    # Group sizes of the complete dataset, one row per quasi-identifier combination.
    population_groups = data.groupby(columns).size().reset_index(name='group_size')

    runs = []
    for pct in sizes:
        # Floor division keeps the size integral on Python 3 as well;
        # np.random.choice does not accept a float size.
        sample_size = int(n_rows * pct // 100)
        if sample_size == 0:
            # Nothing to sample: such a run could never contribute a row.
            continue
        for _ in range(num_runs):
            # Randomly sample pct% of the rows from the dataset.
            chosen = np.random.choice(n_rows, sample_size, replace=False)
            # `chosen` holds row positions, so select by position, not by label.
            sample = data.iloc[chosen]
            sample_risk = sample.deid.risk(id='person_id', quasi_id=columns)
            sample_groups = sample.groupby(columns).size().reset_index(name='group_size')

            # After the merge, group_size_x is the sample count and
            # group_size_y the full-dataset count of each combination.
            merged = pd.merge(sample_groups, population_groups, on=columns, how='inner')
            if merged.shape[0] == 0:
                continue

            merged['marketer'] = (
                merged['group_size_x'] / merged['group_size_y'].astype(np.float64)
            ) / sample_groups['group_size'].sum()
            merged['sample %'] = pct
            merged['tier'] = flag
            merged['sample marketer'] = sample_risk['marketer'].values[0]

            # Only `marketer` is summed; the quasi-identifier columns are not aggregated.
            summary = merged.groupby(
                ['sample %', 'tier', 'sample marketer'], as_index=False
            )[['marketer']].sum()
            runs.append(summary[['sample %', 'marketer', 'sample marketer', 'tier']])

    if not runs:
        return pd.DataFrame()

    # Single concatenation instead of growing a frame inside the loop.
    result = pd.concat(runs)
    result.index = np.arange(result.shape[0]).astype(np.int64)
    return result
In [4]:
from pandas_risk import *
# Runs the sampling experiment once per quasi-identifier combination and writes
# each result to its own sheet of a single Excel workbook.
# `comb_attr` comes from the configuration cell at the top of the notebook.
o = pd.DataFrame()
PATH = "out/experiment-phase-2.xlsx"

# Make sure the output folder exists before the experiment runs, so the work is
# not lost when the workbook is written.
out_dir = os.path.dirname(PATH)
if out_dir and not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# The context manager saves and closes the workbook even if one of the runs fails.
with pd.ExcelWriter(PATH, engine='xlsxwriter') as writer:
    for cols in comb_attr:
        o = risk(data=dfc, cols=cols, flag='CONTROLLED', num_runs=5)
        #
        # adding the policy: one 0/1 column per column of dfc, set to 1 when
        # that column is part of the current combination
        x = [1 * dfc.columns.isin(cols)] * o.shape[0]
        o = o.join(pd.DataFrame(x, columns=dfc.columns))
        #
        # Write this to excel notebook, one sheet per combination
        o.to_excel(writer, sheet_name="-".join(cols))
In [20]:
# Repeats the policy-flag join from the export loop, using whatever `cols` and
# `o` currently hold at notebook scope, and displays the joined frame.
policy_flags = 1 * dfc.columns.isin(cols)
x = [policy_flags] * o.shape[0]
o.join(pd.DataFrame(x, columns=dfc.columns))
Out[20]:
In [6]:
# NOTE(review): no cell visible here assigns a notebook-level `columns` (the
# name only appears as a local inside risk()), so on a fresh kernel this
# presumably raises NameError unless the star import from pandas_risk happens
# to provide it -- confirm, or delete this scratch cell.
columns
In [ ]: