You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
privacykit/notebooks/registered-tier-history.ipynb

12 KiB

None <html lang="en"> <head> </head>
In [8]:
"""
The experiments here describe medical/family history as they associate with risk measures
Additionally we will have fractional risk assessments
"""
import pandas as pd
import numpy as np
from pandas_risk import *
import os

# Location of the BigQuery service-account key used by every query below.
# It is defined once instead of being repeated in each call, and it can be overridden
# per machine through the DEID_RISK_GBQ_KEY environment variable; when the variable is
# not set the original path is used, so existing setups keep working unchanged.
PRIVATE_KEY_PATH = os.environ.get('DEID_RISK_GBQ_KEY', '/home/steve/dev/google-cloud-sdk/accounts/curation-test.json')

# dfm : registered medical history table (all columns)
dfm = pd.read_gbq("SELECT * FROM deid_risk.registered_medical_history_dec_001",private_key=PRIVATE_KEY_PATH)
# dff : registered family history table (all columns)
dff = pd.read_gbq("SELECT * FROM deid_risk.registered_family_history_dec_001",private_key=PRIVATE_KEY_PATH)
# df  : selected columns of the registered table, keyed by person_id
df = pd.read_gbq("SELECT person_id, birth_date,city,state,home_owner,race,ethnicity,gender,birth_place,marital_status,orientation,education,employment_status,income,travel_abroad_6_months,active_duty_status FROM deid_risk.registered_dec_01",private_key=PRIVATE_KEY_PATH)
In [32]:
# Pick three distinct attributes from each history table (person_id excluded).
# replace=False is required: np.random.choice samples WITH replacement by default, so
# the same column could be drawn twice and end up duplicated in the frames built below.
# (With replace=False, numpy raises ValueError if a table has fewer than 3 such columns.)
med_cols = np.random.choice(list(set(dfm.columns.tolist())  - set(['person_id'])),3,replace=False).tolist()
fam_cols = np.random.choice(list(set(dff.columns.tolist())  - set(['person_id'])),3,replace=False).tolist()

# df joined with the sampled medical attributes, and with the sampled family attributes
medical = pd.merge(df,dfm[med_cols+['person_id']],on='person_id')
family = pd.merge(df,dff[fam_cols + ['person_id']],on='person_id')

# df joined with both histories. The join key is named explicitly so that a column
# name shared by the two history tables can never be used silently as a second key.
_tmp = pd.merge(dfm[med_cols +['person_id']],dff[fam_cols+['person_id']],on='person_id')
data = pd.merge(df,_tmp,on='person_id')
In [33]:
def _non_id_columns(frame):
    """Return the column names of `frame` with 'person_id' left out."""
    return list(set(frame.columns.tolist()) - set(['person_id']))

# One evaluation per dataset variant; the flag labels the variant in the result.
# The two "-only" variants are evaluated on the sampled history columns alone.
_evaluations = [
    data.deid.evaluate(flag='full history', cols=_non_id_columns(data)),
    medical.deid.evaluate(flag='medical', cols=_non_id_columns(medical)),
    family.deid.evaluate(flag='family', cols=_non_id_columns(family)),
    df.deid.evaluate(flag='no-history', cols=_non_id_columns(df)),
    dfm.deid.evaluate(flag='medical-only', cols=med_cols),
    dff.deid.evaluate(flag='family-only', cols=fam_cols),
]

# Stack the individual results into a single table (displayed as the cell output)
pd.concat(_evaluations, ignore_index=True)
Out[33]:
field_count flag group_count marketer prosecutor unique_row_ratio
0 21 full history 115308 0.992691 1.0 0.987663
1 18 medical 115306 0.992674 1.0 0.987629
2 18 family 115304 0.992656 1.0 0.987594
3 15 no-history 115300 0.992622 1.0 0.987526
4 3 medical-only 27 0.000232 0.5 0.000000
5 3 family-only 146 0.001257 1.0 0.000551
In [2]:
from __future__ import division
def evaluate(df) :
    """
    Run the risk evaluation on random subsets of the data of increasing size.

    The subset sizes are derived from a list of percentages: four random values
    between 0 and 1, plus 5, 10, ..., 100. Sizes of one row or less are skipped and
    duplicated sizes are evaluated only once. Rows are drawn without replacement.

    :param df: data frame to sample from; every column except 'person_id' is evaluated
    :return: data frame holding the concatenated output of `deid.evaluate` (called with
             min_group_size=2) for each subset, flagged with the sampled percentage,
             e.g. '5.0 %'; an empty data frame when no subset size is large enough
    """
    cols = list(set(df.columns.tolist()) - set(['person_id']))

    # Percentages of the data to sample
    portions = np.round(np.random.random_sample(4),3).tolist() + np.arange(5,105,5).tolist()

    # Use the full row count: with shape[0] - 1 the last row could never be drawn and
    # the reported percentages were computed against the wrong total.
    N = df.shape[0]
    portions = np.divide(np.multiply(portions,N),100).astype(np.int64)
    portions = np.unique([n for n in portions if n > 1])

    # Collect the partial results and concatenate once at the end instead of growing
    # a data frame inside the loop.
    results = []
    for num_rows in portions :
        indices = np.random.choice(N,num_rows,replace=False)
        # 100.0 keeps this a true division even without `from __future__ import division`
        flag = " ".join([str( np.round(100.0*indices.size/ N,2)),'%'])
        # The indices are row positions, so select with iloc; loc would treat them as
        # index labels and fail (or pick other rows) on a non-default index.
        results.append(df.iloc[indices].deid.evaluate(cols=cols,flag=flag,min_group_size=2))
    return pd.concat(results) if results else pd.DataFrame()
In [3]:
# Every column of df except the person identifier
cols = list(set(df.columns.tolist()) - set(['person_id']))

# Attributes used for the baseline risk evaluation (no history attached).
# NOTE(review): 'gender_identity' and 'sex_at_birth' are not part of the SELECT list
# used to load df from deid_risk.registered_dec_01 in this notebook -- confirm which
# version of df this cell was executed against.
_baseline_attributes = [
    'race',
    'state',
    'gender_identity',
    'ethnicity',
    'marital_status',
    'education',
    'orientation',
    'sex_at_birth',
    'birth_date',
    'travel_abroad_6_months',
    'active_duty_status',
]
df[_baseline_attributes].deid.evaluate()
Out[3]:
field_count flag group_count marketer prosecutor unique_row_ratio
0 11 UNFLAGGED 114886 0.989058 1.0 0.980535
In [68]:
#
# Merge df with three medical-history attributes chosen at random
# (the first column of dfm is skipped when drawing the attributes)

_drawn = np.random.choice(dfm.columns[1:], 3, replace=False)
cols = ['person_id'] + _drawn.tolist()
_history_subset = dfm[cols]
p = pd.merge(df, _history_subset, on='person_id')

# Show which attributes were drawn
cols
# Follow-up kept disabled in the original notebook:
# evaluate(p) / p.deid.explore(cols=cols,num_runs=100)
Out[68]:
['person_id',
 'HearingVision_FarSightedness',
 'HearingVision_Glaucoma',
 'Digestive_Pancreatitis']
In [7]:
# Candidate attributes: every column of dfm except the person identifier
_candidates = list(set(dfm.columns.tolist()) - set(['person_id']))

# Keep three distinct attributes, drawn at random
cols = np.random.choice(_candidates, 3, replace=False).tolist()

# Join the selected attributes with df; no key is given, so pandas joins on the
# columns the two frames have in common.
_selected = dfm[['person_id'] + cols]
p = pd.merge(_selected, df)

# Columns of the merged frame that are evaluated (person identifier left out)
fcols = list(set(p.columns.tolist()) - set(['person_id']))

Medical History

We randomly select three attributes: {{ " ; ".join(cols)}}.
The dataset used for the associated risk evaluation contains {{ p.shape[0] }} records.

{{ p[fcols].deid.evaluate() }}

In [52]:
# Display the attribute list currently bound to `cols`
cols
Out[52]:
['person_id',
 'InfectiousDiseases_Tuberculosis',
 'SkeletalMuscular_Fibromyalgia',
 'Cancer_ProstateCancer']
In [67]:
# dfm[cols[1:]].head()
# Count the combinations of the selected attributes (cols[0] is skipped) that occur
# at most once in dfm. Missing values are replaced by ' ' before grouping, presumably
# so that rows with missing attributes are still counted as a group.
# `as_index=False` is intentionally not passed: with it, newer pandas versions return a
# DataFrame from size() (key columns included), which breaks the `.values <= 1` test;
# without it size() is a Series of group sizes in every pandas version.
np.sum(dfm.fillna(' ').groupby(cols[1:]).size().values <= 1)
Out[67]:
3
In [ ]:

</html>