You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
12 KiB
12 KiB
None
<html lang="en">
<head>
</head>
</html>
In [8]:
"""
The experiments here describe medical/family history as they associate with risk measures
Additionally we will have fractional risk assessments
"""
import os

import numpy as np
import pandas as pd
from pandas_risk import *
# Service-account key used by every BigQuery read in this cell.
# The location can be overridden with the DEID_RISK_PRIVATE_KEY environment
# variable; when it is unset the original hard-coded path is used, so the
# notebook behaves as before on the author's machine.
# NOTE(review): recent pandas / pandas-gbq releases deprecate `private_key`
# in favour of `credentials` — confirm against the installed version.
PRIVATE_KEY = os.environ.get(
    'DEID_RISK_PRIVATE_KEY',
    '/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',
)

# dfm: every column of the registered medical-history table.
dfm = pd.read_gbq(
    "SELECT * FROM deid_risk.registered_medical_history_dec_001",
    private_key=PRIVATE_KEY,
)
# dff: every column of the registered family-history table.
dff = pd.read_gbq(
    "SELECT * FROM deid_risk.registered_family_history_dec_001",
    private_key=PRIVATE_KEY,
)
# df: person_id plus the explicitly listed columns of the registered table.
df = pd.read_gbq(
    "SELECT person_id, birth_date,city,state,home_owner,race,ethnicity,gender,birth_place,marital_status,orientation,education,employment_status,income,travel_abroad_6_months,active_duty_status FROM deid_risk.registered_dec_01",
    private_key=PRIVATE_KEY,
)
In [32]:
# Draw three *distinct* history columns from each table (person_id excluded).
# replace=False matters: np.random.choice samples with replacement by default
# and could return the same column twice, which would duplicate columns in the
# frames built below. It raises ValueError if fewer than three columns exist.
med_cols = np.random.choice(
    list(set(dfm.columns.tolist()) - set(['person_id'])), 3, replace=False
).tolist()
fam_cols = np.random.choice(
    list(set(dff.columns.tolist()) - set(['person_id'])), 3, replace=False
).tolist()

# df joined with the sampled medical columns, and with the sampled family columns.
medical = pd.merge(df, dfm[med_cols + ['person_id']], on='person_id')
family = pd.merge(df, dff[fam_cols + ['person_id']], on='person_id')

# Combined history. The join key is given explicitly: without `on`, merge
# joins on every column name the two frames share, so a column name present in
# both history tables would silently become part of the join condition.
_tmp = pd.merge(
    dfm[med_cols + ['person_id']],
    dff[fam_cols + ['person_id']],
    on='person_id',
)
data = pd.merge(df, _tmp, on='person_id')
In [33]:
# Stack the evaluation of each frame into one table, one flag per variant.
# The first four variants use every column of the frame except person_id;
# the last two use only the sampled history columns of the raw history tables.
pd.concat(
    [
        frame.deid.evaluate(
            flag=label,
            cols=list(set(frame.columns.tolist()) - set(['person_id'])),
        )
        for label, frame in [
            ('full history', data),
            ('medical', medical),
            ('family', family),
            ('no-history', df),
        ]
    ]
    + [
        dfm.deid.evaluate(flag='medical-only', cols=med_cols),
        dff.deid.evaluate(flag='family-only', cols=fam_cols),
    ],
    ignore_index=True,
)
Out[33]:
In [2]:
from __future__ import division
def evaluate(df):
    """
    Run ``df.deid.evaluate`` on random row samples of increasing size.

    Sample sizes come from a list of percentages: four random values below
    1 (rounded to 3 decimals) followed by 5, 10, ..., 100. Each percentage is
    turned into a row count relative to ``N = df.shape[0] - 1``; counts of 1
    or less are dropped and duplicates removed.

    :param df: frame to sample; every column except ``person_id`` is passed
               as ``cols`` to ``deid.evaluate``
    :return: the results of ``deid.evaluate`` for every sample, concatenated
             in increasing sample size and flagged ``"<percent> %"``; an
             empty DataFrame when no sample size qualifies
    """
    cols = list(set(df.columns.tolist()) - set(['person_id']))
    percentages = np.round(np.random.random_sample(4), 3).tolist() + np.arange(5, 105, 5).tolist()
    # NOTE(review): N is one less than the row count, so positions are drawn
    # from the first N rows only — kept as in the original, confirm intent.
    N = df.shape[0] - 1
    sizes = np.divide(np.multiply(percentages, N), 100).astype(np.int64)
    sizes = np.unique([n for n in sizes if n > 1])

    # Collect the pieces and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copies the accumulated frame on every iteration.
    results = []
    for num_rows in sizes:
        indices = np.random.choice(N, num_rows, replace=False)
        flag = " ".join([str(np.round(100 * indices.size / N, 2)), '%'])
        # indices are row positions, so select with iloc; label-based loc
        # only works when the index happens to be the default 0..n-1 range.
        sample = df.iloc[indices]
        results.append(sample.deid.evaluate(cols=cols, flag=flag, min_group_size=2))

    if not results:
        return pd.DataFrame()
    return pd.concat(results)
In [3]:
# Every column of df except the person identifier.
cols = list(set(df.columns.tolist()) - {'person_id'})

# Evaluate a fixed list of demographic columns with the accessor's defaults.
# NOTE(review): 'gender_identity' and 'sex_at_birth' are not in the column
# list of the query that loads df (it selects 'gender') — confirm that df
# actually carries these columns when this cell runs.
df.loc[:, [
    'race',
    'state',
    'gender_identity',
    'ethnicity',
    'marital_status',
    'education',
    'orientation',
    'sex_at_birth',
    'birth_date',
    'travel_abroad_6_months',
    'active_duty_status',
]].deid.evaluate()
Out[3]:
In [68]:
# Merge df with three randomly chosen medical-history columns.
# The first column of dfm is skipped when sampling.
# NOTE(review): this presumes person_id is the first column of dfm — confirm.
cols = ['person_id']
cols.extend(np.random.choice(dfm.columns[1:], 3, replace=False).tolist())
p = pd.merge(df, dfm[cols], on='person_id')
cols
# Disabled follow-ups kept for reference:
# # cols = list(set(p.columns.tolist()) - set(['person_id']))
# evaluate(p) #p.deid.explore(cols=cols,num_runs=100)
Out[68]:
In [7]:
# Pick three distinct medical-history columns (person_id is never a candidate).
cols = np.random.choice(
    list(set(dfm.columns.tolist()) - {'person_id'}),
    3,
    replace=False,
).tolist()

# Join the sampled columns with df; no explicit key is given, so merge uses
# the column names the two frames have in common.
p = pd.merge(dfm[['person_id'] + cols], df)

# Columns of the merged frame to evaluate: everything except person_id.
fcols = list(set(p.columns.tolist()) - {'person_id'})
# Disabled: evaluate the sampled medical columns on their own.
# dfm[cols].deid.evaluate(cols=list( set(cols) - set(['person_id'])))
Medical History¶
We randomly select three attributes: {{ " ; ".join(cols)}}.
The dataset used for the associated risk evaluation contains {{ p.shape[0] }} records.
{{ p[fcols].deid.evaluate() }}
In [52]:
# Display the current value of cols.
cols
Out[52]:
In [67]:
# Count the value combinations of cols[1:] that occur at most once in dfm.
# Missing values are replaced by ' ' first so rows with NaN keys are grouped
# rather than dropped.
# The grouping keeps the keys in the index (no as_index=False): with
# as_index=False, size() returns a DataFrame in pandas >= 1.1 and `.values`
# would then compare the key columns as well as the group sizes.
# dfm[cols[1:]].head()
np.sum(dfm.fillna(' ').groupby(cols[1:]).size().values <= 1)
Out[67]:
In [ ]: