You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
privacykit/notebooks/experiments.ipynb

611 lines
60 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"    Health Information Privacy Lab\n",
"    This notebook is intended to run experiments and generate the data to be used by another notebook\n",
"\n",
"    pre-requisites:\n",
"        - pandas_risk   This is a custom framework that will compute risk for a given dataset\n",
"        - google-cloud-bigquery\n",
"        - numpy\n",
"\n",
"    The service-account key handed to pd.read_gbq in this cell is taken from the\n",
"    GOOGLE_APPLICATION_CREDENTIALS environment variable when it is set, otherwise from the\n",
"    original local path.\n",
"\"\"\"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from pandas_risk import *\n",
"from time import time\n",
"import os\n",
"\n",
"\n",
"class Logger :\n",
"    \"\"\"In-memory log: each call to log() appends its keyword arguments (as a dict) to Logger.cache.\"\"\"\n",
"    cache = []\n",
"\n",
"    @staticmethod\n",
"    def clear():\n",
"        \"\"\"Reset Logger.cache to an empty list.\"\"\"\n",
"        Logger.cache = []\n",
"\n",
"    @staticmethod\n",
"    def log(**args) :\n",
"        \"\"\"Append the given keyword arguments, as one dict, to Logger.cache.\"\"\"\n",
"        Logger.cache.append(args)\n",
"\n",
"#\n",
"#-- Loading the dataset\n",
"# single place to configure the key file used by both queries below\n",
"PRIVATE_KEY = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS','/home/steve/dev/google-cloud-sdk/accounts/curation-test.json')\n",
"SQL_CONTROLLED=\"SELECT person_id,birth_datetime,city,zip,state,race,gender FROM deid_risk.basic_risk60k\"\n",
"SQL_REGISTERED = \"SELECT person_id,birth_datetime,city,zip,state,race,gender FROM deid_risk.basic_deid_risk60k\"\n",
"dfr = pd.read_gbq(SQL_REGISTERED,private_key=PRIVATE_KEY)\n",
"dfc = pd.read_gbq(SQL_CONTROLLED,private_key=PRIVATE_KEY)\n"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sample %</th>\n",
" <th>marketer</th>\n",
" <th>sample marketer</th>\n",
" <th>tier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5</td>\n",
" <td>0.974945</td>\n",
" <td>0.981364</td>\n",
" <td>controlled</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5</td>\n",
" <td>0.975513</td>\n",
" <td>0.981996</td>\n",
" <td>controlled</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5</td>\n",
" <td>0.975798</td>\n",
" <td>0.980733</td>\n",
" <td>controlled</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>5</td>\n",
" <td>0.976364</td>\n",
" <td>0.981996</td>\n",
" <td>controlled</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>0.976364</td>\n",
" <td>0.981996</td>\n",
" <td>controlled</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sample % marketer sample marketer tier\n",
"0 5 0.974945 0.981364 controlled\n",
"1 5 0.975513 0.981996 controlled\n",
"2 5 0.975798 0.980733 controlled\n",
"3 5 0.976364 0.981996 controlled\n",
"4 5 0.976364 0.981996 controlled"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"#\n",
"FLAG='REGISTERED-TIER-1'\n",
"if FLAG == 'REGISTERED-TIER' :\n",
" Yi = pd.DataFrame(dfr)\n",
" FOLDER='registered'\n",
"else:\n",
" Yi = pd.DataFrame(dfc)\n",
" FOLDER='controlled'\n",
"Yi = Yi.fillna(' ')\n",
"N = 5\n",
"N_ = str(N)\n",
"SUFFIX = FOLDER+'-tier-'+str(N)+'-experiment.xlsx'\n",
"PATH = os.sep.join(['out',SUFFIX])\n",
"\n",
"\n",
"columns = list(set(Yi.columns.tolist()) - set(['person_id']))\n",
"merged_columns = list(columns)+['field_count']\n",
"m = {}\n",
"p = pd.DataFrame()\n",
"n = 0\n",
"y_i= pd.DataFrame({\"group_size\":Yi.groupby(columns,as_index=False).size()}).reset_index()\n",
"#.deid.risk(id='person_id',quasi_id=columns)\n",
"for index in np.arange(5,105,5):\n",
" for n in np.repeat(index,N) :\n",
"# np.random.seed( np.random.randint(0,int(time())+np.random.randint(0,1000)+index+n ) \n",
" #\n",
" # we will randomly sample n% rows from the dataset\n",
" i = np.random.choice(Yi.shape[0],((Yi.shape[0] * n)/100),replace=False)\n",
" x_i= pd.DataFrame(Yi).loc[i] \n",
" risk = x_i.deid.risk(id='person_id',quasi_id = columns)\n",
" x_i = pd.DataFrame({\"group_size\":x_i.groupby(columns,as_index=False).size()}).reset_index()\n",
" \n",
"# y_i= pd.DataFrame(Yi).deid.risk(id='person_id',quasi_id=columns)\n",
"\n",
"\n",
" r = pd.merge(x_i,y_i,on=columns,how='inner')\n",
" if r.shape[0] == 0 :\n",
" print 'skipping ',n\n",
" continue\n",
" r['marketer'] = r.apply(lambda row: (row.group_size_x / np.float64(row.group_size_y)) /np.sum(x_i.group_size) ,axis=1)\n",
" r['sample %'] = np.repeat(n,r.shape[0])\n",
" r['tier'] = np.repeat(FOLDER,r.shape[0])\n",
" r['sample marketer'] = np.repeat(risk['marketer'].values[0],r.shape[0])\n",
"# r['patient_count'] = np.repeat(r.shape[0],r.shape[0])\n",
" r = r.groupby(['sample %','tier','sample marketer'],as_index=False).sum()[['sample %','marketer','sample marketer','tier']]\n",
"# r['marketer'] = r.apply(lambda row: (row.group_size_x / row.group_size_y) / row.patient_count_x,axis=1 )\n",
"# r = r.groupby(columns+['marketer_x'],as_index=False).sum()[columns+['marketer','marketer_x']]\n",
"# r['sample %'] = np.repeat(n,r.shape[0])\n",
"# r['tier'] = np.repeat(FOLDER,r.shape[0])\n",
" p = p.append(r)\n",
"\n",
"writer = pd.ExcelWriter(PATH,engine='xlsxwriter')\n",
"p = p.rename(columns={'marketer_x':'sample marketer'})\n",
"p.index = np.arange(p.shape[0]).astype(np.int64)\n",
"p.to_excel(writer,FOLDER)\n",
"writer.save()\n",
"p.head() "
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7fe67aa7a9d0>"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"p.plot(kind='scatter',x='sample %',y='marketer', c = '#4682B4')\n",
"p.plot(kind='scatter',x='sample %',y = 'sample marketer', c='#CC0000')\n",
"ax = p.plot(kind='scatter',x='sample %',y='marketer', c = '#4682B4')\n",
"p.plot(kind='scatter',x='sample %',y = 'sample marketer', c='#CC0000',ax=ax)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"    This experiment consists in :\n",
"        1: randomly selecting x % of the records to be sampled\n",
"        2: calling deid.risk on the sample\n",
"        3: calling deid.risk on the whole population\n",
"        4: inner-joining both results and deriving a marketer ratio for the sample against the population\n",
"\"\"\"\n",
"SQL_ORIGINAL=\"SELECT * FROM deid_risk.risk_60k2\"\n",
"SQL_DEID = \"SELECT * FROM deid_risk.deid_risk_60k limit 20000\"\n",
"\n",
"#\n",
"# NOTE(review): FLAG never equals 'REGISTERED-TIER' as written, so the controlled tier is always\n",
"# selected -- confirm this is the intended way of switching tiers.\n",
"FLAG='REGISTERED-TIER-9'\n",
"if FLAG == 'REGISTERED-TIER' :\n",
"    Yi = pd.DataFrame(dfr)\n",
"    FOLDER='registered'\n",
"else:\n",
"    Yi = pd.DataFrame(dfc)\n",
"    FOLDER='controlled'\n",
"N = 20\n",
"N_ = str(N)\n",
"SUFFIX = FOLDER+'-tier-'+str(N)+'-experiment.xlsx'\n",
"PATH = os.sep.join(['out',SUFFIX])\n",
"# make sure the output folder exists before the workbook is written\n",
"if not os.path.isdir('out') :\n",
"    os.makedirs('out')\n",
"\n",
"columns = list(set(Yi.columns.tolist()) - set(['person_id']))\n",
"merged_columns = list(columns)+['field_count']\n",
"m = {}\n",
"n = 0\n",
"y_i= pd.DataFrame(Yi).deid.risk(id='person_id',quasi_id=columns)\n",
"frames = []  # one aggregated frame per run, concatenated once after the loop\n",
"for index in np.arange(5,105,5):\n",
"    # N runs per sample size (the repeat count used to be a literal 20 next to N = 20)\n",
"    for n in np.repeat(index,N) :\n",
"        #\n",
"        # we will randomly sample n% rows from the dataset\n",
"        sample_size = (Yi.shape[0] * n) // 100  # explicit floor division\n",
"        i = np.random.choice(Yi.shape[0],sample_size,replace=False)\n",
"        x_i= pd.DataFrame(Yi).loc[i].deid.risk(id='person_id',quasi_id = columns)\n",
"\n",
"        r = pd.merge(x_i,y_i,on=merged_columns,how='inner')\n",
"        if r.shape[0] == 0 :\n",
"            print 'skipping ',n\n",
"            continue\n",
"\n",
"        # cast the denominator so the ratio is a true division even when the group sizes are\n",
"        # integers (Python 2 floors int / int), as the N = 5 experiment cell already does\n",
"        r['marketer'] = (r.group_size_x / r.group_size_y.astype(np.float64)) / r.patient_count_x\n",
"        r = r.groupby(columns+['marketer_x'],as_index=False).sum()[columns+['marketer','marketer_x']]\n",
"        r['sample %'] = np.repeat(n,r.shape[0])\n",
"        r['tier'] = np.repeat(FOLDER,r.shape[0])\n",
"        frames.append(r)\n",
"\n",
"# pd.concat refuses an empty list, so keep the empty-frame result when every run was skipped\n",
"p = pd.concat(frames) if frames else pd.DataFrame()\n",
"writer = pd.ExcelWriter(PATH,engine='xlsxwriter')\n",
"p = p.rename(columns={'marketer_x':'sample marketer'})\n",
"p.index = np.arange(p.shape[0]).astype(np.int64)\n",
"p.to_excel(writer,FOLDER)\n",
"writer.save()\n",
"p.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# marketer alone, with the y axis limited to the observed range\n",
"y_range = [p.marketer.min(),p.marketer.max()]\n",
"ax = p.plot.scatter(x='sample %',y='marketer',c='r',ylim=y_range)\n",
"# sample marketer alone\n",
"p.plot.scatter(x='sample %',y='sample marketer',c='#4682B4')\n",
"# both series on shared axes\n",
"ax = p.plot.scatter(x='sample %',y='marketer',c='r')\n",
"p.plot.scatter(x='sample %',y='sample marketer',c='#4682B4',ax=ax)\n",
"\n",
"# keep a handle on the current results\n",
"_p = pd.DataFrame(p)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"p.head()\n",
"\n",
"# writer = pd.ExcelWriter('out/foo.xlsx',engine='xlsxwriter')\n",
"# workbook = writer.book\n",
"# r.groupby('field_count',as_index=False).sum()[['field_count','marketer_x']].to_excel(writer,'page-0')\n",
"# chart = workbook.add_chart({'type':'line'})\n",
"# o = r.groupby('field_count',as_index=False).sum()[['field_count','marketer_x']]\n",
"# # values = o.marketer_x.tolist()\n",
"# # values = [['page-0',item] for item in values]\n",
"# # chart.add_series({\"values\":values})\n",
"# # chart.add_series({'values':'=page-0!$B$2:$B$5'})\n",
"\n",
"# worksheet = writer.sheets['page-0']\n",
"# worksheet.insert_chart('G2',chart)\n",
"# writer.save()\n",
"\n",
"str(10)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# `chart` is only created by the commented-out xlsxwriter scratch code in the previous cell, so\n",
"# this lookup raised a NameError on a fresh kernel. Re-enable it together with that code.\n",
"# help(chart.add_series)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): x_i and y_i are whatever an earlier experiment cell left behind -- confirm they\n",
"# carry the group_count / patient_count columns used below.\n",
"cols = list(set(dfr.columns.tolist()) - set(['person_id'])) + ['field_count']\n",
"r = pd.merge(x_i,y_i,on=cols,how='inner')\n",
"# cast the denominator so the ratio is a true division even when the counts are integers\n",
"# (Python 2 floors int / int)\n",
"r['marketer'] = (r.group_count_x / r.group_count_y.astype(np.float64)) / r.patient_count_y\n",
"o = r.groupby(cols,as_index=False).sum()[cols+['marketer']]\n",
"o.groupby(['field_count'],as_index=False).mean()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# make sure the target folder exists before exporting the runs\n",
"if not os.path.isdir('out/x-2') :\n",
"    os.makedirs('out/x-2')\n",
"p.to_csv('out/x-2/single-runs-deid.csv',index=False)\n",
"# mean marketer per sample size, as a line and as a scatter plot\n",
"p.groupby(['sample %']).mean()['marketer'].plot.line()\n",
"p.groupby(['sample %'],as_index=False).mean().plot.scatter(x='sample %',y='marketer')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# two small frames, outer-merged on (age, income); the shape of the result is logged\n",
"y = pd.DataFrame(dict(name=['d','e','f','g'],age=[12,40,20,30],income=[100,200,300,400]))\n",
"x = pd.DataFrame(dict(name=['a','b','c'],age=[10,20,40],income=[120,100,200]))\n",
"\n",
"x_ = x.merge(y,how='outer',on=['age','income'])\n",
"Logger.log(value=x_.shape,action='merge')\n",
"Logger.cache"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#\n",
"# EXP_0\n",
"# Run the risk computation on the original dataset (RUNS runs) and plot marketer / prosecutor\n",
"# against the number of fields\n",
"SCHEMA = \"deid_risk\"\n",
"RUNS = 500\n",
"FLAG = 'basic-features'\n",
"df = pd.read_gbq(\n",
"    \"select person_id,birth_datetime,race,gender,sex_at_birth, city,state,zip from deid_risk.basic_risk60k \",\n",
"    private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',\n",
"    dialect='standard')\n",
"\n",
"r = df.deid.risk(id='person_id',num_runs=RUNS)\n",
"by_field_count = r.groupby('field_count',as_index=False)\n",
"\n",
"# mean risk per field count, drawn as lines\n",
"compiled = by_field_count[['marketer','prosecutor']].mean()\n",
"fi = compiled[['marketer','prosecutor']].plot.line().get_figure()\n",
"\n",
"# same aggregation with field_count selected as well, then the raw runs as a scatter plot\n",
"compiled = by_field_count[['field_count','marketer','prosecutor']].mean()\n",
"fig_i = r.plot.scatter(x='field_count',y='marketer').get_figure()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#\n",
"# EXP_2 :\n",
"# This experiment will run the marketer risk against individual attributes\n",
"RUNS = 500\n",
"FLAG = 'basic-deid-features'\n",
"deid_df = pd.read_gbq(\n",
"    \"select person_id,birth_datetime,race,gender,sex_at_birth, city,state,zip from deid_risk.basic_deid_risk60k\",\n",
"    private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',\n",
"    dialect='standard')\n",
"\n",
"deid_r = deid_df.deid.risk(id='person_id',num_runs=RUNS)\n",
"\n",
"# mean risk per field count, drawn as lines\n",
"deid_compiled = deid_r.groupby('field_count',as_index=False)[['marketer','prosecutor']].mean()\n",
"fo = deid_compiled[['marketer','prosecutor']].plot.line().get_figure()\n",
"\n",
"# every run as a scatter plot\n",
"fig_o = deid_r.plot.scatter(x='field_count',y='marketer').get_figure()\n",
"\n",
"# orig_df = pd.read_gbq(\"select * from deid_risk.risk_60k2\",private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',\n",
"# dialect='standard')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# risk over the same seven quasi-identifiers for the deid frame and the raw frame, stacked and exported\n",
"deid_risk_7 = deid_df.deid.risk(id='person_id',quasi_id=['birth_datetime','race','gender','sex_at_birth', 'city','state','zip'])\n",
"raw_risk_7 = df.deid.risk(id='person_id',quasi_id=['birth_datetime','race','gender','sex_at_birth', 'city','state','zip'])\n",
"p = pd.concat([deid_risk_7,raw_risk_7])\n",
"p.index = ['deid data','raw data']\n",
"p.to_csv('out/basic_run-7-fields.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# column-wise sums of the raw and deid results over every column from the sixth one on\n",
"cols = deid_r.columns[5:]\n",
"deid_r.index = np.arange(deid_r.shape[0]).astype(np.int64)\n",
"xdeid_ = deid_r[cols].sum().tolist()\n",
"xraw_ = r[cols].sum().tolist()\n",
"o = pd.DataFrame({'name':cols,'raw':xraw_,'deid':xdeid_},columns=['name','raw','deid'])\n",
"\n",
"o\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# orig_df is not created by any other cell (its read_gbq call is commented out in the EXP_2\n",
"# cell), so load it here when it is missing instead of failing with a NameError.\n",
"# NOTE(review): confirm deid_risk.risk_60k2 is the intended raw table.\n",
"try:\n",
"    orig_df\n",
"except NameError:\n",
"    orig_df = pd.read_gbq(\"select * from deid_risk.risk_60k2\",private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',\n",
"                          dialect='standard')\n",
"\n",
"# only attributes present in both frames can be evaluated on both\n",
"columns = [name for name in orig_df.columns if name != 'person_id' and name in deid_df.columns]\n",
"#\n",
"# Let's compute the risk for every attribute given the list of attributes we've gathered\n",
"#\n",
"deid_rows = []\n",
"orig_rows = []\n",
"for name in columns :\n",
"    deid_rows.append(deid_df.deid.risk(id='person_id',quasi_id=[name])[['marketer','prosecutor']])\n",
"    orig_rows.append(orig_df.deid.risk(id='person_id',quasi_id=[name])[['marketer','prosecutor']])\n",
"# concatenate once; pd.concat refuses an empty list\n",
"xo = pd.concat(deid_rows,sort=False) if deid_rows else pd.DataFrame()\n",
"xi = pd.concat(orig_rows,sort=False) if orig_rows else pd.DataFrame()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#\n",
"# Side-by-side view of the original and deid risk for the attributes whose original marketer\n",
"# risk is above RISK_THRESHOLD\n",
"#\n",
"RISK_THRESHOLD = 0.5\n",
"\n",
"# label both result frames with the attribute names\n",
"xo.index = columns\n",
"xi.index = columns\n",
"\n",
"above_threshold = (xi.marketer > RISK_THRESHOLD).values\n",
"ii = xi.index[above_threshold]\n",
"\n",
"# deid columns get the '_deid' suffix\n",
"zo = xi.loc[ii].join(xo.loc[ii],rsuffix='_deid')\n",
"#\n",
"# heatmap for original data\n",
"# fig_o = sns.heatmap(xi.loc[ii], cmap='RdYlGn_r', linewidths=0.5, annot=True).get_figure()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#\n",
"# Running the experiment on the DEID dataset, with all the attributes\n",
"#\n",
"RUNS = 1500\n",
"FLAG = 'deid-full-attr-dataset'\n",
"df = pd.read_gbq(\n",
"    \"select * from deid_risk.deid_risk_60k\",\n",
"    private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',\n",
"    dialect='standard')\n",
"\n",
"r = df.deid.risk(id='person_id',num_runs=RUNS)\n",
"by_field_count = r.groupby('field_count',as_index=False)\n",
"\n",
"# mean risk per field count, drawn as lines\n",
"compiled = by_field_count[['marketer','prosecutor']].mean()\n",
"fo = compiled[['marketer','prosecutor']].plot.line().get_figure()\n",
"\n",
"# same aggregation with field_count selected as well, then the raw runs as a scatter plot\n",
"compiled = by_field_count[['field_count','marketer','prosecutor']].mean()\n",
"fig_o = r.plot.scatter(x='field_count',y='marketer').get_figure()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# variance of both risk measures per field count, drawn as lines\n",
"variance = r.groupby('field_count',as_index=False)[['marketer','prosecutor']].var()\n",
"variance[['marketer','prosecutor']].plot.line()"
]
},
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#\n",
"# We are going to look into the attributes with a risk of a given threshold\n",
"# We will run the experiment (varied combinations of the list of attributes)\n",
"# The experiment is intended to capture the attributes responsible for increasing the marketer risk\n",
"#\n",
"DEID_DATASET = 'deid_risk.deid_risk_60k2'\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.15rc1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}