You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
611 lines
60 KiB
Plaintext
611 lines
60 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\"\"\"\n",
|
|
" Health Information Privacy Lab\n",
|
|
" This notebook is intended to run experiments and generate the data to be used by another notebook\n",
|
|
"\n",
|
|
" pre-requisites:\n",
|
|
" - pandas_risk This is a custom framework that will compute risk for a given dataset\n",
|
|
" - google-cloud-bigquery\n",
|
|
" - numpy\n",
|
|
"\"\"\"\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"from pandas_risk import *\n",
|
|
"from time import time\n",
|
|
"import os\n",
|
|
"#\n",
|
|
"#-- Loading the dataset\n",
|
|
"class Logger :\n",
|
|
"    \"\"\"In-memory log shared by the whole notebook; each entry is the dict of keyword arguments given to log()\"\"\"\n",
|
|
"    cache = []\n",
|
|
"    @staticmethod\n",
|
|
"    def log(**entry) :\n",
|
|
"        \"\"\"Record one entry made of arbitrary keyword arguments\"\"\"\n",
|
|
"        Logger.cache.append(entry)\n",
|
|
"    @staticmethod\n",
|
|
"    def clear():\n",
|
|
"        \"\"\"Drop every entry by rebinding the cache to a new empty list\"\"\"\n",
|
|
"        Logger.cache = list()\n",
|
|
" \n",
|
|
"#\n",
|
|
"#-- Service-account key: GOOGLE_APPLICATION_CREDENTIALS is used when it is set, otherwise the original local path\n",
|
|
"PRIVATE_KEY = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS','/home/steve/dev/google-cloud-sdk/accounts/curation-test.json')\n",
|
|
"#\n",
|
|
"#-- Both queries select the same attributes; only the source table differs\n",
|
|
"SQL_CONTROLLED=\"SELECT person_id,birth_datetime,city,zip,state,race,gender FROM deid_risk.basic_risk60k\"\n",
|
|
"SQL_REGISTERED = \"SELECT person_id,birth_datetime,city,zip,state,race,gender FROM deid_risk.basic_deid_risk60k\"\n",
|
|
"dfr = pd.read_gbq(SQL_REGISTERED,private_key=PRIVATE_KEY)\n",
|
|
"dfc = pd.read_gbq(SQL_CONTROLLED,private_key=PRIVATE_KEY)\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 99,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>sample %</th>\n",
|
|
" <th>marketer</th>\n",
|
|
" <th>sample marketer</th>\n",
|
|
" <th>tier</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>0.974945</td>\n",
|
|
" <td>0.981364</td>\n",
|
|
" <td>controlled</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>0.975513</td>\n",
|
|
" <td>0.981996</td>\n",
|
|
" <td>controlled</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>0.975798</td>\n",
|
|
" <td>0.980733</td>\n",
|
|
" <td>controlled</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>0.976364</td>\n",
|
|
" <td>0.981996</td>\n",
|
|
" <td>controlled</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>0.976364</td>\n",
|
|
" <td>0.981996</td>\n",
|
|
" <td>controlled</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" sample % marketer sample marketer tier\n",
|
|
"0 5 0.974945 0.981364 controlled\n",
|
|
"1 5 0.975513 0.981996 controlled\n",
|
|
"2 5 0.975798 0.980733 controlled\n",
|
|
"3 5 0.976364 0.981996 controlled\n",
|
|
"4 5 0.976364 0.981996 controlled"
|
|
]
|
|
},
|
|
"execution_count": 99,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"\n",
|
|
"#\n",
|
|
"FLAG='REGISTERED-TIER-1'\n",
|
|
"if FLAG == 'REGISTERED-TIER' :\n",
|
|
" Yi = pd.DataFrame(dfr)\n",
|
|
" FOLDER='registered'\n",
|
|
"else:\n",
|
|
" Yi = pd.DataFrame(dfc)\n",
|
|
" FOLDER='controlled'\n",
|
|
"Yi = Yi.fillna(' ')\n",
|
|
"N = 5\n",
|
|
"N_ = str(N)\n",
|
|
"SUFFIX = FOLDER+'-tier-'+str(N)+'-experiment.xlsx'\n",
|
|
"PATH = os.sep.join(['out',SUFFIX])\n",
|
|
"\n",
|
|
"\n",
|
|
"columns = list(set(Yi.columns.tolist()) - set(['person_id']))\n",
|
|
"merged_columns = list(columns)+['field_count']\n",
|
|
"m = {}\n",
|
|
"p = pd.DataFrame()\n",
|
|
"n = 0\n",
|
|
"y_i= pd.DataFrame({\"group_size\":Yi.groupby(columns,as_index=False).size()}).reset_index()\n",
|
|
"#.deid.risk(id='person_id',quasi_id=columns)\n",
|
|
"#\n",
|
|
"# For every sampling rate (5%,10%,...,100%) draw N independent random samples and compare\n",
|
|
"# the group sizes found in each sample against the group sizes of the population (y_i)\n",
|
|
"# NOTE: the random generator is not seeded, so every run draws different samples\n",
|
|
"samples = []\n",
|
|
"for index in np.arange(5,105,5):\n",
|
|
"    for n in np.repeat(index,N) :\n",
|
|
"        #\n",
|
|
"        # we will randomly sample n% rows from the dataset (floor division keeps the size an integer)\n",
|
|
"        size = (Yi.shape[0] * n)//100\n",
|
|
"        i = np.random.choice(Yi.shape[0],size,replace=False)\n",
|
|
"        # i holds row positions, hence iloc (loc would only be right with a default 0..n-1 index)\n",
|
|
"        x_i = pd.DataFrame(Yi).iloc[i]\n",
|
|
"        risk = x_i.deid.risk(id='person_id',quasi_id = columns)\n",
|
|
"        x_i = pd.DataFrame({\"group_size\":x_i.groupby(columns,as_index=False).size()}).reset_index()\n",
|
|
"\n",
|
|
"        r = pd.merge(x_i,y_i,on=columns,how='inner')\n",
|
|
"        if r.shape[0] == 0 :\n",
|
|
"            print('skipping  '+str(n))\n",
|
|
"            continue\n",
|
|
"        #\n",
|
|
"        # per group: (sample group size / population group size) / sample size; the groups are summed below\n",
|
|
"        sample_size = np.float64(x_i.group_size.sum())\n",
|
|
"        r['marketer'] = (r.group_size_x / r.group_size_y.astype(np.float64)) / sample_size\n",
|
|
"        r['sample %'] = n\n",
|
|
"        r['tier'] = FOLDER\n",
|
|
"        r['sample marketer'] = risk['marketer'].values[0]\n",
|
|
"        r = r.groupby(['sample %','tier','sample marketer'],as_index=False).sum()[['sample %','marketer','sample marketer','tier']]\n",
|
|
"        samples.append(r)\n",
|
|
"#\n",
|
|
"# a single concat instead of growing p inside the loop; p is left untouched when every sample was skipped\n",
|
|
"if samples :\n",
|
|
"    p = pd.concat(samples)\n",
|
|
"\n",
|
|
"writer = pd.ExcelWriter(PATH,engine='xlsxwriter')\n",
|
|
"p = p.rename(columns={'marketer_x':'sample marketer'})\n",
|
|
"p.index = np.arange(p.shape[0]).astype(np.int64)\n",
|
|
"p.to_excel(writer,FOLDER)\n",
|
|
"writer.save()\n",
|
|
"p.head() "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 100,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<matplotlib.axes._subplots.AxesSubplot at 0x7fe67aa7a9d0>"
|
|
]
|
|
},
|
|
"execution_count": 100,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "\n",
|
|
"text/plain": [
|
|
"<Figure size 432x288 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"needs_background": "light"
|
|
},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "\n",
|
|
"text/plain": [
|
|
"<Figure size 432x288 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"needs_background": "light"
|
|
},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "\n",
|
|
"text/plain": [
|
|
"<Figure size 432x288 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"needs_background": "light"
|
|
},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"p.plot(kind='scatter',x='sample %',y='marketer', c = '#4682B4')\n",
|
|
"p.plot(kind='scatter',x='sample %',y = 'sample marketer', c='#CC0000')\n",
|
|
"ax = p.plot(kind='scatter',x='sample %',y='marketer', c = '#4682B4')\n",
|
|
"p.plot(kind='scatter',x='sample %',y = 'sample marketer', c='#CC0000',ax=ax)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\"\"\"\n",
|
|
" This experiment consists in :\n",
|
|
" 1: randomly selecting x % of the records to be sampled\n",
|
|
" 2: running a group by on the sample\n",
|
|
" 3: calling groupby on the population which the sample was drawn from\n",
|
|
"\"\"\"\n",
|
|
"SQL_ORIGINAL=\"SELECT * FROM deid_risk.risk_60k2\"\n",
|
|
"SQL_DEID = \"SELECT * FROM deid_risk.deid_risk_60k limit 20000\"\n",
|
|
"# df = pd.read_gbq(SQL_DEID,private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json')\n",
|
|
"\n",
|
|
"#\n",
|
|
"FLAG='REGISTERED-TIER-9'\n",
|
|
"if FLAG == 'REGISTERED-TIER' :\n",
|
|
" Yi = pd.DataFrame(dfr)\n",
|
|
" FOLDER='registered'\n",
|
|
"else:\n",
|
|
" Yi = pd.DataFrame(dfc)\n",
|
|
" FOLDER='controlled'\n",
|
|
"N = 20\n",
|
|
"N_ = str(N)\n",
|
|
"SUFFIX = FOLDER+'-tier-'+str(N)+'-experiment.xlsx'\n",
|
|
"PATH = os.sep.join(['out',SUFFIX])\n",
|
|
"\n",
|
|
"\n",
|
|
"columns = list(set(Yi.columns.tolist()) - set(['person_id']))\n",
|
|
"merged_columns = list(columns)+['field_count']\n",
|
|
"m = {}\n",
|
|
"p = pd.DataFrame()\n",
|
|
"n = 0\n",
|
|
"y_i= pd.DataFrame(Yi).deid.risk(id='person_id',quasi_id=columns)\n",
|
|
"#\n",
|
|
"# For every sampling rate (5%,10%,...,100%) draw N independent random samples and merge\n",
|
|
"# the risk computed on each sample (x_i) with the risk computed on the population (y_i)\n",
|
|
"# NOTE: the random generator is not seeded, so every run draws different samples\n",
|
|
"samples = []\n",
|
|
"for index in np.arange(5,105,5):\n",
|
|
"    for n in np.repeat(index,N) :\n",
|
|
"        #\n",
|
|
"        # we will randomly sample n% rows from the dataset (floor division keeps the size an integer)\n",
|
|
"        size = (Yi.shape[0] * n)//100\n",
|
|
"        i = np.random.choice(Yi.shape[0],size,replace=False)\n",
|
|
"        # i holds row positions, hence iloc (loc would only be right with a default 0..n-1 index)\n",
|
|
"        x_i = pd.DataFrame(Yi).iloc[i].deid.risk(id='person_id',quasi_id = columns)\n",
|
|
"\n",
|
|
"        r = pd.merge(x_i,y_i,on=merged_columns,how='inner')\n",
|
|
"        if r.shape[0] == 0 :\n",
|
|
"            print('skipping  '+str(n))\n",
|
|
"            continue\n",
|
|
"        #\n",
|
|
"        # np.float64 forces a true division (if the group sizes are integers, Python 2 would floor the ratio)\n",
|
|
"        r['marketer'] = r.apply(lambda row: (row.group_size_x / np.float64(row.group_size_y)) / row.patient_count_x,axis=1 )\n",
|
|
"        r = r.groupby(columns+['marketer_x'],as_index=False).sum()[columns+['marketer','marketer_x']]\n",
|
|
"        r['sample %'] = np.repeat(n,r.shape[0])\n",
|
|
"        r['tier'] = np.repeat(FOLDER,r.shape[0])\n",
|
|
"        samples.append(r)\n",
|
|
"#\n",
|
|
"# a single concat instead of growing p inside the loop; p is left untouched when every sample was skipped\n",
|
|
"if samples :\n",
|
|
"    p = pd.concat(samples)\n",
|
|
"\n",
|
|
"writer = pd.ExcelWriter(PATH,engine='xlsxwriter')\n",
|
|
"p = p.rename(columns={'marketer_x':'sample marketer'})\n",
|
|
"p.index = np.arange(p.shape[0]).astype(np.int64)\n",
|
|
"p.to_excel(writer,FOLDER)\n",
|
|
"writer.save()\n",
|
|
"p.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"ax = p.plot(kind='scatter',x='sample %',y='marketer',c='r',ylim=[p.marketer.min(),p.marketer.max()])\n",
|
|
"p.plot(kind='scatter',x='sample %',y='sample marketer',c='#4682B4')\n",
|
|
"ax = p.plot(kind='scatter',x='sample %',y='marketer',c='r')\n",
|
|
"p.plot(kind='scatter',x='sample %',y='sample marketer',c='#4682B4',ax=ax)\n",
|
|
"\n",
|
|
"_p = pd.DataFrame(p)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"p.head()\n",
|
|
"\n",
|
|
"# writer = pd.ExcelWriter('out/foo.xlsx',engine='xlsxwriter')\n",
|
|
"# workbook = writer.book\n",
|
|
"# r.groupby('field_count',as_index=False).sum()[['field_count','marketer_x']].to_excel(writer,'page-0')\n",
|
|
"# chart = workbook.add_chart({'type':'line'})\n",
|
|
"# o = r.groupby('field_count',as_index=False).sum()[['field_count','marketer_x']]\n",
|
|
"# # values = o.marketer_x.tolist()\n",
|
|
"# # values = [['page-0',item] for item in values]\n",
|
|
"# # chart.add_series({\"values\":values})\n",
|
|
"# # chart.add_series({'values':'=page-0!$B$2:$B$5'})\n",
|
|
"\n",
|
|
"# worksheet = writer.sheets['page-0']\n",
|
|
"# worksheet.insert_chart('G2',chart)\n",
|
|
"# writer.save()\n",
|
|
"\n",
|
|
"str(10)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"help(chart.add_series)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"cols = list(set(dfr.columns.tolist()) - set(['person_id'])) + ['field_count']\n",
|
|
"r = pd.merge(x_i,y_i,on=cols,how='inner')\n",
|
|
"r['marketer'] = r.apply(lambda row: (row.group_count_x/row.group_count_y)/row.patient_count_y ,axis=1)\n",
|
|
"# r['field_count'] = r['field_count_x']\n",
|
|
"o = r.groupby(cols,as_index=False).sum()[cols+['marketer']]\n",
|
|
"o.groupby(['field_count'],as_index=False).mean()\n",
|
|
"# o.groupby('field_count',as_index=False).mean().plot.line(x='field_count',y='marketer')\n",
|
|
"# r.head()\n",
|
|
"# N = r.patient_count_y.mean()\n",
|
|
"# r['marketer'] = r.apply(lambda row: row.group_count_x / row.group_count_y,axis=1)\n",
|
|
"# m = r.groupby(['field_count'],as_index=False).mean()[['field_count','marketer']]\n",
|
|
"# m.marketer = m.marketer / N\n",
|
|
"# m.groupby(['field_count']).mean().plot.line()\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"p.to_csv('out/x-2/single-runs-deid.csv',index=False)\n",
|
|
"p.groupby(['sample %']).mean()['marketer'].plot.line()\n",
|
|
"p.groupby(['sample %'],as_index=False).mean().plot.scatter(x='sample %',y='marketer')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"y = pd.DataFrame({\"name\":['d','e','f','g'],\"age\":[12,40,20,30],\"income\":[100,200,300,400]})\n",
|
|
"x = pd.DataFrame({\"name\":['a','b','c'],\"age\":[10,20,40],\"income\":[120,100,200]})\n",
|
|
"\n",
|
|
"# x.join(y,how='outer',on='age')\n",
|
|
"x_ = pd.merge(x,y,on=['age','income'],how='outer')\n",
|
|
"Logger.log(action='merge',value=x_.shape)\n",
|
|
"Logger.cache"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#\n",
|
|
"# EXP_0\n",
|
|
"# Running the experiment on the Original dataset, with all the attributes\n",
|
|
"SCHEMA = \"deid_risk\"\n",
|
|
"df = pd.read_gbq(\"select person_id,birth_datetime,race,gender,sex_at_birth, city,state,zip from deid_risk.basic_risk60k \",private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',\n",
|
|
" dialect='standard')\n",
|
|
"\n",
|
|
"RUNS = 500\n",
|
|
"FLAG = 'basic-features'\n",
|
|
"r = df.deid.risk(id='person_id',num_runs=RUNS) #,field_count=11)\n",
|
|
"# r.to_csv('out/pandas-60k-'+FLAG+'-patients-'+str(RUNS)+'-x-runs.csv')\n",
|
|
"#\n",
|
|
"# Average risk per number of fields; columns are selected with a list because later pandas versions reject tuple keys\n",
|
|
"compiled = r.groupby('field_count',as_index=False)[['marketer','prosecutor']].mean()\n",
|
|
"fi = compiled[['marketer','prosecutor']].plot.line().get_figure()\n",
|
|
"# fo\n",
|
|
"# r.plot.line(x='field_count',y='marketer')\n",
|
|
"compiled = r.groupby('field_count',as_index=False)[['field_count','marketer','prosecutor']].mean()\n",
|
|
"fig_i = r.plot.scatter(x='field_count',y='marketer').get_figure()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#\n",
|
|
"# EXP_2 :\n",
|
|
"# This experiment will run the marketer risk against individual attributes\n",
|
|
"deid_df = pd.read_gbq(\"select person_id,birth_datetime,race,gender,sex_at_birth, city,state,zip from deid_risk.basic_deid_risk60k\",private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',\n",
|
|
" dialect='standard')\n",
|
|
"RUNS = 500\n",
|
|
"FLAG = 'basic-deid-features'\n",
|
|
"deid_r = deid_df.deid.risk(id='person_id',num_runs=RUNS) #,field_count=11)\n",
|
|
"# r.to_csv('out/pandas-60k-'+FLAG+'-patients-'+str(RUNS)+'-x-runs.csv')\n",
|
|
"#\n",
|
|
"# Average risk per number of fields; columns are selected with a list because later pandas versions reject tuple keys\n",
|
|
"deid_compiled = deid_r.groupby('field_count',as_index=False)[['marketer','prosecutor']].mean()\n",
|
|
"fo = deid_compiled[['marketer','prosecutor']].plot.line().get_figure()\n",
|
|
"# fo\n",
|
|
"# r.plot.line(x='field_count',y='marketer')\n",
|
|
"# deid_compiled = deid_r.groupby('field_count',as_index=False)['field_count','marketer','prosecutor'].mean()\n",
|
|
"fig_o = deid_r.plot.scatter(x='field_count',y='marketer').get_figure()\n",
|
|
"\n",
|
|
"# orig_df = pd.read_gbq(\"select * from deid_risk.risk_60k2\",private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',\n",
|
|
"# dialect='standard')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# deid_r.to_csv('out/basic-attributes-deid-data-60k-patients.csv')\n",
|
|
"# r.to_csv('out/basic-attributes-raw-data-60k-patients.csv')\n",
|
|
"# deid_r.head()\n",
|
|
"p = pd.DataFrame()\n",
|
|
"p = deid_df.deid.risk(id='person_id',quasi_id=['birth_datetime','race','gender','sex_at_birth', 'city','state','zip'])\n",
|
|
"p = p.append(df.deid.risk(id='person_id',quasi_id=['birth_datetime','race','gender','sex_at_birth', 'city','state','zip']))\n",
|
|
"p.index = ['deid data','raw data']\n",
|
|
"p.to_csv('out/basic_run-7-fields.csv')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"cols = deid_r.columns[5:]\n",
|
|
"deid_r.index = np.arange(deid_r.shape[0]).astype(np.int64)\n",
|
|
"xdeid_ = deid_r[cols].sum().tolist()\n",
|
|
"xraw_ = r[cols].sum().tolist()\n",
|
|
"o = pd.DataFrame()\n",
|
|
"o['name'] = cols\n",
|
|
"o['raw'] = xraw_\n",
|
|
"o['deid']= xdeid_\n",
|
|
"\n",
|
|
"\n",
|
|
"o\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"columns = list( set(orig_df.columns) - set(['person_id']))\n",
|
|
"xo = pd.DataFrame()\n",
|
|
"xi = pd.DataFrame()\n",
|
|
"#\n",
|
|
"# Let's compute the risk for every attribute given the list of attributes we've gathered\n",
|
|
"#\n",
|
|
"for name in columns :\n",
|
|
" xo = xo.append(deid_df.deid.risk(id='person_id',quasi_id=[name])[['marketer','prosecutor']],sort=False)\n",
|
|
" xi = xi.append(orig_df.deid.risk(id='person_id',quasi_id=[name])[['marketer','prosecutor']],sort=False)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#\n",
|
|
"# The following shows how much the deid process has affected each attributes\n",
|
|
"#\n",
|
|
"\n",
|
|
"RISK_THRESHOLD = 0.5\n",
|
|
"xo.index = columns\n",
|
|
"xi.index = columns\n",
|
|
"\n",
|
|
"ii = xi[xi.marketer > RISK_THRESHOLD].index\n",
|
|
"# zo = pd.concat([xi.loc[ii],xo.loc[ii]])\n",
|
|
"\n",
|
|
"zo = xi.loc[ii].join(xo.loc[ii],rsuffix='_deid')\n",
|
|
"#\n",
|
|
"# heatmap for original data\n",
|
|
"# fig_o = sns.heatmap(xi.loc[ii], cmap='RdYlGn_r', linewidths=0.5, annot=True).get_figure()\n",
|
|
"\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#\n",
|
|
"# Running the experiment on the DEID dataset, with all the attributes\n",
|
|
"#\n",
|
|
"df = pd.read_gbq(\"select * from deid_risk.deid_risk_60k\",private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',\n",
|
|
" dialect='standard')\n",
|
|
"\n",
|
|
"RUNS = 1500\n",
|
|
"FLAG = 'deid-full-attr-dataset'\n",
|
|
"r = df.deid.risk(id='person_id',num_runs=RUNS) #,field_count=11)\n",
|
|
"# r.to_csv('out/pandas-60k-'+FLAG+'-patients-'+str(RUNS)+'-x-runs.csv')\n",
|
|
"#\n",
|
|
"# Average risk per number of fields; columns are selected with a list because later pandas versions reject tuple keys\n",
|
|
"compiled = r.groupby('field_count',as_index=False)[['marketer','prosecutor']].mean()\n",
|
|
"fo = compiled[['marketer','prosecutor']].plot.line().get_figure()\n",
|
|
"# fo\n",
|
|
"# r.plot.line(x='field_count',y='marketer')\n",
|
|
"compiled = r.groupby('field_count',as_index=False)[['field_count','marketer','prosecutor']].mean()\n",
|
|
"fig_o = r.plot.scatter(x='field_count',y='marketer').get_figure()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Variance of both risks per number of fields; list selection because later pandas versions reject tuple keys\n",
|
|
"r.groupby('field_count',as_index=False)[['marketer','prosecutor']].var()[['marketer','prosecutor']].plot.line()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#\n",
|
|
"# We are going to look into the attributes with a risk of a given threshold\n",
|
|
"# We will run the experiment (varied combinations of the list of attributes)\n",
|
|
"# The experiment is intended to capture the attributes responsible for increasing the marketer risk\n",
|
|
"#\n",
|
|
"DEID_DATASET = 'deid_risk.deid_risk_60k2'\n"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 2",
|
|
"language": "python",
|
|
"name": "python2"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 2
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython2",
|
|
"version": "2.7.15rc1"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|