You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
611 lines
60 KiB
Plaintext
611 lines
60 KiB
Plaintext
6 years ago
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 2,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"\"\"\"\n",
|
||
|
" Health Information Privacy Lab\n",
|
||
|
" This notebook is intended to run experiments and generate the data to be used by another notebook\n",
|
||
|
"\n",
|
||
|
" pre-requisites:\n",
|
||
|
"     - pandas_risk: a custom framework that computes risk for a given dataset\n",
|
||
|
" - google-cloud-bigquery\n",
|
||
|
" - numpy\n",
|
||
|
"\"\"\"\n",
|
||
|
"import pandas as pd\n",
|
||
|
"import numpy as np\n",
|
||
|
"from pandas_risk import *\n",
|
||
|
"from time import time\n",
|
||
|
"import os\n",
|
||
|
"#\n",
|
||
|
"#-- Loading the dataset\n",
|
||
|
"class Logger :\n",
|
||
|
" cache = []\n",
|
||
|
" @staticmethod\n",
|
||
|
" def clear():\n",
|
||
|
" Logger.cache = []\n",
|
||
|
" @staticmethod\n",
|
||
|
" def log(**args) :\n",
|
||
|
" Logger.cache.append(args)\n",
|
||
|
" \n",
|
||
|
"SQL_CONTROLLED=\"SELECT person_id,birth_datetime,city,zip,state,race,gender FROM deid_risk.basic_risk60k\"\n",
|
||
|
"SQL_REGISTERED = \"SELECT person_id,birth_datetime,city,zip,state,race,gender FROM deid_risk.basic_deid_risk60k\"\n",
|
||
|
"dfr = pd.read_gbq(SQL_REGISTERED,private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json')\n",
|
||
|
"dfc = pd.read_gbq(SQL_CONTROLLED,private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json')\n",
|
||
|
"\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 99,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>sample %</th>\n",
|
||
|
" <th>marketer</th>\n",
|
||
|
" <th>sample marketer</th>\n",
|
||
|
" <th>tier</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>0</th>\n",
|
||
|
" <td>5</td>\n",
|
||
|
" <td>0.974945</td>\n",
|
||
|
" <td>0.981364</td>\n",
|
||
|
" <td>controlled</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>1</th>\n",
|
||
|
" <td>5</td>\n",
|
||
|
" <td>0.975513</td>\n",
|
||
|
" <td>0.981996</td>\n",
|
||
|
" <td>controlled</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>2</th>\n",
|
||
|
" <td>5</td>\n",
|
||
|
" <td>0.975798</td>\n",
|
||
|
" <td>0.980733</td>\n",
|
||
|
" <td>controlled</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>3</th>\n",
|
||
|
" <td>5</td>\n",
|
||
|
" <td>0.976364</td>\n",
|
||
|
" <td>0.981996</td>\n",
|
||
|
" <td>controlled</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>4</th>\n",
|
||
|
" <td>5</td>\n",
|
||
|
" <td>0.976364</td>\n",
|
||
|
" <td>0.981996</td>\n",
|
||
|
" <td>controlled</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" sample % marketer sample marketer tier\n",
|
||
|
"0 5 0.974945 0.981364 controlled\n",
|
||
|
"1 5 0.975513 0.981996 controlled\n",
|
||
|
"2 5 0.975798 0.980733 controlled\n",
|
||
|
"3 5 0.976364 0.981996 controlled\n",
|
||
|
"4 5 0.976364 0.981996 controlled"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 99,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"\n",
|
||
|
"#\n",
|
||
|
"FLAG='REGISTERED-TIER-1'\n",
|
||
|
"if FLAG == 'REGISTERED-TIER' :\n",
|
||
|
" Yi = pd.DataFrame(dfr)\n",
|
||
|
" FOLDER='registered'\n",
|
||
|
"else:\n",
|
||
|
" Yi = pd.DataFrame(dfc)\n",
|
||
|
" FOLDER='controlled'\n",
|
||
|
"Yi = Yi.fillna(' ')\n",
|
||
|
"N = 5\n",
|
||
|
"N_ = str(N)\n",
|
||
|
"SUFFIX = FOLDER+'-tier-'+str(N)+'-experiment.xlsx'\n",
|
||
|
"PATH = os.sep.join(['out',SUFFIX])\n",
|
||
|
"\n",
|
||
|
"\n",
|
||
|
"columns = list(set(Yi.columns.tolist()) - set(['person_id']))\n",
|
||
|
"merged_columns = list(columns)+['field_count']\n",
|
||
|
"m = {}\n",
|
||
|
"p = pd.DataFrame()\n",
|
||
|
"n = 0\n",
|
||
|
"y_i= pd.DataFrame({\"group_size\":Yi.groupby(columns,as_index=False).size()}).reset_index()\n",
|
||
|
"#.deid.risk(id='person_id',quasi_id=columns)\n",
|
||
|
"for index in np.arange(5,105,5):\n",
|
||
|
" for n in np.repeat(index,N) :\n",
|
||
|
"# np.random.seed( np.random.randint(0,int(time())+np.random.randint(0,1000)+index+n ) \n",
|
||
|
" #\n",
|
||
|
"        # we will randomly sample n% of the rows from the dataset\n",
|
||
|
" i = np.random.choice(Yi.shape[0],((Yi.shape[0] * n)/100),replace=False)\n",
|
||
|
" x_i= pd.DataFrame(Yi).loc[i] \n",
|
||
|
" risk = x_i.deid.risk(id='person_id',quasi_id = columns)\n",
|
||
|
" x_i = pd.DataFrame({\"group_size\":x_i.groupby(columns,as_index=False).size()}).reset_index()\n",
|
||
|
" \n",
|
||
|
"# y_i= pd.DataFrame(Yi).deid.risk(id='person_id',quasi_id=columns)\n",
|
||
|
"\n",
|
||
|
"\n",
|
||
|
" r = pd.merge(x_i,y_i,on=columns,how='inner')\n",
|
||
|
" if r.shape[0] == 0 :\n",
|
||
|
" print 'skipping ',n\n",
|
||
|
" continue\n",
|
||
|
" r['marketer'] = r.apply(lambda row: (row.group_size_x / np.float64(row.group_size_y)) /np.sum(x_i.group_size) ,axis=1)\n",
|
||
|
" r['sample %'] = np.repeat(n,r.shape[0])\n",
|
||
|
" r['tier'] = np.repeat(FOLDER,r.shape[0])\n",
|
||
|
" r['sample marketer'] = np.repeat(risk['marketer'].values[0],r.shape[0])\n",
|
||
|
"# r['patient_count'] = np.repeat(r.shape[0],r.shape[0])\n",
|
||
|
" r = r.groupby(['sample %','tier','sample marketer'],as_index=False).sum()[['sample %','marketer','sample marketer','tier']]\n",
|
||
|
"# r['marketer'] = r.apply(lambda row: (row.group_size_x / row.group_size_y) / row.patient_count_x,axis=1 )\n",
|
||
|
"# r = r.groupby(columns+['marketer_x'],as_index=False).sum()[columns+['marketer','marketer_x']]\n",
|
||
|
"# r['sample %'] = np.repeat(n,r.shape[0])\n",
|
||
|
"# r['tier'] = np.repeat(FOLDER,r.shape[0])\n",
|
||
|
" p = p.append(r)\n",
|
||
|
"\n",
|
||
|
"writer = pd.ExcelWriter(PATH,engine='xlsxwriter')\n",
|
||
|
"p = p.rename(columns={'marketer_x':'sample marketer'})\n",
|
||
|
"p.index = np.arange(p.shape[0]).astype(np.int64)\n",
|
||
|
"p.to_excel(writer,FOLDER)\n",
|
||
|
"writer.save()\n",
|
||
|
"p.head() "
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 100,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"<matplotlib.axes._subplots.AxesSubplot at 0x7fe67aa7a9d0>"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 100,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAEKCAYAAAA4t9PUAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAIABJREFUeJzt3XuUXFWd9vHvk+4khBiIJJkIuZA4oBIgBKZBHC+BvDITRgkXGQXvLpVZM/B6QRzhxVEHZTFcFHVgdCGgMKLIZDCGeUHgJUFmRmSlA7kAMUwEJRcIzSUBQkzS3b/3j3M6VBd9OdW7qivd9XzW6pWqXeec2qerU8/Ze5+zjyICMzOzgRpR7wqYmdnQ5iAxM7MkDhIzM0viIDEzsyQOEjMzS+IgMTOzJA4SMzNL4iAxM7MkDhIzM0vSXO8KDIaJEyfGjBkz6l0NM7MhZfny5c9GxKT+lmuIIJkxYwatra31roaZ2ZAi6Q9FlnPXlpmZJXGQmJlZEgeJmZklcZCYmVkSB4mZmSVxkJiZWRIHiZmZJXGQmJlZEgeJmZklcZCYmVkSB4mZmSVxkJiZWRIHiZmZJXGQmJlZEgeJmZklqWmQSJovaa2kdZLO7+H1AyXdI2mVpHslTS157VJJD+c/HygpnynpgXybP5M0qpb7YGZmfatZkEhqAq4GTgRmAWdKmlW22BXAjRExG7gIuCRf9z3AUcAc4K3AeZL2yde5FLgyIg4CXgA+Wat9MDOz/tWyRXIMsC4iHo+IncDNwMlly8wCluSPl5a8Pgu4LyLaI2IbsAqYL0nAPGBhvtwNwCk13AczM+tHLYNkCrC+5PmGvKzUSuC0/PGpwDhJE/Ly+ZL2ljQROB6YBkwAtkREex/bNDOzQVTvwfbzgLmSHgLmAhuBjoi4C7gd+DXwU+B+oKOSDUs6S1KrpNa2trYqV9vMzLrUMkg2krUiukzNy3aLiE0RcVpEHAlcmJdtyf+9OCLmRMQJgIDHgOeA8ZKae9tmybaviYiWiGiZNGlSNffLzMxK1DJIlgEH52dZjQLOABaXLiBpoqSuOlwAXJ+XN+VdXEiaDcwG7oqIIBtLOT1f52PAL2q4D2Zm1o+aBUk+jnEOcCewBrglIh6RdJGkBflixwFrJT0GTAYuzstHAv8p6VHgGuDDJeMiXwLOlbSObMzkulrtg5mZ9U/ZQf7w1tLSEq2trfWuhpnZkCJpeUS09LdcvQfbzcxsiHOQmJlZEgeJmZklcZCYmVkSB4mZmSVxkJiZWRIHiZmZJXGQmJlZEgeJmZklcZCYmVkSB4mZmSVxkJiZWRIHiZmZJXGQmJlZEgeJmZklcZCYmVkSB4mZmSVxkJiZWRIHiZmZJXGQmJlZEgeJmZklcZCYmVkSB4mZmSVxkJiZWRIHiZmZJXGQmJlZEgeJmZklcZCYmVmSmgaJpPmS1kpaJ+n8Hl4/UNI9klZJulfS1JLXLpP0iKQ1kr4rSXn5vfk2V+Q/f1LLfTAzs77VLEgkNQFXAycCs4AzJc0qW+wK4MaImA1cBFySr/vnwNuB2cBhwNHA3JL1PhQRc/KfZ2q1D2Zm1r9atkiOAdZFxOMRsRO4GTi5bJlZwJL88dKS1wPYCxgFjAZGAptrWFczMxugWgbJFGB9yfMNeVmplcBp+eNTgXGSJkTE/WTB8lT+c2dErClZ74d5t9Y/dHV5lZN0lqRWSa1tbW3V2B8zM+tBvQfbzwPmSnqIrOtqI9Ah6SDgEGAqWfjMk/TOfJ0PRcThwDvzn4/0tOGIuCYiWiKiZdKkSbXeDzOzhlXLINkITCt5PjUv2y0iNkXEaRFxJHBhXraFrHXym4h4OSJeBu4A3pa/vjH/9yXgJ2RdaGZmVie1DJJlwMGSZkoaBZwBLC5dQNJESV11uAC4Pn/8JFlLpVnSSLLWypr8+cR83ZHAe4GHa7gPZmbWj5oFSUS0A+cAdwJrgFsi4h
FJF0lakC92HLBW0mPAZODivHwh8DtgNdk4ysqIuI1s4P1OSauAFWQtnB/Uah/MzKx/ioh616HmWlpaorW1td7VMDMbUiQtj4iW/par92C7mZkNcQ4SMzNL4iAxM7MkDhIzM0viIDEzsyQOEjMzS+IgMTOzJA4SMzNL4iAxM7MkDhIzM0viIDEzsyQOEjMzS+IgMTOzJA4SMzNL4iAxM7MkDhIzM0viIDEzsyQOEjMzS+IgMTOzJA6SGtqybQdrN21hy7Yd9a6KmVnNNNe7AsPVkoc38u3bVtHUNIKOjk4+f9Jsjj9sSr2rZWZWdW6R1MCWbTv49m2r2NHeySs72tnR3smVt61yy8TMhiUHSQ1s3rqdpqbuv9qmphFs3rq9TjUyM6sdB0kNTN53DDt2tncr27Gzncn7jqlTjczMasdBUiMaoT6fm5kNF/0GiaQmSUsHozLDxeat2xnV3NStbFRzk7u2zGxY6jdIIqID6JS07yDUZ1iYvO8YOjo6u5V1dHQOetdW6unHPn3ZzIooevrvy8BqSXcD27oKI+Izfa0kaT7wHaAJuDYi/qns9QOB64FJwPPAhyNiQ/7aZcB7yMLubuCzERGS/gz4ETAGuL2rvOB+DIrxY0fz+ZNmc2XZ6b/jx44etDqknn7s05fNrKiiQXJr/lOYpCbgauAEYAOwTNLiiHi0ZLErgBsj4gZJ84BLgI9I+nPg7cDsfLn/AuYC9wLfAz4NPEAWJPOBOyqp22A4/rApHDlzIpu3bmfyvmMGNURKTz+mPWsZXXnbKo6cObFQPVLXN7PGUihI8i/6McD0iFhbcNvHAOsi4nEASTcDJwOlQTILODd/vBRY1PWWwF7AKEDASGCzpP2BfSLiN/k2bwROYQ8MEshaJvX44t19+nH7q91rXacfF6lP6vpm1lgKnbUl6SRgBfDL/PkcSYv7WW0KsL7k+Ya8rNRK4LT88anAOEkTIuJ+smB5Kv+5MyLW5Otv6GebDS91jGZPGeMxs6Gh6Om/XyNrYWwBiIgVwBur8P7nAXMlPUTWdbUR6JB0EHAIMJUsKOZJemclG5Z0lqRWSa1tbW1VqOrgG+hgd9cYzejmEew9upnRzSMqGqNJXd/MGkvRMZJdEbFV6nYtRGdvC+c2AtNKnk/Ny3aLiE3kLRJJrwPeFxFbJH0a+E1EvJy/dgfwNuBf8+30us2SbV8DXAPQ0tKyRw3GF5E62J06RlONMZ4t23bUZYzIzAZX0SB5RNIHgSZJBwOfAX7dzzrLgIMlzST7sj8D+GDpApImAs9HRCdwAdkZXABPAp+WdAnZGMlc4NsR8ZSkFyUdSzbY/lHgnwvuw5BRrcHu1DGalPWrcdaXg8hsaCjatfW/gUOBHcBPgK3AZ/taISLagXOAO4E1wC0R8YikiyQtyBc7Dlgr6TFgMnBxXr4Q+B2wmmwcZWVE3Ja/9nfAtcC6fJk9cqA9xVCfq6sak1YueXgjH/3uEs7/8QN89LtLWPpwjw1PM9sDFG2RvCciLgQu7CqQ9NfAv/W1UkTcTnaKbmnZV0oeLyQLjfL1OoC/6WWbrcBhBes9JO0pg90DbRGknvXl04/NhpaiLZILCpZZFewJg90pLYLUIBzqLTKzRtNni0TSicBfAVMkfbfkpX2A9p7Xsmqo52B3aosg9cr+arXIPMZiNjj669raBLQCC4DlJeUvAZ+vVaUsU6/B7mpckJgShNWYYmY4TPHS6EHY6Ps/lPQZJBGxElgp6Sf5spVc2W51ktqi2BPGaI4/bAp/OnkffrtpC285YDzTJ40rvO5wGGMZDme9pbz/cDgQaCRFB9vnk82LNQqYKWkOcFFELOh7NauH1BbFntAiqHeLqhrq1bUI9Q+ilPcfDgcCjaZokHyN7Mr2eyG7sj2/PsT2QJP3HcPO9o5uZTvbOypqUaR0TdV70sg9YYxlycMb+dbilQgRBF9YcMSgBWG9gyj1/at1IDCUW2RDTcqV7UPuavGhJuUPMTqjz+
e1tHnrdjrLZvbvjKho0siU9avVorpy8Uo0YgTR2cm5FQTBlm07uGzRCrJdyPbj0kUrKgrClAOB1N9fNYIg5f2rcSA
|
||
|
"text/plain": [
|
||
|
"<Figure size 432x288 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {
|
||
|
"needs_background": "light"
|
||
|
},
|
||
|
"output_type": "display_data"
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAEKCAYAAAA4t9PUAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAIABJREFUeJzt3X2cXVV97/HPN8kMeSKmJCOXkjCDikoqaQIDQsEmsUDDS18GKEVSsWBj8V6l115LK1yq1ijlorRWHlpFw1PNDaXUh9SiiWDwgfqQiYEEnCbNxYQkoA7UQAIJmSS/+8feE84cJjP7zDpnTs7M9/16ndecs87e56ydmZzv2WutvZYiAjMzs8EaVe8KmJlZY3OQmJlZEgeJmZklcZCYmVkSB4mZmSVxkJiZWRIHiZmZJXGQmJlZEgeJmZklGVPvCgyFqVOnRltbW72rYWbWUNasWfNMRLQMtN2ICJK2tjY6OjrqXQ0zs4YiaUuR7dy0ZWZmSRwkZmaWxEFiZmZJHCRmZpbEQWJmZkkcJGZmlsRBYmZmSRwkZmaWxEFiZmZJahokkuZL2iBpk6Sr+3i+VdKDktZJekjStJLnbpD0WH57Z0n5nZJ+JumR/DarlsdgZmb9q1mQSBoN3AqcB8wAFkqaUbbZjcDdETETWAxcn+/7NuBkYBbwZuAqSZNK9vvziJiV3x6p1TGYmdnAanlGchqwKSKeiIi9wD3AgrJtZgDfzu+vKnl+BvDdiNgXES8A64D5NayrmZkNUi2D5Fhga8njbXlZqUeBC/P7FwBHSpqSl8+XNF7SVGAeML1kv+vy5rDPSDqirzeXdIWkDkkdXV1d1TgeMzPrQ707268C5khaC8wBtgP7I2IlcD/w78Ay4AfA/nyfa4A3AqcCRwEf7uuFI+K2iGiPiPaWlgFnQTYzs0GqZZBsp/dZxLS87KCIeCoiLoyI2cC1edmO/Od1eR/IOYCAjXn505F5CbiDrAnNzMzqpJZBsho4QdLxkpqBS4DlpRtImiqppw7XALfn5aPzJi4kzQRmAivzx8fkPwWcDzxWw2MwM7MB1Gxhq4jYJ+lKYAUwGrg9Ih6XtBjoiIjlwFzgekkBfBf4QL57E/C9LCt4Hrg0Ivblzy2V1EJ2lvII8N9rdQxmZjYwRUS961Bz7e3t4RUSzcwqI2lNRLQPtF29O9vNzKzBOUjMzCyJg8TMzJI4SMzMLImDxMzMkjhIzMwsiYPEzMySOEjMzCyJg8TMzJI4SMzMLImDxMzMkjhIzMwsiYPEzMySOEjMzCyJg8TMzJI4SMzMLImDxMzMkjhIzMwsiYPEzMySOEjMzCyJg8TMzJI4SMzMLImDxMzMkjhIzMwsiYPEzMySOEjMzCxJTYNE0nxJGyRtknR1H8+3SnpQ0jpJD0maVvLcDZIey2/vLCk/XtKP8tf8J0nNtTwGMzPrX82CRNJo4FbgPGAGsFDSjLLNbgTujoiZwGLg+nzftwEnA7OANwNXSZqU73MD8JmIeB3wK2BRrY7BzMwGVsszktOATRHxRETsBe4BFpRtMwP4dn5/VcnzM4DvRsS+iHgBWAfMlyTgrcB9+XZ3AefX8BjMzGwAtQySY4GtJY+35WWlHgUuzO9fABwpaUpePl/SeElTgXnAdGAKsCMi9vXzmmZmNoTq3dl+FTBH0lpgDrAd2B8RK4H7gX8HlgE/APZX8sKSrpDUIamjq6urytU2M7MetQyS7WRnET2m5WUHRcRTEXFhRMwGrs3LduQ/r4uIWRFxDiBgI/AsMFnSmEO9Zslr3xYR7RHR3tLSUs3jMjOzErUMktXACfkoq2bgEmB56QaSpkrqqcM1wO15+ei8iQtJM4GZwMqICLK+lIvyfS4DvlbDYzAzswHULEjyfowrgRVAJ3BvRDwuabGkd+SbzQU2SNoIHA1cl5c3Ad+T9FPgNuDSkn6RDwMfkrSJrM9kSa2OwczMBq
bsS/7w1t7eHh0dHfWuhplZQ5G0JiLaB9qu3p3tZmbW4BwkZmaWxEFiZmZJHCRmZpbEQWJmZkkcJGZmlsRBYmZmSRwkZmaWxEFiZmZJHCRmZpbEQWJmZkkcJGZmlqTfIJE0StJvDVVlzMys8fQbJBFxALh1iOpiZmYNqEjT1oOSfk+Sal4bMzNrOEWC5H3APwN7JT0vaaek52tcLzMzaxBjBtogIo4cioqYmVljGvCMRJlLJX0kfzxd0mm1r5qZmTWCIk1bfw+cAfxB/ngX7oA3M7PcgE1bwJsj4mRJawEi4leSmmtcLzMzaxBFzki6JY0GAkBSC3CgprUyM7OGUSRIbgK+Arxa0nXA94Hra1orMzNrGEVGbS2VtAb4HUDA+RHRWfOamZlZQxgwSCT9Y0S8G/iPPsrMzGyEK9K09RulD/L+klNqU53hpburixdWr6a7q6veVTEzq5lDBomkayTtBGaWXNG+E/gl8LUhq2GDenbZMta3trLxnHNY39rKs8uW1btKZmY1ccggiYjr86vaPx0RkyLiyPw2JSKuKfLikuZL2iBpk6Sr+3i+VdKDktZJekjStJLnPiXpcUmdkm7qmesr326DpEfy26sHcdw11d3VxZZFi4jduznw3HPE7t1sWbTIZyZmNiwVadq6djBXtudNYLcC5wEzgIWSZpRtdiNwd0TMBBaTjwbLp64/E5gJvAk4FZhTst+7ImJWfvtlgWMYUns3b66o3MyskRUJklsZ3JXtpwGbIuKJiNgL3AMsKNtmBvDt/P6qkucDGAs0A0cATcAvCrznYWHUxInE7t29ymL3bkZNnFinGpmZ1U6RIHlzRHwA2APZle1kH/ADORbYWvJ4W15W6lHgwvz+BcCRkqZExA/IguXp/LaibMjxHXmz1kcOx+ntD+zaBePG9S4cOzYrNzMbZup9ZftVwJx8+pU5wHZgv6TXAScC08jC562S3pLv866IOAl4S37rcxiypCskdUjq6BrivonmtjbK000SzW1tQ1oPM7OhUMsr27cD00seT8vLDoqIpyLiwoiYDVybl+0gOzv5YUTsiohdwDfImteIiO35z53A/yVrQnuFiLgtItojor2lpaVAdaunqaWF1iVL0LhxjJo0CY0bR+uSJTQNcT3MzIZCLa9sXw2cIOl4sgC5hJf7WQCQNBX4r3xJ32uA2/OnngT+WNL1+XvOAf5O0hhgckQ8I6kJeDvwQIG6DLkpCxcy6eyz2bt5M81tbQ4RMxu2ilzZvigiltD7yvb/ExGvGM5bKiL2SboSWAGMBm6PiMclLQY6ImI5MBe4XlIA3wU+kO9+H/BWYD1Zk9o3I+JfJU0AVuQhMposRL5Q0REPoaaWFgeImQ17ioj+N5DuB5ZGxNL88a3A2IhYNAT1q4r29vbo6OiodzXMzBqKpDUR0T7QdkXWI/k9YLmkA8B8YEcjhchI1t3V5aY1M6u5/qZIOUrSUcA44L3AXwA7gY/n5XYY8xQtZjZUDtm0Jeln5EN+e4pK7kdEvKaWFaumRm3aGuwZRXdXF+tbW3tdFKlx4zhpyxafmZhZYclNWxFxvKRRwBkR8XBVa2cDenbZMrYsWoSam4m9e2ldsoQpCxcW2nfv5s3ZfqVB0tTE3s2bHSRmVnX9XkeSD8u9ZYjqYrnUSR+b29qIvXt7lUV3d8UXRHoafDMrosgFiQ9K+r3DcSqS4arnjKJUzxlFEdW4INJ9LGZWVJHhvzuBCcA+svm2RNZHMqn21auORusjqVYfh/tYzCxF0T6SAc9I8jVIRkVEc8m6JA0TIo2oWlOsNLW0MOHUUyveL/WMyMxGliLXkSDp14ATyKZ2ByAivlurSll9p1ipVh+LmY0MA56RSHov2fQlK4CP5z//qrbVMhj8GUU13nfKot7XnE5ZtMjNWmbWpyKd7R8kW6FwS0TMA2YDO2paK6uKwY666u7q4tklS3qVPbtkiUdvmVmfigTJnojYAyDpiIj4D+ANta2WQdrw25
RRV+4jMbNKFAmSbZImA18FviXpa8CW2lbLUoLA16GY2VAqMmrrgojYERF/BXwEWAKcX+uKHQ52Pvww2z/2MXY+PLQ
|
||
|
"text/plain": [
|
||
|
"<Figure size 432x288 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {
|
||
|
"needs_background": "light"
|
||
|
},
|
||
|
"output_type": "display_data"
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAEKCAYAAAA4t9PUAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAIABJREFUeJzt3XmcXGWd7/HPr9cknQ2SNmC2DrJGEhJstlFJgopwhRACIrmKROIyMzqCgAtuM4MiwzKDMDIqY4IweqMIiMGrBkRQZIRLh4QECGEiSSABQxPTWTpJr7/7xznVqa70cqqeqq509/f9etUrVU+d59Rzqjrnd571mLsjIiKSq5JiF0BERPo3BRIREQmiQCIiIkEUSEREJIgCiYiIBFEgERGRIAokIiISRIFERESCKJCIiEiQsmIXoC+MHTvWa2pqil0MEZF+ZcWKFW+6e3Vv2w2KQFJTU0NdXV2xiyEi0q+Y2aYk26lpS0REgiiQiIhIEAUSEREJokAiIiJBFEhERCSIAomIiARRIBERkSAKJCIiEkSBREREgiiQiIhIEAUSEREJokAiIiJBFEhERCSIAomIiARRIBERkSAFDSRmdpaZrTOz9Wb2pS7en2xmj5jZajN7zMwmpL13g5k9Fz8+lJY+xcyeivf5UzOrKOQxiIhIzwoWSMysFLgdOBuYCiwws6kZm90M3O3u04FrgevjvB8ATgRmAKcAV5vZyDjPDcAt7n4ksB1YVKhjEBGR3hWyRnIysN7dX3b3ZuAnwHkZ20wFfhc/fzTt/anAH9y91d0bgdXAWWZmwBnAvfF2dwHzCngMIiLSi0IGkvHAq2mvN8dp6Z4F5sfPzwdGmNmYOP0sMxtmZmOBOcBEYAzQ4O6tPexTRET6ULE7268GZpnZSmAWsAVoc/eHgF8B/w0sBf4EtGWzYzP7pJnVmVldfX19nostIiIphQwkW4hqESkT4rQO7v6au89395nAV+K0hvjf69x9hru/DzDgJWAbMNrMyrrbZ9q+73D3Wnevra6uzudxiYhImkIGkqeBo+JRVhXAxcCy9A3MbKyZpcpwDbAkTi+Nm7gws+nAdOAhd3eivpQL4zyXAr8o4DGIiEgvChZI4n6MzwDLgbXAPe7+vJlda2Zz481mA+vM7CVgHHBdnF4OPG5mLwB3AB9J6xf5InClma0n6jNZXKhjEBGR3ll0kT+w1dbWel1dXbGLISLSr5jZCnev7W27Yne2i4hIP6dAIiIiQRRIREQkiAKJiIgEUSAREZEgCiQiIhJEgURERIIokIiISBAFEhERCaJAIiIiQRRIREQkiAKJiIgEUSAREZEgCiQiIhJEgURERIIokIiISBAFEhERCaJAIiIiQRRIREQkiAKJiIgEUSAREZEgCiQiIhJEgURERIIokIiISBAFEhERCaJAIiIiQRRI+kBLfT2NTz9NS319sYsiIpJ3CiQFtm3pUtZMnsxLc+awZvJkti1dWuwiiYjkVUEDiZmdZWbrzGy9mX2pi/cnm9kjZrbazB4zswlp791oZs+b2Vozu83MLE5/LN7nqvjxlkIeQ4iW+no2LVyI791Le2MjvncvmxYuVM1ERAaUggUSMysFbgfOBqYCC8xsasZmNwN3u/t04Frg+jjv3wDvBKYDxwMnAbPS8n3Y3WfEjzcKdQyh9qxciTc3d0rz5mb2rFxZpBKJiORfIWskJwPr3f1ld28GfgKcl7HNVOB38fNH0953YAhQAVQC5cDWApZVRERyVMhAMh54Ne315jgt3bPA/Pj5+cAIMxvj7n8iCiyvx4/l7r42Ld+dcbPW11JNXpnM7JNmVmdmdfVFakoaNnMmlJd3Tiwvj9JFRAaIYne2Xw3MMrOVRE1XW4A2MzsSOA6YQBR8zjCzd8d5Puzu04B3x49Lutqxu9/h7rXuXltdXV3o4+hSeXU1NXfdBUOGYFVVMGQINXfdRXmRyiMiUg
hlBdz3FmBi2usJcVoHd3+NuEZiZsOBC9y9wcw+ATzp7rvj934NnAY87u5b4ry7zOz/EDWh3V3A4wgyZsECRr73vTRv3EhFTY2CiIgMOIWskTwNHGVmU8ysArgYWJa+gZmNNbNUGa4BlsTPXyGqqZSZWTlRbWVt/HpsnLccOAd4roDHkBfl1dVUnXSSgoiIDEgFCyTu3gp8BlgOrAXucffnzexaM5sbbzYbWGdmLwHjgOvi9HuBPwNriPpRnnX3B4k63peb2WpgFVEN5z8LdQwiItI7c/dil6Hgamtrva6urtjFEBHpV8xshbvX9rZdsTvbRUSkn1MgGQS01peIFJICST8QEgg61vp63/u01peIFIQCyUEuJBC01NezadGiaK2vHTuitb4WLVLNRETyqsdAYmYl8bpXUgShgaB540asoqJTmpWX07xxY1ZlULOYiPSkx0Di7u1ECy9KEYQGgoqamgMXjWxpoaKmJlF+NYuJSBJJmrYeMbMLulvTSgonNBCUV1czefFibOhQSkaOxIYOZfLixYkmRqpZTESSShJIPgX8DGg2s51mtsvMdha4XEJYIEgZs2AB0zZt4ujf/pZpmzYxZsGCRPny0SwmIoNDr2ttufuIviiIdC0fa3WVV1dnnS+0NiQig0evNRKLfMTMvha/nmhmJxe+aJJSjLW6OtWGqqpyqg2JyOCQpGnrP4hW3v3f8evdqAO+XwkZeeXuePyviEhXkgSSU9z908A+AHffTnTnQukHch15lepsZ98+vLER9u1TZ7uIdClJIGmJ77/uAGZWDbQXtFTSSa41ipCRV/nqbNc8FJGBL0kguQ34OfAWM7sO+CNwfUFLdZDZ9cQTbPnHf2TXE0/0+WeHzOVo3riRzAYpd08UDPLR2a55KCKDQ6Jl5M3sWOA9gAGPZNw//aAXsoz8ujPPZPfDD3e8Hn7mmRyzfHm+itajlvp61kyejO/d25FmQ4cybdOmRJ3ee9eu5YWpUw9In/rCCww97rhe829bupRNixZh5eV4SwuTFy9OPHw4tOwiUnxJl5Hvdfivmf2Xu18CvNhF2oC264knOgURgN0PPcSuJ55gxDvfWfDPTzUvdToZx81LSU7G7bt3Y0OHHnAyb9+9O9Hnhww9Di27iPQfSZq23p7+Iu4veUdhinNw2fnQQ1ml51to81J322XTPJXr0ON8zUNRH4vIwa/bQGJm15jZLmB62oz2XcAbwC/6rIRFNPLMM7NKz7fy6mrGLFrUKW3MokWJT+r5mBmfq3x8tvpYRPqHXvtIzOx6d7+mj8pTEEF9JO9/P7vTaiC59JG01Nfn1DyUr36GXD8/NG9IfvWxiBRfPm+1+5XBPLN97MKFUFHR8Ri7cGFW+bctXcrqSZN4cfYcVk+alPWoq3wMwc21eaqYNQINPxbpP5IEktsZpDPbOyblNTd3PLKZlNdSX8+Gj34U9u2DPdGkvg0f/Wji/BU1NZ2uyAF8374+We8qH6v/poLoujnZB1ENPxbpPzSzvQehV8Vv/OkpaG3tnNjaGqUnlNn02FdLlYQee0t9PRsvvbTTzPiNl16aOBCF9g9pGXyRvqOZ7T0IvSpuaGzOKj1T88aNWFnnEdpWVtYnS7mHHvuelSuhpaVzYktLlJ5AS309b37/+53S3vz+9/v07pAikoxmtvcg9Kr4sNNOpq20cyBoKy3jsNOSdTGVDB9+YNPW3r2UDB+eKH+IYq/+GxqItAy+SN/pNZC4+4+BLxAFj9eBee5+T6ELdjBoqa9n2+LFndK2LV6c+Kq4tLqa+y+4kpayCprKh9BSVsH9F1xJacKTcfvu3TB0aOfEIUMSTyhMKcbqv8NmzjywRlBRwbCZM7MuQy6KOfRZZLBJMrN9kbsvpvPM9n9x9y8VtGQHgdDZ2Vt37OWld5zBjVNOYHTDVhpGj8MPHcPWHXsZXVXZa/6KmhoMOq2XZWZZdzhvWrQIKynB29sTL3PSafXfOG3TokWMfO97Ex17eX
U1k3/4wwM+O+mJfNjMmVBaCm1t+xNLS7MKRPm4KVjo8GeRwSBJ09YFZvbh1Aszux0YFP+jQptHxo0aSltbO41Vo9g
|
||
|
"text/plain": [
|
||
|
"<Figure size 432x288 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {
|
||
|
"needs_background": "light"
|
||
|
},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"p.plot(kind='scatter',x='sample %',y='marketer', c = '#4682B4')\n",
|
||
|
"p.plot(kind='scatter',x='sample %',y = 'sample marketer', c='#CC0000')\n",
|
||
|
"ax = p.plot(kind='scatter',x='sample %',y='marketer', c = '#4682B4')\n",
|
||
|
"p.plot(kind='scatter',x='sample %',y = 'sample marketer', c='#CC0000',ax=ax)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"\"\"\"\n",
|
||
|
"    This experiment consists of:\n",
|
||
|
" 1: randomly selecting x % of the records to be sampled\n",
|
||
|
" 2: running a group by on the sample\n",
|
||
|
"        3: calling groupby on the population which the sample was drawn from\n",
|
||
|
"\"\"\"\n",
|
||
|
"SQL_ORIGINAL=\"SELECT * FROM deid_risk.risk_60k2\"\n",
|
||
|
"SQL_DEID = \"SELECT * FROM deid_risk.deid_risk_60k limit 20000\"\n",
|
||
|
"# df = pd.read_gbq(SQL_DEID,private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json')\n",
|
||
|
"\n",
|
||
|
"#\n",
|
||
|
"FLAG='REGISTERED-TIER-9'\n",
|
||
|
"if FLAG == 'REGISTERED-TIER' :\n",
|
||
|
" Yi = pd.DataFrame(dfr)\n",
|
||
|
" FOLDER='registered'\n",
|
||
|
"else:\n",
|
||
|
" Yi = pd.DataFrame(dfc)\n",
|
||
|
" FOLDER='controlled'\n",
|
||
|
"N = 20\n",
|
||
|
"N_ = str(N)\n",
|
||
|
"SUFFIX = FOLDER+'-tier-'+str(N)+'-experiment.xlsx'\n",
|
||
|
"PATH = os.sep.join(['out',SUFFIX])\n",
|
||
|
"\n",
|
||
|
"\n",
|
||
|
"columns = list(set(Yi.columns.tolist()) - set(['person_id']))\n",
|
||
|
"merged_columns = list(columns)+['field_count']\n",
|
||
|
"m = {}\n",
|
||
|
"p = pd.DataFrame()\n",
|
||
|
"n = 0\n",
|
||
|
"y_i= pd.DataFrame(Yi).deid.risk(id='person_id',quasi_id=columns)\n",
|
||
|
"for index in np.arange(5,105,5):\n",
|
||
|
"# np.random.seed( int(time())+np.random.randint(0,100)+index ) \n",
|
||
|
"# n = np.random.randint(10,35) #-- randomly pick a number within an interval\n",
|
||
|
" \n",
|
||
|
" for n in np.repeat(index,20) :\n",
|
||
|
"# np.random.seed( np.random.randint(0,int(time())+np.random.randint(0,1000)+index+n ) \n",
|
||
|
" #\n",
|
||
|
"        # we will randomly sample n% of the rows from the dataset\n",
|
||
|
" i = np.random.choice(Yi.shape[0],((Yi.shape[0] * n)/100),replace=False)\n",
|
||
|
" x_i= pd.DataFrame(Yi).loc[i].deid.risk(id='person_id',quasi_id = columns)\n",
|
||
|
" \n",
|
||
|
"# y_i= pd.DataFrame(Yi).deid.risk(id='person_id',quasi_id=columns)\n",
|
||
|
"\n",
|
||
|
"\n",
|
||
|
" r = pd.merge(x_i,y_i,on=merged_columns,how='inner')\n",
|
||
|
" if r.shape[0] == 0 :\n",
|
||
|
" print 'skipping ',n\n",
|
||
|
" continue\n",
|
||
|
"\n",
|
||
|
" r['marketer'] = r.apply(lambda row: (row.group_size_x / row.group_size_y) / row.patient_count_x,axis=1 )\n",
|
||
|
" r = r.groupby(columns+['marketer_x'],as_index=False).sum()[columns+['marketer','marketer_x']]\n",
|
||
|
" r['sample %'] = np.repeat(n,r.shape[0])\n",
|
||
|
" r['tier'] = np.repeat(FOLDER,r.shape[0])\n",
|
||
|
" p = p.append(r)\n",
|
||
|
"\n",
|
||
|
"writer = pd.ExcelWriter(PATH,engine='xlsxwriter')\n",
|
||
|
"p = p.rename(columns={'marketer_x':'sample marketer'})\n",
|
||
|
"p.index = np.arange(p.shape[0]).astype(np.int64)\n",
|
||
|
"p.to_excel(writer,FOLDER)\n",
|
||
|
"writer.save()\n",
|
||
|
"p.head()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"scrolled": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"ax = p.plot(kind='scatter',x='sample %',y='marketer',c='r',ylim=[p.marketer.min(),p.marketer.max()])\n",
|
||
|
"p.plot(kind='scatter',x='sample %',y='sample marketer',c='#4682B4')\n",
|
||
|
"ax = p.plot(kind='scatter',x='sample %',y='marketer',c='r')\n",
|
||
|
"p.plot(kind='scatter',x='sample %',y='sample marketer',c='#4682B4',ax=ax)\n",
|
||
|
"\n",
|
||
|
"_p = pd.DataFrame(p)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"p.head()\n",
|
||
|
"\n",
|
||
|
"# writer = pd.ExcelWriter('out/foo.xlsx',engine='xlsxwriter')\n",
|
||
|
"# workbook = writer.book\n",
|
||
|
"# r.groupby('field_count',as_index=False).sum()[['field_count','marketer_x']].to_excel(writer,'page-0')\n",
|
||
|
"# chart = workbook.add_chart({'type':'line'})\n",
|
||
|
"# o = r.groupby('field_count',as_index=False).sum()[['field_count','marketer_x']]\n",
|
||
|
"# # values = o.marketer_x.tolist()\n",
|
||
|
"# # values = [['page-0',item] for item in values]\n",
|
||
|
"# # chart.add_series({\"values\":values})\n",
|
||
|
"# # chart.add_series({'values':'=page-0!$B$2:$B$5'})\n",
|
||
|
"\n",
|
||
|
"# worksheet = writer.sheets['page-0']\n",
|
||
|
"# worksheet.insert_chart('G2',chart)\n",
|
||
|
"# writer.save()\n",
|
||
|
"\n",
|
||
|
"str(10)\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"help(chart.add_series)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"cols = list(set(dfr.columns.tolist()) - set(['person_id'])) + ['field_count']\n",
|
||
|
"r = pd.merge(x_i,y_i,on=cols,how='inner')\n",
|
||
|
"r['marketer'] = r.apply(lambda row: (row.group_count_x/row.group_count_y)/row.patient_count_y ,axis=1)\n",
|
||
|
"# r['field_count'] = r['field_count_x']\n",
|
||
|
"o = r.groupby(cols,as_index=False).sum()[cols+['marketer']]\n",
|
||
|
"o.groupby(['field_count'],as_index=False).mean()\n",
|
||
|
"# o.groupby('field_count',as_index=False).mean().plot.line(x='field_count',y='marketer')\n",
|
||
|
"# r.head()\n",
|
||
|
"# N = r.patient_count_y.mean()\n",
|
||
|
"# r['marketer'] = r.apply(lambda row: row.group_count_x / row.group_count_y,axis=1)\n",
|
||
|
"# m = r.groupby(['field_count'],as_index=False).mean()[['field_count','marketer']]\n",
|
||
|
"# m.marketer = m.marketer / N\n",
|
||
|
"# m.groupby(['field_count']).mean().plot.line()\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"p.to_csv('out/x-2/single-runs-deid.csv',index=False)\n",
|
||
|
"p.groupby(['sample %']).mean()['marketer'].plot.line()\n",
|
||
|
"p.groupby(['sample %'],as_index=False).mean().plot.scatter(x='sample %',y='marketer')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"y = pd.DataFrame({\"name\":['d','e','f','g'],\"age\":[12,40,20,30],\"income\":[100,200,300,400]})\n",
|
||
|
"x = pd.DataFrame({\"name\":['a','b','c'],\"age\":[10,20,40],\"income\":[120,100,200]})\n",
|
||
|
"\n",
|
||
|
"# x.join(y,how='outer',on='age')\n",
|
||
|
"x_ = pd.merge(x,y,on=['age','income'],how='outer')\n",
|
||
|
"Logger.log(action='merge',value=x_.shape)\n",
|
||
|
"Logger.cache"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#\n",
|
||
|
"# EXP_0\n",
|
||
|
"# Running the experiment on the Original dataset, with all the attributes\n",
|
||
|
"SCHEMA = \"deid_risk\"\n",
|
||
|
"df = pd.read_gbq(\"select person_id,birth_datetime,race,gender,sex_at_birth, city,state,zip from deid_risk.basic_risk60k \",private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',\n",
|
||
|
" dialect='standard')\n",
|
||
|
"\n",
|
||
|
"RUNS = 500\n",
|
||
|
"FLAG = 'basic-features'\n",
|
||
|
"r = df.deid.risk(id='person_id',num_runs=RUNS) #,field_count=11)\n",
|
||
|
"# r.to_csv('out/pandas-60k-'+FLAG+'-patients-'+str(RUNS)+'-x-runs.csv')\n",
|
||
|
"compiled = r.groupby('field_count',as_index=False)['marketer','prosecutor'].mean()\n",
|
||
|
"fi = compiled[['marketer','prosecutor']].plot.line().get_figure()\n",
|
||
|
"# fo\n",
|
||
|
"# r.plot.line(x='field_count',y='marketer')\n",
|
||
|
"compiled = r.groupby('field_count',as_index=False)['field_count','marketer','prosecutor'].mean()\n",
|
||
|
"fig_i = r.plot.scatter(x='field_count',y='marketer').get_figure()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#\n",
|
||
|
"# EXP_2 :\n",
|
||
|
"# This experiment will run the marketer risk against individual attributes\n",
|
||
|
"deid_df = pd.read_gbq(\"select person_id,birth_datetime,race,gender,sex_at_birth, city,state,zip from deid_risk.basic_deid_risk60k\",private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',\n",
|
||
|
" dialect='standard')\n",
|
||
|
"RUNS = 500\n",
|
||
|
"FLAG = 'basic-deid-features'\n",
|
||
|
"deid_r = deid_df.deid.risk(id='person_id',num_runs=RUNS) #,field_count=11)\n",
|
||
|
"# r.to_csv('out/pandas-60k-'+FLAG+'-patients-'+str(RUNS)+'-x-runs.csv')\n",
|
||
|
"deid_compiled = deid_r.groupby('field_count',as_index=False)['marketer','prosecutor'].mean()\n",
|
||
|
"fo = deid_compiled[['marketer','prosecutor']].plot.line().get_figure()\n",
|
||
|
"# fo\n",
|
||
|
"# r.plot.line(x='field_count',y='marketer')\n",
|
||
|
"# deid_compiled = deid_r.groupby('field_count',as_index=False)['field_count','marketer','prosecutor'].mean()\n",
|
||
|
"fig_o = deid_r.plot.scatter(x='field_count',y='marketer').get_figure()\n",
|
||
|
"\n",
|
||
|
"# orig_df = pd.read_gbq(\"select * from deid_risk.risk_60k2\",private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',\n",
|
||
|
"# dialect='standard')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# deid_r.to_csv('out/basic-attributes-deid-data-60k-patients.csv')\n",
|
||
|
"# r.to_csv('out/basic-attributes-raw-data-60k-patients.csv')\n",
|
||
|
"# deid_r.head()\n",
|
||
|
"p = pd.DataFrame()\n",
|
||
|
"p = deid_df.deid.risk(id='person_id',quasi_id=['birth_datetime','race','gender','sex_at_birth', 'city','state','zip'])\n",
|
||
|
"p = p.append(df.deid.risk(id='person_id',quasi_id=['birth_datetime','race','gender','sex_at_birth', 'city','state','zip']))\n",
|
||
|
"p.index = ['deid data','raw data']\n",
|
||
|
"p.to_csv('out/basic_run-7-fields.csv')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"cols = deid_r.columns[5:]\n",
|
||
|
"deid_r.index = np.arange(deid_r.shape[0]).astype(np.int64)\n",
|
||
|
"xdeid_ = deid_r[cols].sum().tolist()\n",
|
||
|
"xraw_ = r[cols].sum().tolist()\n",
|
||
|
"o = pd.DataFrame()\n",
|
||
|
"o['name'] = cols\n",
|
||
|
"o['raw'] = xraw_\n",
|
||
|
"o['deid']= xdeid_\n",
|
||
|
"\n",
|
||
|
"\n",
|
||
|
"o\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"columns = list( set(orig_df.columns) - set(['person_id']))\n",
|
||
|
"xo = pd.DataFrame()\n",
|
||
|
"xi = pd.DataFrame()\n",
|
||
|
"#\n",
|
||
|
"# Let's compute the risk for every attribute given the list of attributes we've gathered\n",
|
||
|
"#\n",
|
||
|
"for name in columns :\n",
|
||
|
" xo = xo.append(deid_df.deid.risk(id='person_id',quasi_id=[name])[['marketer','prosecutor']],sort=False)\n",
|
||
|
" xi = xi.append(orig_df.deid.risk(id='person_id',quasi_id=[name])[['marketer','prosecutor']],sort=False)\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#\n",
|
||
|
"# The following shows how much the deid process has affected each attribute\n",
|
||
|
"#\n",
|
||
|
"\n",
|
||
|
"RISK_THRESHOLD = 0.5\n",
|
||
|
"xo.index = columns\n",
|
||
|
"xi.index = columns\n",
|
||
|
"\n",
|
||
|
"ii = xi[xi.marketer > RISK_THRESHOLD].index\n",
|
||
|
"# zo = pd.concat([xi.loc[ii],xo.loc[ii]])\n",
|
||
|
"\n",
|
||
|
"zo = xi.loc[ii].join(xo.loc[ii],rsuffix='_deid')\n",
|
||
|
"#\n",
|
||
|
"# heatmap for original data\n",
|
||
|
"# fig_o = sns.heatmap(xi.loc[ii], cmap='RdYlGn_r', linewidths=0.5, annot=True).get_figure()\n",
|
||
|
"\n",
|
||
|
"\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#\n",
|
||
|
"# Running the experiment on the DEID dataset, with all the attributes\n",
|
||
|
"#\n",
|
||
|
"df = pd.read_gbq(\"select * from deid_risk.deid_risk_60k\",private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',\n",
|
||
|
" dialect='standard')\n",
|
||
|
"\n",
|
||
|
"RUNS = 1500\n",
|
||
|
"FLAG = 'deid-full-attr-dataset'\n",
|
||
|
"r = df.deid.risk(id='person_id',num_runs=RUNS) #,field_count=11)\n",
|
||
|
"# r.to_csv('out/pandas-60k-'+FLAG+'-patients-'+str(RUNS)+'-x-runs.csv')\n",
|
||
|
"compiled = r.groupby('field_count',as_index=False)['marketer','prosecutor'].mean()\n",
|
||
|
"fo = compiled[['marketer','prosecutor']].plot.line().get_figure()\n",
|
||
|
"# fo\n",
|
||
|
"# r.plot.line(x='field_count',y='marketer')\n",
|
||
|
"compiled = r.groupby('field_count',as_index=False)['field_count','marketer','prosecutor'].mean()\n",
|
||
|
"fig_o = r.plot.scatter(x='field_count',y='marketer').get_figure()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"r.groupby('field_count',as_index=False)['marketer','prosecutor'].var()[['marketer','prosecutor']].plot.line()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"#\n",
|
||
|
"# We are going to look into the attributes with a risk of a given threshold\n",
|
||
|
"# We will run the experiment (varied combinations of the list of attributes)\n",
|
||
|
"# The experiment is intended to capture the attributes responsible for increasing the marketer risk\n",
|
||
|
"#\n",
|
||
|
"DEID_DATASET = 'deid_risk.deid_risk_60k2'\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 2",
|
||
|
"language": "python",
|
||
|
"name": "python2"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 2
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython2",
|
||
|
"version": "2.7.15rc1"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2
|
||
|
}
|