privacykit/notebooks/risk.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "    This notebook is intended to show how to use the risk framework:\n",
    "    There are two basic usages:\n",
    "        1. Experiment\n",
    "            \n",
    "            Here the framework will select a number of random fields other than the patient id and compute risk for the selection.\n",
    "            This will repeat over a designated number of runs.\n",
    "            \n",
    "            The parameters to pass to enable this mode are id=<patient id>,num_runs=<number of runs>\n",
    "        2. Assessment\n",
    "        \n",
    "            Here the framework assumes you are only interested in a list of quasi identifiers and will run the evaluation once for a given list of quasi identifiers.\n",
    "            The parameters to enable this mode are id=<patient id>,quasi_id=<list of quasi ids>\n",
    "\"\"\"\n",
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "#\n",
    "#-- Loading a template file\n",
    "# The example is taken from a de-identification white paper\n",
    "# http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf\n",
    "#\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from io import StringIO\n",
    "# The sample is a unicode literal so that io.StringIO accepts it on Python 2 as well as Python 3\n",
    "# (the `unicode` builtin only exists on Python 2 and raises a NameError on Python 3)\n",
    "csv = u\"\"\"\n",
    "id,sex,age,profession,drug_test\n",
    "1,M,37,doctor,-\n",
    "2,F,28,doctor,+\n",
    "3,M,37,doctor,-\n",
    "4,M,28,doctor,+\n",
    "5,M,28,doctor,-\n",
    "6,M,37,doctor,-\n",
    "\"\"\"\n",
    "# The in-memory buffer stands in for a csv file on disk\n",
    "f = StringIO(csv)\n",
    "MY_DATAFRAME = pd.read_csv(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
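    "    Here's the pandas_risk code, inlined so the notebook can run on its own. \n",
    "    NOTE: Running this cell registers the accessor, every pandas dataframe then exposes it as <dataframe>.deid\n",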
    "\"\"\"\n",
    "@pd.api.extensions.register_dataframe_accessor(\"deid\")\n",
    "class deid :\n",
    "    \"\"\"\n",
    "        De-identification accessor, reachable as <dataframe>.deid once this cell has been executed.\n",
    "        It computes re-identification risk (marketer, prosecutor) for the pandas dataframe it wraps.\n",
    "    \"\"\"\n",
    "    def __init__(self,df):\n",
    "        self._df = df\n",
    "    \n",
    "    def risk(self,**args):\n",
    "        \"\"\"\n",
    "            Compute marketer and prosecutor risk over the groups of records that share the same field values.\n",
    "\n",
    "            @param  id          name of patient field (required)\n",
    "            @params num_runs    number of runs (default will be 100), every run draws a random selection of fields\n",
    "            @params quasi_id    list of quasi identifiers to be used (this will only perform a single run)\n",
    "            @return dataframe with one row per run and the columns marketer, prosecutor, selected\n",
    "                        marketer    number of groups divided by the number of records counted\n",
    "                        prosecutor  1 divided by the size of the smallest group\n",
    "                        selected    number of fields used for the run\n",
    "            @raise  KeyError    if id is not provided\n",
    "            @raise  ValueError  if no field other than the patient field is left to evaluate\n",
    "        \"\"\"\n",
    "        \n",
    "        id  = args['id']\n",
    "        if 'quasi_id' in args :\n",
    "            num_runs    = 1\n",
    "            candidates  = args['quasi_id']\n",
    "        else :\n",
    "            num_runs    = args['num_runs'] if 'num_runs' in args else 100\n",
    "            candidates  = self._df.columns\n",
    "        #\n",
    "        # The patient field is never a quasi identifier and its name is compared as a whole\n",
    "        # (set(id) would split the name into single characters and leave the patient field in the list)\n",
    "        # Duplicates are dropped and the order of the fields is kept, so runs are repeatable under a fixed seed\n",
    "        #\n",
    "        columns = []\n",
    "        for name in candidates :\n",
    "            if name != id and name not in columns :\n",
    "                columns.append(name)\n",
    "        k = len(columns)\n",
    "        if k == 0 :\n",
    "            raise ValueError(\"No field other than the patient field '%s' is available to compute risk\" % id)\n",
    "        rows = []\n",
    "        for i in range(0,num_runs) :\n",
    "            #\n",
    "            # let's choose a random number of columns and compute marketer and prosecutor risk\n",
    "            # Once the fields are selected we run a groupby clause\n",
    "            #\n",
    "            if 'quasi_id' not in args :\n",
    "                #\n",
    "                # Between 2 and k-1 fields are drawn (the upper bound of randint is exclusive)\n",
    "                # With 2 fields or less randint has nothing to draw from, so every field is used\n",
    "                #\n",
    "                n   = np.random.randint(2,k) if k > 2 else k\n",
    "                ii  = np.random.choice(k,n,replace=False)\n",
    "                cols = np.array(columns)[ii].tolist()\n",
    "            else:\n",
    "                cols = columns\n",
    "                n    = len(cols)\n",
    "            #\n",
    "            # x_ holds, for every group of identical field values, the number of records with a patient id\n",
    "            #\n",
    "            x_ = self._df.groupby(cols).count()[id].values\n",
    "            rows.append(\n",
    "                {\n",
    "                    \"selected\":n,\n",
    "                    \"marketer\": x_.size / np.float64(np.sum(x_)),\n",
    "                    \"prosecutor\":1 / np.float64(np.min(x_))\n",
    "                }\n",
    "            )\n",
    "        #\n",
    "        # The frame is built once from the collected rows, DataFrame.append was removed in pandas 2.0\n",
    "        # and copied the whole frame on every call\n",
    "        #\n",
    "        return pd.DataFrame(rows,columns=['marketer','prosecutor','selected'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>marketer</th>\n",
       "      <th>prosecutor</th>\n",
       "      <th>selected</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.500000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.500000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.500000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.333333</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.5</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   marketer  prosecutor  selected\n",
       "0  0.500000         1.0         2\n",
       "0  0.500000         1.0         3\n",
       "0  0.500000         1.0         3\n",
       "0  0.333333         1.0         2\n",
       "0  0.333333         0.5         2"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#\n",
    "# Let us compute risk here for a random selection of quasi identifiers\n",
    "# We will run this experiment 5 times\n",
    "# The global numpy generator is seeded first so that Restart & Run All draws the same fields every time\n",
    "#\n",
    "np.random.seed(42)\n",
    "MY_DATAFRAME.deid.risk(id='id',num_runs=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>marketer</th>\n",
       "      <th>prosecutor</th>\n",
       "      <th>selected</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.5</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   marketer  prosecutor  selected\n",
       "0       0.5         1.0         3"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#\n",
    "# Assessment mode: a single evaluation restricted to the quasi identifiers sex, profession and age\n",
    "#\n",
    "quasi_identifiers = ['age','sex','profession']\n",
    "MY_DATAFRAME.deid.risk(id='id',quasi_id=quasi_identifiers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15rc1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}