"""
    This notebook is intended to show how to use the risk framework:
    There are two basic usages:
        1. Experiment
            
            Here the framework will select a number of random fields other than the patient id and compute risk for the selection.
            This will repeat over a designated number of runs.
            
            The parameters to pass to enable this mode are id= (name of the patient id field) and num_runs= (number of runs)
        2. Assessment
            
            Here the framework assumes you are only interested in a list of quasi identifiers and will run the evaluation once for a given list of quasi identifiers.
            The parameters to enable this mode are id= (name of the patient id field) and quasi_id= (list of quasi identifiers)
"""
import os
from io import StringIO

import numpy as np
import pandas as pd


#
#-- Loading a template file
#   The example is taken from a de-identification white paper
#   http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf
#

# The u-prefix keeps the literal a text (unicode) string on Python 2 as well as
# Python 3, which is what io.StringIO requires; the Python-2-only unicode()
# builtin is no longer needed.
csv = u"""
id,sex,age,profession,drug_test
1,M,37,doctor,-
2,F,28,doctor,+
3,M,37,doctor,-
4,M,28,doctor,+
5,M,28,doctor,-
6,M,37,doctor,-
"""
# StringIO(initial_value) starts positioned at offset 0, so no write()/seek(0) is needed.
# The leading blank line of the literal is skipped by read_csv (skip_blank_lines defaults to True).
f = StringIO(csv)
MY_DATAFRAME = pd.read_csv(f)
@pd.api.extensions.register_dataframe_accessor("deid")
class deid :
    """
        Pandas dataframe accessor (available as ``df.deid``) that computes
        re-identification risk (marketer, prosecutor) for the wrapped dataframe.

        Adapted from the pandas_risk code. It differs from the original in that
        it does not rely on ``DataFrame.append`` (removed in pandas 2.0), it keeps
        the candidate columns in a deterministic order, and it removes the id field
        by name rather than by the characters of its name.
    """
    def __init__(self,df):
        self._df = df

    def risk(self,**args):
        """
            Compute marketer and prosecutor risk over groups of quasi identifiers.

            @param id           name of patient field (required)
            @param num_runs     number of runs (default will be 100), only used when quasi_id is not given
            @param quasi_id     list of quasi identifiers to be used (this will only perform a single run);
                                a single column name is accepted too

            @return a dataframe with one row per run, indexed 0..runs-1, with columns:
                    marketer    number of groups / number of (non-null) ids
                    prosecutor  1 / size of the smallest group
                    selected    number of fields used for the run

            @raise KeyError     if the id parameter is not provided
            @raise ValueError   if no field other than the id is left to evaluate
        """
        id_field    = args['id']
        experiment  = 'quasi_id' not in args
        if experiment :
            num_runs    = args.get('num_runs',100)
            candidates  = self._df.columns
        else:
            num_runs    = 1
            candidates  = args['quasi_id']
            if not pd.api.types.is_list_like(candidates) :
                candidates = [candidates]
        #
        # The id field is dropped by name and duplicates are removed while the original
        # order is preserved (a set would make seeded experiments depend on hash ordering)
        #
        columns = []
        for name in candidates :
            if name != id_field and name not in columns :
                columns.append(name)
        k = len(columns)
        if k == 0 :
            raise ValueError("no quasi identifier to evaluate other than the id field %r" % (id_field,))

        records = []
        for _ in range(0,num_runs) :
            if experiment :
                #
                # let's choose a random number of columns and compute marketer and prosecutor risk
                # randint excludes its upper bound, hence k + 1 so that all k fields may be picked;
                # the lower bound is capped at k so a frame with fewer than 2 candidate fields still works
                #
                n       = np.random.randint(min(2,k),k + 1)
                picked  = np.random.choice(k,n,replace=False)
                # index the python list directly so the column labels keep their original types
                cols    = [columns[j] for j in picked]
            else:
                cols    = columns
                n       = k
            #
            # Once the fields are selected we run a groupby clause:
            # each value is the number of non-null ids found in one group
            #
            group_sizes = self._df.groupby(cols)[id_field].count().values
            records.append(
                {
                    "marketer": group_sizes.size / np.float64(np.sum(group_sizes)),
                    "prosecutor":1 / np.float64(np.min(group_sizes)),
                    "selected":int(n)
                }
            )
        # The frame is built once from all the records instead of being grown row by row
        return pd.DataFrame(records,columns=["marketer","prosecutor","selected"])
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
marketerprosecutorselected
00.5000001.02
00.5000001.03
00.5000001.03
00.3333331.02
00.3333330.52
\n", "
" ], "text/plain": [ " marketer prosecutor selected\n", "0 0.500000 1.0 2\n", "0 0.500000 1.0 3\n", "0 0.500000 1.0 3\n", "0 0.333333 1.0 2\n", "0 0.333333 0.5 2" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Lets us compute risk here for a random any random selection of quasi identifiers\n", "# We will run this experiment 5 times\n", "#\n", "MY_DATAFRAME.deid.risk(id='id',num_runs=5)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
marketerprosecutorselected
00.51.03
\n", "
" ], "text/plain": [ " marketer prosecutor selected\n", "0 0.5 1.0 3" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# In this scenario we are just interested in sex,profession,age\n", "#\n", "MY_DATAFRAME.deid.risk(id='id',quasi_id=['age','sex','profession'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.15rc1" } }, "nbformat": 4, "nbformat_minor": 2 }