bug fix and adding usage

pull/2/head
Steve L. Nyemba 6 years ago
parent cb58675cd3
commit 47f94974c9

@ -1,34 +1,16 @@
# deid-risk # deid-risk
This project is intended to compute an estimated value of risk for a given database. The code below extends a data-frame by adding to it the ability to compute de-identification risk (marketer, prosecutor).
Because data-frames can connect to any database/file it will be the responsibility of the user to load the dataset into a data-frame.
1. Pull meta data of the database and create a dataset via joins Basic examples that illustrate usage of the framework are in the notebook folder. The example is derived from
2. Generate the dataset with random selection of features [http://ehealthinformation.ca](http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf)
3. Compute risk via SQL using group by
## Python environment
The following are the dependencies needed to run the code: Dependencies:
pandas
numpy numpy
pandas-gbq pandas
google-cloud-bigquery
## Usage
**Generate The merged dataset**
python risk.py create --i_dataset <in dataset|schema> --o_dataset <out dataset|schema> --table <name> --path <bigquery-key-file> --key <patient-id-field-name> [--file ]
**Compute risk (marketer, prosecutor)**
python risk.py compute --i_dataset <dataset> --table <name> --path <bigquery-key-file> --key <patient-id-field-name> Limitations:
## Limitations
- It works against bigquery for now
@TODO: @TODO:
- Need to write a transport layer (database interface)
- Support for referential integrity, so one table can be selected and a dataset derived given referential integrity
- Add support for journalist risk - Add support for journalist risk

@ -2,294 +2,209 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 4,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"dev-deid-600@aou-res-deid-vumc-test.iam.gserviceaccount.com df0ac049-d5b6-416f-ab3c-6321eda919d6 2018-09-25 08:18:34.829000+00:00 DONE\n"
]
}
],
"source": [ "source": [
"\"\"\"\n",
" This notebook is intended to show how to use the risk framework:\n",
" There are two basic usages:\n",
" 1. Experiment\n",
" \n",
" Here the framework will select a number of random fields other than the patient id and compute risk for the selection.\n",
" This will repeat over a designated number of runs.\n",
" \n",
" The parameters to pass to enable this mode are id=<patient id>,nun_runs=<number of runs>\n",
" 2. Assessment\n",
" \n",
" Here the framework assumes you are only interested in a list of quasi identifiers and will run the evaluation once for a given list of quasi identifiers.\n",
" The parameters to enable this mode are id=<patient id>,quasi_id=<list of quasi ids>\n",
"\"\"\"\n",
"import os\n",
"import pandas as pd\n", "import pandas as pd\n",
"import numpy as np\n", "import numpy as np\n",
"from google.cloud import bigquery as bq\n",
"\n", "\n",
"client = bq.Client.from_service_account_json('/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json')\n", "\n",
"# pd.read_gbq(query=\"select * from raw.observation limit 10\",private_key='/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json')\n", "#\n",
"jobs = client.list_jobs()\n", "#-- Loading a template file\n",
"for job in jobs :\n", "# The example taken a de-identification white-paper\n",
"# print dir(job)\n", "# http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf\n",
" print job.user_email,job.job_id,job.started, job.state\n", "#\n",
" break" "\n",
] "import pandas as pd\n",
}, "import numpy as np\n",
{ "from io import StringIO\n",
"cell_type": "code", "csv = \"\"\"\n",
"execution_count": 33, "id,sex,age,profession,drug_test\n",
"metadata": {}, "1,M,37,doctor,-\n",
"outputs": [], "2,F,28,doctor,+\n",
"source": [ "3,M,37,doctor,-\n",
"xo = ['person_id','date_of_birth','race']\n", "4,M,28,doctor,+\n",
"xi = ['person_id','value_as_number','value_source_value']" "5,M,28,doctor,-\n",
"6,M,37,doctor,-\n",
"\"\"\"\n",
"f = StringIO()\n",
"f.write(unicode(csv))\n",
"f.seek(0)\n",
"MY_DATAFRAME = pd.read_csv(f) "
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def get_tables(client,id,fields=[]):\n",
"\"\"\"\n", "\"\"\"\n",
" getting table lists from google\n", " Here's the pandas_risk code verbatim. \n",
" NOTE: \n",
"\"\"\"\n", "\"\"\"\n",
" r = []\n", "@pd.api.extensions.register_dataframe_accessor(\"deid\")\n",
" ref = client.dataset(id)\n", "class deid :\n",
" tables = list(client.list_tables(ref))\n",
" for table in tables :\n",
" ref = table.reference\n",
" schema = client.get_table(ref).schema\n",
" names = [f.name for f in schema]\n",
" x = list(set(names) & set(fields))\n",
" if x :\n",
" r.append({\"name\":table.table_id,\"fields\":names})\n",
" return r\n",
" \n",
"def get_fields(**args):\n",
" \"\"\"\n", " \"\"\"\n",
" This function will generate a random set of fields from two tables. Tables are structured as follows \n", " This class is a deidentification class that will compute risk (marketer, prosecutor) given a pandas dataframe\n",
" {name,fields:[],\"y\":}, with \n",
" name table name (needed to generate sql query)\n",
" fields list of field names, used in the projection\n",
" y name of the field to be joined.\n",
" @param xo candidate table in the join\n",
" @param xi candidate table in the join\n",
" @param join field by which the tables can be joined.\n",
" \"\"\"\n", " \"\"\"\n",
" # The set operation will remove redundancies in the field names (not sure it's a good idea)\n", " def __init__(self,df):\n",
"# xo = args['xo']['fields']\n", " self._df = df\n",
"# xi = args['xi']['fields']\n", " \n",
"# zi = args['xi']['name']\n", " def risk(self,**args):\n",
"# return list(set([ \".\".join([args['xo']['name'],name]) for name in xo]) | set(['.'.join([args['xi']['name'],name]) for name in xi if name != args['join']]) )\n",
" xo = args['xo']\n",
" fields = [\".\".join([args['xo']['name'],name]) for name in args['xo']['fields']]\n",
" if not isinstance(args['xi'],list) :\n",
" x_ = [args['xi']]\n",
" else:\n",
" x_ = args['xi']\n",
" for xi in x_ :\n",
" fields += (['.'.join([xi['name'], name]) for name in xi['fields'] if name != args['join']])\n",
" return fields\n",
"def generate_sql(**args):\n",
" \"\"\"\n", " \"\"\"\n",
" This function will generate the SQL query for the resulting join\n", " @param id name of patient field \n",
" @params num_runs number of runs (default will be 100)\n",
" @params quasi_id \tlist of quasi identifiers to be used (this will only perform a single run)\n",
" \"\"\"\n", " \"\"\"\n",
" \n", " \n",
" xo = args['xo']\n", " id = args['id']\n",
" x_ = args['xi']\n", " if 'quasi_id' in args :\n",
" xo_name = \".\".join([args['prefix'],xo['name'] ]) if 'prefix' in args else xo['name']\n", " num_runs = 1\n",
" SQL = \"SELECT :fields FROM :xo.name \".replace(\":xo.name\",xo_name)\n", " columns = list(set(args['quasi_id'])- set(id) )\n",
" if not isinstance(x_,list):\n", " else :\n",
" x_ = [x_]\n", " num_runs = args['num_runs'] if 'num_runs' in args else 100\n",
" f = []#[\".\".join([args['xo']['name'],args['join']] )] \n", " columns = list(set(self._df.columns) - set([id]))\n",
" INNER_JOINS = []\n", " r = pd.DataFrame() \n",
" for xi in x_ :\n", " k = len(columns)\n",
" xi_name = \".\".join([args['prefix'],xi['name'] ]) if 'prefix' in args else xi['name']\n", " for i in range(0,num_runs) :\n",
" JOIN_SQL = \"INNER JOIN :xi.name ON \".replace(':xi.name',xi_name)\n", " #\n",
" value = \".\".join([xi['name'],args['join']])\n", " # let's chose a random number of columns and compute marketer and prosecutor risk\n",
" f.append(value) \n", " # Once the fields are selected we run a groupby clause\n",
" #\n",
" if 'quasi_id' not in args :\n",
" n = np.random.randint(2,k) #-- number of random fields we are picking\n",
" ii = np.random.choice(k,n,replace=False)\n",
" cols = np.array(columns)[ii].tolist()\n",
" else:\n",
" cols \t= columns\n",
" n \t= len(cols)\n",
" x_ = self._df.groupby(cols).count()[id].values\n",
" r = r.append(\n",
" pd.DataFrame(\n",
" [\n",
" {\n",
" \"selected\":n,\n",
" \"marketer\": x_.size / np.float64(np.sum(x_)),\n",
" \"prosecutor\":1 / np.float64(np.min(x_))\n",
"\n", "\n",
" ON_SQL = \"\"\n", " }\n",
" tmp = []\n", " ]\n",
" for term in f :\n",
" ON_SQL = \":xi.name.:ofield = :xo.name.:ofield\".replace(\":xo.name\",xo['name'])\n",
" ON_SQL = ON_SQL.replace(\":xi.name.:ofield\",term).replace(\":ofield\",args['join'])\n",
" tmp.append(ON_SQL)\n",
" INNER_JOINS += [JOIN_SQL + \" AND \".join(tmp)]\n",
" return SQL + \" \".join(INNER_JOINS)\n",
"def get_final_sql(**args):\n",
" xo = args['xo']\n",
" xi = args['xi']\n",
" join=args['join']\n",
" prefix = args['prefix'] if 'prefix' in args else ''\n",
" fields = get_fields (xo=xo,xi=xi,join=join)\n",
" k = len(fields)\n",
" n = np.random.randint(2,k) #-- number of fields to select\n",
" i = np.random.randint(0,k,size=n)\n",
" fields = [name for name in fields if fields.index(name) in i]\n",
" base_sql = generate_sql(xo=xo,xi=xi,prefix)\n",
" SQL = \"\"\"\n",
" SELECT AVERAGE(count),size,n as selected_features,k as total_features\n",
" FROM(\n",
" SELECT COUNT(*) as count,count(:join) as pop,sum(:n) as N,sum(:k) as k,:fields\n",
" FROM (:sql)\n",
" GROUP BY :fields\n",
" )\n", " )\n",
" order by 1\n", " )\n",
" \n", " g_size = x_.size\n",
" \"\"\".replace(\":sql\",base_sql)\n", " n_ids = np.float64(np.sum(x_))\n",
"# sql = \"SELECT :fields FROM :xo.name INNER JOIN :xi.name ON :xi.name.:xi.y = :xo.y \"\n",
"# fields = \",\".join(get_fields(xo=xi,xi=xi,join=xi['y']))\n",
" \n",
" \n",
"# sql = sql.replace(\":fields\",fields).replace(\":xo.name\",xo['name']).replace(\":xi.name\",xi['name'])\n",
"# sql = sql.replace(\":xi.y\",xi['y']).replace(\":xo.y\",xo['y'])\n",
"# return sql\n",
"\n", "\n",
" " " return r"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"xo = {\"name\":\"person\",\"fields\":['person_id','date_of_birth','race','value_as_number']}\n",
"xi = [{\"name\":\"measurement\",\"fields\":['person_id','value_as_number','value_source_value']}] #,{\"name\":\"observation\",\"fields\":[\"person_id\",\"value_as_string\",\"observation_source_value\"]}]\n",
"# generate_sql(xo=xo,xi=xi,join=\"person_id\",prefix='raw')\n",
"fields = get_fields(xo=xo,xi=xi,join='person_id')\n",
"ofields = list(fields)\n",
"k = len(fields)\n",
"n = np.random.randint(2,k) #-- number of fields to select\n",
"i = np.random.randint(0,k,size=n)\n",
"fields = [name for name in fields if fields.index(name) in i]"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 34, "execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/html": [
"['person.race', 'person.value_as_number', 'measurement.value_source_value']" "<div>\n",
] "<style scoped>\n",
}, " .dataframe tbody tr th:only-of-type {\n",
"execution_count": 34, " vertical-align: middle;\n",
"metadata": {}, " }\n",
"output_type": "execute_result" "\n",
} " .dataframe tbody tr th {\n",
], " vertical-align: top;\n",
"source": [ " }\n",
"fields\n" "\n",
] " .dataframe thead th {\n",
}, " text-align: right;\n",
{ " }\n",
"cell_type": "code", "</style>\n",
"execution_count": 55, "<table border=\"1\" class=\"dataframe\">\n",
"metadata": {}, " <thead>\n",
"outputs": [ " <tr style=\"text-align: right;\">\n",
{ " <th></th>\n",
"data": { " <th>marketer</th>\n",
"text/plain": [ " <th>prosecutor</th>\n",
"'SELECT person_id,value_as_number,measurements.value_source_value,measurements.value_as_number,value_source_value FROM person INNER JOIN measurements ON measurements.person_id = person_id '" " <th>selected</th>\n",
] " </tr>\n",
}, " </thead>\n",
"execution_count": 55, " <tbody>\n",
"metadata": {}, " <tr>\n",
"output_type": "execute_result" " <th>0</th>\n",
} " <td>0.500000</td>\n",
], " <td>1.0</td>\n",
"source": [ " <td>2</td>\n",
"xo = {\"name\":\"person\",\"fields\":['person_id','date_of_birth','race'],\"y\":\"person_id\"}\n", " </tr>\n",
"xi = {\"name\":\"measurements\",\"fields\":['person_id','value_as_number','value_source_value'],\"y\":\"person_id\"}\n", " <tr>\n",
"generate_sql(xo=xo,xi=xi)" " <th>0</th>\n",
] " <td>0.500000</td>\n",
}, " <td>1.0</td>\n",
{ " <td>3</td>\n",
"cell_type": "code", " </tr>\n",
"execution_count": 59, " <tr>\n",
"metadata": {}, " <th>0</th>\n",
"outputs": [ " <td>0.500000</td>\n",
{ " <td>1.0</td>\n",
"data": { " <td>3</td>\n",
"text/plain": [ " </tr>\n",
"[('a', 'b'), ('a', 'c'), ('b', 'c')]" " <tr>\n",
] " <th>0</th>\n",
}, " <td>0.333333</td>\n",
"execution_count": 59, " <td>1.0</td>\n",
"metadata": {}, " <td>2</td>\n",
"output_type": "execute_result" " </tr>\n",
} " <tr>\n",
" <th>0</th>\n",
" <td>0.333333</td>\n",
" <td>0.5</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
], ],
"source": [
"\"\"\"\n",
" We are designing a process that will take two tables that will generate \n",
"\"\"\"\n",
"import itertools\n",
"list(itertools.combinations(['a','b','c'],2))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [ "text/plain": [
"array([1, 3, 0, 0])" " marketer prosecutor selected\n",
"0 0.500000 1.0 2\n",
"0 0.500000 1.0 3\n",
"0 0.500000 1.0 3\n",
"0 0.333333 1.0 2\n",
"0 0.333333 0.5 2"
] ]
}, },
"execution_count": 6, "execution_count": 7,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"#\n", "#\n",
"# find every table with person id at the very least or a subset of fields\n", "# Lets us compute risk here for a random any random selection of quasi identifiers\n",
"# We will run this experiment 5 times\n",
"#\n", "#\n",
"np.random.randint(0,4,size=4)" "MY_DATAFRAME.deid.risk(id='id',num_runs=5)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['a']"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(set(['a','b']) & set(['a']))"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
"x_ = 1"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"x_ = pd.DataFrame({\"group\":[1,1,1,1,1], \"size\":[2,1,1,1,1]})"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -313,35 +228,37 @@
" <thead>\n", " <thead>\n",
" <tr style=\"text-align: right;\">\n", " <tr style=\"text-align: right;\">\n",
" <th></th>\n", " <th></th>\n",
" <th>size</th>\n", " <th>marketer</th>\n",
" </tr>\n", " <th>prosecutor</th>\n",
" <tr>\n", " <th>selected</th>\n",
" <th>group</th>\n",
" <th></th>\n",
" </tr>\n", " </tr>\n",
" </thead>\n", " </thead>\n",
" <tbody>\n", " <tbody>\n",
" <tr>\n", " <tr>\n",
" <th>1</th>\n", " <th>0</th>\n",
" <td>1.2</td>\n", " <td>0.5</td>\n",
" <td>1.0</td>\n",
" <td>3</td>\n",
" </tr>\n", " </tr>\n",
" </tbody>\n", " </tbody>\n",
"</table>\n", "</table>\n",
"</div>" "</div>"
], ],
"text/plain": [ "text/plain": [
" size\n", " marketer prosecutor selected\n",
"group \n", "0 0.5 1.0 3"
"1 1.2"
] ]
}, },
"execution_count": 12, "execution_count": 8,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"x_.groupby(['group']).mean()\n" "#\n",
"# In this scenario we are just interested in sex,profession,age\n",
"#\n",
"MY_DATAFRAME.deid.risk(id='id',quasi_id=['age','sex','profession'])"
] ]
}, },
{ {

Loading…
Cancel
Save