Merge branch 'master' into dev

pull/2/head
Steve L. Nyemba 6 years ago
commit 5e453f8371

@ -1,16 +1,63 @@
# deid-risk
# Re-Identification Risk
The code below extends a data-frame by adding to it the ability to compute de-identification risk (marketer, prosecutor).
Because data-frames can connect to any database/file, it is the responsibility of the user to load the dataset into a data-frame.
This framework computes re-identification risk of a dataset by extending pandas. It works like a pandas **add-on**.
The framework will compute the following risk measures: marketer, prosecutor, journalist and pitman risk. References for the risk measures can be found at [http://ehealthinformation.ca](http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf) and [https://www.scb.se/contentassets](https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf)
Basic examples that illustrate usage of the framework are in the notebook folder. The example is derived from
[http://ehealthinformation.ca](http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf)
There are two modes available:
**explore:**
Here the assumption is that we are not sure of the attributes to be disclosed; the framework will generate random combinations of attributes and evaluate each of them, reporting all the measures of risk.
**evaluation:**
Here the assumption is that we know the set of attributes to be used and we are interested in computing the associated risk.
### Four risk measures are computed :
- Marketer risk
- Prosecutor risk
- Journalist risk
- Pitman Risk
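For intuition, the sample-level measures reduce to simple functions of the group (equivalence class) sizes obtained by grouping the data on the quasi-identifiers; journalist and pitman risk additionally require population information (see the sample further below). The numpy-only sketch here is illustrative and is not part of the package API:
import numpy as np
groups = np.array([1,1,2,3])                                          # group sizes, e.g. from df.groupby(quasi_ids).size()
marketer = groups.size / np.float64(groups.sum())                     # number of groups / number of records
prosecutor = 1 / np.float64(groups.min())                             # 1 / smallest group size
unique_ratio = groups[groups == 1].sum() / np.float64(groups.sum())   # share of records that are unique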
### Usage:
Install this package using pip as follows :
Stable :
pip install git+https://hiplab.mc.vanderbilt.edu/git/steve/deid-risk.git
Dependencies:
numpy
pandas
Limitations:
Latest Development (not fully tested):
pip install git+https://hiplab.mc.vanderbilt.edu/git/steve/deid-risk.git@risk
The framework depends on pandas and numpy (for now). Below is a basic sample to get started quickly.
import numpy as np
import pandas as pd
import risk
mydf = pd.DataFrame({"x":np.random.choice( np.random.randint(1,10),50),"y":np.random.choice( np.random.randint(1,10),50),"z":np.random.choice( np.random.randint(1,10),50),"r":np.random.choice( np.random.randint(1,10),50) })
print (mydf.risk.evaluate())
#
# computing journalist and pitman
# - Ensure the population size is much greater than the sample size
# - Ensure the fields are identical in both sample and population
#
pop = pd.DataFrame({"x":np.random.choice( np.random.randint(1,10),150),"y":np.random.choice( np.random.randint(1,10),150) ,"z":np.random.choice( np.random.randint(1,10),150),"r":np.random.choice( np.random.randint(1,10),150)})
print (mydf.risk.evaluate(pop=pop))
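The explore mode generates and scores random policies (combinations of attributes) on the same data-frame; a minimal sketch, reusing the sample and population frames defined above:
#
# exploring random policies; num_runs controls how many random combinations are evaluated
#
print (mydf.risk.explore(num_runs=5))
print (mydf.risk.explore(num_runs=5,pop=pop))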
@TODO:
- Add support for journalist risk
- Evaluation of how sparse attributes are (the ratio of non-null over rows)
- Have a smart way to drop attributes (based on the above in random policy search)
Basic examples that illustrate usage of the framework are in the notebook folder. The example is derived from the de-identification whitepaper referenced above.

@ -1,293 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
" This notebook is intended to show how to use the risk framework:\n",
" There are two basic usages:\n",
" 1. Experiment\n",
" \n",
" Here the framework will select a number of random fields other than the patient id and compute risk for the selection.\n",
" This will repeat over a designated number of runs.\n",
" \n",
" The parameters to pass to enable this mode are id=<patient id>,nun_runs=<number of runs>\n",
" 2. Assessment\n",
" \n",
" Here the framework assumes you are only interested in a list of quasi identifiers and will run the evaluation once for a given list of quasi identifiers.\n",
" The parameters to enable this mode are id=<patient id>,quasi_id=<list of quasi ids>\n",
"\"\"\"\n",
"import os\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"\n",
"#\n",
"#-- Loading a template file\n",
"# The example taken a de-identification white-paper\n",
"# http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf\n",
"#\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from io import StringIO\n",
"csv = \"\"\"\n",
"id,sex,age,profession,drug_test\n",
"1,M,37,doctor,-\n",
"2,F,28,doctor,+\n",
"3,M,37,doctor,-\n",
"4,M,28,doctor,+\n",
"5,M,28,doctor,-\n",
"6,M,37,doctor,-\n",
"\"\"\"\n",
"f = StringIO()\n",
"f.write(unicode(csv))\n",
"f.seek(0)\n",
"MY_DATAFRAME = pd.read_csv(f) "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
" Here's the pandas_risk code verbatim. \n",
" NOTE: \n",
"\"\"\"\n",
"@pd.api.extensions.register_dataframe_accessor(\"deid\")\n",
"class deid :\n",
" \"\"\"\n",
" This class is a deidentification class that will compute risk (marketer, prosecutor) given a pandas dataframe\n",
" \"\"\"\n",
" def __init__(self,df):\n",
" self._df = df\n",
" \n",
" def risk(self,**args):\n",
" \"\"\"\n",
" @param id name of patient field \n",
" @params num_runs number of runs (default will be 100)\n",
" @params quasi_id \tlist of quasi identifiers to be used (this will only perform a single run)\n",
" \"\"\"\n",
" \n",
" id = args['id']\n",
" if 'quasi_id' in args :\n",
" num_runs = 1\n",
" columns = list(set(args['quasi_id'])- set(id) )\n",
" else :\n",
" num_runs = args['num_runs'] if 'num_runs' in args else 100\n",
" columns = list(set(self._df.columns) - set([id]))\n",
" r = pd.DataFrame() \n",
" k = len(columns)\n",
" for i in range(0,num_runs) :\n",
" #\n",
" # let's chose a random number of columns and compute marketer and prosecutor risk\n",
" # Once the fields are selected we run a groupby clause\n",
" #\n",
" if 'quasi_id' not in args :\n",
" n = np.random.randint(2,k) #-- number of random fields we are picking\n",
" ii = np.random.choice(k,n,replace=False)\n",
" cols = np.array(columns)[ii].tolist()\n",
" else:\n",
" cols \t= columns\n",
" n \t= len(cols)\n",
" x_ = self._df.groupby(cols).count()[id].values\n",
" r = r.append(\n",
" pd.DataFrame(\n",
" [\n",
" {\n",
" \"selected\":n,\n",
" \"marketer\": x_.size / np.float64(np.sum(x_)),\n",
" \"prosecutor\":1 / np.float64(np.min(x_))\n",
"\n",
" }\n",
" ]\n",
" )\n",
" )\n",
" g_size = x_.size\n",
" n_ids = np.float64(np.sum(x_))\n",
"\n",
" return r"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>marketer</th>\n",
" <th>prosecutor</th>\n",
" <th>selected</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.500000</td>\n",
" <td>1.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.500000</td>\n",
" <td>1.0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.500000</td>\n",
" <td>1.0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.333333</td>\n",
" <td>1.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.333333</td>\n",
" <td>0.5</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" marketer prosecutor selected\n",
"0 0.500000 1.0 2\n",
"0 0.500000 1.0 3\n",
"0 0.500000 1.0 3\n",
"0 0.333333 1.0 2\n",
"0 0.333333 0.5 2"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#\n",
"# Lets us compute risk here for a random any random selection of quasi identifiers\n",
"# We will run this experiment 5 times\n",
"#\n",
"MY_DATAFRAME.deid.risk(id='id',num_runs=5)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>marketer</th>\n",
" <th>prosecutor</th>\n",
" <th>selected</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.5</td>\n",
" <td>1.0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" marketer prosecutor selected\n",
"0 0.5 1.0 3"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#\n",
"# In this scenario we are just interested in sex,profession,age\n",
"#\n",
"MY_DATAFRAME.deid.risk(id='id',quasi_id=['age','sex','profession'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.15rc1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -0,0 +1,69 @@
"""
# Re-Identification Risk
This framework computes re-identification risk of a dataset by extending pandas. It works like a pandas **add-on**.
The framework will compute the following risk measures: marketer, prosecutor, journalist and pitman risk.
References for the risk measures can be found on
- http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf
- https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf
There are two modes available:
**explore:**
Here the assumption is that we are not sure of the attributes to be disclosed; the framework will generate random combinations of attributes and evaluate each of them, reporting all the measures of risk.
**evaluation:**
Here the assumption is that we know the set of attributes to be used and we are interested in computing the associated risk.
### Four risk measures are computed :
- Marketer risk
- Prosecutor risk
- Journalist risk
- Pitman Risk
### Usage:
Install this package using pip as follows :
Stable :
pip install git+https://hiplab.mc.vanderbilt.edu/git/steve/deid-risk.git
Latest Development (not fully tested):
pip install git+https://hiplab.mc.vanderbilt.edu/git/steve/deid-risk.git@risk
The framework depends on pandas and numpy (for now). Below is a basic sample to get started quickly.
import numpy as np
import pandas as pd
import risk
mydf = pd.DataFrame({"x":np.random.choice( np.random.randint(1,10),50),"y":np.random.choice( np.random.randint(1,10),50),"z":np.random.choice( np.random.randint(1,10),50),"r":np.random.choice( np.random.randint(1,10),50) })
print (mydf.risk.evaluate())
#
# computing journalist and pitman
# - Ensure the population size is much greater than the sample size
# - Ensure the fields are identical in both sample and population
#
pop = pd.DataFrame({"x":np.random.choice( np.random.randint(1,10),150),"y":np.random.choice( np.random.randint(1,10),150) ,"z":np.random.choice( np.random.randint(1,10),150),"r":np.random.choice( np.random.randint(1,10),150)})
print (mydf.risk.evaluate(pop=pop))
@TODO:
- Evaluation of how sparse attributes are (the ratio of non-null over rows)
- Have a smart way to drop attributes (based on the above in random policy search)
Basic examples that illustrate usage of the framework are in the notebook folder. The example is derived from the de-identification whitepaper referenced above.
"""
from risk import deid

@ -0,0 +1,255 @@
"""
Health Information Privacy Lab
Brad. Malin, Weiyi Xia, Steve L. Nyemba
This framework computes re-identification risk of a dataset assuming the data being shared can be loaded into a dataframe (pandas)
The framework will compute the following risk measures:
- marketer
- prosecutor
- pitman
References :
https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf
This framework integrates pandas (for now) as an extension and can be used in two modes :
Experimental mode
Here the assumption is that we are not sure of the attributes to be disclosed; the framework will explore a variety of combinations and associate risk measures with every random combination.
Evaluation mode
The evaluation mode assumes the set of attributes given is known and thus will evaluate risk for that subset of attributes.
features :
- determine viable fields (quantifiable in terms of uniqueness). This is a way to identify fields that can act as identifiers.
- explore and evaluate risk of a sample dataset against a known population dataset
- explore and evaluate risk on a sample dataset
Usage:
from pandas_risk import *
mydataframe = pd.read_csv('/myfile.csv')
resp = mydataframe.risk.evaluate(id=<name of patient field>,num_runs=<number of runs>,cols=[])
resp = mydataframe.risk.explore(id=<name of patient field>,num_runs=<number of runs>,cols=[])
@TODO:
- Provide a selected number of fields and risk will be computed for those fields.
- include journalist risk
"""
import pandas as pd
import numpy as np
import logging
import json
from datetime import datetime
import sys
@pd.api.extensions.register_dataframe_accessor("risk")
class deid :
"""
This class is a deidentification class that will compute risk (marketer, prosecutor) given a pandas dataframe
"""
def __init__(self,df):
self._df = df.fillna(' ')
def explore(self,**args):
"""
This function performs experimentation by generating random policies (combinations of attributes).
It is intended to explore a variety of policies and evaluate their associated risk.
@param pop|sample data-frame with population reference
@param id key field that uniquely identifies patient/customer ...
"""
pop= args['pop'] if 'pop' in args else None
if 'pop_size' in args :
pop_size = np.float64(args['pop_size'])
else:
pop_size = -1
#
# Policies will be generated with a number of runs
#
RUNS = args['num_runs'] if 'num_runs' in args else 5
sample = args['sample'] if 'sample' in args else pd.DataFrame(self._df)
k = sample.columns.size if 'field_count' not in args else int(args['field_count']) + 1
if 'id' in args :
id = args['id']
columns = list(set(sample.columns.tolist()) - set([id]))
else:
columns = sample.columns.tolist()
o = pd.DataFrame()
for i in np.arange(RUNS):
n = np.random.randint(2,k)
cols = np.random.choice(columns,n,replace=False).tolist()
params = {'sample':sample,'cols':cols}
if pop is not None :
params['pop'] = pop
if pop_size > 0 :
params['pop_size'] = pop_size
r = self.evaluate(**params)
#
# let's put the policy in place
p = pd.DataFrame(1*sample.columns.isin(cols)).T
p.columns = sample.columns
o = o.append(r.join(p))
o.index = np.arange(o.shape[0]).astype(np.int64)
return o
def evaluate(self, **args):
"""
This function has the ability to evaluate risk associated with either a population or a sample dataset
:sample sample dataset
:pop population dataset
:cols list of columns of interest or policies
:flag user provided flag for the context of the evaluation
"""
if 'sample' in args :
sample = pd.DataFrame(args['sample'])
else:
sample = pd.DataFrame(self._df)
if not args or 'cols' not in args:
cols = sample.columns.tolist()
elif args and 'cols' in args:
cols = args['cols']
flag = 'UNFLAGGED' if 'flag' not in args else args['flag']
#
# @TODO: auto select the columns i.e removing the columns that will have the effect of an identifier
#
# if 'population' in args :
# pop = pd.DataFrame(args['population'])
r = {"flag":flag}
# if sample :
handle_sample = Sample()
xi = sample.groupby(cols,as_index=False).size().values
handle_sample.set('groups',xi)
if 'pop_size' in args :
pop_size = np.float64(args['pop_size'])
else:
pop_size = -1
#
#-- The following conditional line is to address the labels that will be returned
# @TODO: Find a more elegant way of doing this.
#
if 'pop' in args :
r['sample marketer'] = handle_sample.marketer()
r['sample prosecutor'] = handle_sample.prosecutor()
r['sample unique ratio'] = handle_sample.unique_ratio()
r['sample group count'] = xi.size
else:
r['marketer'] = handle_sample.marketer()
r['prosecutor'] = handle_sample.prosecutor()
r['unique ratio'] = handle_sample.unique_ratio()
r['group count'] = xi.size
if pop_size > 0 :
handle_sample.set('pop_size',pop_size)
r['pitman risk'] = handle_sample.pitman()
if 'pop' in args :
xi = pd.DataFrame({"sample_group_size":sample.groupby(cols,as_index=False).size()}).reset_index()
yi = pd.DataFrame({"population_group_size":args['pop'].groupby(cols,as_index=False).size()}).reset_index()
merged_groups = pd.merge(xi,yi,on=cols,how='inner')
handle_population= Population()
handle_population.set('merged_groups',merged_groups)
r['pop. marketer'] = handle_population.marketer()
r['pitman risk'] = handle_population.pitman()
r['pop. group size'] = np.unique(yi.population_group_size).size
#
# At this point we have both columns for either sample,population or both
#
r['field count'] = len(cols)
return pd.DataFrame([r])
class Risk :
"""
This class is an abstraction of how we chose to structure risk computation i.e in 2 sub classes:
- Sample computes risk associated with a sample dataset only
- Population computes risk associated with a population
"""
def __init__(self):
self.cache = {}
def set(self,key,value):
self.cache[key] = value
class Sample(Risk):
"""
This class will compute risk for the sample dataset: the marketer and prosecutor risk are computed by default.
This class can optionally add pitman risk if the population size is known.
"""
def __init__(self):
Risk.__init__(self)
def marketer(self):
"""
computing marketer risk for sample dataset
"""
groups = self.cache['groups']
group_count = groups.size
row_count = groups.sum()
return group_count / np.float64(row_count)
def prosecutor(self):
"""
The prosecutor risk is 1 over the smallest group size.
It indicates whether there is at least one record that is unique (a group of size one).
"""
groups = self.cache['groups']
return 1 / np.float64(groups.min())
def unique_ratio(self):
groups = self.cache['groups']
row_count = groups.sum()
return groups[groups == 1].sum() / np.float64(row_count)
def pitman(self):
"""
This function approximates pitman de-identification risk based on Pitman's sampling formula
"""
groups = self.cache['groups']
si = groups[groups == 1].size
u = groups.size
alpha = np.divide(si , np.float64(u) )
f = np.divide(groups.sum(), np.float64(self.cache['pop_size']))
return np.power(f,1-alpha)
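#
# Illustrative walk-through (assumed numbers, not from the source) of the pitman approximation above:
# with group sizes [1,1,2,3] and a known population of 70 records,
#   si = 2 (groups of size 1), u = 4 groups        -> alpha = 2/4  = 0.5
#   sample size = 7, pop_size = 70                 -> f     = 7/70 = 0.1
#   pitman risk = f ** (1 - alpha) = 0.1 ** 0.5    ~= 0.316
#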
class Population(Sample):
"""
This class will compute risk for datasets that have population information or datasets associated with them.
This computation includes pitman risk (it requires minimal information about population)
"""
def __init__(self,**args):
Sample.__init__(self)
def set(self,key,value):
Sample.set(self,key,value)
if key == 'merged_groups' :
Sample.set(self,'pop_size',np.float64(value.population_group_size.sum()) )
Sample.set(self,'groups',value.sample_group_size)
"""
This class will measure risk and account for the existence of a population
:merged_groups {sample_group_size, population_group_size} is a merged dataset with group sizes of both population and sample
"""
def marketer(self):
"""
This function requires the merged sample/population group sizes (the 'merged_groups' entry set on this class).
"""
r = self.cache['merged_groups']
sample_row_count = r.sample_group_size.sum()
#
# @TODO : make sure the above line is size (not sum)
# sample_row_count = r.sample_group_size.size
return r.apply(lambda row: (row.sample_group_size / np.float64(row.population_group_size)) /np.float64(sample_row_count) ,axis=1).sum()

@ -0,0 +1,14 @@
"""
This is a build file for the risk package.
"""
from setuptools import setup, find_packages
setup(
name = "risk",
version = "0.1",
author = "Health Information Privacy Lab",
author_email = "steve.l.nyemba@vanderbilt.edu",
license = "MIT",
packages=['risk'],
install_requires = ['numpy','pandas']
)

@ -1,115 +0,0 @@
"""
Health Information Privacy Lab
Steve L. Nyemba & Brad. Malin
This is an extension to the pandas data-frame that will perform a risk assessment on a variety of attributes
This implementation puts the responsibility on the user of the framework to join datasets and load the final results into a pandas data-frame.
The code will randomly select fields and compute the risk (marketer and prosecutor) and perform a given number of runs.
Usage:
from pandas_risk import *
mydataframe = pd.read_csv('/myfile.csv')
risk = mydataframe.deid.risk(id=<name of patient field>,num_runs=<number of runs>)
@TODO:
- Provide a selected number of fields and risk will be computed for those fields.
- include journalist risk
"""
import pandas as pd
import numpy as np
@pd.api.extensions.register_dataframe_accessor("deid")
class deid :
"""
This class is a deidentification class that will compute risk (marketer, prosecutor) given a pandas dataframe
"""
def __init__(self,df):
self._df = df
def risk(self,**args):
"""
@param id name of patient field
@params num_runs number of runs (default will be 100)
@params quasi_id list of quasi identifiers to be used (this will only perform a single run)
"""
id = args['id']
if 'quasi_id' in args :
num_runs = 1
columns = list(set(args['quasi_id'])- set(id) )
else :
num_runs = args['num_runs'] if 'num_runs' in args else 100
columns = list(set(self._df.columns) - set([id]))
r = pd.DataFrame()
k = len(columns)
N = self._df.shape[0]
tmp = self._df.fillna(' ')
np.random.seed(1)
for i in range(0,num_runs) :
#
# let's choose a random number of columns and compute marketer and prosecutor risk
# Once the fields are selected we run a groupby clause
#
if 'quasi_id' not in args :
if 'field_count' in args :
#
# We chose to limit how many fields we pass in
n = np.random.randint(2,int(args['field_count'])) #-- number of random fields we are picking
else :
n = np.random.randint(2,k) #-- number of random fields we are picking
ii = np.random.choice(k,n,replace=False)
cols = np.array(columns)[ii].tolist()
policy = np.zeros(k)
policy [ii] = 1
policy = pd.DataFrame(policy).T
else:
cols = columns
policy = np.ones(k)
policy = pd.DataFrame(policy).T
n = len(cols)
policy.columns = columns
N = tmp.shape[0]
x_ = tmp.groupby(cols).size().values
# print [id,i,n,k,self._df.groupby(cols).count()]
r = r.append(
pd.DataFrame(
[
{
"group_count":x_.size,
"patient_count":N,
"field_count":n,
"marketer": x_.size / np.float64(np.sum(x_)),
"prosecutor":1 / np.float64(np.min(x_))
}
]
).join(policy)
)
# g_size = x_.size
# n_ids = np.float64(np.sum(x_))
# sql = """
# SELECT COUNT(g_size) as group_count, :patient_count as patient_count,SUM(g_size) as rec_count, COUNT(g_size)/SUM(g_size) as marketer, 1/ MIN(g_size) as prosecutor, :n as field_count
# FROM (
# SELECT COUNT(*) as g_size,:key,:fields
# FROM :full_name
# GROUP BY :fields
# """.replace(":n",str(n)).replace(":fields",",".join(cols)).replace(":key",id).replace(":patient_count",str(N))
# r.append(self._df.query(sql.replace("\n"," ").replace("\r"," ") ))
return r
# df = pd.read_gbq("select * from deid_risk.risk_30k",private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json')
# r = df.deid.risk(id='person_id',num_runs=200)
# print r[['field_count','patient_count','marketer','prosecutor']]

@ -1,17 +0,0 @@
import sys
SYS_ARGS={}
if len(sys.argv) > 1 :
N = len(sys.argv)
for i in range(1,N) :
value = 1
if sys.argv[i].startswith('--') :
key = sys.argv[i].replace('-','')
if i + 1 < N and not sys.argv[i+1].startswith('--') :
value = sys.argv[i + 1].strip()
SYS_ARGS[key] = value
i += 2
elif 'action' not in SYS_ARGS:
SYS_ARGS['action'] = sys.argv[i].strip()
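#
# Hedged usage sketch (script and file names are placeholders, not from the source):
# an invocation such as
#   python risk.py compute --path account.json --i_dataset raw --key person_id --table person
# is parsed by the loop above into
#   SYS_ARGS = {'action':'compute','path':'account.json','i_dataset':'raw','key':'person_id','table':'person'}
# Note the bare action token comes first, matching the usage comments in the companion BigQuery script.
#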

@ -1,287 +0,0 @@
"""
Steve L. Nyemba & Brad Malin
Health Information Privacy Lab.
This code is proof of concept as to how risk is computed against a database (at least a schema).
The engine will read tables that meet a given criterion (patient id) and generate a dataset by performing joins.
Because joins are process-intensive we decided to add a limit to the records pulled.
TL;DR:
This engine generates a dataset and computes risk (marketer and prosecutor)
Assumptions:
- We assume tables that reference patients will name the keys identically (best practice). This allows us to leverage data stores that don't support referential integrity
Usage :
Limitations
- It works against bigquery for now
@TODO:
- Need to write a transport layer (database interface)
- Support for referential integrity, so one table can be selected and a dataset derived given referential integrity
- Add support for journalist risk
"""
import pandas as pd
import numpy as np
from google.cloud import bigquery as bq
import time
from params import SYS_ARGS
class utils :
"""
This class is a utility class that will generate SQL-11 compatible code in order to run the risk assessment
@TODO: plugins for other data-stores
"""
def __init__(self,**args):
# self.path = args['path']
self.client = args['client']
def get_tables(self,**args): #id,key='person_id'):
"""
This function returns a list of tables given a key. The key is the name of the field that uniquely designates a patient/person
in the database. The list of tables are tables that can be joined given the provided field.
@param key name of the patient field
@param dataset dataset name
@param client initialized bigquery client ()
@return [{name,fields:[],row_count}]
"""
dataset = args['dataset']
client = args['client']
key = args['key']
r = []
ref = client.dataset(dataset)
tables = list(client.list_tables(ref))
TERMS = ['type','unit','count','refills','stop','supply','quantity']
for table in tables :
if table.table_id.strip() in ['people_seed','measurement','drug_exposure','procedure_occurrence','visit_occurrence','condition_occurrence','device_exposure']:
print ' skipping ...'
continue
ref = table.reference
table = client.get_table(ref)
schema = table.schema
rows = table.num_rows
if rows == 0 :
continue
names = [f.name for f in schema if len (set(TERMS) & set(f.name.strip().split("_"))) == 0 ]
x = list(set(names) & set([key]))
if x :
full_name = ".".join([dataset,table.table_id])
r.append({"name":table.table_id,"fields":names,"row_count":rows,"full_name":full_name})
return r
def get_field_name(self,alias,field_name,index):
"""
This function will format a field name given an index (the number of times it has occurred in the projection)
The index is intended to avoid a "duplicate field" error (bigquery issue)
@param alias alias of the table
@param field_name name of the field to be formatted
@param index the number of times the field appears in the projection
"""
name = [alias,field_name]
if index > 0 :
return ".".join(name)+" AS :field_name:index".replace(":field_name",field_name).replace(":index",str(index))
else:
return ".".join(name)
def get_filtered_table(self,table,key):
"""
This function will return a table with a single record per individual patient
"""
return """
SELECT :table.* FROM (
SELECT row_number() over () as top, * FROM :full_name ) as :table
INNER JOIN (
SELECT MAX(top) as top, :key FROM (
SELECT row_number() over () as top,:key from :full_name ) GROUP BY :key
)as filter
ON filter.top = :table.top and filter.:key = :table.:key
""".replace(":key",key).replace(":full_name",table['full_name']).replace(":table",table['name'])
def get_sql(self,**args):
"""
This function will generate the SQL that joins a list of tables given a key and a limit of records
@param tables list of tables
@param key key field to be used in the join. The assumption is that the field name is identical across tables (best practice!)
@param limit a limit imposed, in case of restrictions, considering joins are resource intensive
"""
tables = args['tables']
key = args['key']
limit = args['limit'] if 'limit' in args else 10000
limit = str(limit)
SQL = [
"""
SELECT :fields
FROM
"""]
fields = []
prev_table = None
for table in tables :
name = table['full_name'] #".".join([self.i_dataset,table['name']])
alias= table['name']
index = tables.index(table)
sql_ = """
(select * from :name ) as :alias
""".replace(":limit",limit)
# sql_ = " ".join(["(",self.get_filtered_table(table,key)," ) as :alias"])
sql_ = sql_.replace(":name",name).replace(":alias",alias).replace(":limit",limit)
fields += [self.get_field_name(alias,field_name,index) for field_name in table['fields'] if field_name != key or (field_name==key and tables.index(table) == 0) ]
if tables.index(table) > 0 :
join = """
INNER JOIN :sql ON :alias.:field = :prev_alias.:field
""".replace(":name",name)
join = join.replace(":alias",alias).replace(":field",key).replace(":prev_alias",prev_alias)
sql_ = join.replace(":sql",sql_)
# sql_ = " ".join([sql_,join])
SQL += [sql_]
if index == 0:
prev_alias = str(alias)
return " ".join(SQL).replace(":fields"," , ".join(fields))
class risk :
"""
This class will handle the creation of an SQL query that computes marketer and prosecutor risk (for now)
"""
def __init__(self):
pass
def get_sql(self,**args) :
"""
This function returns the SQL Query that will compute marketer and prosecutor risk
@param key key fields (patient identifier)
@param table table that is subject of the computation
"""
key = args['key']
table = args['table']
fields = list(set(table['fields']) - set([key]))
#-- We need to select n-fields max 64
k = len(fields)
if 'field_count' in args :
n = np.random.randint(2, int(args['field_count']) ) #-- number of random fields we are picking
else:
n = np.random.randint(2,k) #-- how many random fields are we processing
ii = np.random.choice(k,n,replace=False)
stream = np.zeros(len(fields) + 1)
stream[ii] = 1
stream = pd.DataFrame(stream.tolist()).T
stream.columns = args['table']['fields']
fields = list(np.array(fields)[ii])
sql = """
SELECT COUNT(g_size) as group_count,SUM(g_size) as patient_count, COUNT(g_size)/SUM(g_size) as marketer, 1/ MIN(g_size) as prosecutor, :n as field_count
FROM (
SELECT COUNT(*) as g_size,:fields
FROM :full_name
GROUP BY :fields
)
""".replace(":fields", ",".join(fields)).replace(":full_name",table['full_name']).replace(":key",key).replace(":n",str(n))
return {"sql":sql,"stream":stream}
if 'action' in SYS_ARGS and SYS_ARGS['action'] in ['create','compute','migrate'] :
path = SYS_ARGS['path']
client = bq.Client.from_service_account_json(path)
i_dataset = SYS_ARGS['i_dataset']
key = SYS_ARGS['key']
mytools = utils(client = client)
tables = mytools.get_tables(dataset=i_dataset,client=client,key=key)
# print len(tables)
# tables = tables[:6]
if SYS_ARGS['action'] == 'create' :
#usage:
# create --i_dataset <in dataset> --key <patient id> --o_dataset <out dataset> --table <table|file> [--file] --path <bq JSON account file>
#
create_sql = mytools.get_sql(tables=tables,key=key) #-- The create statement
o_dataset = SYS_ARGS['o_dataset']
table = SYS_ARGS['table']
if 'file' in SYS_ARGS :
f = open(table+'.sql','w')
f.write(create_sql)
f.close()
else:
job = bq.QueryJobConfig()
job.destination = client.dataset(o_dataset).table(table)
job.use_query_cache = True
job.allow_large_results = True
job.priority = 'BATCH'
job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
r = client.query(create_sql,location='US',job_config=job)
print [r.job_id,' ** ',r.state]
elif SYS_ARGS['action'] == 'migrate' :
#
#
o_dataset = SYS_ARGS['o_dataset']
for table in tables:
sql = " ".join(["SELECT ",",".join(table['fields']) ," FROM (",mytools.get_filtered_table(table,key),") as ",table['name']])
print ""
print sql
print ""
# job = bq.QueryJobConfig()
# job.destination = client.dataset(o_dataset).table(table['name'])
# job.use_query_cache = True
# job.allow_large_results = True
# job.priority = 'INTERACTIVE'
# job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
# r = client.query(sql,location='US',job_config=job)
# print [table['full_name'],' ** ',r.job_id,' ** ',r.state]
pass
else:
#
#
tables = [tab for tab in tables if tab['name'] == SYS_ARGS['table'] ]
limit = int(SYS_ARGS['limit']) if 'limit' in SYS_ARGS else 1
if tables :
risk= risk()
df = pd.DataFrame()
dfs = pd.DataFrame()
np.random.seed(1)
for i in range(0,limit) :
r = risk.get_sql(key=SYS_ARGS['key'],table=tables[0])
sql = r['sql']
dfs = dfs.append(r['stream'],sort=True)
df = df.append(pd.read_gbq(query=sql,private_key=path,dialect='standard').join(dfs))
# df = df.join(dfs,sort=True)
df.to_csv(SYS_ARGS['table']+'.csv')
# dfs.to_csv(SYS_ARGS['table']+'_stream.csv')
print [i,' ** ',df.shape[0],pd.DataFrame(r['stream']).shape]
time.sleep(2)
else:
print 'ERROR'
pass
# r = risk(path='/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json', i_dataset='raw',o_dataset='risk_o',o_table='mo')
# tables = r.get_tables('raw','person_id')
# sql = r.get_sql(tables=tables[:3],key='person_id')
# #
# # let's post this to a designated location
# #
# f = open('foo.sql','w')
# f.write(sql)
# f.close()
# r.get_sql(tables=tables,key='person_id')
# p = r.compute()
# print p
# p.to_csv("risk.csv")
# r.write('foo.sql')