diff --git a/README.md b/README.md
index 36cdb6d..51bf0bb 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,8 @@
 This framework computes the re-identification risk of a dataset by extending pandas. It works like a pandas **add-on**.
 The framework computes the following risk measures: marketer, prosecutor, journalist, and Pitman risk.
 References for the risk measures can be found at [ehealthinformation.ca](http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf) and [scb.se](https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf)
+
+
 There are two modes available:
 
 **explore:**
@@ -16,10 +18,10 @@
 Here the assumption is that we are clear on the set of attributes to be used, and we are interested in computing the associated risk.
 
 ### Four risk measures are computed:
 
-  - Marketer risk
-  - Prosecutor risk
-  - Journalist risk
-  - Pitman Risk
+- Marketer risk
+- Prosecutor risk
+- Journalist risk
+- Pitman risk
 
 [Video tutorial, by Dr. Weiyi Xia](https://www.loom.com/share/173e109ecac64d37a54f09b103bc6681) and [Publication by Dr. Nobuaki Hoshino](https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf)
 
 ### Usage:
@@ -27,19 +29,19 @@
 Install this package using pip as follows:
 
 Stable:
 
-    pip install git+https://hiplab.mc.vanderbilt.edu/git/steve/deid-risk.git
+    pip install git+https://dev.the-phi.com/git/healthcareio/privacykit.git@release
 
 Latest development (not fully tested):
 
-    pip install git+https://hiplab.mc.vanderbilt.edu/git/steve/deid-risk.git@risk
+    pip install git+https://dev.the-phi.com/git/healthcareio/privacykit.git@dev
 
 The framework depends on pandas and numpy (for now). Below is a basic sample to get started quickly.
 
     import numpy as np
    import pandas as pd
-    import risk
+    import privacykit
 
     mydf = pd.DataFrame({"x":np.random.choice( np.random.randint(1,10),50),"y":np.random.choice( np.random.randint(1,10),50),"z":np.random.choice( np.random.randint(1,10),50),"r":np.random.choice( np.random.randint(1,10),50) })
     print (mydf.risk.evaluate())
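+
+Each measure can also be computed individually. Below is a minimal sketch, assuming the same dataframe as above; the method names mirror the accessor methods in privacykit/risk.py, and the population size handed to `pitman` is a made-up value for illustration:
+
+    print (mydf.risk.marketer())
+    print (mydf.risk.prosecutor())
+    print (mydf.risk.journalist())
+    print (mydf.risk.pitman(pop_size=10000))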
diff --git a/privacykit/risk.py b/privacykit/risk.py
index 3110ed2..9457345 100644
--- a/privacykit/risk.py
+++ b/privacykit/risk.py
@@ -57,6 +61,16 @@ class deid :
         # values = df.apply(lambda col: col.unique().size / df.shape[0])
         self._dinfo = dict(zip(df.columns.tolist(),values))
+        self.init(sample=self._df)
+    def init(self,**_args):
+        _sample  = _args['sample'] if 'sample' in _args else self._df
+        _columns = [] if 'columns' not in _args else _args['columns']
+        if _columns :
+            self._compute = Compute(sample=_sample,columns=_columns)
+        else:
+            self._compute = Compute(sample=_sample)
+        self._pcompute = Population()
 
     def explore(self,**args):
         """
@@ -107,40 +121,45 @@ class deid :
         for size in np.arange(2,len(columns)) :
             p = list(combinations(columns,size))
             p = (np.array(p)[ np.random.choice( len(p), _policy_count)].tolist())
-            flag = 'Policy_'+str(_index)
-            _index += 1
-            r = self.evaluate(sample=sample,cols=cols,flag = flag)
-            p = pd.DataFrame(1*sample.columns.isin(cols)).T
-            p.columns = sample.columns
-            o = pd.concat([o,r.join(p)])
-        # for i in np.arange(RUNS):
-        #     if 'strict' not in args or ('strict' in args and args['strict'] is False):
-        #         n = np.random.randint(2,k)
-        #     else:
-        #         n = args['field_count']
-        #     cols = np.random.choice(columns,n,replace=False).tolist()
-        #     params = {'sample':sample,'cols':cols}
-        #     if pop is not None :
-        #         params['pop'] = pop
-        #     if pop_size > 0 :
-        #         params['pop_size'] = pop_size
-        #     r = self.evaluate(**params)
-        #     #
-        #     # let's put the policy in place
-        #     p = pd.DataFrame(1*sample.columns.isin(cols)).T
-        #     p.columns = sample.columns
-        #     # o = o.append(r.join(p))
-        #     o = pd.concat([o,r.join(p)])
+            for cols in p :
+                flag = 'Policy_'+str(_index)
+                r = self.evaluate(sample=sample,columns=cols,flag=flag)
+                #
+                # The policy row is a 0/1 mask of the columns used by this policy
+                _p = pd.DataFrame(1*sample.columns.isin(cols)).T
+                _p.columns = sample.columns
+                r['attributes'] = ','.join(cols)
+                o = pd.concat([o,r.join(_p)])
+                _index += 1
+        #
+        # Rename flags to policies and number the rows sequentially; the
+        # 'attributes' column summarizes the columns used by each policy
+        #
+        o.index = np.arange(o.shape[0]).astype(np.int64)
+        o = o.rename(columns={'flag':'policies'})
         return o
-    def evaluate(self, **args):
+    def evaluate(self,**_args):
+        _measure = {}
+
+        self.init(**_args)
+        _names = ['marketer','journalist','prosecutor'] #+ (['pitman'] if 'pop_size' in _args else [])
+        for label in _names :
+            _pointer = getattr(self,label)
+            _measure[label] = _pointer(**_args)
+
+        _measure['fields'] = self._compute.cache['count']['fields']
+        _measure['groups'] = self._compute.cache['count']['groups']
+        _measure['rows']   = self._compute.cache['count']['rows']
+        if 'flag' in _args :
+            #
+            # keep the flag so explore() can rename the column to 'policies'
+            _measure['flag'] = _args['flag']
+        if 'attr' in _args :
+            _measure = dict(_args['attr'],**_measure)
+
+        return pd.DataFrame([_measure])
+    def _evaluate(self, **args):
         """
         This function has the ability to evaluate risk associated with either a population or a sample dataset
         :sample sample dataset
         """
         r = {"flag":flag}
         # if sample :
-        handle_sample = Sample()
+        handle_sample = Compute()
         xi = sample.groupby(cols,as_index=False).count().values
         handle_sample.set('groups',xi)
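+        #
+        # xi holds one row per equivalence class: the values of the grouped
+        # columns followed by a count column; the code reads the class size
+        # from the last position (_g[-1]), assuming no missing values.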
@@ -226,7 +245,83 @@ class deid :
         #             r['field count'] = len(cols)
         return pd.DataFrame([r])
 
+    def marketer(self,**_args):
+        """
+        This function delegates the call to compute the marketer risk of a given dataset or sample
+        :sample  optional sample dataset
+        :columns optional columns of the dataset; if none are provided, the non-unique columns are inferred
+        """
+        if 'pop' not in _args :
+            if 'sample' not in _args and 'columns' not in _args :
+                _handler = self._compute
+            else:
+                self.init(**_args)
+                _handler = self._compute
+        else:
+            #
+            # Compute estimates against the population
+            self._pcompute.init(**_args)
+            _handler = self._pcompute
+        return _handler.marketer()
+    def journalist(self,**_args):
+        """
+        This function delegates the call to compute the journalist risk of a given dataset or sample
+        :sample  optional sample dataset
+        :columns optional columns of the dataset; if none are provided, the non-unique columns are inferred
+        """
+        if 'pop' not in _args :
+            if 'sample' not in _args and 'columns' not in _args :
+                _handler = self._compute
+            else:
+                self.init(**_args)
+                _handler = self._compute
+        else:
+            self._pcompute.init(**_args)
+            _handler = self._pcompute
+        return _handler.journalist()
+    def prosecutor(self,**_args):
+        """
+        This function delegates the call to compute the prosecutor risk of a given dataset or sample
+        :sample  optional sample dataset
+        :columns optional columns of the dataset; if none are provided, the non-unique columns are inferred
+        """
+        if 'pop' not in _args :
+            if 'sample' not in _args and 'columns' not in _args :
+                _handler = self._compute
+            else:
+                self.init(**_args)
+                _handler = self._compute
+        else:
+            self._pcompute.init(**_args)
+            _handler = self._pcompute
+        return _handler.prosecutor()
+    def pitman(self,**_args):
+        if 'population' not in _args :
+            pop_size = int(_args['pop_size'])
+            self._compute.set('pop_size',pop_size)
+            _handler = self._compute
+        else:
+            self._pcompute.init(**_args)
+            _handler = self._pcompute
+        return _handler.pitman()
 class Risk :
     """
     This class is an abstraction of how we chose to structure risk computation, i.e. in 2 subclasses:
@@ -240,24 +335,44 @@ class Risk :
             self.cache[id] = {}
         self.cache[key] = value
 
-class Sample(Risk):
+class Compute(Risk):
     """
     This class will compute risk for the sample dataset: the marketer and prosecutor risk are computed by default.
     This class can optionally add pitman risk if the population size is known.
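+
+    As implemented below:
+        marketer   = (number of equivalence classes) / (number of rows)
+        journalist = unique_ratio = (rows in classes of size 1) / (number of rows)
+        pitman     ~ (n/N)**(1-a), where a is the share of classes that are
+                     sample-unique and N is the known population size (pop_size)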
""" - def __init__(self): - Risk.__init__(self) + def __init__(self,**_args): + super().__init__() + self._sample = _args['sample'] if 'sample' in _args else pd.DataFrame() + self._columns= _args['columns'] if 'columns' in _args else None + self.cache['count'] = {'groups':0,'fields':0,'rows':0} + if not self._columns : + values = self._sample.apply(lambda col: col.unique().size / self._sample.shape[0]) + self._dinfo = dict(zip(self._sample.columns.tolist(),values)) + self._columns = [key for key in self._dinfo if self._dinfo[key] < 1] + # + # At this point we have all the columns that are valid candidates even if the user didn't specify them + self.cache['count']['fields'] = len(self._columns) + if self._sample.shape[0] > 0 and self._columns: + _sample = _args ['sample'] + _groups = self._sample.groupby(self._columns,as_index=False).count().values + self.set('groups',_groups) + + self.cache['count']['groups'] = len(_groups) + self.cache['count']['rows'] = np.sum([_g[-1] for _g in _groups]) + def marketer(self): """ computing marketer risk for sample dataset """ - + groups = self.cache['groups'] # group_count = groups.size # row_count = groups.sum() - group_count = len(groups) - row_count = np.sum([_g[-1] for _g in groups]) + # group_count = len(groups) + group_count = self.cache['count']['groups'] + # row_count = np.sum([_g[-1] for _g in groups]) + row_count = self.cache['count']['rows'] return group_count / np.float64(row_count) def prosecutor(self): @@ -272,40 +387,52 @@ class Sample(Risk): def unique_ratio(self): groups = self.cache['groups'] # row_count = groups.sum() - row_count = np.sum([_g[-1] for _g in groups]) + # row_count = np.sum([_g[-1] for _g in groups]) + row_count = self.cache['count']['rows'] # return groups[groups == 1].sum() / np.float64(row_count) values = [_g[-1] for _g in groups if _g[-1] == 1] return np.sum(values) / np.float64(row_count) - + def journalist(self): + return self.unique_ratio() def pitman(self): """ This function will approximate pitman de-identification risk based on pitman sampling """ + groups = self.cache['groups'] + print (self.cache['pop_size']) si = groups[groups == 1].size # u = groups.size u = len(groups) alpha = np.divide(si , np.float64(u) ) - row_count = np.sum([_g[-1] for _g in groups]) + # row_count = np.sum([_g[-1] for _g in groups]) + row_count = self.cache['count']['rows'] + # f = np.divide(groups.sum(), np.float64(self.cache['pop_size'])) f = np.divide(row_count, np.float64(self.cache['pop_size'])) return np.power(f,1-alpha) -class Population(Sample): +class Population(Compute): """ This class will compute risk for datasets that have population information or datasets associated with them. 
     """
-    def __init__(self,**args):
-        Sample.__init__(self)
+    def __init__(self,**_args):
+        super().__init__(**_args)
+
+    def init(self,**_args):
+        #
+        # Group sizes for the sample and the population over the same quasi-identifier
+        # columns (pandas >= 1.1: groupby(...,as_index=False).size() yields a 'size' column)
+        xi = self._sample.groupby(self._columns,as_index=False).size().rename(columns={'size':'sample_group_size'})
+        yi = _args['population'].groupby(self._columns,as_index=False).size().rename(columns={'size':'population_group_size'})
+        merged_groups = pd.merge(xi,yi,on=self._columns,how='inner')
+        self.set('merged_groups',merged_groups)
     def set(self,key,value):
-        Sample.set(self,key,value)
+        super().set(key,value)
         if key == 'merged_groups' :
-            Sample.set(self,'pop_size',np.float64(value.population_group_size.sum()) )
-            Sample.set(self,'groups',value.sample_group_size)
+            super().set('pop_size',np.float64(value.population_group_size.sum()) )
+            super().set('groups',value.sample_group_size)
     """
     This class will measure risk and account for the existence of a population
     :merged_groups {sample_group_size, population_group_size} is a merged dataset with group sizes of both population and sample
     """
@@ -314,6 +441,7 @@ class Population(Sample):
         """
         This function requires merged_groups to have been set via init(population=...)
         """
+        r = self.cache['merged_groups']
         sample_row_count = r.sample_group_size.sum()
         #
diff --git a/setup.py b/setup.py
index cbe800f..c3bba16 100644
--- a/setup.py
+++ b/setup.py
@@ -4,11 +4,11 @@ This is a build file for the
 from setuptools import setup, find_packages
 setup(
-    name = "risk",
-    version = "0.8.1",
+    name = "privacykit",
+    version = "0.9.0",
     author = "Healthcare/IO - The Phi Technology LLC & Health Information Privacy Lab",
     author_email = "info@the-phi.com",
     license = "MIT",
-    packages=['risk'],
+    packages=['privacykit'],
     install_requires = ['numpy','pandas']
 )