@ -43,6 +43,10 @@ from datetime import datetime
import sys
from itertools import combinations
# class Compute:
# pass
# class Population(Compute):
# pass
@pd.api.extensions.register_dataframe_accessor ( " risk " )
class deid :
@ -57,6 +61,16 @@ class deid :
#
values = df . apply ( lambda col : col . unique ( ) . size / df . shape [ 0 ] )
self . _dinfo = dict ( zip ( df . columns . tolist ( ) , values ) )
# self.sample = self._df
self . init ( sample = self . _df )
def init(self, **_args):
    """
    (Re)build the risk computation handlers for this accessor.
    :sample     optional sample dataset, defaults to the data-frame bound to the accessor
    :columns    optional list of columns; when absent or empty the handler infers them
    """
    _sample = _args['sample'] if 'sample' in _args else self._df
    _columns = _args['columns'] if 'columns' in _args else []
    _kwargs = {'sample': _sample}
    if _columns:
        _kwargs['columns'] = _columns
    #
    # The attribute must be _compute in every branch: the other methods of this class read
    # self._compute (the original stored it under a misspelled name when no columns were given)
    self._compute = Compute(**_kwargs)
    #
    # The population handler groups its own sample when its init() is called, so it is
    # given the same sample/columns instead of being created empty
    self._pcompute = Population(**_kwargs)
def explore ( self , * * args ) :
"""
@ -115,7 +129,9 @@ class deid :
p = pd . DataFrame ( 1 * sample . columns . isin ( cols ) ) . T
p . columns = sample . columns
o = pd . concat ( [ o , r . join ( p ) ] )
o [ ' attr ' ] = ' , ' . join ( cols )
o [ ' attributes ' ] = ' , ' . join ( cols )
# o['attr'] = ','.join(r.apply())
_index + = 1
#
# We rename flags to policies and adequately number them, we also have a column to summarize the attributes attr
@ -127,7 +143,23 @@ class deid :
o . index = np . arange ( o . shape [ 0 ] ) . astype ( np . int64 )
o = o . rename ( columns = { ' flag ' : ' policies ' } )
return o
def evaluate(self, **_args):
    """
    Compute marketer, journalist and prosecutor risk in one call and return them as a single-row data-frame.
    The row also carries the number of fields, groups and rows read from the sample handler's cache.
    :sample     optional sample dataset (forwarded to init and to each risk function)
    :columns    optional columns to consider (forwarded to init and to each risk function)
    :attr       optional dictionary-like value whose entries are placed in front of the measures
    """
    self.init(**_args)
    _measure = {}
    for label in ('marketer', 'journalist', 'prosecutor'):
        # each measure is delegated to the method of this object that bears the same name
        _measure[label] = getattr(self, label)(**_args)
    #
    # The counts are read once the risk functions are done, they may have rebuilt the handler
    _count = self._compute.cache['count']
    for label in ('fields', 'groups', 'rows'):
        _measure[label] = _count[label]
    if 'attr' in _args:
        _measure = dict(_args['attr'], **_measure)
    return pd.DataFrame([_measure])
def _evaluate ( self , * * args ) :
"""
This function has the ability to evaluate risk associated with either a population or a sample dataset
: sample sample dataset
@ -157,7 +189,7 @@ class deid :
r = { " flag " : flag }
# if sample :
handle_sample = Sampl e( )
handle_sample = Comput e( )
xi = sample . groupby ( cols , as_index = False ) . count ( ) . values
handle_sample . set ( ' groups ' , xi )
@ -213,7 +245,83 @@ class deid :
#
r [ ' field count ' ] = len ( cols )
return pd . DataFrame ( [ r ] )
def marketer(self, **_args):
    """
    This function delegates the calls to compute marketer risk of a given dataset or sample
    :sample     optional sample dataset
    :columns    optional columns of the dataset, if none are provided the handler is used as it was last initialized
    :pop        when present the call is delegated to the population handler
    """
    if 'pop' in _args:
        #
        # Computing population estimates for the population
        self._pcompute.init(**_args)
        _handler = self._pcompute
    else:
        if 'sample' in _args or 'columns' in _args:
            # the handler is rebuilt only when the caller overrides the sample or the columns
            self.init(**_args)
        #
        # The handler is bound on every path (it used to be left undefined when no argument was given)
        _handler = self._compute
    return _handler.marketer()
def journalist(self, **_args):
    """
    Delegate the computation of journalist risk to the sample or the population handler
    :sample     optional sample dataset
    :columns    optional columns of the dataset, if none are provided the handler is used as it was last initialized
    :pop        when present the call is delegated to the population handler
    """
    if 'pop' in _args:
        self._pcompute.init(**_args)
        return self._pcompute.journalist()
    if 'sample' in _args or 'columns' in _args:
        # the sample handler is rebuilt only when the caller overrides the sample or the columns
        self.init(**_args)
    return self._compute.journalist()
def prosecutor(self, **_args):
    """
    This function delegates the calls to compute prosecutor risk of a given dataset or sample
    :sample     optional sample dataset
    :columns    optional columns of the dataset, if none are provided the handler is used as it was last initialized
    :pop        when present the call is delegated to the population handler
    """
    if 'pop' in _args:
        self._pcompute.init(**_args)
        _handler = self._pcompute
    else:
        if 'sample' in _args or 'columns' in _args:
            # the handler is rebuilt only when the caller overrides the sample or the columns
            self.init(**_args)
        #
        # The handler is bound on every path (it used to be left undefined when no argument was given)
        _handler = self._compute
    return _handler.prosecutor()
def pitman(self, **_args):
    """
    Delegate the computation of pitman risk to the sample or the population handler
    :population     when present the population handler is initialized with the arguments and used
    :pop_size       size of the population, required (and cast to an integer) when no population is given
    """
    if 'population' in _args:
        self._pcompute.init(**_args)
        return self._pcompute.pitman()
    #
    # Without a population dataset the sample handler only needs to know the population size
    self._compute.set('pop_size', int(_args['pop_size']))
    return self._compute.pitman()
# xi = pd.DataFrame({"sample_group_size":sample.groupby(cols,as_index=False).count()}).reset_index()
# yi = pd.DataFrame({"population_group_size":args['pop'].groupby(cols,as_index=False).size()}).reset_index()
# merged_groups = pd.merge(xi,yi,on=cols,how='inner')
# handle_population= Population()
# handle_population.set('merged_groups',merged_groups)
class Risk :
"""
This class is an abstraction of how we chose to structure risk computation, i.e. in 2 subclasses:
@ -227,24 +335,44 @@ class Risk :
self . cache [ id ] = { }
self . cache [ key ] = value
class Sampl e( Risk ) :
class Comput e( Risk ) :
"""
This class will compute risk for the sample dataset : the marketer and prosecutor risk are computed by default .
This class can optionally add pitman risk if the population size is known .
"""
def __init__(self, **_args):
    """
    Prepare the groups and the counts needed to compute risk on a sample dataset
    :sample     optional sample dataset, defaults to an empty data-frame
    :columns    optional list of columns to group by; when absent or empty, every column
                whose ratio of unique values is below 1 is used
    The counts are kept in self.cache['count'] under the keys groups, fields and rows
    """
    super().__init__()
    self._sample = _args['sample'] if 'sample' in _args else pd.DataFrame()
    self._columns = _args['columns'] if 'columns' in _args else None
    self.cache['count'] = {'groups': 0, 'fields': 0, 'rows': 0}
    if not self._columns:
        values = self._sample.apply(lambda col: col.unique().size / self._sample.shape[0])
        self._dinfo = dict(zip(self._sample.columns.tolist(), values))
        self._columns = [key for key in self._dinfo if self._dinfo[key] < 1]
    #
    # At this point we have all the columns that are valid candidates even if the user didn't specify them
    self.cache['count']['fields'] = len(self._columns)
    if self._sample.shape[0] > 0 and self._columns:
        #
        # size() counts the rows of each group; count() counted the non-null values of the
        # remaining columns and had no count column at all when every column was a grouping key.
        # Each row of _groups holds the key values followed by the size of the group
        _sizes = self._sample.groupby(self._columns).size()
        _groups = _sizes.reset_index(name='_group_size').values
        self.set('groups', _groups)
        self.cache['count']['groups'] = len(_groups)
        self.cache['count']['rows'] = np.sum([_g[-1] for _g in _groups])
def marketer(self):
    """
    computing marketer risk for sample dataset
    The risk is the number of groups divided by the number of rows, both read from the
    counts cached by the constructor (they are no longer recomputed from the groups)
    """
    _count = self.cache['count']
    group_count = _count['groups']
    row_count = _count['rows']
    return group_count / np.float64(row_count)
def prosecutor ( self ) :
@ -259,40 +387,52 @@ class Sample(Risk):
def unique_ratio(self):
    """
    Ratio of the rows that are alone in their group over the number of rows of the sample
    The last value of each entry of the cached groups is read as the size of the group
    """
    groups = self.cache['groups']
    row_count = self.cache['count']['rows']
    #
    # Every group of size one contributes exactly one row, counting them is enough
    _unique = sum(1 for _g in groups if _g[-1] == 1)
    return _unique / np.float64(row_count)
def journalist(self):
    """
    Journalist risk of the sample, reported as the ratio returned by unique_ratio
    """
    _ratio = self.unique_ratio()
    return _ratio
def pitman(self):
    """
    This function will approximate pitman de-identification risk based on pitman sampling
    It requires the cache to hold 'groups', 'pop_size' and the row count under 'count'
    """
    groups = self.cache['groups']
    #
    # Number of groups of size one: only the last value of an entry (the size of the group)
    # is compared, a key value that happens to be 1 must not be counted
    si = sum(1 for _g in groups if _g[-1] == 1)
    u = len(groups)
    alpha = np.divide(si, np.float64(u))
    row_count = self.cache['count']['rows']
    # f is the sampling fraction i.e. rows of the sample over the size of the population
    f = np.divide(row_count, np.float64(self.cache['pop_size']))
    return np.power(f, 1 - alpha)
class Population ( Sampl e) :
class Population ( Comput e) :
"""
This class will compute risk for datasets that have population information or datasets associated with them .
This computation includes pitman risk ( it requires minimal information about population )
"""
def __init__(self, **_args):
    """
    Initialize the population handler, every argument is handed over to the parent class
    (a single definition is kept, the earlier one called a class that no longer exists)
    """
    super().__init__(**_args)
def init(self, **_args):
    """
    Match the groups of the sample with the groups of a population and store the result
    :population     population dataset; the key 'pop' is accepted as an alias because the
                    delegating risk functions of the accessor test for 'pop'
    The merged data-frame has the grouping columns, sample_group_size and population_group_size
    and is stored with set('merged_groups', ...)
    """
    if 'population' not in _args and 'pop' in _args:
        _population = _args['pop']
    else:
        _population = _args['population']
    #
    # size() returns one value per group, reset_index turns the keys back into columns so
    # that both sides can be merged on the grouping columns
    xi = self._sample.groupby(self._columns).size().reset_index(name='sample_group_size')
    yi = _population.groupby(self._columns).size().reset_index(name='population_group_size')
    merged_groups = pd.merge(xi, yi, on=self._columns, how='inner')
    self.set('merged_groups', merged_groups)
def set(self, key, value):
    """
    Store a value in the cache; storing 'merged_groups' also derives 'pop_size' and 'groups'
    :key        name of the cache entry
    :value      value to store, for 'merged_groups' a data-frame with the columns
                sample_group_size and population_group_size
    """
    #
    # The parent implementation is called explicitly: self.set(self, key, value) would call
    # this very method with one argument too many
    super().set(key, value)
    if key == 'merged_groups':
        super().set('pop_size', np.float64(value.population_group_size.sum()))
        super().set('groups', value.sample_group_size)
"""
This class will measure risk and account for the existence of a population
: merged_groups { sample_group_size , population_group_size } is a merged dataset with group sizes of both population and sample
@ -301,6 +441,7 @@ class Population(Sample):
"""
This function requires
"""
r = self . cache [ ' merged_groups ' ]
sample_row_count = r . sample_group_size . sum ( )
#