@@ -43,6 +43,10 @@ from datetime import datetime
 import sys
 
 from itertools import combinations
+# class Compute:
+#     pass
+# class Population(Compute):
+#     pass
 
 @pd.api.extensions.register_dataframe_accessor("risk")
 class deid :
@@ -57,6 +61,16 @@ class deid :
         #
         values = df.apply(lambda col: col.unique().size / df.shape[0])
         self._dinfo = dict(zip(df.columns.tolist(), values))
+        # self.sample = self._df
+        self.init(sample=self._df)
+    def init(self, **_args):
+        _sample = _args['sample'] if 'sample' in _args else self._df
+        _columns = [] if 'columns' not in _args else _args['columns']
+        if _columns :
+            self._compute = Compute(sample=_sample, columns=_columns)
+        else:
+            self._compute = Compute(sample=_sample)
+        self._pcompute = Population()
 
     def explore(self, **args):
         """
@@ -115,7 +129,9 @@ class deid :
                 p = pd.DataFrame(1*sample.columns.isin(cols)).T
                 p.columns = sample.columns
                 o = pd.concat([o, r.join(p)])
-                o['attr'] = ','.join(cols)
+                o['attributes'] = ','.join(cols)
+                # o['attr'] = ','.join(r.apply())
+
                 _index += 1
         #
         # We rename flags to policies and number them adequately; we also add a column summarizing the attributes (attr)
@@ -127,7 +143,23 @@ class deid :
         o.index = np.arange(o.shape[0]).astype(np.int64)
         o = o.rename(columns={'flag':'policies'})
         return o
-    def evaluate(self, **args):
+    def evaluate(self, **_args):
+        _measure = {}
+
+        self.init(**_args)
+        _names = ['marketer','journalist','prosecutor'] #+ (['pitman'] if 'pop_size' in _args else [])
+        for label in _names :
+            _pointer = getattr(self, label)
+            _measure[label] = _pointer(**_args)
+
+        _measure['fields'] = self._compute.cache['count']['fields']
+        _measure['groups'] = self._compute.cache['count']['groups']
+        _measure['rows'] = self._compute.cache['count']['rows']
+        if 'attr' in _args :
+            _measure = dict(_args['attr'], **_measure)
+
+        return pd.DataFrame([_measure])
+    def _evaluate(self, **args):
         """
         This function has the ability to evaluate risk associated with either a population or a sample dataset
         :sample  sample dataset
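Note: the new evaluate() above dispatches to marketer, journalist and prosecutor via getattr and returns a single-row DataFrame of measures. A minimal usage sketch, assuming the patched module has been imported so the "risk" accessor is registered; the data and column names below are hypothetical:

    import pandas as pd
    # toy sample with an id column plus two quasi-identifying columns
    df = pd.DataFrame({'id': [1, 2, 3, 4],
                       'gender': ['M', 'F', 'F', 'M'],
                       'zip': ['10001', '10001', '10002', '10002']})
    # one row with marketer/journalist/prosecutor plus fields/groups/rows counts
    print(df.risk.evaluate(columns=['gender', 'zip']))
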
@@ -157,7 +189,7 @@ class deid :
         r = {"flag":flag}
         # if sample :
 
-        handle_sample   = Sample()
+        handle_sample   = Compute()
         xi              = sample.groupby(cols, as_index=False).count().values
 
         handle_sample.set('groups', xi)
@@ -214,6 +246,82 @@ class deid :
         r['field count'] = len(cols)
         return pd.DataFrame([r])
 
+    def marketer(self, **_args):
+        """
+        This function delegates the calls to compute marketer risk of a given dataset or sample
+        :sample     optional sample dataset
+        :columns    optional columns of the dataset; if none are provided, an inference will be made using non-unique columns
+        """
+        if 'pop' not in _args :
+            if not 'sample' in _args and not 'columns' in _args :
+                # _handler = self._compute
+                pass
+            else:
+
+                self.init(**_args)
+                # _handler = Compute(**_args)
+            _handler = self._compute
+
+        else:
+            #
+            # Computing population estimates for the population
+            self._pcompute.init(**_args)
+            _handler = self._pcompute
+        return _handler.marketer()
+    def journalist(self, **_args):
+        """
+        This function delegates the calls to compute journalist risk of a given dataset or sample
+        :sample     optional sample dataset
+        :columns    optional columns of the dataset; if none are provided, an inference will be made using non-unique columns
+        """
+        if 'pop' not in _args :
+            if not 'sample' in _args and not 'columns' in _args :
+                _handler = self._compute
+            else:
+                self.init(**_args)
+                # _handler = Compute(**_args)
+            _handler = self._compute
+                # return _compute.journalist()
+        else:
+            self._pcompute.init(**_args)
+            _handler = self._pcompute
+        return _handler.journalist()
+    def prosecutor(self, **_args):
+        """
+        This function delegates the calls to compute prosecutor risk of a given dataset or sample
+        :sample     optional sample dataset
+        :columns    optional columns of the dataset; if none are provided, an inference will be made using non-unique columns
+        """
+        if 'pop' not in _args :
+            if not 'sample' in _args and not 'columns' in _args :
+                # _handler = self._compute
+                pass
+            else:
+                self.init(**_args)
+                # _handler = Compute(**_args)
+            _handler = self._compute
+
+        else:
+            self._pcompute.init(**_args)
+            _handler = self._pcompute
+        return _handler.prosecutor()
+    def pitman(self, **_args):
+
+        if 'population' not in _args :
+            pop_size = int(_args['pop_size'])
+            self._compute.set('pop_size', pop_size)
+            _handler = self._compute
+        else:
+            self._pcompute.init(**_args)
+            _handler = self._pcompute
+
+        return _handler.pitman()
+
+        # xi = pd.DataFrame({"sample_group_size":sample.groupby(cols,as_index=False).count()}).reset_index()
+        # yi = pd.DataFrame({"population_group_size":args['pop'].groupby(cols,as_index=False).size()}).reset_index()
+        # merged_groups = pd.merge(xi,yi,on=cols,how='inner')
+        # handle_population= Population()
+        # handle_population.set('merged_groups',merged_groups)
 class Risk :
     """
     This class is an abstraction of how we chose to structure risk computation, i.e. in 2 sub classes:
@@ -227,13 +335,31 @@ class Risk :
             self.cache[id] = {}
         self.cache[key] = value
 
-class Sample(Risk):
+class Compute(Risk):
     """
     This class will compute risk for the sample dataset: the marketer and prosecutor risk are computed by default.
     This class can optionally add pitman risk if the population size is known.
     """
-    def __init__(self):
-        Risk.__init__(self)
+    def __init__(self, **_args):
+        super().__init__()
+        self._sample = _args['sample'] if 'sample' in _args else pd.DataFrame()
+        self._columns = _args['columns'] if 'columns' in _args else None
+        self.cache['count'] = {'groups':0, 'fields':0, 'rows':0}
+        if not self._columns :
+            values = self._sample.apply(lambda col: col.unique().size / self._sample.shape[0])
+            self._dinfo = dict(zip(self._sample.columns.tolist(), values))
+            self._columns = [key for key in self._dinfo if self._dinfo[key] < 1]
+        #
+        # At this point we have all the columns that are valid candidates even if the user didn't specify them
+        self.cache['count']['fields'] = len(self._columns)
+        if self._sample.shape[0] > 0 and self._columns:
+            _sample = _args['sample']
+            _groups = self._sample.groupby(self._columns, as_index=False).count().values
+            self.set('groups', _groups)
+
+            self.cache['count']['groups'] = len(_groups)
+            self.cache['count']['rows']   = np.sum([_g[-1] for _g in _groups])
+
     def marketer(self):
         """
         computing marketer risk for sample dataset
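Note: the rewritten Compute.__init__ above infers quasi-identifier candidates when no columns are given; any column whose distinct-value count is below the row count (uniqueness ratio < 1) is kept. A standalone sketch of that rule, using hypothetical data:

    import pandas as pd
    sample = pd.DataFrame({'id':    [1, 2, 3, 4],                # ratio 1.0 -> dropped
                           'state': ['NY', 'NY', 'CA', 'CA']})   # ratio 0.5 -> kept
    ratios  = sample.apply(lambda col: col.unique().size / sample.shape[0])
    columns = [name for name, ratio in ratios.items() if ratio < 1]
    print(columns)   # ['state']
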
@@ -243,8 +369,10 @@ class Sample(Risk):
         groups = self.cache['groups']
         # group_count = groups.size
         # row_count   = groups.sum()
-        group_count = len(groups)
-        row_count = np.sum([_g[-1] for _g in groups])
+        # group_count = len(groups)
+        group_count = self.cache['count']['groups']
+        # row_count = np.sum([_g[-1] for _g in groups])
+        row_count = self.cache['count']['rows']
         return group_count / np.float64(row_count)
 
     def prosecutor(self):
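Note: with the cached counts, marketer risk reduces to the number of equivalence classes (groups) divided by the number of records. A small worked sketch of that ratio on hypothetical data:

    import pandas as pd
    sample = pd.DataFrame({'gender': ['M', 'F', 'F', 'M'], 'age': [30, 30, 40, 30]})
    sizes = sample.groupby(['gender', 'age']).size()    # (F,30):1, (F,40):1, (M,30):2
    marketer = sizes.shape[0] / float(sizes.sum())      # 3 groups / 4 rows = 0.75
    print(marketer)
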
@@ -259,40 +387,52 @@ class Sample(Risk):
     def unique_ratio(self):
         groups = self.cache['groups']
         # row_count = groups.sum()
-        row_count = np.sum([_g[-1] for _g in groups])
+        # row_count = np.sum([_g[-1] for _g in groups])
+        row_count = self.cache['count']['rows']
         # return groups[groups == 1].sum() / np.float64(row_count)
         values = [_g[-1] for _g in groups if _g[-1] == 1]
 
         return np.sum(values) / np.float64(row_count)
 
+    def journalist(self):
+        return self.unique_ratio()
     def pitman(self):
         """
         This function will approximate pitman de-identification risk based on pitman sampling
         """
 
         groups = self.cache['groups']
+        print (self.cache['pop_size'])
         si = groups[groups == 1].size
         # u = groups.size
         u = len(groups)
         alpha = np.divide(si, np.float64(u))
-        row_count = np.sum([_g[-1] for _g in groups])
+        # row_count = np.sum([_g[-1] for _g in groups])
+        row_count = self.cache['count']['rows']
 
         # f = np.divide(groups.sum(), np.float64(self.cache['pop_size']))
         f = np.divide(row_count, np.float64(self.cache['pop_size']))
         return np.power(f, 1-alpha)
 
-class Population(Sample):
+class Population(Compute):
     """
     This class will compute risk for datasets that have population information or datasets associated with them.
     This computation includes pitman risk (it requires minimal information about population)
     """
-    def __init__(self, **args):
-        Sample.__init__(self)
+    def __init__(self, **_args):
+        super().__init__(**_args)
+
+    def init(self, **_args):
+        xi = pd.DataFrame({"sample_group_size":self._sample.groupby(self._columns, as_index=False).count()}).reset_index()
+        yi = pd.DataFrame({"population_group_size":_args['population'].groupby(self._columns, as_index=False).size()}).reset_index()
+        merged_groups = pd.merge(xi, yi, on=self._columns, how='inner')
+        self.set('merged_groups', merged_groups)
+
     def set(self, key, value):
-        Sample.set(self, key, value)
+        super().set(key, value)
         if key == 'merged_groups' :
 
-            Sample.set(self, 'pop_size', np.float64(value.population_group_size.sum()))
-            Sample.set(self, 'groups', value.sample_group_size)
+            super().set('pop_size', np.float64(value.population_group_size.sum()))
+            super().set('groups', value.sample_group_size)
     """
     This class will measure risk and account for the existence of a population
     :merged_groups {sample_group_size, population_group_size} is a merged dataset with group sizes of both population and sample
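Note: in pitman() above, alpha is the share of sample groups that are singletons (si/u) and f is the sampling fraction (sample rows over pop_size); the risk is approximated as f**(1-alpha). A numeric sketch of that formula, working directly with group sizes rather than the cached groupby output; the population size of 100 is hypothetical:

    import numpy as np
    group_sizes = np.array([1, 1, 2, 4])        # sizes of the 4 sample equivalence classes
    si = (group_sizes == 1).sum()               # 2 singleton groups
    u = group_sizes.size                        # 4 groups
    alpha = si / np.float64(u)                  # 0.5
    f = group_sizes.sum() / np.float64(100)     # 8 of 100 population rows sampled -> 0.08
    print(np.power(f, 1 - alpha))               # ~0.28
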
@@ -301,6 +441,7 @@ class Population(Sample):
         """
         This function requires
         """
+
         r = self.cache['merged_groups']
         sample_row_count = r.sample_group_size.sum()
         #