Merge branch 'master' into dev

pull/2/head
Steve L. Nyemba 6 years ago
commit 5e453f8371

@ -1,16 +1,63 @@
# deid-risk
# Re-Identification Risk
The code below extends a data-frame by adding to it the ability to compute de-identification risk (marketer, prosecutor).
Because data-frames can connect to any database or file, it is the responsibility of the user to load the dataset into a data-frame.
This framework computes the re-identification risk of a dataset by extending pandas. It works like a pandas **add-on**.
The framework will compute the following risk measures: marketer, prosecutor, journalist and Pitman risk. References for the risk measures can be found at [ehealthinformation.ca](http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf) and [scb.se](https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf).
Basic examples that illustrate usage of the framework are in the notebook folder. The example is derived from
[ehealthinformation.ca](http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf)
There are two modes available:
**explore:**
Here the assumption is that we are not sure which attributes will be disclosed; the framework randomly generates combinations of attributes and evaluates each of them, reporting all the measures of risk.
**evaluation:**
Here the assumption is that we know the set of attributes to be used and we are interested in computing the associated risk.
### Four risk measures are computed:
- Marketer risk
- Prosecutor risk
- Journalist risk
- Pitman Risk
### Usage:
Install this package using pip as follows:
Stable:
pip install git+https://hiplab.mc.vanderbilt.edu/git/steve/deid-risk.git
Latest development (not fully tested):
pip install git+https://hiplab.mc.vanderbilt.edu/git/steve/deid-risk.git@risk
Dependencies:
numpy
pandas
The framework will depend on pandas and numpy (for now). Below is a basic sample to get started quickly.
import numpy as np
import pandas as pd
import risk
mydf = pd.DataFrame({"x":np.random.choice( np.random.randint(1,10),50),"y":np.random.choice( np.random.randint(1,10),50),"z":np.random.choice( np.random.randint(1,10),50),"r":np.random.choice( np.random.randint(1,10),50) })
print (mydf.risk.evaluate())
#
# computing journalist and pitman risk
# - Ensure the population size is much greater than the sample size
# - Ensure the fields are identical in both sample and population
#
pop = pd.DataFrame({"x":np.random.choice( np.random.randint(1,10),150),"y":np.random.choice( np.random.randint(1,10),150) ,"z":np.random.choice( np.random.randint(1,10),150),"r":np.random.choice( np.random.randint(1,10),150)})
print (mydf.risk.evaluate(pop=pop))
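A quick sketch of the **explore** mode described above (the `explore` method is defined alongside `evaluate` in the module; it samples random combinations of attributes, and `num_runs` controls how many policies are tried):
#
# exploring random combinations of attributes
#
print (mydf.risk.explore(num_runs=10))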
@TODO:
- Evaluation of how sparse attributes are (the ratio of non-null over rows)
- Have a smart way to drop attributes (based on the above in random policy search)
Basic examples that illustrate usage of the framework are in the notebook folder. The example is derived from [ehealthinformation.ca](http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf).
@TODO:
- Add support for journalist risk

@ -1,293 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
" This notebook is intended to show how to use the risk framework:\n",
" There are two basic usages:\n",
" 1. Experiment\n",
" \n",
" Here the framework will select a number of random fields other than the patient id and compute risk for the selection.\n",
" This will repeat over a designated number of runs.\n",
" \n",
" The parameters to pass to enable this mode are id=<patient id>,nun_runs=<number of runs>\n",
" 2. Assessment\n",
" \n",
" Here the framework assumes you are only interested in a list of quasi identifiers and will run the evaluation once for a given list of quasi identifiers.\n",
" The parameters to enable this mode are id=<patient id>,quasi_id=<list of quasi ids>\n",
"\"\"\"\n",
"import os\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"\n",
"#\n",
"#-- Loading a template file\n",
"# The example taken a de-identification white-paper\n",
"# http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf\n",
"#\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from io import StringIO\n",
"csv = \"\"\"\n",
"id,sex,age,profession,drug_test\n",
"1,M,37,doctor,-\n",
"2,F,28,doctor,+\n",
"3,M,37,doctor,-\n",
"4,M,28,doctor,+\n",
"5,M,28,doctor,-\n",
"6,M,37,doctor,-\n",
"\"\"\"\n",
"f = StringIO()\n",
"f.write(unicode(csv))\n",
"f.seek(0)\n",
"MY_DATAFRAME = pd.read_csv(f) "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
" Here's the pandas_risk code verbatim. \n",
" NOTE: \n",
"\"\"\"\n",
"@pd.api.extensions.register_dataframe_accessor(\"deid\")\n",
"class deid :\n",
" \"\"\"\n",
" This class is a deidentification class that will compute risk (marketer, prosecutor) given a pandas dataframe\n",
" \"\"\"\n",
" def __init__(self,df):\n",
" self._df = df\n",
" \n",
" def risk(self,**args):\n",
" \"\"\"\n",
" @param id name of patient field \n",
" @params num_runs number of runs (default will be 100)\n",
" @params quasi_id \tlist of quasi identifiers to be used (this will only perform a single run)\n",
" \"\"\"\n",
" \n",
" id = args['id']\n",
" if 'quasi_id' in args :\n",
" num_runs = 1\n",
" columns = list(set(args['quasi_id'])- set(id) )\n",
" else :\n",
" num_runs = args['num_runs'] if 'num_runs' in args else 100\n",
" columns = list(set(self._df.columns) - set([id]))\n",
" r = pd.DataFrame() \n",
" k = len(columns)\n",
" for i in range(0,num_runs) :\n",
" #\n",
" # let's chose a random number of columns and compute marketer and prosecutor risk\n",
" # Once the fields are selected we run a groupby clause\n",
" #\n",
" if 'quasi_id' not in args :\n",
" n = np.random.randint(2,k) #-- number of random fields we are picking\n",
" ii = np.random.choice(k,n,replace=False)\n",
" cols = np.array(columns)[ii].tolist()\n",
" else:\n",
" cols \t= columns\n",
" n \t= len(cols)\n",
" x_ = self._df.groupby(cols).count()[id].values\n",
" r = r.append(\n",
" pd.DataFrame(\n",
" [\n",
" {\n",
" \"selected\":n,\n",
" \"marketer\": x_.size / np.float64(np.sum(x_)),\n",
" \"prosecutor\":1 / np.float64(np.min(x_))\n",
"\n",
" }\n",
" ]\n",
" )\n",
" )\n",
" g_size = x_.size\n",
" n_ids = np.float64(np.sum(x_))\n",
"\n",
" return r"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>marketer</th>\n",
" <th>prosecutor</th>\n",
" <th>selected</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.500000</td>\n",
" <td>1.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.500000</td>\n",
" <td>1.0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.500000</td>\n",
" <td>1.0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.333333</td>\n",
" <td>1.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.333333</td>\n",
" <td>0.5</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" marketer prosecutor selected\n",
"0 0.500000 1.0 2\n",
"0 0.500000 1.0 3\n",
"0 0.500000 1.0 3\n",
"0 0.333333 1.0 2\n",
"0 0.333333 0.5 2"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#\n",
"# Lets us compute risk here for a random any random selection of quasi identifiers\n",
"# We will run this experiment 5 times\n",
"#\n",
"MY_DATAFRAME.deid.risk(id='id',num_runs=5)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>marketer</th>\n",
" <th>prosecutor</th>\n",
" <th>selected</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.5</td>\n",
" <td>1.0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" marketer prosecutor selected\n",
"0 0.5 1.0 3"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#\n",
"# In this scenario we are just interested in sex,profession,age\n",
"#\n",
"MY_DATAFRAME.deid.risk(id='id',quasi_id=['age','sex','profession'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.15rc1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -0,0 +1,69 @@
"""
# Re-Identification Risk
This framework computes the re-identification risk of a dataset by extending pandas. It works like a pandas **add-on**.
The framework will compute the following risk measures: marketer, prosecutor, journalist and Pitman risk.
References for the risk measures can be found on
- http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf
- https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf
There are two modes available:
**explore:**
Here the assumption is that we are not sure which attributes will be disclosed; the framework randomly generates combinations of attributes and evaluates each of them, reporting all the measures of risk.
**evaluation:**
Here the assumption is that we know the set of attributes to be used and we are interested in computing the associated risk.
### Four risk measures are computed:
- Marketer risk
- Prosecutor risk
- Journalist risk
- Pitman Risk
### Usage:
Install this package using pip as follows:
Stable:
pip install git+https://hiplab.mc.vanderbilt.edu/git/steve/deid-risk.git
Latest Development (not fully tested):
pip install git+https://hiplab.mc.vanderbilt.edu/git/steve/deid-risk.git@risk
The framework will depend on pandas and numpy (for now). Below is a basic sample to get started quickly.
import numpy as np
import pandas as pd
import risk
mydf = pd.DataFrame({"x":np.random.choice( np.random.randint(1,10),50),"y":np.random.choice( np.random.randint(1,10),50),"z":np.random.choice( np.random.randint(1,10),50),"r":np.random.choice( np.random.randint(1,10),50) })
print (mydf.risk.evaluate())
#
# computing journalist and pitman risk
# - Ensure the population size is much greater than the sample size
# - Ensure the fields are identical in both sample and population
#
pop = pd.DataFrame({"x":np.random.choice( np.random.randint(1,10),150),"y":np.random.choice( np.random.randint(1,10),150) ,"z":np.random.choice( np.random.randint(1,10),150),"r":np.random.choice( np.random.randint(1,10),150)})
print (mydf.risk.evaluate(pop=pop))
@TODO:
- Evaluation of how sparse attributes are (the ratio of non-null over rows)
- Have a smart way to drop attributes (based on the above in random policy search)
Basic examples that illustrate usage of the framework are in the notebook folder. The example is derived from the de-identification white-paper referenced above.
"""
from risk import deid

@ -0,0 +1,255 @@
"""
Health Information Privacy Lab
Brad. Malin, Weiyi Xia, Steve L. Nyemba
This framework computes re-identification risk of a dataset assuming the data being shared can be loaded into a dataframe (pandas)
The framework will compute the following risk measures:
- marketer
- prosecutor
- pitman
References :
https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf
This framework integrates pandas (for now) as an extension and can be used in two modes:
Experimental mode
Here the assumption is that we are not sure which attributes will be disclosed; the framework will explore a variety of combinations and associate risk measures with every random combination.
Evaluation mode
The evaluation mode assumes the given set of attributes is known and will evaluate risk for that subset of attributes.
Features:
- determine viable fields (quantifiable in terms of uniqueness). This is a way to identify fields that can act as identifiers.
- explore and evaluate risk of a sample dataset against a known population dataset
- explore and evaluate risk on a sample dataset
Usage:
from pandas_risk import *
mydataframe = pd.read_csv('/myfile.csv')
resp = mydataframe.risk.evaluate(cols=<list of quasi identifiers>)
resp = mydataframe.risk.explore(id=<name of patient field>,num_runs=<number of runs>)
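With a population reference (optional; population_df below is a placeholder for a data-frame holding the population with the same columns):
resp = mydataframe.risk.evaluate(pop=population_df,cols=<list of quasi identifiers>)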
@TODO:
- Provide a selected number of fields and risk will be computed for those fields.
- include journalist risk
"""
import pandas as pd
import numpy as np
import logging
import json
from datetime import datetime
import sys
@pd.api.extensions.register_dataframe_accessor("risk")
class deid :
"""
This class is a deidentification class that will compute risk (marketer, prosecutor) given a pandas dataframe
"""
def __init__(self,df):
self._df = df.fillna(' ')
def explore(self,**args):
"""
This function will perform experimentation by performing a random policies (combinations of attributes)
This function is intended to explore a variety of policies and evaluate their associated risk.
@param pop|sample data-frame with popublation reference
@param id key field that uniquely identifies patient/customer ...
"""
pop= args['pop'] if 'pop' in args else None
if 'pop_size' in args :
pop_size = np.float64(args['pop_size'])
else:
pop_size = -1
#
# Policies will be generated with a number of runs
#
RUNS = args['num_runs'] if 'num_runs' in args else 5
sample = args['sample'] if 'sample' in args else pd.DataFrame(self._df)
k = sample.columns.size if 'field_count' not in args else int(args['field_count']) + 1
if 'id' in args :
id = args['id']
columns = list(set(sample.columns.tolist()) - set([id]))
else:
columns = sample.columns.tolist()
o = pd.DataFrame()
for i in np.arange(RUNS):
n = np.random.randint(2,k)
cols = np.random.choice(columns,n,replace=False).tolist()
params = {'sample':sample,'cols':cols}
if pop is not None :
params['pop'] = pop
if pop_size > 0 :
params['pop_size'] = pop_size
r = self.evaluate(**params)
#
# let's put the policy in place
p = pd.DataFrame(1*sample.columns.isin(cols)).T
p.columns = sample.columns
o = o.append(r.join(p))
o.index = np.arange(o.shape[0]).astype(np.int64)
return o
def evaluate(self, **args):
"""
This function has the ability to evaluate risk associated with either a population or a sample dataset
:sample sample dataset
:pop population dataset
:cols list of columns of interest or policies
:flag user provided flag for the context of the evaluation
"""
if 'sample' in args :
sample = pd.DataFrame(args['sample'])
else:
sample = pd.DataFrame(self._df)
if not args or 'cols' not in args:
cols = sample.columns.tolist()
elif args and 'cols' in args:
cols = args['cols']
flag = 'UNFLAGGED' if 'flag' not in args else args['flag']
#
# @TODO: auto select the columns i.e removing the columns that will have the effect of an identifier
#
# if 'population' in args :
# pop = pd.DataFrame(args['population'])
r = {"flag":flag}
# if sample :
handle_sample = Sample()
xi = sample.groupby(cols,as_index=False).size().values
handle_sample.set('groups',xi)
if 'pop_size' in args :
pop_size = np.float64(args['pop_size'])
else:
pop_size = -1
#
#-- The following conditional line is to address the labels that will be returned
# @TODO: Find a more elegant way of doing this.
#
if 'pop' in args :
r['sample marketer'] = handle_sample.marketer()
r['sample prosecutor'] = handle_sample.prosecutor()
r['sample unique ratio'] = handle_sample.unique_ratio()
r['sample group count'] = xi.size
else:
r['marketer'] = handle_sample.marketer()
r['prosecutor'] = handle_sample.prosecutor()
r['unique ratio'] = handle_sample.unique_ratio()
r['group count'] = xi.size
if pop_size > 0 :
handle_sample.set('pop_size',pop_size)
r['pitman risk'] = handle_sample.pitman()
if 'pop' in args :
xi = pd.DataFrame({"sample_group_size":sample.groupby(cols,as_index=False).size()}).reset_index()
yi = pd.DataFrame({"population_group_size":args['pop'].groupby(cols,as_index=False).size()}).reset_index()
merged_groups = pd.merge(xi,yi,on=cols,how='inner')
handle_population= Population()
handle_population.set('merged_groups',merged_groups)
r['pop. marketer'] = handle_population.marketer()
r['pitman risk'] = handle_population.pitman()
r['pop. group size'] = np.unique(yi.population_group_size).size
#
# At this point we have the risk columns for the sample, the population, or both
#
r['field count'] = len(cols)
return pd.DataFrame([r])
class Risk :
"""
This class is an abstraction of how we chose to structure risk computation i.e in 2 sub classes:
- Sample computes risk associated with a sample dataset only
- Population computes risk associated with a population
"""
def __init__(self):
self.cache = {}
def set(self,key,value):
self.cache[key] = value
class Sample(Risk):
"""
This class will compute risk for the sample dataset: the marketer and prosecutor risk are computed by default.
This class can optionally add pitman risk if the population size is known.
"""
def __init__(self):
Risk.__init__(self)
def marketer(self):
"""
computing marketer risk for sample dataset
"""
groups = self.cache['groups']
group_count = groups.size
row_count = groups.sum()
return group_count / np.float64(row_count)
def prosecutor(self):
"""
The prosecutor risk consists in determining 1 over the smallest group size
It identifies if there is at least one record that is unique
"""
groups = self.cache['groups']
return 1 / np.float64(groups.min())
def unique_ratio(self):
groups = self.cache['groups']
row_count = groups.sum()
return groups[groups == 1].sum() / np.float64(row_count)
def pitman(self):
"""
This function will approximate pitman de-identification risk based on pitman sampling
"""
groups = self.cache['groups']
si = groups[groups == 1].size
u = groups.size
alpha = np.divide(si , np.float64(u) )
f = np.divide(groups.sum(), np.float64(self.cache['pop_size']))
return np.power(f,1-alpha)
class Population(Sample):
"""
This class will compute risk for datasets that have population information or datasets associated with them.
This computation includes pitman risk (it requires minimal information about population)
"""
def __init__(self,**args):
Sample.__init__(self)
def set(self,key,value):
Sample.set(self,key,value)
if key == 'merged_groups' :
Sample.set(self,'pop_size',np.float64(value.population_group_size.sum()) )
Sample.set(self,'groups',value.sample_group_size)
"""
This class will measure risk and account for the existence of a population
:merged_groups {sample_group_size, population_group_size} is a merged dataset with group sizes of both population and sample
"""
def marketer(self):
"""
This function requires
"""
r = self.cache['merged_groups']
sample_row_count = r.sample_group_size.sum()
#
# @TODO : make sure the above line is size (not sum)
# sample_row_count = r.sample_group_size.size
return r.apply(lambda row: (row.sample_group_size / np.float64(row.population_group_size)) /np.float64(sample_row_count) ,axis=1).sum()

@ -0,0 +1,14 @@
"""
This is a build file for the
"""
from setuptools import setup, find_packages
setup(
name = "risk",
version = "0.1",
author = "Health Information Privacy Lab",
author_email = "steve.l.nyemba@vanderbilt.edu",
license = "MIT",
packages=['risk'],
install_requires = ['numpy','pandas']
)

@ -1,115 +0,0 @@
"""
Health Information Privacy Lab
Steve L. Nyemba & Brad. Malin
This is an extension to the pandas data-frame that will perform a risk assessment on a variety of attributes
This implementation puts the responsibility on the user of the framework to join datasets and load the final results into a pandas data-frame.
The code will randomly select fields and compute the risk (marketer and prosecutor) and perform a given number of runs.
Usage:
from pandas_risk import *
mydataframe = pd.read_csv('/myfile.csv')
risk = mydataframe.deid.risk(id=<name of patient field>,num_runs=<number of runs>)
@TODO:
- Provide a selected number of fields and risk will be computed for those fields.
- include journalist risk
"""
import pandas as pd
import numpy as np
@pd.api.extensions.register_dataframe_accessor("deid")
class deid :
"""
This class is a deidentification class that will compute risk (marketer, prosecutor) given a pandas dataframe
"""
def __init__(self,df):
self._df = df
def risk(self,**args):
"""
@param id name of patient field
@params num_runs number of runs (default will be 100)
@params quasi_id list of quasi identifiers to be used (this will only perform a single run)
"""
id = args['id']
if 'quasi_id' in args :
num_runs = 1
columns = list(set(args['quasi_id'])- set(id) )
else :
num_runs = args['num_runs'] if 'num_runs' in args else 100
columns = list(set(self._df.columns) - set([id]))
r = pd.DataFrame()
k = len(columns)
N = self._df.shape[0]
tmp = self._df.fillna(' ')
np.random.seed(1)
for i in range(0,num_runs) :
#
# let's chose a random number of columns and compute marketer and prosecutor risk
# Once the fields are selected we run a groupby clause
#
if 'quasi_id' not in args :
if 'field_count' in args :
#
# We chose to limit how many fields we pass in
n = np.random.randint(2,int(args['field_count'])) #-- number of random fields we are picking
else :
n = np.random.randint(2,k) #-- number of random fields we are picking
ii = np.random.choice(k,n,replace=False)
cols = np.array(columns)[ii].tolist()
policy = np.zeros(k)
policy [ii] = 1
policy = pd.DataFrame(policy).T
else:
cols = columns
policy = np.ones(k)
policy = pd.DataFrame(policy).T
n = len(cols)
policy.columns = columns
N = tmp.shape[0]
x_ = tmp.groupby(cols).size().values
# print [id,i,n,k,self._df.groupby(cols).count()]
r = r.append(
pd.DataFrame(
[
{
"group_count":x_.size,
"patient_count":N,
"field_count":n,
"marketer": x_.size / np.float64(np.sum(x_)),
"prosecutor":1 / np.float64(np.min(x_))
}
]
).join(policy)
)
# g_size = x_.size
# n_ids = np.float64(np.sum(x_))
# sql = """
# SELECT COUNT(g_size) as group_count, :patient_count as patient_count,SUM(g_size) as rec_count, COUNT(g_size)/SUM(g_size) as marketer, 1/ MIN(g_size) as prosecutor, :n as field_count
# FROM (
# SELECT COUNT(*) as g_size,:key,:fields
# FROM :full_name
# GROUP BY :fields
# """.replace(":n",str(n)).replace(":fields",",".join(cols)).replace(":key",id).replace(":patient_count",str(N))
# r.append(self._df.query(sql.replace("\n"," ").replace("\r"," ") ))
return r
# df = pd.read_gbq("select * from deid_risk.risk_30k",private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json')
# r = df.deid.risk(id='person_id',num_runs=200)
# print r[['field_count','patient_count','marketer','prosecutor']]

@ -1,17 +0,0 @@
import sys
SYS_ARGS={}
if len(sys.argv) > 1 :
N = len(sys.argv)
for i in range(1,N) :
value = 1
if sys.argv[i].startswith('--') :
key = sys.argv[i].replace('-','')
if i + 1 < N and not sys.argv[i+1].startswith('--') :
value = sys.argv[i + 1].strip()
SYS_ARGS[key] = value
i += 2
elif 'action' not in SYS_ARGS:
SYS_ARGS['action'] = sys.argv[i].strip()
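#
# Example (a sketch; the script name and values are placeholders):
#   python risk.py compute --path account.json --i_dataset raw --key person_id --file
# yields SYS_ARGS = {'action':'compute','path':'account.json','i_dataset':'raw','key':'person_id','file':1}
# i.e. the first bare token becomes 'action', "--name value" pairs are stored as-is,
# and a trailing flag with no value defaults to 1.
#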

@ -1,287 +0,0 @@
"""
Steve L. Nyemba & Brad Malin
Health Information Privacy Lab.
This code is proof of concept as to how risk is computed against a database (at least a schema).
The engine will read tables that have a given criteria (patient id) and generate a dataset by performing joins.
Because joins are process intensive we decided to add a limit to the records pulled.
TL;DR:
This engine generates a dataset and computes risk (marketer and prosecutor)
Assumptions:
- We assume tables that reference patients will name the keys identically (best practice). This allows us to leverage data stores that don't support referential integrity
Usage :
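(a sketch based on the command-line handling at the bottom of this script; dataset/table names and the account file are placeholders)
python <this script> create  --path <bq service account json> --i_dataset <input dataset> --key <patient id field> --o_dataset <output dataset> --table <table name> [--file]
python <this script> compute --path <bq service account json> --i_dataset <input dataset> --key <patient id field> --table <table name> [--limit <number of runs>]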
Limitations
- It works against bigquery for now
@TODO:
- Need to write a transport layer (database interface)
- Support for referential integrity, so one table can be selected and a dataset derived given referential integrity
- Add support for journalist risk
"""
import pandas as pd
import numpy as np
from google.cloud import bigquery as bq
import time
from params import SYS_ARGS
class utils :
"""
This class is a utility class that will generate SQL-11 compatible code in order to run the risk assessment
@TODO: plugins for other data-stores
"""
def __init__(self,**args):
# self.path = args['path']
self.client = args['client']
def get_tables(self,**args): #id,key='person_id'):
"""
This function returns a list of tables given a key. The key is the name of the field that uniquely designates a patient/person
in the database. The list of tables are tables that can be joined given the provided field.
@param key name of the patient field
@param dataset dataset name
@param client initialized bigquery client ()
@return [{name,fields:[],row_count}]
"""
dataset = args['dataset']
client = args['client']
key = args['key']
r = []
ref = client.dataset(dataset)
tables = list(client.list_tables(ref))
TERMS = ['type','unit','count','refills','stop','supply','quantity']
for table in tables :
if table.table_id.strip() in ['people_seed','measurement','drug_exposure','procedure_occurrence','visit_occurrence','condition_occurrence','device_exposure']:
print ' skipping ...'
continue
ref = table.reference
table = client.get_table(ref)
schema = table.schema
rows = table.num_rows
if rows == 0 :
continue
names = [f.name for f in schema if len (set(TERMS) & set(f.name.strip().split("_"))) == 0 ]
x = list(set(names) & set([key]))
if x :
full_name = ".".join([dataset,table.table_id])
r.append({"name":table.table_id,"fields":names,"row_count":rows,"full_name":full_name})
return r
def get_field_name(self,alias,field_name,index):
"""
This function will format a field name given an index (the number of times it has occurred in the projection)
The index is intended to avoid a "duplicate field" error (bigquery issue)
@param alias alias of the table
@param field_name name of the field to be formatted
@param index the number of times the field appears in the projection
"""
name = [alias,field_name]
if index > 0 :
return ".".join(name)+" AS :field_name:index".replace(":field_name",field_name).replace(":index",str(index))
else:
return ".".join(name)
def get_filtered_table(self,table,key):
"""
This function will return a table with a single record per individual patient
"""
return """
SELECT :table.* FROM (
SELECT row_number() over () as top, * FROM :full_name ) as :table
INNER JOIN (
SELECT MAX(top) as top, :key FROM (
SELECT row_number() over () as top,:key from :full_name ) GROUP BY :key
)as filter
ON filter.top = :table.top and filter.:key = :table.:key
""".replace(":key",key).replace(":full_name",table['full_name']).replace(":table",table['name'])
def get_sql(self,**args):
"""
This function will generate the SQL that joins a list of tables given a key and a limit of records
@param tables list of tables
@param key key field to be used in the join. The assumption is that the field name is identical across tables (best practice!)
@param limit a limit imposed, in case of restrictions, considering joins are resource intensive
"""
tables = args['tables']
key = args['key']
limit = args['limit'] if 'limit' in args else 10000
limit = str(limit)
SQL = [
"""
SELECT :fields
FROM
"""]
fields = []
prev_table = None
for table in tables :
name = table['full_name'] #".".join([self.i_dataset,table['name']])
alias= table['name']
index = tables.index(table)
sql_ = """
(select * from :name ) as :alias
""".replace(":limit",limit)
# sql_ = " ".join(["(",self.get_filtered_table(table,key)," ) as :alias"])
sql_ = sql_.replace(":name",name).replace(":alias",alias).replace(":limit",limit)
fields += [self.get_field_name(alias,field_name,index) for field_name in table['fields'] if field_name != key or (field_name==key and tables.index(table) == 0) ]
if tables.index(table) > 0 :
join = """
INNER JOIN :sql ON :alias.:field = :prev_alias.:field
""".replace(":name",name)
join = join.replace(":alias",alias).replace(":field",key).replace(":prev_alias",prev_alias)
sql_ = join.replace(":sql",sql_)
# sql_ = " ".join([sql_,join])
SQL += [sql_]
if index == 0:
prev_alias = str(alias)
return " ".join(SQL).replace(":fields"," , ".join(fields))
class risk :
"""
This class will handle the creation of an SQL query that computes marketer and prosecutor risk (for now)
"""
def __init__(self):
pass
def get_sql(self,**args) :
"""
This function returns the SQL Query that will compute marketer and prosecutor risk
@param key key fields (patient identifier)
@param table table that is subject of the computation
"""
key = args['key']
table = args['table']
fields = list(set(table['fields']) - set([key]))
#-- We need to select n-fields max 64
k = len(fields)
if 'field_count' in args :
n = np.random.randint(2, int(args['field_count']) ) #-- number of random fields we are picking
else:
n = np.random.randint(2,k) #-- how many random fields are we processing
ii = np.random.choice(k,n,replace=False)
stream = np.zeros(len(fields) + 1)
stream[ii] = 1
stream = pd.DataFrame(stream.tolist()).T
stream.columns = args['table']['fields']
fields = list(np.array(fields)[ii])
sql = """
SELECT COUNT(g_size) as group_count,SUM(g_size) as patient_count, COUNT(g_size)/SUM(g_size) as marketer, 1/ MIN(g_size) as prosecutor, :n as field_count
FROM (
SELECT COUNT(*) as g_size,:fields
FROM :full_name
GROUP BY :fields
)
""".replace(":fields", ",".join(fields)).replace(":full_name",table['full_name']).replace(":key",key).replace(":n",str(n))
return {"sql":sql,"stream":stream}
if 'action' in SYS_ARGS and SYS_ARGS['action'] in ['create','compute','migrate'] :
path = SYS_ARGS['path']
client = bq.Client.from_service_account_json(path)
i_dataset = SYS_ARGS['i_dataset']
key = SYS_ARGS['key']
mytools = utils(client = client)
tables = mytools.get_tables(dataset=i_dataset,client=client,key=key)
# print len(tables)
# tables = tables[:6]
if SYS_ARGS['action'] == 'create' :
#usage:
# create --i_dataset <in dataset> --key <patient id> --o_dataset <out dataset> --table <table|file> [--file] --path <bq JSON account file>
#
create_sql = mytools.get_sql(tables=tables,key=key) #-- The create statement
o_dataset = SYS_ARGS['o_dataset']
table = SYS_ARGS['table']
if 'file' in SYS_ARGS :
f = open(table+'.sql','w')
f.write(create_sql)
f.close()
else:
job = bq.QueryJobConfig()
job.destination = client.dataset(o_dataset).table(table)
job.use_query_cache = True
job.allow_large_results = True
job.priority = 'BATCH'
job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
r = client.query(create_sql,location='US',job_config=job)
print [r.job_id,' ** ',r.state]
elif SYS_ARGS['action'] == 'migrate' :
#
#
o_dataset = SYS_ARGS['o_dataset']
for table in tables:
sql = " ".join(["SELECT ",",".join(table['fields']) ," FROM (",mytools.get_filtered_table(table,key),") as ",table['name']])
print ""
print sql
print ""
# job = bq.QueryJobConfig()
# job.destination = client.dataset(o_dataset).table(table['name'])
# job.use_query_cache = True
# job.allow_large_results = True
# job.priority = 'INTERACTIVE'
# job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
# r = client.query(sql,location='US',job_config=job)
# print [table['full_name'],' ** ',r.job_id,' ** ',r.state]
pass
else:
#
#
tables = [tab for tab in tables if tab['name'] == SYS_ARGS['table'] ]
limit = int(SYS_ARGS['limit']) if 'limit' in SYS_ARGS else 1
if tables :
risk= risk()
df = pd.DataFrame()
dfs = pd.DataFrame()
np.random.seed(1)
for i in range(0,limit) :
r = risk.get_sql(key=SYS_ARGS['key'],table=tables[0])
sql = r['sql']
dfs = dfs.append(r['stream'],sort=True)
df = df.append(pd.read_gbq(query=sql,private_key=path,dialect='standard').join(dfs))
# df = df.join(dfs,sort=True)
df.to_csv(SYS_ARGS['table']+'.csv')
# dfs.to_csv(SYS_ARGS['table']+'_stream.csv')
print [i,' ** ',df.shape[0],pd.DataFrame(r['stream']).shape]
time.sleep(2)
else:
print 'ERROR'
pass
# r = risk(path='/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json', i_dataset='raw',o_dataset='risk_o',o_table='mo')
# tables = r.get_tables('raw','person_id')
# sql = r.get_sql(tables=tables[:3],key='person_id')
# #
# # let's post this to a designated location
# #
# f = open('foo.sql','w')
# f.write(sql)
# f.close()
# r.get_sql(tables=tables,key='person_id')
# p = r.compute()
# print p
# p.to_csv("risk.csv")
# r.write('foo.sql')