Refactored, including population risk assessment

pull/2/head
Steve L. Nyemba 6 years ago
parent 6863df382e
commit c3066408c9

@@ -22,16 +22,108 @@
 """
 import pandas as pd
 import numpy as np
+import time
 @pd.api.extensions.register_dataframe_accessor("deid")
 class deid :
     """
     This class is a deidentification class that will compute risk (marketer, prosecutor) given a pandas dataframe
     """
     def __init__(self,df):
-        self._df = df
+        self._df = df.fillna(' ')
+    def explore(self,**args):
+        """
+        This function performs experimentation by generating random policies (combinations of attributes).
+        It is intended to explore a variety of policies and evaluate their associated risk.
+        @param pop|sample   data-frame with population reference
+        @param id           key field that uniquely identifies a patient/customer ...
+        """
+        # id = args['id']
+        pop = args['pop'] if 'pop' in args else None
+        # if 'columns' in args :
+        #     cols = args['columns']
+        #     params = {"sample":args['data'],"cols":cols}
+        #     if pop is not None :
+        #         params['pop'] = pop
+        #     return self.evaluate(**params)
+        # else :
+        #
+        # Policies will be generated with a number of runs
+        #
+        RUNS = args['num_runs'] if 'num_runs' in args else 5
+        sample = args['sample'] if 'sample' in args else pd.DataFrame(self._df)
+        k = sample.columns.size - 1 if 'field_count' not in args else int(args['field_count'])
+        columns = list(set(sample.columns.tolist()) - set([id]))
+        o = pd.DataFrame()
+        # pop = args['pop'] if 'pop' in args else None
+        for i in np.arange(RUNS):
+            n = np.random.randint(2,k)
+            cols = np.random.choice(columns,n,replace=False).tolist()
+            params = {'sample':sample,'cols':cols}
+            if pop is not None :
+                params['pop'] = pop
+            r = self.evaluate(**params)
+            #
+            # let's put the policy in place
+            p = pd.DataFrame(1*sample.columns.isin(cols)).T
+            p.columns = sample.columns
+            o = o.append(r.join(p))
+        o.index = np.arange(o.shape[0]).astype(np.int64)
+        return o
+    def evaluate(self,**args) :
+        """
+        This function will compute the marketer risk; if a population is provided it will evaluate the marketer risk relative to both the population and the sample
+        @param sample   data-frame with the data to be processed
+        @param cols     the columns (policy) to be considered
+        @param pop      population dataset
+        @param flag     user defined flag (no computation use)
+        """
+        if (args and 'sample' not in args) or not args :
+            x_i = pd.DataFrame(self._df)
+        elif args and 'sample' in args :
+            x_i = args['sample']
+        if (args and 'cols' not in args) or not args :
+            cols = x_i.columns.tolist()
+            # cols = self._df.columns.tolist()
+        elif args and 'cols' in args :
+            cols = args['cols']
+        flag = args['flag'] if 'flag' in args else 'UNFLAGGED'
+        # if args and 'sample' in args :
+        #     x_i = pd.DataFrame(self._df)
+        # else :
+        #     cols = args['cols'] if 'cols' in args else self._df.columns.tolist()
+        # x_i = x_i.groupby(cols,as_index=False).size().values
+        x_i_values = x_i.groupby(cols,as_index=False).size().values
+        SAMPLE_GROUP_COUNT = x_i_values.size
+        SAMPLE_FIELD_COUNT = len(cols)
+        SAMPLE_POPULATION = x_i_values.sum()
+        SAMPLE_MARKETER = SAMPLE_GROUP_COUNT / np.float64(SAMPLE_POPULATION)
+        SAMPLE_PROSECUTOR = 1 / np.min(x_i_values).astype(np.float64)
+        if 'pop' in args :
+            Yi = args['pop']
+            y_i = pd.DataFrame({"group_size":Yi.groupby(cols,as_index=False).size()}).reset_index()
+            # y_i['group'] = pd.DataFrame({"group_size":args['pop'].groupby(cols,as_index=False).size().values}).reset_index()
+            # x_i = pd.DataFrame({"group_size":x_i.groupby(cols,as_index=False).size().values}).reset_index()
+            x_i = pd.DataFrame({"group_size":x_i.groupby(cols,as_index=False).size()}).reset_index()
+            SAMPLE_RATIO = int(100 * x_i.size/args['pop'].shape[0])
+            r = pd.merge(x_i,y_i,on=cols,how='inner')
+            r['marketer'] = r.apply(lambda row: (row.group_size_x / np.float64(row.group_size_y)) / np.sum(x_i.group_size), axis=1)
+            r['sample %'] = np.repeat(SAMPLE_RATIO,r.shape[0])
+            r['tier'] = np.repeat(flag,r.shape[0])
+            r['sample marketer'] = np.repeat(SAMPLE_MARKETER,r.shape[0])
+            r = r.groupby(['sample %','tier','sample marketer'],as_index=False).sum()[['sample %','marketer','sample marketer','tier']]
+        else:
+            r = pd.DataFrame({"marketer":[SAMPLE_MARKETER],"prosecutor":[SAMPLE_PROSECUTOR],"field_count":[SAMPLE_FIELD_COUNT],"group_count":[SAMPLE_GROUP_COUNT]})
+        return r
-    def risk(self,**args):
+    def _risk(self,**args):
         """
         @param id       name of patient field
         @params num_runs    number of runs (default will be 100)
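
For orientation, a minimal usage sketch of the accessor introduced above. It assumes this module has been imported so that the "deid" accessor is registered, and a pandas version contemporary with the commit (the code relies on DataFrame.append and on groupby(...).size().values returning a flat array of counts); the frame and column names below are invented for illustration.

    import pandas as pd

    sample = pd.DataFrame({
        "gender"   : ["M","F","F","M","F","M"],
        "age_group": ["30-39","30-39","40-49","30-39","40-49","50-59"],
        "zip3"     : ["372","372","370","615","372","370"],
        "race"     : ["W","W","B","W","A","B"]
    })

    # marketer/prosecutor risk for one explicit policy (a set of quasi-identifiers)
    r = sample.deid.evaluate(cols=["gender","age_group"])

    # random policy exploration: each returned row carries the risk measures of one
    # randomly drawn policy plus 0/1 indicators for the columns that policy used
    o = sample.deid.explore(num_runs=5)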
@@ -50,7 +142,7 @@ class deid :
         k = len(columns)
         N = self._df.shape[0]
         tmp = self._df.fillna(' ')
-        np.random.seed(1)
+        np.random.seed(int(time.time()))
         for i in range(0,num_runs) :
             #
@@ -85,6 +177,7 @@ class deid :
             [
                 {
                     "group_count":x_.size,
                     "patient_count":N,
                     "field_count":n,
                     "marketer": x_.size / np.float64(np.sum(x_)),

@@ -146,7 +146,7 @@ class utils :
         return " ".join(SQL).replace(":fields"," , ".join(fields))
-class risk :
+class SQLRisk :
     """
     This class will handle the creation of an SQL query that computes marketer and prosecutor risk (for now)
     """
@@ -186,102 +186,163 @@ class risk :
-if 'action' in SYS_ARGS and SYS_ARGS['action'] in ['create','compute','migrate'] :
-
-    path = SYS_ARGS['path']
-    client = bq.Client.from_service_account_json(path)
-    i_dataset = SYS_ARGS['i_dataset']
-    key = SYS_ARGS['key']
-    mytools = utils(client = client)
-    tables = mytools.get_tables(dataset=i_dataset,client=client,key=key)
-    # print len(tables)
-    # tables = tables[:6]
-    if SYS_ARGS['action'] == 'create' :
-        #usage:
-        # create --i_dataset <in dataset> --key <patient id> --o_dataset <out dataset> --table <table|file> [--file] --path <bq JSON account file>
-        #
-        create_sql = mytools.get_sql(tables=tables,key=key) #-- The create statement
-        o_dataset = SYS_ARGS['o_dataset']
-        table = SYS_ARGS['table']
-        if 'file' in SYS_ARGS :
-            f = open(table+'.sql','w')
-            f.write(create_sql)
-            f.close()
-        else:
-            job = bq.QueryJobConfig()
-            job.destination = client.dataset(o_dataset).table(table)
-            job.use_query_cache = True
-            job.allow_large_results = True
-            job.priority = 'BATCH'
-            job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
-            r = client.query(create_sql,location='US',job_config=job)
-            print [r.job_id,' ** ',r.state]
-    elif SYS_ARGS['action'] == 'migrate' :
-        #
-        #
-        o_dataset = SYS_ARGS['o_dataset']
-        for table in tables:
-            sql = " ".join(["SELECT ",",".join(table['fields']) ," FROM (",mytools.get_filtered_table(table,key),") as ",table['name']])
-            print ""
-            print sql
-            print ""
-            # job = bq.QueryJobConfig()
-            # job.destination = client.dataset(o_dataset).table(table['name'])
-            # job.use_query_cache = True
-            # job.allow_large_results = True
-            # job.priority = 'INTERACTIVE'
-            # job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
-            # r = client.query(sql,location='US',job_config=job)
-            # print [table['full_name'],' ** ',r.job_id,' ** ',r.state]
-        pass
-    else:
-        #
-        #
-        tables = [tab for tab in tables if tab['name'] == SYS_ARGS['table'] ]
-        limit = int(SYS_ARGS['limit']) if 'limit' in SYS_ARGS else 1
-        if tables :
-            risk= risk()
-            df = pd.DataFrame()
-            dfs = pd.DataFrame()
-            np.random.seed(1)
-            for i in range(0,limit) :
-                r = risk.get_sql(key=SYS_ARGS['key'],table=tables[0])
-                sql = r['sql']
-                dfs = dfs.append(r['stream'],sort=True)
-                df = df.append(pd.read_gbq(query=sql,private_key=path,dialect='standard').join(dfs))
-                # df = df.join(dfs,sort=True)
-                df.to_csv(SYS_ARGS['table']+'.csv')
-                # dfs.to_csv(SYS_ARGS['table']+'_stream.csv')
-                print [i,' ** ',df.shape[0],pd.DataFrame(r['stream']).shape]
-                time.sleep(2)
-        else:
-            print 'ERROR'
-            pass
-
-# r = risk(path='/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json', i_dataset='raw',o_dataset='risk_o',o_table='mo')
-# tables = r.get_tables('raw','person_id')
-# sql = r.get_sql(tables=tables[:3],key='person_id')
-# #
-# # let's post this to a designated location
-# #
-# f = open('foo.sql','w')
-# f.write(sql)
-# f.close()
-# r.get_sql(tables=tables,key='person_id')
-# p = r.compute()
-# print p
-# p.to_csv("risk.csv")
-# r.write('foo.sql')
+class UtilHandler :
+    def __init__(self,**args) :
+        """
+        @param path         path to the service account file
+        @param dataset      input dataset name
+        @param key_field    key_field (e.g person_id)
+        @param key_table
+        """
+        self.path = args['path']
+        self.client = bq.Client.from_service_account_json(self.path)
+        dataset = args['dataset']
+        self.key = args['key_field']
+        self.mytools = utils(client = self.client)
+        self.tables = self.mytools.get_tables(dataset=dataset,client=self.client,key=self.key)
+        index = [ self.tables.index(item) for item in self.tables if item['name'] == args['key_table']] [0]
+        if index != 0 :
+            first = self.tables[0]
+            aux = self.tables[index]
+            self.tables[0] = aux
+            self.tables[index] = first
+        if 'filter' in args :
+            self.tables = [item for item in self.tables if item['name'] in args['filter']]
+
+    def create_table(self,**args):
+        """
+        @param path absolute filename to save the create statement
+        """
+        create_sql = self.mytools.get_sql(tables=self.tables,key=self.key) #-- The create statement
+        # o_dataset = SYS_ARGS['o_dataset']
+        # table = SYS_ARGS['table']
+        if 'path' in args:
+            f = open(args['path'],'w')
+            f.write(create_sql)
+            f.close()
+        return create_sql
+
+    def migrate_tables(self,**args):
+        """
+        This function will migrate a table from one location to another
+        The reason for migration is to be able to reduce a candidate table to only represent a patient by her quasi-identifiers.
+        @param dataset target dataset
+        """
+        o_dataset = args['dataset'] if 'dataset' in args else None
+        p = []
+        for table in self.tables:
+            sql = " ".join(["SELECT ",",".join(table['fields']) ," FROM (",self.mytools.get_filtered_table(table,self.key),") as ",table['name']])
+            p.append(sql)
+            if o_dataset :
+                job = bq.QueryJobConfig()
+                job.destination = self.client.dataset(o_dataset).table(table['name'])
+                job.use_query_cache = True
+                job.allow_large_results = True
+                job.priority = 'INTERACTIVE'
+                job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
+
+                r = self.client.query(sql,location='US',job_config=job)
+
+                print [table['full_name'],' ** ',r.job_id,' ** ',r.state]
+        return p
+
+# if 'action' in SYS_ARGS and SYS_ARGS['action'] in ['create','compute','migrate'] :
+# path = SYS_ARGS['path']
+# client = bq.Client.from_service_account_json(path)
+# i_dataset = SYS_ARGS['i_dataset']
+# key = SYS_ARGS['key']
+# mytools = utils(client = client)
+# tables = mytools.get_tables(dataset=i_dataset,client=client,key=key)
+# # print len(tables)
+# # tables = tables[:6]
+# if SYS_ARGS['action'] == 'create' :
+# #usage:
+# # create --i_dataset <in dataset> --key <patient id> --o_dataset <out dataset> --table <table|file> [--file] --path <bq JSON account file>
+# #
+# create_sql = mytools.get_sql(tables=tables,key=key) #-- The create statement
+# o_dataset = SYS_ARGS['o_dataset']
+# table = SYS_ARGS['table']
+# if 'file' in SYS_ARGS :
+# f = open(table+'.sql','w')
+# f.write(create_sql)
+# f.close()
+# else:
+# job = bq.QueryJobConfig()
+# job.destination = client.dataset(o_dataset).table(table)
+# job.use_query_cache = True
+# job.allow_large_results = True
+# job.priority = 'BATCH'
+# job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
+# r = client.query(create_sql,location='US',job_config=job)
+# print [r.job_id,' ** ',r.state]
+# elif SYS_ARGS['action'] == 'migrate' :
+# #
+# #
+# o_dataset = SYS_ARGS['o_dataset']
+# for table in tables:
+# sql = " ".join(["SELECT ",",".join(table['fields']) ," FROM (",mytools.get_filtered_table(table,key),") as ",table['name']])
+# print ""
+# print sql
+# print ""
+# # job = bq.QueryJobConfig()
+# # job.destination = client.dataset(o_dataset).table(table['name'])
+# # job.use_query_cache = True
+# # job.allow_large_results = True
+# # job.priority = 'INTERACTIVE'
+# # job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
+# # r = client.query(sql,location='US',job_config=job)
+# # print [table['full_name'],' ** ',r.job_id,' ** ',r.state]
+# pass
+# else:
+# #
+# #
+# tables = [tab for tab in tables if tab['name'] == SYS_ARGS['table'] ]
+# limit = int(SYS_ARGS['limit']) if 'limit' in SYS_ARGS else 1
+# if tables :
+# risk= risk()
+# df = pd.DataFrame()
+# dfs = pd.DataFrame()
+# np.random.seed(1)
+# for i in range(0,limit) :
+# r = risk.get_sql(key=SYS_ARGS['key'],table=tables[0])
+# sql = r['sql']
+# dfs = dfs.append(r['stream'],sort=True)
+# df = df.append(pd.read_gbq(query=sql,private_key=path,dialect='standard').join(dfs))
+# # df = df.join(dfs,sort=True)
+# df.to_csv(SYS_ARGS['table']+'.csv')
+# # dfs.to_csv(SYS_ARGS['table']+'_stream.csv')
+# print [i,' ** ',df.shape[0],pd.DataFrame(r['stream']).shape]
+# time.sleep(2)
+# else:
+# print 'ERROR'
+# pass
+# # r = risk(path='/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json', i_dataset='raw',o_dataset='risk_o',o_table='mo')
+# # tables = r.get_tables('raw','person_id')
+# # sql = r.get_sql(tables=tables[:3],key='person_id')
+# # #
+# # # let's post this to a designated location
+# # #
+# # f = open('foo.sql','w')
+# # f.write(sql)
+# # f.close()
+# # r.get_sql(tables=tables,key='person_id')
+# # p = r.compute()
+# # print p
+# # p.to_csv("risk.csv")
+# # r.write('foo.sql')
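
A hedged usage sketch for the UtilHandler class added above. The service-account path, dataset and table names below are placeholders rather than values taken from the commit, and bq refers to the google.cloud.bigquery module this file already relies on.

    handler = UtilHandler(path='/path/to/service-account.json',
                          dataset='raw',
                          key_field='person_id',
                          key_table='person')

    # write the generated CREATE statement to disk and also get it back as a string
    create_sql = handler.create_table(path='create_deid_views.sql')

    # build one reduction query per table; because a target dataset is given, the
    # queries are also submitted as BigQuery jobs, and the list of SQL statements is returned
    queries = handler.migrate_tables(dataset='deid_raw')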
