@@ -146,7 +146,7 @@ class utils :
return " ".join(SQL).replace(":fields"," , ".join(fields))
class risk :
class SQLRisk :
"""
This class will handle the creation of an SQL query that computes marketer and prosecutor risk (for now)
"""
@@ -186,102 +186,163 @@ class risk :
class UtilHandler :
def __init__(self,**args) :
"""
@param path path to the service account file
@param dataset input dataset name
@param key_field key field (e.g. person_id)
@param key_table name of the table to be placed first in the list of tables
"""
self.path = args['path']
self.client = bq.Client.from_service_account_json(self.path)
dataset = args['dataset']
self.key = args['key_field']
if 'action' in SYS_ARGS and SYS_ARGS['action'] in ['create','compute','migrate'] :
self.mytools = utils(client = self.client)
self.tables = self.mytools.get_tables(dataset=dataset,client=self.client,key=self.key)
index = [ self.tables.index(item) for item in self.tables if item['name'] == args['key_table']] [0]
if index != 0 :
first = self.tables[0]
aux = self.tables[index]
self.tables[0] = aux
self.tables[index] = first
if 'filter' in args :
self.tables = [item for item in self.tables if item['name'] in args['filter']]
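As a usage sketch (not part of the diff): the constructor takes the service-account path, the input dataset, the key field and the key table; the path and table names below are placeholders.

# Illustrative only -- values are placeholders, not taken from this change
handler = UtilHandler(
    path='/path/to/service-account.json',
    dataset='raw',
    key_field='person_id',
    key_table='person'
)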
path = SYS_ARGS['path']
client = bq.Client.from_service_account_json(path)
i_dataset = SYS_ARGS['i_dataset']
key = SYS_ARGS['key']
mytools = utils(client = client)
def create_table(self,**args):
tables = mytools.get_tables(dataset=i_dataset,client=client,key=key)
"""
# print len(tables)
@param path absolute filename to save the create statement
# tables = tables[:6]
if SYS_ARGS['action'] == 'create' :
"""
#usage:
create_sql = self.mytools.get_sql(tables=self.tables,key=self.key) #-- The create statement
# create --i_dataset <in dataset> --key <patient id> --o_dataset <out dataset> --table <table|file> [--file] --path <bq JSON account file>
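For illustration, such an invocation could look like the line below; the script name and argument values are assumptions, not part of this diff.

#   python risk.py create --i_dataset raw --key person_id --o_dataset risk_o --table person --path /path/to/service-account.json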
# o_dataset = SYS_ARGS['o_dataset']
#
# table = SYS_ARGS['table']
create_sql = mytools.get_sql(tables=tables,key=key) #-- The create statement
if 'path' in args:
o_dataset = SYS_ARGS['o_dataset']
f = open(args['path'],'w')
table = SYS_ARGS['table']
if 'file' in SYS_ARGS :
f = open(table+'.sql','w')
f.write(create_sql)
f.write(create_sql)
f.close()
f.close()
else:
return create_sql
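A usage sketch for the new create_table method (the output filename is a placeholder): it returns the create statement and, when a path is given, also writes it to disk.

# Illustrative only
create_sql = handler.create_table(path='create_tables.sql')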
job = bq.QueryJobConfig()
def migrate_tables(self,**args):
job.destination = client.dataset(o_dataset).table(table)
"""
job.use_query_cache = True
This function will migrate a table from one location to another
job.allow_large_results = True
The reason for migration is to be able to reduce a candidate table to only represent a patient by her quasi-identifiers.
job.priority = 'BATCH'
@param dataset target dataset
job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
"""
o_dataset = args['dataset'] if 'dataset' in args else None
p = []
for table in self.tables:
sql = " ".join(["SELECT ",",".join(table['fields']) ," FROM (",self.mytools.get_filtered_table(table,self.key),") as ",table['name']])
p.append(sql)
if o_dataset :
job = bq.QueryJobConfig()
job.destination = self.client.dataset(o_dataset).table(table['name'])
job.use_query_cache = True
job.allow_large_results = True
job.priority = 'INTERACTIVE'
job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
r = client.query(create_sql,location='US',job_config=job)
r = self.client.query(sql,location='US',job_config=job)
print [table['full_name'],' ** ',r.job_id,' ** ',r.state]
return p
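A usage sketch for migrate_tables (the target dataset name is a placeholder): it returns the list of generated SELECT statements and, when a dataset is given, also submits one BigQuery query job per table.

# Illustrative only
queries = handler.migrate_tables(dataset='risk_o')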
# if 'action' in SYS_ARGS and SYS_ARGS['action'] in ['create','compute','migrate'] :
# path = SYS_ARGS['path']
# client = bq.Client.from_service_account_json(path)
# i_dataset = SYS_ARGS['i_dataset']
# key = SYS_ARGS['key']
# mytools = utils(client = client)
# tables = mytools.get_tables(dataset=i_dataset,client=client,key=key)
# # print len(tables)
# # tables = tables[:6]
# if SYS_ARGS['action'] == 'create' :
# #usage:
# # create --i_dataset <in dataset> --key <patient id> --o_dataset <out dataset> --table <table|file> [--file] --path <bq JSON account file>
# #
# create_sql = mytools.get_sql(tables=tables,key=key) #-- The create statement
# o_dataset = SYS_ARGS['o_dataset']
# table = SYS_ARGS['table']
# if 'file' in SYS_ARGS :
# f = open(table+'.sql','w')
# f.write(create_sql)
# f.close()
# else:
# job = bq.QueryJobConfig()
# job.destination = client.dataset(o_dataset).table(table)
# job.use_query_cache = True
# job.allow_large_results = True
# job.priority = 'BATCH'
# job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
# r = client.query(create_sql,location='US',job_config=job)
print [r.job_id,' ** ',r.state]
# print [r.job_id,' ** ',r.state]
elif SYS_ARGS['action'] == 'migrate' :
# elif SYS_ARGS['action'] == 'migrate' :
#
# #
#
# #
o_dataset = SYS_ARGS['o_dataset']
# o_dataset = SYS_ARGS['o_dataset']
for table in tables:
# for table in tables:
sql = " ".join(["SELECT ",",".join(table['fields']) ," FROM (",mytools.get_filtered_table(table,key),") as ",table['name']])
# sql = " ".join(["SELECT ",",".join(table['fields']) ," FROM (",mytools.get_filtered_table(table,key),") as ",table['name']])
print ""
# print ""
print sql
# print sql
print ""
# print ""
# job = bq.QueryJobConfig()
# # job = bq.QueryJobConfig()
# job.destination = client.dataset(o_dataset).table(table['name'])
# # job.destination = client.dataset(o_dataset).table(table['name'])
# job.use_query_cache = True
# # job.use_query_cache = True
# job.allow_large_results = True
# # job.allow_large_results = True
# job.priority = 'INTERACTIVE'
# # job.priority = 'INTERACTIVE'
# job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
# # job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
# r = client.query(sql,location='US',job_config=job)
# # r = client.query(sql,location='US',job_config=job)
# print [table['full_name'],' ** ',r.job_id,' ** ',r.state]
# # print [table['full_name'],' ** ',r.job_id,' ** ',r.state]
pass
# pass
else:
# else:
#
# #
#
# #
tables = [tab for tab in tables if tab['name'] == SYS_ARGS['table'] ]
# tables = [tab for tab in tables if tab['name'] == SYS_ARGS['table'] ]
limit = int(SYS_ARGS['limit']) if 'limit' in SYS_ARGS else 1
# limit = int(SYS_ARGS['limit']) if 'limit' in SYS_ARGS else 1
if tables :
# if tables :
risk= risk()
# risk= risk()
df = pd.DataFrame()
# df = pd.DataFrame()
dfs = pd.DataFrame()
# dfs = pd.DataFrame()
np.random.seed(1)
# np.random.seed(1)
for i in range(0,limit) :
# for i in range(0,limit) :
r = risk.get_sql(key=SYS_ARGS['key'],table=tables[0])
# r = risk.get_sql(key=SYS_ARGS['key'],table=tables[0])
sql = r['sql']
# sql = r['sql']
dfs = dfs.append(r['stream'],sort=True)
# dfs = dfs.append(r['stream'],sort=True)
df = df.append(pd.read_gbq(query=sql,private_key=path,dialect='standard').join(dfs))
# df = df.append(pd.read_gbq(query=sql,private_key=path,dialect='standard').join(dfs))
# df = df.join(dfs,sort=True)
# # df = df.join(dfs,sort=True)
df.to_csv(SYS_ARGS['table']+'.csv')
# df.to_csv(SYS_ARGS['table']+'.csv')
# dfs.to_csv(SYS_ARGS['table']+'_stream.csv')
# # dfs.to_csv(SYS_ARGS['table']+'_stream.csv')
print [i,' ** ',df.shape[0],pd.DataFrame(r['stream']).shape]
# print [i,' ** ',df.shape[0],pd.DataFrame(r['stream']).shape]
time.sleep(2)
# time.sleep(2)
else:
# else:
print 'ERROR'
# print 'ERROR'
pass
# pass
# r = risk(path='/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json', i_dataset='raw',o_dataset='risk_o',o_table='mo')
# # r = risk(path='/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json', i_dataset='raw',o_dataset='risk_o',o_table='mo')
# tables = r.get_tables('raw','person_id')
# # tables = r.get_tables('raw','person_id')
# sql = r.get_sql(tables=tables[:3],key='person_id')
# # sql = r.get_sql(tables=tables[:3],key='person_id')
# #
# # #
# # let's post this to a designated location
# # # let's post this to a designated location
# #
# # #
# f = open('foo.sql','w')
# # f = open('foo.sql','w')
# f.write(sql)
# # f.write(sql)
# f.close()
# # f.close()
# r.get_sql(tables=tables,key='person_id')
# # r.get_sql(tables=tables,key='person_id')
# p = r.compute()
# # p = r.compute()
# print p
# # print p
# p.to_csv("risk.csv")
# # p.to_csv("risk.csv")
# r.write('foo.sql')
# # r.write('foo.sql')