@@ -6,10 +6,13 @@ This file will perform basic tasks to finalize the GAN process by performing the
 """
 import pandas as pd
 import numpy as np
+from multiprocessing import Process, Lock
 from google.oauth2 import service_account
 from google.cloud import bigquery as bq
+import transport
 from data.params import SYS_ARGS
 import json
 class Analytics :
     """
     This class will compile basic analytics about a given dataset i.e. compare original/synthetic
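The two new imports carry this change: `multiprocessing.Process` to run one `move` job per dependent table, and `transport` to write run logs to Mongo. For context, `data.params.SYS_ARGS` holds the parsed command line used throughout the `__main__` block below; a minimal sketch of the kind of `--key value` parsing it presumably performs (the real module may differ, e.g. in how bare flags are represented):

```python
import sys

# Hypothetical stand-in for data.params.SYS_ARGS: flatten "--key value"
# pairs (and bare "--flag" switches) from sys.argv into a dict.
SYS_ARGS = {}
argv = sys.argv[1:]
i = 0
while i < len(argv):
    if argv[i].startswith('--'):
        key = argv[i][2:]
        if i + 1 < len(argv) and not argv[i + 1].startswith('--'):
            SYS_ARGS[key] = argv[i + 1]   # "--key value" pair
            i += 2
        else:
            SYS_ARGS[key] = 1             # bare "--flag" switch
            i += 1
    else:
        i += 1

# e.g. "finalize.py --move --from person --config config.json" yields
# {'move': 1, 'from': 'person', 'config': 'config.json'}
```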
@@ -33,15 +36,23 @@ class Analytics :
         """
         This function will measure the distance between the original and synthetic datasets
         """
-        pass
+        df = args['data']
+        names = [name for name in df.columns.tolist() if name.endswith('_io') == False]
 class Utils :
+    @staticmethod
+    def log(**args):
+        logger = transport.factory.instance(type="mongo.MongoWriter", args={"dbname": "aou", "doc": "logs"})
+        logger.write(args)
+        logger.close()
     class get :
         @staticmethod
-        def config(**args) :
-            contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts']
-            pipeline = args['pipeline']
-            return [item for item in pipeline if item['context'] in contexts]
+        def pipeline(table, path) :
+            # contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts']
+            config = json.loads((open(path)).read())
+            pipeline = config['pipeline']
+            # return [item for item in pipeline if item['context'] in contexts]
+            pipeline = [item for item in pipeline if 'from' in item and item['from'].strip() == table]
+            Utils.log(module=table, action='init', input={"pipeline": pipeline})
+            return pipeline
         @staticmethod
         def sql(**args) :
             """
@@ -54,7 +65,8 @@ class Utils :
             SQL = ["SELECT * FROM :from "]
             SQL_FILTER = []
             NO_FILTERS_FOUND = True
-            pipeline = Utils.get.config(**args)
+            # pipeline = Utils.get.config(**args)
+            pipeline = args['pipeline']
             REVERSE_QUALIFIER = {'IN': 'NOT IN', 'NOT IN': 'IN', '=': '<>', '<>': '='}
             for item in pipeline :
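`Utils.get.sql` no longer derives the pipeline itself; the caller passes the pre-filtered list in `args['pipeline']`. The `REVERSE_QUALIFIER` map is what lets the generated query select the complement of the synthesized rows; a sketch of the reversal under an assumed filter shape (`field`/`qualifier`/`value`, inferred from this function, not confirmed by the diff):

```python
REVERSE_QUALIFIER = {'IN': 'NOT IN', 'NOT IN': 'IN', '=': '<>', '<>': '='}

def reverse_filter(item):
    # Assumed filter shape: {'field': ..., 'qualifier': ..., 'value': ...}.
    # Reversing the qualifier turns "the rows a context synthesized" into
    # "the rows it left untouched", i.e. the remainder of the source table.
    f = item['filter']
    qualifier = REVERSE_QUALIFIER[f['qualifier'].strip().upper()]
    return ' '.join([f['field'], qualifier, str(f['value'])])

item = {'context': 'race',
        'filter': {'field': 'race_concept_id', 'qualifier': 'IN', 'value': '(8527)'}}
print(reverse_filter(item))   # race_concept_id NOT IN (8527)
```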
@@ -73,7 +85,7 @@ class Utils :
             #
             # let's pull the field schemas out of the table definition
             #
+            Utils.log(module=args['from'], action='sql', input={"sql": " ".join(SQL)})
             return " ".join(SQL).replace(":from", src)
@@ -91,26 +103,36 @@ def mk(**args) :
         return client.create_dataset(dataset)
     return found[0]
-def move (**args):
+def move (args):
     """
     This function will move a table from the synthetic dataset into a designated location
     This is the simplest case for finalizing a synthetic data set
     :private_key
     """
-    private_key = args['private_key']
-    client = bq.Client.from_service_account_json(private_key)
-    config = Utils.get.config(**args)
+    pipeline = Utils.get.pipeline(args['from'], args['config'])
+    _args = json.loads((open(args['config'])).read())
+    _args['pipeline'] = pipeline
+    # del _args['pipeline']
+    args = dict(args, **_args)
+    # del args['pipeline']
+    # private_key = args['private_key']
+    client = bq.Client.from_service_account_json(args['private_key'])
     dataset = args['dataset']
-    if 'contexts' in args :
-        SQL = [''.join(["SELECT * FROM io.", item['context'], '_full_io']) for item in config]
+    if pipeline :
+        SQL = [''.join(["SELECT * FROM io.", item['context'], '_full_io']) for item in pipeline]
         SQL += [Utils.get.sql(**args)]
         SQL = ('\nUNION ALL\n'.join(SQL).replace(':dataset', 'io'))
     else:
         #
         # moving a table to a designated location
         tablename = args['from']
-        SQL = "SELECT * FROM :dataset.:table ".replace(":dataset", dataset).replace(":table", tablename)
+        if 'sql' not in args :
+            SQL = "SELECT * FROM :dataset.:table "
+        else:
+            SQL = args['sql']
+        SQL = SQL.replace(":dataset", dataset).replace(":table", tablename)
+    Utils.log(module=args['from'], action='sql', input={'sql': SQL})
     #
     # At this point we have gathered all the tables in the io folder and we should now see if we need to merge with the remainder from the original table
     #
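`move` now derives everything from the config file: it looks up the table's pipeline entries, merges the file's settings into `args`, and branches on whether any contexts were synthesized. A sketch of the query it assembles in the pipeline branch, using the same naming scheme (`io.<context>_full_io` partial tables plus the remainder query from `Utils.get.sql`); the table and filter here are hypothetical:

```python
# Pipeline entries for a hypothetical 'person' table.
pipeline = [{'context': 'race'}, {'context': 'gender'}]

# One SELECT per synthesized context table in the io dataset ...
SQL = [''.join(["SELECT * FROM io.", item['context'], '_full_io'])
       for item in pipeline]
# ... plus the remainder of the original table (stand-in for Utils.get.sql).
SQL += ["SELECT * FROM :dataset.person WHERE race_concept_id NOT IN (8527)"]

print('\nUNION ALL\n'.join(SQL).replace(':dataset', 'io'))
# SELECT * FROM io.race_full_io
# UNION ALL
# SELECT * FROM io.gender_full_io
# UNION ALL
# SELECT * FROM io.person WHERE race_concept_id NOT IN (8527)
```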
@@ -132,7 +154,7 @@ def move (**args):
         SQL = SQL.replace("*", " , ".join(fields))
     # print (SQL)
     out = client.query(SQL, location='US', job_config=config)
-    print ()
+    Utils.log(module=args['from'], action='move', input={'job': out.job_id})
     return (out.job_id)
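`client.query` returns a `google.cloud.bigquery` `QueryJob`; the change logs its `job_id` instead of printing a blank line, so a run can be traced back in Mongo. If a caller needs to block until the copy lands, the standard job API is enough (key path and query here are placeholders):

```python
from google.cloud import bigquery as bq

client = bq.Client.from_service_account_json('key.json')   # hypothetical key path
out = client.query("SELECT 1", location='US')              # returns a QueryJob
out.result()                   # block until the job finishes; raises on error
print(out.job_id, out.state)   # state reads 'DONE' once result() returns
```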
@@ -158,23 +180,59 @@ if __name__ == '__main__' :
     Usage :
         finalize --<move|stats> --contexts <c1,c2,...c3> --from <table>
     """
     if 'move' in SYS_ARGS :
-        # table = SYS_ARGS['from']
-        # args = dict(config,**{"private_key":"../curation-prod.json"})
-        args = dict(args, **SYS_ARGS)
-        contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']]
-        log = []
-        if contexts :
-            args['contexts'] = contexts
-            log = move(**args)
+        if 'init' in SYS_ARGS :
+            dep = config['dep'] if 'dep' in config else {}
+            info = []
+            if 'queries' in dep :
+                info += dep['queries']
+            print ('________')
+            if 'tables' in dep :
+                info += dep['tables']
+            args = {}
+            jobs = []
+            for item in info :
+                args = {}
+                if type(item) == str :
+                    args['from'] = item
+                    name = item
+                else:
+                    args = item
+                    name = item['from']
+                args['config'] = SYS_ARGS['config']
+                # args['pipeline'] = []
+                job = Process(target=move, args=(args,))
+                job.name = name
+                jobs.append(job)
+                job.start()
+            # while len(jobs) > 0 :
+            #     jobs = [job for job in jobs if job.is_alive()]
+            #     time.sleep(1)
         else:
-            tables = args['from'].split(',')
-            for name in tables :
-                name = name.strip()
-                args['from'] = name
-                log += [move(**args)]
-        print ("\n".join(log))
+            move(SYS_ARGS)
+            # # table = SYS_ARGS['from']
+            # # args = dict(config,**{"private_key":"../curation-prod.json"})
+            # args = dict(args,**SYS_ARGS)
+            # contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']]
+            # log = []
+            # if contexts :
+            #     args['contexts'] = contexts
+            #     log = move(**args)
+            # else:
+            #     tables = args['from'].split(',')
+            #     for name in tables :
+            #         name = name.strip()
+            #         args['from'] = name
+            #         log += [move(**args)]
+            # print ("\n".join(log))
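This is why `move` changed to take a single positional dict: `multiprocessing.Process` passes `args=(args,)` through to the target as-is. The commented-out loop hints at the missing piece, waiting for the children to finish; a sketch of the fan-out with that wait made explicit (`move` here is a stand-in, and the config path is hypothetical):

```python
import time
from multiprocessing import Process

def move(args):
    # stand-in for the real move(): one BigQuery copy job per table
    print('moving', args['from'])

if __name__ == '__main__':
    # dep['tables'] / dep['queries'] entries: bare names or dicts with 'from'
    info = ['person', {'from': 'measurement'}]
    jobs = []
    for item in info:
        args = {'from': item} if type(item) == str else dict(item)
        args['config'] = 'config.json'            # hypothetical config path
        job = Process(target=move, args=(args,))  # one child per table
        job.name = args['from']
        jobs.append(job)
        job.start()
    # equivalent of the polling loop the commit leaves commented out
    while jobs:
        jobs = [job for job in jobs if job.is_alive()]
        time.sleep(1)
```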