@@ -6,10 +6,13 @@ This file will perform basic tasks to finalize the GAN process by performing the
"""
import pandas as pd
import numpy as np
from multiprocessing import Process, Lock
from google.oauth2 import service_account
from google.cloud import bigquery as bq
import transport
from data.params import SYS_ARGS
import json
class Analytics :
    """
    This class will compile basic analytics about a given dataset i.e. compare original/synthetic
@@ -33,15 +36,23 @@ class Analytics :
"""
This function will measure the distance between
"""
df = args [ ' data ' ]
names = [ name for name in df_counts . columns . tolist ( ) if name . endswith ( ' _io ' ) == False ]
pass
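    #
    # The method above is a stub. The helper below is a minimal, hypothetical sketch of
    # one way to score the distance between an original and a synthetic column; the
    # frequency-based L1 distance is an assumption here, not the module's chosen metric.
    #
    @staticmethod
    def col_distance (x, y) :
        """
        Hypothetical helper: L1 distance between the value distributions of two columns
        :x  column (pandas Series) from the original dataset
        :y  column (pandas Series) from the synthetic dataset
        """
        px = x.value_counts(normalize=True)
        py = y.value_counts(normalize=True)
        values = px.index.union(py.index)
        # align both distributions on the union of observed values, then compare
        return float(np.abs(px.reindex(values, fill_value=0) - py.reindex(values, fill_value=0)).sum())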
class Utils :
    @staticmethod
    def log(**args) :
        logger = transport.factory.instance(type="mongo.MongoWriter", args={"dbname": "aou", "doc": "logs"})
        logger.write(args)
        logger.close()
    class get :
        @staticmethod
        def config(**args) :
            contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts']
            pipeline = args['pipeline']
            return [item for item in pipeline if item['context'] in contexts]
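            #
            # Illustrative only: given args = {'contexts': 'c1,c2', 'pipeline': [...]},
            # this returns the pipeline entries whose 'context' is 'c1' or 'c2';
            # 'contexts' may be a comma-separated string or a list.
            #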
        @staticmethod
        def pipeline(table, path) :
            # contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts']
            config = json.loads((open(path)).read())
            pipeline = config['pipeline']
            # return [item for item in pipeline if item['context'] in contexts]
            pipeline = [item for item in pipeline if 'from' in item and item['from'].strip() == table]
            Utils.log(module=table, action='init', input={"pipeline": pipeline})
            return pipeline
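            #
            # For illustration only -- a hypothetical shape for the config file read above;
            # the keys ('pipeline', 'context', 'from') come from the code, the values are made up:
            #
            #   {"pipeline": [{"context": "measurement", "from": "measurement"}]}
            #
            # Utils.get.pipeline('measurement', 'config.json') would then return the
            # entries whose 'from' field matches the given table name.
            #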
        @staticmethod
        def sql(**args) :
            """
@@ -54,7 +65,8 @@ class Utils :
SQL = [ " SELECT * FROM :from " ]
SQL_FILTER = [ ]
NO_FILTERS_FOUND = True
pipeline = Utils . get . config ( * * args )
# pipeline = Utils.get.config(**args)
pipeline = args [ ' pipeline ' ]
REVERSE_QUALIFIER = { ' IN ' : ' NOT IN ' , ' NOT IN ' : ' IN ' , ' = ' : ' <> ' , ' <> ' : ' = ' }
for item in pipeline :
@@ -73,7 +85,7 @@ class Utils :
            #
            # let's pull the field schemas out of the table definition
            #
            Utils.log(module=args['from'], action='sql', input={"sql": " ".join(SQL)})
            return " ".join(SQL).replace(":from", src)
@@ -91,26 +103,36 @@ def mk(**args) :
        return client.create_dataset(dataset)
    return found[0]
def move (args) :
    """
    This function will move a table from the synthetic dataset into a designated location
    This is the simplest case for finalizing a synthetic data set
    :private_key
    """
    pipeline = Utils.get.pipeline(args['from'], args['config'])
    _args = json.loads((open(args['config'])).read())
    _args['pipeline'] = pipeline
    # del _args['pipeline']
    args = dict(args, **_args)
    # del args['pipeline']
    # private_key = args['private_key']
    client = bq.Client.from_service_account_json(args['private_key'])
    dataset = args['dataset']
    if pipeline :
        SQL = [''.join(["SELECT * FROM io.", item['context'], '_full_io']) for item in pipeline]
        SQL += [Utils.get.sql(**args)]
        SQL = ('\nUNION ALL\n'.join(SQL).replace(':dataset', 'io'))
    else :
        #
        # moving a table to a designated location
        tablename = args['from']
        if 'sql' not in args :
            SQL = "SELECT * FROM :dataset.:table"
        else :
            SQL = args['sql']
        SQL = SQL.replace(":dataset", dataset).replace(":table", tablename)
    Utils.log(module=args['from'], action='sql', input={'sql': SQL})
    #
    # At this point we have gathered all the tables in the io folder; we should now see if we need to merge with the remainder from the original table
    #
@@ -132,7 +154,7 @@ def move (**args):
    SQL = SQL.replace("*", ",".join(fields))
    # print (SQL)
    out = client.query(SQL, location='US', job_config=config)
    Utils.log(module=args['from'], action='move', input={'job': out.job_id})
    return (out.job_id)
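#
# Illustrative call only -- the keys mirror what move() reads ('from', 'config',
# 'private_key', 'dataset'); the values are made up:
#
#   move({'from': 'person', 'config': 'config.json',
#         'private_key': 'service-account.json', 'dataset': 'synthetic_data'})
#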
@@ -158,23 +180,59 @@ if __name__ == '__main__' :
    Usage :
        finalize --<move|stats> --contexts <c1,c2,...c3> --from <table>
    """
    if 'move' in SYS_ARGS :
        if 'init' in SYS_ARGS :
            dep = config['dep'] if 'dep' in config else {}
            info = []
            if 'queries' in dep :
                info += dep['queries']
                print ('________')
            if 'tables' in dep :
                info += dep['tables']
            args = {}
            jobs = []
            for item in info :
                args = {}
                if type(item) == str :
                    args['from'] = item
                    name = item
                else :
                    args = item
                    name = item['from']
                args['config'] = SYS_ARGS['config']
                # args['pipeline'] = []
                job = Process(target=move, args=(args,))
                job.name = name
                jobs.append(job)
                job.start()
            # while len(jobs) > 0 :
            #     jobs = [job for job in jobs if job.is_alive()]
            #     time.sleep(1)
        else :
            move(SYS_ARGS)
        # # table = SYS_ARGS['from']
        # # args = dict(config,**{"private_key":"../curation-prod.json"})
        # args = dict(args,**SYS_ARGS)
        # contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']]
        # log = []
        # if contexts :
        #     args['contexts'] = contexts
        #     log = move(**args)
        # else:
        #     tables = args['from'].split(',')
        #     for name in tables :
        #         name = name.strip()
        #         args['from'] = name
        #         log += [move(**args)]
        # print ("\n".join(log))