commit 59d6cc50c0
@@ -0,0 +1 @@
pipeline.py
@@ -0,0 +1,377 @@
|
||||
#!/usr/bin/env python3
"""
This file performs the tasks needed to finalize the GAN process, namely :
    - basic stats & analytics
    - rebuilding the io output into another dataset
"""
import pandas as pd
import numpy as np
from multiprocessing import Process, Lock
from google.oauth2 import service_account
from google.cloud import bigquery as bq
import transport
from data.params import SYS_ARGS
import json

# path = '../curation-prod.json'
# credentials = service_account.Credentials.from_service_account_file(path)
# df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard')
filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config']
f = open(filename)
config = json.loads(f.read())
args = config['pipeline']
f.close()
|
||||
|
||||
def _formatSQL(**_args):
    """
    This function builds the SQL that maps synthetic person_ids to original person_ids for a given segment of ids.
    """
    sql = """
        select DISTINCT x.person_id synthetic,y.person_id original
        FROM :synthetic.:table x
        INNER JOIN :original.:table y on x.person_id in (:ids)
        AND x.person_id <> y.person_id AND x.gender_source_value = y.gender_source_value
        AND x.year_of_birth = y.year_of_birth
        ORDER BY 1
    """
    table = _args['table']
    original,synthetic = _args['schema']['original'],_args['schema']['synthetic']
    _ids = np.array(_args['ids']).astype(str)
    return sql.replace(":ids",",".join(_ids)).replace(":synthetic",synthetic).replace(":original",original).replace(":table",table)
|
||||
def _addCounts(**_args) :
    """
    This function reads the synthetic/original pairs for a segment, samples one original per synthetic id and posts the counts to the backend.
    """
    store = _args['store']
    sql = _args['sql']
    reader = transport.factory.instance(**store['source'])
    _df = reader.read(sql=sql)
    _ids = _df.synthetic.unique()
    _counts = [ np.sum(_df.synthetic == value) for value in _ids]
    original = [_df[_df.synthetic == value].iloc[np.random.choice(np.arange(_counts[_ids.tolist().index(value)]),1),:].original.values[0] for value in _ids]
    _df = pd.DataFrame({"synthetic":_ids,"original":original,"counts":_counts})

    #
    # We can post this to the backend ...
    #
    table = '_map' #-- Yes this is hard-coded
    writer = transport.factory.instance(**dict(store['target'],**{"parallel":True,"table":table}))
    # if writer.has(table=table) is False:
    #     writer.write(_df)
    # else:
    _schema = [{"name":name,"type":"INTEGER"} for name in _df.columns]
    writer.write(_df,schema=_schema)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def Init(**_args) :
    """
    This function will build a map of the synthetic to real individuals.
    The assumption is that the synthesized data is stored in the same data-store as the original. The parameters provided are :
    :param store  object from the configuration file with source,target entries
    :param table  name of the original/synthetic tables (they should be the same)
    :param feat.  features/attributes ... demographics to account for
    """
    store = _args['store']
    reader = transport.factory.instance(**store['source'])
    original,synthetic = _args['schema']['original'],_args['schema']['synthetic']
    table = _args['table']
    sql = _args['sql'].replace(':synthetic',synthetic).replace(':original',original).replace(':table',table)

    _map = reader.read(sql=sql)

    k = _args['k'] if 'k' in _args else 2
    # _iodf = reader.read(table=table)
    # _ids = _iodf['person_id'].unique().tolist()
    # x_ = np.array_split(_ids,1000)
    jobs = []
    # for _items in x_ :
    #     _p = {"ids":_items,"schema":_args['schema'],'store':store,'table':table}
    #     sql = _formatSQL(**_p)
    #     _p['sql'] = sql
    #     _apply = lambda params: _addCounts(**params)
    #     thread = Process(target=_apply,args=(_p,))
    #     thread.start()
    #     jobs.append(thread)

    # return jobs
    #
    # We have performed a m:m (many-to-many) relationship with original participants and synthetic participants
    # The goal is to obtain a singular map against which records will be migrated
    #
    print (['... computing counts (k)'])
    _ids = _map.synthetic.unique()
    _counts = [ np.sum(_map.synthetic == value) for value in _ids]
    original = [_map[_map.synthetic == value].iloc[np.random.choice(np.arange(_counts[_ids.tolist().index(value)]),1),:].original.values[0] for value in _ids]
    print (['Building k-classes/groups'])
    _mdf = pd.DataFrame({"synthetic":_ids,"original":original,"counts":_counts})
    i = _mdf.apply(lambda row: row.counts >= k,axis=1)
    _mdf = _mdf[i]
    #
    # Log what just happened here so we know about the equivalence classes,
    # {"module":"binder","action":"map-generation","input":{"k":k,"rows":{"synthetic":_mdf.shape[0],"original":len(_counts)}}}

    return _mdf
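#
# A minimal usage sketch of Init; the store, schema names and sql below are hypothetical placeholders,
# not values taken from the configuration file:
#
#   _mdf = Init(store=config['store'],
#               schema={'original':'prod','synthetic':'synth'},
#               table='person',
#               sql="select x.person_id synthetic,y.person_id original from :synthetic.:table x inner join :original.:table y on ...",
#               k=2)
#   # _mdf has columns [synthetic,original,counts] restricted to synthetic ids matched to at least k originals
#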
|
||||
#
|
||||
# now we are posting this to target storage ...
|
||||
#
|
||||
def ApplyOn (**_args):
    """
    This function rewrites the SQL of a pipeline entry so that the synthetic identifier is applied to its records.
    We assume that the _map table has two attributes (synthetic and original)
    :param store
    :param _config
    """
    store_args = _args['store']
    _config = _args['config']

    table = _config['from']
    reader = transport.factory.instance(**dict(store_args['source'],**{"table":table}))
    attr = reader.read(limit=1).columns.tolist()
    original_key = _args['original_key'] #-- assuming referential integrity

    # synthetic_key = columns['synthetic']
    # mapped_original = columns['original']
    fields = list(set(attr) - set([original_key]))
    sql = "select _map.synthetic as :original_key,:fields from :original_schema.:table inner join :synthetic_schema._map on _map.original = :table.:original_key"
    sql = sql.replace(":table",table).replace(":fields",",".join(fields))
    sql = sql.replace(":original_key",original_key)
    _schema = _args['schema']
    sql = sql.replace(":original_schema",_schema['original']).replace(":synthetic_schema",_schema['synthetic'])

    return reader.read (sql=sql)
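#
# Sketch of the rewritten query, assuming a hypothetical table "measurement" keyed by person_id,
# an original schema "prod" and a synthetic schema "synth":
#
#   select _map.synthetic as person_id,value_as_number,... from prod.measurement
#   inner join synth._map on _map.original = measurement.person_id
#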
|
||||
|
||||
if __name__ == '__main__' :
    pass
|
||||
|
||||
# class Analytics :
|
||||
# """
|
||||
# This class will compile basic analytics about a given dataset i.e compare original/synthetic
|
||||
# """
|
||||
# @staticmethod
|
||||
# def distribution(**args):
|
||||
# context = args['context']
|
||||
# df = args['data']
|
||||
# #
|
||||
# #-- This data frame counts unique values for each feature (space)
|
||||
# df_counts = pd.DataFrame(df.apply(lambda col: col.unique().size),columns=['counts']).T # unique counts
|
||||
# #
|
||||
# #-- Get the distributions for common values
|
||||
# #
|
||||
# names = [name for name in df_counts.columns.tolist() if name.endswith('_io') == False]
|
||||
# ddf = df.apply(lambda col: pd.DataFrame(col.values,columns=[col.name]).groupby([col.name]).size() ).fillna(0)
|
||||
# ddf[context] = ddf.index
|
||||
|
||||
# pass
|
||||
# def distance(**args):
|
||||
# """
|
||||
# This function will measure the distance between
|
||||
# """
|
||||
# pass
|
||||
# class Utils :
|
||||
# @staticmethod
|
||||
# def log(**args):
|
||||
# logger = transport.factory.instance(type="mongo.MongoWriter",args={"dbname":"aou","doc":"logs"})
|
||||
# logger.write(args)
|
||||
# logger.close()
|
||||
# class get :
|
||||
# @staticmethod
|
||||
# def pipeline(table,path) :
|
||||
# # contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts']
|
||||
# config = json.loads((open(path)).read())
|
||||
# pipeline = config['pipeline']
|
||||
# # return [ item for item in pipeline if item['context'] in contexts]
|
||||
# pipeline = [item for item in pipeline if 'from' in item and item['from'].strip() == table]
|
||||
# Utils.log(module=table,action='init',input={"pipeline":pipeline})
|
||||
# return pipeline
|
||||
# @staticmethod
|
||||
# def sql(**args) :
|
||||
# """
|
||||
# This function is intended to build SQL query for the remainder of the table that was not synthesized
|
||||
# :config configuration entries
|
||||
# :from source of the table name
|
||||
# :dataset name of the source dataset
|
||||
|
||||
# """
|
||||
# SQL = ["SELECT * FROM :from "]
|
||||
# SQL_FILTER = []
|
||||
# NO_FILTERS_FOUND = True
|
||||
# # pipeline = Utils.get.config(**args)
|
||||
# pipeline = args['pipeline']
|
||||
# REVERSE_QUALIFIER = {'IN':'NOT IN','NOT IN':'IN','=':'<>','<>':'='}
|
||||
# for item in pipeline :
|
||||
|
||||
|
||||
# if 'filter' in item :
|
||||
# if NO_FILTERS_FOUND :
|
||||
# NO_FILTERS_FOUND = False
|
||||
# SQL += ['WHERE']
|
||||
# #
|
||||
# # Let us load the filter in the SQL Query
|
||||
# FILTER = item['filter']
|
||||
# QUALIFIER = REVERSE_QUALIFIER[FILTER['qualifier'].upper()]
|
||||
# SQL_FILTER += [" ".join([FILTER['field'], QUALIFIER,'(',FILTER['value'],')']).replace(":dataset",args['dataset'])]
|
||||
# src = ".".join([args['dataset'],args['from']])
|
||||
# SQL += [" AND ".join(SQL_FILTER)]
|
||||
# #
|
||||
# # let's pull the field schemas out of the table definition
|
||||
# #
|
||||
# Utils.log(module=args['from'],action='sql',input={"sql":" ".join(SQL) })
|
||||
# return " ".join(SQL).replace(":from",src)
|
||||
|
||||
|
||||
# def mk(**args) :
|
||||
# dataset = args['dataset']
|
||||
# client = args['client'] if 'client' in args else bq.Client.from_service_account_file(args['private_key'])
|
||||
# #
|
||||
# # let us see if we have a dataset handy here
|
||||
# #
|
||||
# datasets = list(client.list_datasets())
|
||||
# found = [item for item in datasets if item.dataset_id == dataset]
|
||||
|
||||
# if not found :
|
||||
|
||||
# return client.create_dataset(dataset)
|
||||
# return found[0]
|
||||
|
||||
# def move (args):
|
||||
# """
|
||||
# This function will move a table from the synthetic dataset into a designated location
|
||||
# This is the simplest case for finalizing a synthetic data set
|
||||
# :private_key
|
||||
# """
|
||||
# pipeline = Utils.get.pipeline(args['from'],args['config'])
|
||||
# _args = json.loads((open(args['config'])).read())
|
||||
# _args['pipeline'] = pipeline
|
||||
# # del _args['pipeline']
|
||||
# args = dict(args,**_args)
|
||||
# # del args['pipeline']
|
||||
# # private_key = args['private_key']
|
||||
# client = bq.Client.from_service_account_json(args['private_key'])
|
||||
|
||||
# dataset = args['dataset']
|
||||
# if pipeline :
|
||||
# SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in pipeline]
|
||||
# SQL += [Utils.get.sql(**args)]
|
||||
# SQL = ('\n UNION ALL \n'.join(SQL).replace(':dataset','io'))
|
||||
# else:
|
||||
# #
|
||||
# # moving a table to a designated location
|
||||
# tablename = args['from']
|
||||
# if 'sql' not in args :
|
||||
# SQL = "SELECT * FROM :dataset.:table"
|
||||
# else:
|
||||
# SQL = args['sql']
|
||||
# SQL = SQL.replace(":dataset",dataset).replace(":table",tablename)
|
||||
# Utils.log(module=args['from'],action='sql',input={'sql':SQL})
|
||||
# #
|
||||
# # At this point we have gathered all the tables in the io folder and we should now see if we need to merge with the remainder from the original table
|
||||
# #
|
||||
|
||||
|
||||
|
||||
# odataset = mk(dataset=dataset+'_io',client=client)
|
||||
# # SQL = "SELECT * FROM io.:context_full_io".replace(':context',context)
|
||||
# config = bq.QueryJobConfig()
|
||||
# config.destination = client.dataset(odataset.dataset_id).table(args['from'])
|
||||
# config.use_query_cache = True
|
||||
# config.allow_large_results = True
|
||||
# config.priority = 'INTERACTIVE'
|
||||
# #
|
||||
# #
|
||||
|
||||
# schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema
|
||||
# fields = [" ".join(["CAST (",item.name,"AS",item.field_type.replace("INTEGER","INT64").replace("FLOAT","FLOAT64"),") ",item.name]) for item in schema]
|
||||
# SQL = SQL.replace("*"," , ".join(fields))
|
||||
# # print (SQL)
|
||||
# out = client.query(SQL,location='US',job_config=config)
|
||||
# Utils.log(module=args['from'],action='move',input={'job':out.job_id})
|
||||
# return (out.job_id)
|
||||
|
||||
|
||||
|
||||
|
||||
# import pandas as pd
|
||||
# import numpy as np
|
||||
# from google.oauth2 import service_account
|
||||
# import json
|
||||
|
||||
# # path = '../curation-prod.json'
|
||||
# # credentials = service_account.Credentials.from_service_account_file(path)
|
||||
# # df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard')
|
||||
# filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config']
|
||||
# f = open(filename)
|
||||
# config = json.loads(f.read())
|
||||
# args = config['pipeline']
|
||||
# f.close()
|
||||
|
||||
|
||||
# if __name__ == '__main__' :
|
||||
# """
|
||||
# Usage :
|
||||
# finalize --<move|stats> --contexts <c1,c2,...c3> --from <table>
|
||||
# """
|
||||
|
||||
# if 'move' in SYS_ARGS :
|
||||
|
||||
# if 'init' in SYS_ARGS :
|
||||
# dep = config['dep'] if 'dep' in config else {}
|
||||
# info = []
|
||||
|
||||
# if 'queries' in dep :
|
||||
# info += dep['queries']
|
||||
# print ('________')
|
||||
# if 'tables' in dep :
|
||||
# info += dep['tables']
|
||||
# args = {}
|
||||
# jobs = []
|
||||
# for item in info :
|
||||
# args = {}
|
||||
# if type(item) == str :
|
||||
# args['from'] = item
|
||||
# name = item
|
||||
# else:
|
||||
# args = item
|
||||
# name = item['from']
|
||||
# args['config'] = SYS_ARGS['config']
|
||||
# # args['pipeline'] = []
|
||||
# job = Process(target=move,args=(args,))
|
||||
# job.name = name
|
||||
# jobs.append(job)
|
||||
# job.start()
|
||||
|
||||
|
||||
# # while len(jobs) > 0 :
|
||||
# # jobs = [job for job in jobs if job.is_alive()]
|
||||
# # time.sleep(1)
|
||||
|
||||
|
||||
# else:
|
||||
# move(SYS_ARGS)
|
||||
# # # table = SYS_ARGS['from']
|
||||
# # # args = dict(config,**{"private_key":"../curation-prod.json"})
|
||||
# # args = dict(args,**SYS_ARGS)
|
||||
# # contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']]
|
||||
# # log = []
|
||||
# # if contexts :
|
||||
# # args['contexts'] = contexts
|
||||
# # log = move(**args)
|
||||
|
||||
# # else:
|
||||
# # tables = args['from'].split(',')
|
||||
# # for name in tables :
|
||||
# # name = name.strip()
|
||||
# # args['from'] = name
|
||||
# # log += [move(**args)]
|
||||
# # print ("\n".join(log))
|
||||
|
||||
|
||||
|
||||
# else:
|
||||
# print ("NOT YET READY !")
|
@@ -1,2 +1,6 @@
import data.params as params

# import data.params as params
from data.params import SYS_ARGS
import transport
from multiprocessing import Process, Queue
from data.maker import prepare
from data.maker import state
|
File diff suppressed because it is too large
@@ -1,10 +0,0 @@
import pandas as pd
import data.maker

df = pd.read_csv('sample.csv')
column = 'gender'
id = 'id'
context = 'demo'
store = {"type":"mongo.MongoWriter","args":{"host":"localhost:27017","dbname":"GAN"}}
max_epochs = 11
data.maker.train(store=store,max_epochs=max_epochs,context=context,data=df,column=column,id=id,logs='foo')
|
@@ -0,0 +1,76 @@
"""
This file is designed to specify the application of pre/post-processing code.
The pre-processing code gets applied after the data has been loaded.
The post-processing code gets applied after the data has been generated, for instance :
    - approximation code/logic; date shifting; suppression; adding noise
"""
import numpy as np
from datetime import datetime, timedelta
import time
|
||||
|
||||
class Phase:
|
||||
def __init__(self,**_args):
|
||||
self._df = _args['data']
|
||||
self.callback = _args['callback']
|
||||
def apply(self,**_args):
|
||||
"""
|
||||
:param data data-frame
|
||||
:param _info arguments needed to be applied
|
||||
:param callback callback function once done
|
||||
"""
|
||||
raise Exception ("Function needs to be Implemented")
|
||||
class Pre(Phase):
|
||||
pass
|
||||
class Post(Phase):
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
pass
|
||||
|
||||
class Date(Post):
    def __init__(self,**_args):
        super().__init__(**_args)
    def make(self,**_args):
        """
        This function generates a random date given a year and, optionally, a set of day offsets from the generated date
        :param year     initial value of a year
        :param offset   list of days between initial date
        """
        if _args['year'] in ['',None,np.nan] :
            return None
        year = int(_args['year'])

        offset = _args['offset'] if 'offset' in _args else 0
        month = np.random.randint(1,13)
        if month == 2:
            _end = 28 if year % 4 != 0 else 29
        else:
            _end = 31 if month in [1,3,5,7,8,10,12] else 30
        day = np.random.randint(1,_end)

        #-- synthetic date
        _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0)
        FORMAT = '%Y-%m-%d' if 'format' not in _args else _args['format']

        # print ([_name,FORMAT, _date.strftime(FORMAT)])
        r = []
        if offset :
            r = [_date.strftime(FORMAT)]
            for _delta in offset :
                _date = _date + timedelta(_delta)
                r.append(_date.strftime(FORMAT))
            return r
        else:
            return _date.strftime(FORMAT)
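    #
    # Minimal usage sketch; df and the callback below are hypothetical placeholders:
    #
    #   post = Date(data=df,callback=None)
    #   post.make(year='1980')                    # -> e.g. '1980-07-14'
    #   post.make(year='1980',offset=[30,60])     # -> e.g. ['1980-07-14','1980-08-13','1980-10-12']
    #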
|
||||
|
||||
def apply(self,**_args):
|
||||
"""
|
||||
|
||||
"""
|
||||
pass
|
||||
class Approximate(Post):
|
||||
def apply(**_args):
|
||||
pass
|
||||
def applyWithRange(**_args):
|
@@ -0,0 +1,284 @@
"""
(c) 2018 - 2021, Vanderbilt University Medical Center
Steve L. Nyemba, steve.l.nyemba@vumc.org

This file is designed to handle preconditions for a generative adversarial network:
    - The file will read/get data from a source specified by transport (or data-frame)
    - The class will convert the data to a binary vector
    - The class will also help rebuild the data from a binary matrix.
Usage :

"""
|
||||
import transport
|
||||
import json
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
# import cupy as cp
|
||||
import sys
|
||||
import os
|
||||
#
|
||||
# The following is to address the issue over creating a large matrix ...
|
||||
#
|
||||
# from multiprocessing import Process, Queue
|
||||
|
||||
# if 'GPU' in os.environ :
|
||||
# import cupy as np
|
||||
# else:
|
||||
# import numpy as np
|
||||
class void:
|
||||
pass
|
||||
class Hardware :
|
||||
"""
|
||||
This class is intended to allow the use of hardware i.e GPU, index or CPU
|
||||
"""
|
||||
pass
|
||||
|
||||
class Input :
    class NOVALUES :
        RANDOM,IGNORE,ALWAYS = ['random','ignore','always']
    """
    This class is designed to read data from a source and perform a variety of operations :
        - provide a feature space, and rows (matrix profile)
        - a data index map
    """
|
||||
|
||||
def __init__(self,**_args):
|
||||
"""
|
||||
:param data
|
||||
:param store data-store parameters/configuration
|
||||
:param sql sql query that pulls a representative sample of the data
|
||||
"""
|
||||
self._schema = _args['schema'] if 'schema' in _args else {}
|
||||
#
|
||||
# schema data should be in a hash map for these purposes
|
||||
#
|
||||
# if self._schema :
|
||||
# r = {}
|
||||
# for _item in self._schema :
|
||||
# r[_item['name']] = r[_item['type']]
|
||||
# self._schema = r
|
||||
|
||||
self.df = _args['data']
|
||||
if 'sql' not in _args :
|
||||
self._initdata(**_args)
|
||||
#
|
||||
pass
|
||||
else:
|
||||
self._initsql(**_args)
|
||||
#
|
||||
# We need to have a means to map of values,columns and vector positions in order
|
||||
# to perform convert and revert to and from binary
|
||||
#
|
||||
self._map = {} if 'map' not in _args else _args['map']
|
||||
|
||||
|
||||
def _initsql(self,**_args):
|
||||
"""
|
||||
This function will initialize the class on the basis of a data-store and, optionally, a pre-defined set of columns to be synthesized
:param store    data-store configuration
:param columns  list of columns to be synthesized
|
||||
"""
|
||||
|
||||
if 'columns' not in _args :
|
||||
self._initcols(data=self.df)
|
||||
else:
|
||||
self._initcols(data=self.df,columns=_args['columns'])
|
||||
|
||||
pass
|
||||
def _init_map(self,values):
|
||||
self._map = dict(zip(np.arange(len(values)),values))
|
||||
for key in self._map :
|
||||
self._map[key] = self._map[key].tolist()
|
||||
def _initcols (self,**_args) :
|
||||
"""
|
||||
This function will initialize the columns to be synthesized and/or determine which ones can be synthesized
|
||||
:param data data-frame that holds the data (matrix)
|
||||
:param columns optional columns to be synthesized
|
||||
"""
|
||||
# df = _args['data'].copy()
|
||||
row_count = self.df.shape[0]
|
||||
cols = None if 'columns' not in _args else _args['columns']
|
||||
self.columns = self.df.columns.tolist()
|
||||
self._io = []
|
||||
|
||||
if 'columns' in _args :
|
||||
self._columns = _args['columns']
|
||||
# else:
|
||||
#
|
||||
# We will look into the count and make a judgment call
|
||||
try:
|
||||
# _df = pd.DataFrame(self.df.apply(lambda col: col.dropna().unique().size )).T
|
||||
# MIN_SPACE_SIZE = 2
|
||||
# self._columns = cols if cols else _df.apply(lambda col:None if col[0] == row_count or col[0] < MIN_SPACE_SIZE else col.name).dropna().tolist()
|
||||
# self._io = _df.to_dict(orient='records')
|
||||
_df = pd.DataFrame(self.df.nunique().T / self.df.shape[0]).T
|
||||
self._io = (_df.to_dict(orient='records'))
|
||||
|
||||
except Exception as e:
|
||||
print (e)
|
||||
self._io = []
|
||||
def _initdata(self,**_args):
|
||||
"""
|
||||
This function will initialize the class with a data-frame and columns of interest (if any)
|
||||
:param data data-frame that holds the data
|
||||
:param columns columns that need to be synthesized if any
|
||||
"""
|
||||
self._initcols(**_args)
|
||||
|
||||
def _convert(self,**_args):
|
||||
"""
|
||||
This function will convert a data-frame into a binary matrix and provide a map to be able to map the values back to the matrix
|
||||
:param columns in case we specify the columns to account for (just in case the original assumptions don't hold)
|
||||
"""
|
||||
if 'columns' in _args or 'column' in _args :
|
||||
columns = _args['columns'] if 'columns' in _args else [_args['column']]
|
||||
else:
|
||||
columns = self._columns
|
||||
_df = self.df if 'data' not in _args else _args['data']
|
||||
#
|
||||
# At this point we have the list of features we want to use
|
||||
i = 0
|
||||
|
||||
_m = np.array([])
|
||||
_values = []
|
||||
for name in columns :
|
||||
#
|
||||
# In case we have dataset with incomplete value space, we should still be able to generate something meaningful
|
||||
#
|
||||
values = None if name not in self._map else list(self._map[name]['values'])
|
||||
_type = self._schema[name] if name in self._schema else _df[name].dtype
|
||||
cols, _matrix = self.tobinary(_df[name],values)
|
||||
_beg,_end = i,i+len(cols)
|
||||
if name not in self._map :
|
||||
self._map[name] = {"beg":_beg,"end":_end ,"values":cols.tolist()}
|
||||
i += len(cols)
|
||||
if not _m.shape[0]:
|
||||
_m = _matrix ;
|
||||
else:
|
||||
_m = np.concatenate((_m,_matrix),axis=1)
|
||||
if values :
|
||||
_values += list(values)
|
||||
#
|
||||
# @NOTE:
|
||||
# The map should allow us to be able to convert or reconvert the binary matrix to whatever we want ...
|
||||
#
|
||||
# self._matrix = _m
|
||||
|
||||
return _values,_m
|
||||
|
||||
def _revert(self,**_args) :
|
||||
"""
|
||||
This function will take in a binary matrix and based on the map of values it will repopulate it with values
|
||||
:param _matrix binary matrix
|
||||
:param column|columns column name or columns if the column is specified
|
||||
"""
|
||||
_column = _args['column'] if 'column' in _args else None
|
||||
|
||||
|
||||
matrix = _args['matrix']
|
||||
row_count = matrix.shape[0]
|
||||
r = {}
|
||||
for key in self._map :
|
||||
if _column and key != _column :
|
||||
continue
|
||||
_item = self._map[key]
|
||||
_beg = _item['beg']
|
||||
_end = _item['end']
|
||||
columns = np.array(_item['values'])
|
||||
#
|
||||
# @NOTE: We are accessing matrices in terms of [row,col],
|
||||
# The beg,end variables are for the columns in the matrix (mini matrix)
|
||||
#
|
||||
# if not _column :
|
||||
# _matrix = matrix[:,_beg:_end] #-- The understanding is that _end is not included
|
||||
# else:
|
||||
# _matrix = matrix
|
||||
_matrix = matrix[:,_beg:_end]
|
||||
#
|
||||
# vectorize the matrix to replace the bits by their actual values (accounting for the data-types)
|
||||
# @TODO: Find ways to do this on a GPU (for big data) or across threads
|
||||
#
|
||||
row_count = _matrix.shape[0]
|
||||
# r[key] = [columns[np.where(row == 1) [0][0] ] for row in _matrix[:,_beg:_end]]
|
||||
|
||||
r[key] = [columns[np.where(row==1)[0][0]] if np.where(row==1)[0].size > 0 else '' for row in _matrix]
|
||||
#
|
||||
# we should consider decoding the matrix if possible
|
||||
#
|
||||
|
||||
return pd.DataFrame(r)
|
||||
|
||||
    def tobinary(self,rows,cols=None) :
        """
        This function compiles a binary matrix from a vector of values; hopefully this can be vectorized and processed in parallel.
        :param rows np.array or list of values
        :param cols a space of values, if it were to be different from the current sample.
        """
        if not cols:
            #
            # In the event the sample rows do NOT have all the values of the space
            cols = rows.unique()
        cols = np.array(cols)
        row_count = np.int64(len(rows))
        # if 'GPU' not in os.environ :
        #     _matrix = np.zeros([row_count,cols.size],dtype=int)
        #
        # @NOTE: For some reason, there is an out of memory error created here, this seems to fix it (go figure)
        #
        _matrix = np.array([np.repeat(0,cols.size) for i in range(0,row_count)])

        [np.put(_matrix[i], np.where(cols == rows[i]) ,1) for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0]
        # else:
        #     _matrix = cp.zeros([row_count,cols.size])
        #     [cp.put(_matrix[i], cp.where(cols == rows[i]),1) for i in cp.arange(row_count) ]
        #     _matrix = _matrix.asnumpy()

        return cols,_matrix
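    #
    # Sketch of the binarization, assuming a hypothetical gender column:
    #
    #   rows = pd.Series(['M','F','F'])
    #   cols,matrix = _input.tobinary(rows)
    #   # cols   -> array(['M','F'])
    #   # matrix -> [[1,0],[0,1],[0,1]]
    #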
|
||||
def convert(self,**_args):
|
||||
if 'columns' in _args or 'column' in _args :
|
||||
columns = _args['columns'] if 'columns' in _args else [_args['column']]
|
||||
else:
|
||||
columns = self._columns
|
||||
_df = self.df if 'data' not in _args else _args['data']
|
||||
_values,_matrix = self.encode(_df,columns)
|
||||
_, _matrix = self.tobinary(_matrix)
|
||||
self._init_map(_values)
|
||||
return _values,_matrix #-- matrix has been updated !
|
||||
def revert(self,**_args):
|
||||
# _columns = _args['column'] if 'column' in _args else None
|
||||
_matrix = _args['matrix']
|
||||
# print (_matrix)
|
||||
return self.decode(_matrix,columns=self._columns)
|
||||
pass
|
||||
def encode(self,df,columns) :
|
||||
_df = df[columns].drop_duplicates()
|
||||
_values = _df.values.tolist()
|
||||
_encoded = df[columns].apply(lambda row: _values.index( list(row)) ,axis=1)
|
||||
return np.array(_values),_encoded
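    #
    # Sketch of the row encoding, with hypothetical columns ['gender','race']:
    #
    #   df = pd.DataFrame({'gender':['M','F','M'],'race':['A','B','A']})
    #   values,encoded = _input.encode(df,['gender','race'])
    #   # values  -> [['M','A'],['F','B']]   (unique rows)
    #   # encoded -> [0,1,0]                 (index of each row within values)
    #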
|
||||
def decode (self,_matrix,**_args):
|
||||
#
|
||||
# _matrix binary matrix
|
||||
#
|
||||
|
||||
columns = _args['columns']
|
||||
_values = np.array( list(self._map.values()))
|
||||
_matrix = pd.DataFrame(_matrix) #if type(_matrix) != pd.DataFrame else _matrix
|
||||
# x = _matrix.apply(lambda row: _values[row.values == 1 ].tolist()[0] if row.values.sum() > 0 else None, axis=1).tolist()
|
||||
#@TODO: Provide random values for things that are missing
|
||||
|
||||
# x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else np.repeat(None,len(self._columns)) ,axis=1).tolist()
|
||||
#
|
||||
# @TODO: Provide a parameter to either:
|
||||
# - missing = {outlier,random,none}
|
||||
# - outlier: select an outlier, random: randomly select a value, none: do nothing ...
|
||||
#
|
||||
if np.random.choice([0,1],1)[0] :
|
||||
novalues = _values[np.random.choice( len(_values),1)[0]].tolist()
|
||||
else:
|
||||
novalues = np.repeat(None,len(self._columns))
|
||||
x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else novalues ,axis=1).tolist()
|
||||
return pd.DataFrame(x,columns=columns)
|
||||
|
||||
|
||||
|
@ -0,0 +1 @@
|
||||
__init__.py
|
@@ -0,0 +1,105 @@
"""
This file handles the state-space of the data training/generation process i.e upon specification of the pre/post conditions.
The specifications for this are as follows (within an entry of the configuration)
    {
        "state":{
            "pre":[{"approximate":{"field":"int"}},{"newdate":{"field":"format"}}],"post":[{"limit":10}]
        }
    }
"""
|
||||
import importlib
|
||||
import importlib.util
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from data.maker.state.default import *
|
||||
import os
|
||||
|
||||
|
||||
class State :
|
||||
@staticmethod
|
||||
def apply(_data,lpointers):
|
||||
"""
|
||||
This function applies a pipeline against a given data-frame, the calling code must decide whether it is a pre/post
|
||||
:_data data-frame
|
||||
:_lpointers functions modules returned by instance (module,_args)
|
||||
"""
|
||||
for _item in lpointers :
|
||||
if _item is None :
|
||||
continue
|
||||
|
||||
pointer = _item['module']
|
||||
_args = _item['args']
|
||||
|
||||
_data = pointer(_data,_args)
|
||||
return _data
|
||||
@staticmethod
|
||||
def instance(_args):
|
||||
pre = []
|
||||
post=[]
|
||||
|
||||
out = {}
|
||||
for key in _args :
|
||||
#
|
||||
# If the item has a path property is should be ignored
|
||||
path = _args[key]['path'] if 'path' in _args[key] else ''
|
||||
out[key] = [ State._build(dict(_item,**{'path':path})) if 'path' not in _item else State._build(_item) for _item in _args[key]['pipeline']]
|
||||
|
||||
return out
|
||||
# if 'pre' in _args:
|
||||
# path = _args['pre']['path'] if 'path' in _args['pre'] else ''
|
||||
|
||||
# pre = [ State._build(dict(_item,**{'path':path})) for _item in _args['pre']['pipeline']]
|
||||
# else:
|
||||
# path = _args['post']['path'] if 'path' in _args['post'] else ''
|
||||
|
||||
# post = [ State._build(dict(_item,**{'path':path})) for _item in _args['post']['pipeline']]
|
||||
# return {'pre':pre,'post':post}
|
||||
|
||||
@staticmethod
|
||||
def _extract(_entry):
|
||||
|
||||
_name = list(set(_entry.keys()) - set(['path']) )
|
||||
_name = _name[0]
|
||||
path = _entry['path'] if 'path' in _entry and os.path.exists(_entry['path']) else ''
|
||||
return {"module": _name,"args": _entry[_name],'name':_name,'path':path}
|
||||
pass
|
||||
@staticmethod
|
||||
def _build(_args):
|
||||
|
||||
_info = State._extract(_args)
|
||||
# _info = dict(_args,**_info)
|
||||
|
||||
_info['module'] = State._instance(_info)
|
||||
return _info if _info['module'] is not None else None
|
||||
|
||||
@staticmethod
|
||||
def _instance(_args):
|
||||
"""
|
||||
:path optional path of the file on disk
|
||||
:module name of the function
|
||||
"""
|
||||
|
||||
_name = _args['module']
|
||||
|
||||
if 'path' in _args and os.path.exists(_args['path']):
|
||||
path= _args['path']
|
||||
|
||||
spec = importlib.util.spec_from_file_location(_name, path)
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(module)
|
||||
else:
|
||||
#
|
||||
# Probably calling a built-in module (should be in this file)
|
||||
|
||||
module = sys.modules['data.maker.state.default']
|
||||
|
||||
return getattr(module,_name) if hasattr(module,_name) else None
|
||||
|
||||
#
|
||||
# Adding a few custom functions that should be able to help ....
|
||||
# These functions can be called without specifying a path
|
||||
#
|
||||
|
@@ -0,0 +1,116 @@
"""
This file contains default functions applied to a data-frame/dataset as pre/post processing jobs.
The functions are organized as a pipeline i.e the data will be passed through each function.

Custom functions :
    functions must take 2 arguments (_data,_args) : where _data is a data-frame and _args is an object describing the input parameters
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
|
||||
def limit(_data,size):
|
||||
"""
|
||||
...,{limit:size}
|
||||
"""
|
||||
|
||||
# size = int(_args['limit'])
|
||||
return _data.iloc[:size]
|
||||
def format(_data,_schema):
|
||||
"""
|
||||
This function enforces a schema against a data-frame, this may or may not work depending on the persistence storage
|
||||
:_data data-frame containing all data
|
||||
:_args schema to enforce the data, we are expecting the format as a list of {name,type,description}
|
||||
"""
|
||||
return _data
|
||||
|
||||
def approximate(_data,_args):
|
||||
"""
|
||||
:_args Object of {field:type}
|
||||
This function will approximate n-fields in the data given its distribution
|
||||
"""
|
||||
_m = {'int':int,'float':float,'integer':int,'double':float}
|
||||
columns = list(_args.keys())
|
||||
for _name in columns :
|
||||
if _name not in _data :
|
||||
continue
|
||||
otype = _args[_name]
|
||||
otype = str if otype not in _m else _m[otype]
|
||||
_data.loc[:,_name] = np.random.uniform(_data[_name].values).astype(otype)
|
||||
|
||||
return _data
|
||||
def split_date(_data,_args):
|
||||
"""
|
||||
This function takes a field and applies the format from other fields
|
||||
:_data data-frame
|
||||
:_config configuration entry {column:{format,column:format,type}}
|
||||
"""
|
||||
_columns = list(_args.keys())
|
||||
_m = {'int':int,'float':float,'integer':int,'double':float}
|
||||
for _name in _columns :
|
||||
_iname = _args[_name]['column']
|
||||
_iformat = _args[_name]['format']['in']
|
||||
_oformat = _args[_name]['format']['out']
|
||||
_otype = str if 'type' not in _args[_name] else _args[_name]['type']
|
||||
_data.loc[:,_name] = _data[_iname].apply(lambda _date: datetime.strftime(datetime.strptime(str(_date),_iformat),_oformat)).astype(_otype)
|
||||
return _data
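#
# Sketch of a split_date configuration; the column names below are hypothetical:
#
#   _args = {"birth_year":{"column":"date_of_birth","format":{"in":"%Y-%m-%d","out":"%Y"},"type":int}}
#   _data = split_date(_data,_args)   #-- adds _data['birth_year'] derived from _data['date_of_birth']
#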
|
||||
def newdate(_data,_args):
|
||||
"""
|
||||
This function creates a new date on a given column from another
|
||||
:_data data frame
|
||||
:_args configuration column:{format,column}
|
||||
"""
|
||||
_columns = list(_args.keys())
|
||||
for _name in _columns :
|
||||
|
||||
format = _args[_name]['format']
|
||||
ROW_COUNT = _data[_name].size
|
||||
if 'column' in _args[_name] :
|
||||
srcName = _args[_name]['column']
|
||||
years = _data[srcName].values
|
||||
else:
|
||||
years = np.random.choice(np.arange(datetime.now().year- 90,datetime.now().year),ROW_COUNT)
|
||||
_data.loc[:,_name] = [ _makedate(year = years[_index],format = format) for _index in np.arange(ROW_COUNT)]
|
||||
|
||||
return _data
|
||||
def _makedate(**_args):
|
||||
"""
|
||||
This function creates a new date and applies it to a column
|
||||
:_data data-frame with columns
|
||||
:_args arguments for col1:format
|
||||
"""
|
||||
_columns = list(_args.keys())
|
||||
|
||||
# if _args['year'] in ['',None,np.nan] :
|
||||
# year = np.random.choice(np.arange(1920,222),1)
|
||||
# else:
|
||||
# year = int(_args['year'])
|
||||
year = int(_args['year'])
|
||||
offset = _args['offset'] if 'offset' in _args else 0
|
||||
month = np.random.randint(1,13)
|
||||
if month == 2:
|
||||
_end = 28 if year % 4 != 0 else 29
|
||||
else:
|
||||
_end = 31 if month in [1,3,5,7,8,10,12] else 30
|
||||
day = np.random.randint(1,_end)
|
||||
|
||||
#-- synthetic date
|
||||
_date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0)
|
||||
FORMAT = '%Y-%m-%d'
|
||||
|
||||
if 'format' in _args:
|
||||
FORMAT = _args['format']
|
||||
|
||||
|
||||
# print ([_name,FORMAT, _date.strftime(FORMAT)])
|
||||
    r = []
    if offset :
        r = [_date.strftime(FORMAT)]
        for _delta in offset :
            _date = _date + timedelta(_delta)
            r.append(_date.strftime(FORMAT))
        return r
    else:
        return _date.strftime(FORMAT)
|
||||
|
@@ -0,0 +1,303 @@
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
from transport import factory
|
||||
import numpy as np
|
||||
import os
|
||||
from multiprocessing import Process
|
||||
import pandas as pd
|
||||
from google.oauth2 import service_account
|
||||
import data.maker
|
||||
|
||||
from data.params import SYS_ARGS
|
||||
|
||||
#
|
||||
# The configuration array is now loaded and we will execute the pipe line as follows
|
||||
DATASET='combined20190510'
|
||||
|
||||
class Components :
|
||||
|
||||
@staticmethod
|
||||
def get(args):
|
||||
"""
|
||||
This function returns a data-frame provided a bigquery sql statement with conditions (and limits for testing purposes)
|
||||
The function must be wrapped around a lambda this makes testing easier and changing data stores transparent to the rest of the code. (Vital when testing)
|
||||
:sql basic sql statement
|
||||
:condition optional condition and filters
|
||||
"""
|
||||
SQL = args['sql']
|
||||
if 'condition' in args :
|
||||
condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')'])
|
||||
SQL = " ".join([SQL,'WHERE',condition])
|
||||
|
||||
SQL = SQL.replace(':dataset',args['dataset']) #+ " LIMIT 1000 "
|
||||
if 'limit' in args :
|
||||
SQL = SQL + ' LIMIT ' + args['limit']
|
||||
credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
|
||||
df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna()
|
||||
return df
|
||||
|
||||
# return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna()
|
||||
@staticmethod
|
||||
def split(X,MAX_ROWS=3,PART_SIZE=3):
|
||||
|
||||
return list(pd.cut( np.arange(X.shape[0]+1),PART_SIZE).categories)
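    #
    # Sketch of the partitioning, assuming a hypothetical data-frame X with 9 rows and PART_SIZE=3:
    #
    #   bounds = Components.split(X)
    #   # -> three intervals covering roughly (0,3], (3,6], (6,9]
    #   # each interval's .left/.right is later used to slice df.iloc[lbound:ubound]
    #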
|
||||
|
||||
def train(self,**args):
|
||||
"""
|
||||
This function will perform training on the basis of a given pointer that reads data
|
||||
|
||||
"""
|
||||
#
|
||||
# @TODO: we need to log something here about the parameters being passed
|
||||
pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args)
|
||||
df = pointer()
|
||||
|
||||
#
|
||||
# Now we can parse the arguments and submit the entire thing to training
|
||||
#
|
||||
|
||||
logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
|
||||
log_folder = args['logs'] if 'logs' in args else 'logs'
|
||||
_args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger}
|
||||
_args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs'])
|
||||
_args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1
|
||||
|
||||
MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0
|
||||
PART_SIZE = args['part_size'] if 'part_size' in args else 0
|
||||
|
||||
if df.shape[0] > MAX_ROWS and 'partition' not in args:
|
||||
lbound = 0
|
||||
bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories)
|
||||
# bounds = Components.split(df,MAX_ROWS,PART_SIZE)
|
||||
|
||||
qwriter = factory.instance(type='queue.QueueWriter',args={'queue':'aou.io'})
|
||||
|
||||
for b in bounds :
|
||||
part_index = bounds.index(b)
|
||||
ubound = int(b.right)
|
||||
|
||||
|
||||
_data = df.iloc[lbound:ubound][args['columns']]
|
||||
lbound = ubound
|
||||
|
||||
# _args['logs'] = os.sep.join([log_folder,str(part_index)])
|
||||
_args['partition'] = str(part_index)
|
||||
_args['logger'] = {'args':{'dbname':'aou','doc':args['context']},'type':'mongo.MongoWriter'}
|
||||
#
|
||||
# We should post the partitions to a queue server (at least the instructions on):
#   - where to get the data
#   - the arguments to use (partition #,columns,gpu,epochs)
#
info = {"rows":_data.shape[0],"cols":_data.shape[1], "partition":part_index,"logs":_args['logs']}
|
||||
p = {"args":_args,"data":_data.to_dict(orient="records"),"info":info}
|
||||
qwriter.write(p)
|
||||
#
|
||||
# @TODO:
|
||||
# - Notify that information was just posted to the queue
|
||||
info['max_rows'] = MAX_ROWS
|
||||
info['part_size'] = PART_SIZE
|
||||
logger.write({"module":"train","action":"setup-partition","input":info})
|
||||
|
||||
pass
|
||||
else:
|
||||
partition = args['partition'] if 'partition' in args else ''
|
||||
log_folder = os.sep.join([log_folder,args['context'],partition])
|
||||
_args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger}
|
||||
_args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs'])
|
||||
_args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0'
|
||||
|
||||
_args['data'] = df
|
||||
#
|
||||
# @log :
|
||||
# Logging information about the training process for this partition (or not)
|
||||
#
|
||||
info = {"rows":df.shape[0],"cols":df.shape[1], "partition":partition,"logs":_args['logs']}
|
||||
logger.write({"module":"train","action":"train","input":info})
|
||||
data.maker.train(**_args)
|
||||
|
||||
pass
|
||||
|
||||
# @staticmethod
|
||||
def generate(self,args):
|
||||
"""
|
||||
This function will generate data and store it to a given target (file or big query table)
|
||||
"""
|
||||
logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
|
||||
log_folder = args['logs'] if 'logs' in args else 'logs'
|
||||
partition = args['partition'] if 'partition' in args else ''
|
||||
log_folder = os.sep.join([log_folder,args['context'],partition])
|
||||
_args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger}
|
||||
_args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs'])
|
||||
_args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0'
|
||||
_args['no_value']= args['no_value']
|
||||
MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0
|
||||
PART_SIZE = args['part_size'] if 'part_size' in args else 0
|
||||
|
||||
# credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
|
||||
# _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna()
|
||||
reader = args['reader']
|
||||
df = reader()
|
||||
if 'partition' in args :
|
||||
bounds = Components.split(df,MAX_ROWS,PART_SIZE)
|
||||
# bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories)
|
||||
lbound = int(bounds[int(partition)].left)
|
||||
ubound = int(bounds[int(partition)].right)
|
||||
df = df.iloc[lbound:ubound]
|
||||
_args['data'] = df
|
||||
# _args['data'] = reader()
|
||||
#_args['data'] = _args['data'].astype(object)
|
||||
_args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1
|
||||
_dc = data.maker.generate(**_args)
|
||||
#
|
||||
# We need to post the generate the data in order to :
|
||||
# 1. compare immediately
|
||||
# 2. synthetic copy
|
||||
#
|
||||
|
||||
cols = _dc.columns.tolist()
|
||||
|
||||
data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query)
|
||||
base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it)
|
||||
|
||||
for name in cols :
|
||||
_args['data'][name] = _dc[name]
|
||||
info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}}
|
||||
if partition != '' :
|
||||
info['partition'] = partition
|
||||
logger.write(info)
|
||||
# filename = os.sep.join([log_folder,'output',name+'.csv'])
|
||||
# data_comp[[name]].to_csv(filename,index=False)
|
||||
|
||||
#
|
||||
#-- Let us store all of this into bigquery
|
||||
prefix = args['notify']+'.'+_args['context']
|
||||
table = '_'.join([prefix,partition,'io']).replace('__','_')
|
||||
folder = os.sep.join([args['logs'],args['context'],partition,'output'])
|
||||
if 'file' in args :
|
||||
|
||||
_fname = os.sep.join([folder,table.replace('_io','_full_io.csv')])
|
||||
_pname = os.sep.join([folder,table])+'.csv'
|
||||
data_comp.to_csv( _pname,index=False)
|
||||
_args['data'].to_csv(_fname,index=False)
|
||||
|
||||
|
||||
else:
|
||||
credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
|
||||
_pname = os.sep.join([folder,table+'.csv'])
|
||||
_fname = table.replace('_io','_full_io')
|
||||
data_comp.to_gbq(if_exists='replace',destination_table=_pname,credentials=credentials,chunksize=50000)
|
||||
data_comp.to_csv(_pname,index=False)
|
||||
INSERT_FLAG = 'replace' if 'partition' not in args else 'append'
|
||||
_args['data'].to_gbq(if_exists=INSERT_FLAG,destination_table=_fname,credentials=credentials,chunksize=50000)
|
||||
|
||||
info = {"full":{"path":_fname,"rows":_args['data'].shape[0]},"compare":{"name":_pname,"rows":data_comp.shape[0]} }
|
||||
if partition :
|
||||
info ['partition'] = partition
|
||||
logger.write({"module":"generate","action":"write","info":info} )
|
||||
@staticmethod
|
||||
def callback(channel,method,header,stream):
|
||||
|
||||
info = json.loads(stream)
|
||||
logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':SYS_ARGS['context']})
|
||||
|
||||
logger.write({'module':'process','action':'read-partition','input':info['info']})
|
||||
df = pd.DataFrame(info['data'])
|
||||
args = info['args']
|
||||
if int(args['num_gpu']) > 1 and args['gpu'] > 0:
|
||||
args['gpu'] = args['gpu'] + args['num_gpu']
|
||||
args['reader'] = lambda: df
|
||||
#
|
||||
# @TODO: Fix
|
||||
# There is an inconsistency in column/columns ... fix this shit!
|
||||
#
|
||||
args['columns'] = args['column']
|
||||
(Components()).train(**args)
|
||||
logger.write({"module":"process","action":"exit","info":info["info"]})
|
||||
channel.close()
|
||||
channel.connection.close()
|
||||
pass
|
||||
|
||||
if __name__ == '__main__' :
|
||||
filename = SYS_ARGS['config'] if 'config' in SYS_ARGS else 'config.json'
|
||||
f = open (filename)
|
||||
PIPELINE = json.loads(f.read())
|
||||
f.close()
|
||||
index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else 0
|
||||
|
||||
args = (PIPELINE[index])
|
||||
args['dataset'] = 'combined20190510'
|
||||
args = dict(args,**SYS_ARGS)
|
||||
args['max_rows'] = int(args['max_rows']) if 'max_rows' in args else 3
|
||||
args['part_size']= int(args['part_size']) if 'part_size' in args else 3
|
||||
|
||||
#
|
||||
# @TODO:
|
||||
# Log what was initiated so we have context of this processing ...
|
||||
#
|
||||
if 'listen' not in SYS_ARGS :
|
||||
if 'file' in args :
|
||||
reader = lambda: pd.read_csv(args['file']) ;
|
||||
else:
|
||||
reader = lambda: Components().get(args)
|
||||
args['reader'] = reader
|
||||
|
||||
if 'generate' in SYS_ARGS :
|
||||
#
|
||||
# Let us see if we have partitions given the log folder
|
||||
|
||||
content = os.listdir( os.sep.join([args['logs'],args['context']]))
|
||||
generator = Components()
|
||||
if ''.join(content).isnumeric() :
|
||||
#
|
||||
# we have partitions we are working with
|
||||
|
||||
for id in ''.join(content) :
|
||||
args['partition'] = id
|
||||
|
||||
generator.generate(args)
|
||||
else:
|
||||
generator.generate(args)
|
||||
# Components.generate(args)
|
||||
elif 'listen' in args :
|
||||
#
|
||||
# This will start a worker just in case to listen to a queue
|
||||
if 'read' in SYS_ARGS :
|
||||
QUEUE_TYPE = 'queue.QueueReader'
|
||||
pointer = lambda qreader: qreader.read(1)
|
||||
else:
|
||||
QUEUE_TYPE = 'queue.QueueListener'
|
||||
pointer = lambda qlistener: qlistener.listen()
|
||||
N = int(SYS_ARGS['jobs']) if 'jobs' in SYS_ARGS else 1
|
||||
|
||||
qhandlers = [factory.instance(type=QUEUE_TYPE,args={'queue':'aou.io'}) for i in np.arange(N)]
|
||||
jobs = []
|
||||
for qhandler in qhandlers :
|
||||
qhandler.callback = Components.callback
|
||||
job = Process(target=pointer,args=(qhandler,))
|
||||
job.start()
|
||||
jobs.append(job)
|
||||
#
|
||||
# let us wait for the jobs
|
||||
print (["Started ",len(jobs)," trainers"])
|
||||
while len(jobs) > 0 :
|
||||
|
||||
jobs = [job for job in jobs if job.is_alive()]
|
||||
|
||||
# pointer(qhandler)
|
||||
|
||||
|
||||
# qreader.read(1)
|
||||
pass
|
||||
else:
|
||||
|
||||
trainer = Components()
|
||||
trainer.train(**args)
|
||||
# Components.train(**args)
|
||||
#for args in PIPELINE :
|
||||
#args['dataset'] = 'combined20190510'
|
||||
#process = Process(target=Components.train,args=(args,))
|
||||
#process.name = args['context']
|
||||
#process.start()
|
||||
# Components.train(args)
|
@@ -0,0 +1,692 @@
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
from transport import factory
|
||||
import numpy as np
|
||||
import time
|
||||
import os
|
||||
from multiprocessing import Process, Lock
|
||||
import pandas as pd
|
||||
from google.oauth2 import service_account
|
||||
from google.cloud import bigquery as bq
|
||||
import data.maker
|
||||
import copy
|
||||
from data.params import SYS_ARGS
|
||||
|
||||
#
|
||||
# The configuration array is now loaded and we will execute the pipe line as follows
|
||||
|
||||
class Components :
|
||||
lock = Lock()
|
||||
class KEYS :
|
||||
PIPELINE_KEY = 'pipeline'
|
||||
SQL_FILTER = 'filter'
|
||||
@staticmethod
|
||||
def get_filter (**args):
|
||||
if args['qualifier'] == 'IN' :
|
||||
return ' '.join([args['field'],args['qualifier'],'(',args['value'],')'])
|
||||
else:
|
||||
return ' '.join([args['field'],args['qualifier'],args['value']])
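    #
    # Sketch of the filter rendering; the field/value pairs below are hypothetical:
    #
    #   Components.get_filter(field='person_id',qualifier='IN',value='1,2,3')    # -> "person_id IN ( 1,2,3 )"
    #   Components.get_filter(field='year_of_birth',qualifier='>',value='1950')  # -> "year_of_birth > 1950"
    #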
|
||||
@staticmethod
|
||||
def get_logger(**args) :
|
||||
return factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
|
||||
@staticmethod
|
||||
def get(args):
|
||||
"""
|
||||
This function returns a data-frame provided a bigquery sql statement with conditions (and limits for testing purposes)
|
||||
The function must be wrapped around a lambda this makes testing easier and changing data stores transparent to the rest of the code. (Vital when testing)
|
||||
:sql basic sql statement
|
||||
:condition optional condition and filters
|
||||
"""
|
||||
SQL = args['sql']
|
||||
if Components.KEYS.SQL_FILTER in args :
|
||||
FILTER_KEY = Components.KEYS.SQL_FILTER
|
||||
SQL_FILTER = args[FILTER_KEY] if type(args[FILTER_KEY]) == list else [args[FILTER_KEY]]
|
||||
# condition = ' '.join([args[FILTER_KEY]['field'],args[FILTER_KEY]['qualifier'],'(',args[FILTER_KEY]['value'],')'])
|
||||
|
||||
condition = ' AND '.join([Components.get_filter(**item) for item in SQL_FILTER])
|
||||
SQL = " ".join([SQL,'WHERE',condition])
|
||||
|
||||
SQL = SQL.replace(':dataset',args['dataset']) #+ " LI "
|
||||
|
||||
if 'limit' in args :
|
||||
SQL = SQL + ' LIMIT ' + args['limit']
|
||||
#
|
||||
# let's log the sql query that has been performed here
|
||||
logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
|
||||
logger.write({"module":"bigquery","action":"read","input":{"sql":SQL}})
|
||||
credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
|
||||
df = pd.read_gbq(SQL,credentials=credentials,dialect='standard')
|
||||
return df
|
||||
|
||||
# return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna()
|
||||
@staticmethod
|
||||
def split(X,MAX_ROWS=3,PART_SIZE=3):
|
||||
|
||||
return list(pd.cut( np.arange(X.shape[0]+1),PART_SIZE).categories)
|
||||
def format_schema(self,schema):
|
||||
_schema = {}
|
||||
for _item in schema :
|
||||
_type = int
|
||||
_value = 0
|
||||
if _item.field_type == 'FLOAT' :
|
||||
_type =float
|
||||
elif _item.field_type != 'INTEGER' :
|
||||
_type = str
|
||||
_value = ''
|
||||
_schema[_item.name] = _type
|
||||
return _schema
|
||||
def get_ignore(self,**_args) :
|
||||
if 'columns' in _args and 'data' in _args :
|
||||
_df = _args['data']
|
||||
terms = _args['columns']
|
||||
return [name for name in _df.columns if np.sum( [int(field in name )for field in terms ]) ]
|
||||
|
||||
return []
|
||||
def set_gpu(self,**_args) :
|
||||
if 'gpu' in _args :
|
||||
gpu = _args['gpu'] if type(_args['gpu']) != str else [_args['gpu']]
|
||||
_index = str(gpu[0])
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = _index
|
||||
return gpu
|
||||
else :
|
||||
return None
|
||||
def train(self,**args):
|
||||
"""
|
||||
This function will perform training on the basis of a given pointer that reads data
|
||||
|
||||
"""
|
||||
schema = None
|
||||
if 'file' in args :
|
||||
|
||||
df = pd.read_csv(args['file'])
|
||||
del args['file']
|
||||
elif 'data' not in args :
|
||||
|
||||
reader = factory.instance(**args['store']['source'])
|
||||
|
||||
|
||||
if 'row_limit' in args :
|
||||
df = reader.read(sql=args['sql'],limit=args['row_limit'])
|
||||
else:
|
||||
df = reader.read(sql=args['sql'])
|
||||
schema = reader.meta(table=args['from']) if hasattr(reader,'meta') and 'from' in args else None
|
||||
else:
|
||||
df = args['data']
|
||||
|
||||
#
|
||||
#
|
||||
# df = df.fillna('')
|
||||
if schema :
|
||||
_schema = []
|
||||
for _item in schema :
|
||||
_type = int
|
||||
_value = 0
|
||||
if _item.field_type == 'FLOAT' :
|
||||
_type =float
|
||||
elif _item.field_type != 'INTEGER' :
|
||||
_type = str
|
||||
_value = ''
|
||||
_schema += [{"name":_item.name,"type":_item.field_type}]
|
||||
df[_item.name] = df[_item.name].fillna(_value).astype(_type)
|
||||
args['schema'] = _schema
|
||||
# df[_item.name] = df[_item.name].astype(_type)
|
||||
_args = copy.deepcopy(args)
|
||||
# _args['store'] = args['store']['source']
|
||||
_args['data'] = df
|
||||
#
|
||||
# The columns that are continuous should also be skipped because they don't need to be synthesized (like that)
|
||||
if 'continuous' in args :
|
||||
x_cols = args['continuous']
|
||||
else:
|
||||
x_cols = []
|
||||
|
||||
if 'ignore' in args and 'columns' in args['ignore'] :
|
||||
_cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
|
||||
_args['data'] = df[ list(set(df.columns)- set(_cols))]
|
||||
#
|
||||
# We need to make sure that continuous columns are removed
|
||||
if x_cols :
|
||||
_args['data'] = _args['data'][list(set(_args['data'].columns) - set(x_cols))]
|
||||
if 'gpu' in args :
|
||||
_args['gpu'] = self.set_gpu(gpu=args['gpu'])
|
||||
if 'partition' in args :
|
||||
_args['partition'] = args['partition']
|
||||
if df.shape[0] and df.shape[0] :
|
||||
#
|
||||
# We have a full blown matrix to be processed
|
||||
print ('-- Training --')
|
||||
data.maker.train(**_args)
|
||||
else:
|
||||
print ("... skipping training !!")
|
||||
|
||||
if 'autopilot' in ( list(args.keys())) :
|
||||
|
||||
args['data'] = df
|
||||
print (['autopilot mode enabled ....',args['context']])
|
||||
self.generate(args)
|
||||
|
||||
pass
|
||||
|
||||
    def approximate(self,values):
        """
        This function will approximate (perturb) a continuous column instead of synthesizing it
        :param values   array of values to be approximated
        """
        if values.dtype in [int,float] :
            #
            # @TODO: create bins?
            r = np.random.dirichlet(values+.001) #-- dirichlet doesn't work on values with zeros
            _sd = values[values > 0].std()
            _me = values[values > 0].mean()
            _mi = values.min()
            x = []
            _type = values.dtype
            for index in np.arange(values.size) :
                if np.random.choice([0,1],1)[0] :
                    value = values[index] + (values[index] * r[index])
                else :
                    value = values[index] - (values[index] * r[index])
                #
                # randomly shifting the measurements
                #
                if np.random.choice([0,1],1)[0] and _me > _sd :
                    if np.random.choice([0,1],1)[0] :
                        value = value * np.divide(_me,_sd)
                    else:
                        value = value + (np.divide(_me,_sd))
                value = int(value) if _type == int else np.round(value,2)
                x.append( value)
            np.random.shuffle(x)
            return np.array(x)
        else:
            return values
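    #
    # Worked example (hypothetical numbers) of the perturbation above :
    #   values = np.array([10, 20, 30]) ; r ~ Dirichlet(values + .001) might come out as [0.2, 0.3, 0.5]
    #   each entry is randomly moved up or down by value*r[i] (e.g. 10 -> 12.0 or 8.0) and may
    #   additionally be scaled or shifted by the mean/sd of the positive values, then rounded.
    #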
    def shuffle(self,_args):
        """
        This function will shuffle the designated columns of a table and approximate the continuous ones
        """
        reader = None
        if 'data' in _args :
            df = _args['data']
        else:
            reader = factory.instance(**_args['store']['source'])
            if 'file' in _args :
                df = pd.read_csv(_args['file'])
            elif 'row_limit' in _args and 'sql' in _args:
                df = reader.read(sql=_args['sql'],limit=_args['row_limit'])
            else:
                df = reader.read(sql=_args['sql'])
        schema = None
        if 'schema' not in _args and reader is not None and hasattr(reader,'meta') and 'file' not in _args:
            schema = reader.meta(table=_args['from'])
            schema = [{"name":_item.name,"type":_item.field_type} for _item in schema]
        #
        # We shuffle the designated columns and approximate the others
        #
        x_cols = [] #-- columns to be approximated
        _cols = [] #-- columns to be ignored
        if 'continuous' in _args :
            x_cols = _args['continuous']
        if 'ignore' in _args and 'columns' in _args['ignore'] :
            _cols = self.get_ignore(data=df,columns=_args['ignore']['columns'])

        columns = _args['columns'] if 'columns' in _args else df.columns
        columns = list(set(columns) - set(_cols))
        for name in columns:
            i = np.arange(df.shape[0])
            np.random.shuffle(i)
            if name in x_cols :
                if df[name].unique().size > 0 :
                    df[name] = self.approximate(df.iloc[i][name].fillna(0).values)
                # df[name] = df[name].astype(str)

        df.index = np.arange(df.shape[0])
        self.post(data=df,schema=schema,store=_args['store']['target'])
    def post(self,**_args) :
        """
        This function will cast a data-frame against a schema (when provided) and write it to the target store
        """
        table = _args['from'] if 'from' in _args else _args['store']['table']
        _schema = _args['schema'] if 'schema' in _args else None
        writer = factory.instance(**_args['store'])
        _df = _args['data']
        if _schema :
            columns = []
            for _item in _schema :
                name = _item['name']
                _type = str
                _value = 0
                if _item['type'] in ['DATE','TIMESTAMP','DATETIMESTAMP','DATETIME'] :
                    if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] :
                        #
                        # There is an issue with missing dates that needs to be resolved.
                        # For some reason a missing date/time here will turn the type into a timestamp (problem).
                        # The following is a hack to address the issue (alas), assuming 10-character dates and
                        # 'NaT' as the missing date value (pandas convention).
                        #
                        _df[name] = _df[name].apply(lambda value: None if str(value) == 'NaT' else (str(value)[:10]) if _item['type'] in ['DATE','DATETIME'] else str(value))
                        # _df[name] = _df[name].dt.date
                        # _df[name] = pd.to_datetime(_df[name].fillna(''),errors='coerce')
                    else:
                        _df[name] = pd.to_datetime(_df[name])
                else:
                    if _item['type'] == 'INTEGER' :
                        _type = np.int64
                    elif _item['type'] in ['FLOAT','NUMERIC']:
                        _type = np.float64
                    else:
                        _value = ''
                    _df[name] = _df[name].fillna(_value) #.astype(_type)
                columns.append(name)

        fields = _df.columns.tolist()
        if not writer.has(table=table) and _args['store']['provider'] != 'bigquery':
            _map = {'STRING':'VARCHAR(256)','INTEGER':'BIGINT'} if 'provider' in _args['store'] and _args['store']['provider'] != 'bigquery' else {}
            _params = {'map':_map,'table':table}
            if _schema :
                _params['schema'] = _schema
            else:
                _params['fields'] = fields

            writer.make(**_params)

        fields = _df.columns.tolist()
        _df = _df[fields]
        # writer.fields = fields
        if _args['store']['provider'] == 'bigquery' :
            print (['_______ POSTING ______________ ',table])
            print (['_______________ ',_df.shape[0],' ___________________'])
            writer.write(_df.astype(object),schema=_schema,table=table)
        else:
            writer.table = table
            writer.write(_df)
        # else:
        #     writer.write(_df,table=table)
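    #
    # Illustrative (hypothetical field names): the 'schema' entries handed to post() above look like
    #   [{"name":"person_id","type":"INTEGER"},{"name":"birth_datetime","type":"TIMESTAMP"},
    #    {"name":"gender_source_value","type":"STRING"}]
    # i.e. the field name/type pairs gathered from reader.meta(...) on the source table.
    #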
    def finalize(self,args):
        """
        This function performs post-processing operations on a synthetic table i.e. :
        - remove duplicate keys
        - remove orphaned keys
        """
        reader = factory.instance(**args['store']['source'])
        logger = factory.instance(**args['store']['logs'])

        target = args['store']['target']['args']['dataset']
        source = args['store']['source']['args']['dataset']
        table = args['from']
        schema = reader.meta(table=args['from'])
        #
        # keys :
        #
        unique_field = "_".join([args['from'],'id']) if 'unique_fields' not in args else args['unique_fields']
        fields = [ item.name if item.name != unique_field else "y."+item.name for item in schema]
        SQL = [
            "SELECT :fields FROM ",
            "(SELECT ROW_NUMBER() OVER() AS row_number,* FROM :target.:table) x","INNER JOIN",
            "(SELECT ROW_NUMBER() OVER() AS row_number, :unique_field FROM :source.:table ORDER BY RAND()) y",
            "ON y.row_number = x.row_number"
        ]
        SQL = " ".join(SQL).replace(":fields",",".join(fields)).replace(":table",table).replace(":source",source).replace(":target",target)
        SQL = SQL.replace(":unique_field",unique_field)
        #
        # Use a native job to get this done ...
        #
        client = bq.Client.from_service_account_json(args['store']['source']['args']["private_key"])
        job = bq.QueryJobConfig()
        job.destination = client.dataset(target).table(table)
        job.use_query_cache = True
        job.allow_large_results = True
        # job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
        job.write_disposition = "WRITE_TRUNCATE"
        job.priority = 'BATCH'
        r = client.query(SQL,location='US',job_config=job)
        logger.write({"job":r.job_id,"action":"finalize","args":{"sql":SQL,"source":".".join([source,table]),"destination":".".join([target,table])}})
        #
        # Keep a log of what just happened...
        #
        otable = ".".join([args['store']['source']['args']['dataset'],args['from']])
        dtable = ".".join([args['store']['target']['args']['dataset'],args['from']])
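    #
    # Illustrative rendering of the SQL built above (dataset/table names are hypothetical) :
    #   SELECT value_as_number,y.observation_id FROM
    #   (SELECT ROW_NUMBER() OVER() AS row_number,* FROM synthetic_ds.observation) x INNER JOIN
    #   (SELECT ROW_NUMBER() OVER() AS row_number, observation_id FROM original_ds.observation ORDER BY RAND()) y
    #   ON y.row_number = x.row_number
    # i.e. each synthetic row is re-keyed with a randomly drawn key from the original table.
    #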
    def generate(self,args):
        """
        This function will generate synthetic data and store it to a given data store
        """
        store = args['store']['logs']
        if 'args' in store :
            store['args']['doc'] = args['context']
        else:
            store['doc'] = args['context']
        logger = factory.instance(**store) #type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})

        ostore = args['store']['target']
        writer = factory.instance(**ostore)

        schema = args['schema'] if 'schema' in args else None
        if 'data' in args :
            df = args['data']
        else:
            reader = factory.instance(**args['store']['source'])
            if 'row_limit' in args :
                df = reader.read(sql=args['sql'],limit=args['row_limit'])
            else:
                df = reader.read(sql=args['sql'])
            if 'schema' not in args and hasattr(reader,'meta'):
                schema = reader.meta(table=args['from'])
                schema = [{"name":_item.name,"type":_item.field_type} for _item in schema]
        # else:
        #     #
        #     # This will account for autopilot mode ...
        #     df = args['data']
        _cast = {}
        if schema :
            for _item in schema :
                dtype = str
                name = _item['name']
                novalue = 0
                if _item['type'] in ['INTEGER','NUMERIC']:
                    dtype = np.int64
                elif _item['type'] == 'FLOAT' :
                    dtype = np.float64
                else:
                    novalue = ''
                # _cast[schema['name']] = dtype
                df[name] = df[name].fillna(novalue).astype(dtype)

        _info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[1]},"schema":schema}
        logger.write(_info)

        _dc = pd.DataFrame()
        # for mdf in df :
        args['data'] = df.copy()
        #
        # The continuous columns are skipped because they are not synthesized by the GAN (they are approximated instead)
        #
        if 'continuous' in args :
            x_cols = args['continuous']
        else:
            x_cols = []

        if 'ignore' in args and 'columns' in args['ignore'] :
            _cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
            args['data'] = args['data'][ list(set(df.columns) - set(_cols))]
        #
        # We need to remove the continuous columns from the data-frame
        # @TODO: Abstract this !!
        #
        real_df = pd.DataFrame()
        if x_cols :
            args['data'] = args['data'][list(set(args['data'].columns) - set(x_cols))]
            real_df = df[x_cols].copy()

        args['candidates'] = 1 if 'candidates' not in args else int(args['candidates'])
        if 'gpu' in args :
            args['gpu'] = self.set_gpu(gpu=args['gpu'])
        # if 'partition' in args :
        #     args['logs'] = os.sep.join([args['logs'],str(args['partition'])])

        _info = {"module":"gan-prep","action":"prune","shape":{"rows":args['data'].shape[0],"columns":args['data'].shape[1]}}
        logger.write(_info)
        if args['data'].shape[0] > 0 and args['data'].shape[1] > 0 :
            candidates = (data.maker.generate(**args))
        else:
            candidates = [df]

        # if 'sql.BQWriter' in ostore['type'] :
        _columns = None
        skip_columns = []
        _schema = schema
        if schema :
            cols = [_item['name'] for _item in _schema]
        else:
            cols = df.columns.tolist()
        _info = {"module":"gan-prep","action":"selection","input":{"candidates":len(candidates),"features":cols}}
        logger.write(_info)
        for _df in candidates :
            #
            # we need to format the fields here to make sure we have something cohesive
            #
            if not skip_columns :
                if 'ignore' in args and 'columns' in args['ignore'] :
                    skip_columns = self.get_ignore(data=_df,columns=args['ignore']['columns'])
            #
            # We perform a series of set operations to ensure that the following conditions are met:
            # - the synthetic dataset only has the fields that needed to be synthesized
            # - the original dataset has all the fields except those that were synthesized
            #
            _df = _df[list(set(_df.columns) - set(skip_columns))].copy()
            if x_cols :
                _approx = {}
                for _col in x_cols :
                    if real_df[_col].unique().size > 0 :
                        _df[_col] = self.approximate(real_df[_col].values)
                        _approx[_col] = {
                            "io":{"min":_df[_col].min().astype(float),"max":_df[_col].max().astype(float),"mean":_df[_col].mean().astype(float),"sd":_df[_col].values.std().astype(float),"missing": _df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":_df[_col].where(_df[_col] == 0).dropna().count().astype(float)},
                            "real":{"min":real_df[_col].min().astype(float),"max":real_df[_col].max().astype(float),"mean":real_df[_col].mean().astype(float),"sd":real_df[_col].values.std().astype(float),"missing": real_df[_col].where(real_df[_col] == -1).dropna().count().astype(float),"zeros":real_df[_col].where(real_df[_col] == 0).dropna().count().astype(float)}
                        }
                    else:
                        _df[_col] = -1
                logger.write({"module":"gan-generate","action":"approximate","status":_approx})
            if set(df.columns) & set(_df.columns) :
                _columns = list(set(df.columns) - set(_df.columns))
                df = df[_columns]

            #
            # Let us merge the datasets here and have a comprehensive dataset
            #
            _df = pd.DataFrame.join(df,_df)
            _params = {'data':_df,'store' : ostore,'from':args['from']}
            if _schema :
                _params['schema'] = _schema
            _info = {"module":"gan-prep","action":"write","input":{"rows":_df.shape[0],"cols":_df.shape[1]}}
            logger.write(_info)
            self.post(**_params)
            # print (['_______ posting _________________',_df.shape])
            break

        pass
        # else:
        #     pass
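    #
    # Note on the loop above : only the first candidate is written (the loop breaks after one pass).
    # The synthetic columns are joined back (on the row index) to the untouched original columns so
    # that the published table carries the full set of fields.
    #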
    def bind(self,**_args):
        print (_args)


if __name__ == '__main__' :
    filename = SYS_ARGS['config'] if 'config' in SYS_ARGS else 'config.json'
    f = open (filename)
    _config = json.loads(f.read())
    f.close()
    PIPELINE = _config['pipeline']
    index = SYS_ARGS['index']
    if index.isnumeric() :
        index = int(SYS_ARGS['index'])
    else:
        #
        # The index provided is a key to a pipeline entry, namely its context
        #
        N = len(PIPELINE)
        f = [i for i in range(0,N) if PIPELINE[i]['context'] == index]
        index = f[0] if f else 0
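    #
    # Illustrative usage (hypothetical values, assuming SYS_ARGS parses --key value style flags):
    # the pipeline entry can be selected either by position or by its context name, e.g. :
    #   python pipeline.py --config config.json --index 2 ...
    #   python pipeline.py --config config.json --index observation ...
    #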

    print ("..::: ",PIPELINE[index]['context'],':::..')
    args = (PIPELINE[index])
    for key in _config :
        if key == 'pipeline' or key in args:
            #
            # skip the pipeline entry, or any key that already exists in the selected pipeline (provided by index)
            #
            continue
        args[key] = _config[key]

    args = dict(args,**SYS_ARGS)
    if 'matrix_size' in args :
        args['matrix_size'] = int(args['matrix_size'])
    if 'batch_size' not in args :
        args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size'])
    if 'dataset' not in args :
        args['dataset'] = 'combined20191004v2_deid'
    args['logs'] = args['logs'] if 'logs' in args else 'logs'
    PART_SIZE = int(args['part_size']) if 'part_size' in args else 8
    #
    # @TODO:
    # Log what was initiated so we have context for this processing ...
    #

    GPU_CHIPS = args['gpu'] if 'gpu' in args else None
    if GPU_CHIPS and type(GPU_CHIPS) != list :
        GPU_CHIPS = [int(_id.strip()) for _id in GPU_CHIPS.split(',')] if type(GPU_CHIPS) == str else [GPU_CHIPS]
    if 'gpu' in SYS_ARGS :
        args['gpu'] = GPU_CHIPS
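    #
    # Illustrative parsing of the gpu flag (hypothetical value): --gpu "0,1" yields GPU_CHIPS == [0, 1]
    # and a bare integer like --gpu 1 yields [1]; when the all-chips flag is also set, each chip gets
    # its own training/generation process further below.
    #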
    jobs = []
    if 'generate' in SYS_ARGS :
        #
        # Let us see if we have partitions given the log folder
        #
        content = os.listdir( os.sep.join([args['logs'],'train',args['context']]))
        if 'all-chips' in SYS_ARGS and GPU_CHIPS:
            index = 0
            jobs = []
            for _gpu in GPU_CHIPS :
                _args = copy.deepcopy(args)
                _args['gpu'] = [int(_gpu)]
                _args['partition'] = int(_gpu) #index
                index += 1
                make = lambda _params: (Components()).generate(_params)
                job = Process(target=make,args=( dict(_args),))
                job.name = 'Generator # ' + str(index)
                job.start()
                jobs.append(job)
        else:
            generator = Components()
            generator.generate(args)
    elif 'bind' in SYS_ARGS :
        import binder
        _args = _config['_map']
        _args['store'] = copy.deepcopy(_config['store'])
        if 'init' in SYS_ARGS :
            #
            # Creating and persisting the map ...
            #
            print (['.... Binding Initialization'])
            # jobs = binder.Init(**_args)
            _mapped = binder.Init(**_args)

            _schema = [{"name":_name,"type":"INTEGER"} for _name in _mapped.columns.tolist()]
            publisher = lambda _params: (Components()).post(**_params)
            _args = {'data':_mapped,'store':_config['store']['target']}
            _args['store']['table'] = '_map'
            if _args['store']['provider'] == 'bigquery' :
                _args['schema'] = _schema

            job = Process (target = publisher,args=(_args,))
            job.start()
            jobs = [job]
        else:
            #
            # Applying the map on a particular dataset
            #
            index = int(SYS_ARGS['index'])
            _args['config'] = _config['pipeline'][index]
            _args['original_key'] = _config['original_key'] if 'original_key' in _config else 'person_id'
            table = _config['pipeline'][index]['from']
            _df = binder.ApplyOn(**_args)
            _df = np.array_split(_df,PART_SIZE)
            jobs = []
            print (['Publishing ',PART_SIZE,' PARTITIONS'])
            for data in _df :
                publisher = lambda _params: ( Components() ).post(**_params)
                _args = {'data':data,'store':_config['store']['target']}
                _args['store']['table'] = table
                print (_args['store'])
                job = Process(target = publisher,args=(_args,))
                job.name = "Publisher "+str(len(jobs)+1)
                job.start()
                jobs.append(job)

    elif 'shuffle' in SYS_ARGS :
        index = 0
        if GPU_CHIPS and 'all-chips' in SYS_ARGS:
            for index in GPU_CHIPS :
                publisher = lambda _params: ( Components() ).shuffle(_params)
                job = Process (target = publisher,args=( args,))
                job.name = 'Shuffler #' + str(index)
                job.start()
                jobs.append(job)
        else:
            shuffler = Components()
            shuffler.shuffle(args)
    elif 'train' in SYS_ARGS:
        # DATA = np.array_split(DATA,PART_SIZE)
        #
        # Let us create n jobs across n gpus; the assumption here is that the data each job produces will be a partition
        # @TODO: Find a better name for partition
        #
        if GPU_CHIPS and 'all-chips' in SYS_ARGS:
            index = 0
            print (['... launching ',len(GPU_CHIPS),' jobs',args['context']])
            for _gpu in GPU_CHIPS :
                _args = copy.deepcopy(args)
                _args['gpu'] = [int(_gpu)]
                _args['partition'] = int(_gpu) #index
                index += 1
                make = lambda _params: (Components()).train(**_params)
                job = Process(target=make,args=( _args,))
                job.name = 'Trainer # ' + str(index)
                job.start()
                jobs.append(job)
        else:
            #
            # The choice of the chip will be made internally
            #
            agent = Components()
            agent.train(**args)
    #
    # If we have any jobs running we should wait until they finish
    #
    DIRTY = 0
    if (len(jobs)) :
        print (['.... waiting on ',len(jobs),' jobs'])
        while len(jobs) > 0 :
            DIRTY = 1
            jobs = [job for job in jobs if job.is_alive()]
            time.sleep(2)
    if DIRTY:
        print (["..:: jobs finished "])
    #
    # We need to harmonize the keys (if any); this applies to shuffle or generate operations
    # and holds true for bigquery-to-bigquery stores only
    #
    IS_BIGQUERY = _config['store']['source']['provider'] == _config['store']['target']['provider'] and _config['store']['source']['provider'] == 'bigquery'

    # if 'bind' not in SYS_ARGS and IS_BIGQUERY and ('autopilot' in SYS_ARGS or 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS)) :
    #     #
    #     # We should pull all the primary keys and regenerate them in order to ensure some form of consistency
    #     #
    #
    #     print (["..:: Finalizing process"])
    #     (Components()).finalize(args)