From f99af3655d0c8792c34f11a246e437c7d00ae46c Mon Sep 17 00:00:00 2001
From: Steve Nyemba
Date: Thu, 13 Jan 2022 15:05:00 -0600
Subject: [PATCH] bug fix: misc. improvements

Migrate data/gan.py off the removed tensorflow.contrib API for TensorFlow 2.x
compatibility (keras L2 regularizer, keepdims rename, integer shape indexing,
eager execution disabled), fix the MAP_FLDER/MAP_FOLDER typo and default the
'logs' argument in data/maker/__init__.py, and instantiate the generator
Components before use in pipeline.py.
---
 data/gan.py            | 21 ++++++++++++---------
 data/maker/__init__.py |  5 ++++-
 pipeline.py            | 18 ++++++++++++++----
 3 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/data/gan.py b/data/gan.py
index 643e838..0008489 100644
--- a/data/gan.py
+++ b/data/gan.py
@@ -20,7 +20,9 @@ EMBEDDED IN CODE :
 
 """
 import tensorflow as tf
-from tensorflow.contrib.layers import l2_regularizer
+# from tensorflow.contrib.layers import l2_regularizer
+from tensorflow.keras import layers
+from tensorflow.keras.regularizers import L2 as l2_regularizer
 import numpy as np
 import pandas as pd
 import time
@@ -34,7 +36,7 @@ import pickle
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ['CUDA_VISIBLE_DEVICES'] = "0"
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
-
+tf.compat.v1.disable_eager_execution()
 # STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256
 # NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu'])
 # BATCHSIZE_PER_GPU = 2000
@@ -211,13 +213,14 @@ class GNet :
         labels  = None if 'labels' not in args else args['labels']
         n_labels= None if 'n_labels' not in args else args['n_labels']
         shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing
-        mean, var = tf.nn.moments(inputs, shift, keep_dims=True)
-        shape = inputs.shape[1].value
+        # mean, var = tf.nn.moments(inputs, shift, keep_dims=True)
+        mean, var = tf.nn.moments(inputs, shift,keepdims=True)
+        # shape = inputs.shape[1].value
+        shape = inputs.shape[1]
+
         if labels is not None:
-            offset_m = self.get.variables(shape=[1,shape], name='offset'+name,
-                                    initializer=tf.zeros_initializer)
-            scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name,
-                                    initializer=tf.ones_initializer)
+            offset_m = self.get.variables(shape=[1,shape], name='offset'+name,initializer=tf.zeros_initializer)
+            scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name,initializer=tf.ones_initializer)
             offset = tf.nn.embedding_lookup(offset_m, labels)
             scale = tf.nn.embedding_lookup(scale_m, labels)
 
@@ -595,7 +598,7 @@ class Predict(GNet):
         df = pd.DataFrame()
         CANDIDATE_COUNT = args['candidates'] if 'candidates' in args else 1 #0 if self.ROW_COUNT < 1000 else 100
         candidates = []
-        
+
         with tf.compat.v1.Session() as sess:
             saver.restore(sess, model_dir)
             if self._LABEL is not None :
diff --git a/data/maker/__init__.py b/data/maker/__init__.py
index 7439e45..9db2b8d 100644
--- a/data/maker/__init__.py
+++ b/data/maker/__init__.py
@@ -106,6 +106,8 @@ def train (**_args):
         values = _inputhandler._map[key]['values'].tolist()
         _map[key] = {"beg":beg,"end":end,"values":np.array(values).astype(str).tolist()}
     info = {"rows":_matrix.shape[0],"cols":_matrix.shape[1],"map":_map}
+    print()
+    # print ([_args['context'],_inputhandler._io])
     logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":_inputhandler._io})
 
     args['logs'] = _args['logs'] if 'logs' in _args else 'logs'
@@ -142,9 +144,10 @@ def generate(**_args):
     :param context
     :param logs
     """
+    _args['logs'] = _args['logs'] if 'logs' in _args else 'logs'
    partition = _args['partition'] if 'partition' in _args else None
    if not partition :
-        MAP_FLDER = os.sep.join([_args['logs'],'output',_args['context']])
+        MAP_FOLDER = os.sep.join([_args['logs'],'output',_args['context']])
        # f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']))
    else:
        MAP_FOLDER = os.sep.join([_args['logs'],'output',_args['context'],str(partition)])
diff --git a/pipeline.py b/pipeline.py
index 56e522e..296d4d5 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -151,6 +151,7 @@ class Components :
             if df.shape[0] and df.shape[0] :
                 #
                 # We have a full blown matrix to be processed
+                print ('-- Training --')
                 data.maker.train(**_args)
             else:
                 print ("... skipping training !!")
@@ -259,16 +260,23 @@ class Components :
                         _df[name] = _df[name].apply(lambda value: '' if str(value) == 'NaT' else str(value)[:10])
                         #_df[name] = _df[name].dt.date
                         # _df[name] = pd.to_datetime(_df[name].fillna(''),errors='coerce')
+                    else:
+                        pass
+                        _df[name] = pd.to_datetime(_df[name])
                 else:
+                    value = 0
                     if _item['type'] == 'INTEGER' :
                         _type = np.int64
                     elif _item['type'] in ['FLOAT','NUMERIC']:
                         _type = np.float64
                     else:
+
                         _value = ''
-                    _df[name] = _df[name].fillna(_value).astype(_type)
+                    _df[name] = _df[name].fillna(_value) #.astype(_type)
                 columns.append(name)
-            writer.write(_df,schema=_schema,table=args['from'])
+            print ()
+            print (_df)
+            writer.write(_df.astype(object),schema=_schema,table=args['from'])
         else:
             writer.write(_df,table=args['from'])
 
@@ -350,7 +358,7 @@ class Components :
         for _item in schema :
             dtype = str
             name = _item['name']
-            novalue = -1
+            novalue = 0
             if _item['type'] in ['INTEGER','NUMERIC']:
                 dtype = np.int64
 
@@ -550,7 +558,7 @@ if __name__ == '__main__' :
     index = f[0] if f else 0
     #
 
-    print ("..::: ",PIPELINE[index]['context'])
+    print ("..::: ",PIPELINE[index]['context'],':::..')
     args = (PIPELINE[index])
     for key in _config :
         if key == 'pipeline' or key in args:
@@ -567,6 +575,7 @@ if __name__ == '__main__' :
             args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size'])
         if 'dataset' not in args :
             args['dataset'] = 'combined20191004v2_deid'
+        args['logs'] = args['logs'] if 'logs' in args else 'logs'
        PART_SIZE = int(args['part_size']) if 'part_size' in args else 8
        #
        # @TODO:
@@ -599,6 +608,7 @@ if __name__ == '__main__' :
                 jobs.append(job)
             pass
         else:
+            generator = Components()
             generator.generate(args)
 
     elif 'shuffle' in SYS_ARGS :
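
Note on the data/gan.py hunks: they apply the standard TensorFlow 1.x-to-2.x
migration pattern. tf.contrib was removed in TF 2.x (the keras L2 regularizer
is aliased in as a stand-in for tf.contrib.layers.l2_regularizer),
tf.nn.moments renamed keep_dims to keepdims, TensorShape entries are plain
ints rather than Dimension objects, and TF1-style graph/session code only
runs once eager execution is disabled. Below is a minimal sketch of that
pattern; the placeholder shape, the axes passed to tf.nn.moments, and the
session usage are illustrative assumptions, not code from this repository.

import tensorflow as tf
# tf.contrib is gone in TF 2.x; the keras L2 regularizer is the stand-in the
# patch aliases in (both are callables: penalty = reg(weights)).
from tensorflow.keras.regularizers import L2 as l2_regularizer

# TF 1.x graph/session code keeps running under tf.compat.v1 once eager
# execution is disabled, which data/gan.py now does at import time.
tf.compat.v1.disable_eager_execution()

inputs = tf.compat.v1.placeholder(tf.float32, shape=[None, 8])

# keep_dims=True (TF 1.x) was renamed keepdims=True (TF 2.x).
mean, var = tf.nn.moments(inputs, [0], keepdims=True)

# TensorShape indexing returns a plain int (or None) in TF 2.x, so the
# old inputs.shape[1].value becomes simply:
dim = inputs.shape[1]       # 8

reg = l2_regularizer(0.01)  # L2 coefficient, analogous to contrib's scale arg

with tf.compat.v1.Session() as sess:
    m, v = sess.run([mean, var], feed_dict={inputs: [[0.0] * 8, [1.0] * 8]})

The same compat.v1 session usage is what the Predict hunk at line 595 of
data/gan.py relies on when restoring the saved model.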