From 936bd3ee0be7e01352e364a5dd91337e09cc797c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 16 Sep 2022 19:10:49 -0500 Subject: [PATCH] bug fix with model saving, and pre/post processing --- data/gan.py | 6 +- data/maker/__init__.py | 2 +- data/maker/apply.py | 76 +++++++++++++++++++++++ data/maker/state/__init__.py | 105 +++++++++++++++++++++++++++++++ data/maker/state/default.py | 116 +++++++++++++++++++++++++++++++++++ 5 files changed, 301 insertions(+), 4 deletions(-) create mode 100644 data/maker/apply.py create mode 100644 data/maker/state/__init__.py create mode 100644 data/maker/state/default.py diff --git a/data/gan.py b/data/gan.py index 3727edb..f864dbf 100644 --- a/data/gan.py +++ b/data/gan.py @@ -469,7 +469,7 @@ class Train (GNet): else : dataset = tf.data.Dataset.from_tensor_slices(features_placeholder) # labels_placeholder = None - dataset = dataset.repeat(20000) + dataset = dataset.repeat(80000) dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU) dataset = dataset.prefetch(1) @@ -564,12 +564,12 @@ class Train (GNet): # if epoch % self.MAX_EPOCHS == 0: # if epoch in [5,10,20,50,75, self.MAX_EPOCHS] : - if epoch in self.CHECKPOINTS or int(epoch) == 1: + if epoch in self.CHECKPOINTS : # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic'] suffix = self.CONTEXT #self.get.suffix() _name = os.sep.join([self.train_dir,str(epoch),suffix]) # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch) - saver.save(sess, _name, write_meta_graph=False, global_step=epoch) + saver.save(sess, _name, write_meta_graph=False, global_step=np.int64(epoch)) # # diff --git a/data/maker/__init__.py b/data/maker/__init__.py index dea44eb..21b3017 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -265,7 +265,7 @@ class Trainer(Learner): # # Let us find the smallest, the item is sorted by loss ... 
-            _args['epochs'] = gTrain.logs['epochs'][0]['epochs']
+            _args['network_args']['max_epochs'] = gTrain.logs['epochs'][0]['epochs']
             g = Generator(**_args)
             # g.run()
 
diff --git a/data/maker/apply.py b/data/maker/apply.py
new file mode 100644
index 0000000..bb6a085
--- /dev/null
+++ b/data/maker/apply.py
@@ -0,0 +1,76 @@
+"""
+This file is designed to specify the application of pre/post-processing code.
+    The pre-processing code gets applied after the data has been loaded
+    The post-processing code gets applied after the data has been generated for instance:
+        -approximation code/logic; date shifting; suppression; adding noise
+        -
+"""
+import calendar
+import numpy as np
+from datetime import datetime, timedelta
+import time
+
+class Phase:
+    def __init__(self,**_args):
+        self._df = _args['data']
+        self.callback = _args['callback']
+    def apply(self,**_args):
+        """
+        :param data data-frame
+        :param _info arguments needed to be applied
+        :param callback callback function once done
+        """
+        raise Exception ("Function needs to be Implemented")
+class Pre(Phase):
+    pass
+class Post(Phase):
+    def __init__(self,**_args):
+        super().__init__(**_args)
+        pass
+
+class Date(Post):
+    def __init__(self,**_args):
+        super().__init__(**_args)
+    def make(self,**_args):
+        """
+        This function generates a random date given a year and optionally a set of days from the randomly generated date
+        :param year initial value of a year
+        :param offset list of days between initial date
+        :param format optional strftime format of the output (default %Y-%m-%d)
+        :return None if the year is missing, a formatted date, or a list of formatted dates when offset is provided
+        """
+        # NaN is the only value that differs from itself, this catches NaN objects other than np.nan
+        if _args['year'] in ['',None] or _args['year'] != _args['year'] :
+            return None
+        year = int(_args['year'])
+
+        offset = _args['offset'] if 'offset' in _args else 0
+        month = np.random.randint(1,13)
+        # monthrange returns (weekday, number of days) and applies the full leap-year rule
+        _end = calendar.monthrange(year,month)[1]
+        # randint excludes its upper bound, +1 makes the last day of the month reachable
+        day = np.random.randint(1,_end + 1)
+
+        #-- synthetic date
+        _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0)
+        FORMAT = '%Y-%m-%d' if 'format' not in _args else _args['format']
+
+        r = []
+        if offset :
+            r = [_date.strftime(FORMAT)]
+            for _delta in offset :
+                _date = _date + timedelta(_delta)
+                # strftime formats the instance; strptime is a parser and raised a TypeError here
+                r.append(_date.strftime(FORMAT))
+            return r
+        else:
+            return _date.strftime(FORMAT)
+
+    def apply(self,**_args):
+        """Placeholder, date post-processing is not implemented yet"""
+        pass
+class Approximate(Post):
+    def apply(self,**_args):
+        pass
+    def applyWithRange(self,**_args):
+        pass
diff --git a/data/maker/state/__init__.py b/data/maker/state/__init__.py
new file mode 100644
index 0000000..adf9837
--- /dev/null
+++ b/data/maker/state/__init__.py
@@ -0,0 +1,105 @@
+"""
+This file handles state-space of the data training/generation process i.e Upon specification of the pre/post conditions,
+The specifications for this are as follows, this is the object handed to State.instance
+    {
+        "pre":{"pipeline":[{"approximate":{"field":"int"}},{"newdate":{"field":{"format":"%Y-%m-%d"}}}]},
+        "post":{"pipeline":[{"limit":10}]}
+    }
+A phase or a pipeline entry can have a "path" i.e a python file (if it exists) where the functions are looked up instead of the defaults
+"""
+import importlib
+import importlib.util
+import sys
+from datetime import datetime
+from data.maker.state.default import *
+import os
+
+
+class State :
+    """
+    Builds (instance) and runs (apply) the pre/post processing pipelines of a configuration
+    """
+    @staticmethod
+    def apply(_data,lpointers):
+        """
+        This function applies a pipeline against a given data-frame, the calling code must decide whether it is a pre/post
+        :_data  data-frame
+        :lpointers  list of entries returned by instance i.e {module,args,...}, None entries are skipped
+        :return the data-frame returned by the last function of the pipeline
+        """
+        for _item in lpointers :
+            if _item is None :
+                continue
+
+            pointer = _item['module']
+            _args = _item['args']
+
+            _data = pointer(_data,_args)
+        return _data
+
+    @staticmethod
+    def instance(_args):
+        """
+        This function resolves the functions of every pipeline found in the configuration
+        :_args   {phase:{path,pipeline:[{function:args}]}} where path is optional
+        :return  {phase:[{module,args,name,path}]} an entry is None when its function could not be found
+        """
+        out = {}
+        for key in _args :
+            #
+            # The path of the phase is the default, an item that has its own path keeps it
+            path = _args[key]['path'] if 'path' in _args[key] else ''
+            out[key] = [ State._build(dict(_item,**{'path':path})) if 'path' not in _item else State._build(_item) for _item in _args[key]['pipeline']]
+
+        return out
+
+    @staticmethod
+    def _extract(_entry):
+        """
+        This function splits a pipeline entry {function:args,path} into its parts
+        :_entry  pipeline entry, the key that is not path is taken as the name of the function
+        :return  {module,args,name,path} path is empty if it does not exist on disk
+        """
+        _name = list(set(_entry.keys()) - set(['path']) )
+        _name = _name[0]
+        path = _entry['path'] if 'path' in _entry and os.path.exists(_entry['path']) else ''
+        return {"module": _name,"args": _entry[_name],'name':_name,'path':path}
+
+    @staticmethod
+    def _build(_args):
+        """
+        This function returns the entry with its function resolved, or None if the function was not found
+        :_args  pipeline entry {function:args,path}
+        """
+        _info = State._extract(_args)
+        _info['module'] = State._instance(_info)
+        return _info if _info['module'] is not None else None
+
+    @staticmethod
+    def _instance(_args):
+        """
+        This function returns the function to call or None if the module does not define it
+        :path   optional path of the file on disk
+        :module name of the function
+        """
+        _name = _args['module']
+
+        if 'path' in _args and os.path.exists(_args['path']):
+            path= _args['path']
+
+            spec = importlib.util.spec_from_file_location(_name, path)
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+        else:
+            #
+            # Probably calling a built-in function (defined in data.maker.state.default)
+
+            module = sys.modules['data.maker.state.default']
+
+        return getattr(module,_name) if hasattr(module,_name) else None
+
+#
+# Adding a few custom functions that should be able to help ....
+# These functions can be called without specifying a path
+#
+
diff --git a/data/maker/state/default.py b/data/maker/state/default.py
new file mode 100644
index 0000000..75c2c4b
--- /dev/null
+++ b/data/maker/state/default.py
@@ -0,0 +1,116 @@
+"""
+This file contains default functions applied to a data-frame/dataset as pre/post processing jobs.
+The functions are organized in a pipeline i.e the data will be applied to each function
+
+Custom functions :
+    functions must take 2 arguments (_data,_args) : where _data is a data frame and _args is an object describing the input parameters
+"""
+import calendar
+import pandas as pd
+import numpy as np
+from datetime import datetime, timedelta
+
+
+def limit(_data,size):
+    """
+    This function returns the first rows of a data-frame
+    :_data  data-frame
+    :size   number of rows to keep i.e ...,{limit:size}
+    """
+    return _data.iloc[:size]
+
+def format(_data,_schema):
+    """
+    This function enforces a schema against a data-frame, this may or may not work depending on the persistence storage
+    :_data data-frame containing all data
+    :_schema schema to enforce the data, we are expecting the format as a list of {name,type,description}
+    NOTE: the schema is not enforced yet, the data is returned as is
+    """
+    return _data
+
+def approximate(_data,_args):
+    """
+    :_args Object of {field:type}
+    This function will approximate n-fields in the data given its distribution
+    """
+    _m = {'int':int,'float':float,'integer':int,'double':float}
+    columns = list(_args.keys())
+    for _name in columns :
+        if _name not in _data :
+            continue
+        otype = _args[_name]
+        otype = str if otype not in _m else _m[otype]
+        # NOTE(review): uniform(low) draws between each value and the default high=1.0, confirm this is intended
+        _data.loc[:,_name] = np.random.uniform(_data[_name].values).astype(otype)
+
+    return _data
+
+def split_date(_data,_args):
+    """
+    This function takes a field and applies the format from other fields
+    :_data data-frame
+    :_args configuration entry {column:{format:{in,out},column,type}} where type is optional
+    """
+    _columns = list(_args.keys())
+    for _name in _columns :
+        _iname = _args[_name]['column']
+        _iformat = _args[_name]['format']['in']
+        _oformat = _args[_name]['format']['out']
+        _otype = str if 'type' not in _args[_name] else _args[_name]['type']
+        _data.loc[:,_name] = _data[_iname].apply(lambda _date: datetime.strftime(datetime.strptime(str(_date),_iformat),_oformat)).astype(_otype)
+    return _data
+
+def newdate(_data,_args):
+    """
+    This function creates a new date on a given column from another
+    :_data data frame
+    :_args configuration column:{format,column} where column (optional) is the field that holds the years
+    """
+    _columns = list(_args.keys())
+    # the number of rows comes from the frame, the target column may not exist yet
+    ROW_COUNT = _data.shape[0]
+    for _name in _columns :
+
+        _format = _args[_name]['format']
+        if 'column' in _args[_name] :
+            srcName = _args[_name]['column']
+            years = _data[srcName].values
+        else:
+            years = np.random.choice(np.arange(datetime.now().year- 90,datetime.now().year),ROW_COUNT)
+        _data.loc[:,_name] = [ _makedate(year = _year,format = _format) for _year in years]
+
+    return _data
+def _makedate(**_args):
+    """
+    This function creates a random date within a given year
+    :year   year of the date
+    :format optional strftime format, default %Y-%m-%d
+    :offset optional list of days, each one is added to the previous date
+    :return a formatted date or, if offset is provided, the list of formatted dates (initial date first)
+    """
+    year = int(_args['year'])
+    offset = _args['offset'] if 'offset' in _args else 0
+    month = np.random.randint(1,13)
+    # monthrange returns (weekday, number of days) and applies the full leap-year rule
+    _end = calendar.monthrange(year,month)[1]
+    # randint excludes its upper bound, +1 makes the last day of the month reachable
+    day = np.random.randint(1,_end + 1)
+
+    #-- synthetic date
+    _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0)
+    FORMAT = '%Y-%m-%d'
+
+    if 'format' in _args:
+        FORMAT = _args['format']
+
+    r = []
+    if offset :
+        r = [_date.strftime(FORMAT)]
+        for _delta in offset :
+            _date = _date + timedelta(_delta)
+            # strftime formats the instance; strptime is a parser and raised a TypeError here
+            r.append(_date.strftime(FORMAT))
+        return r
+    else:
+        return _date.strftime(FORMAT)
+