From 936bd3ee0be7e01352e364a5dd91337e09cc797c Mon Sep 17 00:00:00 2001
From: Steve Nyemba <nyemba@gmail.com>
Date: Fri, 16 Sep 2022 19:10:49 -0500
Subject: [PATCH] bug fix with model saving, and pre/post processing

---
 data/gan.py                  |   6 +-
 data/maker/__init__.py       |   2 +-
 data/maker/apply.py          |  76 +++++++++++++++++++++++
 data/maker/state/__init__.py | 105 +++++++++++++++++++++++++++++++
 data/maker/state/default.py  | 116 +++++++++++++++++++++++++++++++++++
 5 files changed, 301 insertions(+), 4 deletions(-)
 create mode 100644 data/maker/apply.py
 create mode 100644 data/maker/state/__init__.py
 create mode 100644 data/maker/state/default.py

diff --git a/data/gan.py b/data/gan.py
index 3727edb..f864dbf 100644
--- a/data/gan.py
+++ b/data/gan.py
@@ -469,7 +469,7 @@ class Train (GNet):
                 else :
                         dataset = tf.data.Dataset.from_tensor_slices(features_placeholder)
                 # labels_placeholder = None
-                dataset = dataset.repeat(20000)
+                dataset = dataset.repeat(80000)
                 
                 dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU)
                 dataset = dataset.prefetch(1)
@@ -564,12 +564,12 @@ class Train (GNet):
 
                                         # if epoch % self.MAX_EPOCHS == 0:
                                         # if epoch in [5,10,20,50,75, self.MAX_EPOCHS] :
-                                        if epoch in self.CHECKPOINTS  or int(epoch) == 1:
+                                        if epoch in self.CHECKPOINTS  :
                                                 # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
                                                 suffix = self.CONTEXT #self.get.suffix()
                                                 _name  = os.sep.join([self.train_dir,str(epoch),suffix])
                                                 # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch)
-                                                saver.save(sess, _name, write_meta_graph=False, global_step=epoch)
+                                                saver.save(sess, _name, write_meta_graph=False, global_step=np.int64(epoch))
                                                 
                                                 #
                                                 #
diff --git a/data/maker/__init__.py b/data/maker/__init__.py
index dea44eb..21b3017 100644
--- a/data/maker/__init__.py
+++ b/data/maker/__init__.py
@@ -265,7 +265,7 @@ class Trainer(Learner):
 
         #
         # Let us find the smallest, the item is sorted by loss ...
-        _args['epochs'] = gTrain.logs['epochs'][0]['epochs']
+        _args['network_args']['max_epochs'] = gTrain.logs['epochs'][0]['epochs']
         g = Generator(**_args)
         # g.run() 
         
diff --git a/data/maker/apply.py b/data/maker/apply.py
new file mode 100644
index 0000000..bb6a085
--- /dev/null
+++ b/data/maker/apply.py
@@ -0,0 +1,76 @@
+"""
+This file is designed to specify the application of pre/post-processing code. 
+    The pre-processing code gets applied after the data has been loaded
+    The post-processing code gets applied after the data has been generated for instance:
+        -approximation code/logic; date shifting; suppression; adding noise
+        - 
+"""
+import numpy as np
+from datetime import datetime, timedelta
+import time
+
+class Phase:
+    """Base class of a processing phase: holds the data-frame and the completion callback."""
+    def __init__(self,**_args):
+        # both keys are required, a missing one raises KeyError
+        self._df = _args['data']
+        self.callback = _args['callback']
+    def apply(self,**_args):
+        """
+        Run the phase, must be overridden by sub-classes (:param _info arguments needed to be applied)
+        """
+        raise NotImplementedError ("Function needs to be Implemented")
+class Pre(Phase):
+    pass  # marker base class for pre-processing phases, nothing is specialized yet
+class Post(Phase):
+    """Base class of the post-processing phases, construction is delegated to Phase."""
+    def __init__(self,**_args):
+        super(Post,self).__init__(**_args)
+
+class Date(Post):
+    """Post-processing phase that synthesizes random calendar dates from a year."""
+    def __init__(self,**_args):
+        super().__init__(**_args)
+    def make(self,**_args):
+        """
+        This function generates a random date given a year and optionally a set of days from the randomly generated date
+        :param year     initial value of a year, a missing year ('',None,NaN) yields None
+        :param offset   optional list of day deltas, each one is added to the previously generated date
+        :param format   optional strftime format of the output (default %Y-%m-%d)
+        :return         a formatted date, a list of formatted dates if an offset is provided or None
+        """
+        import calendar
+        _year = _args['year']
+        if _year is None or (isinstance(_year,str) and _year == '') or (isinstance(_year,float) and np.isnan(_year)) :
+            return None
+        year = int(_year)
+        offset = _args['offset'] if 'offset' in _args else 0
+        month   = int(np.random.randint(1,13))
+        #
+        # monthrange provides the number of days of the month with the complete leap-year rule (year % 4 alone is wrong for 1900)
+        _end = calendar.monthrange(year,month)[1]
+        #
+        # The upper bound of numpy's randint is exclusive, _end + 1 makes the last day of the month reachable
+        day = int(np.random.randint(1,_end + 1))
+
+        #-- synthetic date
+        _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0)
+        FORMAT =  '%Y-%m-%d' if 'format' not in _args else _args['format']
+        if not offset :
+            return _date.strftime(FORMAT)
+        r = [_date.strftime(FORMAT)]
+        for _delta in offset :
+            _date = _date + timedelta(_delta)
+            # strftime formats the shifted date, strptime (parsing) was called by mistake and raised a TypeError
+            r.append(_date.strftime(FORMAT))
+        return r
+    def apply(self,**_args):
+        """
+        Placeholder: no date post-processing is implemented yet, the function returns None
+        """
+        pass
+class Approximate(Post):
+    """Post-processing phase for value approximation, both methods are placeholders that return None."""
+    def apply(self,**_args): pass
+    def applyWithRange(self,**_args): pass
diff --git a/data/maker/state/__init__.py b/data/maker/state/__init__.py
new file mode 100644
index 0000000..adf9837
--- /dev/null
+++ b/data/maker/state/__init__.py
@@ -0,0 +1,105 @@
+"""
+This file handles the state-space of the data training/generation process i.e. upon specification of the pre/post conditions
+"""
+"""
+This file handles state-space of the data training/generation process i.e Upon specification of the pre/post conditions,
+The specifications for this are as follows (within an entry of the configuration)
+    {
+        "state":{
+            "pre":{"pipeline":[{"approximate":{"field":"int"}},{"newdate":{"field":{"format":"%Y-%m-%d"}}}]},"post":{"pipeline":[{"limit":10}]}
+        }
+    }
+"""
+import importlib
+import importlib.util
+import sys
+from datetime import datetime
+from data.maker.state.default import *
+import os
+
+
+class State :
+    """
+    Builds and runs the pre/post processing pipelines declared in the configuration, every step is
+    a function f(_data,_args) found in a custom file (path) or in data.maker.state.default
+    """
+    @staticmethod
+    def apply(_data,lpointers):
+        """
+        This function applies a pipeline against a given data-frame, the calling code must decide whether it is a pre/post
+        :_data      data-frame
+        :lpointers  list of steps returned by State.instance i.e {module,args,...}, None entries are skipped
+        :return     the data returned by the last step of the pipeline
+        """
+        for _item in lpointers :
+            if _item is None :
+                #
+                # _build yields None for a function that could not be resolved
+                continue
+            pointer = _item['module']
+            _data = pointer(_data,_item['args'])
+        return _data
+    @staticmethod
+    def instance(_args):
+        """
+        This function resolves the functions of every pipeline found in the configuration
+        :_args  {"pre":{"path":...,"pipeline":[{name:args},...]},"post":{...}}, path is optional
+        :return {key:[step,...]} with the same keys, steps are in the order of the pipeline
+        """
+        out  = {}
+        for key in _args :
+            #
+            # The path of the pipeline is the default of its steps, a step with its own path keeps it
+            path  = _args[key]['path'] if 'path' in _args[key] else ''
+            out[key] = []
+            for _item in _args[key]['pipeline'] :
+                _step = _item if 'path' in _item else dict(_item,**{'path':path})
+                out[key].append(State._build(_step))
+        return out
+    @staticmethod
+    def _extract(_entry):
+        """
+        This function normalizes a step {name:args,path} into {module,args,name,path}
+        :_entry step of a pipeline, the first key that is not 'path' is the name of the function
+        """
+        #
+        # A list (not a set) keeps the order of the keys, the name picked is thus deterministic
+        _name = [_key for _key in _entry if _key != 'path'][0]
+        path = _entry['path'] if 'path' in _entry and os.path.exists(_entry['path']) else ''
+        return {"module": _name,"args": _entry[_name],'name':_name,'path':path}
+    @staticmethod
+    def _build(_args):
+        """
+        This function returns a step with its function pointer (module) or None if the function is not found
+        :_args  step of a pipeline as accepted by _extract
+        """
+        _info = State._extract(_args)
+        _info['module'] = State._instance(_info)
+        return _info if _info['module'] is not None else None
+
+    @staticmethod
+    def _instance(_args):
+        """
+            :path   optional path of the file on disk
+            :module   name of the function
+            :return the function or None if the module doesn't have it
+        """
+        _name = _args['module']
+        if 'path' in _args and os.path.exists(_args['path']):
+            #
+            # Custom function, the file is loaded/executed as a module named after the function
+            path= _args['path']
+            spec = importlib.util.spec_from_file_location(_name, path)
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+        else:
+            #
+            # Probably calling a built-in module (should be in data.maker.state.default)
+            module = sys.modules['data.maker.state.default']
+        return getattr(module,_name) if hasattr(module,_name) else None
+ 
+#
+# Adding a few custom functions that should be able to help ....
+# These functions can be called without specifying a path
+#
+
diff --git a/data/maker/state/default.py b/data/maker/state/default.py
new file mode 100644
index 0000000..75c2c4b
--- /dev/null
+++ b/data/maker/state/default.py
@@ -0,0 +1,116 @@
+"""
+This file contains default functions applied to a data-frame/dataset as pre/post processing jobs. 
+The functions are organized in a pipeline i.e the data will be applied to each function 
+
+Custom functions :
+    functions must take 2 arguments (_data,_args) : where _data is a data frame and _args is an object describing the input parameters
+"""
+import pandas as pd
+import numpy as np
+from datetime import datetime, timedelta
+
+
+def limit(_data,size):
+    """
+    This function returns the first rows of a data-frame, configured as ...,{limit:size}
+    :_data  data-frame
+    :size   maximum number of rows, numeric strings are accepted (None keeps every row)
+    """
+    return _data.iloc[:(size if size is None else int(size))]
+def format(_data,_schema):
+    """
+    This function is meant to enforce a schema against a data-frame, for now it is a placeholder that returns the data as is
+    :_data      data-frame containing all data
+    :_schema    schema to enforce, we are expecting the format as a list of {name,type,description} (currently unused)
+    """   
+    return _data 
+
+def approximate(_data,_args):
+    """
+    This function replaces the values of the given fields with draws of np.random.uniform called with the field values
+    :_data  data-frame, updated in place and returned
+    :_args  Object of {field:type} with type in int,integer,float,double (anything else is cast to str)
+    NOTE(review): the values are passed as the first argument (low) of uniform, high keeps its default -- confirm this is intended
+    """
+    _casts = {'int':int,'integer':int,'float':float,'double':float}
+    for _field,_typename in list(_args.items()) :
+        if _field in _data :
+            #
+            # fields that are not in the data-frame are silently ignored
+            _cast = _casts[_typename] if _typename in _casts else str
+            _data.loc[:,_field] = np.random.uniform(_data[_field].values).astype(_cast)
+    return _data
+def split_date(_data,_args):
+    """
+    This function creates fields by re-formatting the dates of other fields e.g extracting the year
+    :_data  data-frame, updated in place and returned
+    :_args  configuration entry {new_column:{column:source,format:{in,out},type}}, type is optional (default str)
+    """
+    _m = {'int':int,'float':float,'integer':int,'double':float}
+    for _name in list(_args.keys()) :
+        _iname = _args[_name]['column']
+        _iformat = _args[_name]['format']['in']
+        _oformat = _args[_name]['format']['out']
+        _otype = _args[_name]['type'] if 'type' in _args[_name] else str
+        _otype = _m[_otype] if isinstance(_otype,str) and _otype in _m else _otype  #-- aliases resolved, _m was never applied
+        _data.loc[:,_name] = _data[_iname].apply(lambda _date: datetime.strptime(str(_date),_iformat).strftime(_oformat)).astype(_otype)
+    return _data
+def newdate(_data,_args):
+    """
+    This function creates a new date on a given column, the year is read from another column or drawn at random
+    :_data  data frame, updated in place and returned
+    :_args  configuration column:{format,column}, column (optional) is the field that holds the years
+    """
+    # The count is taken from the frame (not the target column) so columns that don't exist yet can be created
+    ROW_COUNT = _data.shape[0]
+    for _name in list(_args.keys()) :
+        _format = _args[_name]['format']    #-- the local name no longer shadows the built-in format
+        if 'column' in _args[_name] :
+            srcName = _args[_name]['column']
+            years = _data[srcName].values
+        else:
+            #
+            # without a source column the years are drawn from the last 90 years (current year excluded)
+            years = np.random.choice(np.arange(datetime.now().year- 90,datetime.now().year),ROW_COUNT)
+        _data.loc[:,_name] = [ _makedate(year = _year,format = _format) for _year in years]
+    return _data
+def _makedate(**_args):
+    """
+    This function creates a random date within a given year, optionally followed by shifted dates
+    :year   year of the date, a missing year ('',None,NaN) yields None
+    :offset optional list of day deltas, each one is added to the previously generated date
+    :format optional strftime format of the output (default %Y-%m-%d)
+    :return a formatted date, a list of formatted dates if an offset is provided or None
+    """
+    import calendar
+    _year = _args['year']
+    #
+    # Missing years can't be converted, None is returned (consistent with Date.make in data/maker/apply.py)
+    if _year is None or (isinstance(_year,str) and _year == '') or (isinstance(_year,float) and np.isnan(_year)) :
+        return None
+    year = int(_year)
+    offset = _args['offset'] if 'offset' in _args else 0
+    month   = int(np.random.randint(1,13))
+    #
+    # monthrange provides the number of days of the month with the complete leap-year rule,
+    # year % 4 alone was wrong for century years like 1900
+    _end = calendar.monthrange(year,month)[1]
+    #
+    # The upper bound of numpy's randint is exclusive, _end + 1 makes the last day of the month reachable
+    day = int(np.random.randint(1,_end + 1))
+
+    #-- synthetic date
+    _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0)
+    FORMAT =  '%Y-%m-%d'
+    if 'format' in _args:
+        FORMAT = _args['format']
+    if not offset :
+        return _date.strftime(FORMAT)
+    r = [_date.strftime(FORMAT)]
+    for _delta in offset :
+        _date = _date + timedelta(_delta)
+        #
+        # strftime formats the shifted date, strptime (parsing) was called by mistake and raised a TypeError
+        r.append(_date.strftime(FORMAT))
+    return r
+