parent
							
								
									0efd4b13bc
								
							
						
					
					
						commit
						936bd3ee0b
					
				@ -0,0 +1,76 @@
 | 
				
			|||||||
 | 
					"""
 | 
				
			||||||
 | 
					This file is designed to specify the application of pre/post-processing code. 
 | 
				
			||||||
 | 
					    The pre-processing code gets applied after the data has been loaded
 | 
				
			||||||
 | 
					    The post-processing code gets applied after the data has been generated, for instance:
 | 
				
			||||||
 | 
					        -approximation code/logic; date shifting; suppression; adding noise
 | 
				
			||||||
 | 
					        - 
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					import numpy as np
 | 
				
			||||||
 | 
					from datetime import datetime, timedelta
 | 
				
			||||||
 | 
					import time
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Phase:
					    """
					    Base class of a processing phase (pre or post) that is attached to a data-frame.
					    Sub-classes are expected to override apply, the version defined here always raises.
					    """
					    def __init__(self,**_args):
					        """
					            :param  data        data-frame the phase works on (required, KeyError if missing)
					            :param  callback    callback function once done (required, KeyError if missing)
					        """
					        self._df = _args['data']
					        self.callback = _args['callback']
					    def apply(self,**_args):
					        """
					            Placeholder to be overridden, keyword arguments sub-classes are expected to accept:
					            :param  data        data-frame
					            :param  _info       arguments needed to be applied
					            :param  callback    callback function once done
					            :raises NotImplementedError     always, the base class has no behavior of its own
					        """
					        #
					        # NotImplementedError is the conventional signal for an abstract method and
					        # it remains an Exception, so callers catching Exception keep working
					        raise NotImplementedError ("Function needs to be Implemented")
 | 
				
			||||||
 | 
					class Pre(Phase):
					    """
					    Marker class for pre-processing phases, it adds nothing to Phase
					    """
 | 
				
			||||||
 | 
					class Post(Phase):
					    """
					    Marker class for post-processing phases, construction is handed over to the parent class as is
					    """
					    def __init__(self,**_args):
					        """
					            :param  _args   keyword arguments forwarded untouched to the parent constructor
					        """
					        super(Post,self).__init__(**_args)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Date(Post):
					    """
					    Post-processing phase able to generate synthetic (random) dates
					    """
					    def __init__(self,**_args):
					        super().__init__(**_args)
					    def make(self,**_args):
					        """
					        This function generates a random date given a year and optionally a set of days from the randomly generated date
					        :param year     initial value of a year (anything int() accepts); '', None or NaN yield None
					        :param offset   optional list of days, each one is added to the previously generated date (cumulative)
					        :param format   optional strftime format, defaults to %Y-%m-%d
					        :return         None if the year is missing, a formatted date if there is no offset,
					                        otherwise a list with the initial date followed by one date per offset
					        """
					        import calendar     #-- kept local, only this function needs it
					        year = _args['year']
					        #
					        # NaN never compares equal to itself, a membership test against np.nan would only
					        # catch the very same object and let any other NaN reach int()
					        if year is None or year == '' or year != year :
					            return None
					        year = int(year)
					        offset = _args['offset'] if 'offset' in _args else 0
					        month   = np.random.randint(1,13)
					        #
					        # monthrange applies the complete leap-year rule (century years included) and
					        # the upper bound of randint is exclusive, hence the +1 to reach the last day
					        _end = calendar.monthrange(year,month)[1]
					        day = np.random.randint(1,_end + 1)
					        #-- synthetic date
					        _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0)
					        FORMAT =  '%Y-%m-%d' if 'format' not in _args else _args['format']
					        if not offset :
					            return _date.strftime(FORMAT)
					        r = [_date.strftime(FORMAT)]
					        for _delta in offset :
					            _date = _date + timedelta(_delta)
					            #-- strftime formats the date (strptime parses text and failed here)
					            r.append(_date.strftime(FORMAT))
					        return r
					    def apply(self,**_args):
					        """
					        Not implemented yet, this function does nothing and returns None
					        """
					        pass
 | 
				
			||||||
 | 
					class Approximate(Post):
 | 
				
			||||||
 | 
					    def apply(**_args):
 | 
				
			||||||
 | 
					        pass
 | 
				
			||||||
 | 
					    def applyWithRange(**_args):
 | 
				
			||||||
@ -0,0 +1,105 @@
 | 
				
			|||||||
 | 
					"""
 | 
				
			||||||
 | 
					This file handles state-space of the data training/generation process i.e Upon specification of the pre/post conditions
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					This file handles state-space of the data training/generation process i.e Upon specification of the pre/post conditions,
 | 
				
			||||||
 | 
					The specifications for this are as follows (within an entry of the configuration)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					        "state":{
 | 
				
			||||||
 | 
					            "pre":[{"approximate":{"field":"int"}},{"newdate":{"field":"format"}}],"post":[{"limit":10}]
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					import importlib
 | 
				
			||||||
 | 
					import importlib.util
 | 
				
			||||||
 | 
					import sys
 | 
				
			||||||
 | 
					from datetime import datetime
 | 
				
			||||||
 | 
					from data.maker.state.default import *
 | 
				
			||||||
 | 
					import os
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class State :
					    """
					    Collection of static helpers that turn a configuration into pipelines of functions
					    and run such a pipeline against a dataset
					    """
					    @staticmethod
					    def apply(_data,lpointers):
					        """
					        Runs every step of a pipeline, the output of a step is the input of the next one.
					        The calling code decides whether the pipeline is a pre or a post one
					        :_data      data-frame
					        :lpointers  list of steps as built by instance i.e {module:function,args:...}, None entries are skipped
					        :return     whatever the last step returned (or _data untouched if nothing ran)
					        """
					        for _step in lpointers :
					            if _step is not None :
					                _function = _step['module']
					                _data = _function(_data,_step['args'])
					        return _data
					    @staticmethod
					    def instance(_args):
					        """
					        Builds the pipelines found in a configuration
					        :_args      {key:{path:...,pipeline:[{name:args},...]}}, path is optional
					        :return     {key:[step,...]} with one step per pipeline entry (None if the function was not found)
					        """
					        out  = {}
					        for key in _args :
					            _section = _args[key]
					            #
					            # The path of the section is only handed to the entries that do not have their own
					            path  = _section['path'] if 'path' in _section else ''
					            _steps = []
					            for _item in _section['pipeline'] :
					                _entry = _item if 'path' in _item else dict(_item,**{'path':path})
					                _steps.append(State._build(_entry))
					            out[key] = _steps
					        return out
					    @staticmethod
					    def _extract(_entry):
					        """
					        Normalizes a pipeline entry
					        :_entry     {name:args} with an optional path
					        :return     {module,args,name,path} where module and name are the key that is not path and
					                    path is blank unless it was provided and exists on disk
					        """
					        _candidates = list(set(_entry.keys()) - set(['path']) )
					        _name = _candidates[0]
					        if 'path' in _entry and os.path.exists(_entry['path']) :
					            path = _entry['path']
					        else:
					            path = ''
					        return {"module": _name,"args": _entry[_name],'name':_name,'path':path}
					    @staticmethod
					    def _build(_args):
					        """
					        Turns a pipeline entry into a step that can be executed
					        :_args      pipeline entry (see _extract)
					        :return     the normalized entry with module replaced by the function, None if it was not found
					        """
					        _info = State._extract(_args)
					        _pointer = State._instance(_info)
					        _info['module'] = _pointer
					        if _pointer is None :
					            return None
					        return _info
					    @staticmethod
					    def _instance(_args):
					        """
					        Looks up a function by name
					            :path   optional path of the file on disk
					            :module   name of the function
					        :return the attribute with that name or None if the module does not have it
					        """
					        _name = _args['module']
					        _onDisk = 'path' in _args and os.path.exists(_args['path'])
					        if not _onDisk :
					            #
					            # No usable file, falling back on the already imported module of default functions
					            module = sys.modules['data.maker.state.default'] 
					        else:
					            #
					            # The file is loaded as a module named after the function
					            spec = importlib.util.spec_from_file_location(_name, _args['path'])
					            module = importlib.util.module_from_spec(spec)
					            spec.loader.exec_module(module)
					        return getattr(module,_name,None)
 | 
				
			||||||
 | 
					 
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# Adding a few custom functions that should be able to help ....
 | 
				
			||||||
 | 
					# These functions can be called without specifying a path
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -0,0 +1,116 @@
 | 
				
			|||||||
 | 
					"""
 | 
				
			||||||
 | 
					This file contains default functions applied to a data-frame/dataset as pre/post processing jobs. 
 | 
				
			||||||
 | 
					The functions are organized in a pipeline i.e the data will be applied to each function 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Custom functions :
 | 
				
			||||||
 | 
					    functions must take 2 arguments (_data,_args) : where _data is a data frame and _args is an object describing the input parameters
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					import pandas as pd
 | 
				
			||||||
 | 
					import numpy as np
 | 
				
			||||||
 | 
					from datetime import datetime, timedelta
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def limit(_data,size):
					    """
					    Keeps the leading rows of a data-frame, declared in a pipeline as ...,{limit:size}
					    :_data  data-frame
					    :size   stop of the positional slice i.e number of leading rows to keep
					    """
					    _rows = slice(None,size)
					    return _data.iloc[_rows]
 | 
				
			||||||
 | 
					def format(_data,_schema):
					    """
					    Placeholder for enforcing a schema against a data-frame, whether enforcing it would work
					    may depend on the persistence storage. For now the data is handed back as is
					    :_data      data-frame containing all data
					    :_schema    schema to enforce, expected as a list of {name,type,description} (currently unused)
					    """
					    #
					    # Nothing is enforced at this point, the input is the output
					    return _data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def approximate(_data,_args):
					    """
					    Overwrites n-fields of the data with random values
					    :_data  data-frame (modified in place and returned)
					    :_args  Object of {field:type}, type is one of int,integer,float,double anything else means str
					    Fields that are not in the data are skipped. Every value is replaced by a draw of
					    np.random.uniform with the value as lower bound (the upper bound is numpy's default 1.0)
					    and the result is cast to the requested type
					    """
					    _casts = {'int':int,'float':float,'integer':int,'double':float}
					    for _field in list(_args.keys()) :
					        if _field in _data :
					            _label = _args[_field]
					            _cast = _casts[_label] if _label in _casts else str
					            _draws = np.random.uniform(_data[_field].values)
					            _data.loc[:,_field] = _draws.astype(_cast)
					    return _data
 | 
				
			||||||
 | 
					def split_date(_data,_args):
					    """
					    Writes columns that hold the dates of other columns in a different format
					    :_data  data-frame (modified in place and returned)
					    :_args  configuration entry {column:{column,format:{in,out},type}} where the key is the column
					            written, column the one read, in/out the formats to parse/write and type (optional, str
					            by default) the type handed to astype
					    """
					    for _target in list(_args.keys()) :
					        _spec = _args[_target]
					        _source = _spec['column']
					        _iformat = _spec['format']['in']
					        _oformat = _spec['format']['out']
					        _otype = _spec['type'] if 'type' in _spec else str
					        def _reformat(_date,_in=_iformat,_out=_oformat):
					            #-- the value is read as text with the input format and written with the output format
					            return datetime.strptime(str(_date),_in).strftime(_out)
					        _data.loc[:,_target] = _data[_source].apply(_reformat).astype(_otype)
					    return _data
 | 
				
			||||||
 | 
					def newdate(_data,_args):
					    """
					    This function creates a new date on a given column, the year is either read from another column or random
					    :_data  data frame (modified in place and returned)
					    :_args  configuration column:{format,column} where column (optional) is the field holding the years,
					            without it the years are drawn from the 90 years preceding the current one.
					            The short form column:format (a string) is accepted too
					    """
					    #
					    # The number of rows is taken from the frame itself so the column being written
					    # does not have to exist beforehand
					    ROW_COUNT = len(_data.index)
					    for _name in list(_args.keys()) :
					        _spec = _args[_name]
					        if isinstance(_spec,str) :
					            #-- short form, the entry is the format itself
					            _spec = {'format':_spec}
					        #-- named _format, format would shadow the function of this file (and the builtin)
					        _format = _spec['format']
					        if 'column' in _spec :
					            years = _data[_spec['column']].values
					        else:
					            _current = datetime.now().year
					            years = np.random.choice(np.arange(_current - 90,_current),ROW_COUNT)
					        _data.loc[:,_name] = [ _makedate(year = _year,format = _format) for _year in years]
					    return _data
 | 
				
			||||||
 | 
					def _makedate(**_args):
					    """
					    This function creates a random date within a given year
					    :year       year of the date (required, anything int() accepts)
					    :offset     optional list of days, each one is added to the previously generated date (cumulative)
					    :format     optional strftime format, defaults to %Y-%m-%d
					    :return     a formatted date if there is no offset, otherwise a list with the initial date
					                followed by one date per offset
					    """
					    import calendar     #-- kept local, only this function needs it
					    year = int(_args['year'])
					    offset = _args['offset'] if 'offset' in _args else 0
					    month   = np.random.randint(1,13)
					    #
					    # monthrange applies the complete leap-year rule (century years included) and
					    # the upper bound of randint is exclusive, hence the +1 to reach the last day
					    _end = calendar.monthrange(year,month)[1]
					    day = np.random.randint(1,_end + 1)
					    #-- synthetic date
					    _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0)
					    FORMAT = _args['format'] if 'format' in _args else '%Y-%m-%d'
					    if not offset :
					        return _date.strftime(FORMAT)
					    r = [_date.strftime(FORMAT)]
					    for _delta in offset :
					        _date = _date + timedelta(_delta)
					        #-- strftime formats the date (strptime parses text and failed here)
					        r.append(_date.strftime(FORMAT))
					    return r
 | 
				
			||||||
 | 
					
 | 
				
			||||||
					Loading…
					
					
				
		Reference in new issue