"""
This module wraps the read/write classes of the cloud, sql, nosql and other packages.
The wrappers allow plugins to be applied as pre/post conditions.

NOTE: Plugins are converted to a pipeline, so a pipeline is applied when reading or writing:
    - upon initialization the plugins are loaded
    - on read/write the pipeline is applied (if passed as an argument)
"""
from transport.plugins import PluginLoader
import transport
from transport import providers
from multiprocessing import Process, RLock
import time
import types
from . import registry
from datetime import datetime
import pandas as pd
import os
import sys


class IO:
    """
    Base wrapper class for read/write agents, with support for logging.

    The wrapped agent is stored in ``self._agent``; the optional logger in
    ``self._logger`` only needs to expose a ``write`` method accepting a
    pandas DataFrame (see :meth:`log`).
    """

    def __init__(self, _agent, plugins, _logger=None):
        """
        :param _agent:  reader/writer object being wrapped
        :param plugins: iterable of plugin names/pointers, or a falsy value for none
        :param _logger: optional logger; when not provided the agent's own
                        ``_logger`` attribute is used if it has one
        """
        self._logger = _logger
        if not _logger and hasattr(_agent, '_logger'):
            self._logger = getattr(_agent, '_logger')
        self._agent = _agent
        self._logTable = 'logs'
        if plugins:
            self._init_plugins(plugins)
        else:
            self._plugins = None

    def setLogger(self, _logger):
        """Replace the logger used by :meth:`log`."""
        self._logger = _logger

    def log(self, **_args):
        """
        Write a single log record through the logger, if one is set.

        The record holds the process id, the current date and time plus every
        keyword argument provided; all values are converted to strings before
        being written as a one-row DataFrame.
        """
        if self._logger:
            _date = str(datetime.now())
            _record = {'pid': os.getpid(), 'date': _date[:10], 'time': _date[11:19], **_args}
            _record = {_key: str(_value) for _key, _value in _record.items()}
            self._logger.write(pd.DataFrame([_record]))

    def _init_plugins(self, _items):
        """
        Load the pipelined functions into a plugin loader (``self._plugins``).

        :param _items: iterable of plugin identifiers handed to ``PluginLoader.set``
        """
        registry.plugins.init()
        self._plugins = PluginLoader(registry=registry.plugins)
        # materialize once so that a one-shot iterable is both loaded and logged
        _names = list(_items)
        for _name in _names:
            self._plugins.set(_name)
        # NOTE(review): caller is logged as 'read' even when invoked from a writer
        self.log(action='init-plugins', caller='read', input=_names)
        #
        # @TODO: We should have a way to log what plugins are loaded and ready to use

    def _agent_class_name(self):
        """Return the fully qualified class name (module.class) of the wrapped agent."""
        _class = self._agent.__class__
        return '.'.join([_class.__module__, _class.__name__])

    def meta(self, **_args):
        """Return the agent's ``meta(**_args)``, or an empty list if the agent has no ``meta``."""
        if hasattr(self._agent, 'meta'):
            return self._agent.meta(**_args)
        return []

    def close(self):
        """Close the underlying agent when it exposes a ``close`` method."""
        if hasattr(self._agent, 'close'):
            self._agent.close()

    def apply(self, _query):
        """
        Forward a query to the agent's ``apply`` method.

        :return: whatever the agent returns, or None if the agent has no ``apply``
        """
        if hasattr(self._agent, 'apply'):
            return self._agent.apply(_query)
        return None

    def submit(self, _query):
        """Forward a query to the agent's ``submit`` method (see :meth:`delegate`)."""
        return self.delegate('submit', _query)

    def delegate(self, _name, _query):
        """
        Call the agent's method ``_name`` with ``_query``.

        :return: the result of the call, or None if the agent has no such attribute
        """
        if hasattr(self._agent, _name):
            pointer = getattr(self._agent, _name)
            return pointer(_query)
        return None


class IReader(IO):
    """
    This is a wrapper for read functionalities
    """

    def __init__(self, _agent, _plugins=None, _logger=None):
        super().__init__(_agent, _plugins, _logger)

    def _stream(self, _data):
        """
        Lazily apply the plugins to every segment of an iterable of segments.

        Each segment must expose ``shape``; the shapes are collected and logged
        once the iterable is exhausted.
        """
        _shape = []
        for _segment in _data:
            _shape.append(list(_segment.shape))
            # NOTE(review): self.log is passed here while IWriter passes self._logger
            # and read() passes nothing -- confirm against PluginLoader.apply
            yield self._plugins.apply(_segment, self.log)
        #
        # Not every agent carries an _engine attribute: fall back to the
        # qualified class name (as read() does) instead of failing at the
        # very end of the stream.
        _engine = getattr(self._agent, '_engine', None)
        _name = getattr(_engine, 'name', None)
        if _name is None:
            _name = self._agent_class_name()
        self.log(action='streaming', object=_name, input={'shape': _shape})

    def read(self, **_args):
        """
        Read from the agent and apply the plugins (if any) to the result.

        :param _args: forwarded as-is to the agent's ``read``; an optional
                      ``plugins`` entry (re)initializes the plugin pipeline first
        :return: the data returned by the agent, processed by the plugins when
                 some are loaded; a generator result stays lazy
        """
        if 'plugins' in _args:
            self._init_plugins(_args['plugins'])

        _data = self._agent.read(**_args)
        _objectName = self._agent_class_name()
        if isinstance(_data, types.GeneratorType):
            if self._plugins:
                return self._stream(_data)
            self.log(action='streaming', object=_objectName, input={'memory_size': sys.getsizeof(_data)})
            return _data
        #
        # The shape is only informational, a result without one must not break the read
        self.log(action='read', object=_objectName, input=getattr(_data, 'shape', None))
        if self._plugins:
            _data = self._plugins.apply(_data)
        return _data


class IWriter(IO):
    """
    This is a wrapper for write functionalities
    """
    # kept for backward compatibility; it is not acquired by write()
    lock = RLock()

    def __init__(self, _agent, pipeline=None, _logger=None):
        super().__init__(_agent, pipeline, _logger)

    def write(self, _data, **_args):
        """
        Apply the plugins (if any are loaded) to the data, then write it with the agent.

        :param _data: data to be written
        :param _args: forwarded as-is to the agent's ``write``; an optional
                      ``plugins`` entry (re)initializes the plugin pipeline first
        """
        if 'plugins' in _args:
            self._init_plugins(_args['plugins'])
        if self._plugins and self._plugins.ratio() > 0:
            _data = self._plugins.apply(_data, self._logger)
        self._agent.write(_data, **_args)


#
# The ETL object in its simplest form is an aggregation of read/write objects
# @TODO: ETL can/should aggregate a writer as a plugin and apply it as a process

class IETL(IReader):
    """
    This class performs an ETL operation by inheriting a reader and adding writes as pipeline functions
    """

    def __init__(self, **_args):
        """
        :param source:  keyword arguments passed to ``transport.get.reader``
        :param target:  (optional) keyword arguments for ``transport.get.writer``,
                        either a single dict or a list of them
        :param hasParentProcess: (optional) flag stored as-is, defaults to False
        """
        super().__init__(transport.get.reader(**_args['source']))
        if 'target' in _args:
            _target = _args['target']
            self._targets = _target if isinstance(_target, list) else [_target]
        else:
            self._targets = []
        self.jobs = []
        #
        # If the parent is already multiprocessing
        self._hasParentProcess = _args.get('hasParentProcess', False)

    def run(self, **_args):
        """
        Read from the source and post the data to every target.

        :param _args: forwarded to :meth:`IReader.read`
        :return: the object returned by the read; in the streaming case this
                 generator has already been consumed by the time it is returned
        """
        _data = super().read(**_args)
        #
        # Targets are turned into writers only once: entries that already expose
        # a write method are reused, so that run() can be called more than once.
        self._targets = [
            _target if hasattr(_target, 'write') else transport.get.writer(**_target)
            for _target in self._targets
        ]
        if isinstance(_data, types.GeneratorType):
            for _index, _segment in enumerate(_data, start=1):
                for _writer in self._targets:
                    self.post(_segment, writer=_writer, index=_index)
                time.sleep(1)
        else:
            for _writer in self._targets:
                self.post(_data, writer=_writer)
        return _data

    def post(self, _data, **_args):
        """
        Write the data with the given writer and log the outcome.

        Failures are not propagated: the exception is printed and the action
        is logged as 'post-error' instead of 'post'.

        :param _data:  data to write
        :param writer: writer object exposing ``write``
        :param index:  (optional) segment number used in the log, defaults to 0
        """
        #
        # Everything the final log needs is defined before the try block, so a
        # failure at any point can still be reported.
        _action = 'post'
        _shape = {}
        _index = _args.get('index', 0)
        writer = _args.get('writer')
        try:
            _shape = dict(zip(['rows', 'columns'], _data.shape))
            writer.write(_data)
        except Exception as e:
            _action = 'post-error'
            print(e)
        _agent = getattr(writer, '_agent', None)
        _object = getattr(_agent, '__module__', None)
        self.log(action=_action, object=_object, input={'shape': _shape, 'segment': _index})