bug fixes and enhancements, iceberg casting, typer parameters, etl throttling

v2.4
Steve Nyemba 4 weeks ago
parent b2a2e49858
commit 5c423205c5

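The headline changes below are the renamed `etl` command with a new `--batch` throttle and the plugin commands moving under a `plugins` sub-typer. A minimal smoke-test sketch of the new invocations, assuming the typer app shown in this diff is importable (the module path is a guess, not the project's documented API):

    from typer.testing import CliRunner
    from transport import cli          # assumption: module path of the typer app below

    runner = CliRunner()

    def test_etl_with_batch():
        # `apply-etl` is now `etl`; --batch caps the number of concurrent processes
        result = runner.invoke(cli.app, ["etl", "etl-config.json", "--batch", "3"])
        assert result.exit_code == 0

    def test_plugins_group():
        # plugin-list / plugin-add / plugin-test are now `plugins list|add|test`
        result = runner.invoke(cli.app, ["plugins", "list"])
        assert result.exit_code == 0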
@@ -32,8 +32,11 @@ from typing_extensions import Annotated
from typing import Optional
import time
from termcolor import colored
from enum import Enum
from typing import Tuple
app = typer.Typer()
app_x = typer.Typer()
REGISTRY_PATH=os.sep.join([os.environ['HOME'],'.data-transport'])
REGISTRY_FILE= 'transport-registry.json'
CHECK_MARK = ' '.join(['[',colored(u'\u2713', 'green'),']'])
@@ -45,10 +48,15 @@ def wait(jobs):
while jobs :
jobs = [thread for thread in jobs if thread.is_alive()]
time.sleep(1)
def wait (jobs):
while jobs :
jobs = [pthread for pthread in jobs if pthread.is_alive()]
@app.command(name="apply-etl")
@app.command(name="etl")
def apply (path:Annotated[str,typer.Argument(help="path of the configuration file")],
index:int = typer.Option(default= None, help="index of the item of interest, otherwise everything in the file will be processed")):
index:int = typer.Option(default= None, help="index of the item of interest, otherwise everything in the file will be processed"),
batch:int = typer.Option(default=5, help="The number of parallel processes to run at once")
):
"""
This function applies the data-transport ETL feature to read data from one source and write it to one or several others
"""
@@ -57,20 +65,28 @@ def apply (path:Annotated[str,typer.Argument(help="path of the configuration fil
file = open(path)
_config = json.loads (file.read() )
file.close()
if index :
if index is not None:
_config = [_config[ int(index)]]
jobs = []
for _args in _config :
# pthread = etl.instance(**_args) #-- automatically starts the process
_worker = IETL(**_args)
pthread = Process(target=_worker.run)
def bootup ():
_worker = IETL(**_args)
_worker.run()
pthread = Process(target=bootup)
pthread.start()
jobs.append(pthread)
if len(jobs) == batch :
wait(jobs)
jobs = []
if jobs :
wait (jobs)
#
# @TODO: Log the number of processes started and estimated time
while jobs :
jobs = [pthread for pthread in jobs if pthread.is_alive()]
time.sleep(1)
# while jobs :
# jobs = [pthread for pthread in jobs if pthread.is_alive()]
# time.sleep(1)
#
# @TODO: Log the job termination here ...
@app.command(name="providers")
@@ -151,7 +167,8 @@ def register (label:Annotated[str,typer.Argument(help="unique label that will be
_msg = f"""{TIMES_MARK} {e}"""
print (_msg)
@app.command(name='plugin-add')
pass
@app_x.command(name='add')
def register_plugs (
alias:Annotated[str,typer.Argument(help="unique alias fo the file being registered")],
path:Annotated[str,typer.Argument(help="path of the python file, that contains functions")]
@@ -164,7 +181,7 @@ def register_plugs (
_mark = TIMES_MARK if not _log else CHECK_MARK
_msg = f"""Could NOT add the \033[1m{alias}\033[0m to the registry""" if not _log else f""" successfully added {alias}, {len(_log)} functions added"""
print (f"""{_mark} {_msg}""")
@app.command(name="plugin-list")
@app_x.command(name="list")
def registry_list ():
transport.registry.plugins.init()
@@ -177,7 +194,7 @@ def registry_list ():
else:
print (f"""{TIMES_MARK}, Plugin registry is not available or needs initialization""")
@app.command(name="plugin-test")
@app_x.command(name="test")
def registry_test (key):
"""
This function allows testing the syntax for a plugin, i.e. in terms of alias@function
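The key passed to `plugins test` combines the alias given at registration time with the name of a function defined in the registered file. A small sketch of the expected format (the names are made up for illustration):

    # "alias@function": alias is the registered file, function is defined inside it
    key = "cleanup@remove_nulls"            # illustrative names only
    alias, function_name = key.split("@")
    print(alias, function_name)             # cleanup remove_nulls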
@@ -190,6 +207,7 @@ def registry_test (key):
else:
print (f"{TIMES_MARK} unable to load \033[1m{key}\033[0m. Make sure it is registered")
app.add_typer(app_x, name="plugins")
if __name__ == '__main__' :
app()

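The `--batch` option added above throttles the ETL fan-out: at most `batch` worker processes run at once, and the command blocks until the current group has exited before starting the next one. A self-contained sketch of that pattern, with a stand-in for IETL.run():

    import time
    from multiprocessing import Process

    def _work(args):
        # stand-in for IETL(**args).run()
        print("processing", args)

    def wait(jobs):
        # poll until every process in the group has exited
        while jobs:
            jobs = [p for p in jobs if p.is_alive()]
            time.sleep(1)

    def run_batched(configs, batch=5):
        jobs = []
        for args in configs:
            p = Process(target=_work, args=(args,))
            p.start()
            jobs.append(p)
            if len(jobs) == batch:      # throttle: drain the group before continuing
                wait(jobs)
                jobs = []
        if jobs:
            wait(jobs)

    if __name__ == "__main__":
        run_batched([{"index": i} for i in range(12)], batch=4)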
@@ -1,6 +1,6 @@
__app_name__ = 'data-transport'
__author__ = 'The Phi Technology'
__version__= '2.4.9'
__version__= '2.4.12'
__email__ = "info@the-phi.com"
__license__=f"""
Copyright 2010 - 2024, Steve L. Nyemba

@@ -119,7 +119,7 @@ def instance (**_args):
# _kwa = registry.get('logger')
# _lmodule = getPROVIDERS[_kwa['provider']]
if ('label' not in _args and registry.has('logger')):
if ( ('label' in _args and _args['label'] != 'logger') and registry.has('logger')):
#
# We did not request a label called 'logger', so we set up a logger if one is specified in the registry
#
@@ -132,6 +132,7 @@ def instance (**_args):
_logger = _logger(**_kwargs)
else:
_logger = None
_kwargs = {'agent':_agent,'plugins':_plugins,'logger':_logger}
if 'args' in _args :
_kwargs['args'] = _args['args']

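The changed guard in instance() now attaches the registry-configured logger only when the caller asked for a label other than 'logger', which avoids wiring a logger onto the logger itself. An illustrative reading of the condition (not the project's code):

    def _should_attach_logger(_args, registry_has_logger):
        # mirror of the new condition in instance()
        return ('label' in _args and _args['label'] != 'logger') and registry_has_logger

    assert _should_attach_logger({'label': 'mydb'}, True) is True
    assert _should_attach_logger({'label': 'logger'}, True) is False
    assert _should_attach_logger({}, True) is False   # note: no label now means no logger either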
@@ -14,12 +14,33 @@ import types
from . import registry
from datetime import datetime
import pandas as pd
import numpy as np
import os
import sys
import itertools
import json
class BaseIO :
def __init__(self,**_args):
self._logger = _args['logger'] if 'logger' in _args else None
self._logTable = 'logs' if 'logTable' not in _args else _args['logTable']
def setLogger(self,_logger):
self._logger = _logger
def log (self,**_args):
if self._logger :
_date = str(datetime.now())
_data = dict({'pid':os.getpid(),'date':_date[:10],'time':_date[11:19]},**_args)
for key in _data :
if type(_data[key]) == list :
_data[key] = [_item.__name__ if type(_item).__name__== 'function' else _item for _item in _data[key]]
_data[key] = str(_data[key]) if type(_data[key]) not in [list,dict] else json.dumps(_data[key])
self._logger.write(pd.DataFrame([_data])) #,table=self._logTable)
class IO:
class IO(BaseIO):
"""
Base wrapper class for read/write and support for logs
"""
@@ -28,33 +49,34 @@ class IO:
#
# We need to initialize the logger here ...
#
# registry.init()
super().__init__(**_args)
_agent = _args['agent']
plugins = _args['plugins']
_logger = _args['logger'] if 'logger' in _args else None
self._logger = _logger if not type(_agent) in [IReader,IWriter] else _agent._logger #transport.get.writer(label='logger') #if registry.has('logger') else None
# _logger = _args['logger'] if 'logger' in _args else None
# self._logger = _logger if not type(_agent) in [IReader,IWriter] else _agent._logger #transport.get.writer(label='logger') #if registry.has('logger') else None
# if not _logger and hasattr(_agent,'_logger') :
# self._logger = getattr(_agent,'_logger')
self._agent = _agent
_date = str(datetime.now())
self._logTable = 'logs' #'_'.join(['logs',_date[:10]+_date[11:19]]).replace(':','').replace('-','_')
# self._logTable = 'logs' #'_'.join(['logs',_date[:10]+_date[11:19]]).replace(':','').replace('-','_')
if plugins :
self._init_plugins(plugins)
else:
self._plugins = None
def setLogger(self,_logger):
self._logger = _logger
def log (self,**_args):
if self._logger :
_date = str(datetime.now())
_data = dict({'pid':os.getpid(),'date':_date[:10],'time':_date[11:19]},**_args)
for key in _data :
if type(_data[key]) == list :
_data[key] = [_item.__name__ if type(_item).__name__== 'function' else _item for _item in _data[key]]
# def setLogger(self,_logger):
# self._logger = _logger
# def log (self,**_args):
# if self._logger :
# _date = str(datetime.now())
# _data = dict({'pid':os.getpid(),'date':_date[:10],'time':_date[11:19]},**_args)
# for key in _data :
# if type(_data[key]) == list :
# _data[key] = [_item.__name__ if type(_item).__name__== 'function' else _item for _item in _data[key]]
_data[key] = str(_data[key]) if type(_data[key]) not in [list,dict] else json.dumps(_data[key])
self._logger.write(pd.DataFrame([_data])) #,table=self._logTable)
# _data[key] = str(_data[key]) if type(_data[key]) not in [list,dict] else json.dumps(_data[key])
# self._logger.write(pd.DataFrame([_data])) #,table=self._logTable)
def _init_plugins(self,_items):
"""
This function will load pipelined functions as a plugin loader
@@ -63,7 +85,7 @@ class IO:
self._plugins = PluginLoader(registry=registry.plugins)
[self._plugins.set(_name) for _name in _items]
self.log(action='init-plugins',caller='read',object=self.getClassName(self),input =[_name for _name in _items])
self.log(action='init-plugins',object=self.getClassName(self),input =[_name for _name in _items])
# if 'path' in _args and 'names' in _args :
# self._plugins = PluginLoader(**_args)
# else:
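_init_plugins pulls named functions out of the plugin registry and pipelines them over every frame that is read. A hypothetical plugin file that could be registered with the `plugins add` command above; the exact signature PluginLoader expects is an assumption here (plain DataFrame-in, DataFrame-out functions):

    # hypothetical plugin file: each function receives a DataFrame and returns one
    import pandas as pd

    def remove_nulls(_data: pd.DataFrame) -> pd.DataFrame:
        return _data.dropna()

    def lowercase_columns(_data: pd.DataFrame) -> pd.DataFrame:
        _data.columns = [name.lower() for name in _data.columns]
        return _data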
@@ -109,13 +131,17 @@ class IReader(IO):
# self.log(action='streaming',object=self._agent._engine.name, input= type(_data).__name__)
_shape = []
for _segment in _data :
_shape.append(list(_segment.shape))
_shape += list(_segment.shape)
if self._plugins :
yield self._plugins.apply(_segment,self.log)
else:
yield _segment
_objectName = '.'.join([self._agent.__class__.__module__,self._agent.__class__.__name__])
self.log(action='streaming',object=_objectName, input= {'shape':_shape})
_input = {'shape':_shape}
if hasattr(self._agent,'_table') :
_input['table'] = self._agent._table
self.log(action='streaming',object=_objectName, input= _input)
def read(self,**_args):
@@ -141,8 +167,11 @@ class IReader(IO):
# return _data
elif type(_data) == pd.DataFrame :
_shape = _data.shape #[0,0] if not _data.shape[] else list(_data.shape)
_input = {'shape':_shape}
if hasattr(self._agent,'_table') :
_input['table'] = self._agent._table
self.log(action='read',object=_objectName, input=_shape)
self.log(action='read',object=_objectName, input=_input)
if self._plugins :
_logs = []
_data = self._plugins.apply(_data,self.log)
@@ -171,48 +200,67 @@ class IWriter(IO):
# The ETL object in its simplest form is an aggregation of read/write objects
# @TODO: ETL can/should aggregate a writer as a plugin and apply it as a process
class IETL(IReader) :
class IETL(BaseIO) :
"""
This class performs an ETL operation by inheriting a read and adding writes as pipeline functions
"""
def __init__(self,**_args):
_source = _args['source']
_plugins = _source['plugins'] if 'plugins' in _source else None
# super().__init__(transport.get.reader(**_args['source']))
super().__init__(agent=transport.get.reader(**_source),plugins=_plugins)
# _logger =
if 'target' in _args:
self._targets = _args['target'] if type(_args['target']) == list else [_args['target']]
else:
self._targets = []
self.jobs = []
# _source = _args['source']
# _plugins = _source['plugins'] if 'plugins' in _source else None
# # super().__init__(transport.get.reader(**_args['source']))
# super().__init__(agent=transport.get.reader(**_source),plugins=_plugins)
# # _logger =
# if 'target' in _args:
# self._targets = _args['target'] if type(_args['target']) == list else [_args['target']]
# else:
# self._targets = []
# self.jobs = []
# #
# # If the parent is already multiprocessing
# if 'token' in _source :
# self._logToken = _source['token']
# self._sourceArgs = _source['args'] if 'args' in _source else None
# self._hasParentProcess = False if 'hasParentProcess' not in _args else _args['hasParentProcess']
super().__init__()
self._source = _args['source']
self._targets = _args['target'] if type(_args['target']) == list else [_args['target']]
#
# ETL Initialization, we should provide some measure of context ...
#
# If the parent is already multiprocessing
if 'token' in _source :
self._logToken = _source['token']
self._sourceArgs = _source['args'] if 'args' in _source else None
self._hasParentProcess = False if 'hasParentProcess' not in _args else _args['hasParentProcess']
# def run(self) :
# """
# We should apply the etl here, if we are in multiprocessing mode
# """
# return self.read()
def run(self,**_args):
_data = super().read(**_args) if not self._sourceArgs else super().read(**self._sourceArgs)
self._targets = [transport.get.writer(**_kwargs) for _kwargs in self._targets]
# _data = super().read(**_args) if not self._sourceArgs else super().read(**self._sourceArgs)
# self._targets = [transport.get.writer(**_kwargs) for _kwargs in self._targets]
_reader = transport.get.reader(**self._source)
if hasattr(_reader,'_logger') :
self.setLogger(_reader._logger)
self.log(action='init-etl',input={'source':self._source,'target':self._targets})
_data = _reader.read(**self._source['args']) if 'args' in self._source else _reader.read()
_reader.close()
_writers = [transport.get.writer(**_kwargs) for _kwargs in self._targets]
_schema = [] if not getattr(_reader._agent,'_table',None) else _reader.meta()
if types.GeneratorType == type(_data):
_index = 0
for _segment in _data :
_index += 1
for _writer in self._targets :
self.post(_segment,writer=_writer,index=_index)
for _writer in _writers :
self.post(_segment,writer=_writer,index=_index,schema=_schema)
time.sleep(1)
else:
for _writer in self._targets :
self.post(_data,writer=_writer)
else:
for _writer in _writers :
self.post(_data,writer=_writer,schema=_schema)
# pass
return _data
# return _data
def post (self,_data,**_args) :
@@ -221,18 +269,27 @@ class IETL(IReader) :
:_args parameters associated with writer object
"""
#writer = transport.get.writer(**_args)
_input = {}
try:
_action = 'post'
_shape = dict(zip(['rows','columns'],_data.shape))
_index = _args['index'] if 'index' in _args else 0
writer = _args['writer']
writer.write(_data)
writer = _args['writer']
_schema= _args['schema']
for _item in _schema :
if _item['type'] == 'INTEGER' :
_data.loc[:,_item['name']] = _data[_item['name']].copy().astype(np.int64)
writer.write(_data,schema=_schema)
#
# -- things to log
_input = {'shape':_shape,'segment':_index}
if hasattr(writer._agent,'_table'):
_input['table'] = writer._agent._table
except Exception as e:
_action = 'post-error'
print (e)
print ([e])
pass
self.log(action=_action,object=writer._agent.__module__, input= {'shape':_shape,'segment':_index})
self.log(action=_action,object=writer._agent.__module__, input= _input)
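post() now receives the reader's schema and re-casts INTEGER columns to int64 before writing, so values that pandas read back as floats (a common side effect of NULL-bearing integer columns) reach the writer with the declared type. A standalone sketch of that casting step, with an illustrative schema:

    import numpy as np
    import pandas as pd

    _schema = [{"name": "id", "type": "INTEGER"}, {"name": "label", "type": "VARCHAR"}]
    _data = pd.DataFrame({"id": [1.0, 2.0, 3.0], "label": ["a", "b", "c"]})

    for _item in _schema:
        if _item["type"] == "INTEGER":
            # coerce the column back to int64 (IETL.post() above does this via .loc)
            _data[_item["name"]] = _data[_item["name"]].astype(np.int64)

    print(_data.dtypes)     # id: int64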

@@ -7,7 +7,8 @@ NOTE:
"""
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.functions import col, to_date, to_timestamp
import copy
class Iceberg :
@@ -25,6 +26,7 @@ class Iceberg :
# Make arrangements for additional configuration elements
#
self._session = SparkSession.builder.appName("data-transport").getOrCreate()
self._session.conf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS")
# self._session.sparkContext.setLogLevel("ERROR")
self._catalog = self._session.catalog
self._table = _args['table'] if 'table' in _args else None
@@ -53,7 +55,8 @@ class Iceberg :
"""
_schema = []
try:
_tableName = self._getPrefix(**_args) + f".{_args['table']}"
_table = _args['table'] if 'table' in _args else self._table
_tableName = self._getPrefix(**_args) + f".{_table}"
_tmp = self._session.table(_tableName).schema
_schema = _tmp.jsonValue()['fields']
for _item in _schema :
@@ -106,13 +109,32 @@ class Writer (Iceberg):
super().__init__(**_args)
self._mode = 'append' if 'mode' not in _args else _args['mode']
self._table = None if 'table' not in _args else _args['table']
def format (self,_schema) :
_iceSchema = StructType([])
_map = {'integer':IntegerType(),'float':DoubleType(),'double':DoubleType(),'date':DateType(),
'timestamp':TimestampType(),'datetime':TimestampType(),'string':StringType(),'varchar':StringType()}
for _item in _schema :
_name = _item['name']
_type = _item['type'].lower()
if _type not in _map :
_iceType = StringType()
else:
_iceType = _map[_type]
_iceSchema.add (StructField(_name,_iceType,True))
return _iceSchema if len(_iceSchema) else []
def write(self,_data,**_args):
_prefix = self._getPrefix(**_args)
if 'table' not in _args and not self._table :
raise Exception (f"Table Name should be specified for catalog/database {_prefix}")
rdd = self._session.createDataFrame(_data,verifySchema=False)
_schema = self.format(_args['schema']) if 'schema' in _args else []
if not _schema :
rdd = self._session.createDataFrame(_data,verifySchema=False)
else :
rdd = self._session.createDataFrame(_data,schema=_schema,verifySchema=True)
_mode = self._mode if 'mode' not in _args else _args['mode']
_table = self._table if 'table' not in _args else _args['table']
# print (_data.shape,_mode,_table)
if not self._session.catalog.tableExists(_table):
@@ -125,4 +147,5 @@ class Writer (Iceberg):
else:
# rdd.writeTo(_table).append()
# # _table = f'{_prefix}.{_table}'
rdd.coalesce(10).write.format('iceberg').mode('append').save(_table)

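The new Writer.format() turns the reader's list of {'name','type'} fields into a Spark StructType so that createDataFrame can verify the schema instead of inferring it, with unknown types falling back to strings. A standalone sketch of the same mapping (requires a local pyspark install to run):

    from pyspark.sql.types import (StructType, StructField, IntegerType, DoubleType,
                                   DateType, TimestampType, StringType)

    _map = {"integer": IntegerType(), "float": DoubleType(), "double": DoubleType(),
            "date": DateType(), "timestamp": TimestampType(), "datetime": TimestampType(),
            "string": StringType(), "varchar": StringType()}

    def to_struct_type(schema):
        # mirror of Writer.format(): unknown types default to StringType
        fields = StructType([])
        for item in schema:
            fields.add(StructField(item["name"], _map.get(item["type"].lower(), StringType()), True))
        return fields

    print(to_struct_type([{"name": "id", "type": "INTEGER"}, {"name": "ts", "type": "timestamp"}]))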