bug fixes and enhancements, iceberg casting, typer parameters, etl throttling

v2.4
Steve Nyemba 4 weeks ago
parent b2a2e49858
commit 5c423205c5

@ -32,8 +32,11 @@ from typing_extensions import Annotated
from typing import Optional from typing import Optional
import time import time
from termcolor import colored from termcolor import colored
from enum import Enum
from typing import Tuple
app = typer.Typer() app = typer.Typer()
app_x = typer.Typer()
REGISTRY_PATH=os.sep.join([os.environ['HOME'],'.data-transport']) REGISTRY_PATH=os.sep.join([os.environ['HOME'],'.data-transport'])
REGISTRY_FILE= 'transport-registry.json' REGISTRY_FILE= 'transport-registry.json'
CHECK_MARK = ' '.join(['[',colored(u'\u2713', 'green'),']']) CHECK_MARK = ' '.join(['[',colored(u'\u2713', 'green'),']'])
@ -45,10 +48,15 @@ def wait(jobs):
while jobs : while jobs :
jobs = [thread for thread in jobs if thread.is_alive()] jobs = [thread for thread in jobs if thread.is_alive()]
time.sleep(1) time.sleep(1)
def wait (jobs):
while jobs :
jobs = [pthread for pthread in jobs if pthread.is_alive()]
@app.command(name="apply-etl") @app.command(name="etl")
def apply (path:Annotated[str,typer.Argument(help="path of the configuration file")], def apply (path:Annotated[str,typer.Argument(help="path of the configuration file")],
index:int = typer.Option(default= None, help="index of the item of interest, otherwise everything in the file will be processed")): index:int = typer.Option(default= None, help="index of the item of interest, otherwise everything in the file will be processed"),
batch:int = typer.Option(default=5, help="The number of parallel processes to run at once")
):
""" """
This function applies data transport ETL feature to read data from one source to write it to one or several others This function applies data transport ETL feature to read data from one source to write it to one or several others
""" """
@ -57,20 +65,28 @@ def apply (path:Annotated[str,typer.Argument(help="path of the configuration fil
file = open(path) file = open(path)
_config = json.loads (file.read() ) _config = json.loads (file.read() )
file.close() file.close()
if index : if index is not None:
_config = [_config[ int(index)]] _config = [_config[ int(index)]]
jobs = [] jobs = []
for _args in _config : for _args in _config :
# pthread = etl.instance(**_args) #-- automatically starts the process # pthread = etl.instance(**_args) #-- automatically starts the process
_worker = IETL(**_args) def bootup ():
pthread = Process(target=_worker.run) _worker = IETL(**_args)
_worker.run()
pthread = Process(target=bootup)
pthread.start() pthread.start()
jobs.append(pthread) jobs.append(pthread)
if len(jobs) == batch :
wait(jobs)
jobs = []
if jobs :
wait (jobs)
# #
# @TODO: Log the number of processes started and estimated time # @TODO: Log the number of processes started and estimated time
while jobs : # while jobs :
jobs = [pthread for pthread in jobs if pthread.is_alive()] # jobs = [pthread for pthread in jobs if pthread.is_alive()]
time.sleep(1) # time.sleep(1)
# #
# @TODO: Log the job termination here ... # @TODO: Log the job termination here ...
@app.command(name="providers") @app.command(name="providers")
@ -150,8 +166,9 @@ def register (label:Annotated[str,typer.Argument(help="unique label that will be
except Exception as e: except Exception as e:
_msg = f"""{TIMES_MARK} {e}""" _msg = f"""{TIMES_MARK} {e}"""
print (_msg) print (_msg)
@app.command(name='plugin-add') pass
@app_x.command(name='add')
def register_plugs ( def register_plugs (
alias:Annotated[str,typer.Argument(help="unique alias fo the file being registered")], alias:Annotated[str,typer.Argument(help="unique alias fo the file being registered")],
path:Annotated[str,typer.Argument(help="path of the python file, that contains functions")] path:Annotated[str,typer.Argument(help="path of the python file, that contains functions")]
@ -164,7 +181,7 @@ def register_plugs (
_mark = TIMES_MARK if not _log else CHECK_MARK _mark = TIMES_MARK if not _log else CHECK_MARK
_msg = f"""Could NOT add the \033[1m{alias}\033[0m to the registry""" if not _log else f""" successfully added {alias}, {len(_log)} functions added""" _msg = f"""Could NOT add the \033[1m{alias}\033[0m to the registry""" if not _log else f""" successfully added {alias}, {len(_log)} functions added"""
print (f"""{_mark} {_msg}""") print (f"""{_mark} {_msg}""")
@app.command(name="plugin-list") @app_x.command(name="list")
def registry_list (): def registry_list ():
transport.registry.plugins.init() transport.registry.plugins.init()
@ -177,7 +194,7 @@ def registry_list ():
else: else:
print (f"""{TIMES_MARK}, Plugin registry is not available or needs initialization""") print (f"""{TIMES_MARK}, Plugin registry is not available or needs initialization""")
@app.command(name="plugin-test") @app_x.command(name="test")
def registry_test (key): def registry_test (key):
""" """
This function allows to test syntax for a plugin i.e in terms of alias@function This function allows to test syntax for a plugin i.e in terms of alias@function
@ -190,6 +207,7 @@ def registry_test (key):
else: else:
print (f"{TIMES_MARK} unable to load \033[1m{key}\033[0m. Make sure it is registered") print (f"{TIMES_MARK} unable to load \033[1m{key}\033[0m. Make sure it is registered")
app.add_typer(app_x, name="plugins")
if __name__ == '__main__' : if __name__ == '__main__' :
app() app()

@ -1,6 +1,6 @@
__app_name__ = 'data-transport' __app_name__ = 'data-transport'
__author__ = 'The Phi Technology' __author__ = 'The Phi Technology'
__version__= '2.4.9' __version__= '2.4.12'
__email__ = "info@the-phi.com" __email__ = "info@the-phi.com"
__license__=f""" __license__=f"""
Copyright 2010 - 2024, Steve L. Nyemba Copyright 2010 - 2024, Steve L. Nyemba

@ -119,7 +119,7 @@ def instance (**_args):
# _kwa = registry.get('logger') # _kwa = registry.get('logger')
# _lmodule = getPROVIDERS[_kwa['provider']] # _lmodule = getPROVIDERS[_kwa['provider']]
if ('label' not in _args and registry.has('logger')): if ( ('label' in _args and _args['label'] != 'logger') and registry.has('logger')):
# #
# We did not request label called logger, so we are setting up a logger if it is specified in the registry # We did not request label called logger, so we are setting up a logger if it is specified in the registry
# #
@ -132,6 +132,7 @@ def instance (**_args):
_logger = _logger(**_kwargs) _logger = _logger(**_kwargs)
else: else:
_logger = None _logger = None
_kwargs = {'agent':_agent,'plugins':_plugins,'logger':_logger} _kwargs = {'agent':_agent,'plugins':_plugins,'logger':_logger}
if 'args' in _args : if 'args' in _args :
_kwargs['args'] = _args['args'] _kwargs['args'] = _args['args']

@ -14,12 +14,33 @@ import types
from . import registry from . import registry
from datetime import datetime from datetime import datetime
import pandas as pd import pandas as pd
import numpy as np
import os import os
import sys import sys
import itertools import itertools
import json import json
class BaseIO :
def __init__(self,**_args):
self._logger = _args['logger'] if 'logger' in _args else None
self._logTable = 'logs' if 'logTable' not in _args else _args['logTable']
class IO: def setLogger(self,_logger):
self._logger = _logger
def log (self,**_args):
if self._logger :
_date = str(datetime.now())
_data = dict({'pid':os.getpid(),'date':_date[:10],'time':_date[11:19]},**_args)
for key in _data :
if type(_data[key]) == list :
_data[key] = [_item.__name__ if type(_item).__name__== 'function' else _item for _item in _data[key]]
_data[key] = str(_data[key]) if type(_data[key]) not in [list,dict] else json.dumps(_data[key])
self._logger.write(pd.DataFrame([_data])) #,table=self._logTable)
class IO(BaseIO):
""" """
Base wrapper class for read/write and support for logs Base wrapper class for read/write and support for logs
""" """
@ -28,33 +49,34 @@ class IO:
# #
# We need to initialize the logger here ... # We need to initialize the logger here ...
# #
# registry.init() super().__init__(**_args)
_agent = _args['agent'] _agent = _args['agent']
plugins = _args['plugins'] plugins = _args['plugins']
_logger = _args['logger'] if 'logger' in _args else None # _logger = _args['logger'] if 'logger' in _args else None
self._logger = _logger if not type(_agent) in [IReader,IWriter] else _agent._logger #transport.get.writer(label='logger') #if registry.has('logger') else None # self._logger = _logger if not type(_agent) in [IReader,IWriter] else _agent._logger #transport.get.writer(label='logger') #if registry.has('logger') else None
# if not _logger and hasattr(_agent,'_logger') : # if not _logger and hasattr(_agent,'_logger') :
# self._logger = getattr(_agent,'_logger') # self._logger = getattr(_agent,'_logger')
self._agent = _agent self._agent = _agent
_date = _date = str(datetime.now()) _date = _date = str(datetime.now())
self._logTable = 'logs' #'_'.join(['logs',_date[:10]+_date[11:19]]).replace(':','').replace('-','_') # self._logTable = 'logs' #'_'.join(['logs',_date[:10]+_date[11:19]]).replace(':','').replace('-','_')
if plugins : if plugins :
self._init_plugins(plugins) self._init_plugins(plugins)
else: else:
self._plugins = None self._plugins = None
def setLogger(self,_logger): # def setLogger(self,_logger):
self._logger = _logger # self._logger = _logger
def log (self,**_args): # def log (self,**_args):
if self._logger : # if self._logger :
_date = str(datetime.now()) # _date = str(datetime.now())
_data = dict({'pid':os.getpid(),'date':_date[:10],'time':_date[11:19]},**_args) # _data = dict({'pid':os.getpid(),'date':_date[:10],'time':_date[11:19]},**_args)
for key in _data : # for key in _data :
if type(_data[key]) == list : # if type(_data[key]) == list :
_data[key] = [_item.__name__ if type(_item).__name__== 'function' else _item for _item in _data[key]] # _data[key] = [_item.__name__ if type(_item).__name__== 'function' else _item for _item in _data[key]]
_data[key] = str(_data[key]) if type(_data[key]) not in [list,dict] else json.dumps(_data[key]) # _data[key] = str(_data[key]) if type(_data[key]) not in [list,dict] else json.dumps(_data[key])
self._logger.write(pd.DataFrame([_data])) #,table=self._logTable)
# self._logger.write(pd.DataFrame([_data])) #,table=self._logTable)
def _init_plugins(self,_items): def _init_plugins(self,_items):
""" """
This function will load pipelined functions as a plugin loader This function will load pipelined functions as a plugin loader
@ -63,7 +85,7 @@ class IO:
self._plugins = PluginLoader(registry=registry.plugins) self._plugins = PluginLoader(registry=registry.plugins)
[self._plugins.set(_name) for _name in _items] [self._plugins.set(_name) for _name in _items]
self.log(action='init-plugins',caller='read',object=self.getClassName(self),input =[_name for _name in _items]) self.log(action='init-plugins',object=self.getClassName(self),input =[_name for _name in _items])
# if 'path' in _args and 'names' in _args : # if 'path' in _args and 'names' in _args :
# self._plugins = PluginLoader(**_args) # self._plugins = PluginLoader(**_args)
# else: # else:
@ -109,13 +131,17 @@ class IReader(IO):
# self.log(action='streaming',object=self._agent._engine.name, input= type(_data).__name__) # self.log(action='streaming',object=self._agent._engine.name, input= type(_data).__name__)
_shape = [] _shape = []
for _segment in _data : for _segment in _data :
_shape.append(list(_segment.shape)) _shape += list(_segment.shape)
if self._plugins : if self._plugins :
yield self._plugins.apply(_segment,self.log) yield self._plugins.apply(_segment,self.log)
else: else:
yield _segment yield _segment
_objectName = '.'.join([self._agent.__class__.__module__,self._agent.__class__.__name__]) _objectName = '.'.join([self._agent.__class__.__module__,self._agent.__class__.__name__])
self.log(action='streaming',object=_objectName, input= {'shape':_shape})
_input = {'shape':_shape}
if hasattr(self._agent,'_table') :
_input['table'] = self._agent._table
self.log(action='streaming',object=_objectName, input= _input)
def read(self,**_args): def read(self,**_args):
@ -141,8 +167,11 @@ class IReader(IO):
# return _data # return _data
elif type(_data) == pd.DataFrame : elif type(_data) == pd.DataFrame :
_shape = _data.shape #[0,0] if not _data.shape[] else list(_data.shape) _shape = _data.shape #[0,0] if not _data.shape[] else list(_data.shape)
_input = {'shape':_shape}
if hasattr(self._agent,'_table') :
_input['table'] = self._agent._table
self.log(action='read',object=_objectName, input=_shape) self.log(action='read',object=_objectName, input=_input)
if self._plugins : if self._plugins :
_logs = [] _logs = []
_data = self._plugins.apply(_data,self.log) _data = self._plugins.apply(_data,self.log)
@ -171,48 +200,67 @@ class IWriter(IO):
# The ETL object in its simplest form is an aggregation of read/write objects # The ETL object in its simplest form is an aggregation of read/write objects
# @TODO: ETL can/should aggregate a writer as a plugin and apply it as a process # @TODO: ETL can/should aggregate a writer as a plugin and apply it as a process
class IETL(IReader) : class IETL(BaseIO) :
""" """
This class performs an ETL operation by inheriting a read and adding writes as pipeline functions This class performs an ETL operation by inheriting a read and adding writes as pipeline functions
""" """
def __init__(self,**_args): def __init__(self,**_args):
_source = _args['source'] # _source = _args['source']
_plugins = _source['plugins'] if 'plugins' in _source else None # _plugins = _source['plugins'] if 'plugins' in _source else None
# super().__init__(transport.get.reader(**_args['source'])) # # super().__init__(transport.get.reader(**_args['source']))
super().__init__(agent=transport.get.reader(**_source),plugins=_plugins) # super().__init__(agent=transport.get.reader(**_source),plugins=_plugins)
# _logger = # # _logger =
if 'target' in _args: # if 'target' in _args:
self._targets = _args['target'] if type(_args['target']) == list else [_args['target']] # self._targets = _args['target'] if type(_args['target']) == list else [_args['target']]
else: # else:
self._targets = [] # self._targets = []
self.jobs = [] # self.jobs = []
# #
# # If the parent is already multiprocessing
# if 'token' in _source :
# self._logToken = _source['token']
# self._sourceArgs = _source['args'] if 'args' in _source else None
# self._hasParentProcess = False if 'hasParentProcess' not in _args else _args['hasParentProcess']
super().__init__()
self._source = _args['source']
self._targets= _args['target'] if _args['target'] == list else [_args['target']]
# #
# If the parent is already multiprocessing # ETL Initialization, we should provide some measure of context ...
if 'token' in _source : #
self._logToken = _source['token']
self._sourceArgs = _source['args'] if 'args' in _source else None
self._hasParentProcess = False if 'hasParentProcess' not in _args else _args['hasParentProcess']
# def run(self) : # def run(self) :
# """ # """
# We should apply the etl here, if we are in multiprocessing mode # We should apply the etl here, if we are in multiprocessing mode
# """ # """
# return self.read() # return self.read()
def run(self,**_args): def run(self,**_args):
_data = super().read(**_args) if not self._sourceArgs else super().read(**self._sourceArgs) # _data = super().read(**_args) if not self._sourceArgs else super().read(**self._sourceArgs)
self._targets = [transport.get.writer(**_kwargs) for _kwargs in self._targets] # self._targets = [transport.get.writer(**_kwargs) for _kwargs in self._targets]
_reader = transport.get.reader(**self._source)
if hasattr(_reader,'_logger') :
self.setLogger(_reader._logger)
self.log(action='init-etl',input={'source':self._source,'target':self._targets})
_data = _reader.read(**self._source['args'])if 'args' in self._source else _reader.read()
_reader.close()
_writers = [transport.get.writer(**_kwargs) for _kwargs in self._targets]
_schema = [] if not getattr(_reader._agent,'_table') else _reader.meta()
if types.GeneratorType == type(_data): if types.GeneratorType == type(_data):
_index = 0 _index = 0
for _segment in _data : for _segment in _data :
_index += 1 _index += 1
for _writer in self._targets : for _writer in _writers :
self.post(_segment,writer=_writer,index=_index) self.post(_segment,writer=_writer,index=_index,schema=_schema)
time.sleep(1) time.sleep(1)
else: else:
for _writer in self._targets : for _writer in _writers :
self.post(_data,writer=_writer) self.post(_data,writer=_writer,schema=_schema)
# pass
return _data return _data
# return _data # return _data
def post (self,_data,**_args) : def post (self,_data,**_args) :
@ -221,18 +269,27 @@ class IETL(IReader) :
:_args parameters associated with writer object :_args parameters associated with writer object
""" """
#writer = transport.get.writer(**_args) #writer = transport.get.writer(**_args)
_input = {}
try: try:
_action = 'post' _action = 'post'
_shape = dict(zip(['rows','columns'],_data.shape)) _shape = dict(zip(['rows','columns'],_data.shape))
_index = _args['index'] if 'index' in _args else 0 _index = _args['index'] if 'index' in _args else 0
writer = _args['writer']
writer.write(_data) writer = _args['writer']
_schema= _args['schema']
for _item in _schema :
if _item['type'] == 'INTEGER' :
_data.loc[:,_item['name']] = _data[_item['name']].copy().astype(np.int64)
writer.write(_data,schema=_schema)
#
# -- things to log
_input = {'shape':_shape,'segment':_index}
if hasattr(writer._agent,'_table'):
_input['table'] = writer._agent._table
except Exception as e: except Exception as e:
_action = 'post-error' _action = 'post-error'
print (e) print ([e])
pass pass
self.log(action=_action,object=writer._agent.__module__, input= {'shape':_shape,'segment':_index}) self.log(action=_action,object=writer._agent.__module__, input= _input)

@ -7,7 +7,8 @@ NOTE:
""" """
from pyspark.sql import SparkSession from pyspark.sql import SparkSession
from pyspark import SparkContext from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.functions import col, to_date, to_timestamp
import copy import copy
class Iceberg : class Iceberg :
@ -25,6 +26,7 @@ class Iceberg :
# Make arrangements for additional configuration elements # Make arrangements for additional configuration elements
# #
self._session = SparkSession.builder.appName("data-transport").getOrCreate() self._session = SparkSession.builder.appName("data-transport").getOrCreate()
self._session.conf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS")
# self._session.sparkContext.setLogLevel("ERROR") # self._session.sparkContext.setLogLevel("ERROR")
self._catalog = self._session.catalog self._catalog = self._session.catalog
self._table = _args['table'] if 'table' in _args else None self._table = _args['table'] if 'table' in _args else None
@ -53,7 +55,8 @@ class Iceberg :
""" """
_schema = [] _schema = []
try: try:
_tableName = self._getPrefix(**_args) + f".{_args['table']}" _table = _args['table'] if 'table' in _args else self._table
_tableName = self._getPrefix(**_args) + f".{_table}"
_tmp = self._session.table(_tableName).schema _tmp = self._session.table(_tableName).schema
_schema = _tmp.jsonValue()['fields'] _schema = _tmp.jsonValue()['fields']
for _item in _schema : for _item in _schema :
@ -106,15 +109,34 @@ class Writer (Iceberg):
super().__init__(**_args) super().__init__(**_args)
self._mode = 'append' if 'mode' not in _args else _args['mode'] self._mode = 'append' if 'mode' not in _args else _args['mode']
self._table = None if 'table' not in _args else _args['table'] self._table = None if 'table' not in _args else _args['table']
def format (self,_schema) :
_iceSchema = StructType([])
_map = {'integer':IntegerType(),'float':DoubleType(),'double':DoubleType(),'date':DateType(),
'timestamp':TimestampType(),'datetime':TimestampType(),'string':StringType(),'varchar':StringType()}
for _item in _schema :
_name = _item['name']
_type = _item['type'].lower()
if _type not in _map :
_iceType = StringType()
else:
_iceType = _map[_type]
_iceSchema.add (StructField(_name,_iceType,True))
return _iceSchema if len(_iceSchema) else []
def write(self,_data,**_args): def write(self,_data,**_args):
_prefix = self._getPrefix(**_args) _prefix = self._getPrefix(**_args)
if 'table' not in _args and not self._table : if 'table' not in _args and not self._table :
raise Exception (f"Table Name should be specified for catalog/database {_prefix}") raise Exception (f"Table Name should be specified for catalog/database {_prefix}")
rdd = self._session.createDataFrame(_data,verifySchema=False) _schema = self.format(_args['schema']) if 'schema' in _args else []
if not _schema :
rdd = self._session.createDataFrame(_data,verifySchema=False)
else :
rdd = self._session.createDataFrame(_data,schema=_schema,verifySchema=True)
_mode = self._mode if 'mode' not in _args else _args['mode'] _mode = self._mode if 'mode' not in _args else _args['mode']
_table = self._table if 'table' not in _args else _args['table'] _table = self._table if 'table' not in _args else _args['table']
# print (_data.shape,_mode,_table) # print (_data.shape,_mode,_table)
if not self._session.catalog.tableExists(_table): if not self._session.catalog.tableExists(_table):
# # @TODO: # # @TODO:
# # add partitioning information here # # add partitioning information here
@ -125,4 +147,5 @@ class Writer (Iceberg):
else: else:
# rdd.writeTo(_table).append() # rdd.writeTo(_table).append()
# # _table = f'{_prefix}.{_table}' # # _table = f'{_prefix}.{_table}'
rdd.coalesce(10).write.format('iceberg').mode('append').save(_table) rdd.coalesce(10).write.format('iceberg').mode('append').save(_table)

Loading…
Cancel
Save