bug fixes and enhancements, iceberg casting, typer parameters, etl throttling

v2.4
Steve Nyemba 4 weeks ago
parent b2a2e49858
commit 5c423205c5

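The headline changes below are the renamed `etl` command with a new `--batch` throttle and the plugin commands moving under a `plugins` sub-typer. A minimal smoke-test sketch of the new invocations, assuming the typer app shown in this diff is importable (the module path is a guess, not the project's documented API):

    from typer.testing import CliRunner
    from transport import cli          # assumption: module path of the typer app below

    runner = CliRunner()

    def test_etl_with_batch():
        # `apply-etl` is now `etl`; --batch caps the number of concurrent processes
        result = runner.invoke(cli.app, ["etl", "etl-config.json", "--batch", "3"])
        assert result.exit_code == 0

    def test_plugins_group():
        # plugin-list / plugin-add / plugin-test are now `plugins list|add|test`
        result = runner.invoke(cli.app, ["plugins", "list"])
        assert result.exit_code == 0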
@@ -32,8 +32,11 @@ from typing_extensions import Annotated
from typing import Optional
import time
from termcolor import colored
from enum import Enum
from typing import Tuple
app = typer.Typer()
app_x = typer.Typer()
REGISTRY_PATH=os.sep.join([os.environ['HOME'],'.data-transport'])
REGISTRY_FILE= 'transport-registry.json'
CHECK_MARK = ' '.join(['[',colored(u'\u2713', 'green'),']'])
@@ -45,10 +48,15 @@ def wait(jobs):
while jobs :
jobs = [thread for thread in jobs if thread.is_alive()]
time.sleep(1)
def wait (jobs):
while jobs :
jobs = [pthread for pthread in jobs if pthread.is_alive()]
@app.command(name="apply-etl")
@app.command(name="etl")
def apply (path:Annotated[str,typer.Argument(help="path of the configuration file")],
index:int = typer.Option(default= None, help="index of the item of interest, otherwise everything in the file will be processed")):
index:int = typer.Option(default= None, help="index of the item of interest, otherwise everything in the file will be processed"),
batch:int = typer.Option(default=5, help="The number of parallel processes to run at once")
):
"""
This function applies the data-transport ETL feature to read data from one source and write it to one or several others
"""
@@ -57,20 +65,28 @@ def apply (path:Annotated[str,typer.Argument(help="path of the configuration fil
file = open(path)
_config = json.loads (file.read() )
file.close()
if index :
if index is not None:
_config = [_config[ int(index)]]
jobs = []
for _args in _config :
# pthread = etl.instance(**_args) #-- automatically starts the process
_worker = IETL(**_args)
pthread = Process(target=_worker.run)
def bootup ():
_worker = IETL(**_args)
_worker.run()
pthread = Process(target=bootup)
pthread.start()
jobs.append(pthread)
if len(jobs) == batch :
wait(jobs)
jobs = []
if jobs :
wait (jobs)
#
# @TODO: Log the number of processes started and estimated time
while jobs :
jobs = [pthread for pthread in jobs if pthread.is_alive()]
time.sleep(1)
# while jobs :
# jobs = [pthread for pthread in jobs if pthread.is_alive()]
# time.sleep(1)
#
# @TODO: Log the job termination here ...
@app.command(name="providers")
@@ -151,7 +167,8 @@ def register (label:Annotated[str,typer.Argument(help="unique label that will be
_msg = f"""{TIMES_MARK} {e}"""
print (_msg)
@app.command(name='plugin-add')
pass
@app_x.command(name='add')
def register_plugs (
alias:Annotated[str,typer.Argument(help="unique alias fo the file being registered")],
path:Annotated[str,typer.Argument(help="path of the python file, that contains functions")]
@@ -164,7 +181,7 @@ def register_plugs (
_mark = TIMES_MARK if not _log else CHECK_MARK
_msg = f"""Could NOT add the \033[1m{alias}\033[0m to the registry""" if not _log else f""" successfully added {alias}, {len(_log)} functions added"""
print (f"""{_mark} {_msg}""")
@app.command(name="plugin-list")
@app_x.command(name="list")
def registry_list ():
transport.registry.plugins.init()
@@ -177,7 +194,7 @@ def registry_list ():
else:
print (f"""{TIMES_MARK}, Plugin registry is not available or needs initialization""")
@app.command(name="plugin-test")
@app_x.command(name="test")
def registry_test (key):
"""
This function allows testing the syntax for a plugin, i.e. in terms of alias@function
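The key passed to `plugins test` combines the alias given at registration time with the name of a function defined in the registered file. A small sketch of the expected format (the names are made up for illustration):

    # "alias@function": alias is the registered file, function is defined inside it
    key = "cleanup@remove_nulls"            # illustrative names only
    alias, function_name = key.split("@")
    print(alias, function_name)             # cleanup remove_nulls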
@@ -190,6 +207,7 @@ def registry_test (key):
else:
print (f"{TIMES_MARK} unable to load \033[1m{key}\033[0m. Make sure it is registered")
app.add_typer(app_x, name="plugins")
if __name__ == '__main__' :
app()

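The `--batch` option added above throttles the ETL fan-out: at most `batch` worker processes run at once, and the command blocks until the current group has exited before starting the next one. A self-contained sketch of that pattern, with a stand-in for IETL.run():

    import time
    from multiprocessing import Process

    def _work(args):
        # stand-in for IETL(**args).run()
        print("processing", args)

    def wait(jobs):
        # poll until every process in the group has exited
        while jobs:
            jobs = [p for p in jobs if p.is_alive()]
            time.sleep(1)

    def run_batched(configs, batch=5):
        jobs = []
        for args in configs:
            p = Process(target=_work, args=(args,))
            p.start()
            jobs.append(p)
            if len(jobs) == batch:      # throttle: drain the group before continuing
                wait(jobs)
                jobs = []
        if jobs:
            wait(jobs)

    if __name__ == "__main__":
        run_batched([{"index": i} for i in range(12)], batch=4)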
@@ -1,6 +1,6 @@
__app_name__ = 'data-transport'
__author__ = 'The Phi Technology'
__version__= '2.4.9'
__version__= '2.4.12'
__email__ = "info@the-phi.com"
__license__=f"""
Copyright 2010 - 2024, Steve L. Nyemba

@@ -119,7 +119,7 @@ def instance (**_args):
# _kwa = registry.get('logger')
# _lmodule = getPROVIDERS[_kwa['provider']]
if ('label' not in _args and registry.has('logger')):
if ( ('label' in _args and _args['label'] != 'logger') and registry.has('logger')):
#
# We did not request a label called 'logger', so we set up a logger if one is specified in the registry
#
@@ -132,6 +132,7 @@ def instance (**_args):
_logger = _logger(**_kwargs)
else:
_logger = None
_kwargs = {'agent':_agent,'plugins':_plugins,'logger':_logger}
if 'args' in _args :
_kwargs['args'] = _args['args']

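The changed guard in instance() now attaches the registry-configured logger only when the caller asked for a label other than 'logger', which avoids wiring a logger onto the logger itself. An illustrative reading of the condition (not the project's code):

    def _should_attach_logger(_args, registry_has_logger):
        # mirror of the new condition in instance()
        return ('label' in _args and _args['label'] != 'logger') and registry_has_logger

    assert _should_attach_logger({'label': 'mydb'}, True) is True
    assert _should_attach_logger({'label': 'logger'}, True) is False
    assert _should_attach_logger({}, True) is False   # note: no label now means no logger either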
@@ -14,12 +14,33 @@ import types
from . import registry
from datetime import datetime
import pandas as pd
import numpy as np
import os
import sys
import itertools
import json
class BaseIO :
def __init__(self,**_args):
self._logger = _args['logger'] if 'logger' in _args else None
self._logTable = 'logs' if 'logTable' not in _args else _args['logTable']
def setLogger(self,_logger):
self._logger = _logger
def log (self,**_args):
if self._logger :
_date = str(datetime.now())
_data = dict({'pid':os.getpid(),'date':_date[:10],'time':_date[11:19]},**_args)
for key in _data :
if type(_data[key]) == list :
_data[key] = [_item.__name__ if type(_item).__name__== 'function' else _item for _item in _data[key]]
_data[key] = str(_data[key]) if type(_data[key]) not in [list,dict] else json.dumps(_data[key])
self._logger.write(pd.DataFrame([_data])) #,table=self._logTable)
class IO:
class IO(BaseIO):
"""
Base wrapper class for read/write and support for logs
"""
@@ -28,33 +49,34 @@ class IO:
#
# We need to initialize the logger here ...
#
# registry.init()
super().__init__(**_args)
_agent = _args['agent']
plugins = _args['plugins']
_logger = _args['logger'] if 'logger' in _args else None
self._logger = _logger if not type(_agent) in [IReader,IWriter] else _agent._logger #transport.get.writer(label='logger') #if registry.has('logger') else None
# _logger = _args['logger'] if 'logger' in _args else None
# self._logger = _logger if not type(_agent) in [IReader,IWriter] else _agent._logger #transport.get.writer(label='logger') #if registry.has('logger') else None
# if not _logger and hasattr(_agent,'_logger') :
# self._logger = getattr(_agent,'_logger')
self._agent = _agent
_date = str(datetime.now())
self._logTable = 'logs' #'_'.join(['logs',_date[:10]+_date[11:19]]).replace(':','').replace('-','_')
# self._logTable = 'logs' #'_'.join(['logs',_date[:10]+_date[11:19]]).replace(':','').replace('-','_')
if plugins :
self._init_plugins(plugins)
else:
self._plugins = None
def setLogger(self,_logger):
self._logger = _logger
def log (self,**_args):
if self._logger :
_date = str(datetime.now())
_data = dict({'pid':os.getpid(),'date':_date[:10],'time':_date[11:19]},**_args)
for key in _data :
if type(_data[key]) == list :
_data[key] = [_item.__name__ if type(_item).__name__== 'function' else _item for _item in _data[key]]
# def setLogger(self,_logger):
# self._logger = _logger
# def log (self,**_args):
# if self._logger :
# _date = str(datetime.now())
# _data = dict({'pid':os.getpid(),'date':_date[:10],'time':_date[11:19]},**_args)
# for key in _data :
# if type(_data[key]) == list :
# _data[key] = [_item.__name__ if type(_item).__name__== 'function' else _item for _item in _data[key]]
_data[key] = str(_data[key]) if type(_data[key]) not in [list,dict] else json.dumps(_data[key])
self._logger.write(pd.DataFrame([_data])) #,table=self._logTable)
# _data[key] = str(_data[key]) if type(_data[key]) not in [list,dict] else json.dumps(_data[key])
# self._logger.write(pd.DataFrame([_data])) #,table=self._logTable)
def _init_plugins(self,_items):
"""
This function will load pipelined functions as a plugin loader
@@ -63,7 +85,7 @@ class IO:
self._plugins = PluginLoader(registry=registry.plugins)
[self._plugins.set(_name) for _name in _items]
self.log(action='init-plugins',caller='read',object=self.getClassName(self),input =[_name for _name in _items])
self.log(action='init-plugins',object=self.getClassName(self),input =[_name for _name in _items])
# if 'path' in _args and 'names' in _args :
# self._plugins = PluginLoader(**_args)
# else:
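_init_plugins pulls named functions out of the plugin registry and pipelines them over every frame that is read. A hypothetical plugin file that could be registered with the `plugins add` command above; the exact signature PluginLoader expects is an assumption here (plain DataFrame-in, DataFrame-out functions):

    # hypothetical plugin file: each function receives a DataFrame and returns one
    import pandas as pd

    def remove_nulls(_data: pd.DataFrame) -> pd.DataFrame:
        return _data.dropna()

    def lowercase_columns(_data: pd.DataFrame) -> pd.DataFrame:
        _data.columns = [name.lower() for name in _data.columns]
        return _data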
@@ -109,13 +131,17 @@ class IReader(IO):
# self.log(action='streaming',object=self._agent._engine.name, input= type(_data).__name__)
_shape = []
for _segment in _data :
_shape.append(list(_segment.shape))
_shape += list(_segment.shape)
if self._plugins :
yield self._plugins.apply(_segment,self.log)
else:
yield _segment
_objectName = '.'.join([self._agent.__class__.__module__,self._agent.__class__.__name__])
self.log(action='streaming',object=_objectName, input= {'shape':_shape})
_input = {'shape':_shape}
if hasattr(self._agent,'_table') :
_input['table'] = self._agent._table
self.log(action='streaming',object=_objectName, input= _input)
def read(self,**_args):
@@ -141,8 +167,11 @@ class IReader(IO):
# return _data
elif type(_data) == pd.DataFrame :
_shape = _data.shape #[0,0] if not _data.shape[] else list(_data.shape)
_input = {'shape':_shape}
if hasattr(self._agent,'_table') :
_input['table'] = self._agent._table
self.log(action='read',object=_objectName, input=_shape)
self.log(action='read',object=_objectName, input=_input)
if self._plugins :
_logs = []
_data = self._plugins.apply(_data,self.log)
@@ -171,48 +200,67 @@ class IWriter(IO):
# The ETL object in its simplest form is an aggregation of read/write objects
# @TODO: ETL can/should aggregate a writer as a plugin and apply it as a process
class IETL(IReader) :
class IETL(BaseIO) :
"""
This class performs an ETL operation by inheriting a read and adding writes as pipeline functions
"""
def __init__(self,**_args):
_source = _args['source']
_plugins = _source['plugins'] if 'plugins' in _source else None
# super().__init__(transport.get.reader(**_args['source']))
super().__init__(agent=transport.get.reader(**_source),plugins=_plugins)
# _logger =
if 'target' in _args:
self._targets = _args['target'] if type(_args['target']) == list else [_args['target']]
else:
self._targets = []
self.jobs = []
# _source = _args['source']
# _plugins = _source['plugins'] if 'plugins' in _source else None
# # super().__init__(transport.get.reader(**_args['source']))
# super().__init__(agent=transport.get.reader(**_source),plugins=_plugins)
# # _logger =
# if 'target' in _args:
# self._targets = _args['target'] if type(_args['target']) == list else [_args['target']]
# else:
# self._targets = []
# self.jobs = []
# #
# # If the parent is already multiprocessing
# if 'token' in _source :
# self._logToken = _source['token']
# self._sourceArgs = _source['args'] if 'args' in _source else None
# self._hasParentProcess = False if 'hasParentProcess' not in _args else _args['hasParentProcess']
super().__init__()
self._source = _args['source']
self._targets = _args['target'] if type(_args['target']) == list else [_args['target']]
#
# ETL Initialization, we should provide some measure of context ...
#
# If the parent is already multiprocessing
if 'token' in _source :
self._logToken = _source['token']
self._sourceArgs = _source['args'] if 'args' in _source else None
self._hasParentProcess = False if 'hasParentProcess' not in _args else _args['hasParentProcess']
# def run(self) :
# """
# We should apply the etl here, if we are in multiprocessing mode
# """
# return self.read()
def run(self,**_args):
_data = super().read(**_args) if not self._sourceArgs else super().read(**self._sourceArgs)
self._targets = [transport.get.writer(**_kwargs) for _kwargs in self._targets]
# _data = super().read(**_args) if not self._sourceArgs else super().read(**self._sourceArgs)
# self._targets = [transport.get.writer(**_kwargs) for _kwargs in self._targets]
_reader = transport.get.reader(**self._source)
if hasattr(_reader,'_logger') :
self.setLogger(_reader._logger)
self.log(action='init-etl',input={'source':self._source,'target':self._targets})
_data = _reader.read(**self._source['args']) if 'args' in self._source else _reader.read()
_reader.close()
_writers = [transport.get.writer(**_kwargs) for _kwargs in self._targets]
_schema = [] if not getattr(_reader._agent,'_table',None) else _reader.meta()
if types.GeneratorType == type(_data):
_index = 0
for _segment in _data :
_index += 1
for _writer in self._targets :
self.post(_segment,writer=_writer,index=_index)
for _writer in _writers :
self.post(_segment,writer=_writer,index=_index,schema=_schema)
time.sleep(1)
else:
for _writer in self._targets :
self.post(_data,writer=_writer)
else:
for _writer in _writers :
self.post(_data,writer=_writer,schema=_schema)
# pass
return _data
# return _data
def post (self,_data,**_args) :
@@ -221,18 +269,27 @@ class IETL(IReader) :
:_args parameters associated with writer object
"""
#writer = transport.get.writer(**_args)
_input = {}
try:
_action = 'post'
_shape = dict(zip(['rows','columns'],_data.shape))
_index = _args['index'] if 'index' in _args else 0
writer = _args['writer']
writer.write(_data)
writer = _args['writer']
_schema= _args['schema']
for _item in _schema :
if _item['type'] == 'INTEGER' :
_data.loc[:,_item['name']] = _data[_item['name']].copy().astype(np.int64)
writer.write(_data,schema=_schema)
#
# -- things to log
_input = {'shape':_shape,'segment':_index}
if hasattr(writer._agent,'_table'):
_input['table'] = writer._agent._table
except Exception as e:
_action = 'post-error'
print (e)
print ([e])
pass
self.log(action=_action,object=writer._agent.__module__, input= {'shape':_shape,'segment':_index})
self.log(action=_action,object=writer._agent.__module__, input= _input)
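post() now receives the reader's schema and re-casts INTEGER columns to int64 before writing, so values that pandas read back as floats (a common side effect of NULL-bearing integer columns) reach the writer with the declared type. A standalone sketch of that casting step, with an illustrative schema:

    import numpy as np
    import pandas as pd

    _schema = [{"name": "id", "type": "INTEGER"}, {"name": "label", "type": "VARCHAR"}]
    _data = pd.DataFrame({"id": [1.0, 2.0, 3.0], "label": ["a", "b", "c"]})

    for _item in _schema:
        if _item["type"] == "INTEGER":
            # coerce the column back to int64 (IETL.post() above does this via .loc)
            _data[_item["name"]] = _data[_item["name"]].astype(np.int64)

    print(_data.dtypes)     # id: int64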

@@ -7,7 +7,8 @@ NOTE:
"""
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.functions import col, to_date, to_timestamp
import copy
class Iceberg :
@@ -25,6 +26,7 @@ class Iceberg :
# Make arrangements for additional configuration elements
#
self._session = SparkSession.builder.appName("data-transport").getOrCreate()
self._session.conf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS")
# self._session.sparkContext.setLogLevel("ERROR")
self._catalog = self._session.catalog
self._table = _args['table'] if 'table' in _args else None
@@ -53,7 +55,8 @@ class Iceberg :
"""
_schema = []
try:
_tableName = self._getPrefix(**_args) + f".{_args['table']}"
_table = _args['table'] if 'table' in _args else self._table
_tableName = self._getPrefix(**_args) + f".{_table}"
_tmp = self._session.table(_tableName).schema
_schema = _tmp.jsonValue()['fields']
for _item in _schema :
@@ -106,13 +109,32 @@ class Writer (Iceberg):
super().__init__(**_args)
self._mode = 'append' if 'mode' not in _args else _args['mode']
self._table = None if 'table' not in _args else _args['table']
def format (self,_schema) :
_iceSchema = StructType([])
_map = {'integer':IntegerType(),'float':DoubleType(),'double':DoubleType(),'date':DateType(),
'timestamp':TimestampType(),'datetime':TimestampType(),'string':StringType(),'varchar':StringType()}
for _item in _schema :
_name = _item['name']
_type = _item['type'].lower()
if _type not in _map :
_iceType = StringType()
else:
_iceType = _map[_type]
_iceSchema.add (StructField(_name,_iceType,True))
return _iceSchema if len(_iceSchema) else []
def write(self,_data,**_args):
_prefix = self._getPrefix(**_args)
if 'table' not in _args and not self._table :
raise Exception (f"Table Name should be specified for catalog/database {_prefix}")
rdd = self._session.createDataFrame(_data,verifySchema=False)
_schema = self.format(_args['schema']) if 'schema' in _args else []
if not _schema :
rdd = self._session.createDataFrame(_data,verifySchema=False)
else :
rdd = self._session.createDataFrame(_data,schema=_schema,verifySchema=True)
_mode = self._mode if 'mode' not in _args else _args['mode']
_table = self._table if 'table' not in _args else _args['table']
# print (_data.shape,_mode,_table)
if not self._session.catalog.tableExists(_table):
@@ -125,4 +147,5 @@ class Writer (Iceberg):
else:
# rdd.writeTo(_table).append()
# # _table = f'{_prefix}.{_table}'
rdd.coalesce(10).write.format('iceberg').mode('append').save(_table)

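The new Writer.format() turns the reader's list of {'name','type'} fields into a Spark StructType so that createDataFrame can verify the schema instead of inferring it, with unknown types falling back to strings. A standalone sketch of the same mapping (requires a local pyspark install to run):

    from pyspark.sql.types import (StructType, StructField, IntegerType, DoubleType,
                                   DateType, TimestampType, StringType)

    _map = {"integer": IntegerType(), "float": DoubleType(), "double": DoubleType(),
            "date": DateType(), "timestamp": TimestampType(), "datetime": TimestampType(),
            "string": StringType(), "varchar": StringType()}

    def to_struct_type(schema):
        # mirror of Writer.format(): unknown types default to StringType
        fields = StructType([])
        for item in schema:
            fields.add(StructField(item["name"], _map.get(item["type"].lower(), StringType()), True))
        return fields

    print(to_struct_type([{"name": "id", "type": "INTEGER"}, {"name": "ts", "type": "timestamp"}]))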